/* gcc/tree-vect-loop.cc  (web-viewer page header removed).  */
1 /* Loop Vectorization
2 Copyright (C) 2003-2024 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #define INCLUDE_ALGORITHM
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "memmodel.h"
36 #include "optabs.h"
37 #include "diagnostic-core.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "cfganal.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop-niter.h"
47 #include "tree-ssa-loop.h"
48 #include "cfgloop.h"
49 #include "tree-scalar-evolution.h"
50 #include "tree-vectorizer.h"
51 #include "gimple-fold.h"
52 #include "cgraph.h"
53 #include "tree-cfg.h"
54 #include "tree-if-conv.h"
55 #include "internal-fn.h"
56 #include "tree-vector-builder.h"
57 #include "vec-perm-indices.h"
58 #include "tree-eh.h"
59 #include "case-cfn-macros.h"
60 #include "langhooks.h"
61
62 /* Loop Vectorization Pass.
63
64 This pass tries to vectorize loops.
65
66 For example, the vectorizer transforms the following simple loop:
67
68 short a[N]; short b[N]; short c[N]; int i;
69
70 for (i=0; i<N; i++){
71 a[i] = b[i] + c[i];
72 }
73
74 as if it was manually vectorized by rewriting the source code into:
75
76 typedef int __attribute__((mode(V8HI))) v8hi;
77 short a[N]; short b[N]; short c[N]; int i;
78 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
79 v8hi va, vb, vc;
80
81 for (i=0; i<N/8; i++){
82 vb = pb[i];
83 vc = pc[i];
84 va = vb + vc;
85 pa[i] = va;
86 }
87
88 The main entry to this pass is vectorize_loops(), in which
89 the vectorizer applies a set of analyses on a given set of loops,
90 followed by the actual vectorization transformation for the loops that
91 had successfully passed the analysis phase.
92 Throughout this pass we make a distinction between two types of
93 data: scalars (which are represented by SSA_NAMES), and memory references
94 ("data-refs"). These two types of data require different handling both
95 during analysis and transformation. The types of data-refs that the
   vectorizer currently supports are ARRAY_REFS whose base is an array DECL
97 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
98 accesses are required to have a simple (consecutive) access pattern.
99
100 Analysis phase:
101 ===============
102 The driver for the analysis phase is vect_analyze_loop().
103 It applies a set of analyses, some of which rely on the scalar evolution
104 analyzer (scev) developed by Sebastian Pop.
105
106 During the analysis phase the vectorizer records some information
107 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
108 loop, as well as general information about the loop as a whole, which is
109 recorded in a "loop_vec_info" struct attached to each loop.
110
111 Transformation phase:
112 =====================
113 The loop transformation phase scans all the stmts in the loop, and
114 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
115 the loop that needs to be vectorized. It inserts the vector code sequence
116 just before the scalar stmt S, and records a pointer to the vector code
117 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
118 attached to S). This pointer will be used for the vectorization of following
119 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
120 otherwise, we rely on dead code elimination for removing it.
121
122 For example, say stmt S1 was vectorized into stmt VS1:
123
124 VS1: vb = px[i];
125 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
126 S2: a = b;
127
128 To vectorize stmt S2, the vectorizer first finds the stmt that defines
129 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
130 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
131 resulting sequence would be:
132
133 VS1: vb = px[i];
134 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
135 VS2: va = vb;
136 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
137
138 Operands that are not SSA_NAMEs, are data-refs that appear in
139 load/store operations (like 'x[i]' in S1), and are handled differently.
140
141 Target modeling:
142 =================
143 Currently the only target specific information that is used is the
144 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
145 Targets that can support different sizes of vectors, for now will need
146 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
147 flexibility will be added in the future.
148
   Since we only vectorize operations whose vector form can be
150 expressed using existing tree codes, to verify that an operation is
151 supported, the vectorizer checks the relevant optab at the relevant
152 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
153 the value found is CODE_FOR_nothing, then there's no target support, and
154 we can't vectorize the stmt.
155
156 For additional information on this project see:
157 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
158 */
159
160 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
161 unsigned *);
162 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
163 bool *, bool *, bool);
164
/* Subroutine of vect_determine_vf_for_stmt that handles only one
   statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
   may already be set for general statements (not just data refs).
   On success, record the statement's vector type (if any) and raise
   the running vectorization factor *VF to accommodate it.  */

static opt_result
vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
			      bool vectype_maybe_set_p,
			      poly_uint64 *vf)
{
  gimple *stmt = stmt_info->stmt;

  /* Statements that are neither relevant nor live, and clobbers, do not
     constrain the VF; skip them successfully.  */
  if ((!STMT_VINFO_RELEVANT_P (stmt_info)
       && !STMT_VINFO_LIVE_P (stmt_info))
      || gimple_clobber_p (stmt))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
      return opt_result::success ();
    }

  /* STMT_VECTYPE is the type for the operation itself, NUNITS_VECTYPE
     the type that decides the number of units (they can differ, e.g. for
     widening/narrowing operations).  */
  tree stmt_vectype, nunits_vectype;
  opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
						   &stmt_vectype,
						   &nunits_vectype);
  if (!res)
    return res;

  if (stmt_vectype)
    {
      if (STMT_VINFO_VECTYPE (stmt_info))
	/* The only case when a vectype had been already set is for stmts
	   that contain a data ref, or for "pattern-stmts" (stmts generated
	   by the vectorizer to represent/replace a certain idiom).  In
	   that case it must agree with what we computed here.  */
	gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
		     || vectype_maybe_set_p)
		    && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
      else
	STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
    }

  /* Let the units-determining type bump the vectorization factor.  */
  if (nunits_vectype)
    vect_update_max_nunits (vf, nunits_vectype);

  return opt_result::success ();
}
210
/* Subroutine of vect_determine_vectorization_factor.  Set the vector
   types of STMT_INFO and all attached pattern statements and update
   the vectorization factor VF accordingly.  Return true on success
   or false if something prevented vectorization.  */

static opt_result
vect_determine_vf_for_stmt (vec_info *vinfo,
			    stmt_vec_info stmt_info, poly_uint64 *vf)
{
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
		     stmt_info->stmt);
  /* The original statement: its vectype may not be set yet, hence
     VECTYPE_MAYBE_SET_P is false here.  */
  opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
  if (!res)
    return res;

  if (STMT_VINFO_IN_PATTERN_P (stmt_info)
      && STMT_VINFO_RELATED_STMT (stmt_info))
    {
      gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
      /* Switch over to the main pattern statement replacing STMT_INFO.  */
      stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);

      /* If a pattern statement has def stmts, analyze them too.  */
      for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
	   !gsi_end_p (si); gsi_next (&si))
	{
	  stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "==> examining pattern def stmt: %G",
			     def_stmt_info->stmt);
	  res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
	  if (!res)
	    return res;
	}

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "==> examining pattern statement: %G",
			 stmt_info->stmt);
      /* Pattern recognition may already have assigned a vectype, so
	 VECTYPE_MAYBE_SET_P is true for the pattern stmt itself.  */
      res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
      if (!res)
	return res;
    }

  return opt_result::success ();
}
258
/* Function vect_determine_vectorization_factor

   Determine the vectorization factor (VF).  VF is the number of data elements
   that are operated upon in parallel in a single iteration of the vectorized
   loop.  For example, when vectorizing a loop that operates on 4byte elements,
   on a target with vector size (VS) 16byte, the VF is set to 4, since 4
   elements can fit in a single vector register.

   We currently support vectorization of loops in which all types operated upon
   are of the same size.  Therefore this function currently sets VF according to
   the size of the types operated upon, and fails if there are multiple sizes
   in the loop.

   VF is also the factor by which the loop iterations are strip-mined, e.g.:
   original loop:
	for (i=0; i<N; i++){
	  a[i] = b[i] + c[i];
	}

   vectorized loop:
	for (i=0; i<N; i+=VF){
	  a[i:VF] = b[i:VF] + c[i:VF];
	}
*/

static opt_result
vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  unsigned nbbs = loop->num_nodes;
  poly_uint64 vectorization_factor = 1;
  tree scalar_type = NULL_TREE;
  gphi *phi;
  tree vectype;
  stmt_vec_info stmt_info;
  unsigned i;

  DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      /* PHIs are handled directly here: a relevant/live PHI gets its
	 vectype from the type of its result.  */
      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  phi = si.phi ();
	  stmt_info = loop_vinfo->lookup_stmt (phi);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
			     (gimple *) phi);

	  gcc_assert (stmt_info);

	  if (STMT_VINFO_RELEVANT_P (stmt_info)
	      || STMT_VINFO_LIVE_P (stmt_info))
	    {
	      /* PHIs never had a vectype assigned before this point.  */
	      gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
	      scalar_type = TREE_TYPE (PHI_RESULT (phi));

	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "get vectype for scalar type: %T\n",
				 scalar_type);

	      vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
	      if (!vectype)
		return opt_result::failure_at (phi,
					       "not vectorized: unsupported "
					       "data-type %T\n",
					       scalar_type);
	      STMT_VINFO_VECTYPE (stmt_info) = vectype;

	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
				 vectype);

	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
		  dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
		  dump_printf (MSG_NOTE, "\n");
		}

	      vect_update_max_nunits (&vectorization_factor, vectype);
	    }
	}

      /* Non-PHI statements (and their pattern stmts) are handled by
	 vect_determine_vf_for_stmt.  Debug stmts are ignored.  */
      for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  if (is_gimple_debug (gsi_stmt (si)))
	    continue;
	  stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
	  opt_result res
	    = vect_determine_vf_for_stmt (loop_vinfo,
					  stmt_info, &vectorization_factor);
	  if (!res)
	    return res;
	}
    }

  /* TODO: Analyze cost.  Decide if worth while to vectorize.  */
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
      dump_dec (MSG_NOTE, vectorization_factor);
      dump_printf (MSG_NOTE, "\n");
    }

  /* A VF of (at most) 1 means nothing is actually vectorized.  */
  if (known_le (vectorization_factor, 1U))
    return opt_result::failure_at (vect_location,
				   "not vectorized: unsupported data-type\n");
  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
  return opt_result::success ();
}
376
377
/* Function vect_is_simple_iv_evolution.

   FORNOW: A simple evolution of an induction variables in the loop is
   considered a polynomial evolution.

   ACCESS_FN is the scalar-evolution access function of a PHI in loop
   LOOP_NB.  On success return true and store the initial value in *INIT
   and the per-iteration step in *STEP.  */

static bool
vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
			     tree * step)
{
  tree init_expr;
  tree step_expr;
  tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
  basic_block bb;

  /* When there is no evolution in this loop, the evolution function
     is not "simple".  */
  if (evolution_part == NULL_TREE)
    return false;

  /* When the evolution is a polynomial of degree >= 2
     the evolution function is not "simple".  */
  if (tree_is_chrec (evolution_part))
    return false;

  step_expr = evolution_part;
  init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
		     step_expr, init_expr);

  /* Output the pair even if the step is subsequently rejected below.  */
  *init = init_expr;
  *step = step_expr;

  /* The step must be loop-invariant and usable: an INTEGER_CST, a
     REAL_CST under -fassociative-math, or an SSA_NAME not defined
     inside the loop (with floats again requiring -fassociative-math).
     Note (bb = ...) inside the condition: BB is assigned as a side
     effect and then tested for being inside the loop.  */
  if (TREE_CODE (step_expr) != INTEGER_CST
      && (TREE_CODE (step_expr) != SSA_NAME
	  || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
	      && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
	  || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
	      && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
		  || !flag_associative_math)))
      && (TREE_CODE (step_expr) != REAL_CST
	  || !flag_associative_math))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "step unknown.\n");
      return false;
    }

  return true;
}
430
/* Function vect_is_nonlinear_iv_evolution

   Only support nonlinear induction for integer type
   1. neg
   2. mul by constant
   3. lshift/rshift by constant.

   For neg induction, return a fake step as integer -1.

   LOOP_PHI_NODE is the candidate PHI; on success *INIT and *STEP are
   set and STMT_INFO's evolution fields are filled in.  */
static bool
vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
				gphi* loop_phi_node, tree *init, tree *step)
{
  tree init_expr, ev_expr, result, op1, op2;
  gimple* def;

  /* The PHI must merge exactly the preheader and latch values.  */
  if (gimple_phi_num_args (loop_phi_node) != 2)
    return false;

  init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
  ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));

  /* Support nonlinear induction only for integer type.  */
  if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
    return false;

  *init = init_expr;
  result = PHI_RESULT (loop_phi_node);

  /* The middle clause uses the comma operator: it assigns DEF as a side
     effect and always evaluates to false, so the next clause can safely
     inspect DEF.  */
  if (TREE_CODE (ev_expr) != SSA_NAME
      || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
      || !is_gimple_assign (def))
    return false;

  enum tree_code t_code = gimple_assign_rhs_code (def);
  switch (t_code)
    {
    case NEGATE_EXPR:
      /* x = -x: must negate the PHI result itself.  */
      if (gimple_assign_rhs1 (def) != result)
	return false;
      *step = build_int_cst (TREE_TYPE (init_expr), -1);
      STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
      break;

    case RSHIFT_EXPR:
    case LSHIFT_EXPR:
    case MULT_EXPR:
      /* x = x <</>>/* CST: first operand must be the PHI result and the
	 shift amount/multiplier must be a constant.  */
      op1 = gimple_assign_rhs1 (def);
      op2 = gimple_assign_rhs2 (def);
      if (TREE_CODE (op2) != INTEGER_CST
	  || op1 != result)
	return false;
      *step = op2;
      if (t_code == LSHIFT_EXPR)
	STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
      else if (t_code == RSHIFT_EXPR)
	STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
      /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul.  */
      else
	STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
      break;

    default:
      return false;
    }

  STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
  STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;

  return true;
}
501
502 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
503 what we are assuming is a double reduction. For example, given
504 a structure like this:
505
506 outer1:
507 x_1 = PHI <x_4(outer2), ...>;
508 ...
509
510 inner:
511 x_2 = PHI <x_1(outer1), ...>;
512 ...
513 x_3 = ...;
514 ...
515
516 outer2:
517 x_4 = PHI <x_3(inner)>;
518 ...
519
520 outer loop analysis would treat x_1 as a double reduction phi and
521 this function would then return true for x_2. */
522
523 static bool
524 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
525 {
526 use_operand_p use_p;
527 ssa_op_iter op_iter;
528 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
529 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
530 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
531 return true;
532 return false;
533 }
534
535 /* Returns true if Phi is a first-order recurrence. A first-order
536 recurrence is a non-reduction recurrence relation in which the value of
537 the recurrence in the current loop iteration equals a value defined in
538 the previous iteration. */
539
540 static bool
541 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
542 gphi *phi)
543 {
544 /* A nested cycle isn't vectorizable as first order recurrence. */
545 if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
546 return false;
547
548 /* Ensure the loop latch definition is from within the loop. */
549 edge latch = loop_latch_edge (loop);
550 tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
551 if (TREE_CODE (ldef) != SSA_NAME
552 || SSA_NAME_IS_DEFAULT_DEF (ldef)
553 || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
554 || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
555 return false;
556
557 tree def = gimple_phi_result (phi);
558
559 /* Ensure every use_stmt of the phi node is dominated by the latch
560 definition. */
561 imm_use_iterator imm_iter;
562 use_operand_p use_p;
563 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
564 if (!is_gimple_debug (USE_STMT (use_p))
565 && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
566 || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
567 USE_STMT (use_p))))
568 return false;
569
570 /* First-order recurrence autovectorization needs shuffle vector. */
571 tree scalar_type = TREE_TYPE (def);
572 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
573 if (!vectype)
574 return false;
575
576 return true;
577 }
578
/* Function vect_analyze_scalar_cycles_1.

   Examine the cross iteration def-use cycles of scalar variables
   in LOOP.  LOOP_VINFO represents the loop that is now being
   considered for vectorization (can be LOOP, or an outer-loop
   enclosing LOOP).  SLP indicates there will be some subsequent
   slp analyses or not.

   Works in two phases: first every loop-header PHI is classified as
   an induction or pushed on a worklist; then the worklist entries are
   classified as (double) reductions, nested cycles, first-order
   recurrences, or left unknown.  */

static void
vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
			      bool slp)
{
  basic_block bb = loop->header;
  tree init, step;
  auto_vec<stmt_vec_info, 64> worklist;
  gphi_iterator gsi;
  bool double_reduc, reduc_chain;

  DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");

  /* First - identify all inductions.  Reduction detection assumes that all the
     inductions have been identified, therefore, this order must not be
     changed.  */
  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gphi *phi = gsi.phi ();
      tree access_fn = NULL;
      tree def = PHI_RESULT (phi);
      stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
			 (gimple *) phi);

      /* Skip virtual phi's.  The data dependences that are associated with
	 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
      if (virtual_operand_p (def))
	continue;

      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;

      /* Analyze the evolution function.  */
      access_fn = analyze_scalar_evolution (loop, def);
      if (access_fn)
	{
	  STRIP_NOPS (access_fn);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Access function of PHI: %T\n", access_fn);
	  STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
	    = initial_condition_in_loop_num (access_fn, loop->num);
	  STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
	    = evolution_part_in_loop_num (access_fn, loop->num);
	}

      /* Not a recognizable linear IV (or, for the loop being vectorized,
	 not a recognizable nonlinear IV either): defer to phase two.  */
      if ((!access_fn
	   || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
	   || !vect_is_simple_iv_evolution (loop->num, access_fn,
					    &init, &step)
	   || (LOOP_VINFO_LOOP (loop_vinfo) != loop
	       && TREE_CODE (step) != INTEGER_CST))
	  /* Only handle nonlinear iv for same loop.  */
	  && (LOOP_VINFO_LOOP (loop_vinfo) != loop
	      || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
						  phi, &init, &step)))
	{
	  worklist.safe_push (stmt_vinfo);
	  continue;
	}

      gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
		  != NULL_TREE);
      gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
    }


  /* Second - identify all reductions and nested cycles.  */
  while (worklist.length () > 0)
    {
      stmt_vec_info stmt_vinfo = worklist.pop ();
      gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
      tree def = PHI_RESULT (phi);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
			 (gimple *) phi);

      gcc_assert (!virtual_operand_p (def)
		  && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);

      stmt_vec_info reduc_stmt_info
	= vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
				    &reduc_chain, slp);
      if (reduc_stmt_info)
	{
	  /* Cross-link the PHI and the reduction statement.  */
	  STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
	  STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
	  if (double_reduc)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Detected double reduction.\n");

	      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
	      STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
	    }
	  else
	    {
	      if (loop != LOOP_VINFO_LOOP (loop_vinfo))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
				     "Detected vectorizable nested cycle.\n");

		  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
		}
	      else
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
				     "Detected reduction.\n");

		  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
		  STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
		  /* Store the reduction cycles for possible vectorization in
		     loop-aware SLP if it was not detected as reduction
		     chain.  */
		  if (! reduc_chain)
		    LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
		      (reduc_stmt_info);
		}
	    }
	}
      else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
	STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
      else
	if (dump_enabled_p ())
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "Unknown def-use cycle pattern.\n");
    }
}
724
725
726 /* Function vect_analyze_scalar_cycles.
727
728 Examine the cross iteration def-use cycles of scalar variables, by
729 analyzing the loop-header PHIs of scalar variables. Classify each
730 cycle as one of the following: invariant, induction, reduction, unknown.
731 We do that for the loop represented by LOOP_VINFO, and also to its
732 inner-loop, if exists.
733 Examples for scalar cycles:
734
735 Example1: reduction:
736
737 loop1:
738 for (i=0; i<N; i++)
739 sum += a[i];
740
741 Example2: induction:
742
743 loop2:
744 for (i=0; i<N; i++)
745 a[i] = i; */
746
747 static void
748 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
749 {
750 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
751
752 vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
753
754 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
755 Reductions in such inner-loop therefore have different properties than
756 the reductions in the nest that gets vectorized:
757 1. When vectorized, they are executed in the same order as in the original
758 scalar loop, so we can't change the order of computation when
759 vectorizing them.
760 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
761 current checks are too strict. */
762
763 if (loop->inner)
764 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
765 }
766
/* Transfer group and reduction information from STMT_INFO to its
   pattern stmt.  Walks the reduction chain starting at STMT_INFO and
   builds the parallel chain over the related pattern stmts, headed by
   the pattern stmt of the chain's first element.  */

static void
vect_fixup_reduc_chain (stmt_vec_info stmt_info)
{
  /* FIRSTP is the pattern stmt replacing the chain head; it must not
     already belong to a group while STMT_INFO must.  */
  stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
  stmt_vec_info stmtp;
  gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
  REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
  /* For each chain member, point its pattern stmt at FIRSTP and link
     it to the pattern stmt of the next member (if any).  */
  do
    {
      stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
      gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
			   == STMT_VINFO_DEF_TYPE (stmt_info));
      REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
      stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
      if (stmt_info)
	REDUC_GROUP_NEXT_ELEMENT (stmtp)
	  = STMT_VINFO_RELATED_STMT (stmt_info);
    }
  while (stmt_info);
}
791
/* Fixup scalar cycles that now have their stmts detected as patterns.
   For each recorded reduction chain either retarget the chain to the
   pattern stmts (when every member was uniformly pattern-recognized and
   kept a valid reduction index) or dissolve the chain and fall back to
   treating it as a regular reduction.  */

static void
vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
{
  stmt_vec_info first;
  unsigned i;

  FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
    {
      /* Scan the chain; NEXT ends up NULL iff all members agree with
	 FIRST on pattern status and have a valid reduction index.  */
      stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
      while (next)
	{
	  if ((STMT_VINFO_IN_PATTERN_P (next)
	       != STMT_VINFO_IN_PATTERN_P (first))
	      || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
	    break;
	  next = REDUC_GROUP_NEXT_ELEMENT (next);
	}
      /* If all reduction chain members are well-formed patterns adjust
	 the group to group the pattern stmts instead.  */
      if (! next
	  && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
	{
	  if (STMT_VINFO_IN_PATTERN_P (first))
	    {
	      vect_fixup_reduc_chain (first);
	      LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
		= STMT_VINFO_RELATED_STMT (first);
	    }
	}
      /* If not all stmt in the chain are patterns or if we failed
	 to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
	 it as regular reduction instead.  */
      else
	{
	  stmt_vec_info vinfo = first;
	  stmt_vec_info last = NULL;
	  while (vinfo)
	    {
	      next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
	      REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
	      REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
	      last = vinfo;
	      vinfo = next;
	    }
	  STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
	    = vect_internal_def;
	  /* Record the tail of the dissolved chain as a plain reduction.  */
	  loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
	  /* unordered_remove compacts the vector; step back so the swapped
	     in element is visited too.  */
	  LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
	  --i;
	}
    }
}
846
/* Function vect_get_loop_niters.

   Determine how many iterations the loop is executed and place it
   in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
   in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
   niter information holds in ASSUMPTIONS.

   MAIN_EXIT is the exit the niter information is computed from; all
   other exits only contribute their controlling GIMPLE_COND.

   Return the loop exit conditions.  */


static vec<gcond *>
vect_get_loop_niters (class loop *loop, const_edge main_exit, tree *assumptions,
		      tree *number_of_iterations, tree *number_of_iterationsm1)
{
  auto_vec<edge> exits = get_loop_exit_edges (loop);
  vec<gcond *> conds;
  conds.create (exits.length ());
  class tree_niter_desc niter_desc;
  tree niter_assumptions, niter, may_be_zero;

  /* Pessimistic defaults in case analysis below fails.  */
  *assumptions = boolean_true_node;
  *number_of_iterationsm1 = chrec_dont_know;
  *number_of_iterations = chrec_dont_know;

  DUMP_VECT_SCOPE ("get_loop_niters");

  if (exits.is_empty ())
    return conds;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n",
		     exits.length ());

  edge exit;
  unsigned int i;
  FOR_EACH_VEC_ELT (exits, i, exit)
    {
      gcond *cond = get_loop_exit_condition (exit);
      if (cond)
	conds.safe_push (cond);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n", i);

      /* Only the main exit determines the iteration count.  */
      if (exit != main_exit)
	continue;

      may_be_zero = NULL_TREE;
      if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
	  || chrec_contains_undetermined (niter_desc.niter))
	continue;

      niter_assumptions = niter_desc.assumptions;
      may_be_zero = niter_desc.may_be_zero;
      niter = niter_desc.niter;

      /* Normalize a trivially-false may_be_zero to "absent".  */
      if (may_be_zero && integer_zerop (may_be_zero))
	may_be_zero = NULL_TREE;

      if (may_be_zero)
	{
	  if (COMPARISON_CLASS_P (may_be_zero))
	    {
	      /* Try to combine may_be_zero with assumptions, this can simplify
		 computation of niter expression.  */
	      if (niter_assumptions && !integer_nonzerop (niter_assumptions))
		niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
						 niter_assumptions,
						 fold_build1 (TRUTH_NOT_EXPR,
							      boolean_type_node,
							      may_be_zero));
	      else
		niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
				     build_int_cst (TREE_TYPE (niter), 0),
				     rewrite_to_non_trapping_overflow (niter));

	      may_be_zero = NULL_TREE;
	    }
	  else if (integer_nonzerop (may_be_zero))
	    {
	      /* The loop provably runs once: 0 latch iterations,
		 1 header execution.  */
	      *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
	      *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
	      continue;
	    }
	  else
	    continue;
	}

      /* Loop assumptions are based off the normal exit.  */
      *assumptions = niter_assumptions;
      *number_of_iterationsm1 = niter;

      /* We want the number of loop header executions which is the number
	 of latch executions plus one.
	 ??? For UINT_MAX latch executions this number overflows to zero
	 for loops like do { n++; } while (n != 0);  */
      if (niter && !chrec_contains_undetermined (niter))
	{
	  niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
			       unshare_expr (niter),
			       build_int_cst (TREE_TYPE (niter), 1));
	  if (TREE_CODE (niter) == INTEGER_CST
	      && TREE_CODE (*number_of_iterationsm1) != INTEGER_CST)
	    {
	      /* If we manage to fold niter + 1 into INTEGER_CST even when
		 niter is some complex expression, ensure back
		 *number_of_iterationsm1 is an INTEGER_CST as well.  See
		 PR113210.  */
	      *number_of_iterationsm1
		= fold_build2 (PLUS_EXPR, TREE_TYPE (niter), niter,
			       build_minus_one_cst (TREE_TYPE (niter)));
	    }
	}
      *number_of_iterations = niter;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n");

  return conds;
}
968
/* Determine the main loop exit for the vectorizer.  */

edge
vec_init_loop_exit_info (class loop *loop)
{
  /* Before we begin we must first determine which exit is the main one and
     which are auxilary exits.  */
  auto_vec<edge> exits = get_loop_exit_edges (loop);
  /* A single-exit loop trivially has that exit as the main one.  */
  if (exits.length () == 1)
    return exits[0];

  /* If we have multiple exits we only support counting IV at the moment.
     Analyze all exits and return the last one we can analyze.  */
  class tree_niter_desc niter_desc;
  edge candidate = NULL;
  for (edge exit : exits)
    {
      /* Exits without an analyzable exit condition cannot serve as the
	 main exit.  */
      if (!get_loop_exit_condition (exit))
	continue;

      /* Only consider exits whose iteration count analysis succeeded and
	 produced a determined niter expression.  */
      if (number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
	  && !chrec_contains_undetermined (niter_desc.niter))
	{
	  tree may_be_zero = niter_desc.may_be_zero;
	  if ((integer_zerop (may_be_zero)
	       /* As we are handling may_be_zero that's not false by
		  rewriting niter to may_be_zero ? 0 : niter we require
		  an empty latch.  */
	       || (single_pred_p (loop->latch)
		   && exit->src == single_pred (loop->latch)
		   && (integer_nonzerop (may_be_zero)
		       || COMPARISON_CLASS_P (may_be_zero))))
	      /* Among viable exits prefer one whose source block is
		 dominated by the current candidate's, i.e. a later exit.  */
	      && (!candidate
		  || dominated_by_p (CDI_DOMINATORS, exit->src,
				     candidate->src)))
	    candidate = exit;
	}
    }

  /* NULL when no exit qualified; callers treat that as "cannot
     vectorize this multi-exit loop".  */
  return candidate;
}
1010
1011 /* Function bb_in_loop_p
1012
1013 Used as predicate for dfs order traversal of the loop bbs. */
1014
1015 static bool
1016 bb_in_loop_p (const_basic_block bb, const void *data)
1017 {
1018 const class loop *const loop = (const class loop *)data;
1019 if (flow_bb_inside_loop_p (loop, bb))
1020 return true;
1021 return false;
1022 }
1023
1024
/* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
   stmt_vec_info structs for all the stmts in LOOP_IN.  */

_loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
  : vec_info (vec_info::loop, shared),
    loop (loop_in),
    bbs (XCNEWVEC (basic_block, loop->num_nodes)),
    num_itersm1 (NULL_TREE),
    num_iters (NULL_TREE),
    num_iters_unchanged (NULL_TREE),
    num_iters_assumptions (NULL_TREE),
    vector_costs (nullptr),
    scalar_costs (nullptr),
    th (0),
    versioning_threshold (0),
    vectorization_factor (0),
    main_loop_edge (nullptr),
    skip_main_loop_edge (nullptr),
    skip_this_loop_edge (nullptr),
    reusable_accumulators (),
    suggested_unroll_factor (1),
    max_vectorization_factor (0),
    mask_skip_niters (NULL_TREE),
    rgroup_compare_type (NULL_TREE),
    simd_if_cond (NULL_TREE),
    partial_vector_style (vect_partial_vectors_none),
    unaligned_dr (NULL),
    peeling_for_alignment (0),
    ptr_mask (0),
    ivexpr_map (NULL),
    scan_map (NULL),
    slp_unrolling_factor (1),
    inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
    vectorizable (false),
    can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
    using_partial_vectors_p (false),
    using_decrementing_iv_p (false),
    using_select_vl_p (false),
    epil_using_partial_vectors_p (false),
    partial_load_store_bias (0),
    peeling_for_gaps (false),
    peeling_for_niter (false),
    early_breaks (false),
    no_data_dependencies (false),
    has_mask_store (false),
    scalar_loop_scaling (profile_probability::uninitialized ()),
    scalar_loop (NULL),
    orig_loop_info (NULL),
    vec_loop_iv_exit (NULL),
    vec_epilogue_loop_iv_exit (NULL),
    scalar_loop_iv_exit (NULL)
{
  /* CHECKME: We want to visit all BBs before their successors (except for
     latch blocks, for which this assertion wouldn't hold).  In the simple
     case of the loop forms we allow, a dfs order of the BBs would the same
     as reversed postorder traversal, so we are safe.  */

  unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
					  bbs, loop->num_nodes, loop);
  gcc_assert (nbbs == loop->num_nodes);

  /* Create a stmt_vec_info for every PHI and every non-debug statement
     in the loop body, resetting UIDs so add_stmt can assign fresh ones.  */
  for (unsigned int i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];
      gimple_stmt_iterator si;

      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
	{
	  gimple *phi = gsi_stmt (si);
	  gimple_set_uid (phi, 0);
	  add_stmt (phi);
	}

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
	{
	  gimple *stmt = gsi_stmt (si);
	  gimple_set_uid (stmt, 0);
	  /* Debug statements get a UID but no stmt_vec_info.  */
	  if (is_gimple_debug (stmt))
	    continue;
	  add_stmt (stmt);
	  /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
	     third argument is the #pragma omp simd if (x) condition, when 0,
	     loop shouldn't be vectorized, when non-zero constant, it should
	     be vectorized normally, otherwise versioned with vectorized loop
	     done if the condition is non-zero at runtime.  */
	  if (loop_in->simduid
	      && is_gimple_call (stmt)
	      && gimple_call_internal_p (stmt)
	      && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
	      && gimple_call_num_args (stmt) >= 3
	      && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
	      && (loop_in->simduid
		  == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
	    {
	      tree arg = gimple_call_arg (stmt, 2);
	      if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
		simd_if_cond = arg;
	      else
		gcc_assert (integer_nonzerop (arg));
	    }
	}
    }

  epilogue_vinfos.create (6);
}
1130
1131 /* Free all levels of rgroup CONTROLS. */
1132
1133 void
1134 release_vec_loop_controls (vec<rgroup_controls> *controls)
1135 {
1136 rgroup_controls *rgc;
1137 unsigned int i;
1138 FOR_EACH_VEC_ELT (*controls, i, rgc)
1139 rgc->controls.release ();
1140 controls->release ();
1141 }
1142
/* Free all memory used by the _loop_vec_info, as well as all the
   stmt_vec_info structs of all the stmts in the loop.  */

_loop_vec_info::~_loop_vec_info ()
{
  /* The basic-block array was XCNEWVEC-allocated in the constructor.  */
  free (bbs);

  release_vec_loop_controls (&masks.rgc_vec);
  release_vec_loop_controls (&lens);
  delete ivexpr_map;
  delete scan_map;
  epilogue_vinfos.release ();
  delete scalar_costs;
  delete vector_costs;

  /* When we release an epiloge vinfo that we do not intend to use
     avoid clearing AUX of the main loop which should continue to
     point to the main loop vinfo since otherwise we'll leak that.  */
  if (loop->aux == this)
    loop->aux = NULL;
}
1164
1165 /* Return an invariant or register for EXPR and emit necessary
1166 computations in the LOOP_VINFO loop preheader. */
1167
1168 tree
1169 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1170 {
1171 if (is_gimple_reg (expr)
1172 || is_gimple_min_invariant (expr))
1173 return expr;
1174
1175 if (! loop_vinfo->ivexpr_map)
1176 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1177 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1178 if (! cached)
1179 {
1180 gimple_seq stmts = NULL;
1181 cached = force_gimple_operand (unshare_expr (expr),
1182 &stmts, true, NULL_TREE);
1183 if (stmts)
1184 {
1185 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1186 gsi_insert_seq_on_edge_immediate (e, stmts);
1187 }
1188 }
1189 return cached;
1190 }
1191
1192 /* Return true if we can use CMP_TYPE as the comparison type to produce
1193 all masks required to mask LOOP_VINFO. */
1194
1195 static bool
1196 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1197 {
1198 rgroup_controls *rgm;
1199 unsigned int i;
1200 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1201 if (rgm->type != NULL_TREE
1202 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1203 cmp_type, rgm->type,
1204 OPTIMIZE_FOR_SPEED))
1205 return false;
1206 return true;
1207 }
1208
1209 /* Calculate the maximum number of scalars per iteration for every
1210 rgroup in LOOP_VINFO. */
1211
1212 static unsigned int
1213 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1214 {
1215 unsigned int res = 1;
1216 unsigned int i;
1217 rgroup_controls *rgm;
1218 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1219 res = MAX (res, rgm->max_nscalars_per_iter);
1220 return res;
1221 }
1222
1223 /* Calculate the minimum precision necessary to represent:
1224
1225 MAX_NITERS * FACTOR
1226
1227 as an unsigned integer, where MAX_NITERS is the maximum number of
1228 loop header iterations for the original scalar form of LOOP_VINFO. */
1229
1230 static unsigned
1231 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1232 {
1233 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1234
1235 /* Get the maximum number of iterations that is representable
1236 in the counter type. */
1237 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1238 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1239
1240 /* Get a more refined estimate for the number of iterations. */
1241 widest_int max_back_edges;
1242 if (max_loop_iterations (loop, &max_back_edges))
1243 max_ni = wi::smin (max_ni, max_back_edges + 1);
1244
1245 /* Work out how many bits we need to represent the limit. */
1246 return wi::min_precision (max_ni * factor, UNSIGNED);
1247 }
1248
/* True if the loop needs peeling or partial vectors when vectorized.  */

static bool
vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
{
  unsigned HOST_WIDE_INT const_vf;
  HOST_WIDE_INT max_niter
    = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));

  /* For an epilogue loop inherit the cost-model threshold of the
     main loop when none was computed for this vinfo.  */
  unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
  if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
    th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
					  (loop_vinfo));

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
    {
      /* Work out the (constant) number of iterations that need to be
	 peeled for reasons other than niters.  */
      unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
	peel_niter += 1;
      /* Known niters: need peeling iff what remains after prologue
	 peeling is not an exact multiple of the VF.  */
      if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
		       LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
	return true;
    }
  else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
	   /* ??? When peeling for gaps but not alignment, we could
	      try to check whether the (variable) niters is known to be
	      VF * N + 1.  That's something of a niche case though.  */
	   || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
	   || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
	   || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
		< (unsigned) exact_log2 (const_vf))
	       /* In case of versioning, check if the maximum number of
		  iterations is greater than th.  If they are identical,
		  the epilogue is unnecessary.  */
	       && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
		   || ((unsigned HOST_WIDE_INT) max_niter
		       /* We'd like to use LOOP_VINFO_VERSIONING_THRESHOLD
			  but that's only computed later based on our result.
			  The following is the most conservative approximation.  */
		       > (std::max ((unsigned HOST_WIDE_INT) th,
				    const_vf) / const_vf) * const_vf))))
    return true;

  return false;
}
1297
/* Each statement in LOOP_VINFO can be masked where necessary.  Check
   whether we can actually generate the masks required.  Return true if so,
   storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE.  */

static bool
vect_verify_full_masking (loop_vec_info loop_vinfo)
{
  unsigned int min_ni_width;

  /* Use a normal loop if there are no statements that need masking.
     This only happens in rare degenerate cases: it means that the loop
     has no loads, no stores, and no live-out values.  */
  if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
    return false;

  /* Produce the rgroup controls.  */
  for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
    {
      vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
      tree vectype = mask.first;
      unsigned nvectors = mask.second;

      /* rgc_vec is indexed by the number of vectors minus one.  */
      if (masks->rgc_vec.length () < nvectors)
	masks->rgc_vec.safe_grow_cleared (nvectors, true);
      rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
      /* The number of scalars per iteration and the number of vectors are
	 both compile-time constants.  */
      unsigned int nscalars_per_iter
	= exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
		     LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();

      /* Record the widest requirement seen for this rgroup.  */
      if (rgm->max_nscalars_per_iter < nscalars_per_iter)
	{
	  rgm->max_nscalars_per_iter = nscalars_per_iter;
	  rgm->type = truth_type_for (vectype);
	  rgm->factor = 1;
	}
    }

  unsigned int max_nscalars_per_iter
    = vect_get_max_nscalars_per_iter (loop_vinfo);

  /* Work out how many bits we need to represent the limit.  */
  min_ni_width
    = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);

  /* Find a scalar mode for which WHILE_ULT is supported.  */
  opt_scalar_int_mode cmp_mode_iter;
  tree cmp_type = NULL_TREE;
  tree iv_type = NULL_TREE;
  widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
  unsigned int iv_precision = UINT_MAX;

  if (iv_limit != -1)
    iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
				      UNSIGNED);

  FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
    {
      unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
      if (cmp_bits >= min_ni_width
	  && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
	{
	  tree this_type = build_nonstandard_integer_type (cmp_bits, true);
	  if (this_type
	      && can_produce_all_loop_masks_p (loop_vinfo, this_type))
	    {
	      /* Although we could stop as soon as we find a valid mode,
		 there are at least two reasons why that's not always the
		 best choice:

		 - An IV that's Pmode or wider is more likely to be reusable
		   in address calculations than an IV that's narrower than
		   Pmode.

		 - Doing the comparison in IV_PRECISION or wider allows
		   a natural 0-based IV, whereas using a narrower comparison
		   type requires mitigations against wrap-around.

		 Conversely, if the IV limit is variable, doing the comparison
		 in a wider type than the original type can introduce
		 unnecessary extensions, so picking the widest valid mode
		 is not always a good choice either.

		 Here we prefer the first IV type that's Pmode or wider,
		 and the first comparison type that's IV_PRECISION or wider.
		 (The comparison type must be no wider than the IV type,
		 to avoid extensions in the vector loop.)

		 ??? We might want to try continuing beyond Pmode for ILP32
		 targets if CMP_BITS < IV_PRECISION.  */
	      iv_type = this_type;
	      if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
		cmp_type = this_type;
	      if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
		break;
	    }
	}
    }

  /* No usable comparison type found: drop the rgroups and decline.  */
  if (!cmp_type)
    {
      LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
      return false;
    }

  LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
  LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
  LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
  return true;
}
1409
/* Each statement in LOOP_VINFO can be masked where necessary.  Check
   whether we can actually generate AVX512 style masks.  Return true if so,
   storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE.  */

static bool
vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
{
  /* Produce differently organized rgc_vec and differently check
     we can produce masks.  */

  /* Use a normal loop if there are no statements that need masking.
     This only happens in rare degenerate cases: it means that the loop
     has no loads, no stores, and no live-out values.  */
  if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
    return false;

  /* For the decrementing IV we need to represent all values in
     [0, niter + niter_skip] where niter_skip is the elements we
     skip in the first iteration for prologue peeling.  */
  tree iv_type = NULL_TREE;
  widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
  unsigned int iv_precision = UINT_MAX;
  if (iv_limit != -1)
    iv_precision = wi::min_precision (iv_limit, UNSIGNED);

  /* First compute the type for the IV we use to track the remaining
     scalar iterations.  */
  opt_scalar_int_mode cmp_mode_iter;
  FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
    {
      unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
      if (cmp_bits >= iv_precision
	  && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
	{
	  iv_type = build_nonstandard_integer_type (cmp_bits, true);
	  if (iv_type)
	    break;
	}
    }
  if (!iv_type)
    return false;

  /* Produce the rgroup controls.  */
  for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
    {
      vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
      tree vectype = mask.first;
      unsigned nvectors = mask.second;

      /* The number of scalars per iteration and the number of vectors are
	 both compile-time constants.  */
      unsigned int nscalars_per_iter
	= exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
		     LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();

      /* We index the rgroup_controls vector with nscalars_per_iter
	 which we keep constant and instead have a varying nvectors,
	 remembering the vector mask with the fewest nV.  */
      if (masks->rgc_vec.length () < nscalars_per_iter)
	masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
      rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];

      if (!rgm->type || rgm->factor > nvectors)
	{
	  rgm->type = truth_type_for (vectype);
	  rgm->compare_type = NULL_TREE;
	  rgm->max_nscalars_per_iter = nscalars_per_iter;
	  rgm->factor = nvectors;
	  rgm->bias_adjusted_ctrl = NULL_TREE;
	}
    }

  /* There is no fixed compare type we are going to use but we have to
     be able to get at one for each mask group.  */
  unsigned int min_ni_width
    = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);

  bool ok = true;
  for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
    {
      tree mask_type = rgc.type;
      /* Skip unused rgroup slots.  */
      if (!mask_type)
	continue;

      /* For now vect_get_loop_mask only supports integer mode masks
	 when we need to split it.  */
      if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_INT
	  || TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
	{
	  ok = false;
	  break;
	}

      /* If iv_type is usable as compare type use that - we can elide the
	 saturation in that case.   */
      if (TYPE_PRECISION (iv_type) >= min_ni_width)
	{
	  tree cmp_vectype
	    = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
	  if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
	    rgc.compare_type = cmp_vectype;
	}
      /* Otherwise fall back to the narrowest integer mode for which the
	 target can expand the required vector comparison.  */
      if (!rgc.compare_type)
	FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
	  {
	    unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
	    if (cmp_bits >= min_ni_width
		&& targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
	      {
		tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
		if (!cmp_type)
		  continue;

		/* Check whether we can produce the mask with cmp_type.  */
		tree cmp_vectype
		  = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
		if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
		  {
		    rgc.compare_type = cmp_vectype;
		    break;
		  }
	      }
	  }
      if (!rgc.compare_type)
	{
	  ok = false;
	  break;
	}
    }
  if (!ok)
    {
      release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
      return false;
    }

  /* error_mark_node signals "per-rgroup compare types", recorded in
     each rgroup's compare_type above.  */
  LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
  LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
  LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
  return true;
}
1550
/* Check whether we can use vector access with length based on precision
   comparison.  So far, to keep it simple, we only allow the case that the
   precision of the target supported length is larger than the precision
   required by loop niters.  */

static bool
vect_verify_loop_lens (loop_vec_info loop_vinfo)
{
  if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
    return false;

  /* The target must support length-controlled loads and stores for the
     chosen vector mode.  */
  machine_mode len_load_mode, len_store_mode;
  if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
	 .exists (&len_load_mode))
    return false;
  if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
	 .exists (&len_store_mode))
    return false;

  signed char partial_load_bias = internal_len_load_store_bias
    (IFN_LEN_LOAD, len_load_mode);

  signed char partial_store_bias = internal_len_load_store_bias
    (IFN_LEN_STORE, len_store_mode);

  /* Loads and stores must agree on the bias.  */
  gcc_assert (partial_load_bias == partial_store_bias);

  if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
    return false;

  /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
     len_loads with a length of zero.  In order to avoid that we prohibit
     more than one loop length here.  */
  if (partial_load_bias == -1
      && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
    return false;

  LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;

  unsigned int max_nitems_per_iter = 1;
  unsigned int i;
  rgroup_controls *rgl;
  /* Find the maximum number of items per iteration for every rgroup.  */
  FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
    {
      unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
      max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
    }

  /* Work out how many bits we need to represent the length limit.  */
  unsigned int min_ni_prec
    = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);

  /* Now use the maximum of below precisions for one suitable IV type:
     - the IV's natural precision
     - the precision needed to hold: the maximum number of scalar
       iterations multiplied by the scale factor (min_ni_prec above)
     - the Pmode precision

     If min_ni_prec is less than the precision of the current niters,
     we prefer to still use the niters type.  Prefer to use Pmode and
     wider IV to avoid narrow conversions.  */

  unsigned int ni_prec
    = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
  min_ni_prec = MAX (min_ni_prec, ni_prec);
  min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));

  tree iv_type = NULL_TREE;
  opt_scalar_int_mode tmode_iter;
  FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
    {
      scalar_mode tmode = tmode_iter.require ();
      unsigned int tbits = GET_MODE_BITSIZE (tmode);

      /* ??? Do we really want to construct one IV whose precision exceeds
	 BITS_PER_WORD?  */
      if (tbits > BITS_PER_WORD)
	break;

      /* Find the first available standard integral type.  */
      if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
	{
	  iv_type = build_nonstandard_integer_type (tbits, true);
	  break;
	}
    }

  if (!iv_type)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "can't vectorize with length-based partial vectors"
			 " because there is no suitable iv type.\n");
      return false;
    }

  LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
  LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
  LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;

  return true;
}
1654
/* Calculate the cost of one scalar iteration of the loop.  */
static void
vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes, factor;
  int innerloop_iters, i;

  DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");

  /* Gather costs for statements in the scalar loop.  */

  /* FORNOW.  */
  innerloop_iters = 1;
  if (loop->inner)
    innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);

  for (i = 0; i < nbbs; i++)
    {
      gimple_stmt_iterator si;
      basic_block bb = bbs[i];

      /* Statements in the inner loop execute more often; scale their
	 cost accordingly.  */
      if (bb->loop_father == loop->inner)
	factor = innerloop_iters;
      else
	factor = 1;

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
	{
	  gimple *stmt = gsi_stmt (si);
	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);

	  if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
	    continue;

	  /* Skip stmts that are not vectorized inside the loop.  */
	  stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
	  if (!STMT_VINFO_RELEVANT_P (vstmt_info)
	      && (!STMT_VINFO_LIVE_P (vstmt_info)
		  || !VECTORIZABLE_CYCLE_DEF
			(STMT_VINFO_DEF_TYPE (vstmt_info))))
	    continue;

	  /* Classify the statement as a load, store or generic
	     scalar operation for costing purposes.  */
	  vect_cost_for_stmt kind;
	  if (STMT_VINFO_DATA_REF (stmt_info))
	    {
	      if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
		kind = scalar_load;
	      else
		kind = scalar_store;
	    }
	  else if (vect_nop_conversion_p (stmt_info))
	    continue;
	  else
	    kind = scalar_stmt;

	  /* We are using vect_prologue here to avoid scaling twice
	     by the inner loop factor.  */
	  record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
			    factor, kind, stmt_info, 0, vect_prologue);
	}
    }

  /* Now accumulate cost.  */
  loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
  add_stmt_costs (loop_vinfo->scalar_costs,
		  &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
  loop_vinfo->scalar_costs->finish_cost (nullptr);
}
1725
/* Function vect_analyze_loop_form.

   Verify that certain CFG restrictions hold, including:
   - the loop has a pre-header
   - the loop has a single entry
   - nested loops can have only a single exit.
   - the loop exit condition is simple enough
   - the number of iterations can be analyzed, i.e, a countable loop.  The
     niter could be analyzed under some assumptions.

   Fills in INFO on success; returns an opt_result describing the
   failure reason otherwise.  */

opt_result
vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
{
  DUMP_VECT_SCOPE ("vect_analyze_loop_form");

  /* Pick the exit the vectorizer will treat as the main one.  */
  edge exit_e = vec_init_loop_exit_info (loop);
  if (!exit_e)
    return opt_result::failure_at (vect_location,
				   "not vectorized:"
				   " could not determine main exit from"
				   " loop with multiple exits.\n");
  info->loop_exit = exit_e;
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "using as main loop exit: %d -> %d [AUX: %p]\n",
		     exit_e->src->index, exit_e->dest->index, exit_e->aux);

  /* Check if we have any control flow that doesn't leave the loop.  */
  class loop *v_loop = loop->inner ? loop->inner : loop;
  basic_block *bbs = get_loop_body (v_loop);
  for (unsigned i = 0; i < v_loop->num_nodes; i++)
    if (EDGE_COUNT (bbs[i]->succs) != 1
	&& (EDGE_COUNT (bbs[i]->succs) != 2
	    || !loop_exits_from_bb_p (bbs[i]->loop_father, bbs[i])))
      {
	free (bbs);
	return opt_result::failure_at (vect_location,
				       "not vectorized:"
				       " unsupported control flow in loop.\n");
      }
  free (bbs);

  /* Different restrictions apply when we are considering an inner-most loop,
     vs. an outer (nested) loop.
     (FORNOW. May want to relax some of these restrictions in the future).  */

  info->inner_loop_cond = NULL;
  if (!loop->inner)
    {
      /* Inner-most loop.  */

      if (empty_block_p (loop->header))
	return opt_result::failure_at (vect_location,
				       "not vectorized: empty loop.\n");
    }
  else
    {
      class loop *innerloop = loop->inner;
      edge entryedge;

      /* Nested loop.  We currently require that the loop is doubly-nested,
	 contains a single inner loop with a single exit to the block
	 with the single exit condition in the outer loop.
	 Vectorizable outer-loops look like this:

			(pre-header)
			   |
			  header <---+
			   |         |
			  inner-loop |
			   |         |
			  tail ------+
			   |
			(exit-bb)

	 The inner-loop also has the properties expected of inner-most loops
	 as described above.  */

      if ((loop->inner)->inner || (loop->inner)->next)
	return opt_result::failure_at (vect_location,
				       "not vectorized:"
				       " multiple nested loops.\n");

      entryedge = loop_preheader_edge (innerloop);
      if (entryedge->src != loop->header
	  || !single_exit (innerloop)
	  || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
	return opt_result::failure_at (vect_location,
				       "not vectorized:"
				       " unsupported outerloop form.\n");

      /* Analyze the inner-loop.  */
      vect_loop_form_info inner;
      opt_result res = vect_analyze_loop_form (loop->inner, &inner);
      if (!res)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "not vectorized: Bad inner loop.\n");
	  return res;
	}

      /* Don't support analyzing niter under assumptions for inner
	 loop.  */
      if (!integer_onep (inner.assumptions))
	return opt_result::failure_at (vect_location,
				       "not vectorized: Bad inner loop.\n");

      if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
	return opt_result::failure_at (vect_location,
				       "not vectorized: inner-loop count not"
				       " invariant.\n");

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Considering outer-loop vectorization.\n");
      info->inner_loop_cond = inner.conds[0];
    }

  /* The pre-header edge plus the latch edge.  */
  if (EDGE_COUNT (loop->header->preds) != 2)
    return opt_result::failure_at (vect_location,
				   "not vectorized:"
				   " too many incoming edges.\n");

  /* We assume that the latch is empty.  */
  if (!empty_block_p (loop->latch)
      || !gimple_seq_empty_p (phi_nodes (loop->latch)))
    return opt_result::failure_at (vect_location,
				   "not vectorized: latch block not empty.\n");

  /* Make sure there is no abnormal exit.  */
  auto_vec<edge> exits = get_loop_exit_edges (loop);
  for (edge e : exits)
    {
      if (e->flags & EDGE_ABNORMAL)
	return opt_result::failure_at (vect_location,
				       "not vectorized:"
				       " abnormal loop exit edge.\n");
    }

  info->conds
    = vect_get_loop_niters (loop, exit_e, &info->assumptions,
			    &info->number_of_iterations,
			    &info->number_of_iterationsm1);
  if (info->conds.is_empty ())
    return opt_result::failure_at
	(vect_location,
	 "not vectorized: complicated exit condition.\n");

  /* Determine what the primary and alternate exit conds are.  */
  for (unsigned i = 0; i < info->conds.length (); i++)
    {
      gcond *cond = info->conds[i];
      /* Move the main exit's condition to slot 0.  */
      if (exit_e->src == gimple_bb (cond))
	std::swap (info->conds[0], info->conds[i]);
    }

  if (integer_zerop (info->assumptions)
      || !info->number_of_iterations
      || chrec_contains_undetermined (info->number_of_iterations))
    return opt_result::failure_at
      (info->conds[0],
       "not vectorized: number of iterations cannot be computed.\n");

  if (integer_zerop (info->number_of_iterations))
    return opt_result::failure_at
      (info->conds[0],
       "not vectorized: number of iterations = 0.\n");

  /* A symbolic iteration count is still vectorizable; just note it.  */
  if (!(tree_fits_shwi_p (info->number_of_iterations)
	&& tree_to_shwi (info->number_of_iterations) > 0))
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "Symbolic number of iterations is ");
	  dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
	  dump_printf (MSG_NOTE, "\n");
	}
    }

  return opt_result::success ();
}
1909
/* Create a loop_vec_info for LOOP with SHARED and the
   vect_analyze_loop_form result INFO.  MAIN_LOOP_INFO, if non-NULL, is
   the loop_vec_info of the main vectorized loop when LOOP is being
   analyzed as its epilogue.  */

loop_vec_info
vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
			const vect_loop_form_info *info,
			loop_vec_info main_loop_info)
{
  loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
  LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
  LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
  LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
  /* Also record the assumptions for versioning.  Epilogues inherit the
     versioning decision from the main loop, so skip them here.  */
  if (!integer_onep (info->assumptions) && !main_loop_info)
    LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;

  for (gcond *cond : info->conds)
    {
      stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond);
      STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
      /* Mark the statement as a condition.  */
      STMT_VINFO_DEF_TYPE (loop_cond_info) = vect_condition_def;
    }

  /* info->conds[0] is the primary (IV) exit condition; any remaining
     conditions are alternate exits.  */
  for (unsigned i = 1; i < info->conds.length (); i ++)
    LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (info->conds[i]);
  LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[0];

  LOOP_VINFO_IV_EXIT (loop_vinfo) = info->loop_exit;

  /* Check to see if we're vectorizing multiple exits.  */
  LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
    = !LOOP_VINFO_LOOP_CONDS (loop_vinfo).is_empty ();

  if (info->inner_loop_cond)
    {
      stmt_vec_info inner_loop_cond_info
	= loop_vinfo->lookup_stmt (info->inner_loop_cond);
      STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
      /* If we have an estimate on the number of iterations of the inner
	 loop use that to limit the scale for costing, otherwise use
	 --param vect-inner-loop-cost-factor literally.  */
      widest_int nit;
      if (estimated_stmt_executions (loop->inner, &nit))
	LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
	  = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
    }

  return loop_vinfo;
}
1961
1962
1963
1964 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1965 statements update the vectorization factor. */
1966
1967 static void
1968 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1969 {
1970 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1971 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1972 int nbbs = loop->num_nodes;
1973 poly_uint64 vectorization_factor;
1974 int i;
1975
1976 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1977
1978 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1979 gcc_assert (known_ne (vectorization_factor, 0U));
1980
1981 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1982 vectorization factor of the loop is the unrolling factor required by
1983 the SLP instances. If that unrolling factor is 1, we say, that we
1984 perform pure SLP on loop - cross iteration parallelism is not
1985 exploited. */
1986 bool only_slp_in_loop = true;
1987 for (i = 0; i < nbbs; i++)
1988 {
1989 basic_block bb = bbs[i];
1990 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1991 gsi_next (&si))
1992 {
1993 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1994 if (!stmt_info)
1995 continue;
1996 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1997 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1998 && !PURE_SLP_STMT (stmt_info))
1999 /* STMT needs both SLP and loop-based vectorization. */
2000 only_slp_in_loop = false;
2001 }
2002 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2003 gsi_next (&si))
2004 {
2005 if (is_gimple_debug (gsi_stmt (si)))
2006 continue;
2007 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2008 stmt_info = vect_stmt_to_vectorize (stmt_info);
2009 if ((STMT_VINFO_RELEVANT_P (stmt_info)
2010 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2011 && !PURE_SLP_STMT (stmt_info))
2012 /* STMT needs both SLP and loop-based vectorization. */
2013 only_slp_in_loop = false;
2014 }
2015 }
2016
2017 if (only_slp_in_loop)
2018 {
2019 if (dump_enabled_p ())
2020 dump_printf_loc (MSG_NOTE, vect_location,
2021 "Loop contains only SLP stmts\n");
2022 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
2023 }
2024 else
2025 {
2026 if (dump_enabled_p ())
2027 dump_printf_loc (MSG_NOTE, vect_location,
2028 "Loop contains SLP and non-SLP stmts\n");
2029 /* Both the vectorization factor and unroll factor have the form
2030 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
2031 so they must have a common multiple. */
2032 vectorization_factor
2033 = force_common_multiple (vectorization_factor,
2034 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
2035 }
2036
2037 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
2038 if (dump_enabled_p ())
2039 {
2040 dump_printf_loc (MSG_NOTE, vect_location,
2041 "Updating vectorization factor to ");
2042 dump_dec (MSG_NOTE, vectorization_factor);
2043 dump_printf (MSG_NOTE, ".\n");
2044 }
2045 }
2046
2047 /* Return true if STMT_INFO describes a double reduction phi and if
2048 the other phi in the reduction is also relevant for vectorization.
2049 This rejects cases such as:
2050
2051 outer1:
2052 x_1 = PHI <x_3(outer2), ...>;
2053 ...
2054
2055 inner:
2056 x_2 = ...;
2057 ...
2058
2059 outer2:
2060 x_3 = PHI <x_2(inner)>;
2061
2062 if nothing in x_2 or elsewhere makes x_1 relevant. */
2063
2064 static bool
2065 vect_active_double_reduction_p (stmt_vec_info stmt_info)
2066 {
2067 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2068 return false;
2069
2070 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
2071 }
2072
/* Function vect_analyze_loop_operations.

   Scan the loop stmts and make sure they are all vectorizable.
   Returns success if every relevant stmt can be vectorized and at
   least one stmt actually needs vectorization.  */

static opt_result
vect_analyze_loop_operations (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  int i;
  stmt_vec_info stmt_info;
  bool need_to_vectorize = false;
  bool ok;

  DUMP_VECT_SCOPE ("vect_analyze_loop_operations");

  /* Costs recorded by the vectorizable_* routines below; committed to
     the loop's vector_costs once the whole scan succeeds.  */
  auto_vec<stmt_info_for_cost> cost_vec;

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  gphi *phi = si.phi ();
	  ok = true;

	  stmt_info = loop_vinfo->lookup_stmt (phi);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
			     (gimple *) phi);
	  /* Virtual (memory SSA) phis need no vectorization.  */
	  if (virtual_operand_p (gimple_phi_result (phi)))
	    continue;

	  /* Inner-loop loop-closed exit phi in outer-loop vectorization
	     (i.e., a phi in the tail of the outer-loop).  */
	  if (! is_loop_header_bb_p (bb))
	    {
	      /* FORNOW: we currently don't support the case that these phis
		 are not used in the outerloop (unless it is double reduction,
		 i.e., this phi is vect_reduction_def), cause this case
		 requires to actually do something here.  */
	      if (STMT_VINFO_LIVE_P (stmt_info)
		  && !vect_active_double_reduction_p (stmt_info))
		return opt_result::failure_at (phi,
					       "Unsupported loop-closed phi"
					       " in outer-loop.\n");

	      /* If PHI is used in the outer loop, we check that its operand
		 is defined in the inner loop.  */
	      if (STMT_VINFO_RELEVANT_P (stmt_info))
		{
		  tree phi_op;

		  if (gimple_phi_num_args (phi) != 1)
		    return opt_result::failure_at (phi, "unsupported phi");

		  phi_op = PHI_ARG_DEF (phi, 0);
		  stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
		  if (!op_def_info)
		    return opt_result::failure_at (phi, "unsupported phi\n");

		  if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
		      && (STMT_VINFO_RELEVANT (op_def_info)
			  != vect_used_in_outer_by_reduction))
		    return opt_result::failure_at (phi, "unsupported phi\n");

		  if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
		       || (STMT_VINFO_DEF_TYPE (stmt_info)
			   == vect_double_reduction_def))
		      && !vectorizable_lc_phi (loop_vinfo,
					       stmt_info, NULL, NULL))
		    return opt_result::failure_at (phi, "unsupported phi\n");
		}

	      continue;
	    }

	  gcc_assert (stmt_info);

	  if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
	       || STMT_VINFO_LIVE_P (stmt_info))
	      && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
	      && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
	    /* A scalar-dependence cycle that we don't support.  */
	    return opt_result::failure_at (phi,
					   "not vectorized:"
					   " scalar dependence cycle.\n");

	  /* Dispatch on the def kind to the matching analysis routine;
	     pure-SLP phis are analyzed by the SLP machinery instead.  */
	  if (STMT_VINFO_RELEVANT_P (stmt_info))
	    {
	      need_to_vectorize = true;
	      if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
		  && ! PURE_SLP_STMT (stmt_info))
		ok = vectorizable_induction (loop_vinfo,
					     stmt_info, NULL, NULL,
					     &cost_vec);
	      else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
			|| (STMT_VINFO_DEF_TYPE (stmt_info)
			    == vect_double_reduction_def)
			|| STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
		       && ! PURE_SLP_STMT (stmt_info))
		ok = vectorizable_reduction (loop_vinfo,
					     stmt_info, NULL, NULL, &cost_vec);
	      else if ((STMT_VINFO_DEF_TYPE (stmt_info)
			== vect_first_order_recurrence)
		       && ! PURE_SLP_STMT (stmt_info))
		ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
					  &cost_vec);
	    }

	  /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
	  if (ok
	      && STMT_VINFO_LIVE_P (stmt_info)
	      && !PURE_SLP_STMT (stmt_info))
	    ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
					      -1, false, &cost_vec);

	  if (!ok)
	    return opt_result::failure_at (phi,
					   "not vectorized: relevant phi not "
					   "supported: %G",
					   static_cast <gimple *> (phi));
	}

      for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  gimple *stmt = gsi_stmt (si);
	  /* Clobbers and debug stmts are irrelevant to vectorization.  */
	  if (!gimple_clobber_p (stmt)
	      && !is_gimple_debug (stmt))
	    {
	      opt_result res
		= vect_analyze_stmt (loop_vinfo,
				     loop_vinfo->lookup_stmt (stmt),
				     &need_to_vectorize,
				     NULL, NULL, &cost_vec);
	      if (!res)
		return res;
	    }
	}
    } /* bbs */

  add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);

  /* All operations in the loop are either irrelevant (deal with loop
     control, or dead), or only used outside the loop and can be moved
     out of the loop (e.g. invariants, inductions).  The loop can be
     optimized away by scalar optimizations.  We're better off not
     touching this loop.  */
  if (!need_to_vectorize)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "All the computation can be taken out of the loop.\n");
      return opt_result::failure_at
	(vect_location,
	 "not vectorized: redundant loop. no profit to vectorize.\n");
    }

  return opt_result::success ();
}
2237
2238 /* Return true if we know that the iteration count is smaller than the
2239 vectorization factor. Return false if it isn't, or if we can't be sure
2240 either way. */
2241
2242 static bool
2243 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
2244 {
2245 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2246
2247 HOST_WIDE_INT max_niter;
2248 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2249 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
2250 else
2251 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2252
2253 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
2254 return true;
2255
2256 return false;
2257 }
2258
/* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
   is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
   definitely no, or -1 if it's worth retrying.  If non-NULL,
   *SUGGESTED_UNROLL_FACTOR is filled in via
   vect_estimate_min_profitable_iters.  */

static int
vect_analyze_loop_costing (loop_vec_info loop_vinfo,
			   unsigned *suggested_unroll_factor)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);

  /* Only loops that can handle partially-populated vectors can have iteration
     counts less than the vectorization factor.  */
  if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
      && vect_known_niters_smaller_than_vf (loop_vinfo))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: iteration count smaller than "
			 "vectorization factor.\n");
      return 0;
    }

  /* If we know the number of iterations we can do better, for the
     epilogue we can also decide whether the main loop leaves us
     with enough iterations, preferring a smaller vector epilogue that
     is then also possibly used for the case we skip the vector loop.  */
  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
    {
      widest_int scalar_niters
	= wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
      if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
	{
	  loop_vec_info orig_loop_vinfo
	    = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
	  unsigned lowest_vf
	    = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
	  int prolog_peeling = 0;
	  if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
	    prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo);
	  if (prolog_peeling >= 0
	      && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
			   lowest_vf))
	    {
	      /* Number of scalar iterations left for the epilogue:
		 the remainder of (niters - gap - prolog peeling) modulo
		 the main loop's VF, plus the gap iteration if the main
		 loop peels for gaps.  */
	      unsigned gap
		= LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0;
	      scalar_niters = ((scalar_niters - gap - prolog_peeling)
			       % lowest_vf + gap);
	    }
	}
      /* Reject vectorizing for a single scalar iteration, even if
	 we could in principle implement that using partial vectors.  */
      unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
      if (scalar_niters <= peeling_gap + 1)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "not vectorized: loop only has a single "
			     "scalar iteration.\n");
	  return 0;
	}

      if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
	{
	  /* Check that the loop processes at least one full vector.  */
	  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
	  if (known_lt (scalar_niters, vf))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "loop does not have enough iterations "
				 "to support vectorization.\n");
	      return 0;
	    }

	  /* If we need to peel an extra epilogue iteration to handle data
	     accesses with gaps, check that there are enough scalar iterations
	     available.

	     The check above is redundant with this one when peeling for gaps,
	     but the distinction is useful for diagnostics.  */
	  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
	      && known_le (scalar_niters, vf))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "loop does not have enough iterations "
				 "to support peeling for gaps.\n");
	      return 0;
	    }
	}
    }

  /* If using the "very cheap" model. reject cases in which we'd keep
     a copy of the scalar code (even if we might be able to vectorize it).  */
  if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
      && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
	  || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
	  || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "some scalar iterations would need to be peeled\n");
      return 0;
    }

  int min_profitable_iters, min_profitable_estimate;
  vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
				      &min_profitable_estimate,
				      suggested_unroll_factor);

  /* A negative threshold means the vector loop can never win.  */
  if (min_profitable_iters < 0)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: vectorization not profitable.\n");
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: vector version will never be "
			 "profitable.\n");
      return -1;
    }

  int min_scalar_loop_bound = (param_min_vect_loop_bound
			       * assumed_vf);

  /* Use the cost model only if it is more conservative than user specified
     threshold.  */
  unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
				    min_profitable_iters);

  LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: vectorization not profitable.\n");
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "not vectorized: iteration count smaller than user "
			 "specified loop bound parameter or minimum profitable "
			 "iterations (whichever is more conservative).\n");
      return 0;
    }

  /* The static profitability threshold min_profitable_estimate includes
     the cost of having to check at runtime whether the scalar loop
     should be used instead.  If it turns out that we don't need or want
     such a check, the threshold we should use for the static estimate
     is simply the point at which the vector loop becomes more profitable
     than the scalar loop.  */
  if (min_profitable_estimate > min_profitable_iters
      && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
      && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
      && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
      && !vect_apply_runtime_profitability_check_p (loop_vinfo))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
			 " choice between the scalar and vector loops\n");
      min_profitable_estimate = min_profitable_iters;
    }

  /* If the vector loop needs multiple iterations to be beneficial then
     things are probably too close to call, and the conservative thing
     would be to stick with the scalar code.  */
  if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
      && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "one iteration of the vector loop would be"
			 " more expensive than the equivalent number of"
			 " iterations of the scalar loop\n");
      return 0;
    }

  HOST_WIDE_INT estimated_niter;

  /* If we are vectorizing an epilogue then we know the maximum number of
     scalar iterations it will cover is at least one lower than the
     vectorization factor of the main loop.  */
  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
    estimated_niter
      = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
  else
    {
      estimated_niter = estimated_stmt_executions_int (loop);
      if (estimated_niter == -1)
	estimated_niter = likely_max_stmt_executions_int (loop);
    }
  if (estimated_niter != -1
      && ((unsigned HOST_WIDE_INT) estimated_niter
	  < MAX (th, (unsigned) min_profitable_estimate)))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: estimated iteration count too "
			 "small.\n");
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "not vectorized: estimated iteration count smaller "
			 "than specified loop bound parameter or minimum "
			 "profitable iterations (whichever is more "
			 "conservative).\n");
      return -1;
    }

  return 1;
}
2471
/* Find all data references in LOOP, whose body is given by the blocks
   in BBS, and record them in *DATAREFS.  Count the non-debug statements
   scanned in *N_STMTS.  Fail when a statement's data reference cannot
   be analyzed, except for calls in a loop with a safelen attribute
   that have "#pragma omp declare simd" clones and no data reference in
   the call stmt itself, which are skipped.  Also fail if the number of
   data references exceeds param loop-max-datarefs-for-datadeps.  */

static opt_result
vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
			   vec<data_reference_p> *datarefs,
			   unsigned int *n_stmts)
{
  *n_stmts = 0;
  for (unsigned i = 0; i < loop->num_nodes; i++)
    for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
	 !gsi_end_p (gsi); gsi_next (&gsi))
      {
	gimple *stmt = gsi_stmt (gsi);
	if (is_gimple_debug (stmt))
	  continue;
	++(*n_stmts);
	opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
							NULL, 0);
	if (!res)
	  {
	    if (is_gimple_call (stmt) && loop->safelen)
	      {
		tree fndecl = gimple_call_fndecl (stmt), op;
		/* For an IFN_MASK_CALL the real callee is passed as the
		   first argument, wrapped in an ADDR_EXPR.  */
		if (fndecl == NULL_TREE
		    && gimple_call_internal_p (stmt, IFN_MASK_CALL))
		  {
		    fndecl = gimple_call_arg (stmt, 0);
		    gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
		    fndecl = TREE_OPERAND (fndecl, 0);
		    gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
		  }
		if (fndecl != NULL_TREE)
		  {
		    cgraph_node *node = cgraph_node::get (fndecl);
		    if (node != NULL && node->simd_clones != NULL)
		      {
			/* Look for a data reference among the call's
			   arguments or its lhs: a decl or an analyzable
			   memory reference.  */
			unsigned int j, n = gimple_call_num_args (stmt);
			for (j = 0; j < n; j++)
			  {
			    op = gimple_call_arg (stmt, j);
			    if (DECL_P (op)
				|| (REFERENCE_CLASS_P (op)
				    && get_base_address (op)))
			      break;
			  }
			op = gimple_call_lhs (stmt);
			/* Ignore #pragma omp declare simd functions
			   if they don't have data references in the
			   call stmt itself.  */
			if (j == n
			    && !(op
				 && (DECL_P (op)
				     || (REFERENCE_CLASS_P (op)
					 && get_base_address (op)))))
			  continue;
		      }
		  }
	      }
	    return res;
	  }
	/* If dependence analysis will give up due to the limit on the
	   number of datarefs stop here and fail fatally.  */
	if (datarefs->length ()
	    > (unsigned)param_loop_max_datarefs_for_datadeps)
	  return opt_result::failure_at (stmt, "exceeded param "
					 "loop-max-datarefs-for-datadeps\n");
      }
  return opt_result::success ();
}
2539
/* Look for SLP-only access groups and turn each individual access into its own
   group.  Such groups were marked by the data-ref analysis as only
   vectorizable with SLP; when SLP did not apply to a member we must
   undo the grouping so loop-based vectorization can handle the
   accesses individually.  */
static void
vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
{
  unsigned int i;
  struct data_reference *dr;

  DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");

  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      gcc_assert (DR_REF (dr));
      stmt_vec_info stmt_info
	= vect_stmt_to_vectorize (loop_vinfo->lookup_stmt (DR_STMT (dr)));

      /* Check if the load is a part of an interleaving chain.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
	{
	  stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
	  dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
	  unsigned int group_size = DR_GROUP_SIZE (first_element);

	  /* Check if SLP-only groups.  */
	  if (!STMT_SLP_TYPE (stmt_info)
	      && STMT_VINFO_SLP_VECT_ONLY (first_element))
	    {
	      /* Dissolve the group.  */
	      STMT_VINFO_SLP_VECT_ONLY (first_element) = false;

	      /* Walk the chain, making each member its own group of
		 size one.  */
	      stmt_vec_info vinfo = first_element;
	      while (vinfo)
		{
		  stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
		  DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
		  DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
		  DR_GROUP_SIZE (vinfo) = 1;
		  if (STMT_VINFO_STRIDED_P (first_element)
		      /* We cannot handle stores with gaps.  */
		      || DR_IS_WRITE (dr_info->dr))
		    {
		      STMT_VINFO_STRIDED_P (vinfo) = true;
		      DR_GROUP_GAP (vinfo) = 0;
		    }
		  else
		    /* A dissolved member skips the other group_size - 1
		       elements between consecutive accesses.  */
		    DR_GROUP_GAP (vinfo) = group_size - 1;
		  /* Duplicate and adjust alignment info, it needs to
		     be present on each group leader, see dr_misalignment.  */
		  if (vinfo != first_element)
		    {
		      dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
		      dr_info2->target_alignment = dr_info->target_alignment;
		      int misalignment = dr_info->misalignment;
		      if (misalignment != DR_MISALIGNMENT_UNKNOWN)
			{
			  /* Shift the leader's misalignment by this
			     member's offset from the leader.  */
			  HOST_WIDE_INT diff
			    = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
			       - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
			  unsigned HOST_WIDE_INT align_c
			    = dr_info->target_alignment.to_constant ();
			  misalignment = (misalignment + diff) % align_c;
			}
		      dr_info2->misalignment = misalignment;
		    }
		  vinfo = next;
		}
	    }
	}
    }
}
2611
2612 /* Determine if operating on full vectors for LOOP_VINFO might leave
2613 some scalar iterations still to do. If so, decide how we should
2614 handle those scalar iterations. The possibilities are:
2615
2616 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2617 In this case:
2618
2619 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2620 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2621 LOOP_VINFO_PEELING_FOR_NITER == false
2622
2623 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2624 to handle the remaining scalar iterations. In this case:
2625
2626 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2627 LOOP_VINFO_PEELING_FOR_NITER == true
2628
2629 There are two choices:
2630
2631 (2a) Consider vectorizing the epilogue loop at the same VF as the
2632 main loop, but using partial vectors instead of full vectors.
2633 In this case:
2634
2635 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2636
2637 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2638 In this case:
2639
2640 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2641 */
2642
2643 opt_result
2644 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
2645 {
2646 /* Determine whether there would be any scalar iterations left over. */
2647 bool need_peeling_or_partial_vectors_p
2648 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2649
2650 /* Decide whether to vectorize the loop with partial vectors. */
2651 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2652 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2653 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2654 && need_peeling_or_partial_vectors_p)
2655 {
2656 /* For partial-vector-usage=1, try to push the handling of partial
2657 vectors to the epilogue, with the main loop continuing to operate
2658 on full vectors.
2659
2660 If we are unrolling we also do not want to use partial vectors. This
2661 is to avoid the overhead of generating multiple masks and also to
2662 avoid having to execute entire iterations of FALSE masked instructions
2663 when dealing with one or less full iterations.
2664
2665 ??? We could then end up failing to use partial vectors if we
2666 decide to peel iterations into a prologue, and if the main loop
2667 then ends up processing fewer than VF iterations. */
2668 if ((param_vect_partial_vector_usage == 1
2669 || loop_vinfo->suggested_unroll_factor > 1)
2670 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2671 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2672 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2673 else
2674 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2675 }
2676
2677 if (dump_enabled_p ())
2678 dump_printf_loc (MSG_NOTE, vect_location,
2679 "operating on %s vectors%s.\n",
2680 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2681 ? "partial" : "full",
2682 LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2683 ? " for epilogue loop" : "");
2684
2685 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2686 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2687 && need_peeling_or_partial_vectors_p);
2688
2689 /* We set LOOP_VINFO_USING_SELECT_VL_P as true before loop vectorization
2690 analysis that we don't know whether the loop is vectorized by partial
2691 vectors (More details see tree-vect-loop-manip.cc).
2692
2693 However, SELECT_VL vectorizaton style should only applied on partial
2694 vectorization since SELECT_VL is the GIMPLE IR that calculates the
2695 number of elements to be process for each iteration.
2696
2697 After loop vectorization analysis, Clear LOOP_VINFO_USING_SELECT_VL_P
2698 if it is not partial vectorized loop. */
2699 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2700 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2701
2702 return opt_result::success ();
2703 }
2704
2705 /* Function vect_analyze_loop_2.
2706
2707 Apply a set of analyses on LOOP specified by LOOP_VINFO, the different
2708 analyses will record information in some members of LOOP_VINFO. FATAL
2709 indicates if some analysis meets fatal error. If one non-NULL pointer
2710 SUGGESTED_UNROLL_FACTOR is provided, it's intent to be filled with one
2711 worked out suggested unroll factor, while one NULL pointer shows it's
2712 going to apply the suggested unroll factor. SLP_DONE_FOR_SUGGESTED_UF
2713 is to hold the slp decision when the suggested unroll factor is worked
2714 out. */
2715 static opt_result
2716 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2717 unsigned *suggested_unroll_factor,
2718 bool& slp_done_for_suggested_uf)
2719 {
2720 opt_result ok = opt_result::success ();
2721 int res;
2722 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2723 poly_uint64 min_vf = 2;
2724 loop_vec_info orig_loop_vinfo = NULL;
2725
2726 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2727 loop_vec_info of the first vectorized loop. */
2728 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2729 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2730 else
2731 orig_loop_vinfo = loop_vinfo;
2732 gcc_assert (orig_loop_vinfo);
2733
2734 /* The first group of checks is independent of the vector size. */
2735 fatal = true;
2736
2737 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2738 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2739 return opt_result::failure_at (vect_location,
2740 "not vectorized: simd if(0)\n");
2741
2742 /* Find all data references in the loop (which correspond to vdefs/vuses)
2743 and analyze their evolution in the loop. */
2744
2745 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2746
2747 /* Gather the data references and count stmts in the loop. */
2748 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2749 {
2750 opt_result res
2751 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2752 &LOOP_VINFO_DATAREFS (loop_vinfo),
2753 &LOOP_VINFO_N_STMTS (loop_vinfo));
2754 if (!res)
2755 {
2756 if (dump_enabled_p ())
2757 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2758 "not vectorized: loop contains function "
2759 "calls or data references that cannot "
2760 "be analyzed\n");
2761 return res;
2762 }
2763 loop_vinfo->shared->save_datarefs ();
2764 }
2765 else
2766 loop_vinfo->shared->check_datarefs ();
2767
2768 /* Analyze the data references and also adjust the minimal
2769 vectorization factor according to the loads and stores. */
2770
2771 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2772 if (!ok)
2773 {
2774 if (dump_enabled_p ())
2775 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2776 "bad data references.\n");
2777 return ok;
2778 }
2779
2780 /* Check if we are applying unroll factor now. */
2781 bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2782 gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2783
2784 /* If the slp decision is false when suggested unroll factor is worked
2785 out, and we are applying suggested unroll factor, we can simply skip
2786 all slp related analyses this time. */
2787 bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2788
2789 /* Classify all cross-iteration scalar data-flow cycles.
2790 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2791 vect_analyze_scalar_cycles (loop_vinfo, slp);
2792
2793 vect_pattern_recog (loop_vinfo);
2794
2795 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2796
2797 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2798 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2799
2800 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2801 if (!ok)
2802 {
2803 if (dump_enabled_p ())
2804 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2805 "bad data access.\n");
2806 return ok;
2807 }
2808
2809 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2810
2811 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2812 if (!ok)
2813 {
2814 if (dump_enabled_p ())
2815 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2816 "unexpected pattern.\n");
2817 return ok;
2818 }
2819
2820 /* While the rest of the analysis below depends on it in some way. */
2821 fatal = false;
2822
2823 /* Analyze data dependences between the data-refs in the loop
2824 and adjust the maximum vectorization factor according to
2825 the dependences.
2826 FORNOW: fail at the first data dependence that we encounter. */
2827
2828 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2829 if (!ok)
2830 {
2831 if (dump_enabled_p ())
2832 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2833 "bad data dependence.\n");
2834 return ok;
2835 }
2836 if (max_vf != MAX_VECTORIZATION_FACTOR
2837 && maybe_lt (max_vf, min_vf))
2838 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2839 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2840
2841 ok = vect_determine_vectorization_factor (loop_vinfo);
2842 if (!ok)
2843 {
2844 if (dump_enabled_p ())
2845 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2846 "can't determine vectorization factor.\n");
2847 return ok;
2848 }
2849
2850 /* Compute the scalar iteration cost. */
2851 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2852
2853 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2854
2855 if (slp)
2856 {
2857 /* Check the SLP opportunities in the loop, analyze and build
2858 SLP trees. */
2859 ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2860 if (!ok)
2861 return ok;
2862
2863 /* If there are any SLP instances mark them as pure_slp. */
2864 slp = vect_make_slp_decision (loop_vinfo);
2865 if (slp)
2866 {
2867 /* Find stmts that need to be both vectorized and SLPed. */
2868 vect_detect_hybrid_slp (loop_vinfo);
2869
2870 /* Update the vectorization factor based on the SLP decision. */
2871 vect_update_vf_for_slp (loop_vinfo);
2872
2873 /* Optimize the SLP graph with the vectorization factor fixed. */
2874 vect_optimize_slp (loop_vinfo);
2875
2876 /* Gather the loads reachable from the SLP graph entries. */
2877 vect_gather_slp_loads (loop_vinfo);
2878 }
2879 }
2880
2881 bool saved_can_use_partial_vectors_p
2882 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2883
2884 /* We don't expect to have to roll back to anything other than an empty
2885 set of rgroups. */
2886 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2887
2888 /* This is the point where we can re-start analysis with SLP forced off. */
2889 start_over:
2890
2891 /* Apply the suggested unrolling factor, this was determined by the backend
2892 during finish_cost the first time we ran the analyzis for this
2893 vector mode. */
2894 if (applying_suggested_uf)
2895 LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2896
2897 /* Now the vectorization factor is final. */
2898 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2899 gcc_assert (known_ne (vectorization_factor, 0U));
2900
2901 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2902 {
2903 dump_printf_loc (MSG_NOTE, vect_location,
2904 "vectorization_factor = ");
2905 dump_dec (MSG_NOTE, vectorization_factor);
2906 dump_printf (MSG_NOTE, ", niters = %wd\n",
2907 LOOP_VINFO_INT_NITERS (loop_vinfo));
2908 }
2909
2910 if (max_vf != MAX_VECTORIZATION_FACTOR
2911 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2912 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2913
2914 loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2915
2916 /* Analyze the alignment of the data-refs in the loop.
2917 Fail if a data reference is found that cannot be vectorized. */
2918
2919 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2920 if (!ok)
2921 {
2922 if (dump_enabled_p ())
2923 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2924 "bad data alignment.\n");
2925 return ok;
2926 }
2927
2928 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2929 It is important to call pruning after vect_analyze_data_ref_accesses,
2930 since we use grouping information gathered by interleaving analysis. */
2931 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2932 if (!ok)
2933 return ok;
2934
2935 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2936 vectorization, since we do not want to add extra peeling or
2937 add versioning for alignment. */
2938 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2939 /* This pass will decide on using loop versioning and/or loop peeling in
2940 order to enhance the alignment of data references in the loop. */
2941 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2942 if (!ok)
2943 return ok;
2944
2945 if (slp)
2946 {
2947 /* Analyze operations in the SLP instances. Note this may
2948 remove unsupported SLP instances which makes the above
2949 SLP kind detection invalid. */
2950 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2951 vect_slp_analyze_operations (loop_vinfo);
2952 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2953 {
2954 ok = opt_result::failure_at (vect_location,
2955 "unsupported SLP instances\n");
2956 goto again;
2957 }
2958
2959 /* Check whether any load in ALL SLP instances is possibly permuted. */
2960 slp_tree load_node, slp_root;
2961 unsigned i, x;
2962 slp_instance instance;
2963 bool can_use_lanes = true;
2964 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2965 {
2966 slp_root = SLP_INSTANCE_TREE (instance);
2967 int group_size = SLP_TREE_LANES (slp_root);
2968 tree vectype = SLP_TREE_VECTYPE (slp_root);
2969 bool loads_permuted = false;
2970 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2971 {
2972 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2973 continue;
2974 unsigned j;
2975 stmt_vec_info load_info;
2976 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2977 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2978 {
2979 loads_permuted = true;
2980 break;
2981 }
2982 }
2983
2984 /* If the loads and stores can be handled with load/store-lane
2985 instructions record it and move on to the next instance. */
2986 if (loads_permuted
2987 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2988 && vect_store_lanes_supported (vectype, group_size, false)
2989 != IFN_LAST)
2990 {
2991 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2992 if (STMT_VINFO_GROUPED_ACCESS
2993 (SLP_TREE_REPRESENTATIVE (load_node)))
2994 {
2995 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2996 (SLP_TREE_REPRESENTATIVE (load_node));
2997 /* Use SLP for strided accesses (or if we can't
2998 load-lanes). */
2999 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
3000 || vect_load_lanes_supported
3001 (STMT_VINFO_VECTYPE (stmt_vinfo),
3002 DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
3003 break;
3004 }
3005
3006 can_use_lanes
3007 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
3008
3009 if (can_use_lanes && dump_enabled_p ())
3010 dump_printf_loc (MSG_NOTE, vect_location,
3011 "SLP instance %p can use load/store-lanes\n",
3012 (void *) instance);
3013 }
3014 else
3015 {
3016 can_use_lanes = false;
3017 break;
3018 }
3019 }
3020
3021 /* If all SLP instances can use load/store-lanes abort SLP and try again
3022 with SLP disabled. */
3023 if (can_use_lanes)
3024 {
3025 ok = opt_result::failure_at (vect_location,
3026 "Built SLP cancelled: can use "
3027 "load/store-lanes\n");
3028 if (dump_enabled_p ())
3029 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3030 "Built SLP cancelled: all SLP instances support "
3031 "load/store-lanes\n");
3032 goto again;
3033 }
3034 }
3035
3036 /* Dissolve SLP-only groups. */
3037 vect_dissolve_slp_only_groups (loop_vinfo);
3038
3039 /* Scan all the remaining operations in the loop that are not subject
3040 to SLP and make sure they are vectorizable. */
3041 ok = vect_analyze_loop_operations (loop_vinfo);
3042 if (!ok)
3043 {
3044 if (dump_enabled_p ())
3045 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3046 "bad operation or unsupported loop bound.\n");
3047 return ok;
3048 }
3049
3050 /* For now, we don't expect to mix both masking and length approaches for one
3051 loop, disable it if both are recorded. */
3052 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3053 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
3054 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
3055 {
3056 if (dump_enabled_p ())
3057 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3058 "can't vectorize a loop with partial vectors"
3059 " because we don't expect to mix different"
3060 " approaches with partial vectors for the"
3061 " same loop.\n");
3062 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3063 }
3064
3065 /* If we still have the option of using partial vectors,
3066 check whether we can generate the necessary loop controls. */
3067 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
3068 {
3069 if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
3070 {
3071 if (!vect_verify_full_masking (loop_vinfo)
3072 && !vect_verify_full_masking_avx512 (loop_vinfo))
3073 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3074 }
3075 else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
3076 if (!vect_verify_loop_lens (loop_vinfo))
3077 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3078 }
3079
3080 /* If we're vectorizing a loop that uses length "controls" and
3081 can iterate more than once, we apply decrementing IV approach
3082 in loop control. */
3083 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3084 && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
3085 && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
3086 && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3087 && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
3088 LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
3089 LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
3090
3091 /* If a loop uses length controls and has a decrementing loop control IV,
3092 we will normally pass that IV through a MIN_EXPR to calcaluate the
3093 basis for the length controls. E.g. in a loop that processes one
3094 element per scalar iteration, the number of elements would be
3095 MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
3096
3097 This MIN_EXPR approach allows us to use pointer IVs with an invariant
3098 step, since only the final iteration of the vector loop can have
3099 inactive lanes.
3100
3101 However, some targets have a dedicated instruction for calculating the
3102 preferred length, given the total number of elements that still need to
3103 be processed. This is encapsulated in the SELECT_VL internal function.
3104
3105 If the target supports SELECT_VL, we can use it instead of MIN_EXPR
3106 to determine the basis for the length controls. However, unlike the
3107 MIN_EXPR calculation, the SELECT_VL calculation can decide to make
3108 lanes inactive in any iteration of the vector loop, not just the last
3109 iteration. This SELECT_VL approach therefore requires us to use pointer
3110 IVs with variable steps.
3111
3112 Once we've decided how many elements should be processed by one
3113 iteration of the vector loop, we need to populate the rgroup controls.
3114 If a loop has multiple rgroups, we need to make sure that those rgroups
3115 "line up" (that is, they must be consistent about which elements are
3116 active and which aren't). This is done by vect_adjust_loop_lens_control.
3117
3118 In principle, it would be possible to use vect_adjust_loop_lens_control
3119 on either the result of a MIN_EXPR or the result of a SELECT_VL.
3120 However:
3121
3122 (1) In practice, it only makes sense to use SELECT_VL when a vector
3123 operation will be controlled directly by the result. It is not
3124 worth using SELECT_VL if it would only be the input to other
3125 calculations.
3126
3127 (2) If we use SELECT_VL for an rgroup that has N controls, each associated
3128 pointer IV will need N updates by a variable amount (N-1 updates
3129 within the iteration and 1 update to move to the next iteration).
3130
3131 Because of this, we prefer to use the MIN_EXPR approach whenever there
3132 is more than one length control.
3133
3134 In addition, SELECT_VL always operates to a granularity of 1 unit.
3135 If we wanted to use it to control an SLP operation on N consecutive
3136 elements, we would need to make the SELECT_VL inputs measure scalar
3137 iterations (rather than elements) and then multiply the SELECT_VL
3138 result by N. But using SELECT_VL this way is inefficient because
3139 of (1) above.
3140
3141 2. We don't apply SELECT_VL on single-rgroup when both (1) and (2) are
3142 satisfied:
3143
3144 (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
3145 (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
3146
3147 Since SELECT_VL (variable step) will make SCEV analysis failed and then
3148 we will fail to gain benefits of following unroll optimizations. We prefer
3149 using the MIN_EXPR approach in this situation. */
3150 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
3151 {
3152 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
3153 if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
3154 OPTIMIZE_FOR_SPEED)
3155 && LOOP_VINFO_LENS (loop_vinfo).length () == 1
3156 && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp
3157 && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3158 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
3159 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
3160 }
3161
3162 /* Decide whether this loop_vinfo should use partial vectors or peeling,
3163 assuming that the loop will be used as a main loop. We will redo
3164 this analysis later if we instead decide to use the loop as an
3165 epilogue loop. */
3166 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo);
3167 if (!ok)
3168 return ok;
3169
3170 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
3171 to be able to handle fewer than VF scalars, or needs to have a lower VF
3172 than the main loop. */
3173 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3174 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3175 {
3176 poly_uint64 unscaled_vf
3177 = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
3178 orig_loop_vinfo->suggested_unroll_factor);
3179 if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
3180 return opt_result::failure_at (vect_location,
3181 "Vectorization factor too high for"
3182 " epilogue loop.\n");
3183 }
3184
3185 /* Check the costings of the loop make vectorizing worthwhile. */
3186 res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
3187 if (res < 0)
3188 {
3189 ok = opt_result::failure_at (vect_location,
3190 "Loop costings may not be worthwhile.\n");
3191 goto again;
3192 }
3193 if (!res)
3194 return opt_result::failure_at (vect_location,
3195 "Loop costings not worthwhile.\n");
3196
3197 /* If an epilogue loop is required make sure we can create one. */
3198 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3199 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
3200 || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3201 {
3202 if (dump_enabled_p ())
3203 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
3204 if (!vect_can_advance_ivs_p (loop_vinfo)
3205 || !slpeel_can_duplicate_loop_p (loop,
3206 LOOP_VINFO_IV_EXIT (loop_vinfo),
3207 LOOP_VINFO_IV_EXIT (loop_vinfo)))
3208 {
3209 ok = opt_result::failure_at (vect_location,
3210 "not vectorized: can't create required "
3211 "epilog loop\n");
3212 goto again;
3213 }
3214 }
3215
3216 /* During peeling, we need to check if number of loop iterations is
3217 enough for both peeled prolog loop and vector loop. This check
3218 can be merged along with threshold check of loop versioning, so
3219 increase threshold for this case if necessary.
3220
3221 If we are analyzing an epilogue we still want to check what its
3222 versioning threshold would be. If we decide to vectorize the epilogues we
3223 will want to use the lowest versioning threshold of all epilogues and main
3224 loop. This will enable us to enter a vectorized epilogue even when
3225 versioning the loop. We can't simply check whether the epilogue requires
3226 versioning though since we may have skipped some versioning checks when
3227 analyzing the epilogue. For instance, checks for alias versioning will be
3228 skipped when dealing with epilogues as we assume we already checked them
3229 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
3230 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
3231 {
3232 poly_uint64 niters_th = 0;
3233 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3234
3235 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3236 {
3237 /* Niters for peeled prolog loop. */
3238 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3239 {
3240 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3241 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
3242 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
3243 }
3244 else
3245 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3246 }
3247
3248 /* Niters for at least one iteration of vectorized loop. */
3249 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3250 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3251 /* One additional iteration because of peeling for gap. */
3252 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3253 niters_th += 1;
3254
3255 /* Use the same condition as vect_transform_loop to decide when to use
3256 the cost to determine a versioning threshold. */
3257 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3258 && ordered_p (th, niters_th))
3259 niters_th = ordered_max (poly_uint64 (th), niters_th);
3260
3261 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
3262 }
3263
3264 gcc_assert (known_eq (vectorization_factor,
3265 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
3266
3267 slp_done_for_suggested_uf = slp;
3268
3269 /* Ok to vectorize! */
3270 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3271 return opt_result::success ();
3272
3273 again:
3274 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
3275 gcc_assert (!ok);
3276
3277 /* Try again with SLP forced off but if we didn't do any SLP there is
3278 no point in re-trying. */
3279 if (!slp)
3280 return ok;
3281
3282 /* If the slp decision is true when suggested unroll factor is worked
3283 out, and we are applying suggested unroll factor, we don't need to
3284 re-try any more. */
3285 if (applying_suggested_uf && slp_done_for_suggested_uf)
3286 return ok;
3287
3288 /* If there are reduction chains re-trying will fail anyway. */
3289 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
3290 return ok;
3291
3292 /* Likewise if the grouped loads or stores in the SLP cannot be handled
3293 via interleaving or lane instructions. */
3294 slp_instance instance;
3295 slp_tree node;
3296 unsigned i, j;
3297 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
3298 {
3299 stmt_vec_info vinfo;
3300 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
3301 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
3302 continue;
3303 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3304 unsigned int size = DR_GROUP_SIZE (vinfo);
3305 tree vectype = STMT_VINFO_VECTYPE (vinfo);
3306 if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
3307 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
3308 && ! vect_grouped_store_supported (vectype, size))
3309 return opt_result::failure_at (vinfo->stmt,
3310 "unsupported grouped store\n");
3311 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
3312 {
3313 vinfo = SLP_TREE_REPRESENTATIVE (node);
3314 if (STMT_VINFO_GROUPED_ACCESS (vinfo))
3315 {
3316 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3317 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
3318 size = DR_GROUP_SIZE (vinfo);
3319 vectype = STMT_VINFO_VECTYPE (vinfo);
3320 if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
3321 && ! vect_grouped_load_supported (vectype, single_element_p,
3322 size))
3323 return opt_result::failure_at (vinfo->stmt,
3324 "unsupported grouped load\n");
3325 }
3326 }
3327 }
3328
3329 if (dump_enabled_p ())
3330 dump_printf_loc (MSG_NOTE, vect_location,
3331 "re-trying with SLP disabled\n");
3332
3333 /* Roll back state appropriately. No SLP this time. */
3334 slp = false;
3335 /* Restore vectorization factor as it were without SLP. */
3336 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
3337 /* Free the SLP instances. */
3338 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
3339 vect_free_slp_instance (instance);
3340 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
3341 /* Reset SLP type to loop_vect on all stmts. */
3342 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3343 {
3344 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3345 for (gimple_stmt_iterator si = gsi_start_phis (bb);
3346 !gsi_end_p (si); gsi_next (&si))
3347 {
3348 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3349 STMT_SLP_TYPE (stmt_info) = loop_vect;
3350 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3351 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3352 {
3353 /* vectorizable_reduction adjusts reduction stmt def-types,
3354 restore them to that of the PHI. */
3355 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
3356 = STMT_VINFO_DEF_TYPE (stmt_info);
3357 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
3358 (STMT_VINFO_REDUC_DEF (stmt_info)))
3359 = STMT_VINFO_DEF_TYPE (stmt_info);
3360 }
3361 }
3362 for (gimple_stmt_iterator si = gsi_start_bb (bb);
3363 !gsi_end_p (si); gsi_next (&si))
3364 {
3365 if (is_gimple_debug (gsi_stmt (si)))
3366 continue;
3367 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3368 STMT_SLP_TYPE (stmt_info) = loop_vect;
3369 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3370 {
3371 stmt_vec_info pattern_stmt_info
3372 = STMT_VINFO_RELATED_STMT (stmt_info);
3373 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
3374 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
3375
3376 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
3377 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
3378 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
3379 !gsi_end_p (pi); gsi_next (&pi))
3380 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
3381 = loop_vect;
3382 }
3383 }
3384 }
3385 /* Free optimized alias test DDRS. */
3386 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
3387 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
3388 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
3389 /* Reset target cost data. */
3390 delete loop_vinfo->vector_costs;
3391 loop_vinfo->vector_costs = nullptr;
3392 /* Reset accumulated rgroup information. */
3393 LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
3394 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
3395 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
3396 /* Reset assorted flags. */
3397 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
3398 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
3399 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
3400 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
3401 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3402 = saved_can_use_partial_vectors_p;
3403 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
3404
3405 goto start_over;
3406 }
3407
3408 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
3409 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
3410 OLD_LOOP_VINFO is better unless something specifically indicates
3411 otherwise.
3412
3413 Note that this deliberately isn't a partial order. */
3414
3415 static bool
3416 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
3417 loop_vec_info old_loop_vinfo)
3418 {
3419 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
3420 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
3421
3422 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
3423 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
3424
3425 /* Always prefer a VF of loop->simdlen over any other VF. */
3426 if (loop->simdlen)
3427 {
3428 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
3429 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
3430 if (new_simdlen_p != old_simdlen_p)
3431 return new_simdlen_p;
3432 }
3433
3434 const auto *old_costs = old_loop_vinfo->vector_costs;
3435 const auto *new_costs = new_loop_vinfo->vector_costs;
3436 if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
3437 return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
3438
3439 return new_costs->better_main_loop_than_p (old_costs);
3440 }
3441
3442 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
3443 true if we should. */
3444
3445 static bool
3446 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
3447 loop_vec_info old_loop_vinfo)
3448 {
3449 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
3450 return false;
3451
3452 if (dump_enabled_p ())
3453 dump_printf_loc (MSG_NOTE, vect_location,
3454 "***** Preferring vector mode %s to vector mode %s\n",
3455 GET_MODE_NAME (new_loop_vinfo->vector_mode),
3456 GET_MODE_NAME (old_loop_vinfo->vector_mode));
3457 return true;
3458 }
3459
/* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
   not NULL.  Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance
   MODE_I to the next mode useful to analyze.
   Return the loop_vinfo on success and wrapped null on failure.

   FATAL is set by vect_analyze_loop_2; when the analysis fails with
   FATAL set we assert this was a main-loop analysis
   (MAIN_LOOP_VINFO == NULL).  */

static opt_loop_vec_info
vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
		     const vect_loop_form_info *loop_form_info,
		     loop_vec_info main_loop_vinfo,
		     const vector_modes &vector_modes, unsigned &mode_i,
		     machine_mode &autodetected_vector_mode,
		     bool &fatal)
{
  loop_vec_info loop_vinfo
    = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);

  machine_mode vector_mode = vector_modes[mode_i];
  loop_vinfo->vector_mode = vector_mode;
  unsigned int suggested_unroll_factor = 1;
  bool slp_done_for_suggested_uf = false;

  /* Run the main analysis.  */
  opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
					&suggested_unroll_factor,
					slp_done_for_suggested_uf);
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "***** Analysis %s with vector mode %s\n",
		     res ? "succeeded" : " failed",
		     GET_MODE_NAME (loop_vinfo->vector_mode));

  /* If the target's cost model suggested unrolling the main loop,
     redo the whole analysis with the suggested unroll factor applied
     and keep the unrolled variant only if that re-analysis also
     succeeds; otherwise fall back to the un-unrolled LOOP_VINFO.  */
  if (res && !main_loop_vinfo && suggested_unroll_factor > 1)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "***** Re-trying analysis for unrolling"
			 " with unroll factor %d and slp %s.\n",
			 suggested_unroll_factor,
			 slp_done_for_suggested_uf ? "on" : "off");
      loop_vec_info unroll_vinfo
	= vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
      unroll_vinfo->vector_mode = vector_mode;
      unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
      opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
						slp_done_for_suggested_uf);
      if (new_res)
	{
	  delete loop_vinfo;
	  loop_vinfo = unroll_vinfo;
	}
      else
	delete unroll_vinfo;
    }

  /* Remember the autodetected vector mode.  */
  if (vector_mode == VOIDmode)
    autodetected_vector_mode = loop_vinfo->vector_mode;

  /* Advance mode_i, first skipping modes that would result in the
     same analysis result.  */
  while (mode_i + 1 < vector_modes.length ()
	 && vect_chooses_same_modes_p (loop_vinfo,
				       vector_modes[mode_i + 1]))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "***** The result for vector mode %s would"
			 " be the same\n",
			 GET_MODE_NAME (vector_modes[mode_i + 1]));
      mode_i += 1;
    }
  /* Also skip the next mode if it and the autodetected mode map onto
     each other via related_vector_mode in both directions, i.e. they
     describe the same vectors and the analysis would only repeat.  */
  if (mode_i + 1 < vector_modes.length ()
      && VECTOR_MODE_P (autodetected_vector_mode)
      && (related_vector_mode (vector_modes[mode_i + 1],
			       GET_MODE_INNER (autodetected_vector_mode))
	  == autodetected_vector_mode)
      && (related_vector_mode (autodetected_vector_mode,
			       GET_MODE_INNER (vector_modes[mode_i + 1]))
	  == vector_modes[mode_i + 1]))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "***** Skipping vector mode %s, which would"
			 " repeat the analysis for %s\n",
			 GET_MODE_NAME (vector_modes[mode_i + 1]),
			 GET_MODE_NAME (autodetected_vector_mode));
      mode_i += 1;
    }
  mode_i++;

  /* On failure the caller gets a wrapped null; LOOP_VINFO is no longer
     needed and is freed here.  */
  if (!res)
    {
      delete loop_vinfo;
      if (fatal)
	gcc_checking_assert (main_loop_vinfo == NULL);
      return opt_loop_vec_info::propagate_failure (res);
    }

  return opt_loop_vec_info::success (loop_vinfo);
}
3560
3561 /* Function vect_analyze_loop.
3562
3563 Apply a set of analyses on LOOP, and create a loop_vec_info struct
3564 for it. The different analyses will record information in the
3565 loop_vec_info struct. */
3566 opt_loop_vec_info
3567 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3568 {
3569 DUMP_VECT_SCOPE ("analyze_loop_nest");
3570
/* Do not vectorize a loop nested inside an outer loop that has already
   been marked vectorizable -- the inner loop would be transformed as
   part of the outer-loop vectorization.  */
3571 if (loop_outer (loop)
3572 && loop_vec_info_for_loop (loop_outer (loop))
3573 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3574 return opt_loop_vec_info::failure_at (vect_location,
3575 "outer-loop already vectorized.\n");
3576
3577 if (!find_loop_nest (loop, &shared->loop_nest))
3578 return opt_loop_vec_info::failure_at
3579 (vect_location,
3580 "not vectorized: loop nest containing two or more consecutive inner"
3581 " loops cannot be vectorized\n");
3582
3583 /* Analyze the loop form. */
3584 vect_loop_form_info loop_form_info;
3585 opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3586 if (!res)
3587 {
3588 if (dump_enabled_p ())
3589 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3590 "bad loop form.\n");
3591 return opt_loop_vec_info::propagate_failure (res);
3592 }
3593 if (!integer_onep (loop_form_info.assumptions))
3594 {
3595 /* We consider to vectorize this loop by versioning it under
3596 some assumptions. In order to do this, we need to clear
3597 existing information computed by scev and niter analyzer. */
3598 scev_reset_htab ();
3599 free_numbers_of_iterations_estimates (loop);
3600 /* Also set flag for this loop so that following scev and niter
3601 analysis are done under the assumptions. */
3602 loop_constraint_set (loop, LOOP_C_FINITE);
3603 }
3604 else
3605 /* Clear the existing niter information to make sure the nonwrapping flag
3606 will be calculated and set appropriately. */
3607 free_numbers_of_iterations_estimates (loop);
3608
3609 auto_vector_modes vector_modes;
3610 /* Autodetect first vector size we try. */
3611 vector_modes.safe_push (VOIDmode);
3612 unsigned int autovec_flags
3613 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3614 loop->simdlen != 0);
3615 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3616 && !unlimited_cost_model (loop));
3617 machine_mode autodetected_vector_mode = VOIDmode;
3618 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3619 unsigned int mode_i = 0;
3620 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3621
3622 /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3623 a mode has not been analyzed. */
3624 auto_vec<poly_uint64, 8> cached_vf_per_mode;
3625 for (unsigned i = 0; i < vector_modes.length (); ++i)
3626 cached_vf_per_mode.safe_push (0);
3627
3628 /* First determine the main loop vectorization mode, either the first
3629 one that works, starting with auto-detecting the vector mode and then
3630 following the targets order of preference, or the one with the
3631 lowest cost if pick_lowest_cost_p. */
3632 while (1)
3633 {
3634 bool fatal;
3635 unsigned int last_mode_i = mode_i;
3636 /* Set cached VF to -1 prior to analysis, which indicates a mode has
3637 failed. */
3638 cached_vf_per_mode[last_mode_i] = -1;
/* NOTE(review): vect_analyze_loop_1 appears to advance MODE_I and fill
   in AUTODETECTED_VECTOR_MODE and FATAL through reference parameters --
   confirm against its definition; the loop-exit tests below rely on
   that.  */
3639 opt_loop_vec_info loop_vinfo
3640 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3641 NULL, vector_modes, mode_i,
3642 autodetected_vector_mode, fatal);
3643 if (fatal)
3644 break;
3645
3646 if (loop_vinfo)
3647 {
3648 /* Analysis has been successful so update the VF value. The
3649 VF should always be a multiple of unroll_factor and we want to
3650 capture the original VF here. */
3651 cached_vf_per_mode[last_mode_i]
3652 = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3653 loop_vinfo->suggested_unroll_factor);
3654 /* Once we hit the desired simdlen for the first time,
3655 discard any previous attempts. */
3656 if (simdlen
3657 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3658 {
3659 delete first_loop_vinfo;
3660 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3661 simdlen = 0;
3662 }
3663 else if (pick_lowest_cost_p
3664 && first_loop_vinfo
3665 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3666 {
3667 /* Pick loop_vinfo over first_loop_vinfo. */
3668 delete first_loop_vinfo;
3669 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3670 }
3671 if (first_loop_vinfo == NULL)
3672 first_loop_vinfo = loop_vinfo;
3673 else
3674 {
3675 delete loop_vinfo;
3676 loop_vinfo = opt_loop_vec_info::success (NULL);
3677 }
3678
3679 /* Commit to first_loop_vinfo if we have no reason to try
3680 alternatives. */
3681 if (!simdlen && !pick_lowest_cost_p)
3682 break;
3683 }
3684 if (mode_i == vector_modes.length ()
3685 || autodetected_vector_mode == VOIDmode)
3686 break;
3687
3688 /* Try the next biggest vector size. */
3689 if (dump_enabled_p ())
3690 dump_printf_loc (MSG_NOTE, vect_location,
3691 "***** Re-trying analysis with vector mode %s\n",
3692 GET_MODE_NAME (vector_modes[mode_i]));
3693 }
3694 if (!first_loop_vinfo)
3695 return opt_loop_vec_info::propagate_failure (res);
3696
3697 if (dump_enabled_p ())
3698 dump_printf_loc (MSG_NOTE, vect_location,
3699 "***** Choosing vector mode %s\n",
3700 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3701
3702 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3703 enabled, SIMDUID is not set, it is the innermost loop and we have
3704 either already found the loop's SIMDLEN or there was no SIMDLEN to
3705 begin with.
3706 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3707 bool vect_epilogues = (!simdlen
3708 && loop->inner == NULL
3709 && param_vect_epilogues_nomask
3710 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3711 /* No code motion support for multiple epilogues so for now
3712 not supported when multiple exits. */
3713 && !LOOP_VINFO_EARLY_BREAKS (first_loop_vinfo)
3714 && !loop->simduid);
3715 if (!vect_epilogues)
3716 return first_loop_vinfo;
3717
3718 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3719 poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3720
3721 /* For epilogues start the analysis from the first mode. The motivation
3722 behind starting from the beginning comes from cases where the VECTOR_MODES
3723 array may contain length-agnostic and length-specific modes. Their
3724 ordering is not guaranteed, so we could end up picking a mode for the main
3725 loop that is after the epilogue's optimal mode. */
3726 vector_modes[0] = autodetected_vector_mode;
3727 mode_i = 0;
3728
3729 bool supports_partial_vectors =
3730 partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3731 poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3732
3733 while (1)
3734 {
3735 /* If the target does not support partial vectors we can shorten the
3736 number of modes to analyze for the epilogue as we know we can't pick a
3737 mode that would lead to a VF at least as big as the
3738 FIRST_VINFO_VF. */
3739 if (!supports_partial_vectors
3740 && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3741 {
3742 mode_i++;
3743 if (mode_i == vector_modes.length ())
3744 break;
3745 continue;
3746 }
3747
3748 if (dump_enabled_p ())
3749 dump_printf_loc (MSG_NOTE, vect_location,
3750 "***** Re-trying epilogue analysis with vector "
3751 "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3752
3753 bool fatal;
3754 opt_loop_vec_info loop_vinfo
3755 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3756 first_loop_vinfo,
3757 vector_modes, mode_i,
3758 autodetected_vector_mode, fatal);
3759 if (fatal)
3760 break;
3761
3762 if (loop_vinfo)
3763 {
3764 if (pick_lowest_cost_p)
3765 {
3766 /* Keep trying to roll back vectorization attempts while the
3767 loop_vec_infos they produced were worse than this one. */
3768 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3769 while (!vinfos.is_empty ()
3770 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3771 {
3772 gcc_assert (vect_epilogues);
3773 delete vinfos.pop ();
3774 }
3775 }
3776 /* For now only allow one epilogue loop. */
3777 if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3778 {
3779 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3780 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3781 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3782 || maybe_ne (lowest_th, 0U));
3783 /* Keep track of the known smallest versioning
3784 threshold. */
3785 if (ordered_p (lowest_th, th))
3786 lowest_th = ordered_min (lowest_th, th);
3787 }
3788 else
3789 {
3790 delete loop_vinfo;
3791 loop_vinfo = opt_loop_vec_info::success (NULL);
3792 }
3793
3794 /* For now only allow one epilogue loop, but allow
3795 pick_lowest_cost_p to replace it, so commit to the
3796 first epilogue if we have no reason to try alternatives. */
3797 if (!pick_lowest_cost_p)
3798 break;
3799 }
3800
3801 if (mode_i == vector_modes.length ())
3802 break;
3803
3804 }
3805
3806 if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3807 {
3808 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3809 if (dump_enabled_p ())
3810 dump_printf_loc (MSG_NOTE, vect_location,
3811 "***** Choosing epilogue vector mode %s\n",
3812 GET_MODE_NAME
3813 (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3814 }
3815
3816 return first_loop_vinfo;
3817 }
3818
3819 /* Return true if there is an in-order reduction function for CODE, storing
3820 it in *REDUC_FN if so. */
3821
3822 static bool
3823 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3824 {
3825 /* We support MINUS_EXPR by negating the operand. This also preserves an
3826 initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
3827 (-0.0) = -0.0. */
3828 if (code == PLUS_EXPR || code == MINUS_EXPR)
3829 {
3830 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3831 return true;
3832 }
3833 return false;
3834 }
3835
3836 /* Function reduction_fn_for_scalar_code
3837
3838 Input:
3839 CODE - tree_code of a reduction operations.
3840
3841 Output:
3842 REDUC_FN - the corresponding internal function to be used to reduce the
3843 vector of partial results into a single scalar result, or IFN_LAST
3844 if the operation is a supported reduction operation, but does not have
3845 such an internal function.
3846
3847 Return FALSE if CODE currently cannot be vectorized as reduction. */
3848
3849 bool
3850 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3851 {
3852 if (code.is_tree_code ())
3853 switch (tree_code (code))
3854 {
3855 case MAX_EXPR:
3856 *reduc_fn = IFN_REDUC_MAX;
3857 return true;
3858
3859 case MIN_EXPR:
3860 *reduc_fn = IFN_REDUC_MIN;
3861 return true;
3862
3863 case PLUS_EXPR:
3864 *reduc_fn = IFN_REDUC_PLUS;
3865 return true;
3866
3867 case BIT_AND_EXPR:
3868 *reduc_fn = IFN_REDUC_AND;
3869 return true;
3870
3871 case BIT_IOR_EXPR:
3872 *reduc_fn = IFN_REDUC_IOR;
3873 return true;
3874
3875 case BIT_XOR_EXPR:
3876 *reduc_fn = IFN_REDUC_XOR;
3877 return true;
3878
3879 case MULT_EXPR:
3880 case MINUS_EXPR:
3881 *reduc_fn = IFN_LAST;
3882 return true;
3883
3884 default:
3885 return false;
3886 }
3887 else
3888 switch (combined_fn (code))
3889 {
3890 CASE_CFN_FMAX:
3891 *reduc_fn = IFN_REDUC_FMAX;
3892 return true;
3893
3894 CASE_CFN_FMIN:
3895 *reduc_fn = IFN_REDUC_FMIN;
3896 return true;
3897
3898 default:
3899 return false;
3900 }
3901 }
3902
3903 /* If there is a neutral value X such that a reduction would not be affected
3904 by the introduction of additional X elements, return that X, otherwise
3905 return null. CODE is the code of the reduction and SCALAR_TYPE is type
3906 of the scalar elements. If the reduction has just a single initial value
3907 then INITIAL_VALUE is that value, otherwise it is null.
3908 If AS_INITIAL is TRUE the value is supposed to be used as initial value.
3909 In that case no signed zero is returned. */
3910
3911 tree
3912 neutral_op_for_reduction (tree scalar_type, code_helper code,
3913 tree initial_value, bool as_initial)
3914 {
3915 if (code.is_tree_code ())
3916 switch (tree_code (code))
3917 {
3918 case DOT_PROD_EXPR:
3919 case SAD_EXPR:
3920 case MINUS_EXPR:
3921 case BIT_IOR_EXPR:
3922 case BIT_XOR_EXPR:
3923 return build_zero_cst (scalar_type);
3924 case WIDEN_SUM_EXPR:
3925 case PLUS_EXPR:
3926 if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
3927 return build_real (scalar_type, dconstm0);
3928 else
3929 return build_zero_cst (scalar_type);
3930
3931 case MULT_EXPR:
3932 return build_one_cst (scalar_type);
3933
3934 case BIT_AND_EXPR:
3935 return build_all_ones_cst (scalar_type);
3936
3937 case MAX_EXPR:
3938 case MIN_EXPR:
3939 return initial_value;
3940
3941 default:
3942 return NULL_TREE;
3943 }
3944 else
3945 switch (combined_fn (code))
3946 {
3947 CASE_CFN_FMIN:
3948 CASE_CFN_FMAX:
3949 return initial_value;
3950
3951 default:
3952 return NULL_TREE;
3953 }
3954 }
3955
3956 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3957 STMT is printed with a message MSG. */
3958
3959 static void
3960 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3961 {
/* Print MSG immediately followed by the textual form of STMT ("%G")
   at the current vectorizer dump location, classified as MSG_TYPE.  */
3962 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3963 }
3964
3965 /* Return true if we need an in-order reduction for operation CODE
3966 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
3967 overflow must wrap. */
3968
3969 bool
3970 needs_fold_left_reduction_p (tree type, code_helper code)
3971 {
3972 /* CHECKME: check for !flag_finite_math_only too? */
3973 if (SCALAR_FLOAT_TYPE_P (type))
3974 {
3975 if (code.is_tree_code ())
3976 switch (tree_code (code))
3977 {
3978 case MIN_EXPR:
3979 case MAX_EXPR:
3980 return false;
3981
3982 default:
3983 return !flag_associative_math;
3984 }
3985 else
3986 switch (combined_fn (code))
3987 {
3988 CASE_CFN_FMIN:
3989 CASE_CFN_FMAX:
3990 return false;
3991
3992 default:
3993 return !flag_associative_math;
3994 }
3995 }
3996
3997 if (INTEGRAL_TYPE_P (type))
3998 return (!code.is_tree_code ()
3999 || !operation_no_trapping_overflow (type, tree_code (code)));
4000
4001 if (SAT_FIXED_POINT_TYPE_P (type))
4002 return true;
4003
4004 return false;
4005 }
4006
4007 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
4008 has a handled computation expression. Store the main reduction
4009 operation in *CODE. */
4010
4011 static bool
4012 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4013 tree loop_arg, code_helper *code,
4014 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
4015 {
4016 auto_bitmap visited;
4017 tree lookfor = PHI_RESULT (phi);
4018 ssa_op_iter curri;
/* Position CURR at the PHI argument equal to LOOP_ARG so the walk
   starts from the latch value.  */
4019 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
4020 while (USE_FROM_PTR (curr) != loop_arg)
4021 curr = op_iter_next_use (&curri);
/* NOTE(review): marking the iterator exhausted here makes the
   backtracking step below move past this PHI use immediately -- this
   pokes at ssa_op_iter internals; confirm against ssa-iterators.h.  */
4022 curri.i = curri.numops;
4023 do
4024 {
4025 path.safe_push (std::make_pair (curri, curr));
4026 tree use = USE_FROM_PTR (curr);
4027 if (use == lookfor)
4028 break;
4029 gimple *def = SSA_NAME_DEF_STMT (use);
4030 if (gimple_nop_p (def)
4031 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
4032 {
4033 pop:
4034 do
4035 {
4036 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
4037 curri = x.first;
4038 curr = x.second;
4039 do
4040 curr = op_iter_next_use (&curri);
4041 /* Skip already visited or non-SSA operands (from iterating
4042 over PHI args). */
4043 while (curr != NULL_USE_OPERAND_P
4044 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4045 || ! bitmap_set_bit (visited,
4046 SSA_NAME_VERSION
4047 (USE_FROM_PTR (curr)))));
4048 }
4049 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
4050 if (curr == NULL_USE_OPERAND_P)
4051 break;
4052 }
4053 else
4054 {
4055 if (gimple_code (def) == GIMPLE_PHI)
4056 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
4057 else
4058 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
4059 while (curr != NULL_USE_OPERAND_P
4060 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4061 || ! bitmap_set_bit (visited,
4062 SSA_NAME_VERSION
4063 (USE_FROM_PTR (curr)))))
4064 curr = op_iter_next_use (&curri);
4065 if (curr == NULL_USE_OPERAND_P)
4066 goto pop;
4067 }
4068 }
4069 while (1);
4070 if (dump_file && (dump_flags & TDF_DETAILS))
4071 {
4072 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
4073 unsigned i;
4074 std::pair<ssa_op_iter, use_operand_p> *x;
4075 FOR_EACH_VEC_ELT (path, i, x)
4076 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
4077 dump_printf (MSG_NOTE, "\n");
4078 }
4079
4080 /* Check whether the reduction path detected is valid. */
4081 bool fail = path.length () == 0;
4082 bool neg = false;
4083 int sign = -1;
4084 *code = ERROR_MARK;
4085 for (unsigned i = 1; i < path.length (); ++i)
4086 {
4087 gimple *use_stmt = USE_STMT (path[i].second);
4088 gimple_match_op op;
4089 if (!gimple_extract_op (use_stmt, &op))
4090 {
4091 fail = true;
4092 break;
4093 }
4094 unsigned int opi = op.num_ops;
4095 if (gassign *assign = dyn_cast<gassign *> (use_stmt))
4096 {
4097 /* The following make sure we can compute the operand index
4098 easily plus it mostly disallows chaining via COND_EXPR condition
4099 operands. */
4100 for (opi = 0; opi < op.num_ops; ++opi)
4101 if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
4102 break;
4103 }
4104 else if (gcall *call = dyn_cast<gcall *> (use_stmt))
4105 {
4106 for (opi = 0; opi < op.num_ops; ++opi)
4107 if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
4108 break;
4109 }
/* OPI still equal to num_ops means the chained use was not found among
   the statement's operands.  */
4110 if (opi == op.num_ops)
4111 {
4112 fail = true;
4113 break;
4114 }
4115 op.code = canonicalize_code (op.code, op.type);
4116 if (op.code == MINUS_EXPR)
4117 {
4118 op.code = PLUS_EXPR;
4119 /* Track whether we negate the reduction value each iteration. */
4120 if (op.ops[1] == op.ops[opi])
4121 neg = ! neg;
4122 }
4123 else if (op.code == IFN_COND_SUB)
4124 {
4125 op.code = IFN_COND_ADD;
4126 /* Track whether we negate the reduction value each iteration. */
4127 if (op.ops[2] == op.ops[opi])
4128 neg = ! neg;
4129 }
4130 if (CONVERT_EXPR_CODE_P (op.code)
4131 && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
4132 ;
4133 else if (*code == ERROR_MARK)
4134 {
4135 *code = op.code;
4136 sign = TYPE_SIGN (op.type);
4137 }
4138 else if (op.code != *code)
4139 {
4140 fail = true;
4141 break;
4142 }
4143 else if ((op.code == MIN_EXPR
4144 || op.code == MAX_EXPR)
4145 && sign != TYPE_SIGN (op.type))
4146 {
4147 fail = true;
4148 break;
4149 }
4150 /* Check there's only a single stmt the op is used on. For the
4151 not value-changing tail and the last stmt allow out-of-loop uses.
4152 ??? We could relax this and handle arbitrary live stmts by
4153 forcing a scalar epilogue for example. */
4154 imm_use_iterator imm_iter;
4155 use_operand_p use_p;
4156 gimple *op_use_stmt;
4157 unsigned cnt = 0;
4158 bool cond_fn_p = op.code.is_internal_fn ()
4159 && (conditional_internal_fn_code (internal_fn (op.code))
4160 != ERROR_MARK);
4161
4162 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
4163 {
4164 /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4165 op1 twice (once as definition, once as else) in the same operation.
4166 Allow this. */
4167 if (cond_fn_p && op_use_stmt == use_stmt)
4168 {
4169 gcall *call = as_a<gcall *> (use_stmt);
4170 unsigned else_pos
4171 = internal_fn_else_index (internal_fn (op.code));
4172
4173 for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
4174 {
4175 if (j == else_pos)
4176 continue;
4177 if (gimple_call_arg (call, j) == op.ops[opi])
4178 cnt++;
4179 }
4180 }
4181 else if (!is_gimple_debug (op_use_stmt)
4182 && (*code != ERROR_MARK
4183 || flow_bb_inside_loop_p (loop,
4184 gimple_bb (op_use_stmt))))
4185 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4186 cnt++;
4187 }
4188
4189 if (cnt != 1)
4190 {
4191 fail = true;
4192 break;
4193 }
4194 }
4195 return ! fail && ! neg && *code != ERROR_MARK;
4196 }
4197
4198 bool
4199 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4200 tree loop_arg, enum tree_code code)
4201 {
4202 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4203 code_helper code_;
4204 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
4205 && code_ == code);
4206 }
4207
4208
4209
4210 /* Function vect_is_simple_reduction
4211
4212 (1) Detect a cross-iteration def-use cycle that represents a simple
4213 reduction computation. We look for the following pattern:
4214
4215 loop_header:
4216 a1 = phi < a0, a2 >
4217 a3 = ...
4218 a2 = operation (a3, a1)
4219
4220 or
4221
4222 a3 = ...
4223 loop_header:
4224 a1 = phi < a0, a2 >
4225 a2 = operation (a3, a1)
4226
4227 such that:
4228 1. operation is commutative and associative and it is safe to
4229 change the order of the computation
4230 2. no uses for a2 in the loop (a2 is used out of the loop)
4231 3. no uses of a1 in the loop besides the reduction operation
4232 4. no uses of a1 outside the loop.
4233
4234 Conditions 1,4 are tested here.
4235 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
4236
4237 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
4238 nested cycles.
4239
4240 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
4241 reductions:
4242
4243 a1 = phi < a0, a2 >
4244 inner loop (def of a3)
4245 a2 = phi < a3 >
4246
4247 (4) Detect condition expressions, ie:
4248 for (int i = 0; i < N; i++)
4249 if (a[i] < val)
4250 ret_val = a[i];
4251
4252 */
4253
4254 static stmt_vec_info
4255 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
4256 bool *double_reduc, bool *reduc_chain_p, bool slp)
4257 {
4258 gphi *phi = as_a <gphi *> (phi_info->stmt);
4259 gimple *phi_use_stmt = NULL;
4260 imm_use_iterator imm_iter;
4261 use_operand_p use_p;
4262
4263 *double_reduc = false;
4264 *reduc_chain_p = false;
4265 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
4266
4267 tree phi_name = PHI_RESULT (phi);
4268 /* ??? If there are no uses of the PHI result the inner loop reduction
4269 won't be detected as possibly double-reduction by vectorizable_reduction
4270 because that tries to walk the PHI arg from the preheader edge which
4271 can be constant. See PR60382. */
4272 if (has_zero_uses (phi_name))
4273 return NULL;
4274 class loop *loop = (gimple_bb (phi))->loop_father;
/* Count in-loop uses of the PHI result; any use outside LOOP makes the
   cycle unhandled.  */
4275 unsigned nphi_def_loop_uses = 0;
4276 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
4277 {
4278 gimple *use_stmt = USE_STMT (use_p);
4279 if (is_gimple_debug (use_stmt))
4280 continue;
4281
4282 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4283 {
4284 if (dump_enabled_p ())
4285 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4286 "intermediate value used outside loop.\n");
4287
4288 return NULL;
4289 }
4290
4291 /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4292 op1 twice (once as definition, once as else) in the same operation.
4293 Only count it as one. */
4294 if (use_stmt != phi_use_stmt)
4295 {
4296 nphi_def_loop_uses++;
4297 phi_use_stmt = use_stmt;
4298 }
4299 }
4300
/* The reduction is defined by the value flowing into the PHI over the
   latch edge.  */
4301 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
4302 if (TREE_CODE (latch_def) != SSA_NAME)
4303 {
4304 if (dump_enabled_p ())
4305 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4306 "reduction: not ssa_name: %T\n", latch_def);
4307 return NULL;
4308 }
4309
4310 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
4311 if (!def_stmt_info
4312 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
4313 return NULL;
4314
4315 bool nested_in_vect_loop
4316 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
4317 unsigned nlatch_def_loop_uses = 0;
4318 auto_vec<gphi *, 3> lcphis;
4319 bool inner_loop_of_double_reduc = false;
4320 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
4321 {
4322 gimple *use_stmt = USE_STMT (use_p);
4323 if (is_gimple_debug (use_stmt))
4324 continue;
4325 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4326 nlatch_def_loop_uses++;
4327 else
4328 {
4329 /* We can have more than one loop-closed PHI. */
4330 lcphis.safe_push (as_a <gphi *> (use_stmt));
4331 if (nested_in_vect_loop
4332 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
4333 == vect_double_reduction_def))
4334 inner_loop_of_double_reduc = true;
4335 }
4336 }
4337
4338 /* If we are vectorizing an inner reduction we are executing that
4339 in the original order only in case we are not dealing with a
4340 double reduction. */
4341 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
4342 {
4343 if (dump_enabled_p ())
4344 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
4345 "detected nested cycle: ");
4346 return def_stmt_info;
4347 }
4348
4349 /* When the inner loop of a double reduction ends up with more than
4350 one loop-closed PHI we have failed to classify alternate such
4351 PHIs as double reduction, leading to wrong code. See PR103237. */
4352 if (inner_loop_of_double_reduc && lcphis.length () != 1)
4353 {
4354 if (dump_enabled_p ())
4355 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4356 "unhandle double reduction\n");
4357 return NULL;
4358 }
4359
4360 /* If this isn't a nested cycle or if the nested cycle reduction value
4361 is used outside of the inner loop we cannot handle uses of the reduction
4362 value. */
4363 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
4364 {
4365 if (dump_enabled_p ())
4366 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4367 "reduction used in loop.\n")
4368 return NULL;
4369 }
4370
4371 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
4372 defined in the inner loop. */
4373 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
4374 {
4375 tree op1 = PHI_ARG_DEF (def_stmt, 0);
4376 if (gimple_phi_num_args (def_stmt) != 1
4377 || TREE_CODE (op1) != SSA_NAME)
4378 {
4379 if (dump_enabled_p ())
4380 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4381 "unsupported phi node definition.\n");
4382
4383 return NULL;
4384 }
4385
4386 /* Verify there is an inner cycle composed of the PHI phi_use_stmt
4387 and the latch definition op1. */
4388 gimple *def1 = SSA_NAME_DEF_STMT (op1);
4389 if (gimple_bb (def1)
4390 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4391 && loop->inner
4392 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
4393 && (is_gimple_assign (def1) || is_gimple_call (def1))
4394 && is_a <gphi *> (phi_use_stmt)
4395 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
4396 && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
4397 loop_latch_edge (loop->inner))))
4398 {
4399 if (dump_enabled_p ())
4400 report_vect_op (MSG_NOTE, def_stmt,
4401 "detected double reduction: ");
4402
4403 *double_reduc = true;
4404 return def_stmt_info;
4405 }
4406
4407 return NULL;
4408 }
4409
4410 /* Look for the expression computing latch_def from the loop PHI result. */
4411 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4412 code_helper code;
4413 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
4414 path))
4415 {
4416 STMT_VINFO_REDUC_CODE (phi_info) = code;
4417 if (code == COND_EXPR && !nested_in_vect_loop)
4418 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
4419
4420 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
4421 reduction chain for which the additional restriction is that
4422 all operations in the chain are the same. */
4423 auto_vec<stmt_vec_info, 8> reduc_chain;
4424 unsigned i;
4425 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
/* Walk the path from the latch definition back to the PHI (PATH is
   stored in that reverse order, hence the descending loop).  */
4426 for (i = path.length () - 1; i >= 1; --i)
4427 {
4428 gimple *stmt = USE_STMT (path[i].second);
4429 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
4430 gimple_match_op op;
4431 if (!gimple_extract_op (stmt, &op))
4432 gcc_unreachable ();
4433 if (gassign *assign = dyn_cast<gassign *> (stmt))
4434 STMT_VINFO_REDUC_IDX (stmt_info)
4435 = path[i].second->use - gimple_assign_rhs1_ptr (assign);
4436 else
4437 {
4438 gcall *call = as_a<gcall *> (stmt);
4439 STMT_VINFO_REDUC_IDX (stmt_info)
4440 = path[i].second->use - gimple_call_arg_ptr (call, 0);
4441 }
4442 bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
4443 && (i == 1 || i == path.length () - 1));
4444 if ((op.code != code && !leading_conversion)
4445 /* We can only handle the final value in epilogue
4446 generation for reduction chains. */
4447 || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
4448 is_slp_reduc = false;
4449 /* For reduction chains we support a trailing/leading
4450 conversions. We do not store those in the actual chain. */
4451 if (leading_conversion)
4452 continue;
4453 reduc_chain.safe_push (stmt_info);
4454 }
4455 if (slp && is_slp_reduc && reduc_chain.length () > 1)
4456 {
/* Link the chain members through REDUC_GROUP_{FIRST,NEXT}_ELEMENT,
   terminated with a NULL next pointer.  */
4457 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
4458 {
4459 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
4460 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
4461 }
4462 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
4463 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
4464
4465 /* Save the chain for further analysis in SLP detection. */
4466 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
4467 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
4468
4469 *reduc_chain_p = true;
4470 if (dump_enabled_p ())
4471 dump_printf_loc (MSG_NOTE, vect_location,
4472 "reduction: detected reduction chain\n");
4473 }
4474 else if (dump_enabled_p ())
4475 dump_printf_loc (MSG_NOTE, vect_location,
4476 "reduction: detected reduction\n");
4477
4478 return def_stmt_info;
4479 }
4480
4481 if (dump_enabled_p ())
4482 dump_printf_loc (MSG_NOTE, vect_location,
4483 "reduction: unknown pattern\n");
4484
4485 return NULL;
4486 }
4487
4488 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4489 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4490 or -1 if not known. */
4491
4492 static int
4493 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
4494 {
4495 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4496 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
4497 {
4498 if (dump_enabled_p ())
4499 dump_printf_loc (MSG_NOTE, vect_location,
4500 "cost model: epilogue peel iters set to vf/2 "
4501 "because loop iterations are unknown .\n");
4502 return assumed_vf / 2;
4503 }
4504 else
4505 {
4506 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4507 peel_iters_prologue = MIN (niters, peel_iters_prologue);
4508 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
4509 /* If we need to peel for gaps, but no peeling is required, we have to
4510 peel VF iterations. */
4511 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4512 peel_iters_epilogue = assumed_vf;
4513 return peel_iters_epilogue;
4514 }
4515 }
4516
4517 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
4518 int
4519 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4520 int *peel_iters_epilogue,
4521 stmt_vector_for_cost *scalar_cost_vec,
4522 stmt_vector_for_cost *prologue_cost_vec,
4523 stmt_vector_for_cost *epilogue_cost_vec)
4524 {
4525 int retval = 0;
4526
4527 *peel_iters_epilogue
4528 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4529
4530 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4531 {
4532 /* If peeled iterations are known but number of scalar loop
4533 iterations are unknown, count a taken branch per peeled loop. */
4534 if (peel_iters_prologue > 0)
4535 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4536 vect_prologue);
4537 if (*peel_iters_epilogue > 0)
4538 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4539 vect_epilogue);
4540 }
4541
4542 stmt_info_for_cost *si;
4543 int j;
4544 if (peel_iters_prologue)
4545 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4546 retval += record_stmt_cost (prologue_cost_vec,
4547 si->count * peel_iters_prologue,
4548 si->kind, si->stmt_info, si->misalign,
4549 vect_prologue);
4550 if (*peel_iters_epilogue)
4551 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4552 retval += record_stmt_cost (epilogue_cost_vec,
4553 si->count * *peel_iters_epilogue,
4554 si->kind, si->stmt_info, si->misalign,
4555 vect_epilogue);
4556
4557 return retval;
4558 }
4559
/* Function vect_estimate_min_profitable_iters

   Return the number of iterations required for the vector version of the
   loop to be profitable relative to the cost of the scalar version of the
   loop.

   *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
   of iterations for vectorization.  -1 value means loop vectorization
   is not profitable.  This returned value may be used for dynamic
   profitability check.

   *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
   for static check against estimated number of iterations.

   SUGGESTED_UNROLL_FACTOR, if nonnull, is filled in by the target via
   finish_cost with its preferred unrolling factor for the vector loop;
   it is reset to 1 here if unrolling would push the vectorization
   factor beyond LOOP_VINFO_MAX_VECT_FACTOR.  */

static void
vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
				    int *ret_min_profitable_niters,
				    int *ret_min_profitable_estimate,
				    unsigned *suggested_unroll_factor)
{
  int min_profitable_iters;
  int min_profitable_estimate;
  int peel_iters_prologue;
  int peel_iters_epilogue;
  unsigned vec_inside_cost = 0;
  int vec_outside_cost = 0;
  unsigned vec_prologue_cost = 0;
  unsigned vec_epilogue_cost = 0;
  int scalar_single_iter_cost = 0;
  int scalar_outside_cost = 0;
  int assumed_vf = vect_vf_for_cost (loop_vinfo);
  int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
  vector_costs *target_cost_data = loop_vinfo->vector_costs;

  /* Cost model disabled.  */
  if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
      *ret_min_profitable_niters = 0;
      *ret_min_profitable_estimate = 0;
      return;
    }

  /* Requires loop versioning tests to handle misalignment.  */
  if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
    {
      /*  FIXME: Make cost depend on complexity of individual check.  */
      unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
      (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
      if (dump_enabled_p ())
	dump_printf (MSG_NOTE,
		     "cost model: Adding cost of checks for loop "
		     "versioning to treat misalignment.\n");
    }

  /* Requires loop versioning with alias checks.  */
  if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
    {
      /*  FIXME: Make cost depend on complexity of individual check.  */
      unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
      (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
      len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
      if (len)
	/* Count LEN - 1 ANDs and LEN comparisons.  */
	(void) add_stmt_cost (target_cost_data, len * 2 - 1,
			      scalar_stmt, vect_prologue);
      len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
      if (len)
	{
	  /* Count LEN - 1 ANDs and LEN comparisons.  */
	  unsigned int nstmts = len * 2 - 1;
	  /* +1 for each bias that needs adding.  */
	  for (unsigned int i = 0; i < len; ++i)
	    if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
	      nstmts += 1;
	  (void) add_stmt_cost (target_cost_data, nstmts,
				scalar_stmt, vect_prologue);
	}
      if (dump_enabled_p ())
	dump_printf (MSG_NOTE,
		     "cost model: Adding cost of checks for loop "
		     "versioning aliasing.\n");
    }

  /* Requires loop versioning with niter checks.  */
  if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
    {
      /*  FIXME: Make cost depend on complexity of individual check.  */
      (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
			    NULL, NULL, NULL_TREE, 0, vect_prologue);
      if (dump_enabled_p ())
	dump_printf (MSG_NOTE,
		     "cost model: Adding cost of checks for loop "
		     "versioning niters.\n");
    }

  if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
    /* One branch to select the versioned loop at runtime.  */
    (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
			  vect_prologue);

  /* Count statements in scalar loop.  Using this as scalar cost for a single
     iteration for now.

     TODO: Add outer loop support.

     TODO: Consider assigning different costs to different scalar
     statements.  */

  scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();

  /* Add additional cost for the peeled instructions in prologue and epilogue
     loop.  (For fully-masked loops there will be no peeling.)

     FORNOW: If we don't know the value of peel_iters for prologue or epilogue
     at compile-time - we assume it's vf/2 (the worst would be vf-1).

     TODO: Build an expression that represents peel_iters for prologue and
     epilogue to be used in a run-time test.  */

  bool prologue_need_br_taken_cost = false;
  bool prologue_need_br_not_taken_cost = false;

  /* Calculate peel_iters_prologue.  */
  if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
    peel_iters_prologue = 0;
  else if (npeel < 0)
    {
      peel_iters_prologue = assumed_vf / 2;
      if (dump_enabled_p ())
	dump_printf (MSG_NOTE, "cost model: "
		     "prologue peel iters set to vf/2.\n");

      /* If peeled iterations are unknown, count a taken branch and a not taken
	 branch per peeled loop.  Even if scalar loop iterations are known,
	 vector iterations are not known since peeled prologue iterations are
	 not known.  Hence guards remain the same.  */
      prologue_need_br_taken_cost = true;
      prologue_need_br_not_taken_cost = true;
    }
  else
    {
      peel_iters_prologue = npeel;
      if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
	/* If peeled iterations are known but number of scalar loop
	   iterations are unknown, count a taken branch per peeled loop.  */
	prologue_need_br_taken_cost = true;
    }

  bool epilogue_need_br_taken_cost = false;
  bool epilogue_need_br_not_taken_cost = false;

  /* Calculate peel_iters_epilogue.  */
  if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    /* We need to peel exactly one iteration for gaps.  */
    peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
  else if (npeel < 0)
    {
      /* If peeling for alignment is unknown, loop bound of main loop
	 becomes unknown.  */
      peel_iters_epilogue = assumed_vf / 2;
      if (dump_enabled_p ())
	dump_printf (MSG_NOTE, "cost model: "
		     "epilogue peel iters set to vf/2 because "
		     "peeling for alignment is unknown.\n");

      /* See the same reason above in peel_iters_prologue calculation.  */
      epilogue_need_br_taken_cost = true;
      epilogue_need_br_not_taken_cost = true;
    }
  else
    {
      peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
      if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
	/* If peeled iterations are known but number of scalar loop
	   iterations are unknown, count a taken branch per peeled loop.  */
	epilogue_need_br_taken_cost = true;
    }

  stmt_info_for_cost *si;
  int j;
  /* Add costs associated with peel_iters_prologue: each peeled iteration
     executes the scalar loop body once.  */
  if (peel_iters_prologue)
    FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
      {
	(void) add_stmt_cost (target_cost_data,
			      si->count * peel_iters_prologue, si->kind,
			      si->stmt_info, si->node, si->vectype,
			      si->misalign, vect_prologue);
      }

  /* Add costs associated with peel_iters_epilogue.  */
  if (peel_iters_epilogue)
    FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
      {
	(void) add_stmt_cost (target_cost_data,
			      si->count * peel_iters_epilogue, si->kind,
			      si->stmt_info, si->node, si->vectype,
			      si->misalign, vect_epilogue);
      }

  /* Add possible cond_branch_taken/cond_branch_not_taken cost.  */

  if (prologue_need_br_taken_cost)
    (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
			  vect_prologue);

  if (prologue_need_br_not_taken_cost)
    (void) add_stmt_cost (target_cost_data, 1,
			  cond_branch_not_taken, vect_prologue);

  if (epilogue_need_br_taken_cost)
    (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
			  vect_epilogue);

  if (epilogue_need_br_not_taken_cost)
    (void) add_stmt_cost (target_cost_data, 1,
			  cond_branch_not_taken, vect_epilogue);

  /* Take care of special costs for rgroup controls of partial vectors.  */
  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
      && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
	  == vect_partial_vectors_avx512))
    {
      /* Calculate how many masks we need to generate.  */
      unsigned int num_masks = 0;
      bool need_saturation = false;
      for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
	if (rgm.type)
	  {
	    unsigned nvectors = rgm.factor;
	    num_masks += nvectors;
	    /* A comparison type narrower than the IV type means the IV
	       value has to be saturated before the compare.  */
	    if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
		< TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
	      need_saturation = true;
	  }

      /* ???  The target isn't able to identify the costs below as
	 producing masks so it cannot penalize cases where we'd run
	 out of mask registers for example.  */

      /* ???  We are also failing to account for smaller vector masks
	 we generate by splitting larger masks in vect_get_loop_mask.  */

      /* In the worst case, we need to generate each mask in the prologue
	 and in the loop body.  We need one splat per group and one
	 compare per mask.

	 Sometimes the prologue mask will fold to a constant,
	 so the actual prologue cost might be smaller.  However, it's
	 simpler and safer to use the worst-case cost; if this ends up
	 being the tie-breaker between vectorizing or not, then it's
	 probably better not to vectorize.  */
      (void) add_stmt_cost (target_cost_data,
			    num_masks
			    + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
			    vector_stmt, NULL, NULL, NULL_TREE, 0,
			    vect_prologue);
      (void) add_stmt_cost (target_cost_data,
			    num_masks
			    + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
			    vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);

      /* When we need saturation we need it both in the prologue and
	 the epilogue.  */
      if (need_saturation)
	{
	  (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
				NULL, NULL, NULL_TREE, 0, vect_prologue);
	  (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
				NULL, NULL, NULL_TREE, 0, vect_body);
	}
    }
  else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
	   && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
	       == vect_partial_vectors_while_ult))
    {
      /* Calculate how many masks we need to generate.  */
      unsigned int num_masks = 0;
      rgroup_controls *rgm;
      unsigned int num_vectors_m1;
      FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
			num_vectors_m1, rgm)
	if (rgm->type)
	  num_masks += num_vectors_m1 + 1;
      gcc_assert (num_masks > 0);

      /* In the worst case, we need to generate each mask in the prologue
	 and in the loop body.  One of the loop body mask instructions
	 replaces the comparison in the scalar loop, and since we don't
	 count the scalar comparison against the scalar body, we shouldn't
	 count that vector instruction against the vector body either.

	 Sometimes we can use unpacks instead of generating prologue
	 masks and sometimes the prologue mask will fold to a constant,
	 so the actual prologue cost might be smaller.  However, it's
	 simpler and safer to use the worst-case cost; if this ends up
	 being the tie-breaker between vectorizing or not, then it's
	 probably better not to vectorize.  */
      (void) add_stmt_cost (target_cost_data, num_masks,
			    vector_stmt, NULL, NULL, NULL_TREE, 0,
			    vect_prologue);
      (void) add_stmt_cost (target_cost_data, num_masks - 1,
			    vector_stmt, NULL, NULL, NULL_TREE, 0,
			    vect_body);
    }
  else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
    {
      /* Referring to the functions vect_set_loop_condition_partial_vectors
	 and vect_set_loop_controls_directly, we need to generate each
	 length in the prologue and in the loop body if required.  Although
	 there are some possible optimizations, we consider the worst case
	 here.  */

      bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
      signed char partial_load_store_bias
	= LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
      bool need_iterate_p
	= (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
	   && !vect_known_niters_smaller_than_vf (loop_vinfo));

      /* Calculate how many statements to be added.  */
      unsigned int prologue_stmts = 0;
      unsigned int body_stmts = 0;

      rgroup_controls *rgc;
      unsigned int num_vectors_m1;
      FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
	if (rgc->type)
	  {
	    /* May need one SHIFT for nitems_total computation.  */
	    unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
	    if (nitems != 1 && !niters_known_p)
	      prologue_stmts += 1;

	    /* May need one MAX and one MINUS for wrap around.  */
	    if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
	      prologue_stmts += 2;

	    /* Need one MAX and one MINUS for each batch limit excepting for
	       the 1st one.  */
	    prologue_stmts += num_vectors_m1 * 2;

	    unsigned int num_vectors = num_vectors_m1 + 1;

	    /* Need to set up lengths in prologue, only one MIN required
	       for each since start index is zero.  */
	    prologue_stmts += num_vectors;

	    /* If we have a non-zero partial load bias, we need one PLUS
	       to adjust the load length.  */
	    if (partial_load_store_bias != 0)
	      body_stmts += 1;

	    unsigned int length_update_cost = 0;
	    if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
	      /* For decrement IV style, each only needs a single SELECT_VL
		 or MIN since beginning to calculate the number of elements
		 need to be processed in current iteration.  */
	      length_update_cost = 1;
	    else
	      /* For increment IV style, each may need two MINs and one
		 MINUS to update lengths in body for next iteration.  */
	      length_update_cost = 3;

	    if (need_iterate_p)
	      body_stmts += length_update_cost * num_vectors;
	  }

      (void) add_stmt_cost (target_cost_data, prologue_stmts,
			    scalar_stmt, vect_prologue);
      (void) add_stmt_cost (target_cost_data, body_stmts,
			    scalar_stmt, vect_body);
    }

  /* FORNOW: The scalar outside cost is incremented in one of the
     following ways:

     1. The vectorizer checks for alignment and aliasing and generates
     a condition that allows dynamic vectorization.  A cost model
     check is ANDED with the versioning condition.  Hence scalar code
     path now has the added cost of the versioning check.

       if (cost > th & versioning_check)
	 jmp to vector code

     Hence run-time scalar is incremented by not-taken branch cost.

     2. The vectorizer then checks if a prologue is required.  If the
     cost model check was not done before during versioning, it has to
     be done before the prologue check.

       if (cost <= th)
	 prologue = scalar_iters
       if (prologue == 0)
	 jmp to vector code
       else
	 execute prologue
       if (prologue == num_iters)
	 go to exit

     Hence the run-time scalar cost is incremented by a taken branch,
     plus a not-taken branch, plus a taken branch cost.

     3. The vectorizer then checks if an epilogue is required.  If the
     cost model check was not done before during prologue check, it
     has to be done with the epilogue check.

       if (prologue == 0)
	 jmp to vector code
       else
	 execute prologue
       if (prologue == num_iters)
	 go to exit
       vector code:
	 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
	   jmp to epilogue

     Hence the run-time scalar cost should be incremented by 2 taken
     branches.

     TODO: The back end may reorder the BBS's differently and reverse
     conditions/branch directions.  Change the estimates below to
     something more reasonable.  */

  /* If the number of iterations is known and we do not do versioning, we can
     decide whether to vectorize at compile time.  Hence the scalar version
     do not carry cost model guard costs.  */
  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || LOOP_REQUIRES_VERSIONING (loop_vinfo))
    {
      /* Cost model check occurs at versioning.  */
      if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
	scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
      else
	{
	  /* Cost model check occurs at prologue generation.  */
	  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
	    scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
	      + vect_get_stmt_cost (cond_branch_not_taken);
	  /* Cost model check occurs at epilogue generation.  */
	  else
	    scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
	}
    }

  /* Complete the target-specific cost calculations.  */
  finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
	       &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
	       suggested_unroll_factor);

  /* Reject a target-suggested unroll factor that would make the unrolled
     vectorization factor exceed the maximum supported one.  */
  if (suggested_unroll_factor && *suggested_unroll_factor > 1
      && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
      && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
		    *suggested_unroll_factor,
		    LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "can't unroll as unrolled vectorization factor larger"
			 " than maximum vectorization factor: "
			 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
			 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
      *suggested_unroll_factor = 1;
    }

  vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
      dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
		   vec_inside_cost);
      dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
		   vec_prologue_cost);
      dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
		   vec_epilogue_cost);
      dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
		   scalar_single_iter_cost);
      dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
		   scalar_outside_cost);
      dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
		   vec_outside_cost);
      dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
		   peel_iters_prologue);
      dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
		   peel_iters_epilogue);
    }

  /* Calculate number of iterations required to make the vector version
     profitable, relative to the loop bodies only.  The following condition
     must hold true:
     SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
     where
     SIC = scalar iteration cost, VIC = vector iteration cost,
     VOC = vector outside cost, VF = vectorization factor,
     NPEEL = prologue iterations + epilogue iterations,
     SOC = scalar outside cost for run time cost model check.  */

  /* Cost saved by one vector iteration relative to the ASSUMED_VF scalar
     iterations it replaces; non-positive means vectorizing can never pay
     off in the loop body.  */
  int saving_per_viter = (scalar_single_iter_cost * assumed_vf
			  - vec_inside_cost);
  if (saving_per_viter <= 0)
    {
      if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
	warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
		    "vectorization did not happen for a simd loop");

      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "cost model: the vector iteration cost = %d "
			 "divided by the scalar iteration cost = %d "
			 "is greater or equal to the vectorization factor = %d"
			 ".\n",
			 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
      *ret_min_profitable_niters = -1;
      *ret_min_profitable_estimate = -1;
      return;
    }

  /* ??? The "if" arm is written to handle all cases; see below for what
     we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
  if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    {
      /* Rewriting the condition above in terms of the number of
	 vector iterations (vniters) rather than the number of
	 scalar iterations (niters) gives:

	 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC

	 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC

	 For integer N, X and Y when X > 0:

	 N * X > Y <==> N >= (Y /[floor] X) + 1.  */
      int outside_overhead = (vec_outside_cost
			      - scalar_single_iter_cost * peel_iters_prologue
			      - scalar_single_iter_cost * peel_iters_epilogue
			      - scalar_outside_cost);
      /* We're only interested in cases that require at least one
	 vector iteration.  */
      int min_vec_niters = 1;
      if (outside_overhead > 0)
	min_vec_niters = outside_overhead / saving_per_viter + 1;

      if (dump_enabled_p ())
	dump_printf (MSG_NOTE, "  Minimum number of vector iterations: %d\n",
		     min_vec_niters);

      if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
	{
	  /* Now that we know the minimum number of vector iterations,
	     find the minimum niters for which the scalar cost is larger:

	     SIC * niters > VIC * vniters + VOC - SOC

	     We know that the minimum niters is no more than
	     vniters * VF + NPEEL, but it might be (and often is) less
	     than that if a partial vector iteration is cheaper than the
	     equivalent scalar code.  */
	  int threshold = (vec_inside_cost * min_vec_niters
			   + vec_outside_cost
			   - scalar_outside_cost);
	  if (threshold <= 0)
	    min_profitable_iters = 1;
	  else
	    min_profitable_iters = threshold / scalar_single_iter_cost + 1;
	}
      else
	/* Convert the number of vector iterations into a number of
	   scalar iterations.  */
	min_profitable_iters = (min_vec_niters * assumed_vf
				+ peel_iters_prologue
				+ peel_iters_epilogue);
    }
  else
    {
      min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
			      * assumed_vf
			      - vec_inside_cost * peel_iters_prologue
			      - vec_inside_cost * peel_iters_epilogue);
      if (min_profitable_iters <= 0)
	min_profitable_iters = 0;
      else
	{
	  min_profitable_iters /= saving_per_viter;

	  /* The division above floors; bump by one if the costs still
	     tie or favor the scalar loop at the computed count.  */
	  if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
	      <= (((int) vec_inside_cost * min_profitable_iters)
		  + (((int) vec_outside_cost - scalar_outside_cost)
		     * assumed_vf)))
	    min_profitable_iters++;
	}
    }

  if (dump_enabled_p ())
    dump_printf (MSG_NOTE,
		 "  Calculated minimum iters for profitability: %d\n",
		 min_profitable_iters);

  if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
      && min_profitable_iters < (assumed_vf + peel_iters_prologue))
    /* We want the vectorized loop to execute at least once.  */
    min_profitable_iters = assumed_vf + peel_iters_prologue;
  else if (min_profitable_iters < peel_iters_prologue)
    /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
       vectorized loop executes at least once.  */
    min_profitable_iters = peel_iters_prologue;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "  Runtime profitability threshold = %d\n",
		     min_profitable_iters);

  *ret_min_profitable_niters = min_profitable_iters;

  /* Calculate number of iterations required to make the vector version
     profitable, relative to the loop bodies only.

     Non-vectorized variant is SIC * niters and it must win over vector
     variant on the expected loop trip count.  The following condition must hold true:
     SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC  */

  if (vec_outside_cost <= 0)
    min_profitable_estimate = 0;
  /* ??? This "else if" arm is written to handle all cases; see below for
     what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
  else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    {
      /* This is a repeat of the code above, but with + SOC rather
	 than - SOC.  */
      int outside_overhead = (vec_outside_cost
			      - scalar_single_iter_cost * peel_iters_prologue
			      - scalar_single_iter_cost * peel_iters_epilogue
			      + scalar_outside_cost);
      int min_vec_niters = 1;
      if (outside_overhead > 0)
	min_vec_niters = outside_overhead / saving_per_viter + 1;

      if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
	{
	  int threshold = (vec_inside_cost * min_vec_niters
			   + vec_outside_cost
			   + scalar_outside_cost);
	  min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
	}
      else
	min_profitable_estimate = (min_vec_niters * assumed_vf
				   + peel_iters_prologue
				   + peel_iters_epilogue);
    }
  else
    {
      min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
				 * assumed_vf
				 - vec_inside_cost * peel_iters_prologue
				 - vec_inside_cost * peel_iters_epilogue)
				 / ((scalar_single_iter_cost * assumed_vf)
				    - vec_inside_cost);
    }
  min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "  Static estimate profitability threshold = %d\n",
		     min_profitable_estimate);

  *ret_min_profitable_estimate = min_profitable_estimate;
}
5227
5228 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
5229 vector elements (not bits) for a vector with NELT elements. */
5230 static void
5231 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
5232 vec_perm_builder *sel)
5233 {
5234 /* The encoding is a single stepped pattern. Any wrap-around is handled
5235 by vec_perm_indices. */
5236 sel->new_vector (nelt, 1, 3);
5237 for (unsigned int i = 0; i < 3; i++)
5238 sel->quick_push (i + offset);
5239 }
5240
5241 /* Checks whether the target supports whole-vector shifts for vectors of mode
5242 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
5243 it supports vec_perm_const with masks for all necessary shift amounts. */
5244 static bool
5245 have_whole_vector_shift (machine_mode mode)
5246 {
5247 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
5248 return true;
5249
5250 /* Variable-length vectors should be handled via the optab. */
5251 unsigned int nelt;
5252 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5253 return false;
5254
5255 vec_perm_builder sel;
5256 vec_perm_indices indices;
5257 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
5258 {
5259 calc_vec_perm_mask_for_shift (i, nelt, &sel);
5260 indices.new_vector (sel, 2, nelt);
5261 if (!can_vec_perm_const_p (mode, mode, indices, false))
5262 return false;
5263 }
5264 return true;
5265 }
5266
5267 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
5268 multiplication operands have differing signs and (b) we intend
5269 to emulate the operation using a series of signed DOT_PROD_EXPRs.
5270 See vect_emulate_mixed_dot_prod for the actual sequence used. */
5271
5272 static bool
5273 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
5274 stmt_vec_info stmt_info)
5275 {
5276 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
5277 if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
5278 return false;
5279
5280 tree rhs1 = gimple_assign_rhs1 (assign);
5281 tree rhs2 = gimple_assign_rhs2 (assign);
5282 if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
5283 return false;
5284
5285 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5286 gcc_assert (reduc_info->is_reduc_info);
5287 return !directly_supported_p (DOT_PROD_EXPR,
5288 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
5289 optab_vector_mixed_sign);
5290 }
5291
5292 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
5293 functions. Design better to avoid maintenance issues. */
5294
/* Function vect_model_reduction_cost.

   Models cost for a reduction operation, including the vector ops
   generated within the strip-mine loop in some cases, the initial
   definition before the loop, and the epilogue code that must be generated.

   The costs are recorded in COST_VEC.  REDUC_FN is the internal function
   implementing the reduction, or IFN_LAST if none is available;
   REDUCTION_TYPE selects the costing strategy and NCOPIES is the number
   of vector statements generated per scalar iteration.  */

static void
vect_model_reduction_cost (loop_vec_info loop_vinfo,
			   stmt_vec_info stmt_info, internal_fn reduc_fn,
			   vect_reduction_type reduction_type,
			   int ncopies, stmt_vector_for_cost *cost_vec)
{
  int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
  tree vectype;
  machine_mode mode;
  class loop *loop = NULL;

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Condition reductions generate two reductions in the loop.  */
  if (reduction_type == COND_REDUCTION)
    ncopies *= 2;

  vectype = STMT_VINFO_VECTYPE (stmt_info);
  mode = TYPE_MODE (vectype);
  stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);

  /* Extract the operation so op.code/op.type describe the scalar
     reduction statement.  */
  gimple_match_op op;
  if (!gimple_extract_op (orig_stmt_info->stmt, &op))
    gcc_unreachable ();

  bool emulated_mixed_dot_prod
    = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
  if (reduction_type == EXTRACT_LAST_REDUCTION)
    /* No extra instructions are needed in the prologue.  The loop body
       operations are costed in vectorizable_condition.  */
    inside_cost = 0;
  else if (reduction_type == FOLD_LEFT_REDUCTION)
    {
      /* No extra instructions needed in the prologue.  */
      prologue_cost = 0;

      if (reduc_fn != IFN_LAST)
	/* Count one reduction-like operation per vector.  */
	inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
					stmt_info, 0, vect_body);
      else
	{
	  /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
	  unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
	  inside_cost = record_stmt_cost (cost_vec, nelements,
					  vec_to_scalar, stmt_info, 0,
					  vect_body);
	  inside_cost += record_stmt_cost (cost_vec, nelements,
					   scalar_stmt, stmt_info, 0,
					   vect_body);
	}
    }
  else
    {
      /* Add in the cost of the initial definitions.  */
      int prologue_stmts;
      if (reduction_type == COND_REDUCTION)
	/* For cond reductions we have four vectors: initial index, step,
	   initial result of the data reduction, initial value of the index
	   reduction.  */
	prologue_stmts = 4;
      else if (emulated_mixed_dot_prod)
	/* We need the initial reduction value and two invariants:
	   one that contains the minimum signed value and one that
	   contains half of its negative.  */
	prologue_stmts = 3;
      else
	prologue_stmts = 1;
      prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
					 scalar_to_vec, stmt_info, 0,
					 vect_prologue);
    }

  /* Determine cost of epilogue code.

     We have a reduction operator that will reduce the vector in one statement.
     Also requires scalar extract.  */

  if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
    {
      if (reduc_fn != IFN_LAST)
	{
	  if (reduction_type == COND_REDUCTION)
	    {
	      /* An EQ stmt and an COND_EXPR stmt.  */
	      epilogue_cost += record_stmt_cost (cost_vec, 2,
						 vector_stmt, stmt_info, 0,
						 vect_epilogue);
	      /* Reduction of the max index and a reduction of the found
		 values.  */
	      epilogue_cost += record_stmt_cost (cost_vec, 2,
						 vec_to_scalar, stmt_info, 0,
						 vect_epilogue);
	      /* A broadcast of the max value.  */
	      epilogue_cost += record_stmt_cost (cost_vec, 1,
						 scalar_to_vec, stmt_info, 0,
						 vect_epilogue);
	    }
	  else
	    {
	      epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
						 stmt_info, 0, vect_epilogue);
	      epilogue_cost += record_stmt_cost (cost_vec, 1,
						 vec_to_scalar, stmt_info, 0,
						 vect_epilogue);
	    }
	}
      else if (reduction_type == COND_REDUCTION)
	{
	  unsigned estimated_nunits = vect_nunits_for_cost (vectype);
	  /* Extraction of scalar elements.  */
	  epilogue_cost += record_stmt_cost (cost_vec,
					     2 * estimated_nunits,
					     vec_to_scalar, stmt_info, 0,
					     vect_epilogue);
	  /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
	  epilogue_cost += record_stmt_cost (cost_vec,
					     2 * estimated_nunits - 3,
					     scalar_stmt, stmt_info, 0,
					     vect_epilogue);
	}
      else if (reduction_type == EXTRACT_LAST_REDUCTION
	       || reduction_type == FOLD_LEFT_REDUCTION)
	/* No extra instructions needed in the epilogue.  */
	;
      else
	{
	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
	  tree bitsize = TYPE_SIZE (op.type);
	  int element_bitsize = tree_to_uhwi (bitsize);
	  int nelements = vec_size_in_bits / element_bitsize;

	  if (op.code == COND_EXPR)
	    op.code = MAX_EXPR;

	  /* We have a whole vector shift available.  */
	  if (VECTOR_MODE_P (mode)
	      && directly_supported_p (op.code, vectype)
	      && have_whole_vector_shift (mode))
	    {
	      /* Final reduction via vector shifts and the reduction operator.
		 Also requires scalar extract.  Log2 halving steps, each a
		 shift plus a reduction op.  */
	      epilogue_cost += record_stmt_cost (cost_vec,
						 exact_log2 (nelements) * 2,
						 vector_stmt, stmt_info, 0,
						 vect_epilogue);
	      epilogue_cost += record_stmt_cost (cost_vec, 1,
						 vec_to_scalar, stmt_info, 0,
						 vect_epilogue);
	    }
	  else
	    /* Use extracts and reduction op for final reduction.  For N
	       elements, we have N extracts and N-1 reduction ops.  */
	    epilogue_cost += record_stmt_cost (cost_vec,
					       nelements + nelements - 1,
					       vector_stmt, stmt_info, 0,
					       vect_epilogue);
	}
    }

  if (dump_enabled_p ())
    dump_printf (MSG_NOTE,
		 "vect_model_reduction_cost: inside_cost = %d, "
		 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
		 prologue_cost, epilogue_cost);
}
5468
5469 /* SEQ is a sequence of instructions that initialize the reduction
5470 described by REDUC_INFO. Emit them in the appropriate place. */
5471
5472 static void
5473 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
5474 stmt_vec_info reduc_info, gimple *seq)
5475 {
5476 if (reduc_info->reused_accumulator)
5477 {
5478 /* When reusing an accumulator from the main loop, we only need
5479 initialization instructions if the main loop can be skipped.
5480 In that case, emit the initialization instructions at the end
5481 of the guard block that does the skip. */
5482 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5483 gcc_assert (skip_edge);
5484 gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
5485 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5486 }
5487 else
5488 {
5489 /* The normal case: emit the initialization instructions on the
5490 preheader edge. */
5491 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5492 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
5493 }
5494 }
5495
5496 /* Function get_initial_def_for_reduction
5497
5498 Input:
5499 REDUC_INFO - the info_for_reduction
5500 INIT_VAL - the initial value of the reduction variable
5501 NEUTRAL_OP - a value that has no effect on the reduction, as per
5502 neutral_op_for_reduction
5503
5504 Output:
5505 Return a vector variable, initialized according to the operation that
5506 STMT_VINFO performs. This vector will be used as the initial value
5507 of the vector of partial results.
5508
5509 The value we need is a vector in which element 0 has value INIT_VAL
5510 and every other element has value NEUTRAL_OP. */
5511
5512 static tree
5513 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
5514 stmt_vec_info reduc_info,
5515 tree init_val, tree neutral_op)
5516 {
5517 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5518 tree scalar_type = TREE_TYPE (init_val);
5519 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
5520 tree init_def;
5521 gimple_seq stmts = NULL;
5522
5523 gcc_assert (vectype);
5524
5525 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
5526 || SCALAR_FLOAT_TYPE_P (scalar_type));
5527
5528 gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
5529 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
5530
5531 if (operand_equal_p (init_val, neutral_op))
5532 {
5533 /* If both elements are equal then the vector described above is
5534 just a splat. */
5535 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5536 init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
5537 }
5538 else
5539 {
5540 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5541 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
5542 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
5543 {
5544 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
5545 element 0. */
5546 init_def = gimple_build_vector_from_val (&stmts, vectype,
5547 neutral_op);
5548 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
5549 vectype, init_def, init_val);
5550 }
5551 else
5552 {
5553 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
5554 tree_vector_builder elts (vectype, 1, 2);
5555 elts.quick_push (init_val);
5556 elts.quick_push (neutral_op);
5557 init_def = gimple_build_vector (&stmts, &elts);
5558 }
5559 }
5560
5561 if (stmts)
5562 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
5563 return init_def;
5564 }
5565
5566 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5567 which performs a reduction involving GROUP_SIZE scalar statements.
5568 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
5569 is nonnull, introducing extra elements of that value will not change the
5570 result. */
5571
static void
get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
				stmt_vec_info reduc_info,
				vec<tree> *vec_oprnds,
				unsigned int number_of_vectors,
				unsigned int group_size, tree neutral_op)
{
  vec<tree> &initial_values = reduc_info->reduc_initial_values;
  unsigned HOST_WIDE_INT nunits;
  unsigned j, number_of_places_left_in_vector;
  tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
  unsigned int i;

  /* Without a neutral value there must be one initial value per lane.  */
  gcc_assert (group_size == initial_values.length () || neutral_op);

  /* NUMBER_OF_COPIES is the number of times we need to use the same values in
     created vectors. It is greater than 1 if unrolling is performed.

     For example, we have two scalar operands, s1 and s2 (e.g., group of
     strided accesses of size two), while NUNITS is four (i.e., four scalars
     of this type can be packed in a vector). The output vector will contain
     two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
     will be 2).

     If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
     vectors containing the operands.

     For example, NUNITS is four as before, and the group size is 8
     (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
     {s5, s6, s7, s8}. */

  /* For variable-length vectors pretend each vector holds GROUP_SIZE
     elements; the neutral-op path below handles the trailing lanes.  */
  if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
    nunits = group_size;

  number_of_places_left_in_vector = nunits;
  bool constant_p = true;
  tree_vector_builder elts (vector_type, nunits, 1);
  elts.quick_grow (nunits);
  gimple_seq ctor_seq = NULL;
  /* Fill NUNITS lanes at a time, emitting one vector def whenever the
     builder is full.  */
  for (j = 0; j < nunits * number_of_vectors; ++j)
    {
      tree op;
      i = j % group_size;

      /* Get the def before the loop.  In reduction chain we have only
	 one initial value.  Else we have as many as PHIs in the group.  */
      if (i >= initial_values.length () || (j > i && neutral_op))
	op = neutral_op;
      else
	{
	  /* Cache the conversion in INITIAL_VALUES so later copies of
	     the same lane reuse it.  */
	  if (!useless_type_conversion_p (TREE_TYPE (vector_type),
					  TREE_TYPE (initial_values[i])))
	    initial_values[i] = gimple_convert (&ctor_seq,
						TREE_TYPE (vector_type),
						initial_values[i]);
	  op = initial_values[i];
	}

      /* Create 'vect_ = {op0,op1,...,opn}'.  */
      number_of_places_left_in_vector--;
      elts[nunits - number_of_places_left_in_vector - 1] = op;
      if (!CONSTANT_CLASS_P (op))
	constant_p = false;

      if (number_of_places_left_in_vector == 0)
	{
	  tree init;
	  if (constant_p && !neutral_op
	      ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
	      : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
	    /* Build the vector directly from ELTS.  */
	    init = gimple_build_vector (&ctor_seq, &elts);
	  else if (neutral_op)
	    {
	      /* Build a vector of the neutral value and shift the
		 other elements into place.  */
	      init = gimple_build_vector_from_val (&ctor_seq, vector_type,
						   neutral_op);
	      int k = nunits;
	      /* Trailing neutral lanes need no shift-insert.  */
	      while (k > 0 && elts[k - 1] == neutral_op)
		k -= 1;
	      while (k > 0)
		{
		  k -= 1;
		  init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
				       vector_type, init, elts[k]);
		}
	    }
	  else
	    {
	      /* First time round, duplicate ELTS to fill the
		 required number of vectors.  */
	      duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
					elts, number_of_vectors, *vec_oprnds);
	      /* duplicate_and_interleave produced all vectors at once,
		 so the fill loop is done.  */
	      break;
	    }
	  vec_oprnds->quick_push (init);

	  /* Restart the builder for the next vector def.  */
	  number_of_places_left_in_vector = nunits;
	  elts.new_vector (vector_type, nunits, 1);
	  elts.quick_grow (nunits);
	  constant_p = true;
	}
    }
  if (ctor_seq != NULL)
    vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
}
5679
5680 /* For a statement STMT_INFO taking part in a reduction operation return
5681 the stmt_vec_info the meta information is stored on. */
5682
5683 stmt_vec_info
5684 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5685 {
5686 stmt_info = vect_orig_stmt (stmt_info);
5687 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5688 if (!is_a <gphi *> (stmt_info->stmt)
5689 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5690 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5691 gphi *phi = as_a <gphi *> (stmt_info->stmt);
5692 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5693 {
5694 if (gimple_phi_num_args (phi) == 1)
5695 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5696 }
5697 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5698 {
5699 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5700 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5701 stmt_info = info;
5702 }
5703 return stmt_info;
5704 }
5705
5706 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5707 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5708 return false. */
5709
static bool
vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
				stmt_vec_info reduc_info)
{
  /* Only epilogue loops (which record their main loop) can reuse.  */
  loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
  if (!main_loop_vinfo)
    return false;

  /* Reuse is only implemented for plain tree-code reductions.  */
  if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
    return false;

  unsigned int num_phis = reduc_info->reduc_initial_values.length ();
  auto_vec<tree, 16> main_loop_results (num_phis);
  auto_vec<tree, 16> initial_values (num_phis);
  if (edge main_loop_edge = loop_vinfo->main_loop_edge)
    {
      /* The epilogue loop can be entered either from the main loop or
	 from an earlier guard block.  */
      edge skip_edge = loop_vinfo->skip_main_loop_edge;
      for (tree incoming_value : reduc_info->reduc_initial_values)
	{
	  /* Look for:

	       INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
				    INITIAL_VALUE(guard block)>.  */
	  gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);

	  gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
	  gcc_assert (gimple_bb (phi) == main_loop_edge->dest);

	  tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
	  tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);

	  main_loop_results.quick_push (from_main_loop);
	  initial_values.quick_push (from_skip);
	}
    }
  else
    /* The main loop dominates the epilogue loop.  */
    main_loop_results.splice (reduc_info->reduc_initial_values);

  /* See if the main loop has the kind of accumulator we need.  */
  vect_reusable_accumulator *accumulator
    = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
  if (!accumulator
      || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
      || !std::equal (main_loop_results.begin (), main_loop_results.end (),
		      accumulator->reduc_info->reduc_scalar_results.begin ()))
    return false;

  /* Handle the case where we can reduce wider vectors to narrower ones.  */
  tree vectype = STMT_VINFO_VECTYPE (reduc_info);
  tree old_vectype = TREE_TYPE (accumulator->reduc_input);
  unsigned HOST_WIDE_INT m;
  if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
			    TYPE_VECTOR_SUBPARTS (vectype), &m))
    return false;
  /* Check the intermediate vector types and operations are available.
     Each halving step needs the reduction operation on the intermediate
     type and a way of extracting the halves of the previous type.  */
  tree prev_vectype = old_vectype;
  poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
  while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
    {
      intermediate_nunits = exact_div (intermediate_nunits, 2);
      tree intermediate_vectype = get_related_vectype_for_scalar_type
	(TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
      if (!intermediate_vectype
	  || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
				    intermediate_vectype)
	  || !can_vec_extract (TYPE_MODE (prev_vectype),
			       TYPE_MODE (intermediate_vectype)))
	return false;
      prev_vectype = intermediate_vectype;
    }

  /* Non-SLP reductions might apply an adjustment after the reduction
     operation, in order to simplify the initialization of the accumulator.
     If the epilogue loop carries on from where the main loop left off,
     it should apply the same adjustment to the final reduction result.

     If the epilogue loop can also be entered directly (rather than via
     the main loop), we need to be able to handle that case in the same way,
     with the same adjustment.  (In principle we could add a PHI node
     to select the correct adjustment, but in practice that shouldn't be
     necessary.)  */
  tree main_adjustment
    = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
  if (loop_vinfo->main_loop_edge && main_adjustment)
    {
      gcc_assert (num_phis == 1);
      tree initial_value = initial_values[0];
      /* Check that we can use INITIAL_VALUE as the adjustment and
	 initialize the accumulator with a neutral value instead.  */
      if (!operand_equal_p (initial_value, main_adjustment))
	return false;
      code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
      initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
						    code, initial_value);
    }
  /* All checks passed: commit the reuse by rewriting REDUC_INFO's
     initial values and remembering the accumulator.  */
  STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
  reduc_info->reduc_initial_values.truncate (0);
  reduc_info->reduc_initial_values.splice (initial_values);
  reduc_info->reused_accumulator = accumulator;
  return true;
}
5814
5815 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5816 CODE emitting stmts before GSI. Returns a vector def of VECTYPE. */
5817
static tree
vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
			    gimple_seq *seq)
{
  /* Both vector types must have constant (fixed) unit counts here.  */
  unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
  unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
  tree stype = TREE_TYPE (vectype);
  tree new_temp = vec_def;
  /* Repeatedly split the current vector into two halves and combine
     them with CODE until we reach the width of VECTYPE.  */
  while (nunits > nunits1)
    {
      nunits /= 2;
      tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
							   stype, nunits);
      /* Size in bits of one half, and the bit offset of the high half.  */
      unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));

      /* The target has to make sure we support lowpart/highpart
	 extraction, either via direct vector extract or through
	 an integer mode punning.  */
      tree dst1, dst2;
      gimple *epilog_stmt;
      if (convert_optab_handler (vec_extract_optab,
				 TYPE_MODE (TREE_TYPE (new_temp)),
				 TYPE_MODE (vectype1))
	  != CODE_FOR_nothing)
	{
	  /* Extract sub-vectors directly once vec_extract becomes
	     a conversion optab.  */
	  dst1 = make_ssa_name (vectype1);
	  epilog_stmt
	    = gimple_build_assign (dst1, BIT_FIELD_REF,
				   build3 (BIT_FIELD_REF, vectype1,
					   new_temp, TYPE_SIZE (vectype1),
					   bitsize_int (0)));
	  gimple_seq_add_stmt_without_update (seq, epilog_stmt);
	  dst2 = make_ssa_name (vectype1);
	  epilog_stmt
	    = gimple_build_assign (dst2, BIT_FIELD_REF,
				   build3 (BIT_FIELD_REF, vectype1,
					   new_temp, TYPE_SIZE (vectype1),
					   bitsize_int (bitsize)));
	  gimple_seq_add_stmt_without_update (seq, epilog_stmt);
	}
      else
	{
	  /* Extract via punning to appropriately sized integer mode
	     vector.  */
	  tree eltype = build_nonstandard_integer_type (bitsize, 1);
	  tree etype = build_vector_type (eltype, 2);
	  gcc_assert (convert_optab_handler (vec_extract_optab,
					     TYPE_MODE (etype),
					     TYPE_MODE (eltype))
		      != CODE_FOR_nothing);
	  /* View-convert the whole vector to a two-element integer
	     vector, then extract each integer element and view-convert
	     it back to the half vector type.  */
	  tree tem = make_ssa_name (etype);
	  epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
					     build1 (VIEW_CONVERT_EXPR,
						     etype, new_temp));
	  gimple_seq_add_stmt_without_update (seq, epilog_stmt);
	  new_temp = tem;
	  tem = make_ssa_name (eltype);
	  epilog_stmt
	    = gimple_build_assign (tem, BIT_FIELD_REF,
				   build3 (BIT_FIELD_REF, eltype,
					   new_temp, TYPE_SIZE (eltype),
					   bitsize_int (0)));
	  gimple_seq_add_stmt_without_update (seq, epilog_stmt);
	  dst1 = make_ssa_name (vectype1);
	  epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
					     build1 (VIEW_CONVERT_EXPR,
						     vectype1, tem));
	  gimple_seq_add_stmt_without_update (seq, epilog_stmt);
	  tem = make_ssa_name (eltype);
	  epilog_stmt
	    = gimple_build_assign (tem, BIT_FIELD_REF,
				   build3 (BIT_FIELD_REF, eltype,
					   new_temp, TYPE_SIZE (eltype),
					   bitsize_int (bitsize)));
	  gimple_seq_add_stmt_without_update (seq, epilog_stmt);
	  dst2 = make_ssa_name (vectype1);
	  epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
					     build1 (VIEW_CONVERT_EXPR,
						     vectype1, tem));
	  gimple_seq_add_stmt_without_update (seq, epilog_stmt);
	}

      /* Combine the two halves with the reduction operation.  */
      new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
    }

  return new_temp;
}
5907
5908 /* Function vect_create_epilog_for_reduction
5909
5910 Create code at the loop-epilog to finalize the result of a reduction
5911 computation.
5912
5913 STMT_INFO is the scalar reduction stmt that is being vectorized.
5914 SLP_NODE is an SLP node containing a group of reduction statements. The
5915 first one in this group is STMT_INFO.
5916 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5917 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5918 (counting from 0)
5919 LOOP_EXIT is the edge to update in the merge block. In the case of a single
5920 exit this edge is always the main loop exit.
5921
5922 This function:
5923 1. Completes the reduction def-use cycles.
5924 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5925 by calling the function specified by REDUC_FN if available, or by
5926 other means (whole-vector shifts or a scalar loop).
5927 The function also creates a new phi node at the loop exit to preserve
5928 loop-closed form, as illustrated below.
5929
5930 The flow at the entry to this function:
5931
5932 loop:
5933 vec_def = phi <vec_init, null> # REDUCTION_PHI
5934 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5935 s_loop = scalar_stmt # (scalar) STMT_INFO
5936 loop_exit:
5937 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5938 use <s_out0>
5939 use <s_out0>
5940
5941 The above is transformed by this function into:
5942
5943 loop:
5944 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5945 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5946 s_loop = scalar_stmt # (scalar) STMT_INFO
5947 loop_exit:
5948 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5949 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5950 v_out2 = reduce <v_out1>
5951 s_out3 = extract_field <v_out2, 0>
5952 s_out4 = adjust_result <s_out3>
5953 use <s_out4>
5954 use <s_out4>
5955 */
5956
5957 static void
5958 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5959 stmt_vec_info stmt_info,
5960 slp_tree slp_node,
5961 slp_instance slp_node_instance,
5962 edge loop_exit)
5963 {
5964 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5965 gcc_assert (reduc_info->is_reduc_info);
5966 /* For double reductions we need to get at the inner loop reduction
5967 stmt which has the meta info attached. Our stmt_info is that of the
5968 loop-closed PHI of the inner loop which we remember as
5969 def for the reduction PHI generation. */
5970 bool double_reduc = false;
5971 stmt_vec_info rdef_info = stmt_info;
5972 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5973 {
5974 gcc_assert (!slp_node);
5975 double_reduc = true;
5976 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5977 (stmt_info->stmt, 0));
5978 stmt_info = vect_stmt_to_vectorize (stmt_info);
5979 }
5980 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5981 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5982 tree vectype;
5983 machine_mode mode;
5984 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5985 basic_block exit_bb;
5986 tree scalar_dest;
5987 tree scalar_type;
5988 gimple *new_phi = NULL, *phi = NULL;
5989 gimple_stmt_iterator exit_gsi;
5990 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5991 gimple *epilog_stmt = NULL;
5992 gimple *exit_phi;
5993 tree bitsize;
5994 tree def;
5995 tree orig_name, scalar_result;
5996 imm_use_iterator imm_iter, phi_imm_iter;
5997 use_operand_p use_p, phi_use_p;
5998 gimple *use_stmt;
5999 auto_vec<tree> reduc_inputs;
6000 int j, i;
6001 vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
6002 unsigned int group_size = 1, k;
6003 /* SLP reduction without reduction chain, e.g.,
6004 # a1 = phi <a2, a0>
6005 # b1 = phi <b2, b0>
6006 a2 = operation (a1)
6007 b2 = operation (b1) */
6008 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
6009 bool direct_slp_reduc;
6010 tree induction_index = NULL_TREE;
6011
6012 if (slp_node)
6013 group_size = SLP_TREE_LANES (slp_node);
6014
6015 if (nested_in_vect_loop_p (loop, stmt_info))
6016 {
6017 outer_loop = loop;
6018 loop = loop->inner;
6019 gcc_assert (!slp_node && double_reduc);
6020 }
6021
6022 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
6023 gcc_assert (vectype);
6024 mode = TYPE_MODE (vectype);
6025
6026 tree induc_val = NULL_TREE;
6027 tree adjustment_def = NULL;
6028 if (slp_node)
6029 ;
6030 else
6031 {
6032 /* Optimize: for induction condition reduction, if we can't use zero
6033 for induc_val, use initial_def. */
6034 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6035 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
6036 else if (double_reduc)
6037 ;
6038 else
6039 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
6040 }
6041
6042 stmt_vec_info single_live_out_stmt[] = { stmt_info };
6043 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
6044 if (slp_reduc)
6045 /* All statements produce live-out values. */
6046 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
6047
6048 unsigned vec_num;
6049 int ncopies;
6050 if (slp_node)
6051 {
6052 vec_num = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
6053 ncopies = 1;
6054 }
6055 else
6056 {
6057 vec_num = 1;
6058 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
6059 }
6060
6061 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
6062 which is updated with the current index of the loop for every match of
6063 the original loop's cond_expr (VEC_STMT). This results in a vector
6064 containing the last time the condition passed for that vector lane.
6065 The first match will be a 1 to allow 0 to be used for non-matching
6066 indexes. If there are no matches at all then the vector will be all
6067 zeroes.
6068
6069 PR92772: This algorithm is broken for architectures that support
6070 masked vectors, but do not provide fold_extract_last. */
6071 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
6072 {
6073 auto_vec<std::pair<tree, bool>, 2> ccompares;
6074 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
6075 cond_info = vect_stmt_to_vectorize (cond_info);
6076 while (cond_info != reduc_info)
6077 {
6078 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
6079 {
6080 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
6081 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
6082 ccompares.safe_push
6083 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
6084 STMT_VINFO_REDUC_IDX (cond_info) == 2));
6085 }
6086 cond_info
6087 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
6088 1 + STMT_VINFO_REDUC_IDX
6089 (cond_info)));
6090 cond_info = vect_stmt_to_vectorize (cond_info);
6091 }
6092 gcc_assert (ccompares.length () != 0);
6093
6094 tree indx_before_incr, indx_after_incr;
6095 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
6096 int scalar_precision
6097 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
6098 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
6099 tree cr_index_vector_type = get_related_vectype_for_scalar_type
6100 (TYPE_MODE (vectype), cr_index_scalar_type,
6101 TYPE_VECTOR_SUBPARTS (vectype));
6102
6103 /* First we create a simple vector induction variable which starts
6104 with the values {1,2,3,...} (SERIES_VECT) and increments by the
6105 vector size (STEP). */
6106
6107 /* Create a {1,2,3,...} vector. */
6108 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
6109
6110 /* Create a vector of the step value. */
6111 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6112 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6113
6114 /* Create an induction variable. */
6115 gimple_stmt_iterator incr_gsi;
6116 bool insert_after;
6117 vect_iv_increment_position (loop_exit, &incr_gsi, &insert_after);
6118 create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
6119 insert_after, &indx_before_incr, &indx_after_incr);
6120
6121 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6122 filled with zeros (VEC_ZERO). */
6123
6124 /* Create a vector of 0s. */
6125 tree zero = build_zero_cst (cr_index_scalar_type);
6126 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6127
6128 /* Create a vector phi node. */
6129 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6130 new_phi = create_phi_node (new_phi_tree, loop->header);
6131 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
6132 loop_preheader_edge (loop), UNKNOWN_LOCATION);
6133
6134 /* Now take the condition from the loops original cond_exprs
6135 and produce a new cond_exprs (INDEX_COND_EXPR) which for
6136 every match uses values from the induction variable
6137 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
6138 (NEW_PHI_TREE).
6139 Finally, we update the phi (NEW_PHI_TREE) to take the value of
6140 the new cond_expr (INDEX_COND_EXPR). */
6141 gimple_seq stmts = NULL;
6142 for (int i = ccompares.length () - 1; i != -1; --i)
6143 {
6144 tree ccompare = ccompares[i].first;
6145 if (ccompares[i].second)
6146 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6147 cr_index_vector_type,
6148 ccompare,
6149 indx_before_incr, new_phi_tree);
6150 else
6151 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6152 cr_index_vector_type,
6153 ccompare,
6154 new_phi_tree, indx_before_incr);
6155 }
6156 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
6157
6158 /* Update the phi with the vec cond. */
6159 induction_index = new_phi_tree;
6160 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
6161 loop_latch_edge (loop), UNKNOWN_LOCATION);
6162 }
6163
6164 /* 2. Create epilog code.
6165 The reduction epilog code operates across the elements of the vector
6166 of partial results computed by the vectorized loop.
6167 The reduction epilog code consists of:
6168
6169 step 1: compute the scalar result in a vector (v_out2)
6170 step 2: extract the scalar result (s_out3) from the vector (v_out2)
6171 step 3: adjust the scalar result (s_out3) if needed.
6172
6173 Step 1 can be accomplished using one the following three schemes:
6174 (scheme 1) using reduc_fn, if available.
6175 (scheme 2) using whole-vector shifts, if available.
6176 (scheme 3) using a scalar loop. In this case steps 1+2 above are
6177 combined.
6178
6179 The overall epilog code looks like this:
6180
6181 s_out0 = phi <s_loop> # original EXIT_PHI
6182 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6183 v_out2 = reduce <v_out1> # step 1
6184 s_out3 = extract_field <v_out2, 0> # step 2
6185 s_out4 = adjust_result <s_out3> # step 3
6186
6187 (step 3 is optional, and steps 1 and 2 may be combined).
6188 Lastly, the uses of s_out0 are replaced by s_out4. */
6189
6190
6191 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
6192 v_out1 = phi <VECT_DEF>
6193 Store them in NEW_PHIS. */
6194 if (double_reduc)
6195 loop = outer_loop;
6196 /* We need to reduce values in all exits. */
6197 exit_bb = loop_exit->dest;
6198 exit_gsi = gsi_after_labels (exit_bb);
6199 reduc_inputs.create (slp_node ? vec_num : ncopies);
6200 for (unsigned i = 0; i < vec_num; i++)
6201 {
6202 gimple_seq stmts = NULL;
6203 if (slp_node)
6204 def = vect_get_slp_vect_def (slp_node, i);
6205 else
6206 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
6207 for (j = 0; j < ncopies; j++)
6208 {
6209 tree new_def = copy_ssa_name (def);
6210 phi = create_phi_node (new_def, exit_bb);
6211 if (j)
6212 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
6213 if (LOOP_VINFO_IV_EXIT (loop_vinfo) == loop_exit)
6214 SET_PHI_ARG_DEF (phi, loop_exit->dest_idx, def);
6215 else
6216 {
6217 for (unsigned k = 0; k < gimple_phi_num_args (phi); k++)
6218 SET_PHI_ARG_DEF (phi, k, def);
6219 }
6220 new_def = gimple_convert (&stmts, vectype, new_def);
6221 reduc_inputs.quick_push (new_def);
6222 }
6223 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6224 }
6225
6226 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
6227 (i.e. when reduc_fn is not available) and in the final adjustment
6228 code (if needed). Also get the original scalar reduction variable as
6229 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
6230 represents a reduction pattern), the tree-code and scalar-def are
6231 taken from the original stmt that the pattern-stmt (STMT) replaces.
6232 Otherwise (it is a regular reduction) - the tree-code and scalar-def
6233 are taken from STMT. */
6234
6235 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6236 if (orig_stmt_info != stmt_info)
6237 {
6238 /* Reduction pattern */
6239 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6240 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
6241 }
6242
6243 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
6244 scalar_type = TREE_TYPE (scalar_dest);
6245 scalar_results.truncate (0);
6246 scalar_results.reserve_exact (group_size);
6247 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
6248 bitsize = TYPE_SIZE (scalar_type);
6249
6250 /* True if we should implement SLP_REDUC using native reduction operations
6251 instead of scalar operations. */
6252 direct_slp_reduc = (reduc_fn != IFN_LAST
6253 && slp_reduc
6254 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
6255
6256 /* In case of reduction chain, e.g.,
6257 # a1 = phi <a3, a0>
6258 a2 = operation (a1)
6259 a3 = operation (a2),
6260
6261 we may end up with more than one vector result. Here we reduce them
6262 to one vector.
6263
6264 The same is true for a SLP reduction, e.g.,
6265 # a1 = phi <a2, a0>
6266 # b1 = phi <b2, b0>
6267 a2 = operation (a1)
6268 b2 = operation (a2),
6269
6270 where we can end up with more than one vector as well. We can
6271 easily accumulate vectors when the number of vector elements is
6272 a multiple of the SLP group size.
6273
6274 The same is true if we couldn't use a single defuse cycle. */
6275 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6276 || direct_slp_reduc
6277 || (slp_reduc
6278 && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
6279 || ncopies > 1)
6280 {
6281 gimple_seq stmts = NULL;
6282 tree single_input = reduc_inputs[0];
6283 for (k = 1; k < reduc_inputs.length (); k++)
6284 single_input = gimple_build (&stmts, code, vectype,
6285 single_input, reduc_inputs[k]);
6286 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6287
6288 reduc_inputs.truncate (0);
6289 reduc_inputs.safe_push (single_input);
6290 }
6291
6292 tree orig_reduc_input = reduc_inputs[0];
6293
6294 /* If this loop is an epilogue loop that can be skipped after the
6295 main loop, we can only share a reduction operation between the
6296 main loop and the epilogue if we put it at the target of the
6297 skip edge.
6298
6299 We can still reuse accumulators if this check fails. Doing so has
6300 the minor(?) benefit of making the epilogue loop's scalar result
6301 independent of the main loop's scalar result. */
6302 bool unify_with_main_loop_p = false;
6303 if (reduc_info->reused_accumulator
6304 && loop_vinfo->skip_this_loop_edge
6305 && single_succ_p (exit_bb)
6306 && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
6307 {
6308 unify_with_main_loop_p = true;
6309
6310 basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
6311 reduc_inputs[0] = make_ssa_name (vectype);
6312 gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
6313 add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
6314 UNKNOWN_LOCATION);
6315 add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
6316 loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
6317 exit_gsi = gsi_after_labels (reduc_block);
6318 }
6319
6320 /* Shouldn't be used beyond this point. */
6321 exit_bb = nullptr;
6322
6323 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6324 && reduc_fn != IFN_LAST)
6325 {
6326 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
6327 various data values where the condition matched and another vector
6328 (INDUCTION_INDEX) containing all the indexes of those matches. We
6329 need to extract the last matching index (which will be the index with
6330 highest value) and use this to index into the data vector.
6331 For the case where there were no matches, the data vector will contain
6332 all default values and the index vector will be all zeros. */
6333
6334 /* Get various versions of the type of the vector of indexes. */
6335 tree index_vec_type = TREE_TYPE (induction_index);
6336 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
6337 tree index_scalar_type = TREE_TYPE (index_vec_type);
6338 tree index_vec_cmp_type = truth_type_for (index_vec_type);
6339
6340 /* Get an unsigned integer version of the type of the data vector. */
6341 int scalar_precision
6342 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6343 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
6344 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
6345 vectype);
6346
6347 /* First we need to create a vector (ZERO_VEC) of zeros and another
6348 vector (MAX_INDEX_VEC) filled with the last matching index, which we
6349 can create using a MAX reduction and then expanding.
6350 In the case where the loop never made any matches, the max index will
6351 be zero. */
6352
6353 /* Vector of {0, 0, 0,...}. */
6354 tree zero_vec = build_zero_cst (vectype);
6355
6356 /* Find maximum value from the vector of found indexes. */
6357 tree max_index = make_ssa_name (index_scalar_type);
6358 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6359 1, induction_index);
6360 gimple_call_set_lhs (max_index_stmt, max_index);
6361 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
6362
6363 /* Vector of {max_index, max_index, max_index,...}. */
6364 tree max_index_vec = make_ssa_name (index_vec_type);
6365 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
6366 max_index);
6367 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
6368 max_index_vec_rhs);
6369 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
6370
6371 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
6372 with the vector (INDUCTION_INDEX) of found indexes, choosing values
6373 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
6374 otherwise. Only one value should match, resulting in a vector
6375 (VEC_COND) with one data value and the rest zeros.
6376 In the case where the loop never made any matches, every index will
6377 match, resulting in a vector with all data values (which will all be
6378 the default value). */
6379
6380 /* Compare the max index vector to the vector of found indexes to find
6381 the position of the max value. */
6382 tree vec_compare = make_ssa_name (index_vec_cmp_type);
6383 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
6384 induction_index,
6385 max_index_vec);
6386 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
6387
6388 /* Use the compare to choose either values from the data vector or
6389 zero. */
6390 tree vec_cond = make_ssa_name (vectype);
6391 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
6392 vec_compare,
6393 reduc_inputs[0],
6394 zero_vec);
6395 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
6396
6397 /* Finally we need to extract the data value from the vector (VEC_COND)
6398 into a scalar (MATCHED_DATA_REDUC). Logically we want to do a OR
6399 reduction, but because this doesn't exist, we can use a MAX reduction
6400 instead. The data value might be signed or a float so we need to cast
6401 it first.
6402 In the case where the loop never made any matches, the data values are
6403 all identical, and so will reduce down correctly. */
6404
6405 /* Make the matched data values unsigned. */
6406 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
6407 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
6408 vec_cond);
6409 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
6410 VIEW_CONVERT_EXPR,
6411 vec_cond_cast_rhs);
6412 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
6413
6414 /* Reduce down to a scalar value. */
6415 tree data_reduc = make_ssa_name (scalar_type_unsigned);
6416 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6417 1, vec_cond_cast);
6418 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
6419 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
6420
6421 /* Convert the reduced value back to the result type and set as the
6422 result. */
6423 gimple_seq stmts = NULL;
6424 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
6425 data_reduc);
6426 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6427 scalar_results.safe_push (new_temp);
6428 }
6429 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6430 && reduc_fn == IFN_LAST)
6431 {
6432 /* Condition reduction without supported IFN_REDUC_MAX. Generate
6433 idx = 0;
6434 idx_val = induction_index[0];
6435 val = data_reduc[0];
6436 for (idx = 0, val = init, i = 0; i < nelts; ++i)
6437 if (induction_index[i] > idx_val)
6438 val = data_reduc[i], idx_val = induction_index[i];
6439 return val; */
6440
6441 tree data_eltype = TREE_TYPE (vectype);
6442 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
6443 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
6444 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
6445 /* Enforced by vectorizable_reduction, which ensures we have target
6446 support before allowing a conditional reduction on variable-length
6447 vectors. */
6448 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
6449 tree idx_val = NULL_TREE, val = NULL_TREE;
6450 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
6451 {
6452 tree old_idx_val = idx_val;
6453 tree old_val = val;
6454 idx_val = make_ssa_name (idx_eltype);
6455 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
6456 build3 (BIT_FIELD_REF, idx_eltype,
6457 induction_index,
6458 bitsize_int (el_size),
6459 bitsize_int (off)));
6460 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6461 val = make_ssa_name (data_eltype);
6462 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
6463 build3 (BIT_FIELD_REF,
6464 data_eltype,
6465 reduc_inputs[0],
6466 bitsize_int (el_size),
6467 bitsize_int (off)));
6468 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6469 if (off != 0)
6470 {
6471 tree new_idx_val = idx_val;
6472 if (off != v_size - el_size)
6473 {
6474 new_idx_val = make_ssa_name (idx_eltype);
6475 epilog_stmt = gimple_build_assign (new_idx_val,
6476 MAX_EXPR, idx_val,
6477 old_idx_val);
6478 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6479 }
6480 tree cond = make_ssa_name (boolean_type_node);
6481 epilog_stmt = gimple_build_assign (cond, GT_EXPR,
6482 idx_val, old_idx_val);
6483 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6484 tree new_val = make_ssa_name (data_eltype);
6485 epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
6486 cond, val, old_val);
6487 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6488 idx_val = new_idx_val;
6489 val = new_val;
6490 }
6491 }
6492 /* Convert the reduced value back to the result type and set as the
6493 result. */
6494 gimple_seq stmts = NULL;
6495 val = gimple_convert (&stmts, scalar_type, val);
6496 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6497 scalar_results.safe_push (val);
6498 }
6499
6500 /* 2.3 Create the reduction code, using one of the three schemes described
6501 above. In SLP we simply need to extract all the elements from the
6502 vector (without reducing them), so we use scalar shifts. */
6503 else if (reduc_fn != IFN_LAST && !slp_reduc)
6504 {
6505 tree tmp;
6506 tree vec_elem_type;
6507
6508 /* Case 1: Create:
6509 v_out2 = reduc_expr <v_out1> */
6510
6511 if (dump_enabled_p ())
6512 dump_printf_loc (MSG_NOTE, vect_location,
6513 "Reduce using direct vector reduction.\n");
6514
6515 gimple_seq stmts = NULL;
6516 vec_elem_type = TREE_TYPE (vectype);
6517 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
6518 vec_elem_type, reduc_inputs[0]);
6519 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6520 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6521
6522 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6523 && induc_val)
6524 {
6525 /* Earlier we set the initial value to be a vector if induc_val
6526 values. Check the result and if it is induc_val then replace
6527 with the original initial value, unless induc_val is
6528 the same as initial_def already. */
6529 tree zcompare = make_ssa_name (boolean_type_node);
6530 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6531 new_temp, induc_val);
6532 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6533 tree initial_def = reduc_info->reduc_initial_values[0];
6534 tmp = make_ssa_name (new_scalar_dest);
6535 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6536 initial_def, new_temp);
6537 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6538 new_temp = tmp;
6539 }
6540
6541 scalar_results.safe_push (new_temp);
6542 }
6543 else if (direct_slp_reduc)
6544 {
6545 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
6546 with the elements for other SLP statements replaced with the
6547 neutral value. We can then do a normal reduction on each vector. */
6548
6549 /* Enforced by vectorizable_reduction. */
6550 gcc_assert (reduc_inputs.length () == 1);
6551 gcc_assert (pow2p_hwi (group_size));
6552
6553 gimple_seq seq = NULL;
6554
6555 /* Build a vector {0, 1, 2, ...}, with the same number of elements
6556 and the same element size as VECTYPE. */
6557 tree index = build_index_vector (vectype, 0, 1);
6558 tree index_type = TREE_TYPE (index);
6559 tree index_elt_type = TREE_TYPE (index_type);
6560 tree mask_type = truth_type_for (index_type);
6561
6562 /* Create a vector that, for each element, identifies which of
6563 the REDUC_GROUP_SIZE results should use it. */
6564 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
6565 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
6566 build_vector_from_val (index_type, index_mask));
6567
6568 /* Get a neutral vector value. This is simply a splat of the neutral
6569 scalar value if we have one, otherwise the initial scalar value
6570 is itself a neutral value. */
6571 tree vector_identity = NULL_TREE;
6572 tree neutral_op = NULL_TREE;
6573 if (slp_node)
6574 {
6575 tree initial_value = NULL_TREE;
6576 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6577 initial_value = reduc_info->reduc_initial_values[0];
6578 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6579 initial_value, false);
6580 }
6581 if (neutral_op)
6582 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6583 neutral_op);
6584 for (unsigned int i = 0; i < group_size; ++i)
6585 {
	     /* If there's no universal neutral value, we can use the
6587 initial scalar value from the original PHI. This is used
6588 for MIN and MAX reduction, for example. */
6589 if (!neutral_op)
6590 {
6591 tree scalar_value = reduc_info->reduc_initial_values[i];
6592 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6593 scalar_value);
6594 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6595 scalar_value);
6596 }
6597
6598 /* Calculate the equivalent of:
6599
6600 sel[j] = (index[j] == i);
6601
6602 which selects the elements of REDUC_INPUTS[0] that should
6603 be included in the result. */
6604 tree compare_val = build_int_cst (index_elt_type, i);
6605 compare_val = build_vector_from_val (index_type, compare_val);
6606 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6607 index, compare_val);
6608
6609 /* Calculate the equivalent of:
6610
		 vec = sel ? reduc_inputs[0] : vector_identity;
6612
6613 VEC is now suitable for a full vector reduction. */
6614 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6615 sel, reduc_inputs[0], vector_identity);
6616
6617 /* Do the reduction and convert it to the appropriate type. */
6618 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6619 TREE_TYPE (vectype), vec);
6620 scalar = gimple_convert (&seq, scalar_type, scalar);
6621 scalar_results.safe_push (scalar);
6622 }
6623 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
6624 }
6625 else
6626 {
6627 bool reduce_with_shift;
6628 tree vec_temp;
6629
6630 gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6631
6632 /* See if the target wants to do the final (shift) reduction
6633 in a vector mode of smaller size and first reduce upper/lower
6634 halves against each other. */
6635 enum machine_mode mode1 = mode;
6636 tree stype = TREE_TYPE (vectype);
6637 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6638 unsigned nunits1 = nunits;
6639 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6640 && reduc_inputs.length () == 1)
6641 {
6642 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6643 /* For SLP reductions we have to make sure lanes match up, but
6644 since we're doing individual element final reduction reducing
6645 vector width here is even more important.
6646 ??? We can also separate lanes with permutes, for the common
6647 case of power-of-two group-size odd/even extracts would work. */
6648 if (slp_reduc && nunits != nunits1)
6649 {
6650 nunits1 = least_common_multiple (nunits1, group_size);
6651 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6652 }
6653 }
6654 if (!slp_reduc
6655 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6656 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6657
6658 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6659 stype, nunits1);
6660 reduce_with_shift = have_whole_vector_shift (mode1);
6661 if (!VECTOR_MODE_P (mode1)
6662 || !directly_supported_p (code, vectype1))
6663 reduce_with_shift = false;
6664
6665 /* First reduce the vector to the desired vector size we should
6666 do shift reduction on by combining upper and lower halves. */
6667 gimple_seq stmts = NULL;
6668 new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6669 code, &stmts);
6670 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6671 reduc_inputs[0] = new_temp;
6672
6673 if (reduce_with_shift && !slp_reduc)
6674 {
6675 int element_bitsize = tree_to_uhwi (bitsize);
6676 /* Enforced by vectorizable_reduction, which disallows SLP reductions
6677 for variable-length vectors and also requires direct target support
6678 for loop reductions. */
6679 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6680 int nelements = vec_size_in_bits / element_bitsize;
6681 vec_perm_builder sel;
6682 vec_perm_indices indices;
6683
6684 int elt_offset;
6685
6686 tree zero_vec = build_zero_cst (vectype1);
6687 /* Case 2: Create:
6688 for (offset = nelements/2; offset >= 1; offset/=2)
6689 {
6690 Create: va' = vec_shift <va, offset>
6691 Create: va = vop <va, va'>
6692 } */
6693
6694 tree rhs;
6695
6696 if (dump_enabled_p ())
6697 dump_printf_loc (MSG_NOTE, vect_location,
6698 "Reduce using vector shifts\n");
6699
6700 gimple_seq stmts = NULL;
6701 new_temp = gimple_convert (&stmts, vectype1, new_temp);
6702 for (elt_offset = nelements / 2;
6703 elt_offset >= 1;
6704 elt_offset /= 2)
6705 {
6706 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6707 indices.new_vector (sel, 2, nelements);
6708 tree mask = vect_gen_perm_mask_any (vectype1, indices);
6709 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6710 new_temp, zero_vec, mask);
6711 new_temp = gimple_build (&stmts, code,
6712 vectype1, new_name, new_temp);
6713 }
6714 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6715
6716 /* 2.4 Extract the final scalar result. Create:
6717 s_out3 = extract_field <v_out2, bitpos> */
6718
6719 if (dump_enabled_p ())
6720 dump_printf_loc (MSG_NOTE, vect_location,
6721 "extract scalar result\n");
6722
6723 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6724 bitsize, bitsize_zero_node);
6725 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6726 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6727 gimple_assign_set_lhs (epilog_stmt, new_temp);
6728 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6729 scalar_results.safe_push (new_temp);
6730 }
6731 else
6732 {
6733 /* Case 3: Create:
6734 s = extract_field <v_out2, 0>
6735 for (offset = element_size;
6736 offset < vector_size;
6737 offset += element_size;)
6738 {
6739 Create: s' = extract_field <v_out2, offset>
6740 Create: s = op <s, s'> // For non SLP cases
6741 } */
6742
6743 if (dump_enabled_p ())
6744 dump_printf_loc (MSG_NOTE, vect_location,
6745 "Reduce using scalar code.\n");
6746
6747 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6748 int element_bitsize = tree_to_uhwi (bitsize);
6749 tree compute_type = TREE_TYPE (vectype);
6750 gimple_seq stmts = NULL;
6751 FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6752 {
6753 int bit_offset;
6754 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6755 vec_temp, bitsize, bitsize_zero_node);
6756
6757 /* In SLP we don't need to apply reduction operation, so we just
6758 collect s' values in SCALAR_RESULTS. */
6759 if (slp_reduc)
6760 scalar_results.safe_push (new_temp);
6761
6762 for (bit_offset = element_bitsize;
6763 bit_offset < vec_size_in_bits;
6764 bit_offset += element_bitsize)
6765 {
6766 tree bitpos = bitsize_int (bit_offset);
6767 new_name = gimple_build (&stmts, BIT_FIELD_REF,
6768 compute_type, vec_temp,
6769 bitsize, bitpos);
6770 if (slp_reduc)
6771 {
6772 /* In SLP we don't need to apply reduction operation, so
6773 we just collect s' values in SCALAR_RESULTS. */
6774 new_temp = new_name;
6775 scalar_results.safe_push (new_name);
6776 }
6777 else
6778 new_temp = gimple_build (&stmts, code, compute_type,
6779 new_name, new_temp);
6780 }
6781 }
6782
6783 /* The only case where we need to reduce scalar results in SLP, is
6784 unrolling. If the size of SCALAR_RESULTS is greater than
6785 REDUC_GROUP_SIZE, we reduce them combining elements modulo
6786 REDUC_GROUP_SIZE. */
6787 if (slp_reduc)
6788 {
6789 tree res, first_res, new_res;
6790
6791 /* Reduce multiple scalar results in case of SLP unrolling. */
6792 for (j = group_size; scalar_results.iterate (j, &res);
6793 j++)
6794 {
6795 first_res = scalar_results[j % group_size];
6796 new_res = gimple_build (&stmts, code, compute_type,
6797 first_res, res);
6798 scalar_results[j % group_size] = new_res;
6799 }
6800 scalar_results.truncate (group_size);
6801 for (k = 0; k < group_size; k++)
6802 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6803 scalar_results[k]);
6804 }
6805 else
6806 {
6807 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
6808 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6809 scalar_results.safe_push (new_temp);
6810 }
6811
6812 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6813 }
6814
6815 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6816 && induc_val)
6817 {
6818 /* Earlier we set the initial value to be a vector if induc_val
6819 values. Check the result and if it is induc_val then replace
6820 with the original initial value, unless induc_val is
6821 the same as initial_def already. */
6822 tree zcompare = make_ssa_name (boolean_type_node);
6823 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp,
6824 induc_val);
6825 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6826 tree initial_def = reduc_info->reduc_initial_values[0];
6827 tree tmp = make_ssa_name (new_scalar_dest);
6828 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6829 initial_def, new_temp);
6830 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6831 scalar_results[0] = tmp;
6832 }
6833 }
6834
6835 /* 2.5 Adjust the final result by the initial value of the reduction
6836 variable. (When such adjustment is not needed, then
6837 'adjustment_def' is zero). For example, if code is PLUS we create:
6838 new_temp = loop_exit_def + adjustment_def */
6839
6840 if (adjustment_def)
6841 {
6842 gcc_assert (!slp_reduc);
6843 gimple_seq stmts = NULL;
6844 if (double_reduc)
6845 {
6846 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6847 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6848 new_temp = gimple_build (&stmts, code, vectype,
6849 reduc_inputs[0], adjustment_def);
6850 }
6851 else
6852 {
6853 new_temp = scalar_results[0];
6854 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6855 adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6856 adjustment_def);
6857 new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6858 new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6859 new_temp, adjustment_def);
6860 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6861 }
6862
6863 epilog_stmt = gimple_seq_last_stmt (stmts);
6864 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6865 scalar_results[0] = new_temp;
6866 }
6867
6868 /* Record this operation if it could be reused by the epilogue loop. */
6869 if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6870 && reduc_inputs.length () == 1)
6871 loop_vinfo->reusable_accumulators.put (scalar_results[0],
6872 { orig_reduc_input, reduc_info });
6873
6874 if (double_reduc)
6875 loop = outer_loop;
6876
6877 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6878 phis with new adjusted scalar results, i.e., replace use <s_out0>
6879 with use <s_out4>.
6880
6881 Transform:
6882 loop_exit:
6883 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6884 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6885 v_out2 = reduce <v_out1>
6886 s_out3 = extract_field <v_out2, 0>
6887 s_out4 = adjust_result <s_out3>
6888 use <s_out0>
6889 use <s_out0>
6890
6891 into:
6892
6893 loop_exit:
6894 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6895 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6896 v_out2 = reduce <v_out1>
6897 s_out3 = extract_field <v_out2, 0>
6898 s_out4 = adjust_result <s_out3>
6899 use <s_out4>
6900 use <s_out4> */
6901
6902 gcc_assert (live_out_stmts.size () == scalar_results.length ());
6903 auto_vec<gimple *> phis;
6904 for (k = 0; k < live_out_stmts.size (); k++)
6905 {
6906 stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6907 scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6908
6909 /* Find the loop-closed-use at the loop exit of the original scalar
6910 result. (The reduction result is expected to have two immediate uses,
6911 one at the latch block, and one at the loop exit). For double
6912 reductions we are looking for exit phis of the outer loop. */
6913 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6914 {
6915 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6916 {
6917 if (!is_gimple_debug (USE_STMT (use_p))
6918 && gimple_bb (USE_STMT (use_p)) == loop_exit->dest)
6919 phis.safe_push (USE_STMT (use_p));
6920 }
6921 else
6922 {
6923 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6924 {
6925 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6926
6927 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6928 {
6929 if (!flow_bb_inside_loop_p (loop,
6930 gimple_bb (USE_STMT (phi_use_p)))
6931 && !is_gimple_debug (USE_STMT (phi_use_p)))
6932 phis.safe_push (USE_STMT (phi_use_p));
6933 }
6934 }
6935 }
6936 }
6937
6938 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6939 {
6940 /* Replace the uses: */
6941 orig_name = PHI_RESULT (exit_phi);
6942
6943 /* Look for a single use at the target of the skip edge. */
6944 if (unify_with_main_loop_p)
6945 {
6946 use_operand_p use_p;
6947 gimple *user;
6948 if (!single_imm_use (orig_name, &use_p, &user))
6949 gcc_unreachable ();
6950 orig_name = gimple_get_lhs (user);
6951 }
6952
6953 scalar_result = scalar_results[k];
6954 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6955 {
6956 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6957 SET_USE (use_p, scalar_result);
6958 update_stmt (use_stmt);
6959 }
6960 }
6961
6962 phis.truncate (0);
6963 }
6964 }
6965
6966 /* Return a vector of type VECTYPE that is equal to the vector select
6967 operation "MASK ? VEC : IDENTITY". Insert the select statements
6968 before GSI. */
6969
6970 static tree
6971 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6972 tree vec, tree identity)
6973 {
6974 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6975 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6976 mask, vec, identity);
6977 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6978 return cond;
6979 }
6980
/* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
   order, starting with LHS.  Insert the extraction statements before GSI and
   associate the new scalar SSA names with variable SCALAR_DEST.
   If MASK is nonzero mask the input and then operate on it unconditionally.
   Return the SSA name for the result.  */

static tree
vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
		       tree_code code, tree lhs, tree vector_rhs,
		       tree mask)
{
  tree vectype = TREE_TYPE (vector_rhs);
  tree scalar_type = TREE_TYPE (vectype);
  tree bitsize = TYPE_SIZE (scalar_type);
  unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
  unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);

  /* Re-create a VEC_COND_EXPR to mask the input here in order to be able
     to perform an unconditional element-wise reduction of it.  Inactive
     lanes are replaced by the operation's neutral value so that including
     them in the fold does not change the result.  */
  if (mask)
    {
      tree masked_vector_rhs = make_temp_ssa_name (vectype, NULL,
						   "masked_vector_rhs");
      tree neutral_op = neutral_op_for_reduction (scalar_type, code, NULL_TREE,
						  false);
      tree vector_identity = build_vector_from_val (vectype, neutral_op);
      gassign *select = gimple_build_assign (masked_vector_rhs, VEC_COND_EXPR,
					     mask, vector_rhs, vector_identity);
      gsi_insert_before (gsi, select, GSI_SAME_STMT);
      vector_rhs = masked_vector_rhs;
    }

  /* Walk the vector one element at a time, extracting each element with a
     BIT_FIELD_REF and folding it into the running scalar result LHS.  */
  for (unsigned HOST_WIDE_INT bit_offset = 0;
       bit_offset < vec_size_in_bits;
       bit_offset += element_bitsize)
    {
      tree bitpos = bitsize_int (bit_offset);
      tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
			 bitsize, bitpos);

      /* rhs = vector_rhs[bit_offset / element_bitsize];  */
      gassign *stmt = gimple_build_assign (scalar_dest, rhs);
      rhs = make_ssa_name (scalar_dest, stmt);
      gimple_assign_set_lhs (stmt, rhs);
      gsi_insert_before (gsi, stmt, GSI_SAME_STMT);

      /* lhs = lhs CODE rhs — strictly left-to-right, which preserves the
	 scalar evaluation order (important for FP reductions).  */
      stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
      tree new_name = make_ssa_name (scalar_dest, stmt);
      gimple_assign_set_lhs (stmt, new_name);
      gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
      lhs = new_name;
    }
  return lhs;
}
7034
7035 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
7036 type of the vector input. */
7037
7038 static internal_fn
7039 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
7040 {
7041 internal_fn mask_reduc_fn;
7042 internal_fn mask_len_reduc_fn;
7043
7044 switch (reduc_fn)
7045 {
7046 case IFN_FOLD_LEFT_PLUS:
7047 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
7048 mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
7049 break;
7050
7051 default:
7052 return IFN_LAST;
7053 }
7054
7055 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
7056 OPTIMIZE_FOR_SPEED))
7057 return mask_reduc_fn;
7058 if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
7059 OPTIMIZE_FOR_SPEED))
7060 return mask_len_reduc_fn;
7061 return IFN_LAST;
7062 }
7063
7064 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
7065 statement that sets the live-out value. REDUC_DEF_STMT is the phi
7066 statement. CODE is the operation performed by STMT_INFO and OPS are
7067 its scalar operands. REDUC_INDEX is the index of the operand in
7068 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
7069 implements in-order reduction, or IFN_LAST if we should open-code it.
7070 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
7071 that should be used to control the operation in a fully-masked loop. */
7072
7073 static bool
7074 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
7075 stmt_vec_info stmt_info,
7076 gimple_stmt_iterator *gsi,
7077 gimple **vec_stmt, slp_tree slp_node,
7078 gimple *reduc_def_stmt,
7079 code_helper code, internal_fn reduc_fn,
7080 tree *ops, int num_ops, tree vectype_in,
7081 int reduc_index, vec_loop_masks *masks,
7082 vec_loop_lens *lens)
7083 {
7084 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7085 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7086 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
7087
7088 int ncopies;
7089 if (slp_node)
7090 ncopies = 1;
7091 else
7092 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7093
7094 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
7095 gcc_assert (ncopies == 1);
7096
7097 bool is_cond_op = false;
7098 if (!code.is_tree_code ())
7099 {
7100 code = conditional_internal_fn_code (internal_fn (code));
7101 gcc_assert (code != ERROR_MARK);
7102 is_cond_op = true;
7103 }
7104
7105 gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
7106
7107 if (slp_node)
7108 {
7109 if (is_cond_op)
7110 {
7111 if (dump_enabled_p ())
7112 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7113 "fold-left reduction on SLP not supported.\n");
7114 return false;
7115 }
7116
7117 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
7118 TYPE_VECTOR_SUBPARTS (vectype_in)));
7119 }
7120
7121 /* The operands either come from a binary operation or an IFN_COND operation.
7122 The former is a gimple assign with binary rhs and the latter is a
7123 gimple call with four arguments. */
7124 gcc_assert (num_ops == 2 || num_ops == 4);
7125 tree op0, opmask;
7126 if (!is_cond_op)
7127 op0 = ops[1 - reduc_index];
7128 else
7129 {
7130 op0 = ops[2 + (1 - reduc_index)];
7131 opmask = ops[0];
7132 gcc_assert (!slp_node);
7133 }
7134
7135 int group_size = 1;
7136 stmt_vec_info scalar_dest_def_info;
7137 auto_vec<tree> vec_oprnds0, vec_opmask;
7138 if (slp_node)
7139 {
7140 auto_vec<vec<tree> > vec_defs (2);
7141 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
7142 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
7143 vec_defs[0].release ();
7144 vec_defs[1].release ();
7145 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7146 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
7147 }
7148 else
7149 {
7150 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7151 op0, &vec_oprnds0);
7152 scalar_dest_def_info = stmt_info;
7153
7154 /* For an IFN_COND_OP we also need the vector mask operand. */
7155 if (is_cond_op)
7156 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7157 opmask, &vec_opmask);
7158 }
7159
7160 gimple *sdef = vect_orig_stmt (scalar_dest_def_info)->stmt;
7161 tree scalar_dest = gimple_get_lhs (sdef);
7162 tree scalar_type = TREE_TYPE (scalar_dest);
7163 tree reduc_var = gimple_phi_result (reduc_def_stmt);
7164
7165 int vec_num = vec_oprnds0.length ();
7166 gcc_assert (vec_num == 1 || slp_node);
7167 tree vec_elem_type = TREE_TYPE (vectype_out);
7168 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
7169
7170 tree vector_identity = NULL_TREE;
7171 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7172 {
7173 vector_identity = build_zero_cst (vectype_out);
7174 if (!HONOR_SIGNED_ZEROS (vectype_out))
7175 ;
7176 else
7177 {
7178 gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
7179 vector_identity = const_unop (NEGATE_EXPR, vectype_out,
7180 vector_identity);
7181 }
7182 }
7183
7184 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
7185 int i;
7186 tree def0;
7187 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7188 {
7189 gimple *new_stmt;
7190 tree mask = NULL_TREE;
7191 tree len = NULL_TREE;
7192 tree bias = NULL_TREE;
7193 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7194 mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
7195 else if (is_cond_op)
7196 mask = vec_opmask[0];
7197 if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
7198 {
7199 len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
7200 i, 1);
7201 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
7202 bias = build_int_cst (intQI_type_node, biasval);
7203 if (!is_cond_op)
7204 mask = build_minus_one_cst (truth_type_for (vectype_in));
7205 }
7206
7207 /* Handle MINUS by adding the negative. */
7208 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
7209 {
7210 tree negated = make_ssa_name (vectype_out);
7211 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
7212 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
7213 def0 = negated;
7214 }
7215
7216 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
7217 && mask && mask_reduc_fn == IFN_LAST)
7218 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
7219 vector_identity);
7220
7221 /* On the first iteration the input is simply the scalar phi
7222 result, and for subsequent iterations it is the output of
7223 the preceding operation. */
7224 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
7225 {
7226 if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
7227 new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
7228 def0, mask, len, bias);
7229 else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
7230 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
7231 def0, mask);
7232 else
7233 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
7234 def0);
7235 /* For chained SLP reductions the output of the previous reduction
7236 operation serves as the input of the next. For the final statement
7237 the output cannot be a temporary - we reuse the original
7238 scalar destination of the last statement. */
7239 if (i != vec_num - 1)
7240 {
7241 gimple_set_lhs (new_stmt, scalar_dest_var);
7242 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
7243 gimple_set_lhs (new_stmt, reduc_var);
7244 }
7245 }
7246 else
7247 {
7248 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
7249 tree_code (code), reduc_var, def0,
7250 mask);
7251 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
7252 /* Remove the statement, so that we can use the same code paths
7253 as for statements that we've just created. */
7254 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
7255 gsi_remove (&tmp_gsi, true);
7256 }
7257
7258 if (i == vec_num - 1)
7259 {
7260 gimple_set_lhs (new_stmt, scalar_dest);
7261 vect_finish_replace_stmt (loop_vinfo,
7262 scalar_dest_def_info,
7263 new_stmt);
7264 }
7265 else
7266 vect_finish_stmt_generation (loop_vinfo,
7267 scalar_dest_def_info,
7268 new_stmt, gsi);
7269
7270 if (slp_node)
7271 slp_node->push_vec_def (new_stmt);
7272 else
7273 {
7274 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7275 *vec_stmt = new_stmt;
7276 }
7277 }
7278
7279 return true;
7280 }
7281
7282 /* Function is_nonwrapping_integer_induction.
7283
   Check if STMT_VINFO (which is part of loop LOOP) both increments and
   does not cause overflow.  */
7286
7287 static bool
7288 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
7289 {
7290 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
7291 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
7292 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
7293 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
7294 widest_int ni, max_loop_value, lhs_max;
7295 wi::overflow_type overflow = wi::OVF_NONE;
7296
7297 /* Make sure the loop is integer based. */
7298 if (TREE_CODE (base) != INTEGER_CST
7299 || TREE_CODE (step) != INTEGER_CST)
7300 return false;
7301
7302 /* Check that the max size of the loop will not wrap. */
7303
7304 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
7305 return true;
7306
7307 if (! max_stmt_executions (loop, &ni))
7308 return false;
7309
7310 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
7311 &overflow);
7312 if (overflow)
7313 return false;
7314
7315 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
7316 TYPE_SIGN (lhs_type), &overflow);
7317 if (overflow)
7318 return false;
7319
7320 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
7321 <= TYPE_PRECISION (lhs_type));
7322 }
7323
7324 /* Check if masking can be supported by inserting a conditional expression.
7325 CODE is the code for the operation. COND_FN is the conditional internal
7326 function, if it exists. VECTYPE_IN is the type of the vector input. */
7327 static bool
7328 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
7329 tree vectype_in)
7330 {
7331 if (cond_fn != IFN_LAST
7332 && direct_internal_fn_supported_p (cond_fn, vectype_in,
7333 OPTIMIZE_FOR_SPEED))
7334 return false;
7335
7336 if (code.is_tree_code ())
7337 switch (tree_code (code))
7338 {
7339 case DOT_PROD_EXPR:
7340 case SAD_EXPR:
7341 return true;
7342
7343 default:
7344 break;
7345 }
7346 return false;
7347 }
7348
7349 /* Insert a conditional expression to enable masked vectorization. CODE is the
7350 code for the operation. VOP is the array of operands. MASK is the loop
7351 mask. GSI is a statement iterator used to place the new conditional
7352 expression. */
7353 static void
7354 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
7355 gimple_stmt_iterator *gsi)
7356 {
7357 switch (tree_code (code))
7358 {
7359 case DOT_PROD_EXPR:
7360 {
7361 tree vectype = TREE_TYPE (vop[1]);
7362 tree zero = build_zero_cst (vectype);
7363 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7364 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7365 mask, vop[1], zero);
7366 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7367 vop[1] = masked_op1;
7368 break;
7369 }
7370
7371 case SAD_EXPR:
7372 {
7373 tree vectype = TREE_TYPE (vop[1]);
7374 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7375 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7376 mask, vop[1], vop[0]);
7377 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7378 vop[1] = masked_op1;
7379 break;
7380 }
7381
7382 default:
7383 gcc_unreachable ();
7384 }
7385 }
7386
7387 /* Function vectorizable_reduction.
7388
7389 Check if STMT_INFO performs a reduction operation that can be vectorized.
7390 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
7391 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
7392 Return true if STMT_INFO is vectorizable in this way.
7393
7394 This function also handles reduction idioms (patterns) that have been
7395 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
7396 may be of this form:
7397 X = pattern_expr (arg0, arg1, ..., X)
7398 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
7399 sequence that had been detected and replaced by the pattern-stmt
7400 (STMT_INFO).
7401
7402 This function also handles reduction of condition expressions, for example:
7403 for (int i = 0; i < N; i++)
7404 if (a[i] < value)
7405 last = a[i];
7406 This is handled by vectorising the loop and creating an additional vector
7407 containing the loop indexes for which "a[i] < value" was true. In the
7408 function epilogue this is reduced to a single max value and then used to
7409 index into the vector of results.
7410
7411 In some cases of reduction patterns, the type of the reduction variable X is
7412 different than the type of the other arguments of STMT_INFO.
7413 In such cases, the vectype that is used when transforming STMT_INFO into
7414 a vector stmt is different than the vectype that is used to determine the
7415 vectorization factor, because it consists of a different number of elements
7416 than the actual number of elements that are being operated upon in parallel.
7417
7418 For example, consider an accumulation of shorts into an int accumulator.
7419 On some targets it's possible to vectorize this pattern operating on 8
7420 shorts at a time (hence, the vectype for purposes of determining the
7421 vectorization factor should be V8HI); on the other hand, the vectype that
7422 is used to create the vector form is actually V4SI (the type of the result).
7423
7424 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
7425 indicates what is the actual level of parallelism (V8HI in the example), so
7426 that the right vectorization factor would be derived. This vectype
7427 corresponds to the type of arguments to the reduction stmt, and should *NOT*
7428 be used to create the vectorized stmt. The right vectype for the vectorized
7429 stmt is obtained from the type of the result X:
7430 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7431
7432 This means that, contrary to "regular" reductions (or "regular" stmts in
7433 general), the following equation:
7434 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7435 does *NOT* necessarily hold for reduction patterns. */
7436
7437 bool
7438 vectorizable_reduction (loop_vec_info loop_vinfo,
7439 stmt_vec_info stmt_info, slp_tree slp_node,
7440 slp_instance slp_node_instance,
7441 stmt_vector_for_cost *cost_vec)
7442 {
7443 tree vectype_in = NULL_TREE;
7444 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7445 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
7446 stmt_vec_info cond_stmt_vinfo = NULL;
7447 int i;
7448 int ncopies;
7449 bool single_defuse_cycle = false;
7450 bool nested_cycle = false;
7451 bool double_reduc = false;
7452 int vec_num;
7453 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
7454 tree cond_reduc_val = NULL_TREE;
7455
7456 /* Make sure it was already recognized as a reduction computation. */
7457 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
7458 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
7459 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
7460 return false;
7461
7462 /* The stmt we store reduction analysis meta on. */
7463 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7464 reduc_info->is_reduc_info = true;
7465
7466 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7467 {
7468 if (is_a <gphi *> (stmt_info->stmt))
7469 {
7470 if (slp_node)
7471 {
7472 /* We eventually need to set a vector type on invariant
7473 arguments. */
7474 unsigned j;
7475 slp_tree child;
7476 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7477 if (!vect_maybe_update_slp_op_vectype
7478 (child, SLP_TREE_VECTYPE (slp_node)))
7479 {
7480 if (dump_enabled_p ())
7481 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7482 "incompatible vector types for "
7483 "invariants\n");
7484 return false;
7485 }
7486 }
7487 /* Analysis for double-reduction is done on the outer
7488 loop PHI, nested cycles have no further restrictions. */
7489 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
7490 }
7491 else
7492 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7493 return true;
7494 }
7495
7496 stmt_vec_info orig_stmt_of_analysis = stmt_info;
7497 stmt_vec_info phi_info = stmt_info;
7498 if (!is_a <gphi *> (stmt_info->stmt))
7499 {
7500 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7501 return true;
7502 }
7503 if (slp_node)
7504 {
7505 slp_node_instance->reduc_phis = slp_node;
7506 /* ??? We're leaving slp_node to point to the PHIs, we only
7507 need it to get at the number of vector stmts which wasn't
7508 yet initialized for the instance root. */
7509 }
7510 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7511 {
7512 use_operand_p use_p;
7513 gimple *use_stmt;
7514 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7515 &use_p, &use_stmt);
7516 gcc_assert (res);
7517 phi_info = loop_vinfo->lookup_stmt (use_stmt);
7518 }
7519
7520 /* PHIs should not participate in patterns. */
7521 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7522 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7523
7524 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
7525 and compute the reduction chain length. Discover the real
7526 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
7527 tree reduc_def
7528 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
7529 loop_latch_edge
7530 (gimple_bb (reduc_def_phi)->loop_father));
7531 unsigned reduc_chain_length = 0;
7532 bool only_slp_reduc_chain = true;
7533 stmt_info = NULL;
7534 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
7535 while (reduc_def != PHI_RESULT (reduc_def_phi))
7536 {
7537 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
7538 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
7539 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
7540 {
7541 if (dump_enabled_p ())
7542 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7543 "reduction chain broken by patterns.\n");
7544 return false;
7545 }
7546 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
7547 only_slp_reduc_chain = false;
7548 /* For epilogue generation live members of the chain need
7549 to point back to the PHI via their original stmt for
7550 info_for_reduction to work. For SLP we need to look at
7551 all lanes here - even though we only will vectorize from
7552 the SLP node with live lane zero the other live lanes also
7553 need to be identified as part of a reduction to be able
7554 to skip code generation for them. */
7555 if (slp_for_stmt_info)
7556 {
7557 for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
7558 if (STMT_VINFO_LIVE_P (s))
7559 STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
7560 }
7561 else if (STMT_VINFO_LIVE_P (vdef))
7562 STMT_VINFO_REDUC_DEF (def) = phi_info;
7563 gimple_match_op op;
7564 if (!gimple_extract_op (vdef->stmt, &op))
7565 {
7566 if (dump_enabled_p ())
7567 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7568 "reduction chain includes unsupported"
7569 " statement type.\n");
7570 return false;
7571 }
7572 if (CONVERT_EXPR_CODE_P (op.code))
7573 {
7574 if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7575 {
7576 if (dump_enabled_p ())
7577 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7578 "conversion in the reduction chain.\n");
7579 return false;
7580 }
7581 }
7582 else if (!stmt_info)
7583 /* First non-conversion stmt. */
7584 stmt_info = vdef;
7585 reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
7586 reduc_chain_length++;
7587 if (!stmt_info && slp_node)
7588 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
7589 }
7590 /* PHIs should not participate in patterns. */
7591 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7592
7593 if (nested_in_vect_loop_p (loop, stmt_info))
7594 {
7595 loop = loop->inner;
7596 nested_cycle = true;
7597 }
7598
7599 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
7600 element. */
7601 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7602 {
7603 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
7604 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7605 }
7606 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7607 gcc_assert (slp_node
7608 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
7609
7610 /* 1. Is vectorizable reduction? */
7611 /* Not supportable if the reduction variable is used in the loop, unless
7612 it's a reduction chain. */
7613 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7614 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7615 return false;
7616
7617 /* Reductions that are not used even in an enclosing outer-loop,
7618 are expected to be "live" (used out of the loop). */
7619 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7620 && !STMT_VINFO_LIVE_P (stmt_info))
7621 return false;
7622
7623 /* 2. Has this been recognized as a reduction pattern?
7624
7625 Check if STMT represents a pattern that has been recognized
7626 in earlier analysis stages. For stmts that represent a pattern,
7627 the STMT_VINFO_RELATED_STMT field records the last stmt in
7628 the original sequence that constitutes the pattern. */
7629
7630 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7631 if (orig_stmt_info)
7632 {
7633 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7634 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7635 }
7636
7637 /* 3. Check the operands of the operation. The first operands are defined
7638 inside the loop body. The last operand is the reduction variable,
7639 which is defined by the loop-header-phi. */
7640
7641 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7642 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
7643 gimple_match_op op;
7644 if (!gimple_extract_op (stmt_info->stmt, &op))
7645 gcc_unreachable ();
7646 bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
7647 || op.code == WIDEN_SUM_EXPR
7648 || op.code == SAD_EXPR);
7649
7650 if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7651 && !SCALAR_FLOAT_TYPE_P (op.type))
7652 return false;
7653
7654 /* Do not try to vectorize bit-precision reductions. */
7655 if (!type_has_mode_precision_p (op.type))
7656 return false;
7657
7658 /* For lane-reducing ops we're reducing the number of reduction PHIs
7659 which means the only use of that may be in the lane-reducing operation. */
7660 if (lane_reduc_code_p
7661 && reduc_chain_length != 1
7662 && !only_slp_reduc_chain)
7663 {
7664 if (dump_enabled_p ())
7665 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7666 "lane-reducing reduction with extra stmts.\n");
7667 return false;
7668 }
7669
7670 /* All uses but the last are expected to be defined in the loop.
7671 The last use is the reduction variable. In case of nested cycle this
7672 assumption is not true: we use reduc_index to record the index of the
7673 reduction variable. */
7674 slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7675 tree *vectype_op = XALLOCAVEC (tree, op.num_ops);
7676 /* We need to skip an extra operand for COND_EXPRs with embedded
7677 comparison. */
7678 unsigned opno_adjust = 0;
7679 if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7680 opno_adjust = 1;
7681 for (i = 0; i < (int) op.num_ops; i++)
7682 {
7683 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7684 if (i == 0 && op.code == COND_EXPR)
7685 continue;
7686
7687 stmt_vec_info def_stmt_info;
7688 enum vect_def_type dt;
7689 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7690 i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7691 &vectype_op[i], &def_stmt_info))
7692 {
7693 if (dump_enabled_p ())
7694 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7695 "use not simple.\n");
7696 return false;
7697 }
7698 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7699 continue;
7700
7701 /* For an IFN_COND_OP we might hit the reduction definition operand
7702 twice (once as definition, once as else). */
7703 if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
7704 continue;
7705
7706 /* There should be only one cycle def in the stmt, the one
7707 leading to reduc_def. */
7708 if (VECTORIZABLE_CYCLE_DEF (dt))
7709 return false;
7710
7711 if (!vectype_op[i])
7712 vectype_op[i]
7713 = get_vectype_for_scalar_type (loop_vinfo,
7714 TREE_TYPE (op.ops[i]), slp_op[i]);
7715
7716 /* To properly compute ncopies we are interested in the widest
7717 non-reduction input type in case we're looking at a widening
7718 accumulation that we later handle in vect_transform_reduction. */
7719 if (lane_reduc_code_p
7720 && vectype_op[i]
7721 && (!vectype_in
7722 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7723 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
7724 vectype_in = vectype_op[i];
7725
7726 /* Record how the non-reduction-def value of COND_EXPR is defined.
7727 ??? For a chain of multiple CONDs we'd have to match them up all. */
7728 if (op.code == COND_EXPR && reduc_chain_length == 1)
7729 {
7730 if (dt == vect_constant_def)
7731 {
7732 cond_reduc_dt = dt;
7733 cond_reduc_val = op.ops[i];
7734 }
7735 else if (dt == vect_induction_def
7736 && def_stmt_info
7737 && is_nonwrapping_integer_induction (def_stmt_info, loop))
7738 {
7739 cond_reduc_dt = dt;
7740 cond_stmt_vinfo = def_stmt_info;
7741 }
7742 }
7743 }
7744 if (!vectype_in)
7745 vectype_in = STMT_VINFO_VECTYPE (phi_info);
7746 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7747
7748 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
7749 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
7750 /* If we have a condition reduction, see if we can simplify it further. */
7751 if (v_reduc_type == COND_REDUCTION)
7752 {
7753 if (slp_node)
7754 return false;
7755
7756 /* When the condition uses the reduction value in the condition, fail. */
7757 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7758 {
7759 if (dump_enabled_p ())
7760 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7761 "condition depends on previous iteration\n");
7762 return false;
7763 }
7764
7765 if (reduc_chain_length == 1
7766 && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
7767 OPTIMIZE_FOR_SPEED)
7768 || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
7769 vectype_in,
7770 OPTIMIZE_FOR_SPEED)))
7771 {
7772 if (dump_enabled_p ())
7773 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7774 "optimizing condition reduction with"
7775 " FOLD_EXTRACT_LAST.\n");
7776 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7777 }
7778 else if (cond_reduc_dt == vect_induction_def)
7779 {
7780 tree base
7781 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7782 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7783
7784 gcc_assert (TREE_CODE (base) == INTEGER_CST
7785 && TREE_CODE (step) == INTEGER_CST);
7786 cond_reduc_val = NULL_TREE;
7787 enum tree_code cond_reduc_op_code = ERROR_MARK;
7788 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7789 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7790 ;
7791 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
7792 above base; punt if base is the minimum value of the type for
7793 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
7794 else if (tree_int_cst_sgn (step) == -1)
7795 {
7796 cond_reduc_op_code = MIN_EXPR;
7797 if (tree_int_cst_sgn (base) == -1)
7798 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7799 else if (tree_int_cst_lt (base,
7800 TYPE_MAX_VALUE (TREE_TYPE (base))))
7801 cond_reduc_val
7802 = int_const_binop (PLUS_EXPR, base, integer_one_node);
7803 }
7804 else
7805 {
7806 cond_reduc_op_code = MAX_EXPR;
7807 if (tree_int_cst_sgn (base) == 1)
7808 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7809 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7810 base))
7811 cond_reduc_val
7812 = int_const_binop (MINUS_EXPR, base, integer_one_node);
7813 }
7814 if (cond_reduc_val)
7815 {
7816 if (dump_enabled_p ())
7817 dump_printf_loc (MSG_NOTE, vect_location,
7818 "condition expression based on "
7819 "integer induction.\n");
7820 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7821 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7822 = cond_reduc_val;
7823 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7824 }
7825 }
7826 else if (cond_reduc_dt == vect_constant_def)
7827 {
7828 enum vect_def_type cond_initial_dt;
7829 tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7830 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7831 if (cond_initial_dt == vect_constant_def
7832 && types_compatible_p (TREE_TYPE (cond_initial_val),
7833 TREE_TYPE (cond_reduc_val)))
7834 {
7835 tree e = fold_binary (LE_EXPR, boolean_type_node,
7836 cond_initial_val, cond_reduc_val);
7837 if (e && (integer_onep (e) || integer_zerop (e)))
7838 {
7839 if (dump_enabled_p ())
7840 dump_printf_loc (MSG_NOTE, vect_location,
7841 "condition expression based on "
7842 "compile time constant.\n");
7843 /* Record reduction code at analysis stage. */
7844 STMT_VINFO_REDUC_CODE (reduc_info)
7845 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7846 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7847 }
7848 }
7849 }
7850 }
7851
7852 if (STMT_VINFO_LIVE_P (phi_info))
7853 return false;
7854
7855 if (slp_node)
7856 ncopies = 1;
7857 else
7858 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7859
7860 gcc_assert (ncopies >= 1);
7861
7862 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7863
7864 if (nested_cycle)
7865 {
7866 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7867 == vect_double_reduction_def);
7868 double_reduc = true;
7869 }
7870
7871 /* 4.2. Check support for the epilog operation.
7872
7873 If STMT represents a reduction pattern, then the type of the
7874 reduction variable may be different than the type of the rest
7875 of the arguments. For example, consider the case of accumulation
7876 of shorts into an int accumulator; The original code:
7877 S1: int_a = (int) short_a;
7878 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7879
7880 was replaced with:
7881 STMT: int_acc = widen_sum <short_a, int_acc>
7882
7883 This means that:
7884 1. The tree-code that is used to create the vector operation in the
7885 epilog code (that reduces the partial results) is not the
7886 tree-code of STMT, but is rather the tree-code of the original
7887 stmt from the pattern that STMT is replacing. I.e, in the example
7888 above we want to use 'widen_sum' in the loop, but 'plus' in the
7889 epilog.
7890 2. The type (mode) we use to check available target support
7891 for the vector operation to be created in the *epilog*, is
7892 determined by the type of the reduction variable (in the example
7893 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
7894 However the type (mode) we use to check available target support
7895 for the vector operation to be created *inside the loop*, is
7896 determined by the type of the other arguments to STMT (in the
7897 example we'd check this: optab_handler (widen_sum_optab,
7898 vect_short_mode)).
7899
7900 This is contrary to "regular" reductions, in which the types of all
7901 the arguments are the same as the type of the reduction variable.
7902 For "regular" reductions we can therefore use the same vector type
7903 (and also the same tree-code) when generating the epilog code and
7904 when generating the code inside the loop. */
7905
7906 code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7907
7908 /* If conversion might have created a conditional operation like
7909 IFN_COND_ADD already. Use the internal code for the following checks. */
7910 if (orig_code.is_internal_fn ())
7911 {
7912 tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
7913 orig_code = new_code != ERROR_MARK ? new_code : orig_code;
7914 }
7915
7916 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7917
7918 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7919 if (reduction_type == TREE_CODE_REDUCTION)
7920 {
7921 /* Check whether it's ok to change the order of the computation.
7922 Generally, when vectorizing a reduction we change the order of the
7923 computation. This may change the behavior of the program in some
7924 cases, so we need to check that this is ok. One exception is when
7925 vectorizing an outer-loop: the inner-loop is executed sequentially,
7926 and therefore vectorizing reductions in the inner-loop during
7927 outer-loop vectorization is safe. Likewise when we are vectorizing
7928 a series of reductions using SLP and the VF is one the reductions
7929 are performed in scalar order. */
7930 if (slp_node
7931 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7932 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7933 ;
7934 else if (needs_fold_left_reduction_p (op.type, orig_code))
7935 {
7936 /* When vectorizing a reduction chain w/o SLP the reduction PHI
7937 is not directy used in stmt. */
7938 if (!only_slp_reduc_chain
7939 && reduc_chain_length != 1)
7940 {
7941 if (dump_enabled_p ())
7942 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7943 "in-order reduction chain without SLP.\n");
7944 return false;
7945 }
7946 STMT_VINFO_REDUC_TYPE (reduc_info)
7947 = reduction_type = FOLD_LEFT_REDUCTION;
7948 }
7949 else if (!commutative_binary_op_p (orig_code, op.type)
7950 || !associative_binary_op_p (orig_code, op.type))
7951 {
7952 if (dump_enabled_p ())
7953 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7954 "reduction: not commutative/associative\n");
7955 return false;
7956 }
7957 }
7958
7959 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7960 && ncopies > 1)
7961 {
7962 if (dump_enabled_p ())
7963 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7964 "multiple types in double reduction or condition "
7965 "reduction or fold-left reduction.\n");
7966 return false;
7967 }
7968
7969 internal_fn reduc_fn = IFN_LAST;
7970 if (reduction_type == TREE_CODE_REDUCTION
7971 || reduction_type == FOLD_LEFT_REDUCTION
7972 || reduction_type == INTEGER_INDUC_COND_REDUCTION
7973 || reduction_type == CONST_COND_REDUCTION)
7974 {
7975 if (reduction_type == FOLD_LEFT_REDUCTION
7976 ? fold_left_reduction_fn (orig_code, &reduc_fn)
7977 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7978 {
7979 if (reduc_fn != IFN_LAST
7980 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7981 OPTIMIZE_FOR_SPEED))
7982 {
7983 if (dump_enabled_p ())
7984 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7985 "reduc op not supported by target.\n");
7986
7987 reduc_fn = IFN_LAST;
7988 }
7989 }
7990 else
7991 {
7992 if (!nested_cycle || double_reduc)
7993 {
7994 if (dump_enabled_p ())
7995 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7996 "no reduc code for scalar code.\n");
7997
7998 return false;
7999 }
8000 }
8001 }
8002 else if (reduction_type == COND_REDUCTION)
8003 {
8004 int scalar_precision
8005 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
8006 cr_index_scalar_type = make_unsigned_type (scalar_precision);
8007 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
8008 vectype_out);
8009
8010 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
8011 OPTIMIZE_FOR_SPEED))
8012 reduc_fn = IFN_REDUC_MAX;
8013 }
8014 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
8015
8016 if (reduction_type != EXTRACT_LAST_REDUCTION
8017 && (!nested_cycle || double_reduc)
8018 && reduc_fn == IFN_LAST
8019 && !nunits_out.is_constant ())
8020 {
8021 if (dump_enabled_p ())
8022 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8023 "missing target support for reduction on"
8024 " variable-length vectors.\n");
8025 return false;
8026 }
8027
8028 /* For SLP reductions, see if there is a neutral value we can use. */
8029 tree neutral_op = NULL_TREE;
8030 if (slp_node)
8031 {
8032 tree initial_value = NULL_TREE;
8033 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
8034 initial_value = vect_phi_initial_value (reduc_def_phi);
8035 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8036 orig_code, initial_value);
8037 }
8038
8039 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
8040 {
8041 /* We can't support in-order reductions of code such as this:
8042
8043 for (int i = 0; i < n1; ++i)
8044 for (int j = 0; j < n2; ++j)
8045 l += a[j];
8046
8047 since GCC effectively transforms the loop when vectorizing:
8048
8049 for (int i = 0; i < n1 / VF; ++i)
8050 for (int j = 0; j < n2; ++j)
8051 for (int k = 0; k < VF; ++k)
8052 l += a[j];
8053
8054 which is a reassociation of the original operation. */
8055 if (dump_enabled_p ())
8056 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8057 "in-order double reduction not supported.\n");
8058
8059 return false;
8060 }
8061
8062 if (reduction_type == FOLD_LEFT_REDUCTION
8063 && slp_node
8064 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8065 {
8066 /* We cannot use in-order reductions in this case because there is
8067 an implicit reassociation of the operations involved. */
8068 if (dump_enabled_p ())
8069 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8070 "in-order unchained SLP reductions not supported.\n");
8071 return false;
8072 }
8073
8074 /* For double reductions, and for SLP reductions with a neutral value,
8075 we construct a variable-length initial vector by loading a vector
8076 full of the neutral value and then shift-and-inserting the start
8077 values into the low-numbered elements. */
8078 if ((double_reduc || neutral_op)
8079 && !nunits_out.is_constant ()
8080 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
8081 vectype_out, OPTIMIZE_FOR_SPEED))
8082 {
8083 if (dump_enabled_p ())
8084 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8085 "reduction on variable-length vectors requires"
8086 " target support for a vector-shift-and-insert"
8087 " operation.\n");
8088 return false;
8089 }
8090
8091 /* Check extra constraints for variable-length unchained SLP reductions. */
8092 if (slp_node
8093 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
8094 && !nunits_out.is_constant ())
8095 {
8096 /* We checked above that we could build the initial vector when
8097 there's a neutral element value. Check here for the case in
8098 which each SLP statement has its own initial value and in which
8099 that value needs to be repeated for every instance of the
8100 statement within the initial vector. */
8101 unsigned int group_size = SLP_TREE_LANES (slp_node);
8102 if (!neutral_op
8103 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
8104 TREE_TYPE (vectype_out)))
8105 {
8106 if (dump_enabled_p ())
8107 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8108 "unsupported form of SLP reduction for"
8109 " variable-length vectors: cannot build"
8110 " initial vector.\n");
8111 return false;
8112 }
8113 /* The epilogue code relies on the number of elements being a multiple
8114 of the group size. The duplicate-and-interleave approach to setting
8115 up the initial vector does too. */
8116 if (!multiple_p (nunits_out, group_size))
8117 {
8118 if (dump_enabled_p ())
8119 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8120 "unsupported form of SLP reduction for"
8121 " variable-length vectors: the vector size"
8122 " is not a multiple of the number of results.\n");
8123 return false;
8124 }
8125 }
8126
8127 if (reduction_type == COND_REDUCTION)
8128 {
8129 widest_int ni;
8130
8131 if (! max_loop_iterations (loop, &ni))
8132 {
8133 if (dump_enabled_p ())
8134 dump_printf_loc (MSG_NOTE, vect_location,
8135 "loop count not known, cannot create cond "
8136 "reduction.\n");
8137 return false;
8138 }
8139 /* Convert backedges to iterations. */
8140 ni += 1;
8141
8142 /* The additional index will be the same type as the condition. Check
8143 that the loop can fit into this less one (because we'll use up the
8144 zero slot for when there are no matches). */
8145 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
8146 if (wi::geu_p (ni, wi::to_widest (max_index)))
8147 {
8148 if (dump_enabled_p ())
8149 dump_printf_loc (MSG_NOTE, vect_location,
8150 "loop size is greater than data size.\n");
8151 return false;
8152 }
8153 }
8154
8155 /* In case the vectorization factor (VF) is bigger than the number
8156 of elements that we can fit in a vectype (nunits), we have to generate
8157 more than one vector stmt - i.e - we need to "unroll" the
8158 vector stmt by a factor VF/nunits. For more details see documentation
8159 in vectorizable_operation. */
8160
8161 /* If the reduction is used in an outer loop we need to generate
8162 VF intermediate results, like so (e.g. for ncopies=2):
8163 r0 = phi (init, r0)
8164 r1 = phi (init, r1)
8165 r0 = x0 + r0;
8166 r1 = x1 + r1;
8167 (i.e. we generate VF results in 2 registers).
8168 In this case we have a separate def-use cycle for each copy, and therefore
8169 for each copy we get the vector def for the reduction variable from the
8170 respective phi node created for this copy.
8171
8172 Otherwise (the reduction is unused in the loop nest), we can combine
8173 together intermediate results, like so (e.g. for ncopies=2):
8174 r = phi (init, r)
8175 r = x0 + r;
8176 r = x1 + r;
8177 (i.e. we generate VF/2 results in a single register).
8178 In this case for each copy we get the vector def for the reduction variable
8179 from the vectorized reduction operation generated in the previous iteration.
8180
8181 This only works when we see both the reduction PHI and its only consumer
8182 in vectorizable_reduction and there are no intermediate stmts
8183 participating. When unrolling we want each unrolled iteration to have its
8184 own reduction accumulator since one of the main goals of unrolling a
8185 reduction is to reduce the aggregate loop-carried latency. */
8186 if (ncopies > 1
8187 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
8188 && reduc_chain_length == 1
8189 && loop_vinfo->suggested_unroll_factor == 1)
8190 single_defuse_cycle = true;
8191
8192 if (single_defuse_cycle || lane_reduc_code_p)
8193 {
8194 gcc_assert (op.code != COND_EXPR);
8195
8196 /* 4. Supportable by target? */
8197 bool ok = true;
8198
8199 /* 4.1. check support for the operation in the loop
8200
8201 This isn't necessary for the lane reduction codes, since they
8202 can only be produced by pattern matching, and it's up to the
8203 pattern matcher to test for support. The main reason for
8204 specifically skipping this step is to avoid rechecking whether
8205 mixed-sign dot-products can be implemented using signed
8206 dot-products. */
8207 machine_mode vec_mode = TYPE_MODE (vectype_in);
8208 if (!lane_reduc_code_p
8209 && !directly_supported_p (op.code, vectype_in, optab_vector))
8210 {
8211 if (dump_enabled_p ())
8212 dump_printf (MSG_NOTE, "op not supported by target.\n");
8213 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
8214 || !vect_can_vectorize_without_simd_p (op.code))
8215 ok = false;
8216 else
8217 if (dump_enabled_p ())
8218 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
8219 }
8220
8221 if (vect_emulated_vector_p (vectype_in)
8222 && !vect_can_vectorize_without_simd_p (op.code))
8223 {
8224 if (dump_enabled_p ())
8225 dump_printf (MSG_NOTE, "using word mode not possible.\n");
8226 return false;
8227 }
8228
8229 /* lane-reducing operations have to go through vect_transform_reduction.
8230 For the other cases try without the single cycle optimization. */
8231 if (!ok)
8232 {
8233 if (lane_reduc_code_p)
8234 return false;
8235 else
8236 single_defuse_cycle = false;
8237 }
8238 }
8239 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
8240
8241 /* If the reduction stmt is one of the patterns that have lane
8242 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
8243 if ((ncopies > 1 && ! single_defuse_cycle)
8244 && lane_reduc_code_p)
8245 {
8246 if (dump_enabled_p ())
8247 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8248 "multi def-use cycle not possible for lane-reducing "
8249 "reduction operation\n");
8250 return false;
8251 }
8252
8253 if (slp_node
8254 && !(!single_defuse_cycle
8255 && !lane_reduc_code_p
8256 && reduction_type != FOLD_LEFT_REDUCTION))
8257 for (i = 0; i < (int) op.num_ops; i++)
8258 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
8259 {
8260 if (dump_enabled_p ())
8261 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8262 "incompatible vector types for invariants\n");
8263 return false;
8264 }
8265
8266 if (slp_node)
8267 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8268 else
8269 vec_num = 1;
8270
8271 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
8272 reduction_type, ncopies, cost_vec);
8273 /* Cost the reduction op inside the loop if transformed via
8274 vect_transform_reduction. Otherwise this is costed by the
8275 separate vectorizable_* routines. */
8276 if (single_defuse_cycle || lane_reduc_code_p)
8277 {
8278 int factor = 1;
8279 if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
8280 /* Three dot-products and a subtraction. */
8281 factor = 4;
8282 record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
8283 stmt_info, 0, vect_body);
8284 }
8285
8286 if (dump_enabled_p ()
8287 && reduction_type == FOLD_LEFT_REDUCTION)
8288 dump_printf_loc (MSG_NOTE, vect_location,
8289 "using an in-order (fold-left) reduction.\n");
8290 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
8291 /* All but single defuse-cycle optimized, lane-reducing and fold-left
8292 reductions go through their own vectorizable_* routines. */
8293 if (!single_defuse_cycle
8294 && !lane_reduc_code_p
8295 && reduction_type != FOLD_LEFT_REDUCTION)
8296 {
8297 stmt_vec_info tem
8298 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
8299 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
8300 {
8301 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
8302 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
8303 }
8304 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
8305 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
8306 }
8307 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8308 {
8309 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8310 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8311 internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
8312
8313 if (reduction_type != FOLD_LEFT_REDUCTION
8314 && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
8315 && (cond_fn == IFN_LAST
8316 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
8317 OPTIMIZE_FOR_SPEED)))
8318 {
8319 if (dump_enabled_p ())
8320 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8321 "can't operate on partial vectors because"
8322 " no conditional operation is available.\n");
8323 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8324 }
8325 else if (reduction_type == FOLD_LEFT_REDUCTION
8326 && reduc_fn == IFN_LAST
8327 && !expand_vec_cond_expr_p (vectype_in,
8328 truth_type_for (vectype_in),
8329 SSA_NAME))
8330 {
8331 if (dump_enabled_p ())
8332 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8333 "can't operate on partial vectors because"
8334 " no conditional operation is available.\n");
8335 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8336 }
8337 else if (reduction_type == FOLD_LEFT_REDUCTION
8338 && internal_fn_mask_index (reduc_fn) == -1
8339 && FLOAT_TYPE_P (vectype_in)
8340 && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
8341 {
8342 if (dump_enabled_p ())
8343 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8344 "can't operate on partial vectors because"
8345 " signed zeros cannot be preserved.\n");
8346 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8347 }
8348 else
8349 {
8350 internal_fn mask_reduc_fn
8351 = get_masked_reduction_fn (reduc_fn, vectype_in);
8352
8353 if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
8354 vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num,
8355 vectype_in, 1);
8356 else
8357 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
8358 vectype_in, NULL);
8359 }
8360 }
8361 return true;
8362 }
8363
8364 /* STMT_INFO is a dot-product reduction whose multiplication operands
8365 have different signs. Emit a sequence to emulate the operation
8366 using a series of signed DOT_PROD_EXPRs and return the last
8367 statement generated. VEC_DEST is the result of the vector operation
8368 and VOP lists its inputs. */
8369
8370 static gassign *
8371 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8372 gimple_stmt_iterator *gsi, tree vec_dest,
8373 tree vop[3])
8374 {
8375 tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
8376 tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
8377 tree narrow_elttype = TREE_TYPE (narrow_vectype);
8378 gimple *new_stmt;
8379
8380 /* Make VOP[0] the unsigned operand VOP[1] the signed operand. */
8381 if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
8382 std::swap (vop[0], vop[1]);
8383
8384 /* Convert all inputs to signed types. */
8385 for (int i = 0; i < 3; ++i)
8386 if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
8387 {
8388 tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
8389 new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
8390 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8391 vop[i] = tmp;
8392 }
8393
8394 /* In the comments below we assume 8-bit inputs for simplicity,
8395 but the approach works for any full integer type. */
8396
8397 /* Create a vector of -128. */
8398 tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
8399 tree min_narrow = build_vector_from_val (narrow_vectype,
8400 min_narrow_elttype);
8401
8402 /* Create a vector of 64. */
8403 auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
8404 tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
8405 half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
8406
8407 /* Emit: SUB_RES = VOP[0] - 128. */
8408 tree sub_res = make_ssa_name (narrow_vectype);
8409 new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
8410 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8411
8412 /* Emit:
8413
8414 STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
8415 STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
8416 STAGE3 = DOT_PROD_EXPR <SUB_RES, -128, STAGE2>;
8417
8418 on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
8419 Doing the two 64 * y steps first allows more time to compute x. */
8420 tree stage1 = make_ssa_name (wide_vectype);
8421 new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
8422 vop[1], half_narrow, vop[2]);
8423 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8424
8425 tree stage2 = make_ssa_name (wide_vectype);
8426 new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
8427 vop[1], half_narrow, stage1);
8428 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8429
8430 tree stage3 = make_ssa_name (wide_vectype);
8431 new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
8432 sub_res, vop[1], stage2);
8433 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8434
8435 /* Convert STAGE3 to the reduction type. */
8436 return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
8437 }
8438
/* Transform the definition stmt STMT_INFO of a reduction PHI backedge
   value, emitting the vectorized reduction statements into the loop body.
   Fold-left (in-order) reductions are dispatched to
   vectorize_fold_left_reduction; all other kinds are handled here.  */

bool
vect_transform_reduction (loop_vec_info loop_vinfo,
			  stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
			  gimple **vec_stmt, slp_tree slp_node)
{
  tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  int i;
  int ncopies;
  int vec_num;

  stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
  gcc_assert (reduc_info->is_reduc_info);

  /* A reduction in the inner loop of an outer loop must be part of a
     double reduction.  */
  if (nested_in_vect_loop_p (loop, stmt_info))
    {
      loop = loop->inner;
      gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
    }

  gimple_match_op op;
  if (!gimple_extract_op (stmt_info->stmt, &op))
    gcc_unreachable ();

  /* All uses but the last are expected to be defined in the loop.
     The last use is the reduction variable.  In case of nested cycle this
     assumption is not true: we use reduc_index to record the index of the
     reduction variable.  */
  stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
  gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
  int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
  tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);

  /* With SLP the copies are implicit in the number of SLP vector stmts;
     otherwise derive the copy count from the input vector type.  */
  if (slp_node)
    {
      ncopies = 1;
      vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
    }
  else
    {
      ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
      vec_num = 1;
    }

  code_helper code = canonicalize_code (op.code, op.type);
  internal_fn cond_fn = get_conditional_internal_fn (code, op.type);

  vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
  vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
  bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);

  /* Transform.  */
  tree new_temp = NULL_TREE;
  auto_vec<tree> vec_oprnds0;
  auto_vec<tree> vec_oprnds1;
  auto_vec<tree> vec_oprnds2;
  tree def0;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");

  /* FORNOW: Multiple types are not supported for condition.  */
  if (code == COND_EXPR)
    gcc_assert (ncopies == 1);

  /* A binary COND_OP reduction must have the same definition and else
     value.  */
  bool cond_fn_p = code.is_internal_fn ()
    && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
  if (cond_fn_p)
    {
      gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
		  || code == IFN_COND_MUL || code == IFN_COND_AND
		  || code == IFN_COND_IOR || code == IFN_COND_XOR
		  || code == IFN_COND_MIN || code == IFN_COND_MAX);
      gcc_assert (op.num_ops == 4
		  && (op.ops[reduc_index]
		      == op.ops[internal_fn_else_index ((internal_fn) code)]));
    }

  bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);

  /* In-order reductions are handled by a dedicated routine; everything
     below assumes the operation can be reassociated.  */
  vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
  if (reduction_type == FOLD_LEFT_REDUCTION)
    {
      internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
      gcc_assert (code.is_tree_code () || cond_fn_p);
      return vectorize_fold_left_reduction
	  (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
	   code, reduc_fn, op.ops, op.num_ops, vectype_in,
	   reduc_index, masks, lens);
    }

  /* Only lane-reducing codes reach here without being forced into a
     single def-use cycle (see the assertion below).  */
  bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
  gcc_assert (single_defuse_cycle
	      || code == DOT_PROD_EXPR
	      || code == WIDEN_SUM_EXPR
	      || code == SAD_EXPR);

  /* Create the destination vector  */
  tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
  tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);

  /* Get NCOPIES vector definitions for all operands except the reduction
     definition.  */
  if (!cond_fn_p)
    {
      vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
			 single_defuse_cycle && reduc_index == 0
			 ? NULL_TREE : op.ops[0], &vec_oprnds0,
			 single_defuse_cycle && reduc_index == 1
			 ? NULL_TREE : op.ops[1], &vec_oprnds1,
			 op.num_ops == 3
			 && !(single_defuse_cycle && reduc_index == 2)
			 ? op.ops[2] : NULL_TREE, &vec_oprnds2);
    }
  else
    {
      /* For a conditional operation pass the truth type as mask
	 vectype.  */
      gcc_assert (single_defuse_cycle
		  && (reduc_index == 1 || reduc_index == 2));
      vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
			 op.ops[0], truth_type_for (vectype_in), &vec_oprnds0,
			 reduc_index == 1 ? NULL_TREE : op.ops[1],
			 NULL_TREE, &vec_oprnds1,
			 reduc_index == 2 ? NULL_TREE : op.ops[2],
			 NULL_TREE, &vec_oprnds2);
    }

  /* For single def-use cycles get one copy of the vectorized reduction
     definition.  Later copies are fed from the lhs of the previous
     iteration's statement (see the safe_push calls below).  */
  if (single_defuse_cycle)
    {
      gcc_assert (!slp_node);
      vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
				     op.ops[reduc_index],
				     reduc_index == 0 ? &vec_oprnds0
				     : (reduc_index == 1 ? &vec_oprnds1
					: &vec_oprnds2));
    }

  /* Emit one vector statement per copy.  */
  bool emulated_mixed_dot_prod
    = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
  FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
    {
      gimple *new_stmt;
      tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
      if (masked_loop_p && !mask_by_cond_expr)
	{
	  /* No conditional ifns have been defined for dot-product yet.  */
	  gcc_assert (code != DOT_PROD_EXPR);

	  /* Make sure that the reduction accumulator is vop[0].  */
	  if (reduc_index == 1)
	    {
	      gcc_assert (commutative_binary_op_p (code, op.type));
	      std::swap (vop[0], vop[1]);
	    }
	  /* Use the accumulator as the else value so inactive lanes pass
	     it through unchanged.  */
	  tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
					  vec_num * ncopies, vectype_in, i);
	  gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
						    vop[0], vop[1], vop[0]);
	  new_temp = make_ssa_name (vec_dest, call);
	  gimple_call_set_lhs (call, new_temp);
	  gimple_call_set_nothrow (call, true);
	  vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
	  new_stmt = call;
	}
      else
	{
	  if (op.num_ops >= 3)
	    vop[2] = vec_oprnds2[i];

	  /* Apply the loop mask by rewriting the non-accumulator operand
	     through a VEC_COND_EXPR.  */
	  if (masked_loop_p && mask_by_cond_expr)
	    {
	      tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
					      vec_num * ncopies, vectype_in, i);
	      build_vect_cond_expr (code, vop, mask, gsi);
	    }

	  if (emulated_mixed_dot_prod)
	    new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
						    vec_dest, vop);

	  else if (code.is_internal_fn () && !cond_fn_p)
	    new_stmt = gimple_build_call_internal (internal_fn (code),
						   op.num_ops,
						   vop[0], vop[1], vop[2]);
	  else if (code.is_internal_fn () && cond_fn_p)
	    /* A conditional reduction repeats the accumulator (vop[1]) as
	       the else operand, matching the assertion above.  */
	    new_stmt = gimple_build_call_internal (internal_fn (code),
						   op.num_ops,
						   vop[0], vop[1], vop[2],
						   vop[1]);
	  else
	    new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
					    vop[0], vop[1], vop[2]);
	  new_temp = make_ssa_name (vec_dest, new_stmt);
	  gimple_set_lhs (new_stmt, new_temp);
	  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
	}

      if (slp_node)
	slp_node->push_vec_def (new_stmt);
      else if (single_defuse_cycle
	       && i < ncopies - 1)
	{
	  /* Chain the accumulator into the next copy's operand slot.  */
	  if (reduc_index == 0)
	    vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
	  else if (reduc_index == 1)
	    vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
	  else if (reduc_index == 2)
	    vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
	}
      else
	STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
    }

  /* For non-SLP report the first generated vector statement.  */
  if (!slp_node)
    *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];

  return true;
}
8665
/* Transform phase of a cycle PHI: create the vectorized PHI nodes for a
   reduction or nested cycle and set their loop-entry arguments.  The
   loop-latch arguments are filled in later, during epilogue processing.  */

bool
vect_transform_cycle_phi (loop_vec_info loop_vinfo,
			  stmt_vec_info stmt_info, gimple **vec_stmt,
			  slp_tree slp_node, slp_instance slp_node_instance)
{
  tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  int i;
  int ncopies;
  int j;
  bool nested_cycle = false;
  int vec_num;

  if (nested_in_vect_loop_p (loop, stmt_info))
    {
      loop = loop->inner;
      nested_cycle = true;
    }

  stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
  reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
  stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
  gcc_assert (reduc_info->is_reduc_info);

  if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
      || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
    /* Leave the scalar phi in place.  */
    return true;

  tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
  /* For a nested cycle we do not fill the above.  */
  if (!vectype_in)
    vectype_in = STMT_VINFO_VECTYPE (stmt_info);
  gcc_assert (vectype_in);

  if (slp_node)
    {
      /* The size vect_schedule_slp_instance computes is off for us.  */
      vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
				      * SLP_TREE_LANES (slp_node), vectype_in);
      ncopies = 1;
    }
  else
    {
      vec_num = 1;
      ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
    }

  /* Check whether we should use a single PHI node and accumulate
     vectors to one before the backedge.  */
  if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
    ncopies = 1;

  /* Create the destination vector  */
  gphi *phi = as_a <gphi *> (stmt_info->stmt);
  tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
					       vectype_out);

  /* Get the loop-entry arguments.  */
  tree vec_initial_def = NULL_TREE;
  auto_vec<tree> vec_initial_defs;
  if (slp_node)
    {
      vec_initial_defs.reserve (vec_num);
      if (nested_cycle)
	{
	  /* For a nested cycle the initial values are the vectorized defs
	     of the PHI's preheader argument.  */
	  unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
	  vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
			     &vec_initial_defs);
	}
      else
	{
	  gcc_assert (slp_node == slp_node_instance->reduc_phis);
	  vec<tree> &initial_values = reduc_info->reduc_initial_values;
	  vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);

	  /* Collect the scalar initial value of each PHI; a reduction
	     chain contributes only one.  */
	  unsigned int num_phis = stmts.length ();
	  if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
	    num_phis = 1;
	  initial_values.reserve (num_phis);
	  for (unsigned int i = 0; i < num_phis; ++i)
	    {
	      gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
	      initial_values.quick_push (vect_phi_initial_value (this_phi));
	    }
	  if (vec_num == 1)
	    vect_find_reusable_accumulator (loop_vinfo, reduc_info);
	  if (!initial_values.is_empty ())
	    {
	      tree initial_value
		= (num_phis == 1 ? initial_values[0] : NULL_TREE);
	      code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
	      tree neutral_op
		= neutral_op_for_reduction (TREE_TYPE (vectype_out),
					    code, initial_value);
	      get_initial_defs_for_reduction (loop_vinfo, reduc_info,
					      &vec_initial_defs, vec_num,
					      stmts.length (), neutral_op);
	    }
	}
    }
  else
    {
      /* Get at the scalar def before the loop, that defines the initial
	 value of the reduction variable.  */
      tree initial_def = vect_phi_initial_value (phi);
      reduc_info->reduc_initial_values.safe_push (initial_def);
      /* Optimize: if initial_def is for REDUC_MAX smaller than the base
	 and we can't use zero for induc_val, use initial_def.  Similarly
	 for REDUC_MIN and initial_def larger than the base.  */
      if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
	{
	  tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
	  if (TREE_CODE (initial_def) == INTEGER_CST
	      && !integer_zerop (induc_val)
	      && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
		   && tree_int_cst_lt (initial_def, induc_val))
		  || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
		      && tree_int_cst_lt (induc_val, initial_def))))
	    {
	      induc_val = initial_def;
	      /* Communicate we used the initial_def to epilogue
		 generation.  */
	      STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
	    }
	  vec_initial_def = build_vector_from_val (vectype_out, induc_val);
	}
      else if (nested_cycle)
	{
	  /* Do not use an adjustment def as that case is not supported
	     correctly if ncopies is not one.  */
	  vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
					 ncopies, initial_def,
					 &vec_initial_defs);
	}
      else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
	       || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
	/* Fill the initial vector with the initial scalar value.  */
	vec_initial_def
	  = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
					   initial_def, initial_def);
      else
	{
	  if (ncopies == 1)
	    vect_find_reusable_accumulator (loop_vinfo, reduc_info);
	  if (!reduc_info->reduc_initial_values.is_empty ())
	    {
	      initial_def = reduc_info->reduc_initial_values[0];
	      code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
	      tree neutral_op
		= neutral_op_for_reduction (TREE_TYPE (initial_def),
					    code, initial_def);
	      gcc_assert (neutral_op);
	      /* Try to simplify the vector initialization by applying an
		 adjustment after the reduction has been performed.  */
	      if (!reduc_info->reused_accumulator
		  && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
		  && !operand_equal_p (neutral_op, initial_def))
		{
		  STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
		    = initial_def;
		  initial_def = neutral_op;
		}
	      vec_initial_def
		= get_initial_def_for_reduction (loop_vinfo, reduc_info,
						 initial_def, neutral_op);
	    }
	}
    }

  /* A single initial vector serves every copy.  */
  if (vec_initial_def)
    {
      vec_initial_defs.create (ncopies);
      for (i = 0; i < ncopies; ++i)
	vec_initial_defs.quick_push (vec_initial_def);
    }

  /* When reusing an accumulator from the main loop, massage it into a
     form compatible with this loop's vector type.  */
  if (auto *accumulator = reduc_info->reused_accumulator)
    {
      tree def = accumulator->reduc_input;
      if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
	{
	  unsigned int nreduc;
	  bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
					    (TREE_TYPE (def)),
					  TYPE_VECTOR_SUBPARTS (vectype_out),
					  &nreduc);
	  gcc_assert (res);
	  gimple_seq stmts = NULL;
	  /* Reduce the single vector to a smaller one.  */
	  if (nreduc != 1)
	    {
	      /* Perform the reduction in the appropriate type.  */
	      tree rvectype = vectype_out;
	      if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
					      TREE_TYPE (TREE_TYPE (def))))
		rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
					      TYPE_VECTOR_SUBPARTS
						(vectype_out));
	      def = vect_create_partial_epilog (def, rvectype,
						STMT_VINFO_REDUC_CODE
						  (reduc_info),
						&stmts);
	    }
	  /* The epilogue loop might use a different vector mode, like
	     VNx2DI vs. V2DI.  */
	  if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
	    {
	      tree reduc_type = build_vector_type_for_mode
		(TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
	      def = gimple_convert (&stmts, reduc_type, def);
	    }
	  /* Adjust the input so we pick up the partially reduced value
	     for the skip edge in vect_create_epilog_for_reduction.  */
	  accumulator->reduc_input = def;
	  /* And the reduction could be carried out using a different sign.  */
	  if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
	    def = gimple_convert (&stmts, vectype_out, def);
	  if (loop_vinfo->main_loop_edge)
	    {
	      /* While we'd like to insert on the edge this will split
		 blocks and disturb bookkeeping, we also will eventually
		 need this on the skip edge.  Rely on sinking to
		 fixup optimal placement and insert in the pred.  */
	      gimple_stmt_iterator gsi
		= gsi_last_bb (loop_vinfo->main_loop_edge->src);
	      /* Insert before a cond that eventually skips the
		 epilogue.  */
	      if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
		gsi_prev (&gsi);
	      gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
	    }
	  else
	    gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
					      stmts);
	}
      if (loop_vinfo->main_loop_edge)
	vec_initial_defs[0]
	  = vect_get_main_loop_result (loop_vinfo, def,
				       vec_initial_defs[0]);
      else
	vec_initial_defs.safe_push (def);
    }

  /* Generate the reduction PHIs upfront.  */
  for (i = 0; i < vec_num; i++)
    {
      tree vec_init_def = vec_initial_defs[i];
      for (j = 0; j < ncopies; j++)
	{
	  /* Create the reduction-phi that defines the reduction
	     operand.  */
	  gphi *new_phi = create_phi_node (vec_dest, loop->header);

	  /* Set the loop-entry arg of the reduction-phi.  */
	  if (j != 0 && nested_cycle)
	    vec_init_def = vec_initial_defs[j];
	  add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
		       UNKNOWN_LOCATION);

	  /* The loop-latch arg is set in epilogue processing.  */

	  if (slp_node)
	    slp_node->push_vec_def (new_phi);
	  else
	    {
	      if (j == 0)
		*vec_stmt = new_phi;
	      STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
	    }
	}
    }

  return true;
}
8943
8944 /* Vectorizes LC PHIs. */
8945
8946 bool
8947 vectorizable_lc_phi (loop_vec_info loop_vinfo,
8948 stmt_vec_info stmt_info, gimple **vec_stmt,
8949 slp_tree slp_node)
8950 {
8951 if (!loop_vinfo
8952 || !is_a <gphi *> (stmt_info->stmt)
8953 || gimple_phi_num_args (stmt_info->stmt) != 1)
8954 return false;
8955
8956 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8957 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8958 return false;
8959
8960 if (!vec_stmt) /* transformation not required. */
8961 {
8962 /* Deal with copies from externs or constants that disguise as
8963 loop-closed PHI nodes (PR97886). */
8964 if (slp_node
8965 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8966 SLP_TREE_VECTYPE (slp_node)))
8967 {
8968 if (dump_enabled_p ())
8969 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8970 "incompatible vector types for invariants\n");
8971 return false;
8972 }
8973 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
8974 return true;
8975 }
8976
8977 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8978 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8979 basic_block bb = gimple_bb (stmt_info->stmt);
8980 edge e = single_pred_edge (bb);
8981 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8982 auto_vec<tree> vec_oprnds;
8983 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
8984 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
8985 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8986 for (unsigned i = 0; i < vec_oprnds.length (); i++)
8987 {
8988 /* Create the vectorized LC PHI node. */
8989 gphi *new_phi = create_phi_node (vec_dest, bb);
8990 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8991 if (slp_node)
8992 slp_node->push_vec_def (new_phi);
8993 else
8994 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8995 }
8996 if (!slp_node)
8997 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8998
8999 return true;
9000 }
9001
/* Vectorizes PHIs.  Only SLP-vectorized internal-def PHIs are handled;
   at analysis time verify all incoming defs are vectorizable with a
   compatible vector type and record the PHI cost, at transform time
   create the vector PHI nodes and fill in the already-available
   incoming arguments (the remaining ones are filled in later by the
   SLP scheduler).  */

bool
vectorizable_phi (vec_info *,
		  stmt_vec_info stmt_info, gimple **vec_stmt,
		  slp_tree slp_node, stmt_vector_for_cost *cost_vec)
{
  if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
    return false;

  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
    return false;

  tree vectype = SLP_TREE_VECTYPE (slp_node);

  if (!vec_stmt) /* transformation not required.  */
    {
      slp_tree child;
      unsigned i;
      /* Every incoming edge needs a vectorized (or invariant) definition
	 of a type compatible with the PHI's vector type.  */
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
	if (!child)
	  {
	    if (dump_enabled_p ())
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "PHI node with unvectorized backedge def\n");
	    return false;
	  }
	else if (!vect_maybe_update_slp_op_vectype (child, vectype))
	  {
	    if (dump_enabled_p ())
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "incompatible vector types for invariants\n");
	    return false;
	  }
	else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
		 && !useless_type_conversion_p (vectype,
						SLP_TREE_VECTYPE (child)))
	  {
	    /* With bools we can have mask and non-mask precision vectors
	       or different non-mask precisions.  While pattern recog is
	       supposed to guarantee consistency here, bugs in it can cause
	       mismatches (PR103489 and PR103800 for example).
	       Deal with them here instead of ICEing later.  */
	    if (dump_enabled_p ())
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "incompatible vector type setup from "
			       "bool pattern detection\n");
	    return false;
	  }

      /* For single-argument PHIs assume coalescing which means zero cost
	 for the scalar and the vector PHIs.  This avoids artificially
	 favoring the vector path (but may pessimize it in some cases).  */
      if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
	record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
			  vector_stmt, stmt_info, vectype, 0, vect_body);
      STMT_VINFO_TYPE (stmt_info) = phi_info_type;
      return true;
    }

  tree scalar_dest = gimple_phi_result (stmt_info->stmt);
  basic_block bb = gimple_bb (stmt_info->stmt);
  tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
  auto_vec<gphi *> new_phis;
  for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
    {
      slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];

      /* Skip not yet vectorized defs.  Their PHI arguments are filled
	 in later, once they are available.  */
      if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
	  && SLP_TREE_VEC_DEFS (child).is_empty ())
	continue;

      auto_vec<tree> vec_oprnds;
      vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
      if (!new_phis.exists ())
	{
	  /* The first already-vectorized child determines how many
	     vector PHIs to create; create them lazily here.  */
	  new_phis.create (vec_oprnds.length ());
	  for (unsigned j = 0; j < vec_oprnds.length (); j++)
	    {
	      /* Create the vectorized LC PHI node.  */
	      new_phis.quick_push (create_phi_node (vec_dest, bb));
	      slp_node->push_vec_def (new_phis[j]);
	    }
	}
      edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
      for (unsigned j = 0; j < vec_oprnds.length (); j++)
	add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
    }
  /* We should have at least one already vectorized child.  */
  gcc_assert (new_phis.exists ());

  return true;
}
9096
/* Vectorizes first order recurrences.  An overview of the transformation
   is described below.  Suppose we have the following loop.

     int t = 0;
     for (int i = 0; i < n; ++i)
       {
	 b[i] = a[i] - t;
	 t = a[i];
       }

   There is a first-order recurrence on 'a'.  For this loop, the scalar IR
   looks (simplified) like:

    scalar.preheader:
      init = 0;

    scalar.body:
      i = PHI <0(scalar.preheader), i+1(scalar.body)>
      _2 = PHI <init(scalar.preheader), _1(scalar.body)>
      _1 = a[i]
      b[i] = _1 - _2
      if (i < n) goto scalar.body

   In this example, _2 is a recurrence because its value depends on the
   previous iteration.  We vectorize this as (VF = 4)

    vector.preheader:
      vect_init = vect_cst(..., ..., ..., 0)

    vector.body
      i = PHI <0(vector.preheader), i+4(vector.body)>
      vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
      vect_2 = a[i, i+1, i+2, i+3];
      vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
      b[i, i+1, i+2, i+3] = vect_2 - vect_3
      if (..) goto vector.body

   In this function, vectorizable_recurr, we code generate both the
   vector PHI node and the permute since those together compute the
   vectorized value of the scalar PHI.  We do not yet have the
   backedge value to fill in there nor into the vec_perm.  Those
   are filled in maybe_set_vectorized_backedge_value and
   vect_schedule_scc.

   TODO:  Since the scalar loop does not have a use of the recurrence
   outside of the loop the natural way to implement peeling via
   vectorizing the live value doesn't work.  For now peeling of loops
   with a recurrence is not implemented.  For SLP the supported cases
   are restricted to those requiring a single vector recurrence PHI.  */

bool
vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
		     gimple **vec_stmt, slp_tree slp_node,
		     stmt_vector_for_cost *cost_vec)
{
  if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
    return false;

  gphi *phi = as_a<gphi *> (stmt_info->stmt);

  /* So far we only support first-order recurrence auto-vectorization.  */
  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
    return false;

  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  unsigned ncopies;
  if (slp_node)
    ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
  else
    ncopies = vect_get_num_copies (loop_vinfo, vectype);
  poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  /* DIST is the number of lanes the recurrence spans; for SLP each
     group lane carries its own recurrence value.  */
  unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
  /* We need to be able to make progress with a single vector.  */
  if (maybe_gt (dist * 2, nunits))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "first order recurrence exceeds half of "
			 "a vector\n");
      return false;
    }

  /* First-order recurrence autovectorization needs to handle permutation
     with indices = [nunits-1, nunits, nunits+1, ...].  */
  vec_perm_builder sel (nunits, 1, 3);
  for (int i = 0; i < 3; ++i)
    sel.quick_push (nunits - dist + i);
  vec_perm_indices indices (sel, 2, nunits);

  if (!vec_stmt) /* transformation not required.  */
    {
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
				 indices))
	return false;

      if (slp_node)
	{
	  /* We eventually need to set a vector type on invariant
	     arguments.  */
	  unsigned j;
	  slp_tree child;
	  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
	    if (!vect_maybe_update_slp_op_vectype
		  (child, SLP_TREE_VECTYPE (slp_node)))
	      {
		if (dump_enabled_p ())
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "incompatible vector types for "
				   "invariants\n");
		return false;
	      }
	}

      /* Verify we have set up compatible types.  */
      edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
      tree latch_vectype = NULL_TREE;
      if (slp_node)
	{
	  slp_tree latch_def = SLP_TREE_CHILDREN (slp_node)[le->dest_idx];
	  latch_vectype = SLP_TREE_VECTYPE (latch_def);
	}
      else
	{
	  tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, le);
	  if (TREE_CODE (latch_def) == SSA_NAME)
	    {
	      stmt_vec_info latch_def_info = loop_vinfo->lookup_def (latch_def);
	      latch_def_info = vect_stmt_to_vectorize (latch_def_info);
	      latch_vectype = STMT_VINFO_VECTYPE (latch_def_info);
	    }
	}
      if (!types_compatible_p (latch_vectype, vectype))
	return false;

      /* The recurrence costs the initialization vector and one permute
	 for each copy.  */
      unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
						 stmt_info, 0, vect_prologue);
      unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
					       stmt_info, 0, vect_body);
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "vectorizable_recurr: inside_cost = %d, "
			 "prologue_cost = %d .\n", inside_cost,
			 prologue_cost);

      STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
      return true;
    }

  /* Transform: splat the preheader value into the initial vector,
     converting it to the vector element type first if needed.  */
  edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
  basic_block bb = gimple_bb (phi);
  tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
  if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
    {
      gimple_seq stmts = NULL;
      preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
      gsi_insert_seq_on_edge_immediate (pe, stmts);
    }
  tree vec_init = build_vector_from_val (vectype, preheader);
  vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);

  /* Create the vectorized first-order PHI node.  */
  tree vec_dest = vect_get_new_vect_var (vectype,
					 vect_simple_var, "vec_recur_");
  gphi *new_phi = create_phi_node (vec_dest, bb);
  add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);

  /* Insert the shuffles for the first-order recurrence autovectorization:
       result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>.  */
  tree perm = vect_gen_perm_mask_checked (vectype, indices);

  /* Insert the required permute after the latch definition.  The
     second and later operands are tentative and will be updated when we have
     vectorized the latch definition.  */
  edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
  gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
  gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
  gsi_next (&gsi2);

  for (unsigned i = 0; i < ncopies; ++i)
    {
      vec_dest = make_ssa_name (vectype);
      gassign *vperm
	= gimple_build_assign (vec_dest, VEC_PERM_EXPR,
			       i == 0 ? gimple_phi_result (new_phi) : NULL,
			       NULL, perm);
      vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);

      if (slp_node)
	slp_node->push_vec_def (vperm);
      else
	STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
    }

  if (!slp_node)
    *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
  return true;
}
9296
9297 /* Return true if VECTYPE represents a vector that requires lowering
9298 by the vector lowering pass. */
9299
9300 bool
9301 vect_emulated_vector_p (tree vectype)
9302 {
9303 return (!VECTOR_MODE_P (TYPE_MODE (vectype))
9304 && (!VECTOR_BOOLEAN_TYPE_P (vectype)
9305 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
9306 }
9307
9308 /* Return true if we can emulate CODE on an integer mode representation
9309 of a vector. */
9310
9311 bool
9312 vect_can_vectorize_without_simd_p (tree_code code)
9313 {
9314 switch (code)
9315 {
9316 case PLUS_EXPR:
9317 case MINUS_EXPR:
9318 case NEGATE_EXPR:
9319 case BIT_AND_EXPR:
9320 case BIT_IOR_EXPR:
9321 case BIT_XOR_EXPR:
9322 case BIT_NOT_EXPR:
9323 return true;
9324
9325 default:
9326 return false;
9327 }
9328 }
9329
9330 /* Likewise, but taking a code_helper. */
9331
9332 bool
9333 vect_can_vectorize_without_simd_p (code_helper code)
9334 {
9335 return (code.is_tree_code ()
9336 && vect_can_vectorize_without_simd_p (tree_code (code)));
9337 }
9338
/* Create vector init for vectorized iv.  Emit into STMTS the statements
   computing the initial vector [X, f(X), f(f(X)), ...] for a nonlinear
   induction with initial value INIT_EXPR and per-iteration step STEP_EXPR,
   where f is determined by INDUCTION_TYPE (shift, negate or multiply).
   NUNITS is the number of vector lanes and VECTYPE the vector type of
   the result.  */
static tree
vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
			       tree step_expr, poly_uint64 nunits,
			       tree vectype,
			       enum vect_induction_op_type induction_type)
{
  unsigned HOST_WIDE_INT const_nunits;
  tree vec_shift, vec_init, new_name;
  unsigned i;
  tree itype = TREE_TYPE (vectype);

  /* iv_loop is the loop to be vectorized.  Create:
     vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr).  */
  new_name = gimple_convert (stmts, itype, init_expr);
  switch (induction_type)
    {
    case vect_step_op_shr:
    case vect_step_op_shl:
      /* Build the initial value from shift_expr: splat X and shift lane i
	 by i * S via a VEC_SERIES shift-amount vector [0, S, 2*S, ...].  */
      vec_init = gimple_build_vector_from_val (stmts,
					       vectype,
					       new_name);
      vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
				build_zero_cst (itype), step_expr);
      vec_init = gimple_build (stmts,
			       (induction_type == vect_step_op_shr
				? RSHIFT_EXPR : LSHIFT_EXPR),
			       vectype, vec_init, vec_shift);
      break;

    case vect_step_op_neg:
      {
	/* Interleave X and -X to get [X, -X, X, -X, ...].  */
	vec_init = gimple_build_vector_from_val (stmts,
						 vectype,
						 new_name);
	tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
				     vectype, vec_init);
	/* The encoding has 2 interleaved stepped patterns.  */
	vec_perm_builder sel (nunits, 2, 3);
	sel.quick_grow (6);
	for (i = 0; i < 3; i++)
	  {
	    sel[2 * i] = i;
	    sel[2 * i + 1] = i + nunits;
	  }
	vec_perm_indices indices (sel, 2, nunits);
	/* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
	   fail when vec_init is const vector.  In that situation vec_perm is
	   not really needed.  */
	tree perm_mask_even
	  = vect_gen_perm_mask_any (vectype, indices);
	vec_init = gimple_build (stmts, VEC_PERM_EXPR,
				 vectype,
				 vec_init, vec_neg,
				 perm_mask_even);
      }
      break;

    case vect_step_op_mul:
      {
	/* Use unsigned mult to avoid UD integer overflow.  */
	gcc_assert (nunits.is_constant (&const_nunits));
	tree utype = unsigned_type_for (itype);
	tree uvectype = build_vector_type (utype,
					   TYPE_VECTOR_SUBPARTS (vectype));
	new_name = gimple_convert (stmts, utype, new_name);
	vec_init = gimple_build_vector_from_val (stmts,
						 uvectype,
						 new_name);
	tree_vector_builder elts (uvectype, const_nunits, 1);
	tree elt_step = build_one_cst (utype);

	elts.quick_push (elt_step);
	for (i = 1; i < const_nunits; i++)
	  {
	    /* Create: elt_step = pow (step, i) for lane i.  */
	    elt_step = gimple_build (stmts, MULT_EXPR,
				     utype, elt_step, step_expr);
	    elts.quick_push (elt_step);
	  }
	/* Create a vector from [new_name_0, new_name_1, ...,
	   new_name_nunits-1].  */
	tree vec_mul = gimple_build_vector (stmts, &elts);
	vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
				 vec_init, vec_mul);
	vec_init = gimple_convert (stmts, vectype, vec_init);
      }
      break;

    default:
      gcc_unreachable ();
    }

  return vec_init;
}
9435
/* Peel init_expr by skip_niter for induction_type.  I.e. advance the
   nonlinear induction's initial value INIT_EXPR by SKIP_NITERS (a compile
   time constant) applications of STEP_EXPR, emitting any needed statements
   into STMTS, and return the peeled initial value.  */
tree
vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
			     tree skip_niters, tree step_expr,
			     enum vect_induction_op_type induction_type)
{
  gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
  tree type = TREE_TYPE (init_expr);
  unsigned prec = TYPE_PRECISION (type);
  switch (induction_type)
    {
    case vect_step_op_neg:
      /* Negation is self-inverse: only an odd number of skipped
	 iterations changes the value.  */
      if (TREE_INT_CST_LOW (skip_niters) % 2)
	init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
      /* else no change.  */
      break;

    case vect_step_op_shr:
    case vect_step_op_shl:
      skip_niters = gimple_convert (stmts, type, skip_niters);
      step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
      /* When shift amount >= precision, need to avoid UD.
	 In the original loop, there's no UD, and according to semantic,
	 init_expr should be 0 for lshr, ashl, and >>= (prec - 1) for ashr.  */
      if (!tree_fits_uhwi_p (step_expr)
	  || tree_to_uhwi (step_expr) >= prec)
	{
	  if (induction_type == vect_step_op_shl
	      || TYPE_UNSIGNED (type))
	    init_expr = build_zero_cst (type);
	  else
	    init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
				      init_expr,
				      wide_int_to_tree (type, prec - 1));
	}
      else
	init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
					  ? RSHIFT_EXPR : LSHIFT_EXPR),
				  type, init_expr, step_expr);
      break;

    case vect_step_op_mul:
      {
	/* Compute step_expr ** skip_niters modulo 2**prec with GMP and
	   fold it into a single multiplication; done in the unsigned
	   type to avoid UD from signed overflow.  */
	tree utype = unsigned_type_for (type);
	init_expr = gimple_convert (stmts, utype, init_expr);
	wide_int skipn = wi::to_wide (skip_niters);
	wide_int begin = wi::to_wide (step_expr);
	auto_mpz base, exp, mod, res;
	wi::to_mpz (begin, base, TYPE_SIGN (type));
	wi::to_mpz (skipn, exp, UNSIGNED);
	mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type));
	mpz_powm (res, base, exp, mod);
	begin = wi::from_mpz (utype, res, true);
	tree mult_expr = wide_int_to_tree (utype, begin);
	init_expr = gimple_build (stmts, MULT_EXPR, utype,
				  init_expr, mult_expr);
	init_expr = gimple_convert (stmts, type, init_expr);
      }
      break;

    default:
      gcc_unreachable ();
    }

  return init_expr;
}
9502
9503 /* Create vector step for vectorized iv. */
9504 static tree
9505 vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9506 poly_uint64 vf,
9507 enum vect_induction_op_type induction_type)
9508 {
9509 tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9510 tree new_name = NULL;
9511 /* Step should be pow (step, vf) for mult induction. */
9512 if (induction_type == vect_step_op_mul)
9513 {
9514 gcc_assert (vf.is_constant ());
9515 wide_int begin = wi::to_wide (step_expr);
9516
9517 for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9518 begin = wi::mul (begin, wi::to_wide (step_expr));
9519
9520 new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9521 }
9522 else if (induction_type == vect_step_op_neg)
9523 /* Do nothing. */
9524 ;
9525 else
9526 new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9527 expr, step_expr);
9528 return new_name;
9529 }
9530
9531 static tree
9532 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9533 stmt_vec_info stmt_info,
9534 tree new_name, tree vectype,
9535 enum vect_induction_op_type induction_type)
9536 {
9537 /* No step is needed for neg induction. */
9538 if (induction_type == vect_step_op_neg)
9539 return NULL;
9540
9541 tree t = unshare_expr (new_name);
9542 gcc_assert (CONSTANT_CLASS_P (new_name)
9543 || TREE_CODE (new_name) == SSA_NAME);
9544 tree new_vec = build_vector_from_val (vectype, t);
9545 tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9546 new_vec, vectype, NULL);
9547 return vec_step;
9548 }
9549
9550 /* Update vectorized iv with vect_step, induc_def is init. */
9551 static tree
9552 vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9553 tree induc_def, tree vec_step,
9554 enum vect_induction_op_type induction_type)
9555 {
9556 tree vec_def = induc_def;
9557 switch (induction_type)
9558 {
9559 case vect_step_op_mul:
9560 {
9561 /* Use unsigned mult to avoid UD integer overflow. */
9562 tree uvectype
9563 = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
9564 TYPE_VECTOR_SUBPARTS (vectype));
9565 vec_def = gimple_convert (stmts, uvectype, vec_def);
9566 vec_step = gimple_convert (stmts, uvectype, vec_step);
9567 vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9568 vec_def, vec_step);
9569 vec_def = gimple_convert (stmts, vectype, vec_def);
9570 }
9571 break;
9572
9573 case vect_step_op_shr:
9574 vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9575 vec_def, vec_step);
9576 break;
9577
9578 case vect_step_op_shl:
9579 vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9580 vec_def, vec_step);
9581 break;
9582 case vect_step_op_neg:
9583 vec_def = induc_def;
9584 /* Do nothing. */
9585 break;
9586 default:
9587 gcc_unreachable ();
9588 }
9589
9590 return vec_def;
9591
9592 }
9593
/* Function vectorizable_nonlinear_induction

   Check if STMT_INFO performs a nonlinear induction computation that can be
   vectorized.  If VEC_STMT is also passed, vectorize the induction PHI: create
   a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
   basic block.
   Return true if STMT_INFO is vectorizable in this way.  */

static bool
vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
				  stmt_vec_info stmt_info,
				  gimple **vec_stmt, slp_tree slp_node,
				  stmt_vector_for_cost *cost_vec)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  unsigned ncopies;
  bool nested_in_vect_loop = false;
  class loop *iv_loop;
  tree vec_def;
  edge pe = loop_preheader_edge (loop);
  basic_block new_bb;
  tree vec_init, vec_step;
  tree new_name;
  gimple *new_stmt;
  gphi *induction_phi;
  tree induc_def, vec_dest;
  tree init_expr, step_expr;
  tree niters_skip;
  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  unsigned i;
  gimple_stmt_iterator si;

  gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);

  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  enum vect_induction_op_type induction_type
    = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);

  /* This function only handles nonlinear inductions; linear ones are
     handled elsewhere.  */
  gcc_assert (induction_type > vect_step_op_add);

  if (slp_node)
    ncopies = 1;
  else
    ncopies = vect_get_num_copies (loop_vinfo, vectype);
  gcc_assert (ncopies >= 1);

  /* FORNOW. Only handle nonlinear induction in the same loop.  */
  if (nested_in_vect_loop_p (loop, stmt_info))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "nonlinear induction in nested loop.\n");
      return false;
    }

  iv_loop = loop;
  gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);

  /* TODO: Support slp for nonlinear iv. There should be separate vector iv
     update for each iv and a permutation to generate wanted vector iv.  */
  if (slp_node)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "SLP induction not supported for nonlinear"
			 " induction.\n");
      return false;
    }

  if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "floating point nonlinear induction vectorization"
			 " not supported.\n");
      return false;
    }

  step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
  init_expr = vect_phi_initial_value (phi);
  gcc_assert (step_expr != NULL_TREE && init_expr != NULL
	      && TREE_CODE (step_expr) == INTEGER_CST);
  /* step_expr should be aligned with init_expr,
     .i.e. uint64 a >> 1, step is int, but vector<uint64> shift is used.  */
  step_expr = fold_convert (TREE_TYPE (vectype), step_expr);

  if (TREE_CODE (init_expr) == INTEGER_CST)
    init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
  else if (!tree_nop_conversion_p (TREE_TYPE (vectype), TREE_TYPE (init_expr)))
    {
      /* INIT_EXPR could be a bit_field, bail out for such case.  */
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "nonlinear induction vectorization failed:"
			 " component type of vectype is not a nop conversion"
			 " from type of init_expr.\n");
      return false;
    }

  /* Per-kind target support checks.  */
  switch (induction_type)
    {
    case vect_step_op_neg:
      if (maybe_eq (TYPE_VECTOR_SUBPARTS (vectype), 1u))
	return false;
      if (TREE_CODE (init_expr) != INTEGER_CST
	  && TREE_CODE (init_expr) != REAL_CST)
	{
	  /* Check for backend support of NEGATE_EXPR and vec_perm.  */
	  if (!directly_supported_p (NEGATE_EXPR, vectype))
	    return false;

	  /* The encoding has 2 interleaved stepped patterns.  */
	  vec_perm_builder sel (nunits, 2, 3);
	  machine_mode mode = TYPE_MODE (vectype);
	  sel.quick_grow (6);
	  for (i = 0; i < 3; i++)
	    {
	      sel[i * 2] = i;
	      sel[i * 2 + 1] = i + nunits;
	    }
	  vec_perm_indices indices (sel, 2, nunits);
	  if (!can_vec_perm_const_p (mode, mode, indices))
	    return false;
	}
      break;

    case vect_step_op_mul:
      {
	/* Check for backend support of MULT_EXPR.  */
	if (!directly_supported_p (MULT_EXPR, vectype))
	  return false;

	/* ?? How to construct vector step for variable number vector.
	   [ 1, step, pow (step, 2), pow (step, 4), .. ].  */
	if (!vf.is_constant ())
	  return false;
      }
      break;

    case vect_step_op_shr:
      /* Check for backend support of RSHIFT_EXPR.  */
      if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
	return false;

      /* Don't shift more than type precision to avoid UD.  */
      if (!tree_fits_uhwi_p (step_expr)
	  || maybe_ge (nunits * tree_to_uhwi (step_expr),
		       TYPE_PRECISION (TREE_TYPE (init_expr))))
	return false;
      break;

    case vect_step_op_shl:
      /* Check for backend support of LSHIFT_EXPR.  */
      if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
	return false;

      /* Don't shift more than type precision to avoid UD.  */
      if (!tree_fits_uhwi_p (step_expr)
	  || maybe_ge (nunits * tree_to_uhwi (step_expr),
		       TYPE_PRECISION (TREE_TYPE (init_expr))))
	return false;

      break;

    default:
      gcc_unreachable ();
    }

  if (!vec_stmt) /* transformation not required.  */
    {
      unsigned inside_cost = 0, prologue_cost = 0;
      /* loop cost for vec_loop.  */
      inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
				      stmt_info, 0, vect_body);

      /* Neg induction doesn't have any inside_cost: all the work is
	 done in the prologue.  */
      if (induction_type == vect_step_op_neg)
	inside_cost = 0;

      /* prologue cost for vec_init and vec_step.  */
      prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
					stmt_info, 0, vect_prologue);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "vect_model_induction_cost: inside_cost = %d, "
			 "prologue_cost = %d. \n", inside_cost,
			 prologue_cost);

      STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
      DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
      return true;
    }

  /* Transform.  */

  /* Compute a vector variable, initialized with the first VF values of
     the induction variable.  E.g., for an iv with IV_PHI='X' and
     evolution S, for a vector of 4 units, we want to compute:
     [X, X + S, X + 2*S, X + 3*S].  */

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");

  pe = loop_preheader_edge (iv_loop);
  /* Find the first insertion point in the BB.  */
  basic_block bb = gimple_bb (phi);
  si = gsi_after_labels (bb);

  gimple_seq stmts = NULL;

  niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
  /* If we are using the loop mask to "peel" for alignment then we need
     to adjust the start value here.  */
  if (niters_skip != NULL_TREE)
    init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
					     step_expr, induction_type);

  vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
					    step_expr, nunits, vectype,
					    induction_type);
  if (stmts)
    {
      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
      gcc_assert (!new_bb);
    }

  stmts = NULL;
  new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
					    vf, induction_type);
  if (stmts)
    {
      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
      gcc_assert (!new_bb);
    }

  vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
						new_name, vectype,
						induction_type);
  /* Create the following def-use cycle:
     loop prolog:
	  vec_init = ...
	  vec_step = ...
     loop:
	  vec_iv = PHI <vec_init, vec_loop>
	  ...
	  STMT
	  ...
	  vec_loop = vec_iv + vec_step;  */

  /* Create the induction-phi that defines the induction-operand.  */
  vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
  induction_phi = create_phi_node (vec_dest, iv_loop->header);
  induc_def = PHI_RESULT (induction_phi);

  /* Create the iv update inside the loop.  */
  stmts = NULL;
  vec_def = vect_update_nonlinear_iv (&stmts, vectype,
				      induc_def, vec_step,
				      induction_type);

  gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
  new_stmt = SSA_NAME_DEF_STMT (vec_def);

  /* Set the arguments of the phi node:  */
  add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
  add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
	       UNKNOWN_LOCATION);

  STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
  *vec_stmt = induction_phi;

  /* In case that vectorization factor (VF) is bigger than the number
     of elements that we can fit in a vectype (nunits), we have to generate
     more than one vector stmt - i.e - we need to "unroll" the
     vector stmt by a factor VF/nunits.  For more details see documentation
     in vectorizable_operation.  */

  if (ncopies > 1)
    {
      stmts = NULL;
      /* FORNOW. This restriction should be relaxed.  */
      gcc_assert (!nested_in_vect_loop);

      /* The per-copy step advances by NUNITS scalar iterations.  */
      new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
						nunits, induction_type);

      vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
						    new_name, vectype,
						    induction_type);
      vec_def = induc_def;
      for (i = 1; i < ncopies; i++)
	{
	  /* vec_i = vec_prev + vec_step.  */
	  stmts = NULL;
	  vec_def = vect_update_nonlinear_iv (&stmts, vectype,
					      vec_def, vec_step,
					      induction_type);
	  gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
	  new_stmt = SSA_NAME_DEF_STMT (vec_def);
	  STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
	}
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "transform induction: created def-use cycle: %G%G",
		     (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));

  return true;
}
9908
9909 /* Function vectorizable_induction
9910
9911 Check if STMT_INFO performs an induction computation that can be vectorized.
9912 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9913 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9914 Return true if STMT_INFO is vectorizable in this way. */
9915
bool
vectorizable_induction (loop_vec_info loop_vinfo,
			stmt_vec_info stmt_info,
			gimple **vec_stmt, slp_tree slp_node,
			stmt_vector_for_cost *cost_vec)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  unsigned ncopies;
  bool nested_in_vect_loop = false;
  class loop *iv_loop;
  tree vec_def;
  edge pe = loop_preheader_edge (loop);
  basic_block new_bb;
  tree new_vec, vec_init, vec_step, t;
  tree new_name;
  gimple *new_stmt;
  gphi *induction_phi;
  tree induc_def, vec_dest;
  tree init_expr, step_expr;
  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  unsigned i;
  tree expr;
  gimple_stmt_iterator si;
  enum vect_induction_op_type induction_type
    = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);

  /* An induction is represented by a loop-header PHI; fail for any
     other statement kind.  */
  gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
  if (!phi)
    return false;

  if (!STMT_VINFO_RELEVANT_P (stmt_info))
    return false;

  /* Make sure it was recognized as induction computation.  */
  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
    return false;

  /* Handle nonlinear induction in a separate place.  */
  if (induction_type != vect_step_op_add)
    return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
					     vec_stmt, slp_node, cost_vec);

  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);

  if (slp_node)
    ncopies = 1;
  else
    ncopies = vect_get_num_copies (loop_vinfo, vectype);
  gcc_assert (ncopies >= 1);

  /* FORNOW. These restrictions should be relaxed.  */
  if (nested_in_vect_loop_p (loop, stmt_info))
    {
      imm_use_iterator imm_iter;
      use_operand_p use_p;
      gimple *exit_phi;
      edge latch_e;
      tree loop_arg;

      if (ncopies > 1)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "multiple types in nested loop.\n");
	  return false;
	}

      /* Look for a use of the inner-loop latch value outside the inner
	 loop; if the only such use is a PHI that is neither relevant nor
	 live we cannot handle it.  */
      exit_phi = NULL;
      latch_e = loop_latch_edge (loop->inner);
      loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
	{
	  gimple *use_stmt = USE_STMT (use_p);
	  if (is_gimple_debug (use_stmt))
	    continue;

	  if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
	    {
	      exit_phi = use_stmt;
	      break;
	    }
	}
      if (exit_phi)
	{
	  stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
	  if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
		&& !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "inner-loop induction only used outside "
				 "of the outer vectorized loop.\n");
	      return false;
	    }
	}

      nested_in_vect_loop = true;
      iv_loop = loop->inner;
    }
  else
    iv_loop = loop;
  gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);

  if (slp_node && !nunits.is_constant ())
    {
      /* The current SLP code creates the step value element-by-element.  */
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "SLP induction not supported for variable-length"
			 " vectors.\n");
      return false;
    }

  if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "floating point induction vectorization disabled\n");
      return false;
    }

  step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
  gcc_assert (step_expr != NULL_TREE);
  if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
      && !type_has_mode_precision_p (TREE_TYPE (step_expr)))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "bit-precision induction vectorization not "
			 "supported.\n");
      return false;
    }
  tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);

  /* Check for backend support of PLUS/MINUS_EXPR.  */
  if (!directly_supported_p (PLUS_EXPR, step_vectype)
      || !directly_supported_p (MINUS_EXPR, step_vectype))
    return false;

  if (!vec_stmt) /* transformation not required.  */
    {
      unsigned inside_cost = 0, prologue_cost = 0;
      if (slp_node)
	{
	  /* We eventually need to set a vector type on invariant
	     arguments.  */
	  unsigned j;
	  slp_tree child;
	  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
	    if (!vect_maybe_update_slp_op_vectype
		(child, SLP_TREE_VECTYPE (slp_node)))
	      {
		if (dump_enabled_p ())
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "incompatible vector types for "
				   "invariants\n");
		return false;
	      }
	  /* loop cost for vec_loop.  */
	  inside_cost
	    = record_stmt_cost (cost_vec,
				SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
				vector_stmt, stmt_info, 0, vect_body);
	  /* prologue cost for vec_init (if not nested) and step.  */
	  prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
					    scalar_to_vec,
					    stmt_info, 0, vect_prologue);
	}
      else /* if (!slp_node) */
	{
	  /* loop cost for vec_loop.  */
	  inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
					  stmt_info, 0, vect_body);
	  /* prologue cost for vec_init and vec_step.  */
	  prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
					    stmt_info, 0, vect_prologue);
	}
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "vect_model_induction_cost: inside_cost = %d, "
			 "prologue_cost = %d .\n", inside_cost,
			 prologue_cost);

      STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
      DUMP_VECT_SCOPE ("vectorizable_induction");
      return true;
    }

  /* Transform.  */

  /* Compute a vector variable, initialized with the first VF values of
     the induction variable.  E.g., for an iv with IV_PHI='X' and
     evolution S, for a vector of 4 units, we want to compute:
     [X, X + S, X + 2*S, X + 3*S].  */

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");

  pe = loop_preheader_edge (iv_loop);
  /* Find the first insertion point in the BB.  */
  basic_block bb = gimple_bb (phi);
  si = gsi_after_labels (bb);

  /* For SLP induction we have to generate several IVs as for example
     with group size 3 we need
     [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
     [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2].  */
  if (slp_node)
    {
      /* Enforced above.  */
      unsigned int const_nunits = nunits.to_constant ();

      /* The initial values are vectorized, but any lanes > group_size
	 need adjustment.  */
      slp_tree init_node
	= SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];

      /* Gather steps.  Since we do not vectorize inductions as
	 cycles we have to reconstruct the step from SCEV data.  */
      unsigned group_size = SLP_TREE_LANES (slp_node);
      tree *steps = XALLOCAVEC (tree, group_size);
      tree *inits = XALLOCAVEC (tree, group_size);
      stmt_vec_info phi_info;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
	{
	  steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
	  if (!init_node)
	    inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
					   pe->dest_idx);
	}

      /* Now generate the IVs.  */
      unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
      gcc_assert ((const_nunits * nvects) % group_size == 0);
      unsigned nivs;
      if (nested_in_vect_loop)
	nivs = nvects;
      else
	{
	  /* Compute the number of distinct IVs we need.  First reduce
	     group_size if it is a multiple of const_nunits so we get
	     one IV for a group_size of 4 but const_nunits 2.  */
	  unsigned group_sizep = group_size;
	  if (group_sizep % const_nunits == 0)
	    group_sizep = group_sizep / const_nunits;
	  nivs = least_common_multiple (group_sizep,
					const_nunits) / const_nunits;
	}
      tree stept = TREE_TYPE (step_vectype);
      tree lupdate_mul = NULL_TREE;
      if (!nested_in_vect_loop)
	{
	  /* The number of iterations covered in one vector iteration.  */
	  unsigned lup_mul = (nvects * const_nunits) / group_size;
	  lupdate_mul
	    = build_vector_from_val (step_vectype,
				     SCALAR_FLOAT_TYPE_P (stept)
				     ? build_real_from_wide (stept, lup_mul,
							     UNSIGNED)
				     : build_int_cstu (stept, lup_mul));
	}
      tree peel_mul = NULL_TREE;
      gimple_seq init_stmts = NULL;
      /* If we mask-peel for alignment the start values have to be
	 adjusted by the number of skipped scalar iterations.  */
      if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
	{
	  if (SCALAR_FLOAT_TYPE_P (stept))
	    peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
				     LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
	  else
	    peel_mul = gimple_convert (&init_stmts, stept,
				       LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
	  peel_mul = gimple_build_vector_from_val (&init_stmts,
						   step_vectype, peel_mul);
	}
      unsigned ivn;
      auto_vec<tree> vec_steps;
      for (ivn = 0; ivn < nivs; ++ivn)
	{
	  /* Build per-lane step, init and multiplier vectors for this IV;
	     lanes cycle through the group members.  */
	  tree_vector_builder step_elts (step_vectype, const_nunits, 1);
	  tree_vector_builder init_elts (vectype, const_nunits, 1);
	  tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
	  for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
	    {
	      /* The scalar steps of the IVs.  */
	      tree elt = steps[(ivn*const_nunits + eltn) % group_size];
	      elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
	      step_elts.quick_push (elt);
	      if (!init_node)
		{
		  /* The scalar inits of the IVs if not vectorized.  */
		  elt = inits[(ivn*const_nunits + eltn) % group_size];
		  if (!useless_type_conversion_p (TREE_TYPE (vectype),
						  TREE_TYPE (elt)))
		    elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
					TREE_TYPE (vectype), elt);
		  init_elts.quick_push (elt);
		}
	      /* The number of steps to add to the initial values.  */
	      unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
	      mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
				   ? build_real_from_wide (stept,
							   mul_elt, UNSIGNED)
				   : build_int_cstu (stept, mul_elt));
	    }
	  vec_step = gimple_build_vector (&init_stmts, &step_elts);
	  vec_steps.safe_push (vec_step);
	  tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
	  if (peel_mul)
	    step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
				     step_mul, peel_mul);
	  if (!init_node)
	    vec_init = gimple_build_vector (&init_stmts, &init_elts);

	  /* Create the induction-phi that defines the induction-operand.  */
	  vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
					    "vec_iv_");
	  induction_phi = create_phi_node (vec_dest, iv_loop->header);
	  induc_def = PHI_RESULT (induction_phi);

	  /* Create the iv update inside the loop.  */
	  tree up = vec_step;
	  if (lupdate_mul)
	    up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
			       vec_step, lupdate_mul);
	  gimple_seq stmts = NULL;
	  vec_def = gimple_convert (&stmts, step_vectype, induc_def);
	  vec_def = gimple_build (&stmts,
				  PLUS_EXPR, step_vectype, vec_def, up);
	  vec_def = gimple_convert (&stmts, vectype, vec_def);
	  gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
	  add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
		       UNKNOWN_LOCATION);

	  if (init_node)
	    vec_init = vect_get_slp_vect_def (init_node, ivn);
	  /* Bias the initial value by step_mul steps unless that is a
	     no-op.  */
	  if (!nested_in_vect_loop
	      && !integer_zerop (step_mul))
	    {
	      vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
	      up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
				 vec_step, step_mul);
	      vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
				      vec_def, up);
	      vec_init = gimple_convert (&init_stmts, vectype, vec_def);
	    }

	  /* Set the arguments of the phi node:  */
	  add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);

	  slp_node->push_vec_def (induction_phi);
	}
      if (!nested_in_vect_loop)
	{
	  /* Fill up to the number of vectors we need for the whole group.  */
	  nivs = least_common_multiple (group_size,
					const_nunits) / const_nunits;
	  vec_steps.reserve (nivs-ivn);
	  for (; ivn < nivs; ++ivn)
	    {
	      slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
	      vec_steps.quick_push (vec_steps[0]);
	    }
	}

      /* Re-use IVs when we can.  We are generating further vector
	 stmts by adding VF' * stride to the IVs generated above.  */
      if (ivn < nvects)
	{
	  unsigned vfp
	    = least_common_multiple (group_size, const_nunits) / group_size;
	  tree lupdate_mul
	    = build_vector_from_val (step_vectype,
				     SCALAR_FLOAT_TYPE_P (stept)
				     ? build_real_from_wide (stept,
							     vfp, UNSIGNED)
				     : build_int_cstu (stept, vfp));
	  for (; ivn < nvects; ++ivn)
	    {
	      gimple *iv
		= SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
	      tree def = gimple_get_lhs (iv);
	      /* Scale the step only the first time around; afterwards the
		 cached vec_steps entry is already scaled.  */
	      if (ivn < 2*nivs)
		vec_steps[ivn - nivs]
		  = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
				  vec_steps[ivn - nivs], lupdate_mul);
	      gimple_seq stmts = NULL;
	      def = gimple_convert (&stmts, step_vectype, def);
	      def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
				  def, vec_steps[ivn % nivs]);
	      def = gimple_convert (&stmts, vectype, def);
	      /* Insert after the re-used IV's definition (before the first
		 stmt when the IV is defined by the header PHI).  */
	      if (gimple_code (iv) == GIMPLE_PHI)
		gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
	      else
		{
		  gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
		  gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
		}
	      slp_node->push_vec_def (def);
	    }
	}

      new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
      gcc_assert (!new_bb);

      return true;
    }

  init_expr = vect_phi_initial_value (phi);

  gimple_seq stmts = NULL;
  if (!nested_in_vect_loop)
    {
      /* Convert the initial value to the IV update type.  */
      tree new_type = TREE_TYPE (step_expr);
      init_expr = gimple_convert (&stmts, new_type, init_expr);

      /* If we are using the loop mask to "peel" for alignment then we need
	 to adjust the start value here.  */
      tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
      if (skip_niters != NULL_TREE)
	{
	  if (FLOAT_TYPE_P (vectype))
	    skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
					skip_niters);
	  else
	    skip_niters = gimple_convert (&stmts, new_type, skip_niters);
	  tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
					 skip_niters, step_expr);
	  init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
				    init_expr, skip_step);
	}
    }

  if (stmts)
    {
      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
      gcc_assert (!new_bb);
    }

  /* Create the vector that holds the initial_value of the induction.  */
  if (nested_in_vect_loop)
    {
      /* iv_loop is nested in the loop to be vectorized.  init_expr had already
	 been created during vectorization of previous stmts.  We obtain it
	 from the STMT_VINFO_VEC_STMT of the defining stmt.  */
      auto_vec<tree> vec_inits;
      vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
				     init_expr, &vec_inits);
      vec_init = vec_inits[0];
      /* If the initial value is not of proper type, convert it.  */
      if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
	{
	  new_stmt
	    = gimple_build_assign (vect_get_new_ssa_name (vectype,
							  vect_simple_var,
							  "vec_iv_"),
				   VIEW_CONVERT_EXPR,
				   build1 (VIEW_CONVERT_EXPR, vectype,
					   vec_init));
	  vec_init = gimple_assign_lhs (new_stmt);
	  new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
						 new_stmt);
	  gcc_assert (!new_bb);
	}
    }
  else
    {
      /* iv_loop is the loop to be vectorized.  Create:
	 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
      stmts = NULL;
      new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);

      unsigned HOST_WIDE_INT const_nunits;
      if (nunits.is_constant (&const_nunits))
	{
	  tree_vector_builder elts (step_vectype, const_nunits, 1);
	  elts.quick_push (new_name);
	  for (i = 1; i < const_nunits; i++)
	    {
	      /* Create: new_name_i = new_name + step_expr.  */
	      new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
				       new_name, step_expr);
	      elts.quick_push (new_name);
	    }
	  /* Create a vector from [new_name_0, new_name_1, ...,
	     new_name_nunits-1]  */
	  vec_init = gimple_build_vector (&stmts, &elts);
	}
      else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
	/* Build the initial value directly from a VEC_SERIES_EXPR.  */
	vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
				 new_name, step_expr);
      else
	{
	  /* Build:
	       [base, base, base, ...]
	       + (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
	  gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
	  gcc_assert (flag_associative_math);
	  tree index = build_index_vector (step_vectype, 0, 1);
	  tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
							new_name);
	  tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
							step_expr);
	  vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
	  vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
				   vec_init, step_vec);
	  vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
				   vec_init, base_vec);
	}
      vec_init = gimple_convert (&stmts, vectype, vec_init);

      if (stmts)
	{
	  new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
	  gcc_assert (!new_bb);
	}
    }


  /* Create the vector that holds the step of the induction.  */
  gimple_stmt_iterator *step_iv_si = NULL;
  if (nested_in_vect_loop)
    /* iv_loop is nested in the loop to be vectorized.  Generate:
       vec_step = [S, S, S, S]  */
    new_name = step_expr;
  else if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
    {
      /* When we're using loop_len produced by SELECT_VL, the non-final
	 iterations are not always processing VF elements.  So vectorize
	 induction variable instead of

	   _21 = vect_vec_iv_.6_22 + { VF, ... };

	 We should generate:

	   _35 = .SELECT_VL (ivtmp_33, VF);
	   vect_cst__22 = [vec_duplicate_expr] _35;
	   _21 = vect_vec_iv_.6_22 + vect_cst__22;  */
      gcc_assert (!slp_node);
      gimple_seq seq = NULL;
      vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
      tree len = vect_get_loop_len (loop_vinfo, NULL, lens, 1, vectype, 0, 0);
      expr = force_gimple_operand (fold_convert (TREE_TYPE (step_expr),
						 unshare_expr (len)),
				   &seq, true, NULL_TREE);
      new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr), expr,
			       step_expr);
      /* The step now depends on the per-iteration length, so it must be
	 materialized inside the loop rather than on the preheader edge.  */
      gsi_insert_seq_before (&si, seq, GSI_SAME_STMT);
      step_iv_si = &si;
    }
  else
    {
      /* iv_loop is the loop to be vectorized.  Generate:
	 vec_step = [VF*S, VF*S, VF*S, VF*S]  */
      gimple_seq seq = NULL;
      if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
	{
	  expr = build_int_cst (integer_type_node, vf);
	  expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
	}
      else
	expr = build_int_cst (TREE_TYPE (step_expr), vf);
      new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
			       expr, step_expr);
      if (seq)
	{
	  new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
	  gcc_assert (!new_bb);
	}
    }

  t = unshare_expr (new_name);
  gcc_assert (CONSTANT_CLASS_P (new_name)
	      || TREE_CODE (new_name) == SSA_NAME);
  new_vec = build_vector_from_val (step_vectype, t);
  vec_step = vect_init_vector (loop_vinfo, stmt_info,
			       new_vec, step_vectype, step_iv_si);


  /* Create the following def-use cycle:
     loop prolog:
	 vec_init = ...
	 vec_step = ...
     loop:
	 vec_iv = PHI <vec_init, vec_loop>
	 ...
	 STMT
	 ...
	 vec_loop = vec_iv + vec_step;  */

  /* Create the induction-phi that defines the induction-operand.  */
  vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
  induction_phi = create_phi_node (vec_dest, iv_loop->header);
  induc_def = PHI_RESULT (induction_phi);

  /* Create the iv update inside the loop.  */
  stmts = NULL;
  vec_def = gimple_convert (&stmts, step_vectype, induc_def);
  vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
  vec_def = gimple_convert (&stmts, vectype, vec_def);
  gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
  new_stmt = SSA_NAME_DEF_STMT (vec_def);

  /* Set the arguments of the phi node:  */
  add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
  add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
	       UNKNOWN_LOCATION);

  STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
  *vec_stmt = induction_phi;

  /* In case that vectorization factor (VF) is bigger than the number
     of elements that we can fit in a vectype (nunits), we have to generate
     more than one vector stmt - i.e - we need to "unroll" the
     vector stmt by a factor VF/nunits.  For more details see documentation
     in vectorizable_operation.  */

  if (ncopies > 1)
    {
      gimple_seq seq = NULL;
      /* FORNOW. This restriction should be relaxed.  */
      gcc_assert (!nested_in_vect_loop);
      /* We expect LOOP_VINFO_USING_SELECT_VL_P to be false if ncopies > 1.  */
      gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));

      /* Create the vector that holds the step of the induction.  */
      if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
	{
	  expr = build_int_cst (integer_type_node, nunits);
	  expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
	}
      else
	expr = build_int_cst (TREE_TYPE (step_expr), nunits);
      new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
			       expr, step_expr);
      if (seq)
	{
	  new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
	  gcc_assert (!new_bb);
	}

      t = unshare_expr (new_name);
      gcc_assert (CONSTANT_CLASS_P (new_name)
		  || TREE_CODE (new_name) == SSA_NAME);
      new_vec = build_vector_from_val (step_vectype, t);
      vec_step = vect_init_vector (loop_vinfo, stmt_info,
				   new_vec, step_vectype, NULL);

      vec_def = induc_def;
      /* Deliberately iterates ncopies (not ncopies - 1) times: the final
	 iteration produces the latch value of the induction PHI instead of
	 another vectorized copy (see the comment in the else arm below).  */
      for (i = 1; i < ncopies + 1; i++)
	{
	  /* vec_i = vec_prev + vec_step.  */
	  gimple_seq stmts = NULL;
	  vec_def = gimple_convert (&stmts, step_vectype, vec_def);
	  vec_def = gimple_build (&stmts,
				  PLUS_EXPR, step_vectype, vec_def, vec_step);
	  vec_def = gimple_convert (&stmts, vectype, vec_def);

	  gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
	  if (i < ncopies)
	    {
	      new_stmt = SSA_NAME_DEF_STMT (vec_def);
	      STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
	    }
	  else
	    {
	      /* vec_1 = vec_iv + (VF/n * S)
		 vec_2 = vec_1 + (VF/n * S)
		 ...
		 vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop

		 vec_n is used as vec_loop to save the large step register and
		 related operations.  */
	      add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
			   UNKNOWN_LOCATION);
	    }
	}
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "transform induction: created def-use cycle: %G%G",
		     (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));

  return true;
}
10604
10605 /* Function vectorizable_live_operation_1.
10606
   Helper function for vectorizable_live_operation.  */
10608
static tree
vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
			       stmt_vec_info stmt_info, basic_block exit_bb,
			       tree vectype, int ncopies, slp_tree slp_node,
			       tree bitsize, tree bitstart, tree vec_lhs,
			       tree lhs_type, gimple_stmt_iterator *exit_gsi)
{
  /* Multiple predecessors are only expected with early breaks.  */
  gcc_assert (single_pred_p (exit_bb) || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));

  /* Make the vectorized live-out value available in EXIT_BB through a
     PHI; every incoming edge carries the same value VEC_LHS.  */
  tree vec_lhs_phi = copy_ssa_name (vec_lhs);
  gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
  for (unsigned i = 0; i < gimple_phi_num_args (phi); i++)
    SET_PHI_ARG_DEF (phi, i, vec_lhs);

  gimple_seq stmts = NULL;
  tree new_tree;

  /* If bitstart is 0 then we can use a BIT_FIELD_REF  */
  if (integer_zerop (bitstart))
    {
      tree scalar_res = gimple_build (&stmts, BIT_FIELD_REF, TREE_TYPE (vectype),
				      vec_lhs_phi, bitsize, bitstart);

      /* Convert the extracted vector element to the scalar type.  */
      new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
    }
  else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
    {
      /* Emit:

	   SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>

	 where VEC_LHS is the vectorized live-out result and LEN is the
	 loop length of the final iteration (BIAS is the target's partial
	 load/store bias).  */
      gcc_assert (ncopies == 1 && !slp_node);
      gimple_seq tem = NULL;
      gimple_stmt_iterator gsi = gsi_last (tem);
      tree len = vect_get_loop_len (loop_vinfo, &gsi,
				    &LOOP_VINFO_LENS (loop_vinfo),
				    1, vectype, 0, 0);

      /* BIAS - 1.  */
      signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
      tree bias_minus_one
	= int_const_binop (MINUS_EXPR,
			   build_int_cst (TREE_TYPE (len), biasval),
			   build_one_cst (TREE_TYPE (len)));

      /* LAST_INDEX = LEN + (BIAS - 1).  */
      tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
				      len, bias_minus_one);

      /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>.  */
      tree scalar_res
	= gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
			vec_lhs_phi, last_index);

      /* Convert the extracted vector element to the scalar type.  */
      new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
    }
  else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
    {
      /* Emit:

	   SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>

	 where VEC_LHS is the vectorized live-out result and MASK is
	 the loop mask for the final iteration.  */
      gcc_assert (!slp_node);
      tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
      gimple_seq tem = NULL;
      gimple_stmt_iterator gsi = gsi_last (tem);
      tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
				      &LOOP_VINFO_MASKS (loop_vinfo),
				      1, vectype, 0);
      tree scalar_res;
      gimple_seq_add_seq (&stmts, tem);

      scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
				 mask, vec_lhs_phi);

      /* Convert the extracted vector element to the scalar type.  */
      new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
    }
  else
    {
      /* Extract the element at BITSTART with a BIT_FIELD_REF; vector
	 booleans first need an integer type of the same width since
	 BIT_FIELD_REF cannot produce a boolean vector element.  */
      tree bftype = TREE_TYPE (vectype);
      if (VECTOR_BOOLEAN_TYPE_P (vectype))
	bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
      new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
      new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
				       &stmts, true, NULL_TREE);
    }

  /* Insert the extraction code at the start of the exit block and tell
     the caller where it ended up.  */
  *exit_gsi = gsi_after_labels (exit_bb);
  if (stmts)
    gsi_insert_seq_before (exit_gsi, stmts, GSI_SAME_STMT);

  return new_tree;
}
10709
10710 /* Function vectorizable_live_operation.
10711
10712 STMT_INFO computes a value that is used outside the loop. Check if
10713 it can be supported. */
10714
10715 bool
10716 vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
10717 slp_tree slp_node, slp_instance slp_node_instance,
10718 int slp_index, bool vec_stmt_p,
10719 stmt_vector_for_cost *cost_vec)
10720 {
10721 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10722 imm_use_iterator imm_iter;
10723 tree lhs, lhs_type, bitsize;
10724 tree vectype = (slp_node
10725 ? SLP_TREE_VECTYPE (slp_node)
10726 : STMT_VINFO_VECTYPE (stmt_info));
10727 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10728 int ncopies;
10729 gimple *use_stmt;
10730 use_operand_p use_p;
10731 auto_vec<tree> vec_oprnds;
10732 int vec_entry = 0;
10733 poly_uint64 vec_index = 0;
10734
10735 gcc_assert (STMT_VINFO_LIVE_P (stmt_info)
10736 || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10737
10738 /* If a stmt of a reduction is live, vectorize it via
10739 vect_create_epilog_for_reduction. vectorizable_reduction assessed
10740 validity so just trigger the transform here. */
10741 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
10742 {
10743 if (!vec_stmt_p)
10744 return true;
10745 /* For SLP reductions we vectorize the epilogue for all involved stmts
10746 together. */
10747 if (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info) && slp_index != 0)
10748 return true;
10749 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
10750 gcc_assert (reduc_info->is_reduc_info);
10751 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10752 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10753 return true;
10754
10755 if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
10756 || !LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
10757 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10758 slp_node_instance,
10759 LOOP_VINFO_IV_EXIT (loop_vinfo));
10760
10761 /* If early break we only have to materialize the reduction on the merge
10762 block, but we have to find an alternate exit first. */
10763 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
10764 {
10765 slp_tree phis_node = slp_node ? slp_node_instance->reduc_phis : NULL;
10766 for (auto exit : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
10767 if (exit != LOOP_VINFO_IV_EXIT (loop_vinfo))
10768 {
10769 vect_create_epilog_for_reduction (loop_vinfo, reduc_info,
10770 phis_node, slp_node_instance,
10771 exit);
10772 break;
10773 }
10774 if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
10775 vect_create_epilog_for_reduction (loop_vinfo, reduc_info,
10776 phis_node, slp_node_instance,
10777 LOOP_VINFO_IV_EXIT (loop_vinfo));
10778 }
10779
10780 return true;
10781 }
10782
10783 /* If STMT is not relevant and it is a simple assignment and its inputs are
10784 invariant then it can remain in place, unvectorized. The original last
10785 scalar value that it computes will be used. */
10786 if (!STMT_VINFO_RELEVANT_P (stmt_info))
10787 {
10788 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10789 if (dump_enabled_p ())
10790 dump_printf_loc (MSG_NOTE, vect_location,
10791 "statement is simple and uses invariant. Leaving in "
10792 "place.\n");
10793 return true;
10794 }
10795
10796 if (slp_node)
10797 ncopies = 1;
10798 else
10799 ncopies = vect_get_num_copies (loop_vinfo, vectype);
10800
10801 if (slp_node)
10802 {
10803 gcc_assert (slp_index >= 0);
10804
10805 /* Get the last occurrence of the scalar index from the concatenation of
10806 all the slp vectors. Calculate which slp vector it is and the index
10807 within. */
10808 int num_scalar = SLP_TREE_LANES (slp_node);
10809 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10810 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
10811
10812 /* Calculate which vector contains the result, and which lane of
10813 that vector we need. */
10814 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10815 {
10816 if (dump_enabled_p ())
10817 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10818 "Cannot determine which vector holds the"
10819 " final result.\n");
10820 return false;
10821 }
10822 }
10823
10824 if (!vec_stmt_p)
10825 {
10826 /* No transformation required. */
10827 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10828 {
10829 if (slp_node)
10830 {
10831 if (dump_enabled_p ())
10832 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10833 "can't operate on partial vectors "
10834 "because an SLP statement is live after "
10835 "the loop.\n");
10836 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10837 }
10838 else if (ncopies > 1)
10839 {
10840 if (dump_enabled_p ())
10841 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10842 "can't operate on partial vectors "
10843 "because ncopies is greater than 1.\n");
10844 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10845 }
10846 else
10847 {
10848 gcc_assert (ncopies == 1 && !slp_node);
10849 if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10850 OPTIMIZE_FOR_SPEED))
10851 vect_record_loop_mask (loop_vinfo,
10852 &LOOP_VINFO_MASKS (loop_vinfo),
10853 1, vectype, NULL);
10854 else if (can_vec_extract_var_idx_p (
10855 TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
10856 vect_record_loop_len (loop_vinfo,
10857 &LOOP_VINFO_LENS (loop_vinfo),
10858 1, vectype, 1);
10859 else
10860 {
10861 if (dump_enabled_p ())
10862 dump_printf_loc (
10863 MSG_MISSED_OPTIMIZATION, vect_location,
10864 "can't operate on partial vectors "
10865 "because the target doesn't support extract "
10866 "last reduction.\n");
10867 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10868 }
10869 }
10870 }
10871 /* ??? Enable for loop costing as well. */
10872 if (!loop_vinfo)
10873 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
10874 0, vect_epilogue);
10875 return true;
10876 }
10877
10878 /* Use the lhs of the original scalar statement. */
10879 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10880 if (dump_enabled_p ())
10881 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10882 "stmt %G", stmt);
10883
10884 lhs = gimple_get_lhs (stmt);
10885 lhs_type = TREE_TYPE (lhs);
10886
10887 bitsize = vector_element_bits_tree (vectype);
10888
10889 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
10890 tree vec_lhs, vec_lhs0, bitstart;
10891 gimple *vec_stmt, *vec_stmt0;
10892 if (slp_node)
10893 {
10894 gcc_assert (!loop_vinfo
10895 || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10896 && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
10897
10898 /* Get the correct slp vectorized stmt. */
10899 vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10900 vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10901
10902 /* In case we need to early break vectorize also get the first stmt. */
10903 vec_lhs0 = SLP_TREE_VEC_DEFS (slp_node)[0];
10904 vec_stmt0 = SSA_NAME_DEF_STMT (vec_lhs0);
10905
10906 /* Get entry to use. */
10907 bitstart = bitsize_int (vec_index);
10908 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10909 }
10910 else
10911 {
10912 /* For multiple copies, get the last copy. */
10913 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
10914 vec_lhs = gimple_get_lhs (vec_stmt);
10915
10916 /* In case we need to early break vectorize also get the first stmt. */
10917 vec_stmt0 = STMT_VINFO_VEC_STMTS (stmt_info)[0];
10918 vec_lhs0 = gimple_get_lhs (vec_stmt0);
10919
10920 /* Get the last lane in the vector. */
10921 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
10922 }
10923
10924 if (loop_vinfo)
10925 {
10926 /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
10927 requirement, insert one phi node for it. It looks like:
10928 loop;
10929 BB:
10930 # lhs' = PHI <lhs>
10931 ==>
10932 loop;
10933 BB:
10934 # vec_lhs' = PHI <vec_lhs>
10935 new_tree = lane_extract <vec_lhs', ...>;
10936 lhs' = new_tree; */
10937
10938 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10939 /* Check if we have a loop where the chosen exit is not the main exit,
10940 in these cases for an early break we restart the iteration the vector code
10941 did. For the live values we want the value at the start of the iteration
10942 rather than at the end. */
10943 edge main_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
10944 bool all_exits_as_early_p = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
10945 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10946 if (!is_gimple_debug (use_stmt)
10947 && !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
10948 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10949 {
10950 edge e = gimple_phi_arg_edge (as_a <gphi *> (use_stmt),
10951 phi_arg_index_from_use (use_p));
10952 gcc_assert (loop_exit_edge_p (loop, e));
10953 bool main_exit_edge = e == main_e;
10954 tree tmp_vec_lhs = vec_lhs;
10955 tree tmp_bitstart = bitstart;
10956
10957 /* For early exit where the exit is not in the BB that leads
10958 to the latch then we're restarting the iteration in the
10959 scalar loop. So get the first live value. */
10960 if ((all_exits_as_early_p || !main_exit_edge)
10961 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
10962 {
10963 tmp_vec_lhs = vec_lhs0;
10964 tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
10965 }
10966
10967 gimple_stmt_iterator exit_gsi;
10968 tree new_tree
10969 = vectorizable_live_operation_1 (loop_vinfo, stmt_info,
10970 e->dest, vectype, ncopies,
10971 slp_node, bitsize,
10972 tmp_bitstart, tmp_vec_lhs,
10973 lhs_type, &exit_gsi);
10974
10975 auto gsi = gsi_for_stmt (use_stmt);
10976 tree lhs_phi = gimple_phi_result (use_stmt);
10977 remove_phi_node (&gsi, false);
10978 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10979 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10980 break;
10981 }
10982
10983 /* There a no further out-of-loop uses of lhs by LC-SSA construction. */
10984 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10985 gcc_assert (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)));
10986 }
10987 else
10988 {
10989 /* For basic-block vectorization simply insert the lane-extraction. */
10990 tree bftype = TREE_TYPE (vectype);
10991 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10992 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10993 tree new_tree = build3 (BIT_FIELD_REF, bftype,
10994 vec_lhs, bitsize, bitstart);
10995 gimple_seq stmts = NULL;
10996 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10997 &stmts, true, NULL_TREE);
10998 if (TREE_CODE (new_tree) == SSA_NAME
10999 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
11000 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
11001 if (is_a <gphi *> (vec_stmt))
11002 {
11003 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
11004 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
11005 }
11006 else
11007 {
11008 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
11009 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
11010 }
11011
11012 /* Replace use of lhs with newly computed result. If the use stmt is a
11013 single arg PHI, just replace all uses of PHI result. It's necessary
11014 because lcssa PHI defining lhs may be before newly inserted stmt. */
11015 use_operand_p use_p;
11016 stmt_vec_info use_stmt_info;
11017 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
11018 if (!is_gimple_debug (use_stmt)
11019 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
11020 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
11021 {
11022 /* ??? This can happen when the live lane ends up being
11023 rooted in a vector construction code-generated by an
11024 external SLP node (and code-generation for that already
11025 happened). See gcc.dg/vect/bb-slp-47.c.
11026 Doing this is what would happen if that vector CTOR
11027 were not code-generated yet so it is not too bad.
11028 ??? In fact we'd likely want to avoid this situation
11029 in the first place. */
11030 if (TREE_CODE (new_tree) == SSA_NAME
11031 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
11032 && gimple_code (use_stmt) != GIMPLE_PHI
11033 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
11034 use_stmt))
11035 {
11036 if (dump_enabled_p ())
11037 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11038 "Using original scalar computation for "
11039 "live lane because use preceeds vector "
11040 "def\n");
11041 continue;
11042 }
11043 /* ??? It can also happen that we end up pulling a def into
11044 a loop where replacing out-of-loop uses would require
11045 a new LC SSA PHI node. Retain the original scalar in
11046 those cases as well. PR98064. */
11047 if (TREE_CODE (new_tree) == SSA_NAME
11048 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
11049 && (gimple_bb (use_stmt)->loop_father
11050 != gimple_bb (vec_stmt)->loop_father)
11051 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
11052 gimple_bb (use_stmt)->loop_father))
11053 {
11054 if (dump_enabled_p ())
11055 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11056 "Using original scalar computation for "
11057 "live lane because there is an out-of-loop "
11058 "definition for it\n");
11059 continue;
11060 }
11061 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
11062 SET_USE (use_p, new_tree);
11063 update_stmt (use_stmt);
11064 }
11065 }
11066
11067 return true;
11068 }
11069
11070 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
11071
11072 static void
11073 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
11074 {
11075 ssa_op_iter op_iter;
11076 imm_use_iterator imm_iter;
11077 def_operand_p def_p;
11078 gimple *ustmt;
11079
11080 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
11081 {
11082 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
11083 {
11084 basic_block bb;
11085
11086 if (!is_gimple_debug (ustmt))
11087 continue;
11088
11089 bb = gimple_bb (ustmt);
11090
11091 if (!flow_bb_inside_loop_p (loop, bb))
11092 {
11093 if (gimple_debug_bind_p (ustmt))
11094 {
11095 if (dump_enabled_p ())
11096 dump_printf_loc (MSG_NOTE, vect_location,
11097 "killing debug use\n");
11098
11099 gimple_debug_bind_reset_value (ustmt);
11100 update_stmt (ustmt);
11101 }
11102 else
11103 gcc_unreachable ();
11104 }
11105 }
11106 }
11107 }
11108
11109 /* Given loop represented by LOOP_VINFO, return true if computation of
11110 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
11111 otherwise. */
11112
11113 static bool
11114 loop_niters_no_overflow (loop_vec_info loop_vinfo)
11115 {
11116 /* Constant case. */
11117 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
11118 {
11119 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
11120 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
11121
11122 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
11123 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
11124 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
11125 return true;
11126 }
11127
11128 widest_int max;
11129 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11130 /* Check the upper bound of loop niters. */
11131 if (get_max_loop_iterations (loop, &max))
11132 {
11133 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
11134 signop sgn = TYPE_SIGN (type);
11135 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
11136 if (max < type_max)
11137 return true;
11138 }
11139 return false;
11140 }
11141
11142 /* Return a mask type with half the number of elements as OLD_TYPE,
11143 given that it should have mode NEW_MODE. */
11144
11145 tree
11146 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
11147 {
11148 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
11149 return build_truth_vector_type_for_mode (nunits, new_mode);
11150 }
11151
11152 /* Return a mask type with twice as many elements as OLD_TYPE,
11153 given that it should have mode NEW_MODE. */
11154
11155 tree
11156 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
11157 {
11158 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
11159 return build_truth_vector_type_for_mode (nunits, new_mode);
11160 }
11161
11162 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
11163 contain a sequence of NVECTORS masks that each control a vector of type
11164 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
11165 these vector masks with the vector version of SCALAR_MASK. */
11166
11167 void
11168 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
11169 unsigned int nvectors, tree vectype, tree scalar_mask)
11170 {
11171 gcc_assert (nvectors != 0);
11172
11173 if (scalar_mask)
11174 {
11175 scalar_cond_masked_key cond (scalar_mask, nvectors);
11176 loop_vinfo->scalar_cond_masked_set.add (cond);
11177 }
11178
11179 masks->mask_set.add (std::make_pair (vectype, nvectors));
11180 }
11181
/* Given a complete set of masks MASKS, extract mask number INDEX
   for an rgroup that operates on NVECTORS vectors of type VECTYPE,
   where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.

   See the comment above vec_loop_masks for more details about the mask
   arrangement.  */

tree
vect_get_loop_mask (loop_vec_info loop_vinfo,
		    gimple_stmt_iterator *gsi, vec_loop_masks *masks,
		    unsigned int nvectors, tree vectype, unsigned int index)
{
  if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
      == vect_partial_vectors_while_ult)
    {
      /* WHILE_ULT style: rgroups are indexed by the number of vectors.  */
      rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
      tree mask_type = rgm->type;

      /* Populate the rgroup's mask array, if this is the first time we've
	 used it.  */
      if (rgm->controls.is_empty ())
	{
	  rgm->controls.safe_grow_cleared (nvectors, true);
	  for (unsigned int i = 0; i < nvectors; ++i)
	    {
	      tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
	      /* Provide a dummy definition until the real one is available.  */
	      SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
	      rgm->controls[i] = mask;
	    }
	}

      tree mask = rgm->controls[index];
      if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
		    TYPE_VECTOR_SUBPARTS (vectype)))
	{
	  /* A loop mask for data type X can be reused for data type Y
	     if X has N times more elements than Y and if Y's elements
	     are N times bigger than X's.  In this case each sequence
	     of N elements in the loop mask will be all-zero or all-one.
	     We can then view-convert the mask so that each sequence of
	     N elements is replaced by a single element.  */
	  gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
				  TYPE_VECTOR_SUBPARTS (vectype)));
	  gimple_seq seq = NULL;
	  mask_type = truth_type_for (vectype);
	  mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
	  if (seq)
	    gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
	}
      return mask;
    }
  else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
	   == vect_partial_vectors_avx512)
    {
      /* The number of scalars per iteration and the number of vectors are
	 both compile-time constants.  */
      unsigned int nscalars_per_iter
	= exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
		     LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();

      /* AVX512 style: rgroups are indexed by scalars per iteration.  */
      rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];

      /* The stored nV is dependent on the mask type produced.  */
      gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
			     TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
		  == rgm->factor);
      nvectors = rgm->factor;

      /* Populate the rgroup's mask array, if this is the first time we've
	 used it.  */
      if (rgm->controls.is_empty ())
	{
	  rgm->controls.safe_grow_cleared (nvectors, true);
	  for (unsigned int i = 0; i < nvectors; ++i)
	    {
	      tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
	      /* Provide a dummy definition until the real one is available.  */
	      SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
	      rgm->controls[i] = mask;
	    }
	}
      /* If the stored mask has exactly the requested number of elements it
	 can be used directly.  */
      if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
		    TYPE_VECTOR_SUBPARTS (vectype)))
	return rgm->controls[index];

      /* Split the vector if needed.  Since we are dealing with integer mode
	 masks with AVX512 we can operate on the integer representation
	 performing the whole vector shifting.  */
      unsigned HOST_WIDE_INT factor;
      bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
				     TYPE_VECTOR_SUBPARTS (vectype), &factor);
      gcc_assert (ok);
      gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
      tree mask_type = truth_type_for (vectype);
      gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
      /* INDEX selects sub-mask VPART within stored control vector VI.  */
      unsigned vi = index / factor;
      unsigned vpart = index % factor;
      tree vec = rgm->controls[vi];
      gimple_seq seq = NULL;
      vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
			  lang_hooks.types.type_for_mode
				(TYPE_MODE (rgm->type), 1), vec);
      /* For integer mode masks simply shift the right bits into position.  */
      if (vpart != 0)
	vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
			    build_int_cst (integer_type_node,
					   (TYPE_VECTOR_SUBPARTS (vectype)
					    * vpart)));
      /* Truncate to the integer mode of the requested mask type and
	 view-convert back to a vector mask.  */
      vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
				    (TYPE_MODE (mask_type), 1), vec);
      vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
      if (seq)
	gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
      return vec;
    }
  else
    gcc_unreachable ();
}
11301
/* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
   lengths for controlling an operation on VECTYPE.  The operation splits
   each element of VECTYPE into FACTOR separate subelements, measuring the
   length as a number of these subelements.  */

void
vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
		      unsigned int nvectors, tree vectype, unsigned int factor)
{
  gcc_assert (nvectors != 0);
  if (lens->length () < nvectors)
    lens->safe_grow_cleared (nvectors, true);
  rgroup_controls *rgl = &(*lens)[nvectors - 1];

  /* The number of scalars per iteration, scalar occupied bytes and
     the number of vectors are all compile-time constants.  */
  unsigned int nscalars_per_iter
    = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
		 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();

  /* Keep track of the widest (most scalars per iteration) request seen
     for this rgroup; that determines the length control type.  */
  if (rgl->max_nscalars_per_iter < nscalars_per_iter)
    {
      /* For now, we only support cases in which all loads and stores fall back
	 to VnQI or none do.  */
      gcc_assert (!rgl->max_nscalars_per_iter
		  || (rgl->factor == 1 && factor == 1)
		  || (rgl->max_nscalars_per_iter * rgl->factor
		      == nscalars_per_iter * factor));
      rgl->max_nscalars_per_iter = nscalars_per_iter;
      rgl->type = vectype;
      rgl->factor = factor;
    }
}
11335
/* Given a complete set of lengths LENS, extract length number INDEX
   for an rgroup that operates on NVECTORS vectors of type VECTYPE,
   where 0 <= INDEX < NVECTORS.  Return a value that contains FACTOR
   multiplied by the number of elements that should be processed.
   Insert any set-up statements before GSI.  */

tree
vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
		   vec_loop_lens *lens, unsigned int nvectors, tree vectype,
		   unsigned int index, unsigned int factor)
{
  rgroup_controls *rgl = &(*lens)[nvectors - 1];
  /* A nonzero target load/store bias means the lengths need adjusting;
     in that case a single pre-adjusted control is used.  */
  bool use_bias_adjusted_len =
    LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;

  /* Populate the rgroup's len array, if this is the first time we've
     used it.  */
  if (rgl->controls.is_empty ())
    {
      rgl->controls.safe_grow_cleared (nvectors, true);
      for (unsigned int i = 0; i < nvectors; ++i)
	{
	  tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
	  gcc_assert (len_type != NULL_TREE);

	  tree len = make_temp_ssa_name (len_type, NULL, "loop_len");

	  /* Provide a dummy definition until the real one is available.  */
	  SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
	  rgl->controls[i] = len;

	  if (use_bias_adjusted_len)
	    {
	      /* The bias-adjusted form is only supported for a single
		 control.  */
	      gcc_assert (i == 0);
	      tree adjusted_len =
		make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
	      SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
	      rgl->bias_adjusted_ctrl = adjusted_len;
	    }
	}
    }

  if (use_bias_adjusted_len)
    return rgl->bias_adjusted_ctrl;

  tree loop_len = rgl->controls[index];
  if (rgl->factor == 1 && factor == 1)
    {
      poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
      poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
      if (maybe_ne (nunits1, nunits2))
	{
	  /* A loop len for data type X can be reused for data type Y
	     if X has N times more elements than Y and if Y's elements
	     are N times bigger than X's.  Scale the recorded length
	     down accordingly.  */
	  gcc_assert (multiple_p (nunits1, nunits2));
	  factor = exact_div (nunits1, nunits2).to_constant ();
	  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
	  gimple_seq seq = NULL;
	  loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
				   build_int_cst (iv_type, factor));
	  if (seq)
	    gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
	}
    }
  return loop_len;
}
11403
11404 /* Scale profiling counters by estimation for LOOP which is vectorized
11405 by factor VF.
11406 If FLAT is true, the loop we started with had unrealistically flat
11407 profile. */
11408
11409 static void
11410 scale_profile_for_vect_loop (class loop *loop, edge exit_e, unsigned vf, bool flat)
11411 {
11412 /* For flat profiles do not scale down proportionally by VF and only
11413 cap by known iteration count bounds. */
11414 if (flat)
11415 {
11416 if (dump_file && (dump_flags & TDF_DETAILS))
11417 fprintf (dump_file,
11418 "Vectorized loop profile seems flat; not scaling iteration "
11419 "count down by the vectorization factor %i\n", vf);
11420 scale_loop_profile (loop, profile_probability::always (),
11421 get_likely_max_loop_iterations_int (loop));
11422 return;
11423 }
11424 /* Loop body executes VF fewer times and exit increases VF times. */
11425 profile_count entry_count = loop_preheader_edge (loop)->count ();
11426
11427 /* If we have unreliable loop profile avoid dropping entry
11428 count bellow header count. This can happen since loops
11429 has unrealistically low trip counts. */
11430 while (vf > 1
11431 && loop->header->count > entry_count
11432 && loop->header->count < entry_count * vf)
11433 {
11434 if (dump_file && (dump_flags & TDF_DETAILS))
11435 fprintf (dump_file,
11436 "Vectorization factor %i seems too large for profile "
11437 "prevoiusly believed to be consistent; reducing.\n", vf);
11438 vf /= 2;
11439 }
11440
11441 if (entry_count.nonzero_p ())
11442 set_edge_probability_and_rescale_others
11443 (exit_e,
11444 entry_count.probability_in (loop->header->count / vf));
11445 /* Avoid producing very large exit probability when we do not have
11446 sensible profile. */
11447 else if (exit_e->probability < profile_probability::always () / (vf * 2))
11448 set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
11449 loop->latch->count = single_pred_edge (loop->latch)->count ();
11450
11451 scale_loop_profile (loop, profile_probability::always () / vf,
11452 get_likely_max_loop_iterations_int (loop));
11453 }
11454
/* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
   latch edge values originally defined by it.  */

static void
maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
				     stmt_vec_info def_stmt_info)
{
  tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
  if (!def || TREE_CODE (def) != SSA_NAME)
    return;
  stmt_vec_info phi_info;
  imm_use_iterator iter;
  use_operand_p use_p;
  /* Look for relevant loop-header PHIs whose latch argument is DEF.  */
  FOR_EACH_IMM_USE_FAST (use_p, iter, def)
    {
      gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
      if (!phi)
	continue;
      if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
	    && (phi_info = loop_vinfo->lookup_stmt (phi))
	    && STMT_VINFO_RELEVANT_P (phi_info)))
	continue;
      loop_p loop = gimple_bb (phi)->loop_father;
      edge e = loop_latch_edge (loop);
      if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
	continue;

      if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
	  && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
	  && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
	{
	  /* Add the vectorized latch definitions as backedge values of
	     the corresponding vectorized PHIs, one per copy.  */
	  vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
	  vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
	  gcc_assert (phi_defs.length () == latch_defs.length ());
	  for (unsigned i = 0; i < phi_defs.length (); ++i)
	    add_phi_arg (as_a <gphi *> (phi_defs[i]),
			 gimple_get_lhs (latch_defs[i]), e,
			 gimple_phi_arg_location (phi, e->dest_idx));
	}
      else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
	{
	  /* For first order recurrences we have to update both uses of
	     the latch definition, the one in the PHI node and the one
	     in the generated VEC_PERM_EXPR.  */
	  vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
	  vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
	  gcc_assert (phi_defs.length () == latch_defs.length ());
	  /* The vectorized PHI is the definition of the first permute's
	     first operand.  */
	  tree phidef = gimple_assign_rhs1 (phi_defs[0]);
	  gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
	  for (unsigned i = 0; i < phi_defs.length (); ++i)
	    {
	      gassign *perm = as_a <gassign *> (phi_defs[i]);
	      /* Each permute after the first takes the previous copy's
		 latch def as its first operand.  */
	      if (i > 0)
		gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
	      gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
	      update_stmt (perm);
	    }
	  /* The last latch def becomes the backedge value of the
	     vectorized PHI.  */
	  add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
		       gimple_phi_arg_location (phi, e->dest_idx));
	}
    }
}
11517
11518 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
11519 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
11520 stmt_vec_info. */
11521
11522 static bool
11523 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
11524 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
11525 {
11526 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11527 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11528
11529 if (dump_enabled_p ())
11530 dump_printf_loc (MSG_NOTE, vect_location,
11531 "------>vectorizing statement: %G", stmt_info->stmt);
11532
11533 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11534 vect_loop_kill_debug_uses (loop, stmt_info);
11535
11536 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11537 && !STMT_VINFO_LIVE_P (stmt_info))
11538 {
11539 if (is_gimple_call (stmt_info->stmt)
11540 && gimple_call_internal_p (stmt_info->stmt, IFN_MASK_CALL))
11541 {
11542 gcc_assert (!gimple_call_lhs (stmt_info->stmt));
11543 *seen_store = stmt_info;
11544 return false;
11545 }
11546 return false;
11547 }
11548
11549 if (STMT_VINFO_VECTYPE (stmt_info))
11550 {
11551 poly_uint64 nunits
11552 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
11553 if (!STMT_SLP_TYPE (stmt_info)
11554 && maybe_ne (nunits, vf)
11555 && dump_enabled_p ())
11556 /* For SLP VF is set according to unrolling factor, and not
11557 to vector size, hence for SLP this print is not valid. */
11558 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11559 }
11560
11561 /* Pure SLP statements have already been vectorized. We still need
11562 to apply loop vectorization to hybrid SLP statements. */
11563 if (PURE_SLP_STMT (stmt_info))
11564 return false;
11565
11566 if (dump_enabled_p ())
11567 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
11568
11569 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
11570 *seen_store = stmt_info;
11571
11572 return true;
11573 }
11574
11575 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
11576 in the hash_map with its corresponding values. */
11577
11578 static tree
11579 find_in_mapping (tree t, void *context)
11580 {
11581 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
11582
11583 tree *value = mapping->get (t);
11584 return value ? *value : t;
11585 }
11586
11587 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
11588 original loop that has now been vectorized.
11589
11590 The inits of the data_references need to be advanced with the number of
11591 iterations of the main loop. This has been computed in vect_do_peeling and
11592 is stored in parameter ADVANCE. We first restore the data_references
11593 initial offset with the values recored in ORIG_DRS_INIT.
11594
11595 Since the loop_vec_info of this EPILOGUE was constructed for the original
11596 loop, its stmt_vec_infos all point to the original statements. These need
11597 to be updated to point to their corresponding copies as well as the SSA_NAMES
11598 in their PATTERN_DEF_SEQs and RELATED_STMTs.
11599
11600 The data_reference's connections also need to be updated. Their
11601 corresponding dr_vec_info need to be reconnected to the EPILOGUE's
11602 stmt_vec_infos, their statements need to point to their corresponding copy,
11603 if they are gather loads or scatter stores then their reference needs to be
11604 updated to point to its corresponding copy. */
11605
static void
update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
{
  loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
  auto_vec<gimple *> stmt_worklist;
  /* Maps SSA names (and DR trees) of the original loop to their
     counterparts in the epilogue copy.  */
  hash_map<tree,tree> mapping;
  gimple *orig_stmt, *new_stmt;
  gimple_stmt_iterator epilogue_gsi;
  gphi_iterator epilogue_phi_gsi;
  stmt_vec_info stmt_vinfo = NULL, related_vinfo;
  basic_block *epilogue_bbs = get_loop_body (epilogue);
  unsigned i;

  /* Install the epilogue's basic blocks in place of the ones recorded
     for the original loop.  */
  free (LOOP_VINFO_BBS (epilogue_vinfo));
  LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;

  /* Advance data_reference's with the number of iterations of the previous
     loop and its prologue.  */
  vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);


  /* The EPILOGUE loop is a copy of the original loop so they share the same
     gimple UIDs.  In this loop we update the loop_vec_info of the EPILOGUE to
     point to the copied statements.  We also create a mapping of all LHS' in
     the original loop and all the LHS' in the EPILOGUE and create worklists to
     update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs.  */
  for (unsigned i = 0; i < epilogue->num_nodes; ++i)
    {
      for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
	   !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
	{
	  new_stmt = epilogue_phi_gsi.phi ();

	  /* UIDs are shared with the original loop; use them to find the
	     stmt_vec_info to re-point at the copy.  */
	  gcc_assert (gimple_uid (new_stmt) > 0);
	  stmt_vinfo
	    = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];

	  orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
	  STMT_VINFO_STMT (stmt_vinfo) = new_stmt;

	  mapping.put (gimple_phi_result (orig_stmt),
		       gimple_phi_result (new_stmt));
	  /* PHI nodes can not have patterns or related statements.  */
	  gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
		      && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
	}

      for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
	   !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
	{
	  new_stmt = gsi_stmt (epilogue_gsi);
	  if (is_gimple_debug (new_stmt))
	    continue;

	  gcc_assert (gimple_uid (new_stmt) > 0);
	  stmt_vinfo
	    = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];

	  orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
	  STMT_VINFO_STMT (stmt_vinfo) = new_stmt;

	  if (tree old_lhs = gimple_get_lhs (orig_stmt))
	    mapping.put (old_lhs, gimple_get_lhs (new_stmt));

	  /* Pattern def-seq statements still refer to the original loop's
	     SSA names; queue them so their operands get remapped below.  */
	  if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
	    {
	      gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
	      for (gimple_stmt_iterator gsi = gsi_start (seq);
		   !gsi_end_p (gsi); gsi_next (&gsi))
		stmt_worklist.safe_push (gsi_stmt (gsi));
	    }

	  related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
	  if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
	    {
	      gimple *stmt = STMT_VINFO_STMT (related_vinfo);
	      stmt_worklist.safe_push (stmt);
	      /* Set BB such that the assert in
		 'get_initial_def_for_reduction' is able to determine that
		 the BB of the related stmt is inside this loop.  */
	      gimple_set_bb (stmt,
			     gimple_bb (new_stmt));
	      related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
	      gcc_assert (related_vinfo == NULL
			  || related_vinfo == stmt_vinfo);
	    }
	}
    }

  /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
     using the original main loop and thus need to be updated to refer to the
     cloned variables used in the epilogue.  */
  for (unsigned i = 0; i < stmt_worklist.length (); ++i)
    {
      gimple *stmt = stmt_worklist[i];
      tree *new_op;

      /* Operand 0 is the LHS, already handled via MAPPING above; start
	 remapping at operand 1.  */
      for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
	{
	  tree op = gimple_op (stmt, j);
	  if ((new_op = mapping.get(op)))
	    gimple_set_op (stmt, j, *new_op);
	  else
	    {
	      /* PR92429: The last argument of simplify_replace_tree disables
		 folding when replacing arguments.  This is required as
		 otherwise you might end up with different statements than the
		 ones analyzed in vect_loop_analyze, leading to different
		 vectorization.  */
	      op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
					  &find_in_mapping, &mapping, false);
	      gimple_set_op (stmt, j, op);
	    }
	}
    }

  struct data_reference *dr;
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      orig_stmt = DR_STMT (dr);
      gcc_assert (gimple_uid (orig_stmt) > 0);
      stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
      /* Data references for gather loads and scatter stores do not use the
	 updated offset we set using ADVANCE.  Instead we have to make sure the
	 reference in the data references point to the corresponding copy of
	 the original in the epilogue.  Make sure to update both
	 gather/scatters recognized by dataref analysis and also other
	 refs that get_load_store_type classified as VMAT_GATHER_SCATTER.  */
      auto vstmt_vinfo = vect_stmt_to_vectorize (stmt_vinfo);
      if (STMT_VINFO_MEMORY_ACCESS_TYPE (vstmt_vinfo) == VMAT_GATHER_SCATTER
	  || STMT_VINFO_GATHER_SCATTER_P (vstmt_vinfo))
	{
	  DR_REF (dr)
	    = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
				     &find_in_mapping, &mapping);
	  DR_BASE_ADDRESS (dr)
	    = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
				     &find_in_mapping, &mapping);
	}
      DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
      stmt_vinfo->dr_aux.stmt = stmt_vinfo;
    }

  /* The data references were modified above; record the updated set.  */
  epilogue_vinfo->shared->datarefs_copy.release ();
  epilogue_vinfo->shared->save_datarefs ();
}
11753
/* When vectorizing early break statements, instructions that happen before
   the early break in the current BB need to be moved to after the early
   break.  This function deals with that and assumes that any validity
   checks have already been performed.

   While moving the instructions, if it encounters a VUSE or VDEF it
   corrects the VUSEs as it moves the statements along.  The statements are
   inserted at LOOP_VINFO_EARLY_BRK_DEST_BB, the destination block recorded
   during analysis.  */
11762
static void
move_early_exit_stmts (loop_vec_info loop_vinfo)
{
  DUMP_VECT_SCOPE ("move_early_exit_stmts");

  /* Nothing to do when analysis recorded no stores to move.  */
  if (LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).is_empty ())
    return;

  /* Move all stmts that need moving.  */
  basic_block dest_bb = LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo);
  gimple_stmt_iterator dest_gsi = gsi_after_labels (dest_bb);

  /* Tracks the virtual operand defined by the last statement moved (or
     elided PHI crossed), so the loads below can be rewired to it.  */
  tree last_seen_vuse = NULL_TREE;
  for (gimple *stmt : LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo))
    {
      /* We have to update crossed degenerate virtual PHIs.  Simply
	 elide them.  */
      if (gphi *vphi = dyn_cast <gphi *> (stmt))
	{
	  tree vdef = gimple_phi_result (vphi);
	  tree vuse = gimple_phi_arg_def (vphi, 0);
	  imm_use_iterator iter;
	  use_operand_p use_p;
	  gimple *use_stmt;
	  /* Forward all uses of the PHI result to its single argument.  */
	  FOR_EACH_IMM_USE_STMT (use_stmt, iter, vdef)
	    {
	      FOR_EACH_IMM_USE_ON_STMT (use_p, iter)
		SET_USE (use_p, vuse);
	    }
	  auto gsi = gsi_for_stmt (stmt);
	  remove_phi_node (&gsi, true);
	  last_seen_vuse = vuse;
	  continue;
	}

      /* Check to see if statement is still required for vect or has been
	 elided.  */
      auto stmt_info = loop_vinfo->lookup_stmt (stmt);
      if (!stmt_info)
	continue;

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "moving stmt %G", stmt);

      gimple_stmt_iterator stmt_gsi = gsi_for_stmt (stmt);
      gsi_move_before (&stmt_gsi, &dest_gsi, GSI_NEW_STMT);
      last_seen_vuse = gimple_vuse (stmt);
    }

  /* Update all the stmts with their new reaching VUSES.  */
  for (auto p : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "updating vuse to %T for load %G",
			 last_seen_vuse, p);
      gimple_set_vuse (p, last_seen_vuse);
      update_stmt (p);
    }

  /* And update the LC PHIs on exits.  Only exits not dominated by DEST_BB
     need fixing, since those dominated by it already see the moved defs.  */
  for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
    if (!dominated_by_p (CDI_DOMINATORS, e->src, dest_bb))
      if (gphi *phi = get_virtual_phi (e->dest))
	SET_PHI_ARG_DEF_ON_EDGE (phi, e, last_seen_vuse);
}
11829
11830 /* Function vect_transform_loop.
11831
11832 The analysis phase has determined that the loop is vectorizable.
   Vectorize the loop - create vectorized stmts to replace the scalar
11834 stmts in the loop, and update the loop exit condition.
11835 Returns scalar epilogue loop if any. */
11836
class loop *
vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  class loop *epilogue = NULL;
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  int i;
  tree niters_vector = NULL_TREE;
  tree step_vector = NULL_TREE;
  tree niters_vector_mult_vf = NULL_TREE;
  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  unsigned int lowest_vf = constant_lower_bound (vf);
  gimple *stmt;
  bool check_profitability = false;
  unsigned int th;
  bool flat = maybe_flat_loop_profile (loop);

  DUMP_VECT_SCOPE ("vec_transform_loop");

  loop_vinfo->shared->check_datarefs ();

  /* Use the more conservative vectorization threshold.  If the number
     of iterations is constant assume the cost check has been performed
     by our caller.  If the threshold makes all loops profitable that
     run at least the (estimated) vectorization factor number of times
     checking is pointless, too.  */
  th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
  if (vect_apply_runtime_profitability_check_p (loop_vinfo))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Profitability threshold is %d loop iterations.\n",
			 th);
      check_profitability = true;
    }

  /* Make sure there exists a single-predecessor exit bb.  Do this before
     versioning.   */
  edge e = LOOP_VINFO_IV_EXIT (loop_vinfo);
  if (! single_pred_p (e->dest) && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
    {
      split_loop_exit_edge (e, true);
      if (dump_enabled_p ())
	dump_printf (MSG_NOTE, "split exit edge\n");
    }

  /* Version the loop first, if required, so the profitability check
     comes first.  */

  if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
    {
      class loop *sloop
	= vect_loop_versioning (loop_vinfo, loop_vectorized_call);
      sloop->force_vectorize = false;
      /* The runtime cost check was folded into the versioning condition.  */
      check_profitability = false;
    }

  /* Make sure there exists a single-predecessor exit bb also on the
     scalar loop copy.  Do this after versioning but before peeling
     so CFG structure is fine for both scalar and if-converted loop
     to make slpeel_duplicate_current_defs_from_edges face matched
     loop closed PHI nodes on the exit.  */
  if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
    {
      e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
      if (! single_pred_p (e->dest))
	{
	  split_loop_exit_edge (e, true);
	  if (dump_enabled_p ())
	    dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
	}
    }

  tree niters = vect_build_loop_niters (loop_vinfo);
  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
  tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
  bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
  tree advance;
  drs_init_vec orig_drs_init;

  /* Peel prologue/epilogue as needed; ADVANCE receives the iteration
     advance used later to update the epilogue's data references.  */
  epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
			      &step_vector, &niters_vector_mult_vf, th,
			      check_profitability, niters_no_overflow,
			      &advance);
  if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
      && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
    {
      /* Ifcvt duplicates loop preheader, loop body and produces a basic
	 block after loop exit.  We need to scale all that.  */
      basic_block preheader
	= loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
      preheader->count
	= preheader->count.apply_probability
	    (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
      scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
			      LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
      LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo)->dest->count = preheader->count;
    }

  if (niters_vector == NULL_TREE)
    {
      if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
	  && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
	  && known_eq (lowest_vf, vf))
	{
	  /* Constant iteration count and constant VF: compute the vector
	     loop trip count directly.  */
	  niters_vector
	    = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
			     LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
	  step_vector = build_one_cst (TREE_TYPE (niters));
	}
      else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
	vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
				     &step_vector, niters_no_overflow);
      else
	/* vect_do_peeling subtracted the number of peeled prologue
	   iterations from LOOP_VINFO_NITERS.  */
	vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
				     &niters_vector, &step_vector,
				     niters_no_overflow);
    }

  /* 1) Make sure the loop header has exactly two entries
     2) Make sure we have a preheader basic block.  */

  gcc_assert (EDGE_COUNT (loop->header->preds) == 2);

  split_edge (loop_preheader_edge (loop));

  if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
    /* This will deal with any possible peeling.  */
    vect_prepare_for_masked_peels (loop_vinfo);

  /* Handle any code motion that we need to for early-break vectorization after
     we've done peeling but just before we start vectorizing.  */
  if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
    move_early_exit_stmts (loop_vinfo);

  /* Schedule the SLP instances first, then handle loop vectorization
     below.  */
  if (!loop_vinfo->slp_instances.is_empty ())
    {
      DUMP_VECT_SCOPE ("scheduling SLP instances");
      vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
    }

  /* FORNOW: the vectorizer supports only loops which body consist
     of one basic block (header + empty latch).  When the vectorizer will
     support more involved loop forms, the order by which the BBs are
     traversed need to be reconsidered.  */

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];
      stmt_vec_info stmt_info;

      /* First pass over PHIs: transform relevant/live non-SLP PHIs.  */
      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  gphi *phi = si.phi ();
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "------>vectorizing phi: %G", (gimple *) phi);
	  stmt_info = loop_vinfo->lookup_stmt (phi);
	  if (!stmt_info)
	    continue;

	  if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
	    vect_loop_kill_debug_uses (loop, stmt_info);

	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
	      && !STMT_VINFO_LIVE_P (stmt_info))
	    continue;

	  if (STMT_VINFO_VECTYPE (stmt_info)
	      && (maybe_ne
		  (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
	      && dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");

	  if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
	      && ! PURE_SLP_STMT (stmt_info))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
	      vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
	    }
	}

      /* Second pass over PHIs: fill in backedge values of the vectorized
	 PHIs created above.  */
      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  gphi *phi = si.phi ();
	  stmt_info = loop_vinfo->lookup_stmt (phi);
	  if (!stmt_info)
	    continue;

	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
	      && !STMT_VINFO_LIVE_P (stmt_info))
	    continue;

	  if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
	      && ! PURE_SLP_STMT (stmt_info))
	    maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
	}

      for (gimple_stmt_iterator si = gsi_start_bb (bb);
	   !gsi_end_p (si);)
	{
	  stmt = gsi_stmt (si);
	  /* During vectorization remove existing clobber stmts.  */
	  if (gimple_clobber_p (stmt))
	    {
	      unlink_stmt_vdef (stmt);
	      gsi_remove (&si, true);
	      release_defs (stmt);
	    }
	  else
	    {
	      /* Ignore vector stmts created in the outer loop.  */
	      stmt_info = loop_vinfo->lookup_stmt (stmt);

	      /* vector stmts created in the outer-loop during vectorization of
		 stmts in an inner-loop may not have a stmt_info, and do not
		 need to be vectorized.  */
	      stmt_vec_info seen_store = NULL;
	      if (stmt_info)
		{
		  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
		    {
		      /* Transform the pattern def sequence first, then the
			 main pattern statement.  */
		      gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
		      for (gimple_stmt_iterator subsi = gsi_start (def_seq);
			   !gsi_end_p (subsi); gsi_next (&subsi))
			{
			  stmt_vec_info pat_stmt_info
			    = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
			  vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
						    &si, &seen_store);
			}
		      stmt_vec_info pat_stmt_info
			= STMT_VINFO_RELATED_STMT (stmt_info);
		      if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
						    &si, &seen_store))
			maybe_set_vectorized_backedge_value (loop_vinfo,
							     pat_stmt_info);
		    }
		  else
		    {
		      if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
						    &seen_store))
			maybe_set_vectorized_backedge_value (loop_vinfo,
							     stmt_info);
		    }
		}
	      gsi_next (&si);
	      if (seen_store)
		{
		  if (STMT_VINFO_GROUPED_ACCESS (seen_store))
		    /* Interleaving.  If IS_STORE is TRUE, the
		       vectorization of the interleaving chain was
		       completed - free all the stores in the chain.  */
		    vect_remove_stores (loop_vinfo,
					DR_GROUP_FIRST_ELEMENT (seen_store));
		  else
		    /* Free the attached stmt_vec_info and remove the stmt.  */
		    loop_vinfo->remove_stmt (stmt_info);
		}
	    }
	}

      /* Stub out scalar statements that must not survive vectorization.
	 Doing this here helps with grouped statements, or statements that
	 are involved in patterns.  */
      for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
	   !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
	  if (!call || !gimple_call_internal_p (call))
	    continue;
	  internal_fn ifn = gimple_call_internal_fn (call);
	  if (ifn == IFN_MASK_LOAD)
	    {
	      tree lhs = gimple_get_lhs (call);
	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
		{
		  /* Replace a leftover scalar MASK_LOAD with a zero
		     assignment to its LHS.  */
		  tree zero = build_zero_cst (TREE_TYPE (lhs));
		  gimple *new_stmt = gimple_build_assign (lhs, zero);
		  gsi_replace (&gsi, new_stmt, true);
		}
	    }
	  else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
	    {
	      tree lhs = gimple_get_lhs (call);
	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
		{
		  /* Replace a leftover scalar conditional internal fn with
		     an assignment of its "else" argument (the last one).  */
		  tree else_arg
		    = gimple_call_arg (call, gimple_call_num_args (call) - 1);
		  gimple *new_stmt = gimple_build_assign (lhs, else_arg);
		  gsi_replace (&gsi, new_stmt, true);
		}
	    }
	}
    }				/* BBs in loop */

  /* The vectorization factor is always > 1, so if we use an IV increment of 1,
     a zero NITERS becomes a nonzero NITERS_VECTOR.  */
  if (integer_onep (step_vector))
    niters_no_overflow = true;
  vect_set_loop_condition (loop, LOOP_VINFO_IV_EXIT (loop_vinfo), loop_vinfo,
			   niters_vector, step_vector, niters_vector_mult_vf,
			   !niters_no_overflow);

  unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);

  /* True if the final iteration might not handle a full vector's
     worth of scalar iterations.  */
  bool final_iter_may_be_partial
    = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
      || LOOP_VINFO_EARLY_BREAKS (loop_vinfo);

  /* +1 to convert latch counts to loop iteration counts.  */
  int bias_for_lowest = 1;

  /* When we are peeling for gaps then we take away one scalar iteration
     from the vector loop.  Thus we can adjust the upper bound by one
     scalar iteration.  But only when we know the bound applies to the
     IV exit test which might not be true when we have multiple exits.  */
  if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
    bias_for_lowest -= LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;

  int bias_for_assumed = bias_for_lowest;
  int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
  if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    {
      /* When the amount of peeling is known at compile time, the first
	 iteration will have exactly alignment_npeels active elements.
	 In the worst case it will have at least one.  */
      int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
      bias_for_lowest += lowest_vf - min_first_active;
      bias_for_assumed += assumed_vf - min_first_active;
    }
  /* In these calculations the "- 1" converts loop iteration counts
     back to latch counts.  */
  if (loop->any_upper_bound)
    {
      loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
      loop->nb_iterations_upper_bound
	= (final_iter_may_be_partial
	   ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
			    lowest_vf) - 1
	   : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
			     lowest_vf) - 1);
      if (main_vinfo
	  /* Both peeling for alignment and peeling for gaps can end up
	     with the scalar epilogue running for more than VF-1 iterations.  */
	  && !main_vinfo->peeling_for_alignment
	  && !main_vinfo->peeling_for_gaps)
	{
	  unsigned int bound;
	  poly_uint64 main_iters
	    = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
			   LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
	  main_iters
	    = upper_bound (main_iters,
			   LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
	  if (can_div_away_from_zero_p (main_iters,
					LOOP_VINFO_VECT_FACTOR (loop_vinfo),
					&bound))
	    loop->nb_iterations_upper_bound
	      = wi::umin ((bound_wide_int) (bound - 1),
			  loop->nb_iterations_upper_bound);
	}
    }
  if (loop->any_likely_upper_bound)
    loop->nb_iterations_likely_upper_bound
      = (final_iter_may_be_partial
	 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
			  + bias_for_lowest, lowest_vf) - 1
	 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
			   + bias_for_lowest, lowest_vf) - 1);
  if (loop->any_estimate)
    loop->nb_iterations_estimate
      = (final_iter_may_be_partial
	 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
			  assumed_vf) - 1
	 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
			   assumed_vf) - 1);
  scale_profile_for_vect_loop (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
			       assumed_vf, flat);

  if (dump_enabled_p ())
    {
      if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "LOOP VECTORIZED\n");
	  if (loop->inner)
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "OUTER LOOP VECTORIZED\n");
	  dump_printf (MSG_NOTE, "\n");
	}
      else
	dump_printf_loc (MSG_NOTE, vect_location,
			 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
			 GET_MODE_NAME (loop_vinfo->vector_mode));
    }

  /* Loops vectorized with a variable factor won't benefit from
     unrolling/peeling.  */
  if (!vf.is_constant ())
    {
      loop->unroll = 1;
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
			 " variable-length vectorization factor\n");
    }
  /* Free SLP instances here because otherwise stmt reference counting
     won't work.  */
  slp_instance instance;
  FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
    vect_free_slp_instance (instance);
  LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
  /* Clear-up safelen field since its value is invalid after vectorization
     since vectorized loop can have loop-carried dependencies.  */
  loop->safelen = 0;

  if (epilogue)
    {
      /* Re-point the epilogue's loop_vec_info at the copied statements and
	 advance its data references past the main loop's iterations.  */
      update_epilogue_loop_vinfo (epilogue, advance);

      epilogue->simduid = loop->simduid;
      epilogue->force_vectorize = loop->force_vectorize;
      epilogue->dont_vectorize = false;
    }

  return epilogue;
}
12284
12285 /* The code below is trying to perform simple optimization - revert
12286 if-conversion for masked stores, i.e. if the mask of a store is zero
12287 do not perform it and all stored value producers also if possible.
12288 For example,
12289 for (i=0; i<n; i++)
12290 if (c[i])
12291 {
12292 p1[i] += 1;
12293 p2[i] = p3[i] +2;
12294 }
12295 this transformation will produce the following semi-hammock:
12296
12297 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
12298 {
12299 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
12300 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
12301 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
12302 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
12303 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
12304 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
12305 }
12306 */
12307
void
optimize_mask_stores (class loop *loop)
{
  basic_block *bbs = get_loop_body (loop);
  unsigned nbbs = loop->num_nodes;
  unsigned i;
  basic_block bb;
  class loop *bb_loop;
  gimple_stmt_iterator gsi;
  gimple *stmt;
  auto_vec<gimple *> worklist;
  auto_purge_vect_location sentinel;

  vect_location = find_loop_location (loop);
  /* Pick up all masked stores in loop if any.  */
  for (i = 0; i < nbbs; i++)
    {
      bb = bbs[i];
      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  stmt = gsi_stmt (gsi);
	  if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
	    worklist.safe_push (stmt);
	}
    }

  free (bbs);
  if (worklist.is_empty ())
    return;

  /* Loop has masked stores.  */
  while (!worklist.is_empty ())
    {
      gimple *last, *last_store;
      edge e, efalse;
      tree mask;
      basic_block store_bb, join_bb;
      gimple_stmt_iterator gsi_to;
      tree vdef, new_vdef;
      gphi *phi;
      tree vectype;
      tree zero;

      /* LAST is the masked store we are sinking; MASK_STORE's mask is
	 its third argument.  */
      last = worklist.pop ();
      mask = gimple_call_arg (last, 2);
      bb = gimple_bb (last);
      /* Create then_bb and if-then structure in CFG, then_bb belongs to
	 the same loop as if_bb.  It could be different to LOOP when two
	 level loop-nest is vectorized and mask_store belongs to the inner
	 one.  */
      e = split_block (bb, last);
      bb_loop = bb->loop_father;
      gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
      join_bb = e->dest;
      store_bb = create_empty_bb (bb);
      add_bb_to_loop (store_bb, bb_loop);
      e->flags = EDGE_TRUE_VALUE;
      efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
      /* Put STORE_BB to likely part.  */
      efalse->probability = profile_probability::likely ();
      e->probability = efalse->probability.invert ();
      store_bb->count = efalse->count ();
      make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
      if (dom_info_available_p (CDI_DOMINATORS))
	set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Create new block %d to sink mask stores.",
			 store_bb->index);
      /* Create vector comparison with boolean result.  */
      vectype = TREE_TYPE (mask);
      zero = build_zero_cst (vectype);
      /* Guard: skip STORE_BB entirely when the whole mask is zero.  */
      stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
      gsi = gsi_last_bb (bb);
      gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
      /* Create new PHI node for vdef of the last masked store:
	 .MEM_2 = VDEF <.MEM_1>
	 will be converted to
	 .MEM.3 = VDEF <.MEM_1>
	 and new PHI node will be created in join bb
	 .MEM_2 = PHI <.MEM_1, .MEM_3>
      */
      vdef = gimple_vdef (last);
      new_vdef = make_ssa_name (gimple_vop (cfun), last);
      gimple_set_vdef (last, new_vdef);
      phi = create_phi_node (vdef, join_bb);
      add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);

      /* Put all masked stores with the same mask to STORE_BB if possible.  */
      while (true)
	{
	  gimple_stmt_iterator gsi_from;
	  gimple *stmt1 = NULL;

	  /* Move masked store to STORE_BB.  */
	  last_store = last;
	  gsi = gsi_for_stmt (last);
	  gsi_from = gsi;
	  /* Shift GSI to the previous stmt for further traversal.  */
	  gsi_prev (&gsi);
	  gsi_to = gsi_start_bb (store_bb);
	  gsi_move_before (&gsi_from, &gsi_to);
	  /* Setup GSI_TO to the non-empty block start.  */
	  gsi_to = gsi_start_bb (store_bb);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Move stmt to created bb\n%G", last);
	  /* Move all stored value producers if possible.  Walk backwards
	     from the store; stop at the first stmt that cannot be sunk.  */
	  while (!gsi_end_p (gsi))
	    {
	      tree lhs;
	      imm_use_iterator imm_iter;
	      use_operand_p use_p;
	      bool res;

	      /* Skip debug statements.  */
	      if (is_gimple_debug (gsi_stmt (gsi)))
		{
		  gsi_prev (&gsi);
		  continue;
		}
	      stmt1 = gsi_stmt (gsi);
	      /* Do not consider statements writing to memory or having
		 volatile operand.  */
	      if (gimple_vdef (stmt1)
		  || gimple_has_volatile_ops (stmt1))
		break;
	      gsi_from = gsi;
	      gsi_prev (&gsi);
	      lhs = gimple_get_lhs (stmt1);
	      if (!lhs)
		break;

	      /* LHS of vectorized stmt must be SSA_NAME.  */
	      if (TREE_CODE (lhs) != SSA_NAME)
		break;

	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
		{
		  /* Remove dead scalar statement.  */
		  if (has_zero_uses (lhs))
		    {
		      gsi_remove (&gsi_from, true);
		      continue;
		    }
		}

	      /* Check that LHS does not have uses outside of STORE_BB.  */
	      res = true;
	      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
		{
		  gimple *use_stmt;
		  use_stmt = USE_STMT (use_p);
		  if (is_gimple_debug (use_stmt))
		    continue;
		  if (gimple_bb (use_stmt) != store_bb)
		    {
		      res = false;
		      break;
		    }
		}
	      if (!res)
		break;

	      /* Sinking past a memory read with a different reaching
		 virtual use would be invalid.  */
	      if (gimple_vuse (stmt1)
		  && gimple_vuse (stmt1) != gimple_vuse (last_store))
		break;

	      /* Can move STMT1 to STORE_BB.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Move stmt to created bb\n%G", stmt1);
	      gsi_move_before (&gsi_from, &gsi_to);
	      /* Shift GSI_TO for further insertion.  */
	      gsi_prev (&gsi_to);
	    }
	  /* Put other masked stores with the same mask to STORE_BB.  */
	  if (worklist.is_empty ()
	      || gimple_call_arg (worklist.last (), 2) != mask
	      || worklist.last () != stmt1)
	    break;
	  last = worklist.pop ();
	}
      /* Close the virtual SSA web: the bypass edge E carries the VUSE
	 reaching the first sunk store.  */
      add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
    }
}
12495
12496 /* Decide whether it is possible to use a zero-based induction variable
12497 when vectorizing LOOP_VINFO with partial vectors. If it is, return
12498 the value that the induction variable must be able to hold in order
12499 to ensure that the rgroups eventually have no active vector elements.
12500 Return -1 otherwise. */
12501
12502 widest_int
12503 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
12504 {
12505 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
12506 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
12507 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
12508
12509 /* Calculate the value that the induction variable must be able
12510 to hit in order to ensure that we end the loop with an all-false mask.
12511 This involves adding the maximum number of inactive trailing scalar
12512 iterations. */
12513 widest_int iv_limit = -1;
12514 if (max_loop_iterations (loop, &iv_limit))
12515 {
12516 if (niters_skip)
12517 {
12518 /* Add the maximum number of skipped iterations to the
12519 maximum iteration count. */
12520 if (TREE_CODE (niters_skip) == INTEGER_CST)
12521 iv_limit += wi::to_widest (niters_skip);
12522 else
12523 iv_limit += max_vf - 1;
12524 }
12525 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
12526 /* Make a conservatively-correct assumption. */
12527 iv_limit += max_vf - 1;
12528
12529 /* IV_LIMIT is the maximum number of latch iterations, which is also
12530 the maximum in-range IV value. Round this value down to the previous
12531 vector alignment boundary and then add an extra full iteration. */
12532 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
12533 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
12534 }
12535 return iv_limit;
12536 }
12537
12538 /* For the given rgroup_controls RGC, check whether an induction variable
12539 would ever hit a value that produces a set of all-false masks or zero
12540 lengths before wrapping around. Return true if it's possible to wrap
12541 around before hitting the desirable value, otherwise return false. */
12542
12543 bool
12544 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
12545 {
12546 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
12547
12548 if (iv_limit == -1)
12549 return true;
12550
12551 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
12552 unsigned int compare_precision = TYPE_PRECISION (compare_type);
12553 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
12554
12555 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
12556 return true;
12557
12558 return false;
12559 }