1 /* Loop Vectorization
2 Copyright (C) 2003-2021 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #define INCLUDE_ALGORITHM
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "diagnostic-core.h"
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "cfganal.h"
39 #include "gimplify.h"
40 #include "gimple-iterator.h"
41 #include "gimplify-me.h"
42 #include "tree-ssa-loop-ivopts.h"
43 #include "tree-ssa-loop-manip.h"
44 #include "tree-ssa-loop-niter.h"
45 #include "tree-ssa-loop.h"
46 #include "cfgloop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57
58 /* Loop Vectorization Pass.
59
60 This pass tries to vectorize loops.
61
62 For example, the vectorizer transforms the following simple loop:
63
64 short a[N]; short b[N]; short c[N]; int i;
65
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
68 }
69
  70   as if it were manually vectorized by rewriting the source code into:
71
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
76
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
82 }
83
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
  92     vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
95
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
101
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
106
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
117
118 For example, say stmt S1 was vectorized into stmt VS1:
119
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
123
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
128
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
133
 134     Operands that are not SSA_NAMEs are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
136
137 Target modeling:
138 =================
 139     Currently the only target-specific information that is used is the
 140     size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 141     Targets that can support different sizes of vectors will, for now, need
 142     to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
143 flexibility will be added in the future.
144
 145     Since we only vectorize operations whose vector form can be
 146     expressed using existing tree codes, to verify that an operation is
 147     supported, the vectorizer checks the relevant optab at the relevant
 148     machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
151
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
154 */
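
/* As a sketch only, tying the "Target modeling" notes above to the
   introductory example: the V8HI addition in that example can be
   vectorized only if

     optab_handler (add_optab, V8HImode) != CODE_FOR_nothing

   i.e. only if the target provides the corresponding addv8hi3 insn
   pattern; otherwise the statement is left scalar.  */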
155
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
157 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
158 bool *, bool *);
159
160 /* Subroutine of vect_determine_vf_for_stmt that handles only one
161 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
162 may already be set for general statements (not just data refs). */
163
164 static opt_result
165 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
166 bool vectype_maybe_set_p,
167 poly_uint64 *vf)
168 {
169 gimple *stmt = stmt_info->stmt;
170
171 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
172 && !STMT_VINFO_LIVE_P (stmt_info))
173 || gimple_clobber_p (stmt))
174 {
175 if (dump_enabled_p ())
176 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
177 return opt_result::success ();
178 }
179
180 tree stmt_vectype, nunits_vectype;
181 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
182 &stmt_vectype,
183 &nunits_vectype);
184 if (!res)
185 return res;
186
187 if (stmt_vectype)
188 {
189 if (STMT_VINFO_VECTYPE (stmt_info))
 190 	/* The only case when a vectype has already been set is for stmts
191 that contain a data ref, or for "pattern-stmts" (stmts generated
192 by the vectorizer to represent/replace a certain idiom). */
193 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
194 || vectype_maybe_set_p)
195 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
196 else
197 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
198 }
199
200 if (nunits_vectype)
201 vect_update_max_nunits (vf, nunits_vectype);
202
203 return opt_result::success ();
204 }
205
206 /* Subroutine of vect_determine_vectorization_factor. Set the vector
207 types of STMT_INFO and all attached pattern statements and update
208 the vectorization factor VF accordingly. Return true on success
209 or false if something prevented vectorization. */
210
211 static opt_result
212 vect_determine_vf_for_stmt (vec_info *vinfo,
213 stmt_vec_info stmt_info, poly_uint64 *vf)
214 {
215 if (dump_enabled_p ())
216 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
217 stmt_info->stmt);
218 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
219 if (!res)
220 return res;
221
222 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
223 && STMT_VINFO_RELATED_STMT (stmt_info))
224 {
225 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
226 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
227
228 /* If a pattern statement has def stmts, analyze them too. */
229 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
230 !gsi_end_p (si); gsi_next (&si))
231 {
232 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
233 if (dump_enabled_p ())
234 dump_printf_loc (MSG_NOTE, vect_location,
235 "==> examining pattern def stmt: %G",
236 def_stmt_info->stmt);
237 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
238 if (!res)
239 return res;
240 }
241
242 if (dump_enabled_p ())
243 dump_printf_loc (MSG_NOTE, vect_location,
244 "==> examining pattern statement: %G",
245 stmt_info->stmt);
246 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
247 if (!res)
248 return res;
249 }
250
251 return opt_result::success ();
252 }
253
254 /* Function vect_determine_vectorization_factor
255
256 Determine the vectorization factor (VF). VF is the number of data elements
257 that are operated upon in parallel in a single iteration of the vectorized
 258     loop.  For example, when vectorizing a loop that operates on 4-byte elements
 259     on a target with a 16-byte vector size (VS), the VF is set to 4, since 4
260 elements can fit in a single vector register.
261
262 We currently support vectorization of loops in which all types operated upon
263 are of the same size. Therefore this function currently sets VF according to
264 the size of the types operated upon, and fails if there are multiple sizes
265 in the loop.
266
267 VF is also the factor by which the loop iterations are strip-mined, e.g.:
268 original loop:
269 for (i=0; i<N; i++){
270 a[i] = b[i] + c[i];
271 }
272
273 vectorized loop:
274 for (i=0; i<N; i+=VF){
275 a[i:VF] = b[i:VF] + c[i:VF];
276 }
277 */
278
279 static opt_result
280 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
281 {
282 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
283 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
284 unsigned nbbs = loop->num_nodes;
285 poly_uint64 vectorization_factor = 1;
286 tree scalar_type = NULL_TREE;
287 gphi *phi;
288 tree vectype;
289 stmt_vec_info stmt_info;
290 unsigned i;
291
292 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
293
294 for (i = 0; i < nbbs; i++)
295 {
296 basic_block bb = bbs[i];
297
298 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
299 gsi_next (&si))
300 {
301 phi = si.phi ();
302 stmt_info = loop_vinfo->lookup_stmt (phi);
303 if (dump_enabled_p ())
304 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
305 phi);
306
307 gcc_assert (stmt_info);
308
309 if (STMT_VINFO_RELEVANT_P (stmt_info)
310 || STMT_VINFO_LIVE_P (stmt_info))
311 {
312 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
313 scalar_type = TREE_TYPE (PHI_RESULT (phi));
314
315 if (dump_enabled_p ())
316 dump_printf_loc (MSG_NOTE, vect_location,
317 "get vectype for scalar type: %T\n",
318 scalar_type);
319
320 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
321 if (!vectype)
322 return opt_result::failure_at (phi,
323 "not vectorized: unsupported "
324 "data-type %T\n",
325 scalar_type);
326 STMT_VINFO_VECTYPE (stmt_info) = vectype;
327
328 if (dump_enabled_p ())
329 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
330 vectype);
331
332 if (dump_enabled_p ())
333 {
334 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
335 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
336 dump_printf (MSG_NOTE, "\n");
337 }
338
339 vect_update_max_nunits (&vectorization_factor, vectype);
340 }
341 }
342
343 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
344 gsi_next (&si))
345 {
346 if (is_gimple_debug (gsi_stmt (si)))
347 continue;
348 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
349 opt_result res
350 = vect_determine_vf_for_stmt (loop_vinfo,
351 stmt_info, &vectorization_factor);
352 if (!res)
353 return res;
354 }
355 }
356
 357   /* TODO: Analyze cost.  Decide if worthwhile to vectorize.  */
358 if (dump_enabled_p ())
359 {
360 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
361 dump_dec (MSG_NOTE, vectorization_factor);
362 dump_printf (MSG_NOTE, "\n");
363 }
364
365 if (known_le (vectorization_factor, 1U))
366 return opt_result::failure_at (vect_location,
367 "not vectorized: unsupported data-type\n");
368 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
369 return opt_result::success ();
370 }
371
372
373 /* Function vect_is_simple_iv_evolution.
374
 375    FORNOW: A simple evolution of an induction variable in the loop is
376 considered a polynomial evolution. */
377
378 static bool
379 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
380 tree * step)
381 {
382 tree init_expr;
383 tree step_expr;
384 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
385 basic_block bb;
386
387 /* When there is no evolution in this loop, the evolution function
388 is not "simple". */
389 if (evolution_part == NULL_TREE)
390 return false;
391
392 /* When the evolution is a polynomial of degree >= 2
393 the evolution function is not "simple". */
394 if (tree_is_chrec (evolution_part))
395 return false;
396
397 step_expr = evolution_part;
398 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
399
400 if (dump_enabled_p ())
401 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
402 step_expr, init_expr);
403
404 *init = init_expr;
405 *step = step_expr;
406
407 if (TREE_CODE (step_expr) != INTEGER_CST
408 && (TREE_CODE (step_expr) != SSA_NAME
409 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
410 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
411 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
412 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
413 || !flag_associative_math)))
414 && (TREE_CODE (step_expr) != REAL_CST
415 || !flag_associative_math))
416 {
417 if (dump_enabled_p ())
418 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
419 "step unknown.\n");
420 return false;
421 }
422
423 return true;
424 }
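
/* A minimal illustration, not taken from any particular testcase, of what
   vect_is_simple_iv_evolution accepts: for an induction such as

     for (i = start; i < n; i += 4)

   scev describes i by the chrec {start, +, 4}_loop, so INIT is "start"
   and STEP is 4.  A chrec of degree two such as {0, +, {0, +, 1}_loop}_loop,
   whose step itself evolves, is rejected as not "simple".  */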
425
426 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
427 what we are assuming is a double reduction. For example, given
428 a structure like this:
429
430 outer1:
431 x_1 = PHI <x_4(outer2), ...>;
432 ...
433
434 inner:
435 x_2 = PHI <x_1(outer1), ...>;
436 ...
437 x_3 = ...;
438 ...
439
440 outer2:
441 x_4 = PHI <x_3(inner)>;
442 ...
443
444 outer loop analysis would treat x_1 as a double reduction phi and
445 this function would then return true for x_2. */
446
447 static bool
448 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
449 {
450 use_operand_p use_p;
451 ssa_op_iter op_iter;
452 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
453 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
454 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
455 return true;
456 return false;
457 }
458
459 /* Function vect_analyze_scalar_cycles_1.
460
461 Examine the cross iteration def-use cycles of scalar variables
462 in LOOP. LOOP_VINFO represents the loop that is now being
463 considered for vectorization (can be LOOP, or an outer-loop
464 enclosing LOOP). */
465
466 static void
467 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
468 {
469 basic_block bb = loop->header;
470 tree init, step;
471 auto_vec<stmt_vec_info, 64> worklist;
472 gphi_iterator gsi;
473 bool double_reduc, reduc_chain;
474
475 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
476
477 /* First - identify all inductions. Reduction detection assumes that all the
 478      inductions have been identified; therefore, this order must not be
479 changed. */
480 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
481 {
482 gphi *phi = gsi.phi ();
483 tree access_fn = NULL;
484 tree def = PHI_RESULT (phi);
485 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
486
487 if (dump_enabled_p ())
488 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
489
 490 	  /* Skip virtual phis.  The data dependences that are associated with
491 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
492 if (virtual_operand_p (def))
493 continue;
494
495 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
496
497 /* Analyze the evolution function. */
498 access_fn = analyze_scalar_evolution (loop, def);
499 if (access_fn)
500 {
501 STRIP_NOPS (access_fn);
502 if (dump_enabled_p ())
503 dump_printf_loc (MSG_NOTE, vect_location,
504 "Access function of PHI: %T\n", access_fn);
505 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
506 = initial_condition_in_loop_num (access_fn, loop->num);
507 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
508 = evolution_part_in_loop_num (access_fn, loop->num);
509 }
510
511 if (!access_fn
512 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
513 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
514 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
515 && TREE_CODE (step) != INTEGER_CST))
516 {
517 worklist.safe_push (stmt_vinfo);
518 continue;
519 }
520
521 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
522 != NULL_TREE);
523 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
524
525 if (dump_enabled_p ())
526 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
527 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
528 }
529
530
531 /* Second - identify all reductions and nested cycles. */
532 while (worklist.length () > 0)
533 {
534 stmt_vec_info stmt_vinfo = worklist.pop ();
535 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
536 tree def = PHI_RESULT (phi);
537
538 if (dump_enabled_p ())
539 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
540
541 gcc_assert (!virtual_operand_p (def)
542 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
543
544 stmt_vec_info reduc_stmt_info
545 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
546 &reduc_chain);
547 if (reduc_stmt_info)
548 {
549 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
550 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
551 if (double_reduc)
552 {
553 if (dump_enabled_p ())
554 dump_printf_loc (MSG_NOTE, vect_location,
555 "Detected double reduction.\n");
556
557 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
558 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
559 }
560 else
561 {
562 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
563 {
564 if (dump_enabled_p ())
565 dump_printf_loc (MSG_NOTE, vect_location,
566 "Detected vectorizable nested cycle.\n");
567
568 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
569 }
570 else
571 {
572 if (dump_enabled_p ())
573 dump_printf_loc (MSG_NOTE, vect_location,
574 "Detected reduction.\n");
575
576 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
577 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
578 /* Store the reduction cycles for possible vectorization in
 579 		 loop-aware SLP if it was not detected as a reduction
580 chain. */
581 if (! reduc_chain)
582 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
583 (reduc_stmt_info);
584 }
585 }
586 }
587 else
588 if (dump_enabled_p ())
589 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
590 "Unknown def-use cycle pattern.\n");
591 }
592 }
593
594
595 /* Function vect_analyze_scalar_cycles.
596
597 Examine the cross iteration def-use cycles of scalar variables, by
598 analyzing the loop-header PHIs of scalar variables. Classify each
599 cycle as one of the following: invariant, induction, reduction, unknown.
 600    We do that for the loop represented by LOOP_VINFO, and also for its
 601    inner-loop, if it exists.
602 Examples for scalar cycles:
603
604 Example1: reduction:
605
606 loop1:
607 for (i=0; i<N; i++)
608 sum += a[i];
609
610 Example2: induction:
611
612 loop2:
613 for (i=0; i<N; i++)
614 a[i] = i; */
615
616 static void
617 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
618 {
619 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
620
621 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
622
623 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 624      Reductions in such an inner-loop therefore have different properties than
625 the reductions in the nest that gets vectorized:
626 1. When vectorized, they are executed in the same order as in the original
627 scalar loop, so we can't change the order of computation when
628 vectorizing them.
629 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
630 current checks are too strict. */
631
632 if (loop->inner)
633 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
634 }
635
636 /* Transfer group and reduction information from STMT_INFO to its
637 pattern stmt. */
638
639 static void
640 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
641 {
642 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
643 stmt_vec_info stmtp;
644 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
645 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
646 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
647 do
648 {
649 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
650 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
651 == STMT_VINFO_DEF_TYPE (stmt_info));
652 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
653 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
654 if (stmt_info)
655 REDUC_GROUP_NEXT_ELEMENT (stmtp)
656 = STMT_VINFO_RELATED_STMT (stmt_info);
657 }
658 while (stmt_info);
659 }
660
661 /* Fixup scalar cycles that now have their stmts detected as patterns. */
662
663 static void
664 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
665 {
666 stmt_vec_info first;
667 unsigned i;
668
669 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
670 {
671 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
672 while (next)
673 {
674 if ((STMT_VINFO_IN_PATTERN_P (next)
675 != STMT_VINFO_IN_PATTERN_P (first))
676 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
677 break;
678 next = REDUC_GROUP_NEXT_ELEMENT (next);
679 }
 680       /* If all reduction chain members are well-formed patterns, adjust
681 the group to group the pattern stmts instead. */
682 if (! next
683 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
684 {
685 if (STMT_VINFO_IN_PATTERN_P (first))
686 {
687 vect_fixup_reduc_chain (first);
688 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
689 = STMT_VINFO_RELATED_STMT (first);
690 }
691 }
 692       /* If not all stmts in the chain are patterns, or if we failed
 693 	 to update STMT_VINFO_REDUC_IDX, dissolve the chain and handle
 694 	 it as a regular reduction instead.  */
695 else
696 {
697 stmt_vec_info vinfo = first;
698 stmt_vec_info last = NULL;
699 while (vinfo)
700 {
701 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
702 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
703 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
704 last = vinfo;
705 vinfo = next;
706 }
707 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
708 = vect_internal_def;
709 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
710 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
711 --i;
712 }
713 }
714 }
715
716 /* Function vect_get_loop_niters.
717
 718    Determine the number of iterations the loop executes and place it
719 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
720 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
721 niter information holds in ASSUMPTIONS.
722
723 Return the loop exit condition. */
724
725
726 static gcond *
727 vect_get_loop_niters (class loop *loop, tree *assumptions,
728 tree *number_of_iterations, tree *number_of_iterationsm1)
729 {
730 edge exit = single_exit (loop);
731 class tree_niter_desc niter_desc;
732 tree niter_assumptions, niter, may_be_zero;
733 gcond *cond = get_loop_exit_condition (loop);
734
735 *assumptions = boolean_true_node;
736 *number_of_iterationsm1 = chrec_dont_know;
737 *number_of_iterations = chrec_dont_know;
738 DUMP_VECT_SCOPE ("get_loop_niters");
739
740 if (!exit)
741 return cond;
742
743 may_be_zero = NULL_TREE;
744 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
745 || chrec_contains_undetermined (niter_desc.niter))
746 return cond;
747
748 niter_assumptions = niter_desc.assumptions;
749 may_be_zero = niter_desc.may_be_zero;
750 niter = niter_desc.niter;
751
752 if (may_be_zero && integer_zerop (may_be_zero))
753 may_be_zero = NULL_TREE;
754
755 if (may_be_zero)
756 {
757 if (COMPARISON_CLASS_P (may_be_zero))
758 {
 759 	  /* Try to combine may_be_zero with assumptions; this can simplify
 760 	     the computation of the niter expression.  */
761 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
762 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
763 niter_assumptions,
764 fold_build1 (TRUTH_NOT_EXPR,
765 boolean_type_node,
766 may_be_zero));
767 else
768 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
769 build_int_cst (TREE_TYPE (niter), 0),
770 rewrite_to_non_trapping_overflow (niter));
771
772 may_be_zero = NULL_TREE;
773 }
774 else if (integer_nonzerop (may_be_zero))
775 {
776 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
777 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
778 return cond;
779 }
780 else
781 return cond;
782 }
783
784 *assumptions = niter_assumptions;
785 *number_of_iterationsm1 = niter;
786
787 /* We want the number of loop header executions which is the number
788 of latch executions plus one.
789 ??? For UINT_MAX latch executions this number overflows to zero
790 for loops like do { n++; } while (n != 0); */
791 if (niter && !chrec_contains_undetermined (niter))
792 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
793 build_int_cst (TREE_TYPE (niter), 1));
794 *number_of_iterations = niter;
795
796 return cond;
797 }
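
/* A rough example of the distinction computed above, assuming a simple
   counted loop: for

     for (i = 0; i < n; i++)

   with n known to be non-zero, the latch runs n - 1 times, so
   NUMBER_OF_ITERATIONSM1 is n - 1, while NUMBER_OF_ITERATIONS (the number
   of header executions) is n, matching the "+ 1" adjustment just above.  */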
798
799 /* Function bb_in_loop_p
800
801 Used as predicate for dfs order traversal of the loop bbs. */
802
803 static bool
804 bb_in_loop_p (const_basic_block bb, const void *data)
805 {
806 const class loop *const loop = (const class loop *)data;
807 if (flow_bb_inside_loop_p (loop, bb))
808 return true;
809 return false;
810 }
811
812
813 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
814 stmt_vec_info structs for all the stmts in LOOP_IN. */
815
816 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
817 : vec_info (vec_info::loop, shared),
818 loop (loop_in),
819 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
820 num_itersm1 (NULL_TREE),
821 num_iters (NULL_TREE),
822 num_iters_unchanged (NULL_TREE),
823 num_iters_assumptions (NULL_TREE),
824 vector_costs (nullptr),
825 scalar_costs (nullptr),
826 th (0),
827 versioning_threshold (0),
828 vectorization_factor (0),
829 main_loop_edge (nullptr),
830 skip_main_loop_edge (nullptr),
831 skip_this_loop_edge (nullptr),
832 reusable_accumulators (),
833 max_vectorization_factor (0),
834 mask_skip_niters (NULL_TREE),
835 rgroup_compare_type (NULL_TREE),
836 simd_if_cond (NULL_TREE),
837 unaligned_dr (NULL),
838 peeling_for_alignment (0),
839 ptr_mask (0),
840 ivexpr_map (NULL),
841 scan_map (NULL),
842 slp_unrolling_factor (1),
843 inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
844 vectorizable (false),
845 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
846 using_partial_vectors_p (false),
847 epil_using_partial_vectors_p (false),
848 peeling_for_gaps (false),
849 peeling_for_niter (false),
850 no_data_dependencies (false),
851 has_mask_store (false),
852 scalar_loop_scaling (profile_probability::uninitialized ()),
853 scalar_loop (NULL),
854 orig_loop_info (NULL)
855 {
856 /* CHECKME: We want to visit all BBs before their successors (except for
857 latch blocks, for which this assertion wouldn't hold). In the simple
 858      case of the loop forms we allow, a dfs order of the BBs would be the same
859 as reversed postorder traversal, so we are safe. */
860
861 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
862 bbs, loop->num_nodes, loop);
863 gcc_assert (nbbs == loop->num_nodes);
864
865 for (unsigned int i = 0; i < nbbs; i++)
866 {
867 basic_block bb = bbs[i];
868 gimple_stmt_iterator si;
869
870 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
871 {
872 gimple *phi = gsi_stmt (si);
873 gimple_set_uid (phi, 0);
874 add_stmt (phi);
875 }
876
877 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
878 {
879 gimple *stmt = gsi_stmt (si);
880 gimple_set_uid (stmt, 0);
881 if (is_gimple_debug (stmt))
882 continue;
883 add_stmt (stmt);
 884 	  /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
 885 	     third argument is the #pragma omp simd if (x) condition: when 0, the
 886 	     loop shouldn't be vectorized; when a non-zero constant, it should be
 887 	     vectorized normally; otherwise the loop is versioned, with the
 888 	     vectorized copy used if the condition is non-zero at runtime.  */
889 if (loop_in->simduid
890 && is_gimple_call (stmt)
891 && gimple_call_internal_p (stmt)
892 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
893 && gimple_call_num_args (stmt) >= 3
894 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
895 && (loop_in->simduid
896 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
897 {
898 tree arg = gimple_call_arg (stmt, 2);
899 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
900 simd_if_cond = arg;
901 else
902 gcc_assert (integer_nonzerop (arg));
903 }
904 }
905 }
906
907 epilogue_vinfos.create (6);
908 }
909
910 /* Free all levels of rgroup CONTROLS. */
911
912 void
913 release_vec_loop_controls (vec<rgroup_controls> *controls)
914 {
915 rgroup_controls *rgc;
916 unsigned int i;
917 FOR_EACH_VEC_ELT (*controls, i, rgc)
918 rgc->controls.release ();
919 controls->release ();
920 }
921
922 /* Free all memory used by the _loop_vec_info, as well as all the
923 stmt_vec_info structs of all the stmts in the loop. */
924
925 _loop_vec_info::~_loop_vec_info ()
926 {
927 free (bbs);
928
929 release_vec_loop_controls (&masks);
930 release_vec_loop_controls (&lens);
931 delete ivexpr_map;
932 delete scan_map;
933 epilogue_vinfos.release ();
934 delete scalar_costs;
935 delete vector_costs;
936
 937   /* When we release an epilogue vinfo that we do not intend to use,
 938      avoid clearing AUX of the main loop, which should continue to
 939      point to the main loop vinfo; otherwise we'll leak that.  */
940 if (loop->aux == this)
941 loop->aux = NULL;
942 }
943
944 /* Return an invariant or register for EXPR and emit necessary
945 computations in the LOOP_VINFO loop preheader. */
946
947 tree
948 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
949 {
950 if (is_gimple_reg (expr)
951 || is_gimple_min_invariant (expr))
952 return expr;
953
954 if (! loop_vinfo->ivexpr_map)
955 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
956 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
957 if (! cached)
958 {
959 gimple_seq stmts = NULL;
960 cached = force_gimple_operand (unshare_expr (expr),
961 &stmts, true, NULL_TREE);
962 if (stmts)
963 {
964 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
965 gsi_insert_seq_on_edge_immediate (e, stmts);
966 }
967 }
968 return cached;
969 }
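
/* For instance (a hypothetical caller), gimplifying the expression n_1 + 7
   here twice yields the same cached SSA name, with the computation emitted
   only once on the preheader edge, so repeated IV-related expressions are
   not re-materialized for every use.  */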
970
971 /* Return true if we can use CMP_TYPE as the comparison type to produce
972 all masks required to mask LOOP_VINFO. */
973
974 static bool
975 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
976 {
977 rgroup_controls *rgm;
978 unsigned int i;
979 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
980 if (rgm->type != NULL_TREE
981 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
982 cmp_type, rgm->type,
983 OPTIMIZE_FOR_SPEED))
984 return false;
985 return true;
986 }
987
988 /* Calculate the maximum number of scalars per iteration for every
989 rgroup in LOOP_VINFO. */
990
991 static unsigned int
992 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
993 {
994 unsigned int res = 1;
995 unsigned int i;
996 rgroup_controls *rgm;
997 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
998 res = MAX (res, rgm->max_nscalars_per_iter);
999 return res;
1000 }
1001
1002 /* Calculate the minimum precision necessary to represent:
1003
1004 MAX_NITERS * FACTOR
1005
1006 as an unsigned integer, where MAX_NITERS is the maximum number of
1007 loop header iterations for the original scalar form of LOOP_VINFO. */
1008
1009 static unsigned
1010 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1011 {
1012 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1013
1014 /* Get the maximum number of iterations that is representable
1015 in the counter type. */
1016 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1017 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1018
1019 /* Get a more refined estimate for the number of iterations. */
1020 widest_int max_back_edges;
1021 if (max_loop_iterations (loop, &max_back_edges))
1022 max_ni = wi::smin (max_ni, max_back_edges + 1);
1023
1024 /* Work out how many bits we need to represent the limit. */
1025 return wi::min_precision (max_ni * factor, UNSIGNED);
1026 }
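
/* A worked example of the computation above, with numbers chosen purely
   for illustration: if the niter analysis bounds the loop at 1000 header
   iterations and FACTOR is 4, the product is 4000, and
   wi::min_precision (4000, UNSIGNED) is 12 because 2^11 <= 4000 < 2^12,
   so a 12-bit (or wider) unsigned counter is sufficient.  */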
1027
1028 /* True if the loop needs peeling or partial vectors when vectorized. */
1029
1030 static bool
1031 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1032 {
1033 unsigned HOST_WIDE_INT const_vf;
1034 HOST_WIDE_INT max_niter
1035 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1036
1037 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1038 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1039 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1040 (loop_vinfo));
1041
1042 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1043 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1044 {
1045 /* Work out the (constant) number of iterations that need to be
1046 peeled for reasons other than niters. */
1047 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1048 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1049 peel_niter += 1;
1050 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1051 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1052 return true;
1053 }
1054 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1055 /* ??? When peeling for gaps but not alignment, we could
1056 try to check whether the (variable) niters is known to be
1057 VF * N + 1. That's something of a niche case though. */
1058 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1059 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1060 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1061 < (unsigned) exact_log2 (const_vf))
1062 /* In case of versioning, check if the maximum number of
1063 iterations is greater than th. If they are identical,
1064 the epilogue is unnecessary. */
1065 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1066 || ((unsigned HOST_WIDE_INT) max_niter
1067 > (th / const_vf) * const_vf))))
1068 return true;
1069
1070 return false;
1071 }
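
/* A small numeric illustration with made-up values: for a known niter of
   100, no peeling for alignment or gaps, and a constant VF of 8, 100 is
   not a multiple of 8, so the function returns true (an epilogue or
   partial vectors will be needed); with a niter of 96 it returns false.  */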
1072
1073 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1074 whether we can actually generate the masks required. Return true if so,
1075 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1076
1077 static bool
1078 vect_verify_full_masking (loop_vec_info loop_vinfo)
1079 {
1080 unsigned int min_ni_width;
1081 unsigned int max_nscalars_per_iter
1082 = vect_get_max_nscalars_per_iter (loop_vinfo);
1083
1084 /* Use a normal loop if there are no statements that need masking.
1085 This only happens in rare degenerate cases: it means that the loop
1086 has no loads, no stores, and no live-out values. */
1087 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1088 return false;
1089
1090 /* Work out how many bits we need to represent the limit. */
1091 min_ni_width
1092 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1093
1094 /* Find a scalar mode for which WHILE_ULT is supported. */
1095 opt_scalar_int_mode cmp_mode_iter;
1096 tree cmp_type = NULL_TREE;
1097 tree iv_type = NULL_TREE;
1098 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1099 unsigned int iv_precision = UINT_MAX;
1100
1101 if (iv_limit != -1)
1102 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1103 UNSIGNED);
1104
1105 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1106 {
1107 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1108 if (cmp_bits >= min_ni_width
1109 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1110 {
1111 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1112 if (this_type
1113 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1114 {
1115 /* Although we could stop as soon as we find a valid mode,
1116 there are at least two reasons why that's not always the
1117 best choice:
1118
1119 - An IV that's Pmode or wider is more likely to be reusable
1120 in address calculations than an IV that's narrower than
1121 Pmode.
1122
1123 - Doing the comparison in IV_PRECISION or wider allows
1124 a natural 0-based IV, whereas using a narrower comparison
1125 type requires mitigations against wrap-around.
1126
1127 Conversely, if the IV limit is variable, doing the comparison
1128 in a wider type than the original type can introduce
1129 unnecessary extensions, so picking the widest valid mode
1130 is not always a good choice either.
1131
1132 Here we prefer the first IV type that's Pmode or wider,
1133 and the first comparison type that's IV_PRECISION or wider.
1134 (The comparison type must be no wider than the IV type,
1135 to avoid extensions in the vector loop.)
1136
1137 ??? We might want to try continuing beyond Pmode for ILP32
1138 targets if CMP_BITS < IV_PRECISION. */
1139 iv_type = this_type;
1140 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1141 cmp_type = this_type;
1142 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1143 break;
1144 }
1145 }
1146 }
1147
1148 if (!cmp_type)
1149 return false;
1150
1151 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1152 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1153 return true;
1154 }
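
/* Sketch, under the assumptions checked above, of what the chosen types
   end up controlling: with full masking the vector loop computes
   something like

     mask = .WHILE_ULT (index, limit)

   for each rgroup, with the operands in CMP_TYPE; hence CMP_TYPE must be
   wide enough for MAX_NITERS * max_nscalars_per_iter, and each rgroup's
   mask type must be producible from CMP_TYPE via IFN_WHILE_ULT.  */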
1155
1156 /* Check whether we can use vector accesses with length based on precision
1157    comparison.  So far, to keep it simple, we only allow the case that the
1158    precision of the target-supported length is larger than the precision
1159    required by the loop niters.  */
1160
1161 static bool
1162 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1163 {
1164 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1165 return false;
1166
1167 unsigned int max_nitems_per_iter = 1;
1168 unsigned int i;
1169 rgroup_controls *rgl;
1170 /* Find the maximum number of items per iteration for every rgroup. */
1171 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1172 {
1173 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1174 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1175 }
1176
1177 /* Work out how many bits we need to represent the length limit. */
1178 unsigned int min_ni_prec
1179 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1180
1181   /* Now use the maximum of the precisions below for one suitable IV type:
1182 - the IV's natural precision
1183 - the precision needed to hold: the maximum number of scalar
1184 iterations multiplied by the scale factor (min_ni_prec above)
1185 - the Pmode precision
1186
1187 If min_ni_prec is less than the precision of the current niters,
1188      we prefer to still use the niters type.  Prefer to use Pmode or
1189      wider IVs to avoid narrow conversions.  */
1190
1191 unsigned int ni_prec
1192 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1193 min_ni_prec = MAX (min_ni_prec, ni_prec);
1194 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1195
1196 tree iv_type = NULL_TREE;
1197 opt_scalar_int_mode tmode_iter;
1198 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1199 {
1200 scalar_mode tmode = tmode_iter.require ();
1201 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1202
1203 /* ??? Do we really want to construct one IV whose precision exceeds
1204 BITS_PER_WORD? */
1205 if (tbits > BITS_PER_WORD)
1206 break;
1207
1208 /* Find the first available standard integral type. */
1209 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1210 {
1211 iv_type = build_nonstandard_integer_type (tbits, true);
1212 break;
1213 }
1214 }
1215
1216 if (!iv_type)
1217 {
1218 if (dump_enabled_p ())
1219 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1220 "can't vectorize with length-based partial vectors"
1221 " because there is no suitable iv type.\n");
1222 return false;
1223 }
1224
1225 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1226 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1227
1228 return true;
1229 }
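
/* An illustration, assuming a 64-bit target with invented numbers: if
   min_ni_prec starts out as 12 bits, it is raised first to the niters
   precision (say 32) and then to the Pmode precision (64), and the mode
   walk above picks the first standard integer mode of at least that many
   bits that does not exceed BITS_PER_WORD, typically a 64-bit unsigned
   IV type.  */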
1230
1231 /* Calculate the cost of one scalar iteration of the loop. */
1232 static void
1233 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1234 {
1235 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1236 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1237 int nbbs = loop->num_nodes, factor;
1238 int innerloop_iters, i;
1239
1240 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1241
1242 /* Gather costs for statements in the scalar loop. */
1243
1244 /* FORNOW. */
1245 innerloop_iters = 1;
1246 if (loop->inner)
1247 innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1248
1249 for (i = 0; i < nbbs; i++)
1250 {
1251 gimple_stmt_iterator si;
1252 basic_block bb = bbs[i];
1253
1254 if (bb->loop_father == loop->inner)
1255 factor = innerloop_iters;
1256 else
1257 factor = 1;
1258
1259 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1260 {
1261 gimple *stmt = gsi_stmt (si);
1262 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1263
1264 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1265 continue;
1266
1267 /* Skip stmts that are not vectorized inside the loop. */
1268 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1269 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1270 && (!STMT_VINFO_LIVE_P (vstmt_info)
1271 || !VECTORIZABLE_CYCLE_DEF
1272 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1273 continue;
1274
1275 vect_cost_for_stmt kind;
1276 if (STMT_VINFO_DATA_REF (stmt_info))
1277 {
1278 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1279 kind = scalar_load;
1280 else
1281 kind = scalar_store;
1282 }
1283 else if (vect_nop_conversion_p (stmt_info))
1284 continue;
1285 else
1286 kind = scalar_stmt;
1287
1288 /* We are using vect_prologue here to avoid scaling twice
1289 by the inner loop factor. */
1290 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1291 factor, kind, stmt_info, 0, vect_prologue);
1292 }
1293 }
1294
1295 /* Now accumulate cost. */
1296 loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1297 stmt_info_for_cost *si;
1298 int j;
1299 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1300 j, si)
1301 (void) add_stmt_cost (loop_vinfo->scalar_costs, si->count,
1302 si->kind, si->stmt_info, si->vectype,
1303 si->misalign, si->where);
1304 loop_vinfo->scalar_costs->finish_cost (nullptr);
1305 }
1306
1307
1308 /* Function vect_analyze_loop_form.
1309
1310 Verify that certain CFG restrictions hold, including:
1311 - the loop has a pre-header
1312 - the loop has a single entry and exit
1313 - the loop exit condition is simple enough
1314    - the number of iterations can be analyzed, i.e., a countable loop.  The
1315 niter could be analyzed under some assumptions. */
1316
1317 opt_result
1318 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1319 {
1320 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1321
1322 /* Different restrictions apply when we are considering an inner-most loop,
1323 vs. an outer (nested) loop.
1324 (FORNOW. May want to relax some of these restrictions in the future). */
1325
1326 info->inner_loop_cond = NULL;
1327 if (!loop->inner)
1328 {
1329 /* Inner-most loop. We currently require that the number of BBs is
1330 exactly 2 (the header and latch). Vectorizable inner-most loops
1331 look like this:
1332
1333 (pre-header)
1334 |
1335 header <--------+
1336 | | |
1337 | +--> latch --+
1338 |
1339 (exit-bb) */
1340
1341 if (loop->num_nodes != 2)
1342 return opt_result::failure_at (vect_location,
1343 "not vectorized:"
1344 " control flow in loop.\n");
1345
1346 if (empty_block_p (loop->header))
1347 return opt_result::failure_at (vect_location,
1348 "not vectorized: empty loop.\n");
1349 }
1350 else
1351 {
1352 class loop *innerloop = loop->inner;
1353 edge entryedge;
1354
1355 /* Nested loop. We currently require that the loop is doubly-nested,
1356 contains a single inner loop, and the number of BBs is exactly 5.
1357 Vectorizable outer-loops look like this:
1358
1359 (pre-header)
1360 |
1361 header <---+
1362 | |
1363 inner-loop |
1364 | |
1365 tail ------+
1366 |
1367 (exit-bb)
1368
1369 The inner-loop has the properties expected of inner-most loops
1370 as described above. */
1371
1372 if ((loop->inner)->inner || (loop->inner)->next)
1373 return opt_result::failure_at (vect_location,
1374 "not vectorized:"
1375 " multiple nested loops.\n");
1376
1377 if (loop->num_nodes != 5)
1378 return opt_result::failure_at (vect_location,
1379 "not vectorized:"
1380 " control flow in loop.\n");
1381
1382 entryedge = loop_preheader_edge (innerloop);
1383 if (entryedge->src != loop->header
1384 || !single_exit (innerloop)
1385 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1386 return opt_result::failure_at (vect_location,
1387 "not vectorized:"
1388 " unsupported outerloop form.\n");
1389
1390 /* Analyze the inner-loop. */
1391 vect_loop_form_info inner;
1392 opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1393 if (!res)
1394 {
1395 if (dump_enabled_p ())
1396 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1397 "not vectorized: Bad inner loop.\n");
1398 return res;
1399 }
1400
1401       /* We don't support analyzing the niter under assumptions for the
1402 	 inner loop.  */
1403 if (!integer_onep (inner.assumptions))
1404 return opt_result::failure_at (vect_location,
1405 "not vectorized: Bad inner loop.\n");
1406
1407 if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1408 return opt_result::failure_at (vect_location,
1409 "not vectorized: inner-loop count not"
1410 " invariant.\n");
1411
1412 if (dump_enabled_p ())
1413 dump_printf_loc (MSG_NOTE, vect_location,
1414 "Considering outer-loop vectorization.\n");
1415 info->inner_loop_cond = inner.loop_cond;
1416 }
1417
1418 if (!single_exit (loop))
1419 return opt_result::failure_at (vect_location,
1420 "not vectorized: multiple exits.\n");
1421 if (EDGE_COUNT (loop->header->preds) != 2)
1422 return opt_result::failure_at (vect_location,
1423 "not vectorized:"
1424 " too many incoming edges.\n");
1425
1426   /* We assume that the loop exit condition is at the end of the loop, i.e.,
1427 that the loop is represented as a do-while (with a proper if-guard
1428 before the loop if needed), where the loop header contains all the
1429 executable statements, and the latch is empty. */
1430 if (!empty_block_p (loop->latch)
1431 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1432 return opt_result::failure_at (vect_location,
1433 "not vectorized: latch block not empty.\n");
1434
1435 /* Make sure the exit is not abnormal. */
1436 edge e = single_exit (loop);
1437 if (e->flags & EDGE_ABNORMAL)
1438 return opt_result::failure_at (vect_location,
1439 "not vectorized:"
1440 " abnormal loop exit edge.\n");
1441
1442 info->loop_cond
1443 = vect_get_loop_niters (loop, &info->assumptions,
1444 &info->number_of_iterations,
1445 &info->number_of_iterationsm1);
1446 if (!info->loop_cond)
1447 return opt_result::failure_at
1448 (vect_location,
1449 "not vectorized: complicated exit condition.\n");
1450
1451 if (integer_zerop (info->assumptions)
1452 || !info->number_of_iterations
1453 || chrec_contains_undetermined (info->number_of_iterations))
1454 return opt_result::failure_at
1455 (info->loop_cond,
1456 "not vectorized: number of iterations cannot be computed.\n");
1457
1458 if (integer_zerop (info->number_of_iterations))
1459 return opt_result::failure_at
1460 (info->loop_cond,
1461 "not vectorized: number of iterations = 0.\n");
1462
1463 if (!(tree_fits_shwi_p (info->number_of_iterations)
1464 && tree_to_shwi (info->number_of_iterations) > 0))
1465 {
1466 if (dump_enabled_p ())
1467 {
1468 dump_printf_loc (MSG_NOTE, vect_location,
1469 "Symbolic number of iterations is ");
1470 dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1471 dump_printf (MSG_NOTE, "\n");
1472 }
1473 }
1474
1475 return opt_result::success ();
1476 }
1477
1478 /* Create a loop_vec_info for LOOP with SHARED and the
1479 vect_analyze_loop_form result. */
1480
1481 loop_vec_info
1482 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1483 const vect_loop_form_info *info,
1484 loop_vec_info main_loop_info)
1485 {
1486 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1487 LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1488 LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1489 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1490 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1491 /* Also record the assumptions for versioning. */
1492 if (!integer_onep (info->assumptions) && !main_loop_info)
1493 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1494
1495 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (info->loop_cond);
1496 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1497 if (info->inner_loop_cond)
1498 {
1499 stmt_vec_info inner_loop_cond_info
1500 = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1501 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1502 /* If we have an estimate on the number of iterations of the inner
1503 	 loop, use that to limit the scale for costing; otherwise use
1504 --param vect-inner-loop-cost-factor literally. */
1505 widest_int nit;
1506 if (estimated_stmt_executions (loop->inner, &nit))
1507 LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1508 = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1509 }
1510
1511 return loop_vinfo;
1512 }
1513
1514
1515
1516 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1517    statements, update the vectorization factor.  */
1518
1519 static void
1520 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1521 {
1522 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1523 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1524 int nbbs = loop->num_nodes;
1525 poly_uint64 vectorization_factor;
1526 int i;
1527
1528 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1529
1530 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1531 gcc_assert (known_ne (vectorization_factor, 0U));
1532
1533   /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1534      vectorization factor of the loop is the unrolling factor required by
1535      the SLP instances.  If that unrolling factor is 1, we say that we
1536      perform pure SLP on the loop - cross-iteration parallelism is not
1537      exploited.  */
1538 bool only_slp_in_loop = true;
1539 for (i = 0; i < nbbs; i++)
1540 {
1541 basic_block bb = bbs[i];
1542 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1543 gsi_next (&si))
1544 {
1545 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1546 if (!stmt_info)
1547 continue;
1548 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1549 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1550 && !PURE_SLP_STMT (stmt_info))
1551 /* STMT needs both SLP and loop-based vectorization. */
1552 only_slp_in_loop = false;
1553 }
1554 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1555 gsi_next (&si))
1556 {
1557 if (is_gimple_debug (gsi_stmt (si)))
1558 continue;
1559 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1560 stmt_info = vect_stmt_to_vectorize (stmt_info);
1561 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1562 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1563 && !PURE_SLP_STMT (stmt_info))
1564 /* STMT needs both SLP and loop-based vectorization. */
1565 only_slp_in_loop = false;
1566 }
1567 }
1568
1569 if (only_slp_in_loop)
1570 {
1571 if (dump_enabled_p ())
1572 dump_printf_loc (MSG_NOTE, vect_location,
1573 "Loop contains only SLP stmts\n");
1574 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1575 }
1576 else
1577 {
1578 if (dump_enabled_p ())
1579 dump_printf_loc (MSG_NOTE, vect_location,
1580 "Loop contains SLP and non-SLP stmts\n");
1581 /* Both the vectorization factor and unroll factor have the form
1582 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1583 so they must have a common multiple. */
1584 vectorization_factor
1585 = force_common_multiple (vectorization_factor,
1586 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1587 }
1588
1589 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1590 if (dump_enabled_p ())
1591 {
1592 dump_printf_loc (MSG_NOTE, vect_location,
1593 "Updating vectorization factor to ");
1594 dump_dec (MSG_NOTE, vectorization_factor);
1595 dump_printf (MSG_NOTE, ".\n");
1596 }
1597 }
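
/* A numeric sketch of the combination above, with illustrative values
   only: if the loop-based vectorization factor is 4 and the SLP unrolling
   factor is 2, force_common_multiple yields 4; with factors 4 and 8 the
   updated vectorization factor becomes 8.  */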
1598
1599 /* Return true if STMT_INFO describes a double reduction phi and if
1600 the other phi in the reduction is also relevant for vectorization.
1601 This rejects cases such as:
1602
1603 outer1:
1604 x_1 = PHI <x_3(outer2), ...>;
1605 ...
1606
1607 inner:
1608 x_2 = ...;
1609 ...
1610
1611 outer2:
1612 x_3 = PHI <x_2(inner)>;
1613
1614 if nothing in x_2 or elsewhere makes x_1 relevant. */
1615
1616 static bool
1617 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1618 {
1619 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1620 return false;
1621
1622 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1623 }
1624
1625 /* Function vect_analyze_loop_operations.
1626
1627 Scan the loop stmts and make sure they are all vectorizable. */
1628
1629 static opt_result
1630 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1631 {
1632 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1633 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1634 int nbbs = loop->num_nodes;
1635 int i;
1636 stmt_vec_info stmt_info;
1637 bool need_to_vectorize = false;
1638 bool ok;
1639
1640 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1641
1642 auto_vec<stmt_info_for_cost> cost_vec;
1643
1644 for (i = 0; i < nbbs; i++)
1645 {
1646 basic_block bb = bbs[i];
1647
1648 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1649 gsi_next (&si))
1650 {
1651 gphi *phi = si.phi ();
1652 ok = true;
1653
1654 stmt_info = loop_vinfo->lookup_stmt (phi);
1655 if (dump_enabled_p ())
1656 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1657 if (virtual_operand_p (gimple_phi_result (phi)))
1658 continue;
1659
1660 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1661 (i.e., a phi in the tail of the outer-loop). */
1662 if (! is_loop_header_bb_p (bb))
1663 {
1664 	      /* FORNOW: we currently don't support the case that these phis
1665 		 are not used in the outer-loop (unless it is a double reduction,
1666 		 i.e., this phi is vect_reduction_def), because this case
1667 		 requires actually doing something here.  */
1668 if (STMT_VINFO_LIVE_P (stmt_info)
1669 && !vect_active_double_reduction_p (stmt_info))
1670 return opt_result::failure_at (phi,
1671 "Unsupported loop-closed phi"
1672 " in outer-loop.\n");
1673
1674 /* If PHI is used in the outer loop, we check that its operand
1675 is defined in the inner loop. */
1676 if (STMT_VINFO_RELEVANT_P (stmt_info))
1677 {
1678 tree phi_op;
1679
1680 if (gimple_phi_num_args (phi) != 1)
1681 return opt_result::failure_at (phi, "unsupported phi");
1682
1683 phi_op = PHI_ARG_DEF (phi, 0);
1684 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1685 if (!op_def_info)
1686 return opt_result::failure_at (phi, "unsupported phi\n");
1687
1688 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1689 && (STMT_VINFO_RELEVANT (op_def_info)
1690 != vect_used_in_outer_by_reduction))
1691 return opt_result::failure_at (phi, "unsupported phi\n");
1692
1693 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1694 || (STMT_VINFO_DEF_TYPE (stmt_info)
1695 == vect_double_reduction_def))
1696 && !vectorizable_lc_phi (loop_vinfo,
1697 stmt_info, NULL, NULL))
1698 return opt_result::failure_at (phi, "unsupported phi\n");
1699 }
1700
1701 continue;
1702 }
1703
1704 gcc_assert (stmt_info);
1705
1706 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1707 || STMT_VINFO_LIVE_P (stmt_info))
1708 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1709 /* A scalar-dependence cycle that we don't support. */
1710 return opt_result::failure_at (phi,
1711 "not vectorized:"
1712 " scalar dependence cycle.\n");
1713
1714 if (STMT_VINFO_RELEVANT_P (stmt_info))
1715 {
1716 need_to_vectorize = true;
1717 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1718 && ! PURE_SLP_STMT (stmt_info))
1719 ok = vectorizable_induction (loop_vinfo,
1720 stmt_info, NULL, NULL,
1721 &cost_vec);
1722 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1723 || (STMT_VINFO_DEF_TYPE (stmt_info)
1724 == vect_double_reduction_def)
1725 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1726 && ! PURE_SLP_STMT (stmt_info))
1727 ok = vectorizable_reduction (loop_vinfo,
1728 stmt_info, NULL, NULL, &cost_vec);
1729 }
1730
1731 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1732 if (ok
1733 && STMT_VINFO_LIVE_P (stmt_info)
1734 && !PURE_SLP_STMT (stmt_info))
1735 ok = vectorizable_live_operation (loop_vinfo,
1736 stmt_info, NULL, NULL, NULL,
1737 -1, false, &cost_vec);
1738
1739 if (!ok)
1740 return opt_result::failure_at (phi,
1741 "not vectorized: relevant phi not "
1742 "supported: %G",
1743 static_cast <gimple *> (phi));
1744 }
1745
1746 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1747 gsi_next (&si))
1748 {
1749 gimple *stmt = gsi_stmt (si);
1750 if (!gimple_clobber_p (stmt)
1751 && !is_gimple_debug (stmt))
1752 {
1753 opt_result res
1754 = vect_analyze_stmt (loop_vinfo,
1755 loop_vinfo->lookup_stmt (stmt),
1756 &need_to_vectorize,
1757 NULL, NULL, &cost_vec);
1758 if (!res)
1759 return res;
1760 }
1761 }
1762 } /* bbs */
1763
1764 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
1765
1766 /* All operations in the loop are either irrelevant (they deal with
1767 loop control, or are dead), or are only used outside the loop and
1768 can be moved out of it (e.g. invariants, inductions). The loop can
1769 be optimized away by scalar optimizations, so we're better off not
1770 touching it. */
1771 if (!need_to_vectorize)
1772 {
1773 if (dump_enabled_p ())
1774 dump_printf_loc (MSG_NOTE, vect_location,
1775 "All the computation can be taken out of the loop.\n");
1776 return opt_result::failure_at
1777 (vect_location,
1778 "not vectorized: redundant loop. no profit to vectorize.\n");
1779 }
1780
1781 return opt_result::success ();
1782 }
1783
1784 /* Return true if we know that the iteration count is smaller than the
1785 vectorization factor. Return false if it isn't, or if we can't be sure
1786 either way. */
1787
1788 static bool
1789 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1790 {
1791 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1792
1793 HOST_WIDE_INT max_niter;
1794 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1795 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1796 else
1797 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1798
1799 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1800 return true;
1801
1802 return false;
1803 }
1804
1805 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1806 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1807 definitely no, or -1 if it's worth retrying. */
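/* A purely illustrative example with hypothetical numbers: if the cost
   model reports min_profitable_iters = 12 while
   param_min_vect_loop_bound * assumed_vf = 32, the threshold computed below
   is MAX (32, 12) = 32, so a loop known to execute only 20 iterations is
   rejected with a return value of 0.  */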
1808
1809 static int
1810 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1811 {
1812 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1813 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1814
1815 /* Only loops that can handle partially-populated vectors can have iteration
1816 counts less than the vectorization factor. */
1817 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1818 {
1819 if (vect_known_niters_smaller_than_vf (loop_vinfo))
1820 {
1821 if (dump_enabled_p ())
1822 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1823 "not vectorized: iteration count smaller than "
1824 "vectorization factor.\n");
1825 return 0;
1826 }
1827 }
1828
1829 /* If using the "very cheap" model, reject cases in which we'd keep
1830 a copy of the scalar code (even if we might be able to vectorize it). */
1831 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1832 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1833 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1834 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
1835 {
1836 if (dump_enabled_p ())
1837 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1838 "some scalar iterations would need to be peeled\n");
1839 return 0;
1840 }
1841
1842 int min_profitable_iters, min_profitable_estimate;
1843 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1844 &min_profitable_estimate);
1845
1846 if (min_profitable_iters < 0)
1847 {
1848 if (dump_enabled_p ())
1849 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1850 "not vectorized: vectorization not profitable.\n");
1851 if (dump_enabled_p ())
1852 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1853 "not vectorized: vector version will never be "
1854 "profitable.\n");
1855 return -1;
1856 }
1857
1858 int min_scalar_loop_bound = (param_min_vect_loop_bound
1859 * assumed_vf);
1860
1861 /* Use the cost model only if it is more conservative than user specified
1862 threshold. */
1863 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1864 min_profitable_iters);
1865
1866 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1867
1868 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1869 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1870 {
1871 if (dump_enabled_p ())
1872 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1873 "not vectorized: vectorization not profitable.\n");
1874 if (dump_enabled_p ())
1875 dump_printf_loc (MSG_NOTE, vect_location,
1876 "not vectorized: iteration count smaller than user "
1877 "specified loop bound parameter or minimum profitable "
1878 "iterations (whichever is more conservative).\n");
1879 return 0;
1880 }
1881
1882 /* The static profitability threshold min_profitable_estimate includes
1883 the cost of having to check at runtime whether the scalar loop
1884 should be used instead. If it turns out that we don't need or want
1885 such a check, the threshold we should use for the static estimate
1886 is simply the point at which the vector loop becomes more profitable
1887 than the scalar loop. */
1888 if (min_profitable_estimate > min_profitable_iters
1889 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1890 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1891 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1892 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1893 {
1894 if (dump_enabled_p ())
1895 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1896 " choice between the scalar and vector loops\n");
1897 min_profitable_estimate = min_profitable_iters;
1898 }
1899
1900 /* If the vector loop needs multiple iterations to be beneficial then
1901 things are probably too close to call, and the conservative thing
1902 would be to stick with the scalar code. */
1903 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1904 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
1905 {
1906 if (dump_enabled_p ())
1907 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1908 "one iteration of the vector loop would be"
1909 " more expensive than the equivalent number of"
1910 " iterations of the scalar loop\n");
1911 return 0;
1912 }
1913
1914 HOST_WIDE_INT estimated_niter;
1915
1916 /* If we are vectorizing an epilogue then we know the maximum number of
1917 scalar iterations it will cover is at least one lower than the
1918 vectorization factor of the main loop. */
1919 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1920 estimated_niter
1921 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1922 else
1923 {
1924 estimated_niter = estimated_stmt_executions_int (loop);
1925 if (estimated_niter == -1)
1926 estimated_niter = likely_max_stmt_executions_int (loop);
1927 }
1928 if (estimated_niter != -1
1929 && ((unsigned HOST_WIDE_INT) estimated_niter
1930 < MAX (th, (unsigned) min_profitable_estimate)))
1931 {
1932 if (dump_enabled_p ())
1933 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1934 "not vectorized: estimated iteration count too "
1935 "small.\n");
1936 if (dump_enabled_p ())
1937 dump_printf_loc (MSG_NOTE, vect_location,
1938 "not vectorized: estimated iteration count smaller "
1939 "than specified loop bound parameter or minimum "
1940 "profitable iterations (whichever is more "
1941 "conservative).\n");
1942 return -1;
1943 }
1944
1945 return 1;
1946 }
1947
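/* Walk the statements in the basic blocks BBS of LOOP, counting them in
   *N_STMTS and collecting their data references into DATAREFS.  Fail if a
   statement contains a data reference that cannot be analyzed, except for
   calls in safelen loops to functions with SIMD clones that have no data
   references in the call itself, which are skipped.  Also fail fatally once
   the number of data references exceeds
   param_loop_max_datarefs_for_datadeps.  (This summary paraphrases the
   behaviour of the code below.)  */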
1948 static opt_result
1949 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1950 vec<data_reference_p> *datarefs,
1951 unsigned int *n_stmts)
1952 {
1953 *n_stmts = 0;
1954 for (unsigned i = 0; i < loop->num_nodes; i++)
1955 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1956 !gsi_end_p (gsi); gsi_next (&gsi))
1957 {
1958 gimple *stmt = gsi_stmt (gsi);
1959 if (is_gimple_debug (stmt))
1960 continue;
1961 ++(*n_stmts);
1962 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
1963 NULL, 0);
1964 if (!res)
1965 {
1966 if (is_gimple_call (stmt) && loop->safelen)
1967 {
1968 tree fndecl = gimple_call_fndecl (stmt), op;
1969 if (fndecl != NULL_TREE)
1970 {
1971 cgraph_node *node = cgraph_node::get (fndecl);
1972 if (node != NULL && node->simd_clones != NULL)
1973 {
1974 unsigned int j, n = gimple_call_num_args (stmt);
1975 for (j = 0; j < n; j++)
1976 {
1977 op = gimple_call_arg (stmt, j);
1978 if (DECL_P (op)
1979 || (REFERENCE_CLASS_P (op)
1980 && get_base_address (op)))
1981 break;
1982 }
1983 op = gimple_call_lhs (stmt);
1984 /* Ignore #pragma omp declare simd functions
1985 if they don't have data references in the
1986 call stmt itself. */
1987 if (j == n
1988 && !(op
1989 && (DECL_P (op)
1990 || (REFERENCE_CLASS_P (op)
1991 && get_base_address (op)))))
1992 continue;
1993 }
1994 }
1995 }
1996 return res;
1997 }
1998 /* If dependence analysis will give up due to the limit on the
1999 number of datarefs, stop here and fail fatally. */
2000 if (datarefs->length ()
2001 > (unsigned)param_loop_max_datarefs_for_datadeps)
2002 return opt_result::failure_at (stmt, "exceeded param "
2003 "loop-max-datarefs-for-datadeps\n");
2004 }
2005 return opt_result::success ();
2006 }
2007
2008 /* Look for SLP-only access groups and turn each individual access into its own
2009 group. */
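/* Illustrative example with a hypothetical group: a two-element store group
   writing a[2*i] and a[2*i+1] that is usable only under SLP is split below
   into two singleton groups, each with DR_GROUP_SIZE 1 and, for non-strided
   accesses, DR_GROUP_GAP of group_size - 1 = 1, with the alignment
   information duplicated onto each new group leader.  */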
2010 static void
2011 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2012 {
2013 unsigned int i;
2014 struct data_reference *dr;
2015
2016 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2017
2018 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2019 FOR_EACH_VEC_ELT (datarefs, i, dr)
2020 {
2021 gcc_assert (DR_REF (dr));
2022 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2023
2024 /* Check whether the access is part of an interleaving chain. */
2025 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2026 {
2027 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2028 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2029 unsigned int group_size = DR_GROUP_SIZE (first_element);
2030
2031 /* Check for SLP-only groups. */
2032 if (!STMT_SLP_TYPE (stmt_info)
2033 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2034 {
2035 /* Dissolve the group. */
2036 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2037
2038 stmt_vec_info vinfo = first_element;
2039 while (vinfo)
2040 {
2041 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2042 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2043 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2044 DR_GROUP_SIZE (vinfo) = 1;
2045 if (STMT_VINFO_STRIDED_P (first_element))
2046 DR_GROUP_GAP (vinfo) = 0;
2047 else
2048 DR_GROUP_GAP (vinfo) = group_size - 1;
2049 /* Duplicate and adjust the alignment info; it needs to
2050 be present on each group leader, see dr_misalignment. */
2051 if (vinfo != first_element)
2052 {
2053 dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2054 dr_info2->target_alignment = dr_info->target_alignment;
2055 int misalignment = dr_info->misalignment;
2056 if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2057 {
2058 HOST_WIDE_INT diff
2059 = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2060 - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2061 unsigned HOST_WIDE_INT align_c
2062 = dr_info->target_alignment.to_constant ();
2063 misalignment = (misalignment + diff) % align_c;
2064 }
2065 dr_info2->misalignment = misalignment;
2066 }
2067 vinfo = next;
2068 }
2069 }
2070 }
2071 }
2072 }
2073
2074 /* Determine if operating on full vectors for LOOP_VINFO might leave
2075 some scalar iterations still to do. If so, decide how we should
2076 handle those scalar iterations. The possibilities are:
2077
2078 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2079 In this case:
2080
2081 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2082 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2083 LOOP_VINFO_PEELING_FOR_NITER == false
2084
2085 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2086 to handle the remaining scalar iterations. In this case:
2087
2088 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2089 LOOP_VINFO_PEELING_FOR_NITER == true
2090
2091 There are two choices:
2092
2093 (2a) Consider vectorizing the epilogue loop at the same VF as the
2094 main loop, but using partial vectors instead of full vectors.
2095 In this case:
2096
2097 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2098
2099 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2100 In this case:
2101
2102 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2103
2104 When FOR_EPILOGUE_P is true, make this determination based on the
2105 assumption that LOOP_VINFO is an epilogue loop, otherwise make it
2106 based on the assumption that LOOP_VINFO is the main loop. The caller
2107 has made sure that the number of iterations is set appropriately for
2108 this value of FOR_EPILOGUE_P. */
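/* A worked example with hypothetical numbers: for 1000 scalar iterations
   and a vectorization factor of 16, option (1) executes 63 vector
   iterations with the last one operating on a partial vector, whereas
   option (2) executes 62 full-vector iterations and leaves
   1000 - 62 * 16 = 8 scalar iterations for the epilogue, handled as in
   (2a) or (2b).  */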
2109
2110 opt_result
2111 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2112 bool for_epilogue_p)
2113 {
2114 /* Determine whether there would be any scalar iterations left over. */
2115 bool need_peeling_or_partial_vectors_p
2116 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2117
2118 /* Decide whether to vectorize the loop with partial vectors. */
2119 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2120 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2121 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2122 && need_peeling_or_partial_vectors_p)
2123 {
2124 /* For partial-vector-usage=1, try to push the handling of partial
2125 vectors to the epilogue, with the main loop continuing to operate
2126 on full vectors.
2127
2128 ??? We could then end up failing to use partial vectors if we
2129 decide to peel iterations into a prologue, and if the main loop
2130 then ends up processing fewer than VF iterations. */
2131 if (param_vect_partial_vector_usage == 1
2132 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2133 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2134 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2135 else
2136 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2137 }
2138
2139 if (dump_enabled_p ())
2140 {
2141 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2142 dump_printf_loc (MSG_NOTE, vect_location,
2143 "operating on partial vectors%s.\n",
2144 for_epilogue_p ? " for epilogue loop" : "");
2145 else
2146 dump_printf_loc (MSG_NOTE, vect_location,
2147 "operating only on full vectors%s.\n",
2148 for_epilogue_p ? " for epilogue loop" : "");
2149 }
2150
2151 if (for_epilogue_p)
2152 {
2153 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2154 gcc_assert (orig_loop_vinfo);
2155 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2156 gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2157 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
2158 }
2159
2160 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2161 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2162 {
2163 /* Check that the loop processes at least one full vector. */
2164 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2165 tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
2166 if (known_lt (wi::to_widest (scalar_niters), vf))
2167 return opt_result::failure_at (vect_location,
2168 "loop does not have enough iterations"
2169 " to support vectorization.\n");
2170
2171 /* If we need to peel an extra epilogue iteration to handle data
2172 accesses with gaps, check that there are enough scalar iterations
2173 available.
2174
2175 The check above is redundant with this one when peeling for gaps,
2176 but the distinction is useful for diagnostics. */
2177 tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
2178 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2179 && known_lt (wi::to_widest (scalar_nitersm1), vf))
2180 return opt_result::failure_at (vect_location,
2181 "loop does not have enough iterations"
2182 " to support peeling for gaps.\n");
2183 }
2184
2185 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2186 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2187 && need_peeling_or_partial_vectors_p);
2188
2189 return opt_result::success ();
2190 }
2191
2192 /* Function vect_analyze_loop_2.
2193
2194 Apply a set of analyses on the loop represented by LOOP_VINFO.
2195 The different analyses will record their results in the
2196 loop_vec_info struct. */
2197 static opt_result
2198 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
2199 {
2200 opt_result ok = opt_result::success ();
2201 int res;
2202 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2203 poly_uint64 min_vf = 2;
2204 loop_vec_info orig_loop_vinfo = NULL;
2205
2206 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2207 loop_vec_info of the first vectorized loop. */
2208 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2209 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2210 else
2211 orig_loop_vinfo = loop_vinfo;
2212 gcc_assert (orig_loop_vinfo);
2213
2214 /* The first group of checks is independent of the vector size. */
2215 fatal = true;
2216
2217 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2218 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2219 return opt_result::failure_at (vect_location,
2220 "not vectorized: simd if(0)\n");
2221
2222 /* Find all data references in the loop (which correspond to vdefs/vuses)
2223 and analyze their evolution in the loop. */
2224
2225 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2226
2227 /* Gather the data references and count stmts in the loop. */
2228 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2229 {
2230 opt_result res
2231 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2232 &LOOP_VINFO_DATAREFS (loop_vinfo),
2233 &LOOP_VINFO_N_STMTS (loop_vinfo));
2234 if (!res)
2235 {
2236 if (dump_enabled_p ())
2237 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2238 "not vectorized: loop contains function "
2239 "calls or data references that cannot "
2240 "be analyzed\n");
2241 return res;
2242 }
2243 loop_vinfo->shared->save_datarefs ();
2244 }
2245 else
2246 loop_vinfo->shared->check_datarefs ();
2247
2248 /* Analyze the data references and also adjust the minimal
2249 vectorization factor according to the loads and stores. */
2250
2251 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2252 if (!ok)
2253 {
2254 if (dump_enabled_p ())
2255 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2256 "bad data references.\n");
2257 return ok;
2258 }
2259
2260 /* Classify all cross-iteration scalar data-flow cycles.
2261 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2262 vect_analyze_scalar_cycles (loop_vinfo);
2263
2264 vect_pattern_recog (loop_vinfo);
2265
2266 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2267
2268 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2269 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2270
2271 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2272 if (!ok)
2273 {
2274 if (dump_enabled_p ())
2275 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2276 "bad data access.\n");
2277 return ok;
2278 }
2279
2280 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2281
2282 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2283 if (!ok)
2284 {
2285 if (dump_enabled_p ())
2286 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2287 "unexpected pattern.\n");
2288 return ok;
2289 }
2290
2291 /* The rest of the analysis below depends on the vector size, so failures from here on are not necessarily fatal. */
2292 fatal = false;
2293
2294 /* Analyze data dependences between the data-refs in the loop
2295 and adjust the maximum vectorization factor according to
2296 the dependences.
2297 FORNOW: fail at the first data dependence that we encounter. */
2298
2299 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2300 if (!ok)
2301 {
2302 if (dump_enabled_p ())
2303 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2304 "bad data dependence.\n");
2305 return ok;
2306 }
2307 if (max_vf != MAX_VECTORIZATION_FACTOR
2308 && maybe_lt (max_vf, min_vf))
2309 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2310 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2311
2312 ok = vect_determine_vectorization_factor (loop_vinfo);
2313 if (!ok)
2314 {
2315 if (dump_enabled_p ())
2316 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2317 "can't determine vectorization factor.\n");
2318 return ok;
2319 }
2320 if (max_vf != MAX_VECTORIZATION_FACTOR
2321 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2322 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2323
2324 /* Compute the scalar iteration cost. */
2325 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2326
2327 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2328
2329 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2330 ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2331 if (!ok)
2332 return ok;
2333
2334 /* If there are any SLP instances mark them as pure_slp. */
2335 bool slp = vect_make_slp_decision (loop_vinfo);
2336 if (slp)
2337 {
2338 /* Find stmts that need to be both vectorized and SLPed. */
2339 vect_detect_hybrid_slp (loop_vinfo);
2340
2341 /* Update the vectorization factor based on the SLP decision. */
2342 vect_update_vf_for_slp (loop_vinfo);
2343
2344 /* Optimize the SLP graph with the vectorization factor fixed. */
2345 vect_optimize_slp (loop_vinfo);
2346
2347 /* Gather the loads reachable from the SLP graph entries. */
2348 vect_gather_slp_loads (loop_vinfo);
2349 }
2350
2351 bool saved_can_use_partial_vectors_p
2352 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2353
2354 /* We don't expect to have to roll back to anything other than an empty
2355 set of rgroups. */
2356 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2357
2358 /* This is the point where we can re-start analysis with SLP forced off. */
2359 start_over:
2360
2361 /* Now the vectorization factor is final. */
2362 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2363 gcc_assert (known_ne (vectorization_factor, 0U));
2364
2365 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2366 {
2367 dump_printf_loc (MSG_NOTE, vect_location,
2368 "vectorization_factor = ");
2369 dump_dec (MSG_NOTE, vectorization_factor);
2370 dump_printf (MSG_NOTE, ", niters = %wd\n",
2371 LOOP_VINFO_INT_NITERS (loop_vinfo));
2372 }
2373
2374 loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2375
2376 /* Analyze the alignment of the data-refs in the loop.
2377 Fail if a data reference is found that cannot be vectorized. */
2378
2379 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2380 if (!ok)
2381 {
2382 if (dump_enabled_p ())
2383 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2384 "bad data alignment.\n");
2385 return ok;
2386 }
2387
2388 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2389 It is important to call pruning after vect_analyze_data_ref_accesses,
2390 since we use grouping information gathered by interleaving analysis. */
2391 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2392 if (!ok)
2393 return ok;
2394
2395 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2396 vectorization, since we do not want to add extra peeling or
2397 add versioning for alignment. */
2398 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2399 /* This pass will decide on using loop versioning and/or loop peeling in
2400 order to enhance the alignment of data references in the loop. */
2401 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2402 if (!ok)
2403 return ok;
2404
2405 if (slp)
2406 {
2407 /* Analyze operations in the SLP instances. Note this may
2408 remove unsupported SLP instances, which makes the above
2409 SLP kind detection invalid. */
2410 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2411 vect_slp_analyze_operations (loop_vinfo);
2412 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2413 {
2414 ok = opt_result::failure_at (vect_location,
2415 "unsupported SLP instances\n");
2416 goto again;
2417 }
2418
2419 /* Check whether any load in ALL SLP instances is possibly permuted. */
2420 slp_tree load_node, slp_root;
2421 unsigned i, x;
2422 slp_instance instance;
2423 bool can_use_lanes = true;
2424 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2425 {
2426 slp_root = SLP_INSTANCE_TREE (instance);
2427 int group_size = SLP_TREE_LANES (slp_root);
2428 tree vectype = SLP_TREE_VECTYPE (slp_root);
2429 bool loads_permuted = false;
2430 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2431 {
2432 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2433 continue;
2434 unsigned j;
2435 stmt_vec_info load_info;
2436 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2437 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2438 {
2439 loads_permuted = true;
2440 break;
2441 }
2442 }
2443
2444 /* If the loads and stores can be handled with load/store-lane
2445 instructions record it and move on to the next instance. */
2446 if (loads_permuted
2447 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2448 && vect_store_lanes_supported (vectype, group_size, false))
2449 {
2450 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2451 {
2452 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2453 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2454 /* Use SLP for strided accesses (or if we can't
2455 load-lanes). */
2456 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2457 || ! vect_load_lanes_supported
2458 (STMT_VINFO_VECTYPE (stmt_vinfo),
2459 DR_GROUP_SIZE (stmt_vinfo), false))
2460 break;
2461 }
2462
2463 can_use_lanes
2464 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2465
2466 if (can_use_lanes && dump_enabled_p ())
2467 dump_printf_loc (MSG_NOTE, vect_location,
2468 "SLP instance %p can use load/store-lanes\n",
2469 instance);
2470 }
2471 else
2472 {
2473 can_use_lanes = false;
2474 break;
2475 }
2476 }
2477
2478 /* If all SLP instances can use load/store-lanes, abort SLP and try again
2479 with SLP disabled. */
2480 if (can_use_lanes)
2481 {
2482 ok = opt_result::failure_at (vect_location,
2483 "Built SLP cancelled: can use "
2484 "load/store-lanes\n");
2485 if (dump_enabled_p ())
2486 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2487 "Built SLP cancelled: all SLP instances support "
2488 "load/store-lanes\n");
2489 goto again;
2490 }
2491 }
2492
2493 /* Dissolve SLP-only groups. */
2494 vect_dissolve_slp_only_groups (loop_vinfo);
2495
2496 /* Scan all the remaining operations in the loop that are not subject
2497 to SLP and make sure they are vectorizable. */
2498 ok = vect_analyze_loop_operations (loop_vinfo);
2499 if (!ok)
2500 {
2501 if (dump_enabled_p ())
2502 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2503 "bad operation or unsupported loop bound.\n");
2504 return ok;
2505 }
2506
2507 /* For now, we don't expect to mix both masking and length approaches for one
2508 loop; disable partial vectors if both are recorded. */
2509 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2510 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2511 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2512 {
2513 if (dump_enabled_p ())
2514 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2515 "can't vectorize a loop with partial vectors"
2516 " because we don't expect to mix different"
2517 " approaches with partial vectors for the"
2518 " same loop.\n");
2519 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2520 }
2521
2522 /* If we still have the option of using partial vectors,
2523 check whether we can generate the necessary loop controls. */
2524 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2525 && !vect_verify_full_masking (loop_vinfo)
2526 && !vect_verify_loop_lens (loop_vinfo))
2527 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2528
2529 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2530 to be able to handle fewer than VF scalars, or needs to have a lower VF
2531 than the main loop. */
2532 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2533 && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2534 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2535 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2536 return opt_result::failure_at (vect_location,
2537 "Vectorization factor too high for"
2538 " epilogue loop.\n");
2539
2540 /* Decide whether this loop_vinfo should use partial vectors or peeling,
2541 assuming that the loop will be used as a main loop. We will redo
2542 this analysis later if we instead decide to use the loop as an
2543 epilogue loop. */
2544 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
2545 if (!ok)
2546 return ok;
2547
2548 /* Check the costings of the loop make vectorizing worthwhile. */
2549 res = vect_analyze_loop_costing (loop_vinfo);
2550 if (res < 0)
2551 {
2552 ok = opt_result::failure_at (vect_location,
2553 "Loop costings may not be worthwhile.\n");
2554 goto again;
2555 }
2556 if (!res)
2557 return opt_result::failure_at (vect_location,
2558 "Loop costings not worthwhile.\n");
2559
2560 /* If an epilogue loop is required make sure we can create one. */
2561 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2562 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2563 {
2564 if (dump_enabled_p ())
2565 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2566 if (!vect_can_advance_ivs_p (loop_vinfo)
2567 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2568 single_exit (LOOP_VINFO_LOOP
2569 (loop_vinfo))))
2570 {
2571 ok = opt_result::failure_at (vect_location,
2572 "not vectorized: can't create required "
2573 "epilog loop\n");
2574 goto again;
2575 }
2576 }
2577
2578 /* During peeling, we need to check that the number of loop iterations is
2579 enough for both the peeled prolog loop and the vector loop. This check
2580 can be merged with the threshold check of loop versioning, so
2581 increase the threshold for this case if necessary.
2582
2583 If we are analyzing an epilogue we still want to check what its
2584 versioning threshold would be. If we decide to vectorize the epilogues we
2585 will want to use the lowest versioning threshold of all epilogues and main
2586 loop. This will enable us to enter a vectorized epilogue even when
2587 versioning the loop. We can't simply check whether the epilogue requires
2588 versioning though since we may have skipped some versioning checks when
2589 analyzing the epilogue. For instance, checks for alias versioning will be
2590 skipped when dealing with epilogues as we assume we already checked them
2591 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
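/* As a hypothetical illustration of the computation below: with a
   vectorization factor of 8, a known prolog peeling amount of 3 iterations,
   peeling for gaps, no partial vectors and no loop masks used for
   alignment, niters_th becomes 3 + 8 + 1 = 12 before being combined with
   the cost-model threshold TH.  */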
2592 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2593 {
2594 poly_uint64 niters_th = 0;
2595 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2596
2597 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2598 {
2599 /* Niters for peeled prolog loop. */
2600 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2601 {
2602 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2603 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2604 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2605 }
2606 else
2607 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2608 }
2609
2610 /* Niters for at least one iteration of vectorized loop. */
2611 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2612 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2613 /* One additional iteration because of peeling for gap. */
2614 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2615 niters_th += 1;
2616
2617 /* Use the same condition as vect_transform_loop to decide when to use
2618 the cost to determine a versioning threshold. */
2619 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2620 && ordered_p (th, niters_th))
2621 niters_th = ordered_max (poly_uint64 (th), niters_th);
2622
2623 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2624 }
2625
2626 gcc_assert (known_eq (vectorization_factor,
2627 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2628
2629 /* Ok to vectorize! */
2630 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2631 return opt_result::success ();
2632
2633 again:
2634 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2635 gcc_assert (!ok);
2636
2637 /* Try again with SLP forced off, but if we didn't do any SLP there is
2638 no point in re-trying. */
2639 if (!slp)
2640 return ok;
2641
2642 /* If there are reduction chains re-trying will fail anyway. */
2643 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2644 return ok;
2645
2646 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2647 via interleaving or lane instructions. */
2648 slp_instance instance;
2649 slp_tree node;
2650 unsigned i, j;
2651 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2652 {
2653 stmt_vec_info vinfo;
2654 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2655 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2656 continue;
2657 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2658 unsigned int size = DR_GROUP_SIZE (vinfo);
2659 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2660 if (! vect_store_lanes_supported (vectype, size, false)
2661 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2662 && ! vect_grouped_store_supported (vectype, size))
2663 return opt_result::failure_at (vinfo->stmt,
2664 "unsupported grouped store\n");
2665 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2666 {
2667 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2668 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2669 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2670 size = DR_GROUP_SIZE (vinfo);
2671 vectype = STMT_VINFO_VECTYPE (vinfo);
2672 if (! vect_load_lanes_supported (vectype, size, false)
2673 && ! vect_grouped_load_supported (vectype, single_element_p,
2674 size))
2675 return opt_result::failure_at (vinfo->stmt,
2676 "unsupported grouped load\n");
2677 }
2678 }
2679
2680 if (dump_enabled_p ())
2681 dump_printf_loc (MSG_NOTE, vect_location,
2682 "re-trying with SLP disabled\n");
2683
2684 /* Roll back state appropriately. No SLP this time. */
2685 slp = false;
2686 /* Restore vectorization factor as it were without SLP. */
2687 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2688 /* Free the SLP instances. */
2689 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2690 vect_free_slp_instance (instance);
2691 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2692 /* Reset SLP type to loop_vect on all stmts. */
2693 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2694 {
2695 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2696 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2697 !gsi_end_p (si); gsi_next (&si))
2698 {
2699 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2700 STMT_SLP_TYPE (stmt_info) = loop_vect;
2701 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2702 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2703 {
2704 /* vectorizable_reduction adjusts reduction stmt def-types,
2705 restore them to that of the PHI. */
2706 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2707 = STMT_VINFO_DEF_TYPE (stmt_info);
2708 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2709 (STMT_VINFO_REDUC_DEF (stmt_info)))
2710 = STMT_VINFO_DEF_TYPE (stmt_info);
2711 }
2712 }
2713 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2714 !gsi_end_p (si); gsi_next (&si))
2715 {
2716 if (is_gimple_debug (gsi_stmt (si)))
2717 continue;
2718 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2719 STMT_SLP_TYPE (stmt_info) = loop_vect;
2720 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2721 {
2722 stmt_vec_info pattern_stmt_info
2723 = STMT_VINFO_RELATED_STMT (stmt_info);
2724 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
2725 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
2726
2727 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2728 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
2729 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2730 !gsi_end_p (pi); gsi_next (&pi))
2731 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2732 = loop_vect;
2733 }
2734 }
2735 }
2736 /* Free optimized alias test DDRS. */
2737 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2738 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2739 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2740 /* Reset target cost data. */
2741 delete loop_vinfo->vector_costs;
2742 loop_vinfo->vector_costs = nullptr;
2743 /* Reset accumulated rgroup information. */
2744 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
2745 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2746 /* Reset assorted flags. */
2747 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2748 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2749 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2750 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2751 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2752 = saved_can_use_partial_vectors_p;
2753
2754 goto start_over;
2755 }
2756
2757 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2758 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2759 OLD_LOOP_VINFO is better unless something specifically indicates
2760 otherwise.
2761
2762 Note that this deliberately isn't a partial order. */
2763
2764 static bool
2765 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2766 loop_vec_info old_loop_vinfo)
2767 {
2768 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2769 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2770
2771 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2772 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2773
2774 /* Always prefer a VF of loop->simdlen over any other VF. */
2775 if (loop->simdlen)
2776 {
2777 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2778 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2779 if (new_simdlen_p != old_simdlen_p)
2780 return new_simdlen_p;
2781 }
2782
2783 const auto *old_costs = old_loop_vinfo->vector_costs;
2784 const auto *new_costs = new_loop_vinfo->vector_costs;
2785 if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
2786 return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
2787
2788 return new_costs->better_main_loop_than_p (old_costs);
2789 }
2790
2791 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2792 true if we should. */
2793
2794 static bool
2795 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2796 loop_vec_info old_loop_vinfo)
2797 {
2798 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2799 return false;
2800
2801 if (dump_enabled_p ())
2802 dump_printf_loc (MSG_NOTE, vect_location,
2803 "***** Preferring vector mode %s to vector mode %s\n",
2804 GET_MODE_NAME (new_loop_vinfo->vector_mode),
2805 GET_MODE_NAME (old_loop_vinfo->vector_mode));
2806 return true;
2807 }
2808
2809 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
2810 not NULL. If VECTOR_MODES[MODE_I] is VOIDmode, set AUTODETECTED_VECTOR_MODE
2811 to the mode chosen by the analysis, and advance MODE_I to the next mode useful to analyze.
2812 Return the loop_vinfo on success and wrapped null on failure. */
2813
2814 static opt_loop_vec_info
2815 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
2816 const vect_loop_form_info *loop_form_info,
2817 loop_vec_info main_loop_vinfo,
2818 const vector_modes &vector_modes, unsigned &mode_i,
2819 machine_mode &autodetected_vector_mode,
2820 bool &fatal)
2821 {
2822 loop_vec_info loop_vinfo
2823 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
2824
2825 machine_mode vector_mode = vector_modes[mode_i];
2826 loop_vinfo->vector_mode = vector_mode;
2827
2828 /* Run the main analysis. */
2829 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal);
2830 if (dump_enabled_p ())
2831 dump_printf_loc (MSG_NOTE, vect_location,
2832 "***** Analysis %s with vector mode %s\n",
2833 res ? "succeeded" : " failed",
2834 GET_MODE_NAME (loop_vinfo->vector_mode));
2835
2836 /* Remember the autodetected vector mode. */
2837 if (vector_mode == VOIDmode)
2838 autodetected_vector_mode = loop_vinfo->vector_mode;
2839
2840 /* Advance mode_i, first skipping modes that would result in the
2841 same analysis result. */
2842 while (mode_i + 1 < vector_modes.length ()
2843 && vect_chooses_same_modes_p (loop_vinfo,
2844 vector_modes[mode_i + 1]))
2845 {
2846 if (dump_enabled_p ())
2847 dump_printf_loc (MSG_NOTE, vect_location,
2848 "***** The result for vector mode %s would"
2849 " be the same\n",
2850 GET_MODE_NAME (vector_modes[mode_i + 1]));
2851 mode_i += 1;
2852 }
2853 if (mode_i + 1 < vector_modes.length ()
2854 && VECTOR_MODE_P (autodetected_vector_mode)
2855 && (related_vector_mode (vector_modes[mode_i + 1],
2856 GET_MODE_INNER (autodetected_vector_mode))
2857 == autodetected_vector_mode)
2858 && (related_vector_mode (autodetected_vector_mode,
2859 GET_MODE_INNER (vector_modes[mode_i + 1]))
2860 == vector_modes[mode_i + 1]))
2861 {
2862 if (dump_enabled_p ())
2863 dump_printf_loc (MSG_NOTE, vect_location,
2864 "***** Skipping vector mode %s, which would"
2865 " repeat the analysis for %s\n",
2866 GET_MODE_NAME (vector_modes[mode_i + 1]),
2867 GET_MODE_NAME (autodetected_vector_mode));
2868 mode_i += 1;
2869 }
2870 mode_i++;
2871
2872 if (!res)
2873 {
2874 delete loop_vinfo;
2875 if (fatal)
2876 gcc_checking_assert (main_loop_vinfo == NULL);
2877 return opt_loop_vec_info::propagate_failure (res);
2878 }
2879
2880 return opt_loop_vec_info::success (loop_vinfo);
2881 }
2882
2883 /* Function vect_analyze_loop.
2884
2885 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2886 for it. The different analyses will record information in the
2887 loop_vec_info struct. */
2888 opt_loop_vec_info
2889 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2890 {
2891 DUMP_VECT_SCOPE ("analyze_loop_nest");
2892
2893 if (loop_outer (loop)
2894 && loop_vec_info_for_loop (loop_outer (loop))
2895 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2896 return opt_loop_vec_info::failure_at (vect_location,
2897 "outer-loop already vectorized.\n");
2898
2899 if (!find_loop_nest (loop, &shared->loop_nest))
2900 return opt_loop_vec_info::failure_at
2901 (vect_location,
2902 "not vectorized: loop nest containing two or more consecutive inner"
2903 " loops cannot be vectorized\n");
2904
2905 /* Analyze the loop form. */
2906 vect_loop_form_info loop_form_info;
2907 opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
2908 if (!res)
2909 {
2910 if (dump_enabled_p ())
2911 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2912 "bad loop form.\n");
2913 return opt_loop_vec_info::propagate_failure (res);
2914 }
2915 if (!integer_onep (loop_form_info.assumptions))
2916 {
2917 /* We consider to vectorize this loop by versioning it under
2918 some assumptions. In order to do this, we need to clear
2919 existing information computed by scev and niter analyzer. */
2920 scev_reset_htab ();
2921 free_numbers_of_iterations_estimates (loop);
2922 /* Also set flag for this loop so that following scev and niter
2923 analysis are done under the assumptions. */
2924 loop_constraint_set (loop, LOOP_C_FINITE);
2925 }
2926
2927 auto_vector_modes vector_modes;
2928 /* Autodetect first vector size we try. */
2929 vector_modes.safe_push (VOIDmode);
2930 unsigned int autovec_flags
2931 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2932 loop->simdlen != 0);
2933 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2934 && !unlimited_cost_model (loop));
2935 machine_mode autodetected_vector_mode = VOIDmode;
2936 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2937 unsigned int mode_i = 0;
2938 unsigned int first_loop_i = 0;
2939 unsigned int first_loop_next_i = 0;
2940 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2941
2942 /* First determine the main loop vectorization mode, either the first
2943 one that works, starting with auto-detecting the vector mode and then
2944 following the targets order of preference, or the one with the
2945 lowest cost if pick_lowest_cost_p. */
2946 while (1)
2947 {
2948 unsigned int loop_vinfo_i = mode_i;
2949 bool fatal;
2950 opt_loop_vec_info loop_vinfo
2951 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
2952 NULL, vector_modes, mode_i,
2953 autodetected_vector_mode, fatal);
2954 if (fatal)
2955 break;
2956
2957 if (loop_vinfo)
2958 {
2959 /* Once we hit the desired simdlen for the first time,
2960 discard any previous attempts. */
2961 if (simdlen
2962 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
2963 {
2964 delete first_loop_vinfo;
2965 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2966 simdlen = 0;
2967 }
2968 else if (pick_lowest_cost_p
2969 && first_loop_vinfo
2970 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
2971 {
2972 /* Pick loop_vinfo over first_loop_vinfo. */
2973 delete first_loop_vinfo;
2974 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2975 }
2976 if (first_loop_vinfo == NULL)
2977 {
2978 first_loop_vinfo = loop_vinfo;
2979 first_loop_i = loop_vinfo_i;
2980 first_loop_next_i = mode_i;
2981 }
2982 else
2983 {
2984 delete loop_vinfo;
2985 loop_vinfo = opt_loop_vec_info::success (NULL);
2986 }
2987
2988 /* Commit to first_loop_vinfo if we have no reason to try
2989 alternatives. */
2990 if (!simdlen && !pick_lowest_cost_p)
2991 break;
2992 }
2993 if (mode_i == vector_modes.length ()
2994 || autodetected_vector_mode == VOIDmode)
2995 break;
2996
2997 /* Try the next biggest vector size. */
2998 if (dump_enabled_p ())
2999 dump_printf_loc (MSG_NOTE, vect_location,
3000 "***** Re-trying analysis with vector mode %s\n",
3001 GET_MODE_NAME (vector_modes[mode_i]));
3002 }
3003 if (!first_loop_vinfo)
3004 return opt_loop_vec_info::propagate_failure (res);
3005
3006 if (dump_enabled_p ())
3007 dump_printf_loc (MSG_NOTE, vect_location,
3008 "***** Choosing vector mode %s\n",
3009 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3010
3011 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3012 enabled, SIMDUID is not set, it is the innermost loop and we have
3013 either already found the loop's SIMDLEN or there was no SIMDLEN to
3014 begin with.
3015 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3016 bool vect_epilogues = (!simdlen
3017 && loop->inner == NULL
3018 && param_vect_epilogues_nomask
3019 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3020 && !loop->simduid);
3021 if (!vect_epilogues)
3022 return first_loop_vinfo;
3023
3024 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3025 poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3026
3027 /* Handle the case where the original loop can use partial
3028 vectorization, but we only want to adopt it for the epilogue.
3029 The retry should use the same mode as the original. */
3030 if (LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (first_loop_vinfo))
3031 {
3032 gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (first_loop_vinfo)
3033 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (first_loop_vinfo));
3034 if (dump_enabled_p ())
3035 dump_printf_loc (MSG_NOTE, vect_location,
3036 "***** Re-trying analysis with same vector mode"
3037 " %s for epilogue with partial vectors.\n",
3038 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3039 mode_i = first_loop_i;
3040 }
3041 else
3042 {
3043 mode_i = first_loop_next_i;
3044 if (mode_i == vector_modes.length ())
3045 return first_loop_vinfo;
3046 }
3047
3048 /* ??? If first_loop_vinfo was using VOIDmode then we probably
3049 want to instead search for the corresponding mode in vector_modes[]. */
3050
3051 while (1)
3052 {
3053 bool fatal;
3054 opt_loop_vec_info loop_vinfo
3055 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3056 first_loop_vinfo,
3057 vector_modes, mode_i,
3058 autodetected_vector_mode, fatal);
3059 if (fatal)
3060 break;
3061
3062 if (loop_vinfo)
3063 {
3064 if (pick_lowest_cost_p)
3065 {
3066 /* Keep trying to roll back vectorization attempts while the
3067 loop_vec_infos they produced were worse than this one. */
3068 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3069 while (!vinfos.is_empty ()
3070 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3071 {
3072 gcc_assert (vect_epilogues);
3073 delete vinfos.pop ();
3074 }
3075 }
3076 /* For now only allow one epilogue loop. */
3077 if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3078 {
3079 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3080 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3081 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3082 || maybe_ne (lowest_th, 0U));
3083 /* Keep track of the known smallest versioning
3084 threshold. */
3085 if (ordered_p (lowest_th, th))
3086 lowest_th = ordered_min (lowest_th, th);
3087 }
3088 else
3089 {
3090 delete loop_vinfo;
3091 loop_vinfo = opt_loop_vec_info::success (NULL);
3092 }
3093
3094 /* For now only allow one epilogue loop, but allow
3095 pick_lowest_cost_p to replace it, so commit to the
3096 first epilogue if we have no reason to try alternatives. */
3097 if (!pick_lowest_cost_p)
3098 break;
3099 }
3100
3101 if (mode_i == vector_modes.length ())
3102 break;
3103
3104 /* Try the next biggest vector size. */
3105 if (dump_enabled_p ())
3106 dump_printf_loc (MSG_NOTE, vect_location,
3107 "***** Re-trying epilogue analysis with vector "
3108 "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3109 }
3110
3111 if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3112 {
3113 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3114 if (dump_enabled_p ())
3115 dump_printf_loc (MSG_NOTE, vect_location,
3116 "***** Choosing epilogue vector mode %s\n",
3117 GET_MODE_NAME
3118 (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3119 }
3120
3121 return first_loop_vinfo;
3122 }
3123
3124 /* Return true if there is an in-order reduction function for CODE, storing
3125 it in *REDUC_FN if so. */
3126
3127 static bool
3128 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
3129 {
3130 switch (code)
3131 {
3132 case PLUS_EXPR:
3133 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3134 return true;
3135
3136 default:
3137 return false;
3138 }
3139 }
3140
3141 /* Function reduction_fn_for_scalar_code
3142
3143 Input:
3144 CODE - tree_code of a reduction operation.
3145
3146 Output:
3147 REDUC_FN - the corresponding internal function to be used to reduce the
3148 vector of partial results into a single scalar result, or IFN_LAST
3149 if the operation is a supported reduction operation, but does not have
3150 such an internal function.
3151
3152 Return FALSE if CODE currently cannot be vectorized as a reduction. */
3153
3154 bool
3155 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
3156 {
3157 switch (code)
3158 {
3159 case MAX_EXPR:
3160 *reduc_fn = IFN_REDUC_MAX;
3161 return true;
3162
3163 case MIN_EXPR:
3164 *reduc_fn = IFN_REDUC_MIN;
3165 return true;
3166
3167 case PLUS_EXPR:
3168 *reduc_fn = IFN_REDUC_PLUS;
3169 return true;
3170
3171 case BIT_AND_EXPR:
3172 *reduc_fn = IFN_REDUC_AND;
3173 return true;
3174
3175 case BIT_IOR_EXPR:
3176 *reduc_fn = IFN_REDUC_IOR;
3177 return true;
3178
3179 case BIT_XOR_EXPR:
3180 *reduc_fn = IFN_REDUC_XOR;
3181 return true;
3182
3183 case MULT_EXPR:
3184 case MINUS_EXPR:
3185 *reduc_fn = IFN_LAST;
3186 return true;
3187
3188 default:
3189 return false;
3190 }
3191 }
3192
3193 /* If there is a neutral value X such that a reduction would not be affected
3194 by the introduction of additional X elements, return that X, otherwise
3195 return null. CODE is the code of the reduction and SCALAR_TYPE is type
3196 of the scalar elements. If the reduction has just a single initial value
3197 then INITIAL_VALUE is that value, otherwise it is null. */
3198
3199 tree
3200 neutral_op_for_reduction (tree scalar_type, tree_code code, tree initial_value)
3201 {
3202 switch (code)
3203 {
3204 case WIDEN_SUM_EXPR:
3205 case DOT_PROD_EXPR:
3206 case SAD_EXPR:
3207 case PLUS_EXPR:
3208 case MINUS_EXPR:
3209 case BIT_IOR_EXPR:
3210 case BIT_XOR_EXPR:
3211 return build_zero_cst (scalar_type);
3212
3213 case MULT_EXPR:
3214 return build_one_cst (scalar_type);
3215
3216 case BIT_AND_EXPR:
3217 return build_all_ones_cst (scalar_type);
3218
3219 case MAX_EXPR:
3220 case MIN_EXPR:
3221 return initial_value;
3222
3223 default:
3224 return NULL_TREE;
3225 }
3226 }
3227
3228 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3229 STMT is printed with a message MSG. */
3230
3231 static void
3232 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3233 {
3234 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3235 }
3236
3237 /* Return true if we need an in-order reduction for operation CODE
3238 on type TYPE. */
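/* Illustrative examples: a float summation compiled without
   -fassociative-math must be reduced in order (fold-left) to preserve the
   rounding behaviour, whereas float MIN_EXPR/MAX_EXPR reductions and
   wrapping unsigned integer additions do not need an in-order reduction.  */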
3240
3241 bool
3242 needs_fold_left_reduction_p (tree type, tree_code code)
3243 {
3244 /* CHECKME: check for !flag_finite_math_only too? */
3245 if (SCALAR_FLOAT_TYPE_P (type))
3246 switch (code)
3247 {
3248 case MIN_EXPR:
3249 case MAX_EXPR:
3250 return false;
3251
3252 default:
3253 return !flag_associative_math;
3254 }
3255
3256 if (INTEGRAL_TYPE_P (type))
3257 {
3258 if (!operation_no_trapping_overflow (type, code))
3259 return true;
3260 return false;
3261 }
3262
3263 if (SAT_FIXED_POINT_TYPE_P (type))
3264 return true;
3265
3266 return false;
3267 }
3268
3269 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3270 has a handled computation expression. Store the main reduction
3271 operation in *CODE. */
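/* An illustrative case: for a PHI whose latch value is defined by
   sum_3 = sum_1 + a_2, with sum_1 the PHI result, the detected path is
   sum_1 -> sum_3 and *CODE is set to PLUS_EXPR; a path that mixes different
   operation codes, or whose intermediate values are used in more than one
   statement, is rejected.  */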
3272
3273 static bool
3274 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3275 tree loop_arg, enum tree_code *code,
3276 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3277 {
3278 auto_bitmap visited;
3279 tree lookfor = PHI_RESULT (phi);
3280 ssa_op_iter curri;
3281 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3282 while (USE_FROM_PTR (curr) != loop_arg)
3283 curr = op_iter_next_use (&curri);
3284 curri.i = curri.numops;
3285 do
3286 {
3287 path.safe_push (std::make_pair (curri, curr));
3288 tree use = USE_FROM_PTR (curr);
3289 if (use == lookfor)
3290 break;
3291 gimple *def = SSA_NAME_DEF_STMT (use);
3292 if (gimple_nop_p (def)
3293 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3294 {
3295 pop:
3296 do
3297 {
3298 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3299 curri = x.first;
3300 curr = x.second;
3301 do
3302 curr = op_iter_next_use (&curri);
3303 /* Skip already visited or non-SSA operands (from iterating
3304 over PHI args). */
3305 while (curr != NULL_USE_OPERAND_P
3306 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3307 || ! bitmap_set_bit (visited,
3308 SSA_NAME_VERSION
3309 (USE_FROM_PTR (curr)))));
3310 }
3311 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3312 if (curr == NULL_USE_OPERAND_P)
3313 break;
3314 }
3315 else
3316 {
3317 if (gimple_code (def) == GIMPLE_PHI)
3318 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3319 else
3320 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3321 while (curr != NULL_USE_OPERAND_P
3322 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3323 || ! bitmap_set_bit (visited,
3324 SSA_NAME_VERSION
3325 (USE_FROM_PTR (curr)))))
3326 curr = op_iter_next_use (&curri);
3327 if (curr == NULL_USE_OPERAND_P)
3328 goto pop;
3329 }
3330 }
3331 while (1);
3332 if (dump_file && (dump_flags & TDF_DETAILS))
3333 {
3334 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3335 unsigned i;
3336 std::pair<ssa_op_iter, use_operand_p> *x;
3337 FOR_EACH_VEC_ELT (path, i, x)
3338 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3339 dump_printf (MSG_NOTE, "\n");
3340 }
3341
3342 /* Check whether the reduction path detected is valid. */
3343 bool fail = path.length () == 0;
3344 bool neg = false;
3345 int sign = -1;
3346 *code = ERROR_MARK;
3347 for (unsigned i = 1; i < path.length (); ++i)
3348 {
3349 gimple *use_stmt = USE_STMT (path[i].second);
3350 tree op = USE_FROM_PTR (path[i].second);
3351 if (! is_gimple_assign (use_stmt)
3352 /* The following makes sure we can compute the operand index
3353 easily; it also mostly disallows chaining via COND_EXPR
3354 condition operands. */
3355 || (gimple_assign_rhs1_ptr (use_stmt) != path[i].second->use
3356 && (gimple_num_ops (use_stmt) <= 2
3357 || gimple_assign_rhs2_ptr (use_stmt) != path[i].second->use)
3358 && (gimple_num_ops (use_stmt) <= 3
3359 || gimple_assign_rhs3_ptr (use_stmt) != path[i].second->use)))
3360 {
3361 fail = true;
3362 break;
3363 }
3364 tree_code use_code = gimple_assign_rhs_code (use_stmt);
3365 if (use_code == MINUS_EXPR)
3366 {
3367 use_code = PLUS_EXPR;
3368 /* Track whether we negate the reduction value each iteration. */
3369 if (gimple_assign_rhs2 (use_stmt) == op)
3370 neg = ! neg;
3371 }
3372 if (CONVERT_EXPR_CODE_P (use_code)
3373 && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)),
3374 TREE_TYPE (gimple_assign_rhs1 (use_stmt))))
3375 ;
3376 else if (*code == ERROR_MARK)
3377 {
3378 *code = use_code;
3379 sign = TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt)));
3380 }
3381 else if (use_code != *code)
3382 {
3383 fail = true;
3384 break;
3385 }
3386 else if ((use_code == MIN_EXPR
3387 || use_code == MAX_EXPR)
3388 && sign != TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt))))
3389 {
3390 fail = true;
3391 break;
3392 }
3393 /* Check there's only a single stmt the op is used on. For the
3394 non-value-changing tail and the last stmt, allow out-of-loop uses.
3395 ??? We could relax this and handle arbitrary live stmts by
3396 forcing a scalar epilogue for example. */
3397 imm_use_iterator imm_iter;
3398 gimple *op_use_stmt;
3399 unsigned cnt = 0;
3400 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
3401 if (!is_gimple_debug (op_use_stmt)
3402 && (*code != ERROR_MARK
3403 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3404 {
3405 /* We want to allow x + x but not x < 1 ? x : 2. */
3406 if (is_gimple_assign (op_use_stmt)
3407 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3408 {
3409 use_operand_p use_p;
3410 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3411 cnt++;
3412 }
3413 else
3414 cnt++;
3415 }
3416 if (cnt != 1)
3417 {
3418 fail = true;
3419 break;
3420 }
3421 }
3422 return ! fail && ! neg && *code != ERROR_MARK;
3423 }
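/* For illustration (hypothetical GIMPLE): a path such as
     sum_2 = sum_1 + a_5;  sum_3 = sum_2 + b_6;
   uses PLUS_EXPR throughout and is accepted, whereas
     sum_2 = sum_1 + a_5;  sum_3 = sum_2 * b_6;
   mixes PLUS_EXPR and MULT_EXPR and is rejected above via the
   use_code != *code check.  */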
3424
3425 bool
3426 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3427 tree loop_arg, enum tree_code code)
3428 {
3429 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3430 enum tree_code code_;
3431 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3432 && code_ == code);
3433 }
3434
3435
3436
3437 /* Function vect_is_simple_reduction
3438
3439 (1) Detect a cross-iteration def-use cycle that represents a simple
3440 reduction computation. We look for the following pattern:
3441
3442 loop_header:
3443 a1 = phi < a0, a2 >
3444 a3 = ...
3445 a2 = operation (a3, a1)
3446
3447 or
3448
3449 a3 = ...
3450 loop_header:
3451 a1 = phi < a0, a2 >
3452 a2 = operation (a3, a1)
3453
3454 such that:
3455 1. operation is commutative and associative and it is safe to
3456 change the order of the computation
3457 2. no uses for a2 in the loop (a2 is used out of the loop)
3458 3. no uses of a1 in the loop besides the reduction operation
3459 4. no uses of a1 outside the loop.
3460
3461 Conditions 1,4 are tested here.
3462 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3463
3464 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3465 nested cycles.
3466
3467 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3468 reductions:
3469
3470 a1 = phi < a0, a2 >
3471 inner loop (def of a3)
3472 a2 = phi < a3 >
3473
3474 (4) Detect condition expressions, i.e.:
3475 for (int i = 0; i < N; i++)
3476 if (a[i] < val)
3477 ret_val = a[i];
3478
3479 */
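/* As a concrete (hypothetical) instance of pattern (1): for the scalar loop

     for (i = 0; i < n; i++)
       sum = sum + a[i];

   the loop header contains  sum_1 = PHI <sum_0 (preheader), sum_2 (latch)>
   and the body contains  sum_2 = sum_1 + tmp_3,  where tmp_3 loads a[i];
   sum_2 feeds back into the PHI and is otherwise only used after the
   loop.  */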
3480
3481 static stmt_vec_info
3482 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3483 bool *double_reduc, bool *reduc_chain_p)
3484 {
3485 gphi *phi = as_a <gphi *> (phi_info->stmt);
3486 gimple *phi_use_stmt = NULL;
3487 imm_use_iterator imm_iter;
3488 use_operand_p use_p;
3489
3490 *double_reduc = false;
3491 *reduc_chain_p = false;
3492 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3493
3494 tree phi_name = PHI_RESULT (phi);
3495 /* ??? If there are no uses of the PHI result the inner loop reduction
3496 won't be detected as possibly double-reduction by vectorizable_reduction
3497 because that tries to walk the PHI arg from the preheader edge which
3498 can be constant. See PR60382. */
3499 if (has_zero_uses (phi_name))
3500 return NULL;
3501 class loop *loop = (gimple_bb (phi))->loop_father;
3502 unsigned nphi_def_loop_uses = 0;
3503 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3504 {
3505 gimple *use_stmt = USE_STMT (use_p);
3506 if (is_gimple_debug (use_stmt))
3507 continue;
3508
3509 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3510 {
3511 if (dump_enabled_p ())
3512 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3513 "intermediate value used outside loop.\n");
3514
3515 return NULL;
3516 }
3517
3518 nphi_def_loop_uses++;
3519 phi_use_stmt = use_stmt;
3520 }
3521
3522 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3523 if (TREE_CODE (latch_def) != SSA_NAME)
3524 {
3525 if (dump_enabled_p ())
3526 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3527 "reduction: not ssa_name: %T\n", latch_def);
3528 return NULL;
3529 }
3530
3531 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3532 if (!def_stmt_info
3533 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3534 return NULL;
3535
3536 bool nested_in_vect_loop
3537 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3538 unsigned nlatch_def_loop_uses = 0;
3539 auto_vec<gphi *, 3> lcphis;
3540 bool inner_loop_of_double_reduc = false;
3541 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3542 {
3543 gimple *use_stmt = USE_STMT (use_p);
3544 if (is_gimple_debug (use_stmt))
3545 continue;
3546 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3547 nlatch_def_loop_uses++;
3548 else
3549 {
3550 /* We can have more than one loop-closed PHI. */
3551 lcphis.safe_push (as_a <gphi *> (use_stmt));
3552 if (nested_in_vect_loop
3553 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3554 == vect_double_reduction_def))
3555 inner_loop_of_double_reduc = true;
3556 }
3557 }
3558
3559 /* If we are vectorizing an inner reduction, we execute it in the
3560 original order only when we are not dealing with a
3561 double reduction. */
3562 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3563 {
3564 if (dump_enabled_p ())
3565 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3566 "detected nested cycle: ");
3567 return def_stmt_info;
3568 }
3569
3570 /* When the inner loop of a double reduction ends up with more than
3571 one loop-closed PHI we have failed to classify alternate such
3572 PHIs as double reduction, leading to wrong code. See PR103237. */
3573 if (inner_loop_of_double_reduc && lcphis.length () != 1)
3574 {
3575 if (dump_enabled_p ())
3576 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3577 "unhandle double reduction\n");
3578 return NULL;
3579 }
3580
3581 /* If this isn't a nested cycle or if the nested cycle reduction value
3582 is used outside of the inner loop we cannot handle uses of the reduction
3583 value. */
3584 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3585 {
3586 if (dump_enabled_p ())
3587 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3588 "reduction used in loop.\n");
3589 return NULL;
3590 }
3591
3592 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3593 defined in the inner loop. */
3594 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3595 {
3596 tree op1 = PHI_ARG_DEF (def_stmt, 0);
3597 if (gimple_phi_num_args (def_stmt) != 1
3598 || TREE_CODE (op1) != SSA_NAME)
3599 {
3600 if (dump_enabled_p ())
3601 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3602 "unsupported phi node definition.\n");
3603
3604 return NULL;
3605 }
3606
3607 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3608 if (gimple_bb (def1)
3609 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3610 && loop->inner
3611 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3612 && is_gimple_assign (def1)
3613 && is_a <gphi *> (phi_use_stmt)
3614 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3615 {
3616 if (dump_enabled_p ())
3617 report_vect_op (MSG_NOTE, def_stmt,
3618 "detected double reduction: ");
3619
3620 *double_reduc = true;
3621 return def_stmt_info;
3622 }
3623
3624 return NULL;
3625 }
3626
3627 /* Look for the expression computing latch_def from the loop PHI result. */
3628 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3629 enum tree_code code;
3630 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3631 path))
3632 {
3633 STMT_VINFO_REDUC_CODE (phi_info) = code;
3634 if (code == COND_EXPR && !nested_in_vect_loop)
3635 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3636
3637 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3638 reduction chain for which the additional restriction is that
3639 all operations in the chain are the same. */
3640 auto_vec<stmt_vec_info, 8> reduc_chain;
3641 unsigned i;
3642 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3643 for (i = path.length () - 1; i >= 1; --i)
3644 {
3645 gimple *stmt = USE_STMT (path[i].second);
3646 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3647 STMT_VINFO_REDUC_IDX (stmt_info)
3648 = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
3649 enum tree_code stmt_code = gimple_assign_rhs_code (stmt);
3650 bool leading_conversion = (CONVERT_EXPR_CODE_P (stmt_code)
3651 && (i == 1 || i == path.length () - 1));
3652 if ((stmt_code != code && !leading_conversion)
3653 /* We can only handle the final value in epilogue
3654 generation for reduction chains. */
3655 || (i != 1 && !has_single_use (gimple_assign_lhs (stmt))))
3656 is_slp_reduc = false;
3657 /* For reduction chains we support trailing/leading
3658 conversions. We do not store those in the actual chain. */
3659 if (leading_conversion)
3660 continue;
3661 reduc_chain.safe_push (stmt_info);
3662 }
3663 if (is_slp_reduc && reduc_chain.length () > 1)
3664 {
3665 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3666 {
3667 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3668 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3669 }
3670 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3671 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3672
3673 /* Save the chain for further analysis in SLP detection. */
3674 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3675 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3676
3677 *reduc_chain_p = true;
3678 if (dump_enabled_p ())
3679 dump_printf_loc (MSG_NOTE, vect_location,
3680 "reduction: detected reduction chain\n");
3681 }
3682 else if (dump_enabled_p ())
3683 dump_printf_loc (MSG_NOTE, vect_location,
3684 "reduction: detected reduction\n");
3685
3686 return def_stmt_info;
3687 }
3688
3689 if (dump_enabled_p ())
3690 dump_printf_loc (MSG_NOTE, vect_location,
3691 "reduction: unknown pattern\n");
3692
3693 return NULL;
3694 }
3695
3696 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3697 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3698 or -1 if not known. */
3699
3700 static int
3701 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3702 {
3703 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3704 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3705 {
3706 if (dump_enabled_p ())
3707 dump_printf_loc (MSG_NOTE, vect_location,
3708 "cost model: epilogue peel iters set to vf/2 "
3709 "because loop iterations are unknown .\n");
3710 return assumed_vf / 2;
3711 }
3712 else
3713 {
3714 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3715 peel_iters_prologue = MIN (niters, peel_iters_prologue);
3716 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3717 /* If we need to peel for gaps, but no epilogue peeling is otherwise
3718 required, we have to peel VF iterations. */
3719 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3720 peel_iters_epilogue = assumed_vf;
3721 return peel_iters_epilogue;
3722 }
3723 }
3724
3725 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3726 int
3727 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3728 int *peel_iters_epilogue,
3729 stmt_vector_for_cost *scalar_cost_vec,
3730 stmt_vector_for_cost *prologue_cost_vec,
3731 stmt_vector_for_cost *epilogue_cost_vec)
3732 {
3733 int retval = 0;
3734
3735 *peel_iters_epilogue
3736 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3737
3738 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3739 {
3740 /* If peeled iterations are known but the number of scalar loop
3741 iterations is unknown, count a taken branch per peeled loop. */
3742 if (peel_iters_prologue > 0)
3743 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3744 NULL, NULL_TREE, 0, vect_prologue);
3745 if (*peel_iters_epilogue > 0)
3746 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3747 NULL, NULL_TREE, 0, vect_epilogue);
3748 }
3749
3750 stmt_info_for_cost *si;
3751 int j;
3752 if (peel_iters_prologue)
3753 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3754 retval += record_stmt_cost (prologue_cost_vec,
3755 si->count * peel_iters_prologue,
3756 si->kind, si->stmt_info, si->misalign,
3757 vect_prologue);
3758 if (*peel_iters_epilogue)
3759 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3760 retval += record_stmt_cost (epilogue_cost_vec,
3761 si->count * *peel_iters_epilogue,
3762 si->kind, si->stmt_info, si->misalign,
3763 vect_epilogue);
3764
3765 return retval;
3766 }
3767
3768 /* Function vect_estimate_min_profitable_iters
3769
3770 Return the number of iterations required for the vector version of the
3771 loop to be profitable relative to the cost of the scalar version of the
3772 loop.
3773
3774 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3775 of iterations for vectorization. A value of -1 means loop vectorization
3776 is not profitable. This returned value may be used for a dynamic
3777 profitability check.
3778
3779 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3780 for static check against estimated number of iterations. */
3781
3782 static void
3783 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3784 int *ret_min_profitable_niters,
3785 int *ret_min_profitable_estimate)
3786 {
3787 int min_profitable_iters;
3788 int min_profitable_estimate;
3789 int peel_iters_prologue;
3790 int peel_iters_epilogue;
3791 unsigned vec_inside_cost = 0;
3792 int vec_outside_cost = 0;
3793 unsigned vec_prologue_cost = 0;
3794 unsigned vec_epilogue_cost = 0;
3795 int scalar_single_iter_cost = 0;
3796 int scalar_outside_cost = 0;
3797 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3798 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3799 vector_costs *target_cost_data = loop_vinfo->vector_costs;
3800
3801 /* Cost model disabled. */
3802 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3803 {
3804 if (dump_enabled_p ())
3805 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3806 *ret_min_profitable_niters = 0;
3807 *ret_min_profitable_estimate = 0;
3808 return;
3809 }
3810
3811 /* Requires loop versioning tests to handle misalignment. */
3812 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3813 {
3814 /* FIXME: Make cost depend on complexity of individual check. */
3815 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3816 (void) add_stmt_cost (target_cost_data, len, vector_stmt,
3817 NULL, NULL_TREE, 0, vect_prologue);
3818 if (dump_enabled_p ())
3819 dump_printf (MSG_NOTE,
3820 "cost model: Adding cost of checks for loop "
3821 "versioning to treat misalignment.\n");
3822 }
3823
3824 /* Requires loop versioning with alias checks. */
3825 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3826 {
3827 /* FIXME: Make cost depend on complexity of individual check. */
3828 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3829 (void) add_stmt_cost (target_cost_data, len, vector_stmt,
3830 NULL, NULL_TREE, 0, vect_prologue);
3831 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3832 if (len)
3833 /* Count LEN - 1 ANDs and LEN comparisons. */
3834 (void) add_stmt_cost (target_cost_data, len * 2 - 1,
3835 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3836 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3837 if (len)
3838 {
3839 /* Count LEN - 1 ANDs and LEN comparisons. */
3840 unsigned int nstmts = len * 2 - 1;
3841 /* +1 for each bias that needs adding. */
3842 for (unsigned int i = 0; i < len; ++i)
3843 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3844 nstmts += 1;
3845 (void) add_stmt_cost (target_cost_data, nstmts,
3846 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3847 }
3848 if (dump_enabled_p ())
3849 dump_printf (MSG_NOTE,
3850 "cost model: Adding cost of checks for loop "
3851 "versioning aliasing.\n");
3852 }
3853
3854 /* Requires loop versioning with niter checks. */
3855 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3856 {
3857 /* FIXME: Make cost depend on complexity of individual check. */
3858 (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
3859 NULL, NULL_TREE, 0, vect_prologue);
3860 if (dump_enabled_p ())
3861 dump_printf (MSG_NOTE,
3862 "cost model: Adding cost of checks for loop "
3863 "versioning niters.\n");
3864 }
3865
3866 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3867 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3868 NULL, NULL_TREE, 0, vect_prologue);
3869
3870 /* Count statements in scalar loop. Using this as scalar cost for a single
3871 iteration for now.
3872
3873 TODO: Add outer loop support.
3874
3875 TODO: Consider assigning different costs to different scalar
3876 statements. */
3877
3878 scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
3879
3880 /* Add additional cost for the peeled instructions in prologue and epilogue
3881 loop. (For fully-masked loops there will be no peeling.)
3882
3883 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3884 at compile time, we assume it's vf/2 (the worst would be vf-1).
3885
3886 TODO: Build an expression that represents peel_iters for prologue and
3887 epilogue to be used in a run-time test. */
3888
3889 bool prologue_need_br_taken_cost = false;
3890 bool prologue_need_br_not_taken_cost = false;
3891
3892 /* Calculate peel_iters_prologue. */
3893 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
3894 peel_iters_prologue = 0;
3895 else if (npeel < 0)
3896 {
3897 peel_iters_prologue = assumed_vf / 2;
3898 if (dump_enabled_p ())
3899 dump_printf (MSG_NOTE, "cost model: "
3900 "prologue peel iters set to vf/2.\n");
3901
3902 /* If peeled iterations are unknown, count a taken branch and a not taken
3903 branch per peeled loop. Even if scalar loop iterations are known,
3904 vector iterations are not known since peeled prologue iterations are
3905 not known. Hence guards remain the same. */
3906 prologue_need_br_taken_cost = true;
3907 prologue_need_br_not_taken_cost = true;
3908 }
3909 else
3910 {
3911 peel_iters_prologue = npeel;
3912 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
3913 /* If peeled iterations are known but the number of scalar loop
3914 iterations is unknown, count a taken branch per peeled loop. */
3915 prologue_need_br_taken_cost = true;
3916 }
3917
3918 bool epilogue_need_br_taken_cost = false;
3919 bool epilogue_need_br_not_taken_cost = false;
3920
3921 /* Calculate peel_iters_epilogue. */
3922 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3923 /* We need to peel exactly one iteration for gaps. */
3924 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
3925 else if (npeel < 0)
3926 {
3927 /* If peeling for alignment is unknown, loop bound of main loop
3928 becomes unknown. */
3929 peel_iters_epilogue = assumed_vf / 2;
3930 if (dump_enabled_p ())
3931 dump_printf (MSG_NOTE, "cost model: "
3932 "epilogue peel iters set to vf/2 because "
3933 "peeling for alignment is unknown.\n");
3934
3935 /* See the same reason above in peel_iters_prologue calculation. */
3936 epilogue_need_br_taken_cost = true;
3937 epilogue_need_br_not_taken_cost = true;
3938 }
3939 else
3940 {
3941 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
3942 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
3943 /* If peeled iterations are known but the number of scalar loop
3944 iterations is unknown, count a taken branch per peeled loop. */
3945 epilogue_need_br_taken_cost = true;
3946 }
3947
3948 stmt_info_for_cost *si;
3949 int j;
3950 /* Add costs associated with peel_iters_prologue. */
3951 if (peel_iters_prologue)
3952 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3953 {
3954 (void) add_stmt_cost (target_cost_data,
3955 si->count * peel_iters_prologue, si->kind,
3956 si->stmt_info, si->vectype, si->misalign,
3957 vect_prologue);
3958 }
3959
3960 /* Add costs associated with peel_iters_epilogue. */
3961 if (peel_iters_epilogue)
3962 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3963 {
3964 (void) add_stmt_cost (target_cost_data,
3965 si->count * peel_iters_epilogue, si->kind,
3966 si->stmt_info, si->vectype, si->misalign,
3967 vect_epilogue);
3968 }
3969
3970 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
3971
3972 if (prologue_need_br_taken_cost)
3973 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3974 NULL, NULL_TREE, 0, vect_prologue);
3975
3976 if (prologue_need_br_not_taken_cost)
3977 (void) add_stmt_cost (target_cost_data, 1,
3978 cond_branch_not_taken, NULL, NULL_TREE, 0,
3979 vect_prologue);
3980
3981 if (epilogue_need_br_taken_cost)
3982 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3983 NULL, NULL_TREE, 0, vect_epilogue);
3984
3985 if (epilogue_need_br_not_taken_cost)
3986 (void) add_stmt_cost (target_cost_data, 1,
3987 cond_branch_not_taken, NULL, NULL_TREE, 0,
3988 vect_epilogue);
3989
3990 /* Take care of special costs for rgroup controls of partial vectors. */
3991 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3992 {
3993 /* Calculate how many masks we need to generate. */
3994 unsigned int num_masks = 0;
3995 rgroup_controls *rgm;
3996 unsigned int num_vectors_m1;
3997 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
3998 if (rgm->type)
3999 num_masks += num_vectors_m1 + 1;
4000 gcc_assert (num_masks > 0);
4001
4002 /* In the worst case, we need to generate each mask in the prologue
4003 and in the loop body. One of the loop body mask instructions
4004 replaces the comparison in the scalar loop, and since we don't
4005 count the scalar comparison against the scalar body, we shouldn't
4006 count that vector instruction against the vector body either.
4007
4008 Sometimes we can use unpacks instead of generating prologue
4009 masks and sometimes the prologue mask will fold to a constant,
4010 so the actual prologue cost might be smaller. However, it's
4011 simpler and safer to use the worst-case cost; if this ends up
4012 being the tie-breaker between vectorizing or not, then it's
4013 probably better not to vectorize. */
4014 (void) add_stmt_cost (target_cost_data, num_masks,
4015 vector_stmt, NULL, NULL_TREE, 0, vect_prologue);
4016 (void) add_stmt_cost (target_cost_data, num_masks - 1,
4017 vector_stmt, NULL, NULL_TREE, 0, vect_body);
4018 }
4019 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4020 {
4021 /* Referring to the functions vect_set_loop_condition_partial_vectors
4022 and vect_set_loop_controls_directly, we need to generate each
4023 length in the prologue and in the loop body if required. Although
4024 there are some possible optimizations, we consider the worst case
4025 here. */
4026
4027 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4028 bool need_iterate_p
4029 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4030 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4031
4032 /* Calculate how many statements to be added. */
4033 unsigned int prologue_stmts = 0;
4034 unsigned int body_stmts = 0;
4035
4036 rgroup_controls *rgc;
4037 unsigned int num_vectors_m1;
4038 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4039 if (rgc->type)
4040 {
4041 /* May need one SHIFT for nitems_total computation. */
4042 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4043 if (nitems != 1 && !niters_known_p)
4044 prologue_stmts += 1;
4045
4046 /* May need one MAX and one MINUS for wrap around. */
4047 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4048 prologue_stmts += 2;
4049
4050 /* Need one MAX and one MINUS for each batch limit except for
4051 the 1st one. */
4052 prologue_stmts += num_vectors_m1 * 2;
4053
4054 unsigned int num_vectors = num_vectors_m1 + 1;
4055
4056 /* Need to set up lengths in prologue, only one MIN required
4057 for each since start index is zero. */
4058 prologue_stmts += num_vectors;
4059
4060 /* Each may need two MINs and one MINUS to update lengths in body
4061 for next iteration. */
4062 if (need_iterate_p)
4063 body_stmts += 3 * num_vectors;
4064 }
4065
4066 (void) add_stmt_cost (target_cost_data, prologue_stmts,
4067 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
4068 (void) add_stmt_cost (target_cost_data, body_stmts,
4069 scalar_stmt, NULL, NULL_TREE, 0, vect_body);
4070 }
4071
4072 /* FORNOW: The scalar outside cost is incremented in one of the
4073 following ways:
4074
4075 1. The vectorizer checks for alignment and aliasing and generates
4076 a condition that allows dynamic vectorization. A cost model
4077 check is ANDED with the versioning condition. Hence the scalar code
4078 path now has the added cost of the versioning check.
4079
4080 if (cost > th & versioning_check)
4081 jmp to vector code
4082
4083 Hence the run-time scalar cost is incremented by a not-taken branch cost.
4084
4085 2. The vectorizer then checks if a prologue is required. If the
4086 cost model check was not done before during versioning, it has to
4087 be done before the prologue check.
4088
4089 if (cost <= th)
4090 prologue = scalar_iters
4091 if (prologue == 0)
4092 jmp to vector code
4093 else
4094 execute prologue
4095 if (prologue == num_iters)
4096 go to exit
4097
4098 Hence the run-time scalar cost is incremented by a taken branch,
4099 plus a not-taken branch, plus a taken branch cost.
4100
4101 3. The vectorizer then checks if an epilogue is required. If the
4102 cost model check was not done before during prologue check, it
4103 has to be done with the epilogue check.
4104
4105 if (prologue == 0)
4106 jmp to vector code
4107 else
4108 execute prologue
4109 if (prologue == num_iters)
4110 go to exit
4111 vector code:
4112 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4113 jmp to epilogue
4114
4115 Hence the run-time scalar cost should be incremented by 2 taken
4116 branches.
4117
4118 TODO: The back end may reorder the BBS's differently and reverse
4119 conditions/branch directions. Change the estimates below to
4120 something more reasonable. */
4121
4122 /* If the number of iterations is known and we do not do versioning, we can
4123 decide whether to vectorize at compile time. Hence the scalar version
4124 does not carry cost model guard costs. */
4125 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4126 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4127 {
4128 /* Cost model check occurs at versioning. */
4129 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4130 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4131 else
4132 {
4133 /* Cost model check occurs at prologue generation. */
4134 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4135 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4136 + vect_get_stmt_cost (cond_branch_not_taken);
4137 /* Cost model check occurs at epilogue generation. */
4138 else
4139 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4140 }
4141 }
4142
4143 /* Complete the target-specific cost calculations. */
4144 finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
4145 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost);
4146
4147 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4148
4149 if (dump_enabled_p ())
4150 {
4151 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4152 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4153 vec_inside_cost);
4154 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4155 vec_prologue_cost);
4156 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4157 vec_epilogue_cost);
4158 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4159 scalar_single_iter_cost);
4160 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4161 scalar_outside_cost);
4162 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4163 vec_outside_cost);
4164 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4165 peel_iters_prologue);
4166 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4167 peel_iters_epilogue);
4168 }
4169
4170 /* Calculate number of iterations required to make the vector version
4171 profitable, relative to the loop bodies only. The following condition
4172 must hold true:
4173 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4174 where
4175 SIC = scalar iteration cost, VIC = vector iteration cost,
4176 VOC = vector outside cost, VF = vectorization factor,
4177 NPEEL = prologue iterations + epilogue iterations,
4178 SOC = scalar outside cost for run time cost model check. */
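  /* A worked example with purely hypothetical costs: SIC = 4, VIC = 6,
     VF = 4, VOC = 20 and SOC = NPEEL = 0 give a per-vector-iteration
     saving of SIC * VF - VIC = 10.  At niters = 8 both sides of the
     condition are equal (32), so the first strictly profitable value is
     niters = 9, which is what the (non-partial-vector) computation below
     arrives at.  */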
4179
4180 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4181 - vec_inside_cost);
4182 if (saving_per_viter <= 0)
4183 {
4184 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4185 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4186 "vectorization did not happen for a simd loop");
4187
4188 if (dump_enabled_p ())
4189 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4190 "cost model: the vector iteration cost = %d "
4191 "divided by the scalar iteration cost = %d "
4192 "is greater or equal to the vectorization factor = %d"
4193 ".\n",
4194 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4195 *ret_min_profitable_niters = -1;
4196 *ret_min_profitable_estimate = -1;
4197 return;
4198 }
4199
4200 /* ??? The "if" arm is written to handle all cases; see below for what
4201 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4202 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4203 {
4204 /* Rewriting the condition above in terms of the number of
4205 vector iterations (vniters) rather than the number of
4206 scalar iterations (niters) gives:
4207
4208 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4209
4210 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4211
4212 For integer N, X and Y when X > 0:
4213
4214 N * X > Y <==> N >= (Y /[floor] X) + 1. */
4215 int outside_overhead = (vec_outside_cost
4216 - scalar_single_iter_cost * peel_iters_prologue
4217 - scalar_single_iter_cost * peel_iters_epilogue
4218 - scalar_outside_cost);
4219 /* We're only interested in cases that require at least one
4220 vector iteration. */
4221 int min_vec_niters = 1;
4222 if (outside_overhead > 0)
4223 min_vec_niters = outside_overhead / saving_per_viter + 1;
4224
4225 if (dump_enabled_p ())
4226 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4227 min_vec_niters);
4228
4229 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4230 {
4231 /* Now that we know the minimum number of vector iterations,
4232 find the minimum niters for which the scalar cost is larger:
4233
4234 SIC * niters > VIC * vniters + VOC - SOC
4235
4236 We know that the minimum niters is no more than
4237 vniters * VF + NPEEL, but it might be (and often is) less
4238 than that if a partial vector iteration is cheaper than the
4239 equivalent scalar code. */
4240 int threshold = (vec_inside_cost * min_vec_niters
4241 + vec_outside_cost
4242 - scalar_outside_cost);
4243 if (threshold <= 0)
4244 min_profitable_iters = 1;
4245 else
4246 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4247 }
4248 else
4249 /* Convert the number of vector iterations into a number of
4250 scalar iterations. */
4251 min_profitable_iters = (min_vec_niters * assumed_vf
4252 + peel_iters_prologue
4253 + peel_iters_epilogue);
4254 }
4255 else
4256 {
4257 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4258 * assumed_vf
4259 - vec_inside_cost * peel_iters_prologue
4260 - vec_inside_cost * peel_iters_epilogue);
4261 if (min_profitable_iters <= 0)
4262 min_profitable_iters = 0;
4263 else
4264 {
4265 min_profitable_iters /= saving_per_viter;
4266
4267 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4268 <= (((int) vec_inside_cost * min_profitable_iters)
4269 + (((int) vec_outside_cost - scalar_outside_cost)
4270 * assumed_vf)))
4271 min_profitable_iters++;
4272 }
4273 }
4274
4275 if (dump_enabled_p ())
4276 dump_printf (MSG_NOTE,
4277 " Calculated minimum iters for profitability: %d\n",
4278 min_profitable_iters);
4279
4280 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4281 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4282 /* We want the vectorized loop to execute at least once. */
4283 min_profitable_iters = assumed_vf + peel_iters_prologue;
4284 else if (min_profitable_iters < peel_iters_prologue)
4285 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4286 vectorized loop executes at least once. */
4287 min_profitable_iters = peel_iters_prologue;
4288
4289 if (dump_enabled_p ())
4290 dump_printf_loc (MSG_NOTE, vect_location,
4291 " Runtime profitability threshold = %d\n",
4292 min_profitable_iters);
4293
4294 *ret_min_profitable_niters = min_profitable_iters;
4295
4296 /* Calculate number of iterations required to make the vector version
4297 profitable, relative to the loop bodies only.
4298
4299 The non-vectorized variant is SIC * niters and it must win over the vector
4300 variant on the expected loop trip count. The following condition must hold true:
4301 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4302
4303 if (vec_outside_cost <= 0)
4304 min_profitable_estimate = 0;
4305 /* ??? This "else if" arm is written to handle all cases; see below for
4306 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4307 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4308 {
4309 /* This is a repeat of the code above, but with + SOC rather
4310 than - SOC. */
4311 int outside_overhead = (vec_outside_cost
4312 - scalar_single_iter_cost * peel_iters_prologue
4313 - scalar_single_iter_cost * peel_iters_epilogue
4314 + scalar_outside_cost);
4315 int min_vec_niters = 1;
4316 if (outside_overhead > 0)
4317 min_vec_niters = outside_overhead / saving_per_viter + 1;
4318
4319 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4320 {
4321 int threshold = (vec_inside_cost * min_vec_niters
4322 + vec_outside_cost
4323 + scalar_outside_cost);
4324 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4325 }
4326 else
4327 min_profitable_estimate = (min_vec_niters * assumed_vf
4328 + peel_iters_prologue
4329 + peel_iters_epilogue);
4330 }
4331 else
4332 {
4333 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4334 * assumed_vf
4335 - vec_inside_cost * peel_iters_prologue
4336 - vec_inside_cost * peel_iters_epilogue)
4337 / ((scalar_single_iter_cost * assumed_vf)
4338 - vec_inside_cost);
4339 }
4340 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4341 if (dump_enabled_p ())
4342 dump_printf_loc (MSG_NOTE, vect_location,
4343 " Static estimate profitability threshold = %d\n",
4344 min_profitable_estimate);
4345
4346 *ret_min_profitable_estimate = min_profitable_estimate;
4347 }
4348
4349 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4350 vector elements (not bits) for a vector with NELT elements. */
4351 static void
4352 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4353 vec_perm_builder *sel)
4354 {
4355 /* The encoding is a single stepped pattern. Any wrap-around is handled
4356 by vec_perm_indices. */
4357 sel->new_vector (nelt, 1, 3);
4358 for (unsigned int i = 0; i < 3; i++)
4359 sel->quick_push (i + offset);
4360 }
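/* For example (hypothetical parameters), OFFSET = 2 and NELT = 8 encode the
   stepped pattern { 2, 3, 4, ... }, which vec_perm_indices extends to the
   selection { 2, 3, ..., 9 } over the two concatenated input vectors: the
   first input shifted down by two elements, with the tail taken from the
   second input.  */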
4361
4362 /* Checks whether the target supports whole-vector shifts for vectors of mode
4363 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4364 it supports vec_perm_const with masks for all necessary shift amounts. */
4365 static bool
4366 have_whole_vector_shift (machine_mode mode)
4367 {
4368 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4369 return true;
4370
4371 /* Variable-length vectors should be handled via the optab. */
4372 unsigned int nelt;
4373 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4374 return false;
4375
4376 vec_perm_builder sel;
4377 vec_perm_indices indices;
4378 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4379 {
4380 calc_vec_perm_mask_for_shift (i, nelt, &sel);
4381 indices.new_vector (sel, 2, nelt);
4382 if (!can_vec_perm_const_p (mode, indices, false))
4383 return false;
4384 }
4385 return true;
4386 }
4387
4388 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4389 functions. Design better to avoid maintenance issues. */
4390
4391 /* Function vect_model_reduction_cost.
4392
4393 Models cost for a reduction operation, including the vector ops
4394 generated within the strip-mine loop in some cases, the initial
4395 definition before the loop, and the epilogue code that must be generated. */
4396
4397 static void
4398 vect_model_reduction_cost (loop_vec_info loop_vinfo,
4399 stmt_vec_info stmt_info, internal_fn reduc_fn,
4400 vect_reduction_type reduction_type,
4401 int ncopies, stmt_vector_for_cost *cost_vec)
4402 {
4403 int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
4404 enum tree_code code;
4405 optab optab;
4406 tree vectype;
4407 machine_mode mode;
4408 class loop *loop = NULL;
4409
4410 if (loop_vinfo)
4411 loop = LOOP_VINFO_LOOP (loop_vinfo);
4412
4413 /* Condition reductions generate two reductions in the loop. */
4414 if (reduction_type == COND_REDUCTION)
4415 ncopies *= 2;
4416
4417 vectype = STMT_VINFO_VECTYPE (stmt_info);
4418 mode = TYPE_MODE (vectype);
4419 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4420
4421 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4422
4423 if (reduction_type == EXTRACT_LAST_REDUCTION)
4424 /* No extra instructions are needed in the prologue. The loop body
4425 operations are costed in vectorizable_condition. */
4426 inside_cost = 0;
4427 else if (reduction_type == FOLD_LEFT_REDUCTION)
4428 {
4429 /* No extra instructions needed in the prologue. */
4430 prologue_cost = 0;
4431
4432 if (reduc_fn != IFN_LAST)
4433 /* Count one reduction-like operation per vector. */
4434 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4435 stmt_info, 0, vect_body);
4436 else
4437 {
4438 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4439 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4440 inside_cost = record_stmt_cost (cost_vec, nelements,
4441 vec_to_scalar, stmt_info, 0,
4442 vect_body);
4443 inside_cost += record_stmt_cost (cost_vec, nelements,
4444 scalar_stmt, stmt_info, 0,
4445 vect_body);
4446 }
4447 }
4448 else
4449 {
4450 /* Add in cost for initial definition.
4451 For cond reduction we have four vectors: initial index, step,
4452 initial result of the data reduction, initial value of the index
4453 reduction. */
4454 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4455 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4456 scalar_to_vec, stmt_info, 0,
4457 vect_prologue);
4458 }
4459
4460 /* Determine cost of epilogue code.
4461
4462 We have a reduction operator that will reduce the vector in one statement.
4463 Also requires scalar extract. */
4464
4465 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4466 {
4467 if (reduc_fn != IFN_LAST)
4468 {
4469 if (reduction_type == COND_REDUCTION)
4470 {
4471 /* An EQ stmt and a COND_EXPR stmt. */
4472 epilogue_cost += record_stmt_cost (cost_vec, 2,
4473 vector_stmt, stmt_info, 0,
4474 vect_epilogue);
4475 /* Reduction of the max index and a reduction of the found
4476 values. */
4477 epilogue_cost += record_stmt_cost (cost_vec, 2,
4478 vec_to_scalar, stmt_info, 0,
4479 vect_epilogue);
4480 /* A broadcast of the max value. */
4481 epilogue_cost += record_stmt_cost (cost_vec, 1,
4482 scalar_to_vec, stmt_info, 0,
4483 vect_epilogue);
4484 }
4485 else
4486 {
4487 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4488 stmt_info, 0, vect_epilogue);
4489 epilogue_cost += record_stmt_cost (cost_vec, 1,
4490 vec_to_scalar, stmt_info, 0,
4491 vect_epilogue);
4492 }
4493 }
4494 else if (reduction_type == COND_REDUCTION)
4495 {
4496 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4497 /* Extraction of scalar elements. */
4498 epilogue_cost += record_stmt_cost (cost_vec,
4499 2 * estimated_nunits,
4500 vec_to_scalar, stmt_info, 0,
4501 vect_epilogue);
4502 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4503 epilogue_cost += record_stmt_cost (cost_vec,
4504 2 * estimated_nunits - 3,
4505 scalar_stmt, stmt_info, 0,
4506 vect_epilogue);
4507 }
4508 else if (reduction_type == EXTRACT_LAST_REDUCTION
4509 || reduction_type == FOLD_LEFT_REDUCTION)
4510 /* No extra instructions needed in the epilogue. */
4511 ;
4512 else
4513 {
4514 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4515 tree bitsize =
4516 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4517 int element_bitsize = tree_to_uhwi (bitsize);
4518 int nelements = vec_size_in_bits / element_bitsize;
4519
4520 if (code == COND_EXPR)
4521 code = MAX_EXPR;
4522
4523 optab = optab_for_tree_code (code, vectype, optab_default);
4524
4525 /* We have a whole vector shift available. */
4526 if (optab != unknown_optab
4527 && VECTOR_MODE_P (mode)
4528 && optab_handler (optab, mode) != CODE_FOR_nothing
4529 && have_whole_vector_shift (mode))
4530 {
4531 /* Final reduction via vector shifts and the reduction operator.
4532 Also requires scalar extract. */
4533 epilogue_cost += record_stmt_cost (cost_vec,
4534 exact_log2 (nelements) * 2,
4535 vector_stmt, stmt_info, 0,
4536 vect_epilogue);
4537 epilogue_cost += record_stmt_cost (cost_vec, 1,
4538 vec_to_scalar, stmt_info, 0,
4539 vect_epilogue);
4540 }
4541 else
4542 /* Use extracts and reduction op for final reduction. For N
4543 elements, we have N extracts and N-1 reduction ops. */
4544 epilogue_cost += record_stmt_cost (cost_vec,
4545 nelements + nelements - 1,
4546 vector_stmt, stmt_info, 0,
4547 vect_epilogue);
4548 }
4549 }
4550
4551 if (dump_enabled_p ())
4552 dump_printf (MSG_NOTE,
4553 "vect_model_reduction_cost: inside_cost = %d, "
4554 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4555 prologue_cost, epilogue_cost);
4556 }
4557
4558 /* SEQ is a sequence of instructions that initialize the reduction
4559 described by REDUC_INFO. Emit them in the appropriate place. */
4560
4561 static void
4562 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
4563 stmt_vec_info reduc_info, gimple *seq)
4564 {
4565 if (reduc_info->reused_accumulator)
4566 {
4567 /* When reusing an accumulator from the main loop, we only need
4568 initialization instructions if the main loop can be skipped.
4569 In that case, emit the initialization instructions at the end
4570 of the guard block that does the skip. */
4571 edge skip_edge = loop_vinfo->skip_main_loop_edge;
4572 gcc_assert (skip_edge);
4573 gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
4574 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
4575 }
4576 else
4577 {
4578 /* The normal case: emit the initialization instructions on the
4579 preheader edge. */
4580 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4581 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
4582 }
4583 }
4584
4585 /* Function get_initial_def_for_reduction
4586
4587 Input:
4588 REDUC_INFO - the info_for_reduction
4589 INIT_VAL - the initial value of the reduction variable
4590 NEUTRAL_OP - a value that has no effect on the reduction, as per
4591 neutral_op_for_reduction
4592
4593 Output:
4594 Return a vector variable, initialized according to the reduction
4595 described by REDUC_INFO. This vector will be used as the initial value
4596 of the vector of partial results.
4597
4598 The value we need is a vector in which element 0 has value INIT_VAL
4599 and every other element has value NEUTRAL_OP. */
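/* For example (hypothetical values): a PLUS_EXPR reduction of ints with
   INIT_VAL 5 and a four-element vector type yields { 5, 0, 0, 0 }, while a
   MIN_EXPR reduction, whose neutral value is the initial value itself,
   yields the splat { 5, 5, 5, 5 }.  */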
4600
4601 static tree
4602 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4603 stmt_vec_info reduc_info,
4604 tree init_val, tree neutral_op)
4605 {
4606 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4607 tree scalar_type = TREE_TYPE (init_val);
4608 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4609 tree init_def;
4610 gimple_seq stmts = NULL;
4611
4612 gcc_assert (vectype);
4613
4614 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4615 || SCALAR_FLOAT_TYPE_P (scalar_type));
4616
4617 gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
4618 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
4619
4620 if (operand_equal_p (init_val, neutral_op))
4621 {
4622 /* If both elements are equal then the vector described above is
4623 just a splat. */
4624 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4625 init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
4626 }
4627 else
4628 {
4629 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4630 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4631 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4632 {
4633 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
4634 element 0. */
4635 init_def = gimple_build_vector_from_val (&stmts, vectype,
4636 neutral_op);
4637 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4638 vectype, init_def, init_val);
4639 }
4640 else
4641 {
4642 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
4643 tree_vector_builder elts (vectype, 1, 2);
4644 elts.quick_push (init_val);
4645 elts.quick_push (neutral_op);
4646 init_def = gimple_build_vector (&stmts, &elts);
4647 }
4648 }
4649
4650 if (stmts)
4651 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
4652 return init_def;
4653 }
4654
4655 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
4656 which performs a reduction involving GROUP_SIZE scalar statements.
4657 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
4658 is nonnull, introducing extra elements of that value will not change the
4659 result. */
4660
4661 static void
4662 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
4663 stmt_vec_info reduc_info,
4664 vec<tree> *vec_oprnds,
4665 unsigned int number_of_vectors,
4666 unsigned int group_size, tree neutral_op)
4667 {
4668 vec<tree> &initial_values = reduc_info->reduc_initial_values;
4669 unsigned HOST_WIDE_INT nunits;
4670 unsigned j, number_of_places_left_in_vector;
4671 tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
4672 unsigned int i;
4673
4674 gcc_assert (group_size == initial_values.length () || neutral_op);
4675
4676 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4677 created vectors. It is greater than 1 if unrolling is performed.
4678
4679 For example, we have two scalar operands, s1 and s2 (e.g., group of
4680 strided accesses of size two), while NUNITS is four (i.e., four scalars
4681 of this type can be packed in a vector). The output vector will contain
4682 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4683 will be 2).
4684
4685 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4686 vectors containing the operands.
4687
4688 For example, NUNITS is four as before, and the group size is 8
4689 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4690 {s5, s6, s7, s8}. */
4691
4692 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4693 nunits = group_size;
4694
4695 number_of_places_left_in_vector = nunits;
4696 bool constant_p = true;
4697 tree_vector_builder elts (vector_type, nunits, 1);
4698 elts.quick_grow (nunits);
4699 gimple_seq ctor_seq = NULL;
4700 for (j = 0; j < nunits * number_of_vectors; ++j)
4701 {
4702 tree op;
4703 i = j % group_size;
4704
4705 /* Get the def before the loop. In a reduction chain we have only
4706 one initial value. Otherwise we have as many as there are PHIs in the group. */
4707 if (i >= initial_values.length () || (j > i && neutral_op))
4708 op = neutral_op;
4709 else
4710 op = initial_values[i];
4711
4712 /* Create 'vect_ = {op0,op1,...,opn}'. */
4713 number_of_places_left_in_vector--;
4714 elts[nunits - number_of_places_left_in_vector - 1] = op;
4715 if (!CONSTANT_CLASS_P (op))
4716 constant_p = false;
4717
4718 if (number_of_places_left_in_vector == 0)
4719 {
4720 tree init;
4721 if (constant_p && !neutral_op
4722 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4723 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4724 /* Build the vector directly from ELTS. */
4725 init = gimple_build_vector (&ctor_seq, &elts);
4726 else if (neutral_op)
4727 {
4728 /* Build a vector of the neutral value and shift the
4729 other elements into place. */
4730 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4731 neutral_op);
4732 int k = nunits;
4733 while (k > 0 && elts[k - 1] == neutral_op)
4734 k -= 1;
4735 while (k > 0)
4736 {
4737 k -= 1;
4738 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4739 vector_type, init, elts[k]);
4740 }
4741 }
4742 else
4743 {
4744 /* First time round, duplicate ELTS to fill the
4745 required number of vectors. */
4746 duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
4747 elts, number_of_vectors, *vec_oprnds);
4748 break;
4749 }
4750 vec_oprnds->quick_push (init);
4751
4752 number_of_places_left_in_vector = nunits;
4753 elts.new_vector (vector_type, nunits, 1);
4754 elts.quick_grow (nunits);
4755 constant_p = true;
4756 }
4757 }
4758 if (ctor_seq != NULL)
4759 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
4760 }
4761
4762 /* For a statement STMT_INFO taking part in a reduction operation return
4763 the stmt_vec_info the meta information is stored on. */
4764
4765 stmt_vec_info
4766 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
4767 {
4768 stmt_info = vect_orig_stmt (stmt_info);
4769 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4770 if (!is_a <gphi *> (stmt_info->stmt)
4771 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
4772 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4773 gphi *phi = as_a <gphi *> (stmt_info->stmt);
4774 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4775 {
4776 if (gimple_phi_num_args (phi) == 1)
4777 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4778 }
4779 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4780 {
4781 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
4782 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4783 stmt_info = info;
4784 }
4785 return stmt_info;
4786 }
4787
4788 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
4789 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
4790 return false. */
4791
4792 static bool
4793 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
4794 stmt_vec_info reduc_info)
4795 {
4796 loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
4797 if (!main_loop_vinfo)
4798 return false;
4799
4800 if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
4801 return false;
4802
4803 unsigned int num_phis = reduc_info->reduc_initial_values.length ();
4804 auto_vec<tree, 16> main_loop_results (num_phis);
4805 auto_vec<tree, 16> initial_values (num_phis);
4806 if (edge main_loop_edge = loop_vinfo->main_loop_edge)
4807 {
4808 /* The epilogue loop can be entered either from the main loop or
4809 from an earlier guard block. */
4810 edge skip_edge = loop_vinfo->skip_main_loop_edge;
4811 for (tree incoming_value : reduc_info->reduc_initial_values)
4812 {
4813 /* Look for:
4814
4815 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
4816 INITIAL_VALUE(guard block)>. */
4817 gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
4818
4819 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
4820 gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
4821
4822 tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
4823 tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
4824
4825 main_loop_results.quick_push (from_main_loop);
4826 initial_values.quick_push (from_skip);
4827 }
4828 }
4829 else
4830 /* The main loop dominates the epilogue loop. */
4831 main_loop_results.splice (reduc_info->reduc_initial_values);
4832
4833 /* See if the main loop has the kind of accumulator we need. */
4834 vect_reusable_accumulator *accumulator
4835 = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
4836 if (!accumulator
4837 || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
4838 || !std::equal (main_loop_results.begin (), main_loop_results.end (),
4839 accumulator->reduc_info->reduc_scalar_results.begin ()))
4840 return false;
4841
4842 /* Handle the case where we can reduce wider vectors to narrower ones. */
4843 tree vectype = STMT_VINFO_VECTYPE (reduc_info);
4844 tree old_vectype = TREE_TYPE (accumulator->reduc_input);
4845 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
4846 TYPE_VECTOR_SUBPARTS (vectype)))
4847 return false;
4848
4849 /* Non-SLP reductions might apply an adjustment after the reduction
4850 operation, in order to simplify the initialization of the accumulator.
4851 If the epilogue loop carries on from where the main loop left off,
4852 it should apply the same adjustment to the final reduction result.
4853
4854 If the epilogue loop can also be entered directly (rather than via
4855 the main loop), we need to be able to handle that case in the same way,
4856 with the same adjustment. (In principle we could add a PHI node
4857 to select the correct adjustment, but in practice that shouldn't be
4858 necessary.) */
4859 tree main_adjustment
4860 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
4861 if (loop_vinfo->main_loop_edge && main_adjustment)
4862 {
4863 gcc_assert (num_phis == 1);
4864 tree initial_value = initial_values[0];
4865 /* Check that we can use INITIAL_VALUE as the adjustment and
4866 initialize the accumulator with a neutral value instead. */
4867 if (!operand_equal_p (initial_value, main_adjustment))
4868 return false;
4869 tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
4870 initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
4871 code, initial_value);
4872 }
4873 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
4874 reduc_info->reduc_initial_values.truncate (0);
4875 reduc_info->reduc_initial_values.splice (initial_values);
4876 reduc_info->reused_accumulator = accumulator;
4877 return true;
4878 }
4879
4880 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
4881 CODE, emitting the stmts into SEQ. Returns a vector def of VECTYPE. */
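/* For example (illustrative), reducing a V8SI {a0, ..., a7} down to V4SI
   with CODE = PLUS_EXPR performs one halving step:
     dst1 = {a0, a1, a2, a3}     (low half)
     dst2 = {a4, a5, a6, a7}     (high half)
     new_temp = {a0+a4, a1+a5, a2+a6, a3+a7}
   and repeats the step while the result is still wider than VECTYPE.  */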
4882
4883 static tree
4884 vect_create_partial_epilog (tree vec_def, tree vectype, enum tree_code code,
4885 gimple_seq *seq)
4886 {
4887 unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
4888 unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
4889 tree stype = TREE_TYPE (vectype);
4890 tree new_temp = vec_def;
4891 while (nunits > nunits1)
4892 {
4893 nunits /= 2;
4894 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
4895 stype, nunits);
4896 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
4897
4898 /* The target has to make sure we support lowpart/highpart
4899 extraction, either via direct vector extract or through
4900 an integer mode punning. */
4901 tree dst1, dst2;
4902 gimple *epilog_stmt;
4903 if (convert_optab_handler (vec_extract_optab,
4904 TYPE_MODE (TREE_TYPE (new_temp)),
4905 TYPE_MODE (vectype1))
4906 != CODE_FOR_nothing)
4907 {
4908 /* Extract sub-vectors directly once vec_extract becomes
4909 a conversion optab. */
4910 dst1 = make_ssa_name (vectype1);
4911 epilog_stmt
4912 = gimple_build_assign (dst1, BIT_FIELD_REF,
4913 build3 (BIT_FIELD_REF, vectype1,
4914 new_temp, TYPE_SIZE (vectype1),
4915 bitsize_int (0)));
4916 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
4917 dst2 = make_ssa_name (vectype1);
4918 epilog_stmt
4919 = gimple_build_assign (dst2, BIT_FIELD_REF,
4920 build3 (BIT_FIELD_REF, vectype1,
4921 new_temp, TYPE_SIZE (vectype1),
4922 bitsize_int (bitsize)));
4923 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
4924 }
4925 else
4926 {
4927 /* Extract via punning to an appropriately sized integer mode
4928 vector. */
4929 tree eltype = build_nonstandard_integer_type (bitsize, 1);
4930 tree etype = build_vector_type (eltype, 2);
4931 gcc_assert (convert_optab_handler (vec_extract_optab,
4932 TYPE_MODE (etype),
4933 TYPE_MODE (eltype))
4934 != CODE_FOR_nothing);
4935 tree tem = make_ssa_name (etype);
4936 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
4937 build1 (VIEW_CONVERT_EXPR,
4938 etype, new_temp));
4939 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
4940 new_temp = tem;
4941 tem = make_ssa_name (eltype);
4942 epilog_stmt
4943 = gimple_build_assign (tem, BIT_FIELD_REF,
4944 build3 (BIT_FIELD_REF, eltype,
4945 new_temp, TYPE_SIZE (eltype),
4946 bitsize_int (0)));
4947 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
4948 dst1 = make_ssa_name (vectype1);
4949 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
4950 build1 (VIEW_CONVERT_EXPR,
4951 vectype1, tem));
4952 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
4953 tem = make_ssa_name (eltype);
4954 epilog_stmt
4955 = gimple_build_assign (tem, BIT_FIELD_REF,
4956 build3 (BIT_FIELD_REF, eltype,
4957 new_temp, TYPE_SIZE (eltype),
4958 bitsize_int (bitsize)));
4959 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
4960 dst2 = make_ssa_name (vectype1);
4961 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
4962 build1 (VIEW_CONVERT_EXPR,
4963 vectype1, tem));
4964 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
4965 }
4966
4967 new_temp = make_ssa_name (vectype1);
4968 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
4969 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
4970 }
4971
4972 return new_temp;
4973 }
4974
4975 /* Function vect_create_epilog_for_reduction
4976
4977 Create code at the loop-epilog to finalize the result of a reduction
4978 computation.
4979
4980 STMT_INFO is the scalar reduction stmt that is being vectorized.
4981 SLP_NODE is an SLP node containing a group of reduction statements. The
4982 first one in this group is STMT_INFO.
4983 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
4984 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
4985 (counting from 0)
4986
4987 This function:
4988 1. Completes the reduction def-use cycles.
4989 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4990 by calling the function specified by REDUC_FN if available, or by
4991 other means (whole-vector shifts or a scalar loop).
4992 The function also creates a new phi node at the loop exit to preserve
4993 loop-closed form, as illustrated below.
4994
4995 The flow at the entry to this function:
4996
4997 loop:
4998 vec_def = phi <vec_init, null> # REDUCTION_PHI
4999 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5000 s_loop = scalar_stmt # (scalar) STMT_INFO
5001 loop_exit:
5002 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5003 use <s_out0>
5004 use <s_out0>
5005
5006 The above is transformed by this function into:
5007
5008 loop:
5009 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5010 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5011 s_loop = scalar_stmt # (scalar) STMT_INFO
5012 loop_exit:
5013 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5014 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5015 v_out2 = reduce <v_out1>
5016 s_out3 = extract_field <v_out2, 0>
5017 s_out4 = adjust_result <s_out3>
5018 use <s_out4>
5019 use <s_out4>
5020 */
5021
5022 static void
5023 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5024 stmt_vec_info stmt_info,
5025 slp_tree slp_node,
5026 slp_instance slp_node_instance)
5027 {
5028 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5029 gcc_assert (reduc_info->is_reduc_info);
5030 /* For double reductions we need to get at the inner loop reduction
5031 stmt which has the meta info attached. Our stmt_info is that of the
5032 loop-closed PHI of the inner loop which we remember as
5033 def for the reduction PHI generation. */
5034 bool double_reduc = false;
5035 stmt_vec_info rdef_info = stmt_info;
5036 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5037 {
5038 gcc_assert (!slp_node);
5039 double_reduc = true;
5040 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5041 (stmt_info->stmt, 0));
5042 stmt_info = vect_stmt_to_vectorize (stmt_info);
5043 }
5044 gphi *reduc_def_stmt
5045 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5046 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
5047 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5048 tree vectype;
5049 machine_mode mode;
5050 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5051 basic_block exit_bb;
5052 tree scalar_dest;
5053 tree scalar_type;
5054 gimple *new_phi = NULL, *phi;
5055 gimple_stmt_iterator exit_gsi;
5056 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5057 gimple *epilog_stmt = NULL;
5058 gimple *exit_phi;
5059 tree bitsize;
5060 tree def;
5061 tree orig_name, scalar_result;
5062 imm_use_iterator imm_iter, phi_imm_iter;
5063 use_operand_p use_p, phi_use_p;
5064 gimple *use_stmt;
5065 auto_vec<tree> reduc_inputs;
5066 int j, i;
5067 vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5068 unsigned int group_size = 1, k;
5069 auto_vec<gimple *> phis;
5070 /* SLP reduction without reduction chain, e.g.,
5071 # a1 = phi <a2, a0>
5072 # b1 = phi <b2, b0>
5073 a2 = operation (a1)
5074 b2 = operation (b1) */
5075 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5076 bool direct_slp_reduc;
5077 tree induction_index = NULL_TREE;
5078
5079 if (slp_node)
5080 group_size = SLP_TREE_LANES (slp_node);
5081
5082 if (nested_in_vect_loop_p (loop, stmt_info))
5083 {
5084 outer_loop = loop;
5085 loop = loop->inner;
5086 gcc_assert (!slp_node && double_reduc);
5087 }
5088
5089 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5090 gcc_assert (vectype);
5091 mode = TYPE_MODE (vectype);
5092
5093 tree induc_val = NULL_TREE;
5094 tree adjustment_def = NULL;
5095 if (slp_node)
5096 ;
5097 else
5098 {
5099 /* Optimize: for induction condition reduction, if we can't use zero
5100 for induc_val, use initial_def. */
5101 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5102 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5103 else if (double_reduc)
5104 ;
5105 else
5106 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5107 }
5108
5109 stmt_vec_info single_live_out_stmt[] = { stmt_info };
5110 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5111 if (slp_reduc)
5112 /* All statements produce live-out values. */
5113 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5114 else if (slp_node)
5115 /* The last statement in the reduction chain produces the live-out
5116 value. */
5117 single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5118
5119 unsigned vec_num;
5120 int ncopies;
5121 if (slp_node)
5122 {
5123 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
5124 ncopies = 1;
5125 }
5126 else
5127 {
5128 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5129 vec_num = 1;
5130 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5131 }
5132
5133 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5134 which is updated with the current index of the loop for every match of
5135 the original loop's cond_expr (VEC_STMT). This results in a vector
5136 containing the last time the condition passed for that vector lane.
5137 The first match will be a 1 to allow 0 to be used for non-matching
5138 indexes. If there are no matches at all then the vector will be all
5139 zeroes.
5140
5141 PR92772: This algorithm is broken for architectures that support
5142 masked vectors, but do not provide fold_extract_last. */
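  /* Illustrative example (not from the source): for
       if (a[i] < val) last = a[i];
     with a vectorization factor of 4, SERIES_VECT starts as {1, 2, 3, 4}
     and STEP is 4.  After the loop each lane of the index vector holds
     the 1-based scalar iteration number of the last time that lane's
     condition matched, or 0 if it never matched.  */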
5143 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
5144 {
5145 auto_vec<std::pair<tree, bool>, 2> ccompares;
5146 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
5147 cond_info = vect_stmt_to_vectorize (cond_info);
5148 while (cond_info != reduc_info)
5149 {
5150 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5151 {
5152 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
5153 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5154 ccompares.safe_push
5155 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
5156 STMT_VINFO_REDUC_IDX (cond_info) == 2));
5157 }
5158 cond_info
5159 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
5160 1 + STMT_VINFO_REDUC_IDX
5161 (cond_info)));
5162 cond_info = vect_stmt_to_vectorize (cond_info);
5163 }
5164 gcc_assert (ccompares.length () != 0);
5165
5166 tree indx_before_incr, indx_after_incr;
5167 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5168 int scalar_precision
5169 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5170 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5171 tree cr_index_vector_type = get_related_vectype_for_scalar_type
5172 (TYPE_MODE (vectype), cr_index_scalar_type,
5173 TYPE_VECTOR_SUBPARTS (vectype));
5174
5175 /* First we create a simple vector induction variable which starts
5176 with the values {1,2,3,...} (SERIES_VECT) and increments by the
5177 vector size (STEP). */
5178
5179 /* Create a {1,2,3,...} vector. */
5180 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5181
5182 /* Create a vector of the step value. */
5183 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5184 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5185
5186 /* Create an induction variable. */
5187 gimple_stmt_iterator incr_gsi;
5188 bool insert_after;
5189 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5190 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
5191 insert_after, &indx_before_incr, &indx_after_incr);
5192
5193 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5194 filled with zeros (VEC_ZERO). */
5195
5196 /* Create a vector of 0s. */
5197 tree zero = build_zero_cst (cr_index_scalar_type);
5198 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5199
5200 /* Create a vector phi node. */
5201 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5202 new_phi = create_phi_node (new_phi_tree, loop->header);
5203 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5204 loop_preheader_edge (loop), UNKNOWN_LOCATION);
5205
5206 /* Now take the condition from the loop's original cond_exprs
5207 and produce a new cond_expr (INDEX_COND_EXPR) which for
5208 every match uses values from the induction variable
5209 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5210 (NEW_PHI_TREE).
5211 Finally, we update the phi (NEW_PHI_TREE) to take the value of
5212 the new cond_expr (INDEX_COND_EXPR). */
5213 gimple_seq stmts = NULL;
5214 for (int i = ccompares.length () - 1; i != -1; --i)
5215 {
5216 tree ccompare = ccompares[i].first;
5217 if (ccompares[i].second)
5218 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5219 cr_index_vector_type,
5220 ccompare,
5221 indx_before_incr, new_phi_tree);
5222 else
5223 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5224 cr_index_vector_type,
5225 ccompare,
5226 new_phi_tree, indx_before_incr);
5227 }
5228 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5229
5230 /* Update the phi with the vec cond. */
5231 induction_index = new_phi_tree;
5232 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5233 loop_latch_edge (loop), UNKNOWN_LOCATION);
5234 }
5235
5236 /* 2. Create epilog code.
5237 The reduction epilog code operates across the elements of the vector
5238 of partial results computed by the vectorized loop.
5239 The reduction epilog code consists of:
5240
5241 step 1: compute the scalar result in a vector (v_out2)
5242 step 2: extract the scalar result (s_out3) from the vector (v_out2)
5243 step 3: adjust the scalar result (s_out3) if needed.
5244
5245 Step 1 can be accomplished using one of the following three schemes:
5246 (scheme 1) using reduc_fn, if available.
5247 (scheme 2) using whole-vector shifts, if available.
5248 (scheme 3) using a scalar loop. In this case steps 1+2 above are
5249 combined.
5250
5251 The overall epilog code looks like this:
5252
5253 s_out0 = phi <s_loop> # original EXIT_PHI
5254 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5255 v_out2 = reduce <v_out1> # step 1
5256 s_out3 = extract_field <v_out2, 0> # step 2
5257 s_out4 = adjust_result <s_out3> # step 3
5258
5259 (step 3 is optional, and steps 1 and 2 may be combined).
5260 Lastly, the uses of s_out0 are replaced by s_out4. */
5261
5262
5263 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5264 v_out1 = phi <VECT_DEF>
5265 Store them in NEW_PHIS. */
5266 if (double_reduc)
5267 loop = outer_loop;
5268 exit_bb = single_exit (loop)->dest;
5269 exit_gsi = gsi_after_labels (exit_bb);
5270 reduc_inputs.create (slp_node ? vec_num : ncopies);
5271 for (unsigned i = 0; i < vec_num; i++)
5272 {
5273 gimple_seq stmts = NULL;
5274 if (slp_node)
5275 def = vect_get_slp_vect_def (slp_node, i);
5276 else
5277 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
5278 for (j = 0; j < ncopies; j++)
5279 {
5280 tree new_def = copy_ssa_name (def);
5281 phi = create_phi_node (new_def, exit_bb);
5282 if (j)
5283 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
5284 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5285 new_def = gimple_convert (&stmts, vectype, new_def);
5286 reduc_inputs.quick_push (new_def);
5287 }
5288 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5289 }
5290
5291 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5292 (i.e. when reduc_fn is not available) and in the final adjustment
5293 code (if needed). Also get the original scalar reduction variable as
5294 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
5295 represents a reduction pattern), the tree-code and scalar-def are
5296 taken from the original stmt that the pattern-stmt (STMT) replaces.
5297 Otherwise (it is a regular reduction) - the tree-code and scalar-def
5298 are taken from STMT. */
5299
5300 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5301 if (orig_stmt_info != stmt_info)
5302 {
5303 /* Reduction pattern */
5304 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5305 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
5306 }
5307
5308 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
5309 scalar_type = TREE_TYPE (scalar_dest);
5310 scalar_results.create (group_size);
5311 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5312 bitsize = TYPE_SIZE (scalar_type);
5313
5314 /* True if we should implement SLP_REDUC using native reduction operations
5315 instead of scalar operations. */
5316 direct_slp_reduc = (reduc_fn != IFN_LAST
5317 && slp_reduc
5318 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5319
5320 /* In case of reduction chain, e.g.,
5321 # a1 = phi <a3, a0>
5322 a2 = operation (a1)
5323 a3 = operation (a2),
5324
5325 we may end up with more than one vector result. Here we reduce them
5326 to one vector.
5327
5328 The same is true if we couldn't use a single def-use cycle. */
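  /* For instance (illustrative), with ncopies == 2 and CODE = PLUS_EXPR,
     REDUC_INPUTS {v0, v1} are first combined into the single vector
     v0 + v1, which the epilogue code below then reduces to a scalar.  */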
5329 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
5330 || direct_slp_reduc
5331 || ncopies > 1)
5332 {
5333 gimple_seq stmts = NULL;
5334 tree single_input = reduc_inputs[0];
5335 for (k = 1; k < reduc_inputs.length (); k++)
5336 single_input = gimple_build (&stmts, code, vectype,
5337 single_input, reduc_inputs[k]);
5338 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5339
5340 reduc_inputs.truncate (0);
5341 reduc_inputs.safe_push (single_input);
5342 }
5343
5344 tree orig_reduc_input = reduc_inputs[0];
5345
5346 /* If this loop is an epilogue loop that can be skipped after the
5347 main loop, we can only share a reduction operation between the
5348 main loop and the epilogue if we put it at the target of the
5349 skip edge.
5350
5351 We can still reuse accumulators if this check fails. Doing so has
5352 the minor(?) benefit of making the epilogue loop's scalar result
5353 independent of the main loop's scalar result. */
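  /* Illustrative sketch: the epilogue loop's exit block falls through to
     the block targeted by SKIP_THIS_LOOP_EDGE.  The new PHI created there
     selects the epilogue's vector result when the epilogue ran and the
     main loop's accumulator when it was skipped, so the final reduction
     code below is emitted only once, at that join block.  */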
5354 bool unify_with_main_loop_p = false;
5355 if (reduc_info->reused_accumulator
5356 && loop_vinfo->skip_this_loop_edge
5357 && single_succ_p (exit_bb)
5358 && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
5359 {
5360 unify_with_main_loop_p = true;
5361
5362 basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
5363 reduc_inputs[0] = make_ssa_name (vectype);
5364 gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
5365 add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
5366 UNKNOWN_LOCATION);
5367 add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
5368 loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
5369 exit_gsi = gsi_after_labels (reduc_block);
5370 }
5371
5372 /* Shouldn't be used beyond this point. */
5373 exit_bb = nullptr;
5374
5375 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5376 && reduc_fn != IFN_LAST)
5377 {
5378 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
5379 various data values where the condition matched and another vector
5380 (INDUCTION_INDEX) containing all the indexes of those matches. We
5381 need to extract the last matching index (which will be the index with
5382 highest value) and use this to index into the data vector.
5383 For the case where there were no matches, the data vector will contain
5384 all default values and the index vector will be all zeros. */
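  /* Worked example (illustrative): with REDUC_INPUTS[0] = {0, 7, 0, 9}
     and INDUCTION_INDEX = {0, 2, 0, 4}, IFN_REDUC_MAX yields max index 4,
     the comparison selects lane 3, the VEC_COND produces {0, 0, 0, 9} and
     the final unsigned MAX reduction extracts the value 9.  */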
5385
5386 /* Get various versions of the type of the vector of indexes. */
5387 tree index_vec_type = TREE_TYPE (induction_index);
5388 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5389 tree index_scalar_type = TREE_TYPE (index_vec_type);
5390 tree index_vec_cmp_type = truth_type_for (index_vec_type);
5391
5392 /* Get an unsigned integer version of the type of the data vector. */
5393 int scalar_precision
5394 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5395 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5396 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
5397 vectype);
5398
5399 /* First we need to create a vector (ZERO_VEC) of zeros and another
5400 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5401 can create using a MAX reduction and then expanding.
5402 In the case where the loop never made any matches, the max index will
5403 be zero. */
5404
5405 /* Vector of {0, 0, 0,...}. */
5406 tree zero_vec = build_zero_cst (vectype);
5407
5408 /* Find maximum value from the vector of found indexes. */
5409 tree max_index = make_ssa_name (index_scalar_type);
5410 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5411 1, induction_index);
5412 gimple_call_set_lhs (max_index_stmt, max_index);
5413 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5414
5415 /* Vector of {max_index, max_index, max_index,...}. */
5416 tree max_index_vec = make_ssa_name (index_vec_type);
5417 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5418 max_index);
5419 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5420 max_index_vec_rhs);
5421 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5422
5423 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5424 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5425 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
5426 otherwise. Only one value should match, resulting in a vector
5427 (VEC_COND) with one data value and the rest zeros.
5428 In the case where the loop never made any matches, every index will
5429 match, resulting in a vector with all data values (which will all be
5430 the default value). */
5431
5432 /* Compare the max index vector to the vector of found indexes to find
5433 the position of the max value. */
5434 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5435 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5436 induction_index,
5437 max_index_vec);
5438 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5439
5440 /* Use the compare to choose either values from the data vector or
5441 zero. */
5442 tree vec_cond = make_ssa_name (vectype);
5443 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5444 vec_compare,
5445 reduc_inputs[0],
5446 zero_vec);
5447 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5448
5449 /* Finally we need to extract the data value from the vector (VEC_COND)
5450 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5451 reduction, but because this doesn't exist, we can use a MAX reduction
5452 instead. The data value might be signed or a float so we need to cast
5453 it first.
5454 In the case where the loop never made any matches, the data values are
5455 all identical, and so will reduce down correctly. */
5456
5457 /* Make the matched data values unsigned. */
5458 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5459 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5460 vec_cond);
5461 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5462 VIEW_CONVERT_EXPR,
5463 vec_cond_cast_rhs);
5464 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5465
5466 /* Reduce down to a scalar value. */
5467 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5468 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5469 1, vec_cond_cast);
5470 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5471 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5472
5473 /* Convert the reduced value back to the result type and set as the
5474 result. */
5475 gimple_seq stmts = NULL;
5476 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5477 data_reduc);
5478 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5479 scalar_results.safe_push (new_temp);
5480 }
5481 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5482 && reduc_fn == IFN_LAST)
5483 {
5484 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5485 idx = 0;
5486 idx_val = induction_index[0];
5487 val = data_reduc[0];
5488 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5489 if (induction_index[i] > idx_val)
5490 val = data_reduc[i], idx_val = induction_index[i];
5491 return val; */
5492
5493 tree data_eltype = TREE_TYPE (vectype);
5494 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5495 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5496 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5497 /* Enforced by vectorizable_reduction, which ensures we have target
5498 support before allowing a conditional reduction on variable-length
5499 vectors. */
5500 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5501 tree idx_val = NULL_TREE, val = NULL_TREE;
5502 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5503 {
5504 tree old_idx_val = idx_val;
5505 tree old_val = val;
5506 idx_val = make_ssa_name (idx_eltype);
5507 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5508 build3 (BIT_FIELD_REF, idx_eltype,
5509 induction_index,
5510 bitsize_int (el_size),
5511 bitsize_int (off)));
5512 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5513 val = make_ssa_name (data_eltype);
5514 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5515 build3 (BIT_FIELD_REF,
5516 data_eltype,
5517 reduc_inputs[0],
5518 bitsize_int (el_size),
5519 bitsize_int (off)));
5520 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5521 if (off != 0)
5522 {
5523 tree new_idx_val = idx_val;
5524 if (off != v_size - el_size)
5525 {
5526 new_idx_val = make_ssa_name (idx_eltype);
5527 epilog_stmt = gimple_build_assign (new_idx_val,
5528 MAX_EXPR, idx_val,
5529 old_idx_val);
5530 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5531 }
5532 tree new_val = make_ssa_name (data_eltype);
5533 epilog_stmt = gimple_build_assign (new_val,
5534 COND_EXPR,
5535 build2 (GT_EXPR,
5536 boolean_type_node,
5537 idx_val,
5538 old_idx_val),
5539 val, old_val);
5540 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5541 idx_val = new_idx_val;
5542 val = new_val;
5543 }
5544 }
5545 /* Convert the reduced value back to the result type and set as the
5546 result. */
5547 gimple_seq stmts = NULL;
5548 val = gimple_convert (&stmts, scalar_type, val);
5549 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5550 scalar_results.safe_push (val);
5551 }
5552
5553 /* 2.3 Create the reduction code, using one of the three schemes described
5554 above. In SLP we simply need to extract all the elements from the
5555 vector (without reducing them), so we use scalar shifts. */
5556 else if (reduc_fn != IFN_LAST && !slp_reduc)
5557 {
5558 tree tmp;
5559 tree vec_elem_type;
5560
5561 /* Case 1: Create:
5562 v_out2 = reduc_expr <v_out1> */
5563
5564 if (dump_enabled_p ())
5565 dump_printf_loc (MSG_NOTE, vect_location,
5566 "Reduce using direct vector reduction.\n");
5567
5568 gimple_seq stmts = NULL;
5569 vec_elem_type = TREE_TYPE (vectype);
5570 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5571 vec_elem_type, reduc_inputs[0]);
5572 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5573 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5574
5575 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5576 && induc_val)
5577 {
5578 /* Earlier we set the initial value to be a vector of INDUC_VAL
5579 values. Check the result and if it is INDUC_VAL then replace
5580 it with the original initial value, unless INDUC_VAL is
5581 already the same as INITIAL_DEF. */
5582 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5583 induc_val);
5584 tree initial_def = reduc_info->reduc_initial_values[0];
5585
5586 tmp = make_ssa_name (new_scalar_dest);
5587 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5588 initial_def, new_temp);
5589 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5590 new_temp = tmp;
5591 }
5592
5593 scalar_results.safe_push (new_temp);
5594 }
5595 else if (direct_slp_reduc)
5596 {
5597 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5598 with the elements for other SLP statements replaced with the
5599 neutral value. We can then do a normal reduction on each vector. */
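  /* For example (illustrative), an SLP group of two sum reductions whose
     lanes are interleaved as {a0, b0, a1, b1} uses the masked index
     vector {0, 1, 0, 1}: result 0 selects {a0, 0, a1, 0} and result 1
     selects {0, b0, 0, b1} (0 being the neutral value for a sum), and
     each selected vector is then reduced with a normal REDUC_FN call.  */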
5600
5601 /* Enforced by vectorizable_reduction. */
5602 gcc_assert (reduc_inputs.length () == 1);
5603 gcc_assert (pow2p_hwi (group_size));
5604
5605 gimple_seq seq = NULL;
5606
5607 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5608 and the same element size as VECTYPE. */
5609 tree index = build_index_vector (vectype, 0, 1);
5610 tree index_type = TREE_TYPE (index);
5611 tree index_elt_type = TREE_TYPE (index_type);
5612 tree mask_type = truth_type_for (index_type);
5613
5614 /* Create a vector that, for each element, identifies which of
5615 the REDUC_GROUP_SIZE results should use it. */
5616 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5617 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5618 build_vector_from_val (index_type, index_mask));
5619
5620 /* Get a neutral vector value. This is simply a splat of the neutral
5621 scalar value if we have one, otherwise the initial scalar value
5622 is itself a neutral value. */
5623 tree vector_identity = NULL_TREE;
5624 tree neutral_op = NULL_TREE;
5625 if (slp_node)
5626 {
5627 tree initial_value = NULL_TREE;
5628 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5629 initial_value = reduc_info->reduc_initial_values[0];
5630 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
5631 initial_value);
5632 }
5633 if (neutral_op)
5634 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5635 neutral_op);
5636 for (unsigned int i = 0; i < group_size; ++i)
5637 {
5638 /* If there's no universal neutral value, we can use the
5639 initial scalar value from the original PHI. This is used
5640 for MIN and MAX reduction, for example. */
5641 if (!neutral_op)
5642 {
5643 tree scalar_value = reduc_info->reduc_initial_values[i];
5644 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5645 scalar_value);
5646 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5647 scalar_value);
5648 }
5649
5650 /* Calculate the equivalent of:
5651
5652 sel[j] = (index[j] == i);
5653
5654 which selects the elements of REDUC_INPUTS[0] that should
5655 be included in the result. */
5656 tree compare_val = build_int_cst (index_elt_type, i);
5657 compare_val = build_vector_from_val (index_type, compare_val);
5658 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5659 index, compare_val);
5660
5661 /* Calculate the equivalent of:
5662
5663 vec = sel ? reduc_inputs[0] : vector_identity;
5664
5665 VEC is now suitable for a full vector reduction. */
5666 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5667 sel, reduc_inputs[0], vector_identity);
5668
5669 /* Do the reduction and convert it to the appropriate type. */
5670 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5671 TREE_TYPE (vectype), vec);
5672 scalar = gimple_convert (&seq, scalar_type, scalar);
5673 scalar_results.safe_push (scalar);
5674 }
5675 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5676 }
5677 else
5678 {
5679 bool reduce_with_shift;
5680 tree vec_temp;
5681
5682 gcc_assert (slp_reduc || reduc_inputs.length () == 1);
5683
5684 /* See if the target wants to do the final (shift) reduction
5685 in a vector mode of smaller size and first reduce upper/lower
5686 halves against each other. */
5687 enum machine_mode mode1 = mode;
5688 tree stype = TREE_TYPE (vectype);
5689 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5690 unsigned nunits1 = nunits;
5691 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5692 && reduc_inputs.length () == 1)
5693 {
5694 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5695 /* For SLP reductions we have to make sure the lanes match up, but
5696 since we're doing an individual-element final reduction, reducing
5697 the vector width here is even more important.
5698 ??? We can also separate lanes with permutes; for the common
5699 case of a power-of-two group size, odd/even extracts would work. */
5700 if (slp_reduc && nunits != nunits1)
5701 {
5702 nunits1 = least_common_multiple (nunits1, group_size);
5703 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5704 }
5705 }
5706 if (!slp_reduc
5707 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5708 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5709
5710 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5711 stype, nunits1);
5712 reduce_with_shift = have_whole_vector_shift (mode1);
5713 if (!VECTOR_MODE_P (mode1))
5714 reduce_with_shift = false;
5715 else
5716 {
5717 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5718 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5719 reduce_with_shift = false;
5720 }
5721
5722 /* First reduce the vector to the vector size we should do the
5723 shift reduction on, by combining the upper and lower halves. */
5724 gimple_seq stmts = NULL;
5725 new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
5726 code, &stmts);
5727 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5728 reduc_inputs[0] = new_temp;
5729
5730 if (reduce_with_shift && !slp_reduc)
5731 {
5732 int element_bitsize = tree_to_uhwi (bitsize);
5733 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5734 for variable-length vectors and also requires direct target support
5735 for loop reductions. */
5736 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5737 int nelements = vec_size_in_bits / element_bitsize;
5738 vec_perm_builder sel;
5739 vec_perm_indices indices;
5740
5741 int elt_offset;
5742
5743 tree zero_vec = build_zero_cst (vectype1);
5744 /* Case 2: Create:
5745 for (offset = nelements/2; offset >= 1; offset/=2)
5746 {
5747 Create: va' = vec_shift <va, offset>
5748 Create: va = vop <va, va'>
5749 } */
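	  /* E.g. (illustrative) for a V4SI sum {a0, a1, a2, a3}:
	       shift by 2: {a2, a3, 0, 0}, add -> {a0+a2, a1+a3, _, _}
	       shift by 1: {a1+a3, _, _, _}, add -> {a0+a1+a2+a3, _, _, _}
	     leaving the full sum in element 0, which step 2.4 extracts.  */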
5750
5751 tree rhs;
5752
5753 if (dump_enabled_p ())
5754 dump_printf_loc (MSG_NOTE, vect_location,
5755 "Reduce using vector shifts\n");
5756
5757 gimple_seq stmts = NULL;
5758 new_temp = gimple_convert (&stmts, vectype1, new_temp);
5759 for (elt_offset = nelements / 2;
5760 elt_offset >= 1;
5761 elt_offset /= 2)
5762 {
5763 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5764 indices.new_vector (sel, 2, nelements);
5765 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5766 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5767 new_temp, zero_vec, mask);
5768 new_temp = gimple_build (&stmts, code,
5769 vectype1, new_name, new_temp);
5770 }
5771 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5772
5773 /* 2.4 Extract the final scalar result. Create:
5774 s_out3 = extract_field <v_out2, bitpos> */
5775
5776 if (dump_enabled_p ())
5777 dump_printf_loc (MSG_NOTE, vect_location,
5778 "extract scalar result\n");
5779
5780 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5781 bitsize, bitsize_zero_node);
5782 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5783 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5784 gimple_assign_set_lhs (epilog_stmt, new_temp);
5785 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5786 scalar_results.safe_push (new_temp);
5787 }
5788 else
5789 {
5790 /* Case 3: Create:
5791 s = extract_field <v_out2, 0>
5792 for (offset = element_size;
5793 offset < vector_size;
5794 offset += element_size;)
5795 {
5796 Create: s' = extract_field <v_out2, offset>
5797 Create: s = op <s, s'> // For non SLP cases
5798 } */
5799
5800 if (dump_enabled_p ())
5801 dump_printf_loc (MSG_NOTE, vect_location,
5802 "Reduce using scalar code.\n");
5803
5804 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5805 int element_bitsize = tree_to_uhwi (bitsize);
5806 tree compute_type = TREE_TYPE (vectype);
5807 gimple_seq stmts = NULL;
5808 FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
5809 {
5810 int bit_offset;
5811 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5812 vec_temp, bitsize, bitsize_zero_node);
5813
5814 /* In SLP we don't need to apply the reduction operation, so we
5815 just collect the s' values in SCALAR_RESULTS. */
5816 if (slp_reduc)
5817 scalar_results.safe_push (new_temp);
5818
5819 for (bit_offset = element_bitsize;
5820 bit_offset < vec_size_in_bits;
5821 bit_offset += element_bitsize)
5822 {
5823 tree bitpos = bitsize_int (bit_offset);
5824 new_name = gimple_build (&stmts, BIT_FIELD_REF,
5825 compute_type, vec_temp,
5826 bitsize, bitpos);
5827 if (slp_reduc)
5828 {
5829 /* In SLP we don't need to apply the reduction operation, so
5830 we just collect the s' values in SCALAR_RESULTS. */
5831 new_temp = new_name;
5832 scalar_results.safe_push (new_name);
5833 }
5834 else
5835 new_temp = gimple_build (&stmts, code, compute_type,
5836 new_name, new_temp);
5837 }
5838 }
5839
5840 /* The only case where we need to reduce scalar results in SLP is
5841 unrolling. If the size of SCALAR_RESULTS is greater than
5842 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5843 REDUC_GROUP_SIZE. */
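	  /* E.g. (illustrative) with REDUC_GROUP_SIZE == 2 and
	     SCALAR_RESULTS == {a0, b0, a1, b1} after unrolling, the loop
	     below computes scalar_results[0] = a0 op a1 and
	     scalar_results[1] = b0 op b1 before truncating to two results.  */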
5844 if (slp_reduc)
5845 {
5846 tree res, first_res, new_res;
5847
5848 /* Reduce multiple scalar results in case of SLP unrolling. */
5849 for (j = group_size; scalar_results.iterate (j, &res);
5850 j++)
5851 {
5852 first_res = scalar_results[j % group_size];
5853 new_res = gimple_build (&stmts, code, compute_type,
5854 first_res, res);
5855 scalar_results[j % group_size] = new_res;
5856 }
5857 scalar_results.truncate (group_size);
5858 for (k = 0; k < group_size; k++)
5859 scalar_results[k] = gimple_convert (&stmts, scalar_type,
5860 scalar_results[k]);
5861 }
5862 else
5863 {
5864 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5865 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5866 scalar_results.safe_push (new_temp);
5867 }
5868
5869 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5870 }
5871
5872 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5873 && induc_val)
5874 {
5875 /* Earlier we set the initial value to be a vector of INDUC_VAL
5876 values. Check the result and if it is INDUC_VAL then replace
5877 it with the original initial value, unless INDUC_VAL is
5878 already the same as INITIAL_DEF. */
5879 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5880 induc_val);
5881 tree initial_def = reduc_info->reduc_initial_values[0];
5882
5883 tree tmp = make_ssa_name (new_scalar_dest);
5884 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5885 initial_def, new_temp);
5886 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5887 scalar_results[0] = tmp;
5888 }
5889 }
5890
5891 /* 2.5 Adjust the final result by the initial value of the reduction
5892 variable. (When such adjustment is not needed, then
5893 'adjustment_def' is zero). For example, if code is PLUS we create:
5894 new_temp = loop_exit_def + adjustment_def */
5895
5896 if (adjustment_def)
5897 {
5898 gcc_assert (!slp_reduc);
5899 gimple_seq stmts = NULL;
5900 if (double_reduc)
5901 {
5902 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
5903 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
5904 new_temp = gimple_build (&stmts, code, vectype,
5905 reduc_inputs[0], adjustment_def);
5906 }
5907 else
5908 {
5909 new_temp = scalar_results[0];
5910 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5911 adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
5912 new_temp = gimple_build (&stmts, code, scalar_type,
5913 new_temp, adjustment_def);
5914 }
5915
5916 epilog_stmt = gimple_seq_last_stmt (stmts);
5917 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5918 scalar_results[0] = new_temp;
5919 }
5920
5921 /* Record this operation if it could be reused by the epilogue loop. */
5922 if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION)
5923 loop_vinfo->reusable_accumulators.put (scalar_results[0],
5924 { orig_reduc_input, reduc_info });
5925
5926 if (double_reduc)
5927 loop = outer_loop;
5928
5929 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5930 phis with new adjusted scalar results, i.e., replace use <s_out0>
5931 with use <s_out4>.
5932
5933 Transform:
5934 loop_exit:
5935 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5936 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5937 v_out2 = reduce <v_out1>
5938 s_out3 = extract_field <v_out2, 0>
5939 s_out4 = adjust_result <s_out3>
5940 use <s_out0>
5941 use <s_out0>
5942
5943 into:
5944
5945 loop_exit:
5946 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5947 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5948 v_out2 = reduce <v_out1>
5949 s_out3 = extract_field <v_out2, 0>
5950 s_out4 = adjust_result <s_out3>
5951 use <s_out4>
5952 use <s_out4> */
5953
5954 gcc_assert (live_out_stmts.size () == scalar_results.length ());
5955 for (k = 0; k < live_out_stmts.size (); k++)
5956 {
5957 stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
5958 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5959
5960 phis.create (3);
5961 /* Find the loop-closed-use at the loop exit of the original scalar
5962 result. (The reduction result is expected to have two immediate uses,
5963 one at the latch block, and one at the loop exit). For double
5964 reductions we are looking for exit phis of the outer loop. */
5965 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5966 {
5967 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5968 {
5969 if (!is_gimple_debug (USE_STMT (use_p)))
5970 phis.safe_push (USE_STMT (use_p));
5971 }
5972 else
5973 {
5974 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5975 {
5976 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5977
5978 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5979 {
5980 if (!flow_bb_inside_loop_p (loop,
5981 gimple_bb (USE_STMT (phi_use_p)))
5982 && !is_gimple_debug (USE_STMT (phi_use_p)))
5983 phis.safe_push (USE_STMT (phi_use_p));
5984 }
5985 }
5986 }
5987 }
5988
5989 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5990 {
5991 /* Replace the uses: */
5992 orig_name = PHI_RESULT (exit_phi);
5993
5994 /* Look for a single use at the target of the skip edge. */
5995 if (unify_with_main_loop_p)
5996 {
5997 use_operand_p use_p;
5998 gimple *user;
5999 if (!single_imm_use (orig_name, &use_p, &user))
6000 gcc_unreachable ();
6001 orig_name = gimple_get_lhs (user);
6002 }
6003
6004 scalar_result = scalar_results[k];
6005 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6006 {
6007 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6008 SET_USE (use_p, scalar_result);
6009 update_stmt (use_stmt);
6010 }
6011 }
6012
6013 phis.release ();
6014 }
6015 }
6016
6017 /* Return a vector of type VECTYPE that is equal to the vector select
6018 operation "MASK ? VEC : IDENTITY". Insert the select statements
6019 before GSI. */
6020
6021 static tree
6022 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6023 tree vec, tree identity)
6024 {
6025 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6026 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6027 mask, vec, identity);
6028 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6029 return cond;
6030 }
6031
6032 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6033 order, starting with LHS. Insert the extraction statements before GSI and
6034 associate the new scalar SSA names with variable SCALAR_DEST.
6035 Return the SSA name for the result. */
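/* For example (illustrative), with a V4SF VECTOR_RHS {v0, v1, v2, v3} and
   CODE = PLUS_EXPR this expands to the strictly ordered scalar sequence
   ((((LHS + v0) + v1) + v2) + v3), one BIT_FIELD_REF extraction and one
   scalar operation per element.  */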
6036
6037 static tree
6038 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6039 tree_code code, tree lhs, tree vector_rhs)
6040 {
6041 tree vectype = TREE_TYPE (vector_rhs);
6042 tree scalar_type = TREE_TYPE (vectype);
6043 tree bitsize = TYPE_SIZE (scalar_type);
6044 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6045 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6046
6047 for (unsigned HOST_WIDE_INT bit_offset = 0;
6048 bit_offset < vec_size_in_bits;
6049 bit_offset += element_bitsize)
6050 {
6051 tree bitpos = bitsize_int (bit_offset);
6052 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6053 bitsize, bitpos);
6054
6055 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6056 rhs = make_ssa_name (scalar_dest, stmt);
6057 gimple_assign_set_lhs (stmt, rhs);
6058 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6059
6060 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6061 tree new_name = make_ssa_name (scalar_dest, stmt);
6062 gimple_assign_set_lhs (stmt, new_name);
6063 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6064 lhs = new_name;
6065 }
6066 return lhs;
6067 }
6068
6069 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6070 type of the vector input. */
6071
6072 static internal_fn
6073 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6074 {
6075 internal_fn mask_reduc_fn;
6076
6077 switch (reduc_fn)
6078 {
6079 case IFN_FOLD_LEFT_PLUS:
6080 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6081 break;
6082
6083 default:
6084 return IFN_LAST;
6085 }
6086
6087 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6088 OPTIMIZE_FOR_SPEED))
6089 return mask_reduc_fn;
6090 return IFN_LAST;
6091 }
6092
6093 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6094 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6095 statement. CODE is the operation performed by STMT_INFO and OPS are
6096 its scalar operands. REDUC_INDEX is the index of the operand in
6097 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6098 implements in-order reduction, or IFN_LAST if we should open-code it.
6099 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6100 that should be used to control the operation in a fully-masked loop. */
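/* A sketch of the code this emits (illustrative), for an in-order float
   sum when IFN_FOLD_LEFT_PLUS is available:

     red_1 = .FOLD_LEFT_PLUS (red_0, vec_def_0);
     red_2 = .FOLD_LEFT_PLUS (red_1, vec_def_1);
     ...

   where each call folds the vector elements into the accumulator strictly
   from left to right, preserving the scalar FP evaluation order.  */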
6101
6102 static bool
6103 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6104 stmt_vec_info stmt_info,
6105 gimple_stmt_iterator *gsi,
6106 gimple **vec_stmt, slp_tree slp_node,
6107 gimple *reduc_def_stmt,
6108 tree_code code, internal_fn reduc_fn,
6109 tree ops[3], tree vectype_in,
6110 int reduc_index, vec_loop_masks *masks)
6111 {
6112 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6113 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6114 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6115
6116 int ncopies;
6117 if (slp_node)
6118 ncopies = 1;
6119 else
6120 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6121
6122 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6123 gcc_assert (ncopies == 1);
6124 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6125
6126 if (slp_node)
6127 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6128 TYPE_VECTOR_SUBPARTS (vectype_in)));
6129
6130 tree op0 = ops[1 - reduc_index];
6131
6132 int group_size = 1;
6133 stmt_vec_info scalar_dest_def_info;
6134 auto_vec<tree> vec_oprnds0;
6135 if (slp_node)
6136 {
6137 auto_vec<vec<tree> > vec_defs (2);
6138 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6139 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6140 vec_defs[0].release ();
6141 vec_defs[1].release ();
6142 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6143 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6144 }
6145 else
6146 {
6147 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6148 op0, &vec_oprnds0);
6149 scalar_dest_def_info = stmt_info;
6150 }
6151
6152 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6153 tree scalar_type = TREE_TYPE (scalar_dest);
6154 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6155
6156 int vec_num = vec_oprnds0.length ();
6157 gcc_assert (vec_num == 1 || slp_node);
6158 tree vec_elem_type = TREE_TYPE (vectype_out);
6159 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6160
6161 tree vector_identity = NULL_TREE;
6162 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6163 vector_identity = build_zero_cst (vectype_out);
6164
6165 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6166 int i;
6167 tree def0;
6168 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6169 {
6170 gimple *new_stmt;
6171 tree mask = NULL_TREE;
6172 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6173 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6174
6175 /* Handle MINUS by adding the negative. */
6176 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6177 {
6178 tree negated = make_ssa_name (vectype_out);
6179 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6180 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6181 def0 = negated;
6182 }
6183
6184 if (mask && mask_reduc_fn == IFN_LAST)
6185 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6186 vector_identity);
6187
6188 /* On the first iteration the input is simply the scalar phi
6189 result, and for subsequent iterations it is the output of
6190 the preceding operation. */
6191 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6192 {
6193 if (mask && mask_reduc_fn != IFN_LAST)
6194 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6195 def0, mask);
6196 else
6197 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6198 def0);
6199 /* For chained SLP reductions the output of the previous reduction
6200 operation serves as the input of the next. For the final statement
6201 the output cannot be a temporary - we reuse the original
6202 scalar destination of the last statement. */
6203 if (i != vec_num - 1)
6204 {
6205 gimple_set_lhs (new_stmt, scalar_dest_var);
6206 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6207 gimple_set_lhs (new_stmt, reduc_var);
6208 }
6209 }
6210 else
6211 {
6212 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6213 reduc_var, def0);
6214 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6215 /* Remove the statement, so that we can use the same code paths
6216 as for statements that we've just created. */
6217 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6218 gsi_remove (&tmp_gsi, true);
6219 }
6220
6221 if (i == vec_num - 1)
6222 {
6223 gimple_set_lhs (new_stmt, scalar_dest);
6224 vect_finish_replace_stmt (loop_vinfo,
6225 scalar_dest_def_info,
6226 new_stmt);
6227 }
6228 else
6229 vect_finish_stmt_generation (loop_vinfo,
6230 scalar_dest_def_info,
6231 new_stmt, gsi);
6232
6233 if (slp_node)
6234 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6235 else
6236 {
6237 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6238 *vec_stmt = new_stmt;
6239 }
6240 }
6241
6242 return true;
6243 }
6244
6245 /* Function is_nonwrapping_integer_induction.
6246
6247 Check if STMT_VINFO (which is part of loop LOOP) both increments and
6248 does not cause overflow. */
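/* For example (illustrative), an unsigned short IV with base 0 and step 4
   in a loop that may execute 20000 times would have to reach 80000, which
   needs more than 16 bits, so the check below returns false.  */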
6249
6250 static bool
6251 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6252 {
6253 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6254 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6255 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6256 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6257 widest_int ni, max_loop_value, lhs_max;
6258 wi::overflow_type overflow = wi::OVF_NONE;
6259
6260 /* Make sure the loop is integer based. */
6261 if (TREE_CODE (base) != INTEGER_CST
6262 || TREE_CODE (step) != INTEGER_CST)
6263 return false;
6264
6265 /* Check that the max size of the loop will not wrap. */
6266
6267 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6268 return true;
6269
6270 if (! max_stmt_executions (loop, &ni))
6271 return false;
6272
6273 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6274 &overflow);
6275 if (overflow)
6276 return false;
6277
6278 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6279 TYPE_SIGN (lhs_type), &overflow);
6280 if (overflow)
6281 return false;
6282
6283 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6284 <= TYPE_PRECISION (lhs_type));
6285 }
6286
6287 /* Check if masking can be supported by inserting a conditional expression.
6288 CODE is the code for the operation. COND_FN is the conditional internal
6289 function, if it exists. VECTYPE_IN is the type of the vector input. */
6290 static bool
6291 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
6292 tree vectype_in)
6293 {
6294 if (cond_fn != IFN_LAST
6295 && direct_internal_fn_supported_p (cond_fn, vectype_in,
6296 OPTIMIZE_FOR_SPEED))
6297 return false;
6298
6299 switch (code)
6300 {
6301 case DOT_PROD_EXPR:
6302 case SAD_EXPR:
6303 return true;
6304
6305 default:
6306 return false;
6307 }
6308 }
6309
6310 /* Insert a conditional expression to enable masked vectorization. CODE is the
6311 code for the operation. VOP is the array of operands. MASK is the loop
6312 mask. GSI is a statement iterator used to place the new conditional
6313 expression. */
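/* For example (illustrative), with loop mask M the selects built below
   give DOT_PROD_EXPR <a, M ? b : 0, acc>, whose inactive lanes contribute
   a * 0 == 0, and SAD_EXPR <a, M ? b : a, acc>, whose inactive lanes
   contribute |a - a| == 0.  */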
6314 static void
6315 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
6316 gimple_stmt_iterator *gsi)
6317 {
6318 switch (code)
6319 {
6320 case DOT_PROD_EXPR:
6321 {
6322 tree vectype = TREE_TYPE (vop[1]);
6323 tree zero = build_zero_cst (vectype);
6324 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6325 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6326 mask, vop[1], zero);
6327 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6328 vop[1] = masked_op1;
6329 break;
6330 }
6331
6332 case SAD_EXPR:
6333 {
6334 tree vectype = TREE_TYPE (vop[1]);
6335 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6336 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6337 mask, vop[1], vop[0]);
6338 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6339 vop[1] = masked_op1;
6340 break;
6341 }
6342
6343 default:
6344 gcc_unreachable ();
6345 }
6346 }
6347
6348 /* Function vectorizable_reduction.
6349
6350 Check if STMT_INFO performs a reduction operation that can be vectorized.
6351 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6352 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6353 Return true if STMT_INFO is vectorizable in this way.
6354
6355 This function also handles reduction idioms (patterns) that have been
6356 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6357 may be of this form:
6358 X = pattern_expr (arg0, arg1, ..., X)
6359 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6360 sequence that had been detected and replaced by the pattern-stmt
6361 (STMT_INFO).
6362
6363 This function also handles reduction of condition expressions, for example:
6364 for (int i = 0; i < N; i++)
6365 if (a[i] < value)
6366 last = a[i];
6367 This is handled by vectorizing the loop and creating an additional vector
6368 containing the loop indexes for which "a[i] < value" was true. In the
6369 function epilogue this is reduced to a single max value and then used to
6370 index into the vector of results.
6371
6372 In some cases of reduction patterns, the type of the reduction variable X is
6373 different than the type of the other arguments of STMT_INFO.
6374 In such cases, the vectype that is used when transforming STMT_INFO into
6375 a vector stmt is different than the vectype that is used to determine the
6376 vectorization factor, because it consists of a different number of elements
6377 than the actual number of elements that are being operated upon in parallel.
6378
6379 For example, consider an accumulation of shorts into an int accumulator.
6380 On some targets it's possible to vectorize this pattern operating on 8
6381 shorts at a time (hence, the vectype for purposes of determining the
6382 vectorization factor should be V8HI); on the other hand, the vectype that
6383 is used to create the vector form is actually V4SI (the type of the result).
6384
6385 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6386 indicates what is the actual level of parallelism (V8HI in the example), so
6387 that the right vectorization factor would be derived. This vectype
6388 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6389 be used to create the vectorized stmt. The right vectype for the vectorized
6390 stmt is obtained from the type of the result X:
6391 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6392
6393 This means that, contrary to "regular" reductions (or "regular" stmts in
6394 general), the following equation:
6395 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6396 does *NOT* necessarily hold for reduction patterns. */
6397
6398 bool
6399 vectorizable_reduction (loop_vec_info loop_vinfo,
6400 stmt_vec_info stmt_info, slp_tree slp_node,
6401 slp_instance slp_node_instance,
6402 stmt_vector_for_cost *cost_vec)
6403 {
6404 tree scalar_dest;
6405 tree vectype_in = NULL_TREE;
6406 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6407 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6408 stmt_vec_info cond_stmt_vinfo = NULL;
6409 tree scalar_type;
6410 int i;
6411 int ncopies;
6412 bool single_defuse_cycle = false;
6413 bool nested_cycle = false;
6414 bool double_reduc = false;
6415 int vec_num;
6416 tree tem;
6417 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6418 tree cond_reduc_val = NULL_TREE;
6419
6420 /* Make sure it was already recognized as a reduction computation. */
6421 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6422 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6423 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6424 return false;
6425
6426 /* The stmt we store reduction analysis meta on. */
6427 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6428 reduc_info->is_reduc_info = true;
6429
6430 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6431 {
6432 if (is_a <gphi *> (stmt_info->stmt))
6433 {
6434 if (slp_node)
6435 {
6436 /* We eventually need to set a vector type on invariant
6437 arguments. */
6438 unsigned j;
6439 slp_tree child;
6440 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
6441 if (!vect_maybe_update_slp_op_vectype
6442 (child, SLP_TREE_VECTYPE (slp_node)))
6443 {
6444 if (dump_enabled_p ())
6445 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6446 "incompatible vector types for "
6447 "invariants\n");
6448 return false;
6449 }
6450 }
6451 /* Analysis for double-reduction is done on the outer
6452 loop PHI, nested cycles have no further restrictions. */
6453 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6454 }
6455 else
6456 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6457 return true;
6458 }
6459
6460 stmt_vec_info orig_stmt_of_analysis = stmt_info;
6461 stmt_vec_info phi_info = stmt_info;
6462 if (!is_a <gphi *> (stmt_info->stmt))
6463 {
6464 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6465 return true;
6466 }
6467 if (slp_node)
6468 {
6469 slp_node_instance->reduc_phis = slp_node;
6470 /* ??? We're leaving slp_node to point to the PHIs; we only
6471 need it to get at the number of vector stmts which wasn't
6472 yet initialized for the instance root. */
6473 }
6474 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6475 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
6476 else
6477 {
6478 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info)
6479 == vect_double_reduction_def);
6480 use_operand_p use_p;
6481 gimple *use_stmt;
6482 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6483 &use_p, &use_stmt);
6484 gcc_assert (res);
6485 phi_info = loop_vinfo->lookup_stmt (use_stmt);
6486 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6487 }
6488
6489 /* PHIs should not participate in patterns. */
6490 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6491 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6492
6493 /* Verify that following REDUC_IDX from the latch def leads us back to the PHI
6494 and compute the reduction chain length. Discover the real
6495 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
6496 tree reduc_def
6497 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6498 loop_latch_edge
6499 (gimple_bb (reduc_def_phi)->loop_father));
6500 unsigned reduc_chain_length = 0;
6501 bool only_slp_reduc_chain = true;
6502 stmt_info = NULL;
6503 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
6504 while (reduc_def != PHI_RESULT (reduc_def_phi))
6505 {
6506 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6507 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6508 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6509 {
6510 if (dump_enabled_p ())
6511 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6512 "reduction chain broken by patterns.\n");
6513 return false;
6514 }
6515 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6516 only_slp_reduc_chain = false;
6517 /* ??? For epilogue generation live members of the chain need
6518 to point back to the PHI via their original stmt for
6519 info_for_reduction to work. */
6520 if (STMT_VINFO_LIVE_P (vdef))
6521 STMT_VINFO_REDUC_DEF (def) = phi_info;
6522 gassign *assign = dyn_cast <gassign *> (vdef->stmt);
6523 if (!assign)
6524 {
6525 if (dump_enabled_p ())
6526 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6527 "reduction chain includes calls.\n");
6528 return false;
6529 }
6530 if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
6531 {
6532 if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (assign)),
6533 TREE_TYPE (gimple_assign_rhs1 (assign))))
6534 {
6535 if (dump_enabled_p ())
6536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6537 "conversion in the reduction chain.\n");
6538 return false;
6539 }
6540 }
6541 else if (!stmt_info)
6542 /* First non-conversion stmt. */
6543 stmt_info = vdef;
6544 reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
6545 reduc_chain_length++;
6546 if (!stmt_info && slp_node)
6547 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6548 }
6549 /* PHIs should not participate in patterns. */
6550 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6551
6552 if (nested_in_vect_loop_p (loop, stmt_info))
6553 {
6554 loop = loop->inner;
6555 nested_cycle = true;
6556 }
6557
6558 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6559 element. */
6560 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6561 {
6562 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6563 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6564 }
6565 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6566 gcc_assert (slp_node
6567 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6568
6569 /* 1. Is vectorizable reduction? */
6570 /* Not supportable if the reduction variable is used in the loop, unless
6571 it's a reduction chain. */
6572 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6573 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6574 return false;
6575
6576 /* Reductions that are not used even in an enclosing outer-loop
6577 are expected to be "live" (used out of the loop). */
6578 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6579 && !STMT_VINFO_LIVE_P (stmt_info))
6580 return false;
6581
6582 /* 2. Has this been recognized as a reduction pattern?
6583
6584 Check if STMT represents a pattern that has been recognized
6585 in earlier analysis stages. For stmts that represent a pattern,
6586 the STMT_VINFO_RELATED_STMT field records the last stmt in
6587 the original sequence that constitutes the pattern. */
6588
6589 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6590 if (orig_stmt_info)
6591 {
6592 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6593 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6594 }
6595
6596 /* 3. Check the operands of the operation. The first operands are defined
6597 inside the loop body. The last operand is the reduction variable,
6598 which is defined by the loop-header-phi. */
6599
6600 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6601 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6602 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6603 enum tree_code code = gimple_assign_rhs_code (stmt);
6604 bool lane_reduc_code_p
6605 = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR);
6606 int op_type = TREE_CODE_LENGTH (code);
6607 enum optab_subtype optab_query_kind = optab_vector;
6608 if (code == DOT_PROD_EXPR
6609 && TYPE_SIGN (TREE_TYPE (gimple_assign_rhs1 (stmt)))
6610 != TYPE_SIGN (TREE_TYPE (gimple_assign_rhs2 (stmt))))
6611 optab_query_kind = optab_vector_mixed_sign;
6612
6613
6614 scalar_dest = gimple_assign_lhs (stmt);
6615 scalar_type = TREE_TYPE (scalar_dest);
6616 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6617 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6618 return false;
6619
6620 /* Do not try to vectorize bit-precision reductions. */
6621 if (!type_has_mode_precision_p (scalar_type))
6622 return false;
6623
6624 /* For lane-reducing ops we're reducing the number of reduction PHIs, so
6625 the only use of the reduction PHI may be in the lane-reducing operation. */
6626 if (lane_reduc_code_p
6627 && reduc_chain_length != 1
6628 && !only_slp_reduc_chain)
6629 {
6630 if (dump_enabled_p ())
6631 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6632 "lane-reducing reduction with extra stmts.\n");
6633 return false;
6634 }
6635
6636 /* All uses but the last are expected to be defined in the loop.
6637 The last use is the reduction variable. In case of nested cycle this
6638 assumption is not true: we use reduc_index to record the index of the
6639 reduction variable. */
6640 slp_tree *slp_op = XALLOCAVEC (slp_tree, op_type);
6641 /* We need to skip an extra operand for COND_EXPRs with embedded
6642 comparison. */
6643 unsigned opno_adjust = 0;
6644 if (code == COND_EXPR
6645 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
6646 opno_adjust = 1;
6647 for (i = 0; i < op_type; i++)
6648 {
6649 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6650 if (i == 0 && code == COND_EXPR)
6651 continue;
6652
6653 stmt_vec_info def_stmt_info;
6654 enum vect_def_type dt;
6655 tree op;
6656 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
6657 i + opno_adjust, &op, &slp_op[i], &dt, &tem,
6658 &def_stmt_info))
6659 {
6660 if (dump_enabled_p ())
6661 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6662 "use not simple.\n");
6663 return false;
6664 }
6665 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6666 continue;
6667
6668 /* There should be only one cycle def in the stmt, the one
6669 leading to reduc_def. */
6670 if (VECTORIZABLE_CYCLE_DEF (dt))
6671 return false;
6672
6673 /* To properly compute ncopies we are interested in the widest
6674 non-reduction input type in case we're looking at a widening
6675 accumulation that we later handle in vect_transform_reduction. */
6676 if (lane_reduc_code_p
6677 && tem
6678 && (!vectype_in
6679 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6680 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6681 vectype_in = tem;
6682
6683 if (code == COND_EXPR)
6684 {
6685 /* Record how the non-reduction-def value of COND_EXPR is defined. */
6686 if (dt == vect_constant_def)
6687 {
6688 cond_reduc_dt = dt;
6689 cond_reduc_val = op;
6690 }
6691 if (dt == vect_induction_def
6692 && def_stmt_info
6693 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6694 {
6695 cond_reduc_dt = dt;
6696 cond_stmt_vinfo = def_stmt_info;
6697 }
6698 }
6699 }
6700 if (!vectype_in)
6701 vectype_in = STMT_VINFO_VECTYPE (phi_info);
6702 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6703
6704 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6705 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
6706 /* If we have a condition reduction, see if we can simplify it further. */
6707 if (v_reduc_type == COND_REDUCTION)
6708 {
6709 if (slp_node)
6710 return false;
6711
6712 /* If the condition itself uses the reduction value, fail. */
6713 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6714 {
6715 if (dump_enabled_p ())
6716 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6717 "condition depends on previous iteration\n");
6718 return false;
6719 }
6720
6721 if (reduc_chain_length == 1
6722 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6723 vectype_in, OPTIMIZE_FOR_SPEED))
6724 {
6725 if (dump_enabled_p ())
6726 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6727 "optimizing condition reduction with"
6728 " FOLD_EXTRACT_LAST.\n");
6729 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6730 }
6731 else if (cond_reduc_dt == vect_induction_def)
6732 {
6733 tree base
6734 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6735 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6736
6737 gcc_assert (TREE_CODE (base) == INTEGER_CST
6738 && TREE_CODE (step) == INTEGER_CST);
6739 cond_reduc_val = NULL_TREE;
6740 enum tree_code cond_reduc_op_code = ERROR_MARK;
6741 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6742 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6743 ;
6744 /* Find a suitable value: below base for MAX_EXPR, above base for
6745 MIN_EXPR; for now punt if base is the type's minimum value (for
6746 MAX_EXPR) or its maximum value (for MIN_EXPR). */
6747 else if (tree_int_cst_sgn (step) == -1)
6748 {
6749 cond_reduc_op_code = MIN_EXPR;
6750 if (tree_int_cst_sgn (base) == -1)
6751 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6752 else if (tree_int_cst_lt (base,
6753 TYPE_MAX_VALUE (TREE_TYPE (base))))
6754 cond_reduc_val
6755 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6756 }
6757 else
6758 {
6759 cond_reduc_op_code = MAX_EXPR;
6760 if (tree_int_cst_sgn (base) == 1)
6761 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6762 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6763 base))
6764 cond_reduc_val
6765 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6766 }
6767 if (cond_reduc_val)
6768 {
6769 if (dump_enabled_p ())
6770 dump_printf_loc (MSG_NOTE, vect_location,
6771 "condition expression based on "
6772 "integer induction.\n");
6773 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6774 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6775 = cond_reduc_val;
6776 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6777 }
6778 }
6779 else if (cond_reduc_dt == vect_constant_def)
6780 {
6781 enum vect_def_type cond_initial_dt;
6782 tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
6783 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6784 if (cond_initial_dt == vect_constant_def
6785 && types_compatible_p (TREE_TYPE (cond_initial_val),
6786 TREE_TYPE (cond_reduc_val)))
6787 {
6788 tree e = fold_binary (LE_EXPR, boolean_type_node,
6789 cond_initial_val, cond_reduc_val);
6790 if (e && (integer_onep (e) || integer_zerop (e)))
6791 {
6792 if (dump_enabled_p ())
6793 dump_printf_loc (MSG_NOTE, vect_location,
6794 "condition expression based on "
6795 "compile time constant.\n");
6796 /* Record reduction code at analysis stage. */
6797 STMT_VINFO_REDUC_CODE (reduc_info)
6798 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6799 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6800 }
6801 }
6802 }
6803 }
6804
6805 if (STMT_VINFO_LIVE_P (phi_info))
6806 return false;
6807
6808 if (slp_node)
6809 ncopies = 1;
6810 else
6811 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6812
6813 gcc_assert (ncopies >= 1);
6814
6815 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6816
6817 if (nested_cycle)
6818 {
6819 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6820 == vect_double_reduction_def);
6821 double_reduc = true;
6822 }
6823
6824 /* 4.2. Check support for the epilog operation.
6825
6826 If STMT represents a reduction pattern, then the type of the
6827 reduction variable may be different than the type of the rest
6828 of the arguments. For example, consider the case of accumulation
6829 of shorts into an int accumulator; the original code:
6830 S1: int_a = (int) short_a;
6831 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6832
6833 was replaced with:
6834 STMT: int_acc = widen_sum <short_a, int_acc>
6835
6836 This means that:
6837 1. The tree-code that is used to create the vector operation in the
6838 epilog code (that reduces the partial results) is not the
6839 tree-code of STMT, but is rather the tree-code of the original
6840 stmt from the pattern that STMT is replacing. I.e, in the example
6841 above we want to use 'widen_sum' in the loop, but 'plus' in the
6842 epilog.
6843 2. The type (mode) we use to check available target support
6844 for the vector operation to be created in the *epilog*, is
6845 determined by the type of the reduction variable (in the example
6846 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6847 However the type (mode) we use to check available target support
6848 for the vector operation to be created *inside the loop*, is
6849 determined by the type of the other arguments to STMT (in the
6850 example we'd check this: optab_handler (widen_sum_optab,
6851 vect_short_mode)).
6852
6853 This is contrary to "regular" reductions, in which the types of all
6854 the arguments are the same as the type of the reduction variable.
6855 For "regular" reductions we can therefore use the same vector type
6856 (and also the same tree-code) when generating the epilog code and
6857 when generating the code inside the loop. */
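/* A hedged sketch of the shapes this leads to for the widen_sum example
(vector type names again only illustrative):

loop body: vacc_v4si = WIDEN_SUM <vshort_v8hi, vacc_v4si>;
epilogue: sum = <reduce vacc_v4si with PLUS, e.g. via IFN_REDUC_PLUS>;

i.e. the loop statement is checked against widening-sum support for the
V8HI input type, the epilogue against plain addition/reduction support
for the V4SI result type. */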
6858
6859 enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info);
6860 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6861
6862 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6863 if (reduction_type == TREE_CODE_REDUCTION)
6864 {
6865 /* Check whether it's ok to change the order of the computation.
6866 Generally, when vectorizing a reduction we change the order of the
6867 computation. This may change the behavior of the program in some
6868 cases, so we need to check that this is ok. One exception is when
6869 vectorizing an outer-loop: the inner-loop is executed sequentially,
6870 and therefore vectorizing reductions in the inner-loop during
6871 outer-loop vectorization is safe. Likewise when we are vectorizing
6872 a series of reductions using SLP and the VF is one, the reductions
6873 are performed in scalar order. */
6874 if (slp_node
6875 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6876 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
6877 ;
6878 else if (needs_fold_left_reduction_p (scalar_type, orig_code))
6879 {
6880 /* When vectorizing a reduction chain w/o SLP the reduction PHI
6881 is not directly used in stmt. */
6882 if (!only_slp_reduc_chain
6883 && reduc_chain_length != 1)
6884 {
6885 if (dump_enabled_p ())
6886 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6887 "in-order reduction chain without SLP.\n");
6888 return false;
6889 }
6890 STMT_VINFO_REDUC_TYPE (reduc_info)
6891 = reduction_type = FOLD_LEFT_REDUCTION;
6892 }
6893 else if (!commutative_tree_code (orig_code)
6894 || !associative_tree_code (orig_code))
6895 {
6896 if (dump_enabled_p ())
6897 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6898 "reduction: not commutative/associative");
6899 return false;
6900 }
6901 }
6902
6903 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6904 && ncopies > 1)
6905 {
6906 if (dump_enabled_p ())
6907 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6908 "multiple types in double reduction or condition "
6909 "reduction or fold-left reduction.\n");
6910 return false;
6911 }
6912
6913 internal_fn reduc_fn = IFN_LAST;
6914 if (reduction_type == TREE_CODE_REDUCTION
6915 || reduction_type == FOLD_LEFT_REDUCTION
6916 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6917 || reduction_type == CONST_COND_REDUCTION)
6918 {
6919 if (reduction_type == FOLD_LEFT_REDUCTION
6920 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6921 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6922 {
6923 if (reduc_fn != IFN_LAST
6924 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6925 OPTIMIZE_FOR_SPEED))
6926 {
6927 if (dump_enabled_p ())
6928 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6929 "reduc op not supported by target.\n");
6930
6931 reduc_fn = IFN_LAST;
6932 }
6933 }
6934 else
6935 {
6936 if (!nested_cycle || double_reduc)
6937 {
6938 if (dump_enabled_p ())
6939 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6940 "no reduc code for scalar code.\n");
6941
6942 return false;
6943 }
6944 }
6945 }
6946 else if (reduction_type == COND_REDUCTION)
6947 {
6948 int scalar_precision
6949 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6950 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6951 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
6952 vectype_out);
6953
6954 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6955 OPTIMIZE_FOR_SPEED))
6956 reduc_fn = IFN_REDUC_MAX;
6957 }
6958 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
6959
6960 if (reduction_type != EXTRACT_LAST_REDUCTION
6961 && (!nested_cycle || double_reduc)
6962 && reduc_fn == IFN_LAST
6963 && !nunits_out.is_constant ())
6964 {
6965 if (dump_enabled_p ())
6966 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6967 "missing target support for reduction on"
6968 " variable-length vectors.\n");
6969 return false;
6970 }
6971
6972 /* For SLP reductions, see if there is a neutral value we can use. */
6973 tree neutral_op = NULL_TREE;
6974 if (slp_node)
6975 {
6976 tree initial_value = NULL_TREE;
6977 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
6978 initial_value = vect_phi_initial_value (reduc_def_phi);
6979 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
6980 orig_code, initial_value);
6981 }
6982
6983 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6984 {
6985 /* We can't support in-order reductions of code such as this:
6986
6987 for (int i = 0; i < n1; ++i)
6988 for (int j = 0; j < n2; ++j)
6989 l += a[j];
6990
6991 since GCC effectively transforms the loop when vectorizing:
6992
6993 for (int i = 0; i < n1 / VF; ++i)
6994 for (int j = 0; j < n2; ++j)
6995 for (int k = 0; k < VF; ++k)
6996 l += a[j];
6997
6998 which is a reassociation of the original operation. */
6999 if (dump_enabled_p ())
7000 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7001 "in-order double reduction not supported.\n");
7002
7003 return false;
7004 }
7005
7006 if (reduction_type == FOLD_LEFT_REDUCTION
7007 && slp_node
7008 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7009 {
7010 /* We cannot use in-order reductions in this case because there is
7011 an implicit reassociation of the operations involved. */
7012 if (dump_enabled_p ())
7013 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7014 "in-order unchained SLP reductions not supported.\n");
7015 return false;
7016 }
7017
7018 /* For double reductions, and for SLP reductions with a neutral value,
7019 we construct a variable-length initial vector by loading a vector
7020 full of the neutral value and then shift-and-inserting the start
7021 values into the low-numbered elements. */
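/* A sketch for a PLUS reduction with start value s and neutral value 0:
the initial vector is built as VEC_SHL_INSERT ({0, ..., 0}, s), giving
{s, 0, ..., 0}, hence the IFN_VEC_SHL_INSERT requirement checked below. */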
7022 if ((double_reduc || neutral_op)
7023 && !nunits_out.is_constant ()
7024 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7025 vectype_out, OPTIMIZE_FOR_SPEED))
7026 {
7027 if (dump_enabled_p ())
7028 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7029 "reduction on variable-length vectors requires"
7030 " target support for a vector-shift-and-insert"
7031 " operation.\n");
7032 return false;
7033 }
7034
7035 /* Check extra constraints for variable-length unchained SLP reductions. */
7036 if (STMT_SLP_TYPE (stmt_info)
7037 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7038 && !nunits_out.is_constant ())
7039 {
7040 /* We checked above that we could build the initial vector when
7041 there's a neutral element value. Check here for the case in
7042 which each SLP statement has its own initial value and in which
7043 that value needs to be repeated for every instance of the
7044 statement within the initial vector. */
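/* For instance (a sketch): a group of two MAX reductions with start values
a and b gets no neutral value here, so the initial vector has to look like
{ a, b, a, b, ... }; repeating the start values is harmless for MAX, and
building such a vector requires can_duplicate_and_interleave_p to hold. */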
7045 unsigned int group_size = SLP_TREE_LANES (slp_node);
7046 if (!neutral_op
7047 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7048 TREE_TYPE (vectype_out)))
7049 {
7050 if (dump_enabled_p ())
7051 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7052 "unsupported form of SLP reduction for"
7053 " variable-length vectors: cannot build"
7054 " initial vector.\n");
7055 return false;
7056 }
7057 /* The epilogue code relies on the number of elements being a multiple
7058 of the group size. The duplicate-and-interleave approach to setting
7059 up the initial vector does too. */
7060 if (!multiple_p (nunits_out, group_size))
7061 {
7062 if (dump_enabled_p ())
7063 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7064 "unsupported form of SLP reduction for"
7065 " variable-length vectors: the vector size"
7066 " is not a multiple of the number of results.\n");
7067 return false;
7068 }
7069 }
7070
7071 if (reduction_type == COND_REDUCTION)
7072 {
7073 widest_int ni;
7074
7075 if (! max_loop_iterations (loop, &ni))
7076 {
7077 if (dump_enabled_p ())
7078 dump_printf_loc (MSG_NOTE, vect_location,
7079 "loop count not known, cannot create cond "
7080 "reduction.\n");
7081 return false;
7082 }
7083 /* Convert backedges to iterations. */
7084 ni += 1;
7085
7086 /* The additional index will be the same type as the condition. Check
7087 that the loop iteration count fits into this type less one (the zero
7088 slot is reserved for the case in which no lanes match). */
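/* For example, with a 16-bit condition type the index type is a 16-bit
unsigned type whose maximum value is 65535, so loops that may run for
65535 or more iterations are rejected here. */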
7089 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7090 if (wi::geu_p (ni, wi::to_widest (max_index)))
7091 {
7092 if (dump_enabled_p ())
7093 dump_printf_loc (MSG_NOTE, vect_location,
7094 "loop size is greater than data size.\n");
7095 return false;
7096 }
7097 }
7098
7099 /* In case the vectorization factor (VF) is bigger than the number
7100 of elements that we can fit in a vectype (nunits), we have to generate
7101 more than one vector stmt, i.e. we need to "unroll" the
7102 vector stmt by a factor VF/nunits. For more details see documentation
7103 in vectorizable_operation. */
7104
7105 /* If the reduction is used in an outer loop we need to generate
7106 VF intermediate results, like so (e.g. for ncopies=2):
7107 r0 = phi (init, r0)
7108 r1 = phi (init, r1)
7109 r0 = x0 + r0;
7110 r1 = x1 + r1;
7111 (i.e. we generate VF results in 2 registers).
7112 In this case we have a separate def-use cycle for each copy, and therefore
7113 for each copy we get the vector def for the reduction variable from the
7114 respective phi node created for this copy.
7115
7116 Otherwise (the reduction is unused in the loop nest), we can combine
7117 together intermediate results, like so (e.g. for ncopies=2):
7118 r = phi (init, r)
7119 r = x0 + r;
7120 r = x1 + r;
7121 (i.e. we generate VF/2 results in a single register).
7122 In this case for each copy we get the vector def for the reduction variable
7123 from the vectorized reduction operation generated in the previous iteration.
7124
7125 This only works when we see both the reduction PHI and its only consumer
7126 in vectorizable_reduction and there are no intermediate stmts
7127 participating. */
7128 if (ncopies > 1
7129 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7130 && reduc_chain_length == 1)
7131 single_defuse_cycle = true;
7132
7133 if (single_defuse_cycle || lane_reduc_code_p)
7134 {
7135 gcc_assert (code != COND_EXPR);
7136
7137 /* 4. Supportable by target? */
7138 bool ok = true;
7139
7140 /* 4.1. check support for the operation in the loop */
7141 optab optab = optab_for_tree_code (code, vectype_in, optab_query_kind);
7142 if (!optab)
7143 {
7144 if (dump_enabled_p ())
7145 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7146 "no optab.\n");
7147 ok = false;
7148 }
7149
7150 machine_mode vec_mode = TYPE_MODE (vectype_in);
7151 if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing)
7152 {
7153 if (dump_enabled_p ())
7154 dump_printf (MSG_NOTE, "op not supported by target.\n");
7155 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7156 || !vect_can_vectorize_without_simd_p (code))
7157 ok = false;
7158 else
7159 if (dump_enabled_p ())
7160 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7161 }
7162
7163 if (vect_emulated_vector_p (vectype_in)
7164 && !vect_can_vectorize_without_simd_p (code))
7165 {
7166 if (dump_enabled_p ())
7167 dump_printf (MSG_NOTE, "using word mode not possible.\n");
7168 return false;
7169 }
7170
7171 /* lane-reducing operations have to go through vect_transform_reduction.
7172 For the other cases try without the single cycle optimization. */
7173 if (!ok)
7174 {
7175 if (lane_reduc_code_p)
7176 return false;
7177 else
7178 single_defuse_cycle = false;
7179 }
7180 }
7181 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7182
7183 /* If the reduction stmt is one of the patterns that have lane
7184 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
7185 if ((ncopies > 1 && ! single_defuse_cycle)
7186 && lane_reduc_code_p)
7187 {
7188 if (dump_enabled_p ())
7189 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7190 "multi def-use cycle not possible for lane-reducing "
7191 "reduction operation\n");
7192 return false;
7193 }
7194
7195 if (slp_node
7196 && !(!single_defuse_cycle
7197 && code != DOT_PROD_EXPR
7198 && code != WIDEN_SUM_EXPR
7199 && code != SAD_EXPR
7200 && reduction_type != FOLD_LEFT_REDUCTION))
7201 for (i = 0; i < op_type; i++)
7202 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_in))
7203 {
7204 if (dump_enabled_p ())
7205 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7206 "incompatible vector types for invariants\n");
7207 return false;
7208 }
7209
7210 if (slp_node)
7211 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7212 else
7213 vec_num = 1;
7214
7215 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7216 reduction_type, ncopies, cost_vec);
7217 /* Cost the reduction op inside the loop if transformed via
7218 vect_transform_reduction. Otherwise this is costed by the
7219 separate vectorizable_* routines. */
7220 if (single_defuse_cycle
7221 || code == DOT_PROD_EXPR
7222 || code == WIDEN_SUM_EXPR
7223 || code == SAD_EXPR)
7224 record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
7225
7226 if (dump_enabled_p ()
7227 && reduction_type == FOLD_LEFT_REDUCTION)
7228 dump_printf_loc (MSG_NOTE, vect_location,
7229 "using an in-order (fold-left) reduction.\n");
7230 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7231 /* All but single defuse-cycle optimized, lane-reducing and fold-left
7232 reductions go through their own vectorizable_* routines. */
7233 if (!single_defuse_cycle
7234 && code != DOT_PROD_EXPR
7235 && code != WIDEN_SUM_EXPR
7236 && code != SAD_EXPR
7237 && reduction_type != FOLD_LEFT_REDUCTION)
7238 {
7239 stmt_vec_info tem
7240 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7241 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7242 {
7243 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7244 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7245 }
7246 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7247 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7248 }
7249 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7250 {
7251 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7252 internal_fn cond_fn = get_conditional_internal_fn (code);
7253
7254 if (reduction_type != FOLD_LEFT_REDUCTION
7255 && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
7256 && (cond_fn == IFN_LAST
7257 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7258 OPTIMIZE_FOR_SPEED)))
7259 {
7260 if (dump_enabled_p ())
7261 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7262 "can't operate on partial vectors because"
7263 " no conditional operation is available.\n");
7264 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7265 }
7266 else if (reduction_type == FOLD_LEFT_REDUCTION
7267 && reduc_fn == IFN_LAST
7268 && !expand_vec_cond_expr_p (vectype_in,
7269 truth_type_for (vectype_in),
7270 SSA_NAME))
7271 {
7272 if (dump_enabled_p ())
7273 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7274 "can't operate on partial vectors because"
7275 " no conditional operation is available.\n");
7276 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7277 }
7278 else
7279 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7280 vectype_in, NULL);
7281 }
7282 return true;
7283 }
7284
7285 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7286 value. */
7287
7288 bool
7289 vect_transform_reduction (loop_vec_info loop_vinfo,
7290 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7291 gimple **vec_stmt, slp_tree slp_node)
7292 {
7293 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7294 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7295 int i;
7296 int ncopies;
7297 int vec_num;
7298
7299 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7300 gcc_assert (reduc_info->is_reduc_info);
7301
7302 if (nested_in_vect_loop_p (loop, stmt_info))
7303 {
7304 loop = loop->inner;
7305 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
7306 }
7307
7308 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
7309 enum tree_code code = gimple_assign_rhs_code (stmt);
7310 int op_type = TREE_CODE_LENGTH (code);
7311
7312 /* Flatten RHS. */
7313 tree ops[3];
7314 switch (get_gimple_rhs_class (code))
7315 {
7316 case GIMPLE_TERNARY_RHS:
7317 ops[2] = gimple_assign_rhs3 (stmt);
7318 /* Fall thru. */
7319 case GIMPLE_BINARY_RHS:
7320 ops[0] = gimple_assign_rhs1 (stmt);
7321 ops[1] = gimple_assign_rhs2 (stmt);
7322 break;
7323 default:
7324 gcc_unreachable ();
7325 }
7326
7327 /* All uses but the last are expected to be defined in the loop.
7328 The last use is the reduction variable. In case of nested cycle this
7329 assumption is not true: we use reduc_index to record the index of the
7330 reduction variable. */
7331 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7332 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7333 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
7334 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7335
7336 if (slp_node)
7337 {
7338 ncopies = 1;
7339 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7340 }
7341 else
7342 {
7343 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7344 vec_num = 1;
7345 }
7346
7347 internal_fn cond_fn = get_conditional_internal_fn (code);
7348 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7349 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7350
7351 /* Transform. */
7352 tree new_temp = NULL_TREE;
7353 auto_vec<tree> vec_oprnds0;
7354 auto_vec<tree> vec_oprnds1;
7355 auto_vec<tree> vec_oprnds2;
7356 tree def0;
7357
7358 if (dump_enabled_p ())
7359 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7360
7361 /* FORNOW: Multiple types are not supported for condition. */
7362 if (code == COND_EXPR)
7363 gcc_assert (ncopies == 1);
7364
7365 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7366
7367 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7368 if (reduction_type == FOLD_LEFT_REDUCTION)
7369 {
7370 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7371 return vectorize_fold_left_reduction
7372 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7373 reduc_fn, ops, vectype_in, reduc_index, masks);
7374 }
7375
7376 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
7377 gcc_assert (single_defuse_cycle
7378 || code == DOT_PROD_EXPR
7379 || code == WIDEN_SUM_EXPR
7380 || code == SAD_EXPR);
7381
7382 /* Create the destination vector */
7383 tree scalar_dest = gimple_assign_lhs (stmt);
7384 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7385
7386 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
7387 single_defuse_cycle && reduc_index == 0
7388 ? NULL_TREE : ops[0], &vec_oprnds0,
7389 single_defuse_cycle && reduc_index == 1
7390 ? NULL_TREE : ops[1], &vec_oprnds1,
7391 op_type == ternary_op
7392 && !(single_defuse_cycle && reduc_index == 2)
7393 ? ops[2] : NULL_TREE, &vec_oprnds2);
7394 if (single_defuse_cycle)
7395 {
7396 gcc_assert (!slp_node);
7397 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7398 ops[reduc_index],
7399 reduc_index == 0 ? &vec_oprnds0
7400 : (reduc_index == 1 ? &vec_oprnds1
7401 : &vec_oprnds2));
7402 }
7403
7404 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7405 {
7406 gimple *new_stmt;
7407 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7408 if (masked_loop_p && !mask_by_cond_expr)
7409 {
7410 /* Make sure that the reduction accumulator is vop[0]. */
7411 if (reduc_index == 1)
7412 {
7413 gcc_assert (commutative_tree_code (code));
7414 std::swap (vop[0], vop[1]);
7415 }
7416 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7417 vectype_in, i);
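/* Emit the conditional form, e.g. COND_ADD (mask, vop[0], vop[1], vop[0]);
inactive lanes simply pass the accumulator vop[0] through unchanged. */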
7418 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7419 vop[0], vop[1], vop[0]);
7420 new_temp = make_ssa_name (vec_dest, call);
7421 gimple_call_set_lhs (call, new_temp);
7422 gimple_call_set_nothrow (call, true);
7423 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
7424 new_stmt = call;
7425 }
7426 else
7427 {
7428 if (op_type == ternary_op)
7429 vop[2] = vec_oprnds2[i];
7430
7431 if (masked_loop_p && mask_by_cond_expr)
7432 {
7433 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7434 vectype_in, i);
7435 build_vect_cond_expr (code, vop, mask, gsi);
7436 }
7437
7438 new_stmt = gimple_build_assign (vec_dest, code,
7439 vop[0], vop[1], vop[2]);
7440 new_temp = make_ssa_name (vec_dest, new_stmt);
7441 gimple_assign_set_lhs (new_stmt, new_temp);
7442 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7443 }
7444
7445 if (slp_node)
7446 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7447 else if (single_defuse_cycle
7448 && i < ncopies - 1)
7449 {
7450 if (reduc_index == 0)
7451 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
7452 else if (reduc_index == 1)
7453 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
7454 else if (reduc_index == 2)
7455 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
7456 }
7457 else
7458 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7459 }
7460
7461 if (!slp_node)
7462 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7463
7464 return true;
7465 }
7466
7467 /* Transform phase of a cycle PHI. */
7468
7469 bool
7470 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7471 stmt_vec_info stmt_info, gimple **vec_stmt,
7472 slp_tree slp_node, slp_instance slp_node_instance)
7473 {
7474 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7475 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7476 int i;
7477 int ncopies;
7478 int j;
7479 bool nested_cycle = false;
7480 int vec_num;
7481
7482 if (nested_in_vect_loop_p (loop, stmt_info))
7483 {
7484 loop = loop->inner;
7485 nested_cycle = true;
7486 }
7487
7488 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7489 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7490 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7491 gcc_assert (reduc_info->is_reduc_info);
7492
7493 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7494 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7495 /* Leave the scalar phi in place. */
7496 return true;
7497
7498 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7499 /* For a nested cycle we do not fill the above. */
7500 if (!vectype_in)
7501 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7502 gcc_assert (vectype_in);
7503
7504 if (slp_node)
7505 {
7506 /* The size vect_schedule_slp_instance computes is off for us. */
7507 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7508 * SLP_TREE_LANES (slp_node), vectype_in);
7509 ncopies = 1;
7510 }
7511 else
7512 {
7513 vec_num = 1;
7514 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7515 }
7516
7517 /* Check whether we should use a single PHI node and accumulate
7518 vectors to one before the backedge. */
7519 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7520 ncopies = 1;
7521
7522 /* Create the destination vector */
7523 gphi *phi = as_a <gphi *> (stmt_info->stmt);
7524 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7525 vectype_out);
7526
7527 /* Get the loop-entry arguments. */
7528 tree vec_initial_def = NULL_TREE;
7529 auto_vec<tree> vec_initial_defs;
7530 if (slp_node)
7531 {
7532 vec_initial_defs.reserve (vec_num);
7533 if (nested_cycle)
7534 {
7535 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
7536 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
7537 &vec_initial_defs);
7538 }
7539 else
7540 {
7541 gcc_assert (slp_node == slp_node_instance->reduc_phis);
7542 vec<tree> &initial_values = reduc_info->reduc_initial_values;
7543 vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
7544
7545 unsigned int num_phis = stmts.length ();
7546 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
7547 num_phis = 1;
7548 initial_values.reserve (num_phis);
7549 for (unsigned int i = 0; i < num_phis; ++i)
7550 {
7551 gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
7552 initial_values.quick_push (vect_phi_initial_value (this_phi));
7553 }
7554 if (vec_num == 1)
7555 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
7556 if (!initial_values.is_empty ())
7557 {
7558 tree initial_value
7559 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
7560 tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7561 tree neutral_op
7562 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7563 code, initial_value);
7564 get_initial_defs_for_reduction (loop_vinfo, reduc_info,
7565 &vec_initial_defs, vec_num,
7566 stmts.length (), neutral_op);
7567 }
7568 }
7569 }
7570 else
7571 {
7572 /* Get at the scalar def before the loop that defines the initial
7573 value of the reduction variable. */
7574 tree initial_def = vect_phi_initial_value (phi);
7575 reduc_info->reduc_initial_values.safe_push (initial_def);
7576 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7577 and we can't use zero for induc_val, use initial_def. Similarly
7578 for REDUC_MIN and initial_def larger than the base. */
7579 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7580 {
7581 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7582 if (TREE_CODE (initial_def) == INTEGER_CST
7583 && !integer_zerop (induc_val)
7584 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7585 && tree_int_cst_lt (initial_def, induc_val))
7586 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7587 && tree_int_cst_lt (induc_val, initial_def))))
7588 {
7589 induc_val = initial_def;
7590 /* Communicate that we used the initial_def to epilogue
7591 generation. */
7592 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7593 }
7594 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7595 }
7596 else if (nested_cycle)
7597 {
7598 /* Do not use an adjustment def as that case is not supported
7599 correctly if ncopies is not one. */
7600 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
7601 ncopies, initial_def,
7602 &vec_initial_defs);
7603 }
7604 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
7605 || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
7606 /* Fill the initial vector with the initial scalar value. */
7607 vec_initial_def
7608 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
7609 initial_def, initial_def);
7610 else
7611 {
7612 if (ncopies == 1)
7613 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
7614 if (!reduc_info->reduc_initial_values.is_empty ())
7615 {
7616 initial_def = reduc_info->reduc_initial_values[0];
7617 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7618 tree neutral_op
7619 = neutral_op_for_reduction (TREE_TYPE (initial_def),
7620 code, initial_def);
7621 gcc_assert (neutral_op);
7622 /* Try to simplify the vector initialization by applying an
7623 adjustment after the reduction has been performed. */
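/* E.g. a sum reduction starting at 10 can use a {0, ..., 0} vector
accumulator, with 10 added back to the reduced scalar in the epilogue. */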
7624 if (!reduc_info->reused_accumulator
7625 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
7626 && !operand_equal_p (neutral_op, initial_def))
7627 {
7628 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
7629 = initial_def;
7630 initial_def = neutral_op;
7631 }
7632 vec_initial_def
7633 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
7634 initial_def, neutral_op);
7635 }
7636 }
7637 }
7638
7639 if (vec_initial_def)
7640 {
7641 vec_initial_defs.create (ncopies);
7642 for (i = 0; i < ncopies; ++i)
7643 vec_initial_defs.quick_push (vec_initial_def);
7644 }
7645
7646 if (auto *accumulator = reduc_info->reused_accumulator)
7647 {
7648 tree def = accumulator->reduc_input;
7649 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
7650 {
7651 unsigned int nreduc;
7652 bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
7653 (TREE_TYPE (def)),
7654 TYPE_VECTOR_SUBPARTS (vectype_out),
7655 &nreduc);
7656 gcc_assert (res);
7657 gimple_seq stmts = NULL;
7658 /* Reduce the single vector to a smaller one. */
7659 if (nreduc != 1)
7660 {
7661 /* Perform the reduction in the appropriate type. */
7662 tree rvectype = vectype_out;
7663 if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
7664 TREE_TYPE (TREE_TYPE (def))))
7665 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
7666 TYPE_VECTOR_SUBPARTS
7667 (vectype_out));
7668 def = vect_create_partial_epilog (def, rvectype,
7669 STMT_VINFO_REDUC_CODE
7670 (reduc_info),
7671 &stmts);
7672 }
7673 /* The epilogue loop might use a different vector mode, like
7674 VNx2DI vs. V2DI. */
7675 if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
7676 {
7677 tree reduc_type = build_vector_type_for_mode
7678 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
7679 def = gimple_convert (&stmts, reduc_type, def);
7680 }
7681 /* Adjust the input so we pick up the partially reduced value
7682 for the skip edge in vect_create_epilog_for_reduction. */
7683 accumulator->reduc_input = def;
7684 /* And the reduction could be carried out using a different sign. */
7685 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
7686 def = gimple_convert (&stmts, vectype_out, def);
7687 if (loop_vinfo->main_loop_edge)
7688 {
7689 /* While we'd like to insert on the edge this will split
7690 blocks and disturb bookkeeping, we also will eventually
7691 need this on the skip edge. Rely on sinking to
7692 fix up optimal placement and insert in the pred. */
7693 gimple_stmt_iterator gsi
7694 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
7695 /* Insert before a cond that eventually skips the
7696 epilogue. */
7697 if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
7698 gsi_prev (&gsi);
7699 gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
7700 }
7701 else
7702 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
7703 stmts);
7704 }
7705 if (loop_vinfo->main_loop_edge)
7706 vec_initial_defs[0]
7707 = vect_get_main_loop_result (loop_vinfo, def,
7708 vec_initial_defs[0]);
7709 else
7710 vec_initial_defs.safe_push (def);
7711 }
7712
7713 /* Generate the reduction PHIs upfront. */
7714 for (i = 0; i < vec_num; i++)
7715 {
7716 tree vec_init_def = vec_initial_defs[i];
7717 for (j = 0; j < ncopies; j++)
7718 {
7719 /* Create the reduction-phi that defines the reduction
7720 operand. */
7721 gphi *new_phi = create_phi_node (vec_dest, loop->header);
7722
7723 /* Set the loop-entry arg of the reduction-phi. */
7724 if (j != 0 && nested_cycle)
7725 vec_init_def = vec_initial_defs[j];
7726 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7727 UNKNOWN_LOCATION);
7728
7729 /* The loop-latch arg is set in epilogue processing. */
7730
7731 if (slp_node)
7732 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7733 else
7734 {
7735 if (j == 0)
7736 *vec_stmt = new_phi;
7737 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7738 }
7739 }
7740 }
7741
7742 return true;
7743 }
7744
7745 /* Vectorizes LC PHIs. */
7746
7747 bool
7748 vectorizable_lc_phi (loop_vec_info loop_vinfo,
7749 stmt_vec_info stmt_info, gimple **vec_stmt,
7750 slp_tree slp_node)
7751 {
7752 if (!loop_vinfo
7753 || !is_a <gphi *> (stmt_info->stmt)
7754 || gimple_phi_num_args (stmt_info->stmt) != 1)
7755 return false;
7756
7757 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7758 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7759 return false;
7760
7761 if (!vec_stmt) /* transformation not required. */
7762 {
7763 /* Deal with copies from externs or constants that are disguised as
7764 loop-closed PHI nodes (PR97886). */
7765 if (slp_node
7766 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
7767 SLP_TREE_VECTYPE (slp_node)))
7768 {
7769 if (dump_enabled_p ())
7770 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7771 "incompatible vector types for invariants\n");
7772 return false;
7773 }
7774 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7775 return true;
7776 }
7777
7778 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7779 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7780 basic_block bb = gimple_bb (stmt_info->stmt);
7781 edge e = single_pred_edge (bb);
7782 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7783 auto_vec<tree> vec_oprnds;
7784 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
7785 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
7786 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
7787 for (unsigned i = 0; i < vec_oprnds.length (); i++)
7788 {
7789 /* Create the vectorized LC PHI node. */
7790 gphi *new_phi = create_phi_node (vec_dest, bb);
7791 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7792 if (slp_node)
7793 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7794 else
7795 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7796 }
7797 if (!slp_node)
7798 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7799
7800 return true;
7801 }
7802
7803 /* Vectorizes PHIs. */
7804
7805 bool
7806 vectorizable_phi (vec_info *,
7807 stmt_vec_info stmt_info, gimple **vec_stmt,
7808 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
7809 {
7810 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
7811 return false;
7812
7813 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
7814 return false;
7815
7816 tree vectype = SLP_TREE_VECTYPE (slp_node);
7817
7818 if (!vec_stmt) /* transformation not required. */
7819 {
7820 slp_tree child;
7821 unsigned i;
7822 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
7823 if (!child)
7824 {
7825 if (dump_enabled_p ())
7826 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7827 "PHI node with unvectorized backedge def\n");
7828 return false;
7829 }
7830 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
7831 {
7832 if (dump_enabled_p ())
7833 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7834 "incompatible vector types for invariants\n");
7835 return false;
7836 }
7837 /* For single-argument PHIs assume coalescing which means zero cost
7838 for the scalar and the vector PHIs. This avoids artificially
7839 favoring the vector path (but may pessimize it in some cases). */
7840 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
7841 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
7842 vector_stmt, stmt_info, vectype, 0, vect_body);
7843 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
7844 return true;
7845 }
7846
7847 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7848 basic_block bb = gimple_bb (stmt_info->stmt);
7849 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7850 auto_vec<gphi *> new_phis;
7851 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
7852 {
7853 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
7854
7855 /* Skip not yet vectorized defs. */
7856 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
7857 && SLP_TREE_VEC_STMTS (child).is_empty ())
7858 continue;
7859
7860 auto_vec<tree> vec_oprnds;
7861 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
7862 if (!new_phis.exists ())
7863 {
7864 new_phis.create (vec_oprnds.length ());
7865 for (unsigned j = 0; j < vec_oprnds.length (); j++)
7866 {
7867 /* Create the vectorized PHI node. */
7868 new_phis.quick_push (create_phi_node (vec_dest, bb));
7869 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]);
7870 }
7871 }
7872 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
7873 for (unsigned j = 0; j < vec_oprnds.length (); j++)
7874 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
7875 }
7876 /* We should have at least one already vectorized child. */
7877 gcc_assert (new_phis.exists ());
7878
7879 return true;
7880 }
7881
7882 /* Return true if VECTYPE represents a vector that requires lowering
7883 by the vector lowering pass. */
7884
7885 bool
7886 vect_emulated_vector_p (tree vectype)
7887 {
7888 return (!VECTOR_MODE_P (TYPE_MODE (vectype))
7889 && (!VECTOR_BOOLEAN_TYPE_P (vectype)
7890 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
7891 }
7892
7893 /* Return true if we can emulate CODE on an integer mode representation
7894 of a vector. */
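/* For example, a BIT_AND_EXPR on a vector that only has an integer mode can
typically be carried out directly as a bitwise AND of the whole word, while
codes like PLUS_EXPR are left for the generic vector lowering pass to
emulate on the integer representation. */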
7895
7896 bool
7897 vect_can_vectorize_without_simd_p (tree_code code)
7898 {
7899 switch (code)
7900 {
7901 case PLUS_EXPR:
7902 case MINUS_EXPR:
7903 case NEGATE_EXPR:
7904 case BIT_AND_EXPR:
7905 case BIT_IOR_EXPR:
7906 case BIT_XOR_EXPR:
7907 case BIT_NOT_EXPR:
7908 return true;
7909
7910 default:
7911 return false;
7912 }
7913 }
7914
7915 /* Function vectorizable_induction
7916
7917 Check if STMT_INFO performs an induction computation that can be vectorized.
7918 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7919 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7920 Return true if STMT_INFO is vectorizable in this way. */
7921
7922 bool
7923 vectorizable_induction (loop_vec_info loop_vinfo,
7924 stmt_vec_info stmt_info,
7925 gimple **vec_stmt, slp_tree slp_node,
7926 stmt_vector_for_cost *cost_vec)
7927 {
7928 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7929 unsigned ncopies;
7930 bool nested_in_vect_loop = false;
7931 class loop *iv_loop;
7932 tree vec_def;
7933 edge pe = loop_preheader_edge (loop);
7934 basic_block new_bb;
7935 tree new_vec, vec_init, vec_step, t;
7936 tree new_name;
7937 gimple *new_stmt;
7938 gphi *induction_phi;
7939 tree induc_def, vec_dest;
7940 tree init_expr, step_expr;
7941 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7942 unsigned i;
7943 tree expr;
7944 gimple_stmt_iterator si;
7945
7946 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7947 if (!phi)
7948 return false;
7949
7950 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7951 return false;
7952
7953 /* Make sure it was recognized as induction computation. */
7954 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7955 return false;
7956
7957 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7958 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7959
7960 if (slp_node)
7961 ncopies = 1;
7962 else
7963 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7964 gcc_assert (ncopies >= 1);
7965
7966 /* FORNOW. These restrictions should be relaxed. */
7967 if (nested_in_vect_loop_p (loop, stmt_info))
7968 {
7969 imm_use_iterator imm_iter;
7970 use_operand_p use_p;
7971 gimple *exit_phi;
7972 edge latch_e;
7973 tree loop_arg;
7974
7975 if (ncopies > 1)
7976 {
7977 if (dump_enabled_p ())
7978 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7979 "multiple types in nested loop.\n");
7980 return false;
7981 }
7982
7983 exit_phi = NULL;
7984 latch_e = loop_latch_edge (loop->inner);
7985 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7986 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7987 {
7988 gimple *use_stmt = USE_STMT (use_p);
7989 if (is_gimple_debug (use_stmt))
7990 continue;
7991
7992 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7993 {
7994 exit_phi = use_stmt;
7995 break;
7996 }
7997 }
7998 if (exit_phi)
7999 {
8000 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
8001 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
8002 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
8003 {
8004 if (dump_enabled_p ())
8005 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8006 "inner-loop induction only used outside "
8007 "of the outer vectorized loop.\n");
8008 return false;
8009 }
8010 }
8011
8012 nested_in_vect_loop = true;
8013 iv_loop = loop->inner;
8014 }
8015 else
8016 iv_loop = loop;
8017 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
8018
8019 if (slp_node && !nunits.is_constant ())
8020 {
8021 /* The current SLP code creates the step value element-by-element. */
8022 if (dump_enabled_p ())
8023 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8024 "SLP induction not supported for variable-length"
8025 " vectors.\n");
8026 return false;
8027 }
8028
8029 if (!vec_stmt) /* transformation not required. */
8030 {
8031 unsigned inside_cost = 0, prologue_cost = 0;
8032 if (slp_node)
8033 {
8034 /* We eventually need to set a vector type on invariant
8035 arguments. */
8036 unsigned j;
8037 slp_tree child;
8038 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
8039 if (!vect_maybe_update_slp_op_vectype
8040 (child, SLP_TREE_VECTYPE (slp_node)))
8041 {
8042 if (dump_enabled_p ())
8043 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8044 "incompatible vector types for "
8045 "invariants\n");
8046 return false;
8047 }
8048 /* loop cost for vec_loop. */
8049 inside_cost
8050 = record_stmt_cost (cost_vec,
8051 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8052 vector_stmt, stmt_info, 0, vect_body);
8053 /* prologue cost for vec_init (if not nested) and step. */
8054 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
8055 scalar_to_vec,
8056 stmt_info, 0, vect_prologue);
8057 }
8058 else /* if (!slp_node) */
8059 {
8060 /* loop cost for vec_loop. */
8061 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8062 stmt_info, 0, vect_body);
8063 /* prologue cost for vec_init and vec_step. */
8064 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
8065 stmt_info, 0, vect_prologue);
8066 }
8067 if (dump_enabled_p ())
8068 dump_printf_loc (MSG_NOTE, vect_location,
8069 "vect_model_induction_cost: inside_cost = %d, "
8070 "prologue_cost = %d .\n", inside_cost,
8071 prologue_cost);
8072
8073 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
8074 DUMP_VECT_SCOPE ("vectorizable_induction");
8075 return true;
8076 }
8077
8078 /* Transform. */
8079
8080 /* Compute a vector variable, initialized with the first VF values of
8081 the induction variable. E.g., for an iv with IV_PHI='X' and
8082 evolution S, for a vector of 4 units, we want to compute:
8083 [X, X + S, X + 2*S, X + 3*S]. */
8084
8085 if (dump_enabled_p ())
8086 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
8087
8088 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
8089 gcc_assert (step_expr != NULL_TREE);
8090 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
8091
8092 pe = loop_preheader_edge (iv_loop);
8093 /* Find the first insertion point in the BB. */
8094 basic_block bb = gimple_bb (phi);
8095 si = gsi_after_labels (bb);
8096
8097 /* For SLP induction we have to generate several IVs as for example
8098 with group size 3 we need
8099 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
8100 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
8101 if (slp_node)
8102 {
8103 /* Enforced above. */
8104 unsigned int const_nunits = nunits.to_constant ();
8105
8106 /* The initial values are vectorized, but any lanes > group_size
8107 need adjustment. */
8108 slp_tree init_node
8109 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
8110
8111 /* Gather steps. Since we do not vectorize inductions as
8112 cycles we have to reconstruct the step from SCEV data. */
8113 unsigned group_size = SLP_TREE_LANES (slp_node);
8114 tree *steps = XALLOCAVEC (tree, group_size);
8115 tree *inits = XALLOCAVEC (tree, group_size);
8116 stmt_vec_info phi_info;
8117 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
8118 {
8119 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
8120 if (!init_node)
8121 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
8122 pe->dest_idx);
8123 }
8124
8125 /* Now generate the IVs. */
8126 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8127 gcc_assert ((const_nunits * nvects) % group_size == 0);
8128 unsigned nivs;
8129 if (nested_in_vect_loop)
8130 nivs = nvects;
8131 else
8132 {
8133 /* Compute the number of distinct IVs we need. First reduce
8134 group_size if it is a multiple of const_nunits so we get
8135 one IV for a group_size of 4 but const_nunits 2. */
8136 unsigned group_sizep = group_size;
8137 if (group_sizep % const_nunits == 0)
8138 group_sizep = group_sizep / const_nunits;
8139 nivs = least_common_multiple (group_sizep,
8140 const_nunits) / const_nunits;
8141 }
8142 tree stept = TREE_TYPE (step_vectype);
8143 tree lupdate_mul = NULL_TREE;
8144 if (!nested_in_vect_loop)
8145 {
8146 /* The number of iterations covered in one vector iteration. */
8147 unsigned lup_mul = (nvects * const_nunits) / group_size;
8148 lupdate_mul
8149 = build_vector_from_val (step_vectype,
8150 SCALAR_FLOAT_TYPE_P (stept)
8151 ? build_real_from_wide (stept, lup_mul,
8152 UNSIGNED)
8153 : build_int_cstu (stept, lup_mul));
8154 }
8155 tree peel_mul = NULL_TREE;
8156 gimple_seq init_stmts = NULL;
8157 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
8158 {
8159 if (SCALAR_FLOAT_TYPE_P (stept))
8160 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
8161 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8162 else
8163 peel_mul = gimple_convert (&init_stmts, stept,
8164 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8165 peel_mul = gimple_build_vector_from_val (&init_stmts,
8166 step_vectype, peel_mul);
8167 }
8168 unsigned ivn;
8169 auto_vec<tree> vec_steps;
8170 for (ivn = 0; ivn < nivs; ++ivn)
8171 {
8172 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
8173 tree_vector_builder init_elts (vectype, const_nunits, 1);
8174 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
8175 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
8176 {
8177 /* The scalar steps of the IVs. */
8178 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
8179 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
8180 step_elts.quick_push (elt);
8181 if (!init_node)
8182 {
8183 /* The scalar inits of the IVs if not vectorized. */
8184 elt = inits[(ivn*const_nunits + eltn) % group_size];
8185 if (!useless_type_conversion_p (TREE_TYPE (vectype),
8186 TREE_TYPE (elt)))
8187 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
8188 TREE_TYPE (vectype), elt);
8189 init_elts.quick_push (elt);
8190 }
8191 /* The number of steps to add to the initial values. */
8192 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
8193 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
8194 ? build_real_from_wide (stept,
8195 mul_elt, UNSIGNED)
8196 : build_int_cstu (stept, mul_elt));
8197 }
8198 vec_step = gimple_build_vector (&init_stmts, &step_elts);
8199 vec_steps.safe_push (vec_step);
8200 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
8201 if (peel_mul)
8202 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8203 step_mul, peel_mul);
8204 if (!init_node)
8205 vec_init = gimple_build_vector (&init_stmts, &init_elts);
8206
8207 /* Create the induction-phi that defines the induction-operand. */
8208 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
8209 "vec_iv_");
8210 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8211 induc_def = PHI_RESULT (induction_phi);
8212
8213 /* Create the iv update inside the loop */
8214 tree up = vec_step;
8215 if (lupdate_mul)
8216 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8217 vec_step, lupdate_mul);
8218 gimple_seq stmts = NULL;
8219 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8220 vec_def = gimple_build (&stmts,
8221 PLUS_EXPR, step_vectype, vec_def, up);
8222 vec_def = gimple_convert (&stmts, vectype, vec_def);
8223 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8224 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8225 UNKNOWN_LOCATION);
8226
8227 if (init_node)
8228 vec_init = vect_get_slp_vect_def (init_node, ivn);
8229 if (!nested_in_vect_loop
8230 && !integer_zerop (step_mul))
8231 {
8232 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
8233 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8234 vec_step, step_mul);
8235 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8236 vec_def, up);
8237 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
8238 }
8239
8240 /* Set the arguments of the phi node: */
8241 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8242
8243 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
8244 }
8245 if (!nested_in_vect_loop)
8246 {
8247 /* Fill up to the number of vectors we need for the whole group. */
8248 nivs = least_common_multiple (group_size,
8249 const_nunits) / const_nunits;
8250 vec_steps.reserve (nivs-ivn);
8251 for (; ivn < nivs; ++ivn)
8252 {
8253 SLP_TREE_VEC_STMTS (slp_node)
8254 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
8255 vec_steps.quick_push (vec_steps[0]);
8256 }
8257 }
8258
8259 /* Re-use IVs when we can. We are generating further vector
8260 stmts by adding VF' * stride to the IVs generated above. */
8261 if (ivn < nvects)
8262 {
8263 unsigned vfp
8264 = least_common_multiple (group_size, const_nunits) / group_size;
8265 tree lupdate_mul
8266 = build_vector_from_val (step_vectype,
8267 SCALAR_FLOAT_TYPE_P (stept)
8268 ? build_real_from_wide (stept,
8269 vfp, UNSIGNED)
8270 : build_int_cstu (stept, vfp));
8271 for (; ivn < nvects; ++ivn)
8272 {
8273 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
8274 tree def = gimple_get_lhs (iv);
8275 if (ivn < 2*nivs)
8276 vec_steps[ivn - nivs]
8277 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8278 vec_steps[ivn - nivs], lupdate_mul);
8279 gimple_seq stmts = NULL;
8280 def = gimple_convert (&stmts, step_vectype, def);
8281 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8282 def, vec_steps[ivn % nivs]);
8283 def = gimple_convert (&stmts, vectype, def);
8284 if (gimple_code (iv) == GIMPLE_PHI)
8285 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8286 else
8287 {
8288 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
8289 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
8290 }
8291 SLP_TREE_VEC_STMTS (slp_node)
8292 .quick_push (SSA_NAME_DEF_STMT (def));
8293 }
8294 }
8295
8296 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
8297 gcc_assert (!new_bb);
8298
8299 return true;
8300 }
8301
8302 init_expr = vect_phi_initial_value (phi);
8303
8304 gimple_seq stmts = NULL;
8305 if (!nested_in_vect_loop)
8306 {
8307 /* Convert the initial value to the IV update type. */
8308 tree new_type = TREE_TYPE (step_expr);
8309 init_expr = gimple_convert (&stmts, new_type, init_expr);
8310
8311 /* If we are using the loop mask to "peel" for alignment then we need
8312 to adjust the start value here. */
8313 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
8314 if (skip_niters != NULL_TREE)
8315 {
8316 if (FLOAT_TYPE_P (vectype))
8317 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
8318 skip_niters);
8319 else
8320 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
8321 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
8322 skip_niters, step_expr);
8323 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
8324 init_expr, skip_step);
8325 }
8326 }
8327
8328 if (stmts)
8329 {
8330 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8331 gcc_assert (!new_bb);
8332 }
8333
8334 /* Create the vector that holds the initial_value of the induction. */
8335 if (nested_in_vect_loop)
8336 {
8337 /* iv_loop is nested in the loop to be vectorized. init_expr had already
8338 been created during vectorization of previous stmts. We obtain it
8339 from the STMT_VINFO_VEC_STMT of the defining stmt. */
8340 auto_vec<tree> vec_inits;
8341 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8342 init_expr, &vec_inits);
8343 vec_init = vec_inits[0];
8344 /* If the initial value is not of proper type, convert it. */
8345 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
8346 {
8347 new_stmt
8348 = gimple_build_assign (vect_get_new_ssa_name (vectype,
8349 vect_simple_var,
8350 "vec_iv_"),
8351 VIEW_CONVERT_EXPR,
8352 build1 (VIEW_CONVERT_EXPR, vectype,
8353 vec_init));
8354 vec_init = gimple_assign_lhs (new_stmt);
8355 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
8356 new_stmt);
8357 gcc_assert (!new_bb);
8358 }
8359 }
8360 else
8361 {
8362 /* iv_loop is the loop to be vectorized. Create:
8363 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
8364 stmts = NULL;
8365 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
8366
8367 unsigned HOST_WIDE_INT const_nunits;
8368 if (nunits.is_constant (&const_nunits))
8369 {
8370 tree_vector_builder elts (step_vectype, const_nunits, 1);
8371 elts.quick_push (new_name);
8372 for (i = 1; i < const_nunits; i++)
8373 {
8374 /* Create: new_name_i = new_name + step_expr */
8375 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
8376 new_name, step_expr);
8377 elts.quick_push (new_name);
8378 }
8379 /* Create a vector from [new_name_0, new_name_1, ...,
8380 new_name_nunits-1] */
8381 vec_init = gimple_build_vector (&stmts, &elts);
8382 }
8383 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
8384 /* Build the initial value directly from a VEC_SERIES_EXPR. */
8385 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
8386 new_name, step_expr);
8387 else
8388 {
8389 /* Build:
8390 [base, base, base, ...]
8391 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
8392 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
8393 gcc_assert (flag_associative_math);
8394 tree index = build_index_vector (step_vectype, 0, 1);
8395 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8396 new_name);
8397 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8398 step_expr);
8399 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
8400 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
8401 vec_init, step_vec);
8402 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8403 vec_init, base_vec);
8404 }
8405 vec_init = gimple_convert (&stmts, vectype, vec_init);
8406
8407 if (stmts)
8408 {
8409 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8410 gcc_assert (!new_bb);
8411 }
8412 }
8413
8414
8415 /* Create the vector that holds the step of the induction. */
8416 if (nested_in_vect_loop)
8417 /* iv_loop is nested in the loop to be vectorized. Generate:
8418 vec_step = [S, S, S, S] */
8419 new_name = step_expr;
8420 else
8421 {
8422 /* iv_loop is the loop to be vectorized. Generate:
8423 vec_step = [VF*S, VF*S, VF*S, VF*S] */
8424 gimple_seq seq = NULL;
8425 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8426 {
8427 expr = build_int_cst (integer_type_node, vf);
8428 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8429 }
8430 else
8431 expr = build_int_cst (TREE_TYPE (step_expr), vf);
8432 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8433 expr, step_expr);
8434 if (seq)
8435 {
8436 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8437 gcc_assert (!new_bb);
8438 }
8439 }
8440
8441 t = unshare_expr (new_name);
8442 gcc_assert (CONSTANT_CLASS_P (new_name)
8443 || TREE_CODE (new_name) == SSA_NAME);
8444 new_vec = build_vector_from_val (step_vectype, t);
8445 vec_step = vect_init_vector (loop_vinfo, stmt_info,
8446 new_vec, step_vectype, NULL);
8447
8448
8449 /* Create the following def-use cycle:
8450 loop prolog:
8451 vec_init = ...
8452 vec_step = ...
8453 loop:
8454 vec_iv = PHI <vec_init, vec_loop>
8455 ...
8456 STMT
8457 ...
8458 vec_loop = vec_iv + vec_step; */
8459
8460 /* Create the induction-phi that defines the induction-operand. */
8461 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
8462 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8463 induc_def = PHI_RESULT (induction_phi);
8464
8465 /* Create the iv update inside the loop */
8466 stmts = NULL;
8467 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8468 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
8469 vec_def = gimple_convert (&stmts, vectype, vec_def);
8470 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8471 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8472
8473 /* Set the arguments of the phi node: */
8474 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8475 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8476 UNKNOWN_LOCATION);
8477
8478 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
8479 *vec_stmt = induction_phi;
8480
8481 /* In case that vectorization factor (VF) is bigger than the number
8482 of elements that we can fit in a vectype (nunits), we have to generate
8483 more than one vector stmt - i.e - we need to "unroll" the
8484 vector stmt by a factor VF/nunits. For more details see documentation
8485 in vectorizable_operation. */
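  /* E.g. (illustrative) with VF == 8 but only 4 elements per vector,
     ncopies == 2 and the second copy is obtained by adding
     [4*S, 4*S, 4*S, 4*S] to the first.  */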
8486
8487 if (ncopies > 1)
8488 {
8489 gimple_seq seq = NULL;
8490 /* FORNOW. This restriction should be relaxed. */
8491 gcc_assert (!nested_in_vect_loop);
8492
8493 /* Create the vector that holds the step of the induction. */
8494 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8495 {
8496 expr = build_int_cst (integer_type_node, nunits);
8497 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8498 }
8499 else
8500 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8501 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8502 expr, step_expr);
8503 if (seq)
8504 {
8505 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8506 gcc_assert (!new_bb);
8507 }
8508
8509 t = unshare_expr (new_name);
8510 gcc_assert (CONSTANT_CLASS_P (new_name)
8511 || TREE_CODE (new_name) == SSA_NAME);
8512 new_vec = build_vector_from_val (step_vectype, t);
8513 vec_step = vect_init_vector (loop_vinfo, stmt_info,
8514 new_vec, step_vectype, NULL);
8515
8516 vec_def = induc_def;
8517 for (i = 1; i < ncopies; i++)
8518 {
8519 /* vec_i = vec_prev + vec_step */
8520 gimple_seq stmts = NULL;
8521 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
8522 vec_def = gimple_build (&stmts,
8523 PLUS_EXPR, step_vectype, vec_def, vec_step);
8524 vec_def = gimple_convert (&stmts, vectype, vec_def);
8525
8526 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8527 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8528 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8529 }
8530 }
8531
8532 if (dump_enabled_p ())
8533 dump_printf_loc (MSG_NOTE, vect_location,
8534 "transform induction: created def-use cycle: %G%G",
8535 induction_phi, SSA_NAME_DEF_STMT (vec_def));
8536
8537 return true;
8538 }
8539
8540 /* Function vectorizable_live_operation.
8541
8542 STMT_INFO computes a value that is used outside the loop. Check if
8543 it can be supported. */
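/* For illustration (a hypothetical example): in

     for (i = 0; i < n; i++)
       last = a[i];
     use (last);

   'last' is live after the loop.  With a vectorization factor of 4 the
   value is extracted from the last lane of the final vector copy via a
   BIT_FIELD_REF, or via EXTRACT_LAST with the loop mask when the loop
   is fully masked.  */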
8544
8545 bool
8546 vectorizable_live_operation (vec_info *vinfo,
8547 stmt_vec_info stmt_info,
8548 gimple_stmt_iterator *gsi,
8549 slp_tree slp_node, slp_instance slp_node_instance,
8550 int slp_index, bool vec_stmt_p,
8551 stmt_vector_for_cost *cost_vec)
8552 {
8553 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
8554 imm_use_iterator imm_iter;
8555 tree lhs, lhs_type, bitsize;
8556 tree vectype = (slp_node
8557 ? SLP_TREE_VECTYPE (slp_node)
8558 : STMT_VINFO_VECTYPE (stmt_info));
8559 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8560 int ncopies;
8561 gimple *use_stmt;
8562 auto_vec<tree> vec_oprnds;
8563 int vec_entry = 0;
8564 poly_uint64 vec_index = 0;
8565
8566 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8567
8568 /* If a stmt of a reduction is live, vectorize it via
8569 vect_create_epilog_for_reduction. vectorizable_reduction assessed
8570 validity so just trigger the transform here. */
8571 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
8572 {
8573 if (!vec_stmt_p)
8574 return true;
8575 if (slp_node)
8576 {
8577 /* For reduction chains the meta-info is attached to
8578 the group leader. */
8579 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8580 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
8581 /* For SLP reductions we vectorize the epilogue for
8582 all involved stmts together. */
8583 else if (slp_index != 0)
8584 return true;
8585 else
8586 /* For SLP reductions the meta-info is attached to
8587 the representative. */
8588 stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
8589 }
8590 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8591 gcc_assert (reduc_info->is_reduc_info);
8592 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
8593 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
8594 return true;
8595 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
8596 slp_node_instance);
8597 return true;
8598 }
8599
8600 /* If STMT is not relevant and it is a simple assignment and its inputs are
8601 invariant then it can remain in place, unvectorized. The original last
8602 scalar value that it computes will be used. */
8603 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8604 {
8605 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
8606 if (dump_enabled_p ())
8607 dump_printf_loc (MSG_NOTE, vect_location,
8608 "statement is simple and uses invariant. Leaving in "
8609 "place.\n");
8610 return true;
8611 }
8612
8613 if (slp_node)
8614 ncopies = 1;
8615 else
8616 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8617
8618 if (slp_node)
8619 {
8620 gcc_assert (slp_index >= 0);
8621
8622 /* Get the last occurrence of the scalar index from the concatenation of
8623 all the slp vectors. Calculate which slp vector it is and the index
8624 within. */
8625 int num_scalar = SLP_TREE_LANES (slp_node);
8626 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8627 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
8628
8629 /* Calculate which vector contains the result, and which lane of
8630 that vector we need. */
8631 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8632 {
8633 if (dump_enabled_p ())
8634 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8635 "Cannot determine which vector holds the"
8636 " final result.\n");
8637 return false;
8638 }
8639 }
8640
8641 if (!vec_stmt_p)
8642 {
8643 /* No transformation required. */
8644 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8645 {
8646 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8647 OPTIMIZE_FOR_SPEED))
8648 {
8649 if (dump_enabled_p ())
8650 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8651 "can't operate on partial vectors "
8652 "because the target doesn't support extract "
8653 "last reduction.\n");
8654 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8655 }
8656 else if (slp_node)
8657 {
8658 if (dump_enabled_p ())
8659 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8660 "can't operate on partial vectors "
8661 "because an SLP statement is live after "
8662 "the loop.\n");
8663 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8664 }
8665 else if (ncopies > 1)
8666 {
8667 if (dump_enabled_p ())
8668 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8669 "can't operate on partial vectors "
8670 "because ncopies is greater than 1.\n");
8671 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8672 }
8673 else
8674 {
8675 gcc_assert (ncopies == 1 && !slp_node);
8676 vect_record_loop_mask (loop_vinfo,
8677 &LOOP_VINFO_MASKS (loop_vinfo),
8678 1, vectype, NULL);
8679 }
8680 }
8681 /* ??? Enable for loop costing as well. */
8682 if (!loop_vinfo)
8683 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
8684 0, vect_epilogue);
8685 return true;
8686 }
8687
8688 /* Use the lhs of the original scalar statement. */
8689 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8690 if (dump_enabled_p ())
8691 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
8692 "stmt %G", stmt);
8693
8694 lhs = gimple_get_lhs (stmt);
8695 lhs_type = TREE_TYPE (lhs);
8696
8697 bitsize = vector_element_bits_tree (vectype);
8698
8699 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8700 tree vec_lhs, bitstart;
8701 gimple *vec_stmt;
8702 if (slp_node)
8703 {
8704 gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8705
8706 /* Get the correct slp vectorized stmt. */
8707 vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8708 vec_lhs = gimple_get_lhs (vec_stmt);
8709
8710 /* Get entry to use. */
8711 bitstart = bitsize_int (vec_index);
8712 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8713 }
8714 else
8715 {
8716 /* For multiple copies, get the last copy. */
8717 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
8718 vec_lhs = gimple_get_lhs (vec_stmt);
8719
8720 /* Get the last lane in the vector. */
8721 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
8722 }
8723
8724 if (loop_vinfo)
8725 {
8726 /* To ensure that the VEC_LHS for the lane extraction stmts satisfies the
8727 loop-closed PHI requirement, insert one phi node for it. It looks like:
8728 loop;
8729 BB:
8730 # lhs' = PHI <lhs>
8731 ==>
8732 loop;
8733 BB:
8734 # vec_lhs' = PHI <vec_lhs>
8735 new_tree = lane_extract <vec_lhs', ...>;
8736 lhs' = new_tree; */
8737
8738 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8739 basic_block exit_bb = single_exit (loop)->dest;
8740 gcc_assert (single_pred_p (exit_bb));
8741
8742 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
8743 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
8744 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
8745
8746 gimple_seq stmts = NULL;
8747 tree new_tree;
8748 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8749 {
8750 /* Emit:
8751
8752 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8753
8754 where VEC_LHS is the vectorized live-out result and MASK is
8755 the loop mask for the final iteration. */
8756 gcc_assert (ncopies == 1 && !slp_node);
8757 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8758 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8759 1, vectype, 0);
8760 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
8761 mask, vec_lhs_phi);
8762
8763 /* Convert the extracted vector element to the scalar type. */
8764 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8765 }
8766 else
8767 {
8768 tree bftype = TREE_TYPE (vectype);
8769 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8770 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8771 new_tree = build3 (BIT_FIELD_REF, bftype,
8772 vec_lhs_phi, bitsize, bitstart);
8773 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8774 &stmts, true, NULL_TREE);
8775 }
8776
8777 if (stmts)
8778 {
8779 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
8780 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8781
8782 /* Remove existing phi from lhs and create one copy from new_tree. */
8783 tree lhs_phi = NULL_TREE;
8784 gimple_stmt_iterator gsi;
8785 for (gsi = gsi_start_phis (exit_bb);
8786 !gsi_end_p (gsi); gsi_next (&gsi))
8787 {
8788 gimple *phi = gsi_stmt (gsi);
8789 if ((gimple_phi_arg_def (phi, 0) == lhs))
8790 {
8791 remove_phi_node (&gsi, false);
8792 lhs_phi = gimple_phi_result (phi);
8793 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8794 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8795 break;
8796 }
8797 }
8798 }
8799
8800 /* Replace use of lhs with newly computed result. If the use stmt is a
8801 single arg PHI, just replace all uses of PHI result. It's necessary
8802 because lcssa PHI defining lhs may be before newly inserted stmt. */
8803 use_operand_p use_p;
8804 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8805 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8806 && !is_gimple_debug (use_stmt))
8807 {
8808 if (gimple_code (use_stmt) == GIMPLE_PHI
8809 && gimple_phi_num_args (use_stmt) == 1)
8810 {
8811 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8812 }
8813 else
8814 {
8815 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8816 SET_USE (use_p, new_tree);
8817 }
8818 update_stmt (use_stmt);
8819 }
8820 }
8821 else
8822 {
8823 /* For basic-block vectorization simply insert the lane-extraction. */
8824 tree bftype = TREE_TYPE (vectype);
8825 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8826 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8827 tree new_tree = build3 (BIT_FIELD_REF, bftype,
8828 vec_lhs, bitsize, bitstart);
8829 gimple_seq stmts = NULL;
8830 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8831 &stmts, true, NULL_TREE);
8832 if (TREE_CODE (new_tree) == SSA_NAME
8833 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
8834 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
8835 if (is_a <gphi *> (vec_stmt))
8836 {
8837 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
8838 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8839 }
8840 else
8841 {
8842 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
8843 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
8844 }
8845
8846 /* Replace use of lhs with newly computed result. If the use stmt is a
8847 single arg PHI, just replace all uses of PHI result. It's necessary
8848 because lcssa PHI defining lhs may be before newly inserted stmt. */
8849 use_operand_p use_p;
8850 stmt_vec_info use_stmt_info;
8851 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8852 if (!is_gimple_debug (use_stmt)
8853 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
8854 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
8855 {
8856 /* ??? This can happen when the live lane ends up being
8857 used in a vector construction code-generated by an
8858 external SLP node (and code-generation for that already
8859 happened). See gcc.dg/vect/bb-slp-47.c.
8860 Doing this is what would happen if that vector CTOR
8861 were not code-generated yet so it is not too bad.
8862 ??? In fact we'd likely want to avoid this situation
8863 in the first place. */
8864 if (TREE_CODE (new_tree) == SSA_NAME
8865 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
8866 && gimple_code (use_stmt) != GIMPLE_PHI
8867 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
8868 use_stmt))
8869 {
8870 enum tree_code code = gimple_assign_rhs_code (use_stmt);
8871 gcc_assert (code == CONSTRUCTOR
8872 || code == VIEW_CONVERT_EXPR
8873 || CONVERT_EXPR_CODE_P (code));
8874 if (dump_enabled_p ())
8875 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8876 "Using original scalar computation for "
8877 "live lane because use preceeds vector "
8878 "def\n");
8879 continue;
8880 }
8881 /* ??? It can also happen that we end up pulling a def into
8882 a loop where replacing out-of-loop uses would require
8883 a new LC SSA PHI node. Retain the original scalar in
8884 those cases as well. PR98064. */
8885 if (TREE_CODE (new_tree) == SSA_NAME
8886 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
8887 && (gimple_bb (use_stmt)->loop_father
8888 != gimple_bb (vec_stmt)->loop_father)
8889 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
8890 gimple_bb (use_stmt)->loop_father))
8891 {
8892 if (dump_enabled_p ())
8893 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8894 "Using original scalar computation for "
8895 "live lane because there is an out-of-loop "
8896 "definition for it\n");
8897 continue;
8898 }
8899 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8900 SET_USE (use_p, new_tree);
8901 update_stmt (use_stmt);
8902 }
8903 }
8904
8905 return true;
8906 }
8907
8908 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8909
8910 static void
8911 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
8912 {
8913 ssa_op_iter op_iter;
8914 imm_use_iterator imm_iter;
8915 def_operand_p def_p;
8916 gimple *ustmt;
8917
8918 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8919 {
8920 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8921 {
8922 basic_block bb;
8923
8924 if (!is_gimple_debug (ustmt))
8925 continue;
8926
8927 bb = gimple_bb (ustmt);
8928
8929 if (!flow_bb_inside_loop_p (loop, bb))
8930 {
8931 if (gimple_debug_bind_p (ustmt))
8932 {
8933 if (dump_enabled_p ())
8934 dump_printf_loc (MSG_NOTE, vect_location,
8935 "killing debug use\n");
8936
8937 gimple_debug_bind_reset_value (ustmt);
8938 update_stmt (ustmt);
8939 }
8940 else
8941 gcc_unreachable ();
8942 }
8943 }
8944 }
8945 }
8946
8947 /* Given loop represented by LOOP_VINFO, return true if computation of
8948 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8949 otherwise. */
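/* E.g. for a loop with a 32-bit IV that executes exactly 2^32 times,
   LOOP_VINFO_NITERSM1 is 0xffffffff while LOOP_VINFO_NITERS (= NITERSM1 + 1)
   wraps around to 0, so the computation overflows and this function
   returns false.  */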
8950
8951 static bool
8952 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8953 {
8954 /* Constant case. */
8955 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8956 {
8957 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8958 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8959
8960 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8961 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8962 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8963 return true;
8964 }
8965
8966 widest_int max;
8967 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8968 /* Check the upper bound of loop niters. */
8969 if (get_max_loop_iterations (loop, &max))
8970 {
8971 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8972 signop sgn = TYPE_SIGN (type);
8973 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8974 if (max < type_max)
8975 return true;
8976 }
8977 return false;
8978 }
8979
8980 /* Return a mask type with half the number of elements as OLD_TYPE,
8981 given that it should have mode NEW_MODE. */
8982
8983 tree
8984 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
8985 {
8986 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
8987 return build_truth_vector_type_for_mode (nunits, new_mode);
8988 }
8989
8990 /* Return a mask type with twice as many elements as OLD_TYPE,
8991 given that it should have mode NEW_MODE. */
8992
8993 tree
8994 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
8995 {
8996 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
8997 return build_truth_vector_type_for_mode (nunits, new_mode);
8998 }
8999
9000 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
9001 contain a sequence of NVECTORS masks that each control a vector of type
9002 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
9003 these vector masks with the vector version of SCALAR_MASK. */
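/* For example (illustrative): a conditional assignment guarded by
   b[i] < 0 that is if-converted and then fully masked records one mask
   per vector copy here, with SCALAR_MASK set to the b[i] < 0 condition
   so that later statements using the same condition can reuse the
   already-masked test.  */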
9004
9005 void
9006 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
9007 unsigned int nvectors, tree vectype, tree scalar_mask)
9008 {
9009 gcc_assert (nvectors != 0);
9010 if (masks->length () < nvectors)
9011 masks->safe_grow_cleared (nvectors, true);
9012 rgroup_controls *rgm = &(*masks)[nvectors - 1];
9013 /* The number of scalars per iteration and the number of vectors are
9014 both compile-time constants. */
9015 unsigned int nscalars_per_iter
9016 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
9017 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
9018
9019 if (scalar_mask)
9020 {
9021 scalar_cond_masked_key cond (scalar_mask, nvectors);
9022 loop_vinfo->scalar_cond_masked_set.add (cond);
9023 }
9024
9025 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
9026 {
9027 rgm->max_nscalars_per_iter = nscalars_per_iter;
9028 rgm->type = truth_type_for (vectype);
9029 rgm->factor = 1;
9030 }
9031 }
9032
9033 /* Given a complete set of masks MASKS, extract mask number INDEX
9034 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
9035 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
9036
9037 See the comment above vec_loop_masks for more details about the mask
9038 arrangement. */
9039
9040 tree
9041 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
9042 unsigned int nvectors, tree vectype, unsigned int index)
9043 {
9044 rgroup_controls *rgm = &(*masks)[nvectors - 1];
9045 tree mask_type = rgm->type;
9046
9047 /* Populate the rgroup's mask array, if this is the first time we've
9048 used it. */
9049 if (rgm->controls.is_empty ())
9050 {
9051 rgm->controls.safe_grow_cleared (nvectors, true);
9052 for (unsigned int i = 0; i < nvectors; ++i)
9053 {
9054 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
9055 /* Provide a dummy definition until the real one is available. */
9056 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
9057 rgm->controls[i] = mask;
9058 }
9059 }
9060
9061 tree mask = rgm->controls[index];
9062 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
9063 TYPE_VECTOR_SUBPARTS (vectype)))
9064 {
9065 /* A loop mask for data type X can be reused for data type Y
9066 if X has N times more elements than Y and if Y's elements
9067 are N times bigger than X's. In this case each sequence
9068 of N elements in the loop mask will be all-zero or all-one.
9069 We can then view-convert the mask so that each sequence of
9070 N elements is replaced by a single element. */
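 /* E.g. (illustrative) a mask created for a vector of 16 bytes can be
 view-converted into a mask for a vector of 4 ints: each group of 4
 byte-mask elements is known to be all-zero or all-one, so it collapses
 to a single int-mask element.  */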
9071 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
9072 TYPE_VECTOR_SUBPARTS (vectype)));
9073 gimple_seq seq = NULL;
9074 mask_type = truth_type_for (vectype);
9075 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
9076 if (seq)
9077 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
9078 }
9079 return mask;
9080 }
9081
9082 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
9083 lengths for controlling an operation on VECTYPE. The operation splits
9084 each element of VECTYPE into FACTOR separate subelements, measuring the
9085 length as a number of these subelements. */
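/* E.g. (illustrative) a load of a vector of 4 64-bit elements that is
   emulated through a byte-granular (VnQI) length control uses
   FACTOR == 8: a length of 32 subelements then covers all 4 original
   elements.  */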
9086
9087 void
9088 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
9089 unsigned int nvectors, tree vectype, unsigned int factor)
9090 {
9091 gcc_assert (nvectors != 0);
9092 if (lens->length () < nvectors)
9093 lens->safe_grow_cleared (nvectors, true);
9094 rgroup_controls *rgl = &(*lens)[nvectors - 1];
9095
9096 /* The number of scalars per iteration, the bytes each scalar occupies
9097 and the number of vectors are all compile-time constants. */
9098 unsigned int nscalars_per_iter
9099 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
9100 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
9101
9102 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
9103 {
9104 /* For now, we only support cases in which all loads and stores fall back
9105 to VnQI or none do. */
9106 gcc_assert (!rgl->max_nscalars_per_iter
9107 || (rgl->factor == 1 && factor == 1)
9108 || (rgl->max_nscalars_per_iter * rgl->factor
9109 == nscalars_per_iter * factor));
9110 rgl->max_nscalars_per_iter = nscalars_per_iter;
9111 rgl->type = vectype;
9112 rgl->factor = factor;
9113 }
9114 }
9115
9116 /* Given a complete set of length LENS, extract length number INDEX for an
9117 rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS. */
9118
9119 tree
9120 vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
9121 unsigned int nvectors, unsigned int index)
9122 {
9123 rgroup_controls *rgl = &(*lens)[nvectors - 1];
9124
9125 /* Populate the rgroup's len array, if this is the first time we've
9126 used it. */
9127 if (rgl->controls.is_empty ())
9128 {
9129 rgl->controls.safe_grow_cleared (nvectors, true);
9130 for (unsigned int i = 0; i < nvectors; ++i)
9131 {
9132 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
9133 gcc_assert (len_type != NULL_TREE);
9134 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
9135
9136 /* Provide a dummy definition until the real one is available. */
9137 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
9138 rgl->controls[i] = len;
9139 }
9140 }
9141
9142 return rgl->controls[index];
9143 }
9144
9145 /* Scale profiling counters by estimation for LOOP which is vectorized
9146 by factor VF. */
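/* E.g. (illustrative) a loop body with an estimated 128 executions that
   is vectorized with VF == 4 is rescaled to an estimated 32 executions,
   and the exit edge probability becomes roughly 1/33.  */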
9147
9148 static void
9149 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
9150 {
9151 edge preheader = loop_preheader_edge (loop);
9152 /* Reduce loop iterations by the vectorization factor. */
9153 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
9154 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
9155
9156 if (freq_h.nonzero_p ())
9157 {
9158 profile_probability p;
9159
9160 /* Avoid dropping loop body profile counter to 0 because of zero count
9161 in loop's preheader. */
9162 if (!(freq_e == profile_count::zero ()))
9163 freq_e = freq_e.force_nonzero ();
9164 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
9165 scale_loop_frequencies (loop, p);
9166 }
9167
9168 edge exit_e = single_exit (loop);
9169 exit_e->probability = profile_probability::always ()
9170 .apply_scale (1, new_est_niter + 1);
9171
9172 edge exit_l = single_pred_edge (loop->latch);
9173 profile_probability prob = exit_l->probability;
9174 exit_l->probability = exit_e->probability.invert ();
9175 if (prob.initialized_p () && exit_l->probability.initialized_p ())
9176 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
9177 }
9178
9179 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
9180 latch edge values originally defined by it. */
9181
9182 static void
9183 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
9184 stmt_vec_info def_stmt_info)
9185 {
9186 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
9187 if (!def || TREE_CODE (def) != SSA_NAME)
9188 return;
9189 stmt_vec_info phi_info;
9190 imm_use_iterator iter;
9191 use_operand_p use_p;
9192 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
9193 if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
9194 if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
9195 && (phi_info = loop_vinfo->lookup_stmt (phi))
9196 && STMT_VINFO_RELEVANT_P (phi_info)
9197 && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
9198 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
9199 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
9200 {
9201 loop_p loop = gimple_bb (phi)->loop_father;
9202 edge e = loop_latch_edge (loop);
9203 if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
9204 {
9205 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
9206 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
9207 gcc_assert (phi_defs.length () == latch_defs.length ());
9208 for (unsigned i = 0; i < phi_defs.length (); ++i)
9209 add_phi_arg (as_a <gphi *> (phi_defs[i]),
9210 gimple_get_lhs (latch_defs[i]), e,
9211 gimple_phi_arg_location (phi, e->dest_idx));
9212 }
9213 }
9214 }
9215
9216 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
9217 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
9218 stmt_vec_info. */
9219
9220 static bool
9221 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9222 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
9223 {
9224 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9225 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9226
9227 if (dump_enabled_p ())
9228 dump_printf_loc (MSG_NOTE, vect_location,
9229 "------>vectorizing statement: %G", stmt_info->stmt);
9230
9231 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9232 vect_loop_kill_debug_uses (loop, stmt_info);
9233
9234 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9235 && !STMT_VINFO_LIVE_P (stmt_info))
9236 return false;
9237
9238 if (STMT_VINFO_VECTYPE (stmt_info))
9239 {
9240 poly_uint64 nunits
9241 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
9242 if (!STMT_SLP_TYPE (stmt_info)
9243 && maybe_ne (nunits, vf)
9244 && dump_enabled_p ())
9245 /* For SLP VF is set according to unrolling factor, and not
9246 to vector size, hence for SLP this print is not valid. */
9247 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9248 }
9249
9250 /* Pure SLP statements have already been vectorized. We still need
9251 to apply loop vectorization to hybrid SLP statements. */
9252 if (PURE_SLP_STMT (stmt_info))
9253 return false;
9254
9255 if (dump_enabled_p ())
9256 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
9257
9258 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
9259 *seen_store = stmt_info;
9260
9261 return true;
9262 }
9263
9264 /* Helper function to pass to simplify_replace_tree to enable replacing trees
9265 in the hash_map with their corresponding values. */
9266
9267 static tree
9268 find_in_mapping (tree t, void *context)
9269 {
9270 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
9271
9272 tree *value = mapping->get (t);
9273 return value ? *value : t;
9274 }
9275
9276 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
9277 original loop that has now been vectorized.
9278
9279 The inits of the data_references need to be advanced with the number of
9280 iterations of the main loop. This has been computed in vect_do_peeling and
9281 is stored in parameter ADVANCE. We first restore the data_references
9282 initial offset with the values recorded in ORIG_DRS_INIT.
9283
9284 Since the loop_vec_info of this EPILOGUE was constructed for the original
9285 loop, its stmt_vec_infos all point to the original statements. These need
9286 to be updated to point to their corresponding copies as well as the SSA_NAMES
9287 in their PATTERN_DEF_SEQs and RELATED_STMTs.
9288
9289 The data_references' connections also need to be updated. Their
9290 corresponding dr_vec_infos need to be reconnected to the EPILOGUE's
9291 stmt_vec_infos, their statements need to point to their corresponding copy,
9292 if they are gather loads or scatter stores then their reference needs to be
9293 updated to point to its corresponding copy and finally we set
9294 'base_misaligned' to false as we have already peeled for alignment in the
9295 prologue of the main loop. */
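/* For example (illustrative, with hypothetical SSA names): if the main
   loop contains _5 = a[i_3] and its epilogue copy contains _15 = a[i_13],
   the LHS mapping records _5 -> _15 so that pattern definition sequences
   and related statements carried over from the main loop can be rewritten
   in terms of the epilogue's SSA names.  */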
9296
9297 static void
9298 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
9299 {
9300 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
9301 auto_vec<gimple *> stmt_worklist;
9302 hash_map<tree,tree> mapping;
9303 gimple *orig_stmt, *new_stmt;
9304 gimple_stmt_iterator epilogue_gsi;
9305 gphi_iterator epilogue_phi_gsi;
9306 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
9307 basic_block *epilogue_bbs = get_loop_body (epilogue);
9308 unsigned i;
9309
9310 free (LOOP_VINFO_BBS (epilogue_vinfo));
9311 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
9312
9313 /* Advance the data_references with the number of iterations of the previous
9314 loop and its prologue. */
9315 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
9316
9317
9318 /* The EPILOGUE loop is a copy of the original loop so they share the same
9319 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
9320 point to the copied statements. We also create a mapping of all LHS' in
9321 the original loop and all the LHS' in the EPILOGUE and create worklists to
9322 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
9323 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
9324 {
9325 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
9326 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
9327 {
9328 new_stmt = epilogue_phi_gsi.phi ();
9329
9330 gcc_assert (gimple_uid (new_stmt) > 0);
9331 stmt_vinfo
9332 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9333
9334 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9335 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9336
9337 mapping.put (gimple_phi_result (orig_stmt),
9338 gimple_phi_result (new_stmt));
9339 /* PHI nodes cannot have patterns or related statements. */
9340 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
9341 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
9342 }
9343
9344 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
9345 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
9346 {
9347 new_stmt = gsi_stmt (epilogue_gsi);
9348 if (is_gimple_debug (new_stmt))
9349 continue;
9350
9351 gcc_assert (gimple_uid (new_stmt) > 0);
9352 stmt_vinfo
9353 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9354
9355 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9356 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9357
9358 if (tree old_lhs = gimple_get_lhs (orig_stmt))
9359 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
9360
9361 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
9362 {
9363 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
9364 for (gimple_stmt_iterator gsi = gsi_start (seq);
9365 !gsi_end_p (gsi); gsi_next (&gsi))
9366 stmt_worklist.safe_push (gsi_stmt (gsi));
9367 }
9368
9369 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
9370 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
9371 {
9372 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
9373 stmt_worklist.safe_push (stmt);
9374 /* Set BB such that the assert in
9375 'get_initial_def_for_reduction' is able to determine that
9376 the BB of the related stmt is inside this loop. */
9377 gimple_set_bb (stmt,
9378 gimple_bb (new_stmt));
9379 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
9380 gcc_assert (related_vinfo == NULL
9381 || related_vinfo == stmt_vinfo);
9382 }
9383 }
9384 }
9385
9386 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
9387 using the original main loop and thus need to be updated to refer to the
9388 cloned variables used in the epilogue. */
9389 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
9390 {
9391 gimple *stmt = stmt_worklist[i];
9392 tree *new_op;
9393
9394 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
9395 {
9396 tree op = gimple_op (stmt, j);
9397 if ((new_op = mapping.get(op)))
9398 gimple_set_op (stmt, j, *new_op);
9399 else
9400 {
9401 /* PR92429: The last argument of simplify_replace_tree disables
9402 folding when replacing arguments. This is required as
9403 otherwise you might end up with different statements than the
9404 ones analyzed in vect_loop_analyze, leading to different
9405 vectorization. */
9406 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
9407 &find_in_mapping, &mapping, false);
9408 gimple_set_op (stmt, j, op);
9409 }
9410 }
9411 }
9412
9413 struct data_reference *dr;
9414 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
9415 FOR_EACH_VEC_ELT (datarefs, i, dr)
9416 {
9417 orig_stmt = DR_STMT (dr);
9418 gcc_assert (gimple_uid (orig_stmt) > 0);
9419 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
9420 /* Data references for gather loads and scatter stores do not use the
9421 updated offset we set using ADVANCE. Instead we have to make sure the
9422 reference in each data reference points to the corresponding copy of
9423 the original in the epilogue. */
9424 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
9425 == VMAT_GATHER_SCATTER)
9426 {
9427 DR_REF (dr)
9428 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
9429 &find_in_mapping, &mapping);
9430 DR_BASE_ADDRESS (dr)
9431 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
9432 &find_in_mapping, &mapping);
9433 }
9434 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
9435 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
9436 /* The vector size of the epilogue is smaller than that of the main loop
9437 so the alignment requirement is either the same or lower. This means
9438 the dr will by definition be aligned. */
9439 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
9440 }
9441
9442 epilogue_vinfo->shared->datarefs_copy.release ();
9443 epilogue_vinfo->shared->save_datarefs ();
9444 }
9445
9446 /* Function vect_transform_loop.
9447
9448 The analysis phase has determined that the loop is vectorizable.
9449 Vectorize the loop - create vectorized stmts to replace the scalar
9450 stmts in the loop, and update the loop exit condition.
9451 Returns scalar epilogue loop if any. */
9452
9453 class loop *
9454 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
9455 {
9456 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9457 class loop *epilogue = NULL;
9458 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
9459 int nbbs = loop->num_nodes;
9460 int i;
9461 tree niters_vector = NULL_TREE;
9462 tree step_vector = NULL_TREE;
9463 tree niters_vector_mult_vf = NULL_TREE;
9464 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9465 unsigned int lowest_vf = constant_lower_bound (vf);
9466 gimple *stmt;
9467 bool check_profitability = false;
9468 unsigned int th;
9469
9470 DUMP_VECT_SCOPE ("vec_transform_loop");
9471
9472 loop_vinfo->shared->check_datarefs ();
9473
9474 /* Use the more conservative vectorization threshold. If the number
9475 of iterations is constant assume the cost check has been performed
9476 by our caller. If the threshold makes all loops profitable that
9477 run at least the (estimated) vectorization factor number of times
9478 checking is pointless, too. */
9479 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
9480 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
9481 {
9482 if (dump_enabled_p ())
9483 dump_printf_loc (MSG_NOTE, vect_location,
9484 "Profitability threshold is %d loop iterations.\n",
9485 th);
9486 check_profitability = true;
9487 }
9488
9489 /* Make sure there exists a single-predecessor exit bb. Do this before
9490 versioning. */
9491 edge e = single_exit (loop);
9492 if (! single_pred_p (e->dest))
9493 {
9494 split_loop_exit_edge (e, true);
9495 if (dump_enabled_p ())
9496 dump_printf (MSG_NOTE, "split exit edge\n");
9497 }
9498
9499 /* Version the loop first, if required, so the profitability check
9500 comes first. */
9501
9502 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
9503 {
9504 class loop *sloop
9505 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
9506 sloop->force_vectorize = false;
9507 check_profitability = false;
9508 }
9509
9510 /* Make sure there exists a single-predecessor exit bb also on the
9511 scalar loop copy. Do this after versioning but before peeling
9512 so CFG structure is fine for both scalar and if-converted loop
9513 to make slpeel_duplicate_current_defs_from_edges face matched
9514 loop closed PHI nodes on the exit. */
9515 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9516 {
9517 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
9518 if (! single_pred_p (e->dest))
9519 {
9520 split_loop_exit_edge (e, true);
9521 if (dump_enabled_p ())
9522 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
9523 }
9524 }
9525
9526 tree niters = vect_build_loop_niters (loop_vinfo);
9527 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
9528 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
9529 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
9530 tree advance;
9531 drs_init_vec orig_drs_init;
9532
9533 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
9534 &step_vector, &niters_vector_mult_vf, th,
9535 check_profitability, niters_no_overflow,
9536 &advance);
9537
9538 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
9539 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
9540 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
9541 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
9542
9543 if (niters_vector == NULL_TREE)
9544 {
9545 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
9546 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
9547 && known_eq (lowest_vf, vf))
9548 {
9549 niters_vector
9550 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
9551 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
9552 step_vector = build_one_cst (TREE_TYPE (niters));
9553 }
9554 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9555 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
9556 &step_vector, niters_no_overflow);
9557 else
9558 /* vect_do_peeling subtracted the number of peeled prologue
9559 iterations from LOOP_VINFO_NITERS. */
9560 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
9561 &niters_vector, &step_vector,
9562 niters_no_overflow);
9563 }
9564
9565 /* 1) Make sure the loop header has exactly two entries
9566 2) Make sure we have a preheader basic block. */
9567
9568 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
9569
9570 split_edge (loop_preheader_edge (loop));
9571
9572 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9573 /* This will deal with any possible peeling. */
9574 vect_prepare_for_masked_peels (loop_vinfo);
9575
9576 /* Schedule the SLP instances first, then handle loop vectorization
9577 below. */
9578 if (!loop_vinfo->slp_instances.is_empty ())
9579 {
9580 DUMP_VECT_SCOPE ("scheduling SLP instances");
9581 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
9582 }
9583
9584 /* FORNOW: the vectorizer supports only loops whose body consists
9585 of one basic block (header + empty latch). When the vectorizer
9586 supports more involved loop forms, the order in which the BBs are
9587 traversed needs to be reconsidered. */
9588
9589 for (i = 0; i < nbbs; i++)
9590 {
9591 basic_block bb = bbs[i];
9592 stmt_vec_info stmt_info;
9593
9594 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9595 gsi_next (&si))
9596 {
9597 gphi *phi = si.phi ();
9598 if (dump_enabled_p ())
9599 dump_printf_loc (MSG_NOTE, vect_location,
9600 "------>vectorizing phi: %G", phi);
9601 stmt_info = loop_vinfo->lookup_stmt (phi);
9602 if (!stmt_info)
9603 continue;
9604
9605 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9606 vect_loop_kill_debug_uses (loop, stmt_info);
9607
9608 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9609 && !STMT_VINFO_LIVE_P (stmt_info))
9610 continue;
9611
9612 if (STMT_VINFO_VECTYPE (stmt_info)
9613 && (maybe_ne
9614 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
9615 && dump_enabled_p ())
9616 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9617
9618 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9619 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9620 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9621 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9622 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9623 && ! PURE_SLP_STMT (stmt_info))
9624 {
9625 if (dump_enabled_p ())
9626 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
9627 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
9628 }
9629 }
9630
9631 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9632 gsi_next (&si))
9633 {
9634 gphi *phi = si.phi ();
9635 stmt_info = loop_vinfo->lookup_stmt (phi);
9636 if (!stmt_info)
9637 continue;
9638
9639 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9640 && !STMT_VINFO_LIVE_P (stmt_info))
9641 continue;
9642
9643 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9644 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9645 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9646 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9647 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9648 && ! PURE_SLP_STMT (stmt_info))
9649 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
9650 }
9651
9652 for (gimple_stmt_iterator si = gsi_start_bb (bb);
9653 !gsi_end_p (si);)
9654 {
9655 stmt = gsi_stmt (si);
9656 /* During vectorization remove existing clobber stmts. */
9657 if (gimple_clobber_p (stmt))
9658 {
9659 unlink_stmt_vdef (stmt);
9660 gsi_remove (&si, true);
9661 release_defs (stmt);
9662 }
9663 else
9664 {
9665 /* Ignore vector stmts created in the outer loop. */
9666 stmt_info = loop_vinfo->lookup_stmt (stmt);
9667
9668 /* vector stmts created in the outer-loop during vectorization of
9669 stmts in an inner-loop may not have a stmt_info, and do not
9670 need to be vectorized. */
9671 stmt_vec_info seen_store = NULL;
9672 if (stmt_info)
9673 {
9674 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
9675 {
9676 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
9677 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
9678 !gsi_end_p (subsi); gsi_next (&subsi))
9679 {
9680 stmt_vec_info pat_stmt_info
9681 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
9682 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9683 &si, &seen_store);
9684 }
9685 stmt_vec_info pat_stmt_info
9686 = STMT_VINFO_RELATED_STMT (stmt_info);
9687 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9688 &si, &seen_store))
9689 maybe_set_vectorized_backedge_value (loop_vinfo,
9690 pat_stmt_info);
9691 }
9692 else
9693 {
9694 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
9695 &seen_store))
9696 maybe_set_vectorized_backedge_value (loop_vinfo,
9697 stmt_info);
9698 }
9699 }
9700 gsi_next (&si);
9701 if (seen_store)
9702 {
9703 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
9704 /* Interleaving.  The vectorization of the
9705 interleaving chain was completed - free
9706 all the stores in the chain. */
9707 vect_remove_stores (loop_vinfo,
9708 DR_GROUP_FIRST_ELEMENT (seen_store));
9709 else
9710 /* Free the attached stmt_vec_info and remove the stmt. */
9711 loop_vinfo->remove_stmt (stmt_info);
9712 }
9713 }
9714 }
9715
9716 /* Stub out scalar statements that must not survive vectorization.
9717 Doing this here helps with grouped statements, or statements that
9718 are involved in patterns. */
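 /* A hedged sketch of the stubbing below (the SSA names are made up).
    A masked load whose lhs is scalar, e.g.
      x_1 = .MASK_LOAD (ptr_2, align, mask_3);
    is replaced by
      x_1 = 0;
    and a conditional internal function with a scalar lhs, e.g.
      y_4 = .COND_ADD (mask_3, a_5, b_6, else_7);
    is replaced by
      y_4 = else_7;
    so that no unvectorized masked operation survives the transform.  */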
9719 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
9720 !gsi_end_p (gsi); gsi_next (&gsi))
9721 {
9722 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
9723 if (!call || !gimple_call_internal_p (call))
9724 continue;
9725 internal_fn ifn = gimple_call_internal_fn (call);
9726 if (ifn == IFN_MASK_LOAD)
9727 {
9728 tree lhs = gimple_get_lhs (call);
9729 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9730 {
9731 tree zero = build_zero_cst (TREE_TYPE (lhs));
9732 gimple *new_stmt = gimple_build_assign (lhs, zero);
9733 gsi_replace (&gsi, new_stmt, true);
9734 }
9735 }
9736 else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
9737 {
9738 tree lhs = gimple_get_lhs (call);
9739 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9740 {
9741 tree else_arg
9742 = gimple_call_arg (call, gimple_call_num_args (call) - 1);
9743 gimple *new_stmt = gimple_build_assign (lhs, else_arg);
9744 gsi_replace (&gsi, new_stmt, true);
9745 }
9746 }
9747 }
9748 } /* BBs in loop */
9749
9750 /* The vectorization factor is always > 1, so if we use an IV increment
9751 of 1, a zero NITERS becomes a nonzero NITERS_VECTOR. */
9752 if (integer_onep (step_vector))
9753 niters_no_overflow = true;
9754 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
9755 niters_vector_mult_vf, !niters_no_overflow);
9756
9757 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
9758 scale_profile_for_vect_loop (loop, assumed_vf);
9759
9760 /* True if the final iteration might not handle a full vector's
9761 worth of scalar iterations. */
9762 bool final_iter_may_be_partial
9763 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
9764 /* The minimum number of iterations performed by the epilogue. This
9765 is 1 when peeling for gaps because we always need a final scalar
9766 iteration. */
9767 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
9768 /* +1 to convert latch counts to loop iteration counts,
9769 -min_epilogue_iters to remove iterations that cannot be performed
9770 by the vector code. */
9771 int bias_for_lowest = 1 - min_epilogue_iters;
9772 int bias_for_assumed = bias_for_lowest;
9773 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
9774 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
9775 {
9776 /* When the amount of peeling is known at compile time, the first
9777 iteration will have exactly alignment_npeels active elements.
9778 In the worst case it will have at least one. */
9779 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
9780 bias_for_lowest += lowest_vf - min_first_active;
9781 bias_for_assumed += assumed_vf - min_first_active;
9782 }
9783 /* In these calculations the "- 1" converts loop iteration counts
9784 back to latch counts. */
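 /* Hedged worked example with hypothetical numbers: given an upper bound
    of 99 latch iterations, min_epilogue_iters == 0 and lowest_vf == 8,
    bias_for_lowest is 1, so without partial vectors the bound becomes
    (99 + 1) / 8 - 1 == 11 latch iterations of the vector loop, i.e. at
    most 12 vector iterations.  */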
9785 if (loop->any_upper_bound)
9786 {
9787 loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
9788 loop->nb_iterations_upper_bound
9789 = (final_iter_may_be_partial
9790 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
9791 lowest_vf) - 1
9792 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
9793 lowest_vf) - 1);
9794 if (main_vinfo)
9795 {
9796 unsigned int bound;
9797 poly_uint64 main_iters
9798 = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
9799 LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
9800 main_iters
9801 = upper_bound (main_iters,
9802 LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
9803 if (can_div_away_from_zero_p (main_iters,
9804 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
9805 &bound))
9806 loop->nb_iterations_upper_bound
9807 = wi::umin ((widest_int) (bound - 1),
9808 loop->nb_iterations_upper_bound);
9809 }
9810 }
9811 if (loop->any_likely_upper_bound)
9812 loop->nb_iterations_likely_upper_bound
9813 = (final_iter_may_be_partial
9814 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
9815 + bias_for_lowest, lowest_vf) - 1
9816 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
9817 + bias_for_lowest, lowest_vf) - 1);
9818 if (loop->any_estimate)
9819 loop->nb_iterations_estimate
9820 = (final_iter_may_be_partial
9821 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
9822 assumed_vf) - 1
9823 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
9824 assumed_vf) - 1);
9825
9826 if (dump_enabled_p ())
9827 {
9828 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9829 {
9830 dump_printf_loc (MSG_NOTE, vect_location,
9831 "LOOP VECTORIZED\n");
9832 if (loop->inner)
9833 dump_printf_loc (MSG_NOTE, vect_location,
9834 "OUTER LOOP VECTORIZED\n");
9835 dump_printf (MSG_NOTE, "\n");
9836 }
9837 else
9838 dump_printf_loc (MSG_NOTE, vect_location,
9839 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
9840 GET_MODE_NAME (loop_vinfo->vector_mode));
9841 }
9842
9843 /* Loops vectorized with a variable factor won't benefit from
9844 unrolling/peeling. */
9845 if (!vf.is_constant ())
9846 {
9847 loop->unroll = 1;
9848 if (dump_enabled_p ())
9849 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
9850 " variable-length vectorization factor\n");
9851 }
9852 /* Free SLP instances here because otherwise stmt reference counting
9853 won't work. */
9854 slp_instance instance;
9855 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9856 vect_free_slp_instance (instance);
9857 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9858 /* Clear the safelen field since its value is invalid after vectorization:
9859 the vectorized loop can now have loop-carried dependencies. */
9860 loop->safelen = 0;
9861
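 /* If peeling created an epilogue loop, return it to the caller; clearing
    dont_vectorize below allows the epilogue to be considered for
    vectorization in turn, typically with a smaller vector mode (compare
    the "LOOP EPILOGUE VECTORIZED" dump above).  */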
9862 if (epilogue)
9863 {
9864 update_epilogue_loop_vinfo (epilogue, advance);
9865
9866 epilogue->simduid = loop->simduid;
9867 epilogue->force_vectorize = loop->force_vectorize;
9868 epilogue->dont_vectorize = false;
9869 }
9870
9871 return epilogue;
9872 }
9873
9874 /* The code below performs a simple optimization - it reverts
9875 if-conversion for masked stores: if the mask of a store is zero, skip
9876 the store and, where possible, the producers of the stored values too.
9877 For example,
9878 for (i=0; i<n; i++)
9879 if (c[i])
9880 {
9881 p1[i] += 1;
9882 p2[i] = p3[i] + 2;
9883 }
9884 this transformation will produce the following semi-hammock:
9885
9886 if (mask__ifc__42.18_165 != { 0, 0, 0, 0, 0, 0, 0, 0 })
9887 {
9888 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9889 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9890 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9891 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9892 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9893 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9894 }
9895 */
9896
9897 void
9898 optimize_mask_stores (class loop *loop)
9899 {
9900 basic_block *bbs = get_loop_body (loop);
9901 unsigned nbbs = loop->num_nodes;
9902 unsigned i;
9903 basic_block bb;
9904 class loop *bb_loop;
9905 gimple_stmt_iterator gsi;
9906 gimple *stmt;
9907 auto_vec<gimple *> worklist;
9908 auto_purge_vect_location sentinel;
9909
9910 vect_location = find_loop_location (loop);
9911 /* Pick up all masked stores in the loop, if any. */
9912 for (i = 0; i < nbbs; i++)
9913 {
9914 bb = bbs[i];
9915 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
9916 gsi_next (&gsi))
9917 {
9918 stmt = gsi_stmt (gsi);
9919 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
9920 worklist.safe_push (stmt);
9921 }
9922 }
9923
9924 free (bbs);
9925 if (worklist.is_empty ())
9926 return;
9927
9928 /* Loop has masked stores. */
9929 while (!worklist.is_empty ())
9930 {
9931 gimple *last, *last_store;
9932 edge e, efalse;
9933 tree mask;
9934 basic_block store_bb, join_bb;
9935 gimple_stmt_iterator gsi_to;
9936 tree vdef, new_vdef;
9937 gphi *phi;
9938 tree vectype;
9939 tree zero;
9940
9941 last = worklist.pop ();
9942 mask = gimple_call_arg (last, 2);
9943 bb = gimple_bb (last);
9944 /* Create then_bb and the if-then structure in the CFG; then_bb belongs
9945 to the same loop as if_bb.  That loop can differ from LOOP when a
9946 two-level loop nest is vectorized and the mask_store belongs to the
9947 inner loop. */
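 /* Roughly, the CFG built below is (a sketch, not an exact dump):
      bb:        if (mask == { 0, ... }) goto join_bb; else goto store_bb;
      store_bb:  receives the masked stores and any movable producers;
      join_bb:   joins the fall-through from store_bb and the true edge.  */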
9948 e = split_block (bb, last);
9949 bb_loop = bb->loop_father;
9950 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
9951 join_bb = e->dest;
9952 store_bb = create_empty_bb (bb);
9953 add_bb_to_loop (store_bb, bb_loop);
9954 e->flags = EDGE_TRUE_VALUE;
9955 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
9956 /* Give the edge into STORE_BB an unlikely probability. */
9957 efalse->probability = profile_probability::unlikely ();
9958 store_bb->count = efalse->count ();
9959 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
9960 if (dom_info_available_p (CDI_DOMINATORS))
9961 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
9962 if (dump_enabled_p ())
9963 dump_printf_loc (MSG_NOTE, vect_location,
9964 "Create new block %d to sink mask stores.",
9965 store_bb->index);
9966 /* Create vector comparison with boolean result. */
9967 vectype = TREE_TYPE (mask);
9968 zero = build_zero_cst (vectype);
9969 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
9970 gsi = gsi_last_bb (bb);
9971 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
9972 /* Create new PHI node for vdef of the last masked store:
9973 .MEM_2 = VDEF <.MEM_1>
9974 will be converted to
9975 .MEM.3 = VDEF <.MEM_1>
9976 and new PHI node will be created in join bb
9977 .MEM_2 = PHI <.MEM_1, .MEM_3>
9978 */
9979 vdef = gimple_vdef (last);
9980 new_vdef = make_ssa_name (gimple_vop (cfun), last);
9981 gimple_set_vdef (last, new_vdef);
9982 phi = create_phi_node (vdef, join_bb);
9983 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
9984
9985 /* Put all masked stores with the same mask to STORE_BB if possible. */
9986 while (true)
9987 {
9988 gimple_stmt_iterator gsi_from;
9989 gimple *stmt1 = NULL;
9990
9991 /* Move masked store to STORE_BB. */
9992 last_store = last;
9993 gsi = gsi_for_stmt (last);
9994 gsi_from = gsi;
9995 /* Shift GSI to the previous stmt for further traversal. */
9996 gsi_prev (&gsi);
9997 gsi_to = gsi_start_bb (store_bb);
9998 gsi_move_before (&gsi_from, &gsi_to);
9999 /* Set GSI_TO to the start of the now non-empty block. */
10000 gsi_to = gsi_start_bb (store_bb);
10001 if (dump_enabled_p ())
10002 dump_printf_loc (MSG_NOTE, vect_location,
10003 "Move stmt to created bb\n%G", last);
10004 /* Move all stored value producers if possible. */
10005 while (!gsi_end_p (gsi))
10006 {
10007 tree lhs;
10008 imm_use_iterator imm_iter;
10009 use_operand_p use_p;
10010 bool res;
10011
10012 /* Skip debug statements. */
10013 if (is_gimple_debug (gsi_stmt (gsi)))
10014 {
10015 gsi_prev (&gsi);
10016 continue;
10017 }
10018 stmt1 = gsi_stmt (gsi);
10019 /* Do not consider statements that write to memory or have
10020 a volatile operand. */
10021 if (gimple_vdef (stmt1)
10022 || gimple_has_volatile_ops (stmt1))
10023 break;
10024 gsi_from = gsi;
10025 gsi_prev (&gsi);
10026 lhs = gimple_get_lhs (stmt1);
10027 if (!lhs)
10028 break;
10029
10030 /* LHS of vectorized stmt must be SSA_NAME. */
10031 if (TREE_CODE (lhs) != SSA_NAME)
10032 break;
10033
10034 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
10035 {
10036 /* Remove dead scalar statement. */
10037 if (has_zero_uses (lhs))
10038 {
10039 gsi_remove (&gsi_from, true);
10040 continue;
10041 }
10042 }
10043
10044 /* Check that LHS does not have uses outside of STORE_BB. */
10045 res = true;
10046 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
10047 {
10048 gimple *use_stmt;
10049 use_stmt = USE_STMT (use_p);
10050 if (is_gimple_debug (use_stmt))
10051 continue;
10052 if (gimple_bb (use_stmt) != store_bb)
10053 {
10054 res = false;
10055 break;
10056 }
10057 }
10058 if (!res)
10059 break;
10060
10061 if (gimple_vuse (stmt1)
10062 && gimple_vuse (stmt1) != gimple_vuse (last_store))
10063 break;
10064
10065 /* Can move STMT1 to STORE_BB. */
10066 if (dump_enabled_p ())
10067 dump_printf_loc (MSG_NOTE, vect_location,
10068 "Move stmt to created bb\n%G", stmt1);
10069 gsi_move_before (&gsi_from, &gsi_to);
10070 /* Shift GSI_TO for further insertion. */
10071 gsi_prev (&gsi_to);
10072 }
10073 /* Put other masked stores with the same mask to STORE_BB. */
10074 if (worklist.is_empty ()
10075 || gimple_call_arg (worklist.last (), 2) != mask
10076 || worklist.last () != stmt1)
10077 break;
10078 last = worklist.pop ();
10079 }
10080 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
10081 }
10082 }
10083
10084 /* Decide whether it is possible to use a zero-based induction variable
10085 when vectorizing LOOP_VINFO with partial vectors. If it is, return
10086 the value that the induction variable must be able to hold in order
10087 to ensure that the rgroups eventually have no active vector elements.
10088 Return -1 otherwise. */
10089
10090 widest_int
10091 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
10092 {
10093 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10094 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10095 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
10096
10097 /* Calculate the value that the induction variable must be able
10098 to hit in order to ensure that we end the loop with an all-false mask.
10099 This involves adding the maximum number of inactive trailing scalar
10100 iterations. */
10101 widest_int iv_limit = -1;
10102 if (max_loop_iterations (loop, &iv_limit))
10103 {
10104 if (niters_skip)
10105 {
10106 /* Add the maximum number of skipped iterations to the
10107 maximum iteration count. */
10108 if (TREE_CODE (niters_skip) == INTEGER_CST)
10109 iv_limit += wi::to_widest (niters_skip);
10110 else
10111 iv_limit += max_vf - 1;
10112 }
10113 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
10114 /* Make a conservatively-correct assumption. */
10115 iv_limit += max_vf - 1;
10116
10117 /* IV_LIMIT is the maximum number of latch iterations, which is also
10118 the maximum in-range IV value. Round this value down to the previous
10119 vector alignment boundary and then add an extra full iteration. */
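 /* Hypothetical numbers, purely for illustration: with a constant VF of
    8, max_vf == 8 and iv_limit == 103, the statement below computes
    (103 & -8) + 8 == 96 + 8 == 104, i.e. the limit is rounded down to a
    multiple of the VF and one extra full vector iteration is added.  */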
10120 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
10121 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
10122 }
10123 return iv_limit;
10124 }
10125
10126 /* For the given rgroup_controls RGC, check whether an induction variable
10127 would ever hit a value that produces a set of all-false masks or zero
10128 lengths before wrapping around.  Return true if it is possible to wrap
10129 around before hitting the desired value, otherwise return false. */
10130
10131 bool
10132 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
10133 {
10134 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
10135
10136 if (iv_limit == -1)
10137 return true;
10138
10139 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10140 unsigned int compare_precision = TYPE_PRECISION (compare_type);
10141 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
10142
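 /* Illustrative numbers only: if iv_limit == 65536 and nitems == 4, the
    product 262144 needs 19 bits as an unsigned value, so with a 16-bit
    COMPARE_TYPE the IV could wrap before ever producing an all-false mask
    or zero length, and the function returns true.  */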
10143 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
10144 return true;
10145
10146 return false;
10147 }