gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "tree-scalar-evolution.h"
47 #include "tree-vectorizer.h"
48 #include "gimple-fold.h"
49 #include "cgraph.h"
50 #include "tree-cfg.h"
51 #include "tree-if-conv.h"
52 #include "internal-fn.h"
53 #include "tree-vector-builder.h"
54 #include "vec-perm-indices.h"
55 #include "tree-eh.h"
56
57 /* Loop Vectorization Pass.
58
59 This pass tries to vectorize loops.
60
61 For example, the vectorizer transforms the following simple loop:
62
63 short a[N]; short b[N]; short c[N]; int i;
64
65 for (i=0; i<N; i++){
66 a[i] = b[i] + c[i];
67 }
68
69 as if it were manually vectorized by rewriting the source code into:
70
71 typedef int __attribute__((mode(V8HI))) v8hi;
72 short a[N]; short b[N]; short c[N]; int i;
73 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
74 v8hi va, vb, vc;
75
76 for (i=0; i<N/8; i++){
77 vb = pb[i];
78 vc = pc[i];
79 va = vb + vc;
80 pa[i] = va;
81 }
82
83 The main entry to this pass is vectorize_loops(), in which
84 the vectorizer applies a set of analyses on a given set of loops,
85 followed by the actual vectorization transformation for the loops that
86 had successfully passed the analysis phase.
87 Throughout this pass we make a distinction between two types of
88 data: scalars (which are represented by SSA_NAMES), and memory references
89 ("data-refs"). These two types of data require different handling both
90 during analysis and transformation. The types of data-refs that the
91 vectorizer currently supports are ARRAY_REFs whose base is an array DECL
92 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
93 accesses are required to have a simple (consecutive) access pattern.
94
95 Analysis phase:
96 ===============
97 The driver for the analysis phase is vect_analyze_loop().
98 It applies a set of analyses, some of which rely on the scalar evolution
99 analyzer (scev) developed by Sebastian Pop.
100
101 During the analysis phase the vectorizer records some information
102 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
103 loop, as well as general information about the loop as a whole, which is
104 recorded in a "loop_vec_info" struct attached to each loop.
105
106 Transformation phase:
107 =====================
108 The loop transformation phase scans all the stmts in the loop, and
109 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
110 the loop that needs to be vectorized. It inserts the vector code sequence
111 just before the scalar stmt S, and records a pointer to the vector code
112 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
113 attached to S). This pointer will be used for the vectorization of following
114 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
115 otherwise, we rely on dead code elimination for removing it.
116
117 For example, say stmt S1 was vectorized into stmt VS1:
118
119 VS1: vb = px[i];
120 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
121 S2: a = b;
122
123 To vectorize stmt S2, the vectorizer first finds the stmt that defines
124 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
125 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
126 resulting sequence would be:
127
128 VS1: vb = px[i];
129 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
130 VS2: va = vb;
131 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
132
133 Operands that are not SSA_NAMEs are data-refs that appear in
134 load/store operations (like 'x[i]' in S1), and are handled differently.
135
136 Target modeling:
137 =================
138 Currently the only target-specific information that is used is the
139 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
140 Targets that can support different vector sizes will, for now, need to
141 specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
142 flexibility will be added in the future.
143
144 Since we only vectorize operations whose vector form can be
145 expressed using existing tree codes, to verify that an operation is
146 supported, the vectorizer checks the relevant optab at the relevant
147 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
148 the value found is CODE_FOR_nothing, then there's no target support, and
149 we can't vectorize the stmt.
150
151 For additional information on this project see:
152 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
153 */
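/* For example (an illustrative sketch, not code used by this pass):
   checking whether the target can add two V8HI vectors amounts to

     if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
       ... no target support, the stmt cannot be vectorized ...

   where the optab and the machine_mode are derived from the scalar
   operation and from the vector type chosen for the stmt.  */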
154
155 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
156 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
157 bool *, bool *);
158
159 /* Subroutine of vect_determine_vf_for_stmt that handles only one
160 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
161 may already be set for general statements (not just data refs). */
162
163 static opt_result
164 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
165 bool vectype_maybe_set_p,
166 poly_uint64 *vf)
167 {
168 gimple *stmt = stmt_info->stmt;
169
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
173 {
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return opt_result::success ();
177 }
178
179 tree stmt_vectype, nunits_vectype;
180 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
181 &stmt_vectype,
182 &nunits_vectype);
183 if (!res)
184 return res;
185
186 if (stmt_vectype)
187 {
188 if (STMT_VINFO_VECTYPE (stmt_info))
189 /* The only case in which a vectype has already been set is for stmts
190 that contain a data ref, or for "pattern-stmts" (stmts generated
191 by the vectorizer to represent/replace a certain idiom). */
192 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
193 || vectype_maybe_set_p)
194 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
195 else
196 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
197 }
198
199 if (nunits_vectype)
200 vect_update_max_nunits (vf, nunits_vectype);
201
202 return opt_result::success ();
203 }
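/* Illustrative note (based on the code above, not a separate API): each
   call feeds the number of lanes of NUNITS_VECTYPE into
   vect_update_max_nunits, so after seeing, say, a V4SI stmt (4 lanes)
   and a V8HI stmt (8 lanes) with the same 16-byte vectors, *VF ends up
   large enough for both, i.e. 8.  */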
204
205 /* Subroutine of vect_determine_vectorization_factor. Set the vector
206 types of STMT_INFO and all attached pattern statements and update
207 the vectorization factor VF accordingly. Return true on success
208 or false if something prevented vectorization. */
209
210 static opt_result
211 vect_determine_vf_for_stmt (vec_info *vinfo,
212 stmt_vec_info stmt_info, poly_uint64 *vf)
213 {
214 if (dump_enabled_p ())
215 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
216 stmt_info->stmt);
217 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
218 if (!res)
219 return res;
220
221 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
222 && STMT_VINFO_RELATED_STMT (stmt_info))
223 {
224 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
225 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
226
227 /* If a pattern statement has def stmts, analyze them too. */
228 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
229 !gsi_end_p (si); gsi_next (&si))
230 {
231 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
232 if (dump_enabled_p ())
233 dump_printf_loc (MSG_NOTE, vect_location,
234 "==> examining pattern def stmt: %G",
235 def_stmt_info->stmt);
236 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
237 if (!res)
238 return res;
239 }
240
241 if (dump_enabled_p ())
242 dump_printf_loc (MSG_NOTE, vect_location,
243 "==> examining pattern statement: %G",
244 stmt_info->stmt);
245 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
246 if (!res)
247 return res;
248 }
249
250 return opt_result::success ();
251 }
252
253 /* Function vect_determine_vectorization_factor
254
255 Determine the vectorization factor (VF). VF is the number of data elements
256 that are operated upon in parallel in a single iteration of the vectorized
257 loop. For example, when vectorizing a loop that operates on 4-byte elements,
258 on a target with a 16-byte vector size (VS), the VF is set to 4, since 4
259 elements can fit in a single vector register.
260
261 We currently support vectorization of loops in which all types operated upon
262 are of the same size. Therefore this function currently sets VF according to
263 the size of the types operated upon, and fails if there are multiple sizes
264 in the loop.
265
266 VF is also the factor by which the loop iterations are strip-mined, e.g.:
267 original loop:
268 for (i=0; i<N; i++){
269 a[i] = b[i] + c[i];
270 }
271
272 vectorized loop:
273 for (i=0; i<N; i+=VF){
274 a[i:VF] = b[i:VF] + c[i:VF];
275 }
276 */
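/* Illustrative worked example: for the loop in the file header comment,
   which operates on 2-byte shorts with a 16-byte vector size, the VF
   computed here would be 8 (the exact value of course depends on the
   vector modes the target provides).  */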
277
278 static opt_result
279 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
280 {
281 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
282 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
283 unsigned nbbs = loop->num_nodes;
284 poly_uint64 vectorization_factor = 1;
285 tree scalar_type = NULL_TREE;
286 gphi *phi;
287 tree vectype;
288 stmt_vec_info stmt_info;
289 unsigned i;
290
291 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
292
293 for (i = 0; i < nbbs; i++)
294 {
295 basic_block bb = bbs[i];
296
297 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
298 gsi_next (&si))
299 {
300 phi = si.phi ();
301 stmt_info = loop_vinfo->lookup_stmt (phi);
302 if (dump_enabled_p ())
303 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
304 phi);
305
306 gcc_assert (stmt_info);
307
308 if (STMT_VINFO_RELEVANT_P (stmt_info)
309 || STMT_VINFO_LIVE_P (stmt_info))
310 {
311 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
312 scalar_type = TREE_TYPE (PHI_RESULT (phi));
313
314 if (dump_enabled_p ())
315 dump_printf_loc (MSG_NOTE, vect_location,
316 "get vectype for scalar type: %T\n",
317 scalar_type);
318
319 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
320 if (!vectype)
321 return opt_result::failure_at (phi,
322 "not vectorized: unsupported "
323 "data-type %T\n",
324 scalar_type);
325 STMT_VINFO_VECTYPE (stmt_info) = vectype;
326
327 if (dump_enabled_p ())
328 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
329 vectype);
330
331 if (dump_enabled_p ())
332 {
333 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
334 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
335 dump_printf (MSG_NOTE, "\n");
336 }
337
338 vect_update_max_nunits (&vectorization_factor, vectype);
339 }
340 }
341
342 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
343 gsi_next (&si))
344 {
345 if (is_gimple_debug (gsi_stmt (si)))
346 continue;
347 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
348 opt_result res
349 = vect_determine_vf_for_stmt (loop_vinfo,
350 stmt_info, &vectorization_factor);
351 if (!res)
352 return res;
353 }
354 }
355
356 /* TODO: Analyze cost. Decide if worth while to vectorize. */
357 if (dump_enabled_p ())
358 {
359 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
360 dump_dec (MSG_NOTE, vectorization_factor);
361 dump_printf (MSG_NOTE, "\n");
362 }
363
364 if (known_le (vectorization_factor, 1U))
365 return opt_result::failure_at (vect_location,
366 "not vectorized: unsupported data-type\n");
367 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
368 return opt_result::success ();
369 }
370
371
372 /* Function vect_is_simple_iv_evolution.
373
374 FORNOW: A simple evolution of an induction variable in the loop is
375 considered a polynomial evolution. */
376
377 static bool
378 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
379 tree * step)
380 {
381 tree init_expr;
382 tree step_expr;
383 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
384 basic_block bb;
385
386 /* When there is no evolution in this loop, the evolution function
387 is not "simple". */
388 if (evolution_part == NULL_TREE)
389 return false;
390
391 /* When the evolution is a polynomial of degree >= 2
392 the evolution function is not "simple". */
393 if (tree_is_chrec (evolution_part))
394 return false;
395
396 step_expr = evolution_part;
397 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
398
399 if (dump_enabled_p ())
400 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
401 step_expr, init_expr);
402
403 *init = init_expr;
404 *step = step_expr;
405
406 if (TREE_CODE (step_expr) != INTEGER_CST
407 && (TREE_CODE (step_expr) != SSA_NAME
408 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
409 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
410 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
411 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
412 || !flag_associative_math)))
413 && (TREE_CODE (step_expr) != REAL_CST
414 || !flag_associative_math))
415 {
416 if (dump_enabled_p ())
417 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
418 "step unknown.\n");
419 return false;
420 }
421
422 return true;
423 }
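/* Illustrative example, using the usual scalar-evolution notation
   (this file only ever sees such chrecs through the scev API): for

     for (i = 0; i < n; i++)
       p = p + 4;

   the access function of p's loop phi is the chrec {p_0, +, 4}_loop,
   so the routine above returns INIT = p_0 and STEP = 4.  */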
424
425 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
426 what we are assuming is a double reduction. For example, given
427 a structure like this:
428
429 outer1:
430 x_1 = PHI <x_4(outer2), ...>;
431 ...
432
433 inner:
434 x_2 = PHI <x_1(outer1), ...>;
435 ...
436 x_3 = ...;
437 ...
438
439 outer2:
440 x_4 = PHI <x_3(inner)>;
441 ...
442
443 outer loop analysis would treat x_1 as a double reduction phi and
444 this function would then return true for x_2. */
445
446 static bool
447 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
448 {
449 use_operand_p use_p;
450 ssa_op_iter op_iter;
451 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
452 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
453 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
454 return true;
455 return false;
456 }
457
458 /* Function vect_analyze_scalar_cycles_1.
459
460 Examine the cross iteration def-use cycles of scalar variables
461 in LOOP. LOOP_VINFO represents the loop that is now being
462 considered for vectorization (can be LOOP, or an outer-loop
463 enclosing LOOP). */
464
465 static void
466 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
467 {
468 basic_block bb = loop->header;
469 tree init, step;
470 auto_vec<stmt_vec_info, 64> worklist;
471 gphi_iterator gsi;
472 bool double_reduc, reduc_chain;
473
474 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
475
476 /* First - identify all inductions. Reduction detection assumes that all the
477 inductions have been identified; therefore, this order must not be
478 changed. */
479 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
480 {
481 gphi *phi = gsi.phi ();
482 tree access_fn = NULL;
483 tree def = PHI_RESULT (phi);
484 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
485
486 if (dump_enabled_p ())
487 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
488
489 /* Skip virtual phi's. The data dependences that are associated with
490 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
491 if (virtual_operand_p (def))
492 continue;
493
494 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
495
496 /* Analyze the evolution function. */
497 access_fn = analyze_scalar_evolution (loop, def);
498 if (access_fn)
499 {
500 STRIP_NOPS (access_fn);
501 if (dump_enabled_p ())
502 dump_printf_loc (MSG_NOTE, vect_location,
503 "Access function of PHI: %T\n", access_fn);
504 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
505 = initial_condition_in_loop_num (access_fn, loop->num);
506 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
507 = evolution_part_in_loop_num (access_fn, loop->num);
508 }
509
510 if (!access_fn
511 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
512 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
513 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
514 && TREE_CODE (step) != INTEGER_CST))
515 {
516 worklist.safe_push (stmt_vinfo);
517 continue;
518 }
519
520 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
521 != NULL_TREE);
522 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
523
524 if (dump_enabled_p ())
525 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
526 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
527 }
528
529
530 /* Second - identify all reductions and nested cycles. */
531 while (worklist.length () > 0)
532 {
533 stmt_vec_info stmt_vinfo = worklist.pop ();
534 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
535 tree def = PHI_RESULT (phi);
536
537 if (dump_enabled_p ())
538 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
539
540 gcc_assert (!virtual_operand_p (def)
541 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
542
543 stmt_vec_info reduc_stmt_info
544 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
545 &reduc_chain);
546 if (reduc_stmt_info)
547 {
548 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
549 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
550 if (double_reduc)
551 {
552 if (dump_enabled_p ())
553 dump_printf_loc (MSG_NOTE, vect_location,
554 "Detected double reduction.\n");
555
556 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
557 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
558 }
559 else
560 {
561 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
562 {
563 if (dump_enabled_p ())
564 dump_printf_loc (MSG_NOTE, vect_location,
565 "Detected vectorizable nested cycle.\n");
566
567 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
568 }
569 else
570 {
571 if (dump_enabled_p ())
572 dump_printf_loc (MSG_NOTE, vect_location,
573 "Detected reduction.\n");
574
575 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
576 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
577 /* Store the reduction cycles for possible vectorization in
578 loop-aware SLP if it was not detected as a reduction
579 chain. */
580 if (! reduc_chain)
581 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
582 (reduc_stmt_info);
583 }
584 }
585 }
586 else
587 if (dump_enabled_p ())
588 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
589 "Unknown def-use cycle pattern.\n");
590 }
591 }
592
593
594 /* Function vect_analyze_scalar_cycles.
595
596 Examine the cross iteration def-use cycles of scalar variables, by
597 analyzing the loop-header PHIs of scalar variables. Classify each
598 cycle as one of the following: invariant, induction, reduction, unknown.
599 We do that for the loop represented by LOOP_VINFO, and also for its
600 inner loop, if it exists.
601 Examples for scalar cycles:
602
603 Example1: reduction:
604
605 loop1:
606 for (i=0; i<N; i++)
607 sum += a[i];
608
609 Example2: induction:
610
611 loop2:
612 for (i=0; i<N; i++)
613 a[i] = i; */
614
615 static void
616 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
617 {
618 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
619
620 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
621
622 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
623 Reductions in such an inner loop therefore have different properties than
624 the reductions in the nest that gets vectorized:
625 1. When vectorized, they are executed in the same order as in the original
626 scalar loop, so we can't change the order of computation when
627 vectorizing them.
628 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
629 current checks are too strict. */
630
631 if (loop->inner)
632 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
633 }
634
635 /* Transfer group and reduction information from STMT_INFO to its
636 pattern stmt. */
637
638 static void
639 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
640 {
641 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
642 stmt_vec_info stmtp;
643 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
644 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
645 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
646 do
647 {
648 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
649 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
650 == STMT_VINFO_DEF_TYPE (stmt_info));
651 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
652 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
653 if (stmt_info)
654 REDUC_GROUP_NEXT_ELEMENT (stmtp)
655 = STMT_VINFO_RELATED_STMT (stmt_info);
656 }
657 while (stmt_info);
658 }
659
660 /* Fixup scalar cycles that now have their stmts detected as patterns. */
661
662 static void
663 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
664 {
665 stmt_vec_info first;
666 unsigned i;
667
668 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
669 if (STMT_VINFO_IN_PATTERN_P (first))
670 {
671 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
672 while (next)
673 {
674 if (! STMT_VINFO_IN_PATTERN_P (next)
675 || STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (next)) == -1)
676 break;
677 next = REDUC_GROUP_NEXT_ELEMENT (next);
678 }
679 /* If not all stmts in the chain are patterns, or if we failed
680 to update STMT_VINFO_REDUC_IDX, try to handle the chain
681 without patterns. */
682 if (! next
683 && STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (first)) != -1)
684 {
685 vect_fixup_reduc_chain (first);
686 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
687 = STMT_VINFO_RELATED_STMT (first);
688 }
689 }
690 }
691
692 /* Function vect_get_loop_niters.
693
694 Determine the number of iterations the loop executes and place it
695 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
696 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
697 niter information holds in ASSUMPTIONS.
698
699 Return the loop exit condition. */
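/* Worked example (illustrative): for a loop whose body runs exactly 100
   times, the latch is executed 99 times, so NUMBER_OF_ITERATIONSM1 is 99
   and NUMBER_OF_ITERATIONS is 100 (the number of header executions, see
   the comment near the end of the function).  */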
700
701
702 static gcond *
703 vect_get_loop_niters (class loop *loop, tree *assumptions,
704 tree *number_of_iterations, tree *number_of_iterationsm1)
705 {
706 edge exit = single_exit (loop);
707 class tree_niter_desc niter_desc;
708 tree niter_assumptions, niter, may_be_zero;
709 gcond *cond = get_loop_exit_condition (loop);
710
711 *assumptions = boolean_true_node;
712 *number_of_iterationsm1 = chrec_dont_know;
713 *number_of_iterations = chrec_dont_know;
714 DUMP_VECT_SCOPE ("get_loop_niters");
715
716 if (!exit)
717 return cond;
718
719 may_be_zero = NULL_TREE;
720 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
721 || chrec_contains_undetermined (niter_desc.niter))
722 return cond;
723
724 niter_assumptions = niter_desc.assumptions;
725 may_be_zero = niter_desc.may_be_zero;
726 niter = niter_desc.niter;
727
728 if (may_be_zero && integer_zerop (may_be_zero))
729 may_be_zero = NULL_TREE;
730
731 if (may_be_zero)
732 {
733 if (COMPARISON_CLASS_P (may_be_zero))
734 {
735 /* Try to combine may_be_zero with assumptions, this can simplify
736 computation of niter expression. */
737 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
738 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
739 niter_assumptions,
740 fold_build1 (TRUTH_NOT_EXPR,
741 boolean_type_node,
742 may_be_zero));
743 else
744 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
745 build_int_cst (TREE_TYPE (niter), 0),
746 rewrite_to_non_trapping_overflow (niter));
747
748 may_be_zero = NULL_TREE;
749 }
750 else if (integer_nonzerop (may_be_zero))
751 {
752 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
753 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
754 return cond;
755 }
756 else
757 return cond;
758 }
759
760 *assumptions = niter_assumptions;
761 *number_of_iterationsm1 = niter;
762
763 /* We want the number of loop header executions which is the number
764 of latch executions plus one.
765 ??? For UINT_MAX latch executions this number overflows to zero
766 for loops like do { n++; } while (n != 0); */
767 if (niter && !chrec_contains_undetermined (niter))
768 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
769 build_int_cst (TREE_TYPE (niter), 1));
770 *number_of_iterations = niter;
771
772 return cond;
773 }
774
775 /* Function bb_in_loop_p
776
777 Used as predicate for dfs order traversal of the loop bbs. */
778
779 static bool
780 bb_in_loop_p (const_basic_block bb, const void *data)
781 {
782 const class loop *const loop = (const class loop *)data;
783 if (flow_bb_inside_loop_p (loop, bb))
784 return true;
785 return false;
786 }
787
788
789 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
790 stmt_vec_info structs for all the stmts in LOOP_IN. */
791
792 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
793 : vec_info (vec_info::loop, init_cost (loop_in), shared),
794 loop (loop_in),
795 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
796 num_itersm1 (NULL_TREE),
797 num_iters (NULL_TREE),
798 num_iters_unchanged (NULL_TREE),
799 num_iters_assumptions (NULL_TREE),
800 th (0),
801 versioning_threshold (0),
802 vectorization_factor (0),
803 max_vectorization_factor (0),
804 mask_skip_niters (NULL_TREE),
805 mask_compare_type (NULL_TREE),
806 simd_if_cond (NULL_TREE),
807 unaligned_dr (NULL),
808 peeling_for_alignment (0),
809 ptr_mask (0),
810 ivexpr_map (NULL),
811 scan_map (NULL),
812 slp_unrolling_factor (1),
813 single_scalar_iteration_cost (0),
814 vec_outside_cost (0),
815 vec_inside_cost (0),
816 vectorizable (false),
817 can_fully_mask_p (true),
818 fully_masked_p (false),
819 peeling_for_gaps (false),
820 peeling_for_niter (false),
821 no_data_dependencies (false),
822 has_mask_store (false),
823 scalar_loop_scaling (profile_probability::uninitialized ()),
824 scalar_loop (NULL),
825 orig_loop_info (NULL)
826 {
827 /* CHECKME: We want to visit all BBs before their successors (except for
828 latch blocks, for which this assertion wouldn't hold). In the simple
829 case of the loop forms we allow, a dfs order of the BBs would be the same
830 as a reversed postorder traversal, so we are safe. */
831
832 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
833 bbs, loop->num_nodes, loop);
834 gcc_assert (nbbs == loop->num_nodes);
835
836 for (unsigned int i = 0; i < nbbs; i++)
837 {
838 basic_block bb = bbs[i];
839 gimple_stmt_iterator si;
840
841 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
842 {
843 gimple *phi = gsi_stmt (si);
844 gimple_set_uid (phi, 0);
845 add_stmt (phi);
846 }
847
848 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
849 {
850 gimple *stmt = gsi_stmt (si);
851 gimple_set_uid (stmt, 0);
852 if (is_gimple_debug (stmt))
853 continue;
854 add_stmt (stmt);
855 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
856 third argument is the #pragma omp simd if (x) condition: when it is 0,
857 the loop shouldn't be vectorized; when it is a non-zero constant, it
858 should be vectorized normally; otherwise the loop is versioned, with
859 the vectorized copy used if the condition is non-zero at runtime. */
860 if (loop_in->simduid
861 && is_gimple_call (stmt)
862 && gimple_call_internal_p (stmt)
863 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
864 && gimple_call_num_args (stmt) >= 3
865 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
866 && (loop_in->simduid
867 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
868 {
869 tree arg = gimple_call_arg (stmt, 2);
870 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
871 simd_if_cond = arg;
872 else
873 gcc_assert (integer_nonzerop (arg));
874 }
875 }
876 }
877
878 epilogue_vinfos.create (6);
879 }
880
881 /* Free all levels of MASKS. */
882
883 void
884 release_vec_loop_masks (vec_loop_masks *masks)
885 {
886 rgroup_masks *rgm;
887 unsigned int i;
888 FOR_EACH_VEC_ELT (*masks, i, rgm)
889 rgm->masks.release ();
890 masks->release ();
891 }
892
893 /* Free all memory used by the _loop_vec_info, as well as all the
894 stmt_vec_info structs of all the stmts in the loop. */
895
896 _loop_vec_info::~_loop_vec_info ()
897 {
898 free (bbs);
899
900 release_vec_loop_masks (&masks);
901 delete ivexpr_map;
902 delete scan_map;
903 epilogue_vinfos.release ();
904
905 loop->aux = NULL;
906 }
907
908 /* Return an invariant or register for EXPR and emit necessary
909 computations in the LOOP_VINFO loop preheader. */
910
911 tree
912 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
913 {
914 if (is_gimple_reg (expr)
915 || is_gimple_min_invariant (expr))
916 return expr;
917
918 if (! loop_vinfo->ivexpr_map)
919 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
920 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
921 if (! cached)
922 {
923 gimple_seq stmts = NULL;
924 cached = force_gimple_operand (unshare_expr (expr),
925 &stmts, true, NULL_TREE);
926 if (stmts)
927 {
928 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
929 gsi_insert_seq_on_edge_immediate (e, stmts);
930 }
931 }
932 return cached;
933 }
934
935 /* Return true if we can use CMP_TYPE as the comparison type to produce
936 all masks required to mask LOOP_VINFO. */
937
938 static bool
939 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
940 {
941 rgroup_masks *rgm;
942 unsigned int i;
943 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
944 if (rgm->mask_type != NULL_TREE
945 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
946 cmp_type, rgm->mask_type,
947 OPTIMIZE_FOR_SPEED))
948 return false;
949 return true;
950 }
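/* Illustrative sketch of what the IFN_WHILE_ULT check above asks for:
   with 4-lane masks, WHILE_ULT (8, 10) would produce the mask
   { 1, 1, 0, 0 }, i.e. lane I is active iff 8 + I < 10.  The target must
   be able to compute such masks with CMP_TYPE as the scalar comparison
   type for every rgroup mask type.  */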
951
952 /* Calculate the maximum number of scalars per iteration over all the
953 rgroups in LOOP_VINFO. */
954
955 static unsigned int
956 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
957 {
958 unsigned int res = 1;
959 unsigned int i;
960 rgroup_masks *rgm;
961 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
962 res = MAX (res, rgm->max_nscalars_per_iter);
963 return res;
964 }
965
966 /* Each statement in LOOP_VINFO can be masked where necessary. Check
967 whether we can actually generate the masks required. Return true if so,
968 storing the comparison type in LOOP_VINFO_MASK_COMPARE_TYPE and the IV type in LOOP_VINFO_MASK_IV_TYPE. */
969
970 static bool
971 vect_verify_full_masking (loop_vec_info loop_vinfo)
972 {
973 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
974 unsigned int min_ni_width;
975 unsigned int max_nscalars_per_iter
976 = vect_get_max_nscalars_per_iter (loop_vinfo);
977
978 /* Use a normal loop if there are no statements that need masking.
979 This only happens in rare degenerate cases: it means that the loop
980 has no loads, no stores, and no live-out values. */
981 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
982 return false;
983
984 /* Get the maximum number of iterations that is representable
985 in the counter type. */
986 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
987 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
988
989 /* Get a more refined estimate for the number of iterations. */
990 widest_int max_back_edges;
991 if (max_loop_iterations (loop, &max_back_edges))
992 max_ni = wi::smin (max_ni, max_back_edges + 1);
993
994 /* Account for rgroup masks, in which each bit is replicated N times. */
995 max_ni *= max_nscalars_per_iter;
996
997 /* Work out how many bits we need to represent the limit. */
998 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
999
1000 /* Find a scalar mode for which WHILE_ULT is supported. */
1001 opt_scalar_int_mode cmp_mode_iter;
1002 tree cmp_type = NULL_TREE;
1003 tree iv_type = NULL_TREE;
1004 widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
1005 unsigned int iv_precision = UINT_MAX;
1006
1007 if (iv_limit != -1)
1008 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1009 UNSIGNED);
1010
1011 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1012 {
1013 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1014 if (cmp_bits >= min_ni_width
1015 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1016 {
1017 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1018 if (this_type
1019 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1020 {
1021 /* Although we could stop as soon as we find a valid mode,
1022 there are at least two reasons why that's not always the
1023 best choice:
1024
1025 - An IV that's Pmode or wider is more likely to be reusable
1026 in address calculations than an IV that's narrower than
1027 Pmode.
1028
1029 - Doing the comparison in IV_PRECISION or wider allows
1030 a natural 0-based IV, whereas using a narrower comparison
1031 type requires mitigations against wrap-around.
1032
1033 Conversely, if the IV limit is variable, doing the comparison
1034 in a wider type than the original type can introduce
1035 unnecessary extensions, so picking the widest valid mode
1036 is not always a good choice either.
1037
1038 Here we prefer the first IV type that's Pmode or wider,
1039 and the first comparison type that's IV_PRECISION or wider.
1040 (The comparison type must be no wider than the IV type,
1041 to avoid extensions in the vector loop.)
1042
1043 ??? We might want to try continuing beyond Pmode for ILP32
1044 targets if CMP_BITS < IV_PRECISION. */
1045 iv_type = this_type;
1046 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1047 cmp_type = this_type;
1048 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1049 break;
1050 }
1051 }
1052 }
1053
1054 if (!cmp_type)
1055 return false;
1056
1057 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1058 LOOP_VINFO_MASK_IV_TYPE (loop_vinfo) = iv_type;
1059 return true;
1060 }
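/* Worked example for the precision computation above (illustrative):
   if the loop is known to iterate at most 1000 times and the largest
   rgroup has max_nscalars_per_iter == 2, then max_ni is 2000 and
   min_ni_width is 11, since 1024 <= 2000 < 2048.  Any supported integer
   mode of at least 11 bits that can feed WHILE_ULT is then a candidate
   comparison type.  */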
1061
1062 /* Calculate the cost of one scalar iteration of the loop. */
1063 static void
1064 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1065 {
1066 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1067 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1068 int nbbs = loop->num_nodes, factor;
1069 int innerloop_iters, i;
1070
1071 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1072
1073 /* Gather costs for statements in the scalar loop. */
1074
1075 /* FORNOW. */
1076 innerloop_iters = 1;
1077 if (loop->inner)
1078 innerloop_iters = 50; /* FIXME */
1079
1080 for (i = 0; i < nbbs; i++)
1081 {
1082 gimple_stmt_iterator si;
1083 basic_block bb = bbs[i];
1084
1085 if (bb->loop_father == loop->inner)
1086 factor = innerloop_iters;
1087 else
1088 factor = 1;
1089
1090 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1091 {
1092 gimple *stmt = gsi_stmt (si);
1093 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1094
1095 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1096 continue;
1097
1098 /* Skip stmts that are not vectorized inside the loop. */
1099 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1100 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1101 && (!STMT_VINFO_LIVE_P (vstmt_info)
1102 || !VECTORIZABLE_CYCLE_DEF
1103 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1104 continue;
1105
1106 vect_cost_for_stmt kind;
1107 if (STMT_VINFO_DATA_REF (stmt_info))
1108 {
1109 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1110 kind = scalar_load;
1111 else
1112 kind = scalar_store;
1113 }
1114 else if (vect_nop_conversion_p (stmt_info))
1115 continue;
1116 else
1117 kind = scalar_stmt;
1118
1119 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1120 factor, kind, stmt_info, 0, vect_prologue);
1121 }
1122 }
1123
1124 /* Now accumulate cost. */
1125 void *target_cost_data = init_cost (loop);
1126 stmt_info_for_cost *si;
1127 int j;
1128 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1129 j, si)
1130 (void) add_stmt_cost (loop_vinfo, target_cost_data, si->count,
1131 si->kind, si->stmt_info, si->vectype,
1132 si->misalign, vect_body);
1133 unsigned dummy, body_cost = 0;
1134 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1135 destroy_cost_data (target_cost_data);
1136 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1137 }
1138
1139
1140 /* Function vect_analyze_loop_form_1.
1141
1142 Verify that certain CFG restrictions hold, including:
1143 - the loop has a pre-header
1144 - the loop has a single entry and exit
1145 - the loop exit condition is simple enough
1146 - the number of iterations can be analyzed, i.e., a countable loop. The
1147 niter could be analyzed under some assumptions. */
1148
1149 opt_result
1150 vect_analyze_loop_form_1 (class loop *loop, gcond **loop_cond,
1151 tree *assumptions, tree *number_of_iterationsm1,
1152 tree *number_of_iterations, gcond **inner_loop_cond)
1153 {
1154 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1155
1156 /* Different restrictions apply when we are considering an inner-most loop,
1157 vs. an outer (nested) loop.
1158 (FORNOW. May want to relax some of these restrictions in the future). */
1159
1160 if (!loop->inner)
1161 {
1162 /* Inner-most loop. We currently require that the number of BBs is
1163 exactly 2 (the header and latch). Vectorizable inner-most loops
1164 look like this:
1165
1166 (pre-header)
1167 |
1168 header <--------+
1169 | | |
1170 | +--> latch --+
1171 |
1172 (exit-bb) */
1173
1174 if (loop->num_nodes != 2)
1175 return opt_result::failure_at (vect_location,
1176 "not vectorized:"
1177 " control flow in loop.\n");
1178
1179 if (empty_block_p (loop->header))
1180 return opt_result::failure_at (vect_location,
1181 "not vectorized: empty loop.\n");
1182 }
1183 else
1184 {
1185 class loop *innerloop = loop->inner;
1186 edge entryedge;
1187
1188 /* Nested loop. We currently require that the loop is doubly-nested,
1189 contains a single inner loop, and the number of BBs is exactly 5.
1190 Vectorizable outer-loops look like this:
1191
1192 (pre-header)
1193 |
1194 header <---+
1195 | |
1196 inner-loop |
1197 | |
1198 tail ------+
1199 |
1200 (exit-bb)
1201
1202 The inner-loop has the properties expected of inner-most loops
1203 as described above. */
1204
1205 if ((loop->inner)->inner || (loop->inner)->next)
1206 return opt_result::failure_at (vect_location,
1207 "not vectorized:"
1208 " multiple nested loops.\n");
1209
1210 if (loop->num_nodes != 5)
1211 return opt_result::failure_at (vect_location,
1212 "not vectorized:"
1213 " control flow in loop.\n");
1214
1215 entryedge = loop_preheader_edge (innerloop);
1216 if (entryedge->src != loop->header
1217 || !single_exit (innerloop)
1218 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1219 return opt_result::failure_at (vect_location,
1220 "not vectorized:"
1221 " unsupported outerloop form.\n");
1222
1223 /* Analyze the inner-loop. */
1224 tree inner_niterm1, inner_niter, inner_assumptions;
1225 opt_result res
1226 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1227 &inner_assumptions, &inner_niterm1,
1228 &inner_niter, NULL);
1229 if (!res)
1230 {
1231 if (dump_enabled_p ())
1232 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1233 "not vectorized: Bad inner loop.\n");
1234 return res;
1235 }
1236
1237 /* Don't support analyzing niter under assumptions for inner
1238 loop. */
1239 if (!integer_onep (inner_assumptions))
1240 return opt_result::failure_at (vect_location,
1241 "not vectorized: Bad inner loop.\n");
1242
1243 if (!expr_invariant_in_loop_p (loop, inner_niter))
1244 return opt_result::failure_at (vect_location,
1245 "not vectorized: inner-loop count not"
1246 " invariant.\n");
1247
1248 if (dump_enabled_p ())
1249 dump_printf_loc (MSG_NOTE, vect_location,
1250 "Considering outer-loop vectorization.\n");
1251 }
1252
1253 if (!single_exit (loop))
1254 return opt_result::failure_at (vect_location,
1255 "not vectorized: multiple exits.\n");
1256 if (EDGE_COUNT (loop->header->preds) != 2)
1257 return opt_result::failure_at (vect_location,
1258 "not vectorized:"
1259 " too many incoming edges.\n");
1260
1261 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1262 that the loop is represented as a do-while (with a proper if-guard
1263 before the loop if needed), where the loop header contains all the
1264 executable statements, and the latch is empty. */
1265 if (!empty_block_p (loop->latch)
1266 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1267 return opt_result::failure_at (vect_location,
1268 "not vectorized: latch block not empty.\n");
1269
1270 /* Make sure the exit is not abnormal. */
1271 edge e = single_exit (loop);
1272 if (e->flags & EDGE_ABNORMAL)
1273 return opt_result::failure_at (vect_location,
1274 "not vectorized:"
1275 " abnormal loop exit edge.\n");
1276
1277 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1278 number_of_iterationsm1);
1279 if (!*loop_cond)
1280 return opt_result::failure_at
1281 (vect_location,
1282 "not vectorized: complicated exit condition.\n");
1283
1284 if (integer_zerop (*assumptions)
1285 || !*number_of_iterations
1286 || chrec_contains_undetermined (*number_of_iterations))
1287 return opt_result::failure_at
1288 (*loop_cond,
1289 "not vectorized: number of iterations cannot be computed.\n");
1290
1291 if (integer_zerop (*number_of_iterations))
1292 return opt_result::failure_at
1293 (*loop_cond,
1294 "not vectorized: number of iterations = 0.\n");
1295
1296 return opt_result::success ();
1297 }
1298
1299 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1300
1301 opt_loop_vec_info
1302 vect_analyze_loop_form (class loop *loop, vec_info_shared *shared)
1303 {
1304 tree assumptions, number_of_iterations, number_of_iterationsm1;
1305 gcond *loop_cond, *inner_loop_cond = NULL;
1306
1307 opt_result res
1308 = vect_analyze_loop_form_1 (loop, &loop_cond,
1309 &assumptions, &number_of_iterationsm1,
1310 &number_of_iterations, &inner_loop_cond);
1311 if (!res)
1312 return opt_loop_vec_info::propagate_failure (res);
1313
1314 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1315 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1316 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1317 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1318 if (!integer_onep (assumptions))
1319 {
1320 /* We consider to vectorize this loop by versioning it under
1321 some assumptions. In order to do this, we need to clear
1322 existing information computed by scev and niter analyzer. */
1323 scev_reset_htab ();
1324 free_numbers_of_iterations_estimates (loop);
1325 /* Also set flag for this loop so that following scev and niter
1326 analysis are done under the assumptions. */
1327 loop_constraint_set (loop, LOOP_C_FINITE);
1328 /* Also record the assumptions for versioning. */
1329 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1330 }
1331
1332 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1333 {
1334 if (dump_enabled_p ())
1335 {
1336 dump_printf_loc (MSG_NOTE, vect_location,
1337 "Symbolic number of iterations is ");
1338 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1339 dump_printf (MSG_NOTE, "\n");
1340 }
1341 }
1342
1343 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1344 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1345 if (inner_loop_cond)
1346 {
1347 stmt_vec_info inner_loop_cond_info
1348 = loop_vinfo->lookup_stmt (inner_loop_cond);
1349 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1350 }
1351
1352 gcc_assert (!loop->aux);
1353 loop->aux = loop_vinfo;
1354 return opt_loop_vec_info::success (loop_vinfo);
1355 }
1356
1357
1358
1359 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1360 statements, update the vectorization factor. */
1361
1362 static void
1363 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1364 {
1365 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1366 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1367 int nbbs = loop->num_nodes;
1368 poly_uint64 vectorization_factor;
1369 int i;
1370
1371 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1372
1373 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1374 gcc_assert (known_ne (vectorization_factor, 0U));
1375
1376 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1377 the vectorization factor of the loop is the unrolling factor required by
1378 the SLP instances. If that unrolling factor is 1, we say that we
1379 perform pure SLP on the loop; cross-iteration parallelism is not
1380 exploited. */
1381 bool only_slp_in_loop = true;
1382 for (i = 0; i < nbbs; i++)
1383 {
1384 basic_block bb = bbs[i];
1385 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1386 gsi_next (&si))
1387 {
1388 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1389 if (!stmt_info)
1390 continue;
1391 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1392 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1393 && !PURE_SLP_STMT (stmt_info))
1394 /* STMT needs both SLP and loop-based vectorization. */
1395 only_slp_in_loop = false;
1396 }
1397 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1398 gsi_next (&si))
1399 {
1400 if (is_gimple_debug (gsi_stmt (si)))
1401 continue;
1402 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1403 stmt_info = vect_stmt_to_vectorize (stmt_info);
1404 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1405 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1406 && !PURE_SLP_STMT (stmt_info))
1407 /* STMT needs both SLP and loop-based vectorization. */
1408 only_slp_in_loop = false;
1409 }
1410 }
1411
1412 if (only_slp_in_loop)
1413 {
1414 if (dump_enabled_p ())
1415 dump_printf_loc (MSG_NOTE, vect_location,
1416 "Loop contains only SLP stmts\n");
1417 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1418 }
1419 else
1420 {
1421 if (dump_enabled_p ())
1422 dump_printf_loc (MSG_NOTE, vect_location,
1423 "Loop contains SLP and non-SLP stmts\n");
1424 /* Both the vectorization factor and unroll factor have the form
1425 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1426 so they must have a common multiple. */
1427 vectorization_factor
1428 = force_common_multiple (vectorization_factor,
1429 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1430 }
1431
1432 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1433 if (dump_enabled_p ())
1434 {
1435 dump_printf_loc (MSG_NOTE, vect_location,
1436 "Updating vectorization factor to ");
1437 dump_dec (MSG_NOTE, vectorization_factor);
1438 dump_printf (MSG_NOTE, ".\n");
1439 }
1440 }
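/* Illustrative example for the mixed SLP / non-SLP case above: with a
   loop vectorization factor of 4 and an SLP unrolling factor of 6, the
   common multiple chosen is 12, so the loop is effectively unrolled far
   enough for both forms of vectorization.  */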
1441
1442 /* Return true if STMT_INFO describes a double reduction phi and if
1443 the other phi in the reduction is also relevant for vectorization.
1444 This rejects cases such as:
1445
1446 outer1:
1447 x_1 = PHI <x_3(outer2), ...>;
1448 ...
1449
1450 inner:
1451 x_2 = ...;
1452 ...
1453
1454 outer2:
1455 x_3 = PHI <x_2(inner)>;
1456
1457 if nothing in x_2 or elsewhere makes x_1 relevant. */
1458
1459 static bool
1460 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1461 {
1462 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1463 return false;
1464
1465 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1466 }
1467
1468 /* Function vect_analyze_loop_operations.
1469
1470 Scan the loop stmts and make sure they are all vectorizable. */
1471
1472 static opt_result
1473 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1474 {
1475 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1476 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1477 int nbbs = loop->num_nodes;
1478 int i;
1479 stmt_vec_info stmt_info;
1480 bool need_to_vectorize = false;
1481 bool ok;
1482
1483 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1484
1485 auto_vec<stmt_info_for_cost> cost_vec;
1486
1487 for (i = 0; i < nbbs; i++)
1488 {
1489 basic_block bb = bbs[i];
1490
1491 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1492 gsi_next (&si))
1493 {
1494 gphi *phi = si.phi ();
1495 ok = true;
1496
1497 stmt_info = loop_vinfo->lookup_stmt (phi);
1498 if (dump_enabled_p ())
1499 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1500 if (virtual_operand_p (gimple_phi_result (phi)))
1501 continue;
1502
1503 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1504 (i.e., a phi in the tail of the outer-loop). */
1505 if (! is_loop_header_bb_p (bb))
1506 {
1507 /* FORNOW: we currently don't support the case that these phis
1508 are not used in the outer loop (unless it is a double reduction,
1509 i.e., this phi is vect_reduction_def), because this case would
1510 require us to actually do something here. */
1511 if (STMT_VINFO_LIVE_P (stmt_info)
1512 && !vect_active_double_reduction_p (stmt_info))
1513 return opt_result::failure_at (phi,
1514 "Unsupported loop-closed phi"
1515 " in outer-loop.\n");
1516
1517 /* If PHI is used in the outer loop, we check that its operand
1518 is defined in the inner loop. */
1519 if (STMT_VINFO_RELEVANT_P (stmt_info))
1520 {
1521 tree phi_op;
1522
1523 if (gimple_phi_num_args (phi) != 1)
1524 return opt_result::failure_at (phi, "unsupported phi");
1525
1526 phi_op = PHI_ARG_DEF (phi, 0);
1527 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1528 if (!op_def_info)
1529 return opt_result::failure_at (phi, "unsupported phi\n");
1530
1531 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1532 && (STMT_VINFO_RELEVANT (op_def_info)
1533 != vect_used_in_outer_by_reduction))
1534 return opt_result::failure_at (phi, "unsupported phi\n");
1535
1536 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1537 || (STMT_VINFO_DEF_TYPE (stmt_info)
1538 == vect_double_reduction_def))
1539 && !vectorizable_lc_phi (loop_vinfo,
1540 stmt_info, NULL, NULL))
1541 return opt_result::failure_at (phi, "unsupported phi\n");
1542 }
1543
1544 continue;
1545 }
1546
1547 gcc_assert (stmt_info);
1548
1549 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1550 || STMT_VINFO_LIVE_P (stmt_info))
1551 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1552 /* A scalar-dependence cycle that we don't support. */
1553 return opt_result::failure_at (phi,
1554 "not vectorized:"
1555 " scalar dependence cycle.\n");
1556
1557 if (STMT_VINFO_RELEVANT_P (stmt_info))
1558 {
1559 need_to_vectorize = true;
1560 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1561 && ! PURE_SLP_STMT (stmt_info))
1562 ok = vectorizable_induction (loop_vinfo,
1563 stmt_info, NULL, NULL, NULL,
1564 &cost_vec);
1565 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1566 || (STMT_VINFO_DEF_TYPE (stmt_info)
1567 == vect_double_reduction_def)
1568 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1569 && ! PURE_SLP_STMT (stmt_info))
1570 ok = vectorizable_reduction (loop_vinfo,
1571 stmt_info, NULL, NULL, &cost_vec);
1572 }
1573
1574 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1575 if (ok
1576 && STMT_VINFO_LIVE_P (stmt_info)
1577 && !PURE_SLP_STMT (stmt_info))
1578 ok = vectorizable_live_operation (loop_vinfo,
1579 stmt_info, NULL, NULL, NULL,
1580 -1, false, &cost_vec);
1581
1582 if (!ok)
1583 return opt_result::failure_at (phi,
1584 "not vectorized: relevant phi not "
1585 "supported: %G",
1586 static_cast <gimple *> (phi));
1587 }
1588
1589 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1590 gsi_next (&si))
1591 {
1592 gimple *stmt = gsi_stmt (si);
1593 if (!gimple_clobber_p (stmt)
1594 && !is_gimple_debug (stmt))
1595 {
1596 opt_result res
1597 = vect_analyze_stmt (loop_vinfo,
1598 loop_vinfo->lookup_stmt (stmt),
1599 &need_to_vectorize,
1600 NULL, NULL, &cost_vec);
1601 if (!res)
1602 return res;
1603 }
1604 }
1605 } /* bbs */
1606
1607 add_stmt_costs (loop_vinfo, loop_vinfo->target_cost_data, &cost_vec);
1608
1609 /* All operations in the loop are either irrelevant (they deal with loop
1610 control, or are dead), or are only used outside the loop and can be moved
1611 out of the loop (e.g. invariants, inductions). The loop can be
1612 optimized away by scalar optimizations. We're better off not
1613 touching this loop. */
1614 if (!need_to_vectorize)
1615 {
1616 if (dump_enabled_p ())
1617 dump_printf_loc (MSG_NOTE, vect_location,
1618 "All the computation can be taken out of the loop.\n");
1619 return opt_result::failure_at
1620 (vect_location,
1621 "not vectorized: redundant loop. no profit to vectorize.\n");
1622 }
1623
1624 return opt_result::success ();
1625 }
1626
1627 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1628 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1629 definitely no, or -1 if it's worth retrying. */
1630
1631 static int
1632 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1633 {
1634 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1635 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1636
1637 /* Only fully-masked loops can have iteration counts less than the
1638 vectorization factor. */
1639 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1640 {
1641 HOST_WIDE_INT max_niter;
1642
1643 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1644 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1645 else
1646 max_niter = max_stmt_executions_int (loop);
1647
1648 if (max_niter != -1
1649 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1650 {
1651 if (dump_enabled_p ())
1652 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1653 "not vectorized: iteration count smaller than "
1654 "vectorization factor.\n");
1655 return 0;
1656 }
1657 }
1658
1659 int min_profitable_iters, min_profitable_estimate;
1660 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1661 &min_profitable_estimate);
1662
1663 if (min_profitable_iters < 0)
1664 {
1665 if (dump_enabled_p ())
1666 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1667 "not vectorized: vectorization not profitable.\n");
1668 if (dump_enabled_p ())
1669 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1670 "not vectorized: vector version will never be "
1671 "profitable.\n");
1672 return -1;
1673 }
1674
1675 int min_scalar_loop_bound = (param_min_vect_loop_bound
1676 * assumed_vf);
1677
1678 /* Use the cost model only if it is more conservative than the
1679 user-specified threshold. */
1680 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1681 min_profitable_iters);
1682
1683 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1684
1685 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1686 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1687 {
1688 if (dump_enabled_p ())
1689 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1690 "not vectorized: vectorization not profitable.\n");
1691 if (dump_enabled_p ())
1692 dump_printf_loc (MSG_NOTE, vect_location,
1693 "not vectorized: iteration count smaller than user "
1694 "specified loop bound parameter or minimum profitable "
1695 "iterations (whichever is more conservative).\n");
1696 return 0;
1697 }
1698
1699 /* The static profitability threshold min_profitable_estimate includes
1700 the cost of having to check at runtime whether the scalar loop
1701 should be used instead. If it turns out that we don't need or want
1702 such a check, the threshold we should use for the static estimate
1703 is simply the point at which the vector loop becomes more profitable
1704 than the scalar loop. */
1705 if (min_profitable_estimate > min_profitable_iters
1706 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1707 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1708 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1709 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1710 {
1711 if (dump_enabled_p ())
1712 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1713 " choice between the scalar and vector loops\n");
1714 min_profitable_estimate = min_profitable_iters;
1715 }
1716
1717 HOST_WIDE_INT estimated_niter;
1718
1719 /* If we are vectorizing an epilogue then we know the maximum number of
1720 scalar iterations it will cover is at least one lower than the
1721 vectorization factor of the main loop. */
1722 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1723 estimated_niter
1724 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1725 else
1726 {
1727 estimated_niter = estimated_stmt_executions_int (loop);
1728 if (estimated_niter == -1)
1729 estimated_niter = likely_max_stmt_executions_int (loop);
1730 }
1731 if (estimated_niter != -1
1732 && ((unsigned HOST_WIDE_INT) estimated_niter
1733 < MAX (th, (unsigned) min_profitable_estimate)))
1734 {
1735 if (dump_enabled_p ())
1736 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1737 "not vectorized: estimated iteration count too "
1738 "small.\n");
1739 if (dump_enabled_p ())
1740 dump_printf_loc (MSG_NOTE, vect_location,
1741 "not vectorized: estimated iteration count smaller "
1742 "than specified loop bound parameter or minimum "
1743 "profitable iterations (whichever is more "
1744 "conservative).\n");
1745 return -1;
1746 }
1747
1748 return 1;
1749 }
1750
1751 static opt_result
1752 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1753 vec<data_reference_p> *datarefs,
1754 unsigned int *n_stmts)
1755 {
1756 *n_stmts = 0;
1757 for (unsigned i = 0; i < loop->num_nodes; i++)
1758 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1759 !gsi_end_p (gsi); gsi_next (&gsi))
1760 {
1761 gimple *stmt = gsi_stmt (gsi);
1762 if (is_gimple_debug (stmt))
1763 continue;
1764 ++(*n_stmts);
1765 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs);
1766 if (!res)
1767 {
1768 if (is_gimple_call (stmt) && loop->safelen)
1769 {
1770 tree fndecl = gimple_call_fndecl (stmt), op;
1771 if (fndecl != NULL_TREE)
1772 {
1773 cgraph_node *node = cgraph_node::get (fndecl);
1774 if (node != NULL && node->simd_clones != NULL)
1775 {
1776 unsigned int j, n = gimple_call_num_args (stmt);
1777 for (j = 0; j < n; j++)
1778 {
1779 op = gimple_call_arg (stmt, j);
1780 if (DECL_P (op)
1781 || (REFERENCE_CLASS_P (op)
1782 && get_base_address (op)))
1783 break;
1784 }
1785 op = gimple_call_lhs (stmt);
1786 /* Ignore #pragma omp declare simd functions
1787 if they don't have data references in the
1788 call stmt itself. */
1789 if (j == n
1790 && !(op
1791 && (DECL_P (op)
1792 || (REFERENCE_CLASS_P (op)
1793 && get_base_address (op)))))
1794 continue;
1795 }
1796 }
1797 }
1798 return res;
1799 }
1800 /* If dependence analysis will give up due to the limit on the
1801 number of datarefs, stop here and fail fatally. */
1802 if (datarefs->length ()
1803 > (unsigned)param_loop_max_datarefs_for_datadeps)
1804 return opt_result::failure_at (stmt, "exceeded param "
1805 "loop-max-datarefs-for-datadeps\n");
1806 }
1807 return opt_result::success ();
1808 }
1809
1810 /* Look for SLP-only access groups and turn each individual access into its own
1811 group. */
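/* As an illustrative example (identifiers are hypothetical): a group of two
   interleaved loads { a[2*i], a[2*i+1] } that was only usable via SLP is
   split into two single-element groups, each becoming its own
   DR_GROUP_FIRST_ELEMENT with DR_GROUP_SIZE 1 and, for non-strided
   accesses, DR_GROUP_GAP set to the old group size minus 1. */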
1812 static void
1813 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
1814 {
1815 unsigned int i;
1816 struct data_reference *dr;
1817
1818 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
1819
1820 vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs;
1821 FOR_EACH_VEC_ELT (datarefs, i, dr)
1822 {
1823 gcc_assert (DR_REF (dr));
1824 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
1825
1826 /* Check if the access is part of an interleaving chain. */
1827 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1828 {
1829 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
1830 unsigned int group_size = DR_GROUP_SIZE (first_element);
1831
1832 /* Check for SLP-only groups. */
1833 if (!STMT_SLP_TYPE (stmt_info)
1834 && STMT_VINFO_SLP_VECT_ONLY (first_element))
1835 {
1836 /* Dissolve the group. */
1837 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
1838
1839 stmt_vec_info vinfo = first_element;
1840 while (vinfo)
1841 {
1842 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
1843 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
1844 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
1845 DR_GROUP_SIZE (vinfo) = 1;
1846 if (STMT_VINFO_STRIDED_P (first_element))
1847 DR_GROUP_GAP (vinfo) = 0;
1848 else
1849 DR_GROUP_GAP (vinfo) = group_size - 1;
1850 vinfo = next;
1851 }
1852 }
1853 }
1854 }
1855 }
1856
1857
1858 /* Decides whether we need to create an epilogue loop to handle
1859 remaining scalar iterations and sets PEELING_FOR_NITERS accordingly. */
1860
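/* An example with made-up numbers: with a constant niters of 10, a VF of 4
   and no peeling for alignment or gaps, 10 is not a multiple of 4, so
   PEELING_FOR_NITER is set and an epilogue loop will handle the remaining
   2 scalar iterations. A fully-masked loop instead handles all iterations
   itself and needs no epilogue. */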
1861 void
1862 determine_peel_for_niter (loop_vec_info loop_vinfo)
1863 {
1864 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
1865
1866 unsigned HOST_WIDE_INT const_vf;
1867 HOST_WIDE_INT max_niter
1868 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1869
1870 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1871 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1872 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1873 (loop_vinfo));
1874
1875 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1876 /* The main loop handles all iterations. */
1877 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
1878 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1879 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1880 {
1881 /* Work out the (constant) number of iterations that need to be
1882 peeled for reasons other than niters. */
1883 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1884 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1885 peel_niter += 1;
1886 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1887 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1888 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1889 }
1890 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1891 /* ??? When peeling for gaps but not alignment, we could
1892 try to check whether the (variable) niters is known to be
1893 VF * N + 1. That's something of a niche case though. */
1894 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1895 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1896 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1897 < (unsigned) exact_log2 (const_vf))
1898 /* In case of versioning, check if the maximum number of
1899 iterations is greater than th. If they are identical,
1900 the epilogue is unnecessary. */
1901 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1902 || ((unsigned HOST_WIDE_INT) max_niter
1903 > (th / const_vf) * const_vf))))
1904 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1905 }
1906
1907
1908 /* Function vect_analyze_loop_2.
1909
1910 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1911 for it. The different analyses will record information in the
1912 loop_vec_info struct. */
1913 static opt_result
1914 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1915 {
1916 opt_result ok = opt_result::success ();
1917 int res;
1918 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1919 poly_uint64 min_vf = 2;
1920 loop_vec_info orig_loop_vinfo = NULL;
1921
1922 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
1923 loop_vec_info of the first vectorized loop. */
1924 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1925 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1926 else
1927 orig_loop_vinfo = loop_vinfo;
1928 gcc_assert (orig_loop_vinfo);
1929
1930 /* The first group of checks is independent of the vector size. */
1931 fatal = true;
1932
1933 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
1934 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
1935 return opt_result::failure_at (vect_location,
1936 "not vectorized: simd if(0)\n");
1937
1938 /* Find all data references in the loop (which correspond to vdefs/vuses)
1939 and analyze their evolution in the loop. */
1940
1941 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1942
1943 /* Gather the data references and count stmts in the loop. */
1944 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1945 {
1946 opt_result res
1947 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1948 &LOOP_VINFO_DATAREFS (loop_vinfo),
1949 n_stmts);
1950 if (!res)
1951 {
1952 if (dump_enabled_p ())
1953 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1954 "not vectorized: loop contains function "
1955 "calls or data references that cannot "
1956 "be analyzed\n");
1957 return res;
1958 }
1959 loop_vinfo->shared->save_datarefs ();
1960 }
1961 else
1962 loop_vinfo->shared->check_datarefs ();
1963
1964 /* Analyze the data references and also adjust the minimal
1965 vectorization factor according to the loads and stores. */
1966
1967 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
1968 if (!ok)
1969 {
1970 if (dump_enabled_p ())
1971 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1972 "bad data references.\n");
1973 return ok;
1974 }
1975
1976 /* Classify all cross-iteration scalar data-flow cycles.
1977 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1978 vect_analyze_scalar_cycles (loop_vinfo);
1979
1980 vect_pattern_recog (loop_vinfo);
1981
1982 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1983
1984 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1985 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1986
1987 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1988 if (!ok)
1989 {
1990 if (dump_enabled_p ())
1991 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1992 "bad data access.\n");
1993 return ok;
1994 }
1995
1996 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1997
1998 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
1999 if (!ok)
2000 {
2001 if (dump_enabled_p ())
2002 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2003 "unexpected pattern.\n");
2004 return ok;
2005 }
2006
2007 /* From here on failures are not necessarily fatal, even though the rest of the analysis below depends on the checks above in some way. */
2008 fatal = false;
2009
2010 /* Analyze data dependences between the data-refs in the loop
2011 and adjust the maximum vectorization factor according to
2012 the dependences.
2013 FORNOW: fail at the first data dependence that we encounter. */
2014
2015 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2016 if (!ok)
2017 {
2018 if (dump_enabled_p ())
2019 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2020 "bad data dependence.\n");
2021 return ok;
2022 }
2023 if (max_vf != MAX_VECTORIZATION_FACTOR
2024 && maybe_lt (max_vf, min_vf))
2025 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2026 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2027
2028 ok = vect_determine_vectorization_factor (loop_vinfo);
2029 if (!ok)
2030 {
2031 if (dump_enabled_p ())
2032 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2033 "can't determine vectorization factor.\n");
2034 return ok;
2035 }
2036 if (max_vf != MAX_VECTORIZATION_FACTOR
2037 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2038 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2039
2040 /* Compute the scalar iteration cost. */
2041 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2042
2043 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2044
2045 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2046 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
2047 if (!ok)
2048 return ok;
2049
2050 /* If there are any SLP instances mark them as pure_slp. */
2051 bool slp = vect_make_slp_decision (loop_vinfo);
2052 if (slp)
2053 {
2054 /* Find stmts that need to be both vectorized and SLPed. */
2055 vect_detect_hybrid_slp (loop_vinfo);
2056
2057 /* Update the vectorization factor based on the SLP decision. */
2058 vect_update_vf_for_slp (loop_vinfo);
2059
2060 /* Optimize the SLP graph with the vectorization factor fixed. */
2061 vect_optimize_slp (loop_vinfo);
2062 }
2063
2064 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
2065
2066 /* We don't expect to have to roll back to anything other than an empty
2067 set of rgroups. */
2068 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2069
2070 /* This is the point where we can re-start analysis with SLP forced off. */
2071 start_over:
2072
2073 /* Now the vectorization factor is final. */
2074 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2075 gcc_assert (known_ne (vectorization_factor, 0U));
2076
2077 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2078 {
2079 dump_printf_loc (MSG_NOTE, vect_location,
2080 "vectorization_factor = ");
2081 dump_dec (MSG_NOTE, vectorization_factor);
2082 dump_printf (MSG_NOTE, ", niters = %wd\n",
2083 LOOP_VINFO_INT_NITERS (loop_vinfo));
2084 }
2085
2086 /* Analyze the alignment of the data-refs in the loop.
2087 Fail if a data reference is found that cannot be vectorized. */
2088
2089 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2090 if (!ok)
2091 {
2092 if (dump_enabled_p ())
2093 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2094 "bad data alignment.\n");
2095 return ok;
2096 }
2097
2098 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2099 It is important to call pruning after vect_analyze_data_ref_accesses,
2100 since we use grouping information gathered by interleaving analysis. */
2101 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2102 if (!ok)
2103 return ok;
2104
2105 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2106 vectorization, since we do not want to add extra peeling or
2107 add versioning for alignment. */
2108 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2109 /* This pass will decide on using loop versioning and/or loop peeling in
2110 order to enhance the alignment of data references in the loop. */
2111 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2112 else
2113 ok = vect_verify_datarefs_alignment (loop_vinfo);
2114 if (!ok)
2115 return ok;
2116
2117 if (slp)
2118 {
2119 /* Analyze operations in the SLP instances. Note this may
2120 remove unsupported SLP instances which makes the above
2121 SLP kind detection invalid. */
2122 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2123 vect_slp_analyze_operations (loop_vinfo);
2124 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2125 {
2126 ok = opt_result::failure_at (vect_location,
2127 "unsupported SLP instances\n");
2128 goto again;
2129 }
2130 }
2131
2132 /* Dissolve SLP-only groups. */
2133 vect_dissolve_slp_only_groups (loop_vinfo);
2134
2135 /* Scan all the remaining operations in the loop that are not subject
2136 to SLP and make sure they are vectorizable. */
2137 ok = vect_analyze_loop_operations (loop_vinfo);
2138 if (!ok)
2139 {
2140 if (dump_enabled_p ())
2141 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2142 "bad operation or unsupported loop bound.\n");
2143 return ok;
2144 }
2145
2146 /* Decide whether to use a fully-masked loop for this vectorization
2147 factor. */
2148 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2149 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2150 && vect_verify_full_masking (loop_vinfo));
2151 if (dump_enabled_p ())
2152 {
2153 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2154 dump_printf_loc (MSG_NOTE, vect_location,
2155 "using a fully-masked loop.\n");
2156 else
2157 dump_printf_loc (MSG_NOTE, vect_location,
2158 "not using a fully-masked loop.\n");
2159 }
2160
2161 /* If an epilog loop is required because of data accesses with gaps,
2162 one additional iteration needs to be peeled. Check if there are
2163 enough iterations for vectorization. */
2164 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2165 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2166 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2167 {
2168 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2169 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2170
2171 if (known_lt (wi::to_widest (scalar_niters), vf))
2172 return opt_result::failure_at (vect_location,
2173 "loop has no enough iterations to"
2174 " support peeling for gaps.\n");
2175 }
2176
2177 /* If we're vectorizing an epilogue loop, we either need a fully-masked
2178 loop or a loop that has a lower VF than the main loop. */
2179 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2180 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2181 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2182 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2183 return opt_result::failure_at (vect_location,
2184 "Vectorization factor too high for"
2185 " epilogue loop.\n");
2186
2187 /* Check the costings of the loop make vectorizing worthwhile. */
2188 res = vect_analyze_loop_costing (loop_vinfo);
2189 if (res < 0)
2190 {
2191 ok = opt_result::failure_at (vect_location,
2192 "Loop costings may not be worthwhile.\n");
2193 goto again;
2194 }
2195 if (!res)
2196 return opt_result::failure_at (vect_location,
2197 "Loop costings not worthwhile.\n");
2198
2199 determine_peel_for_niter (loop_vinfo);
2200 /* If an epilogue loop is required make sure we can create one. */
2201 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2202 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2203 {
2204 if (dump_enabled_p ())
2205 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2206 if (!vect_can_advance_ivs_p (loop_vinfo)
2207 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2208 single_exit (LOOP_VINFO_LOOP
2209 (loop_vinfo))))
2210 {
2211 ok = opt_result::failure_at (vect_location,
2212 "not vectorized: can't create required "
2213 "epilog loop\n");
2214 goto again;
2215 }
2216 }
2217
2218 /* During peeling, we need to check whether the number of loop iterations
2219 is enough for both the peeled prolog loop and the vector loop. This check
2220 can be merged along with threshold check of loop versioning, so
2221 increase threshold for this case if necessary.
2222
2223 If we are analyzing an epilogue we still want to check what its
2224 versioning threshold would be. If we decide to vectorize the epilogues we
2225 will want to use the lowest versioning threshold of all epilogues and main
2226 loop. This will enable us to enter a vectorized epilogue even when
2227 versioning the loop. We can't simply check whether the epilogue requires
2228 versioning though since we may have skipped some versioning checks when
2229 analyzing the epilogue. For instance, checks for alias versioning will be
2230 skipped when dealing with epilogues as we assume we already checked them
2231 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2232 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2233 {
2234 poly_uint64 niters_th = 0;
2235 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2236
2237 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2238 {
2239 /* Niters for peeled prolog loop. */
2240 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2241 {
2242 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2243 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2244 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2245 }
2246 else
2247 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2248 }
2249
2250 /* Niters for at least one iteration of vectorized loop. */
2251 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2252 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2253 /* One additional iteration because of peeling for gap. */
2254 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2255 niters_th += 1;
2256
2257 /* Use the same condition as vect_transform_loop to decide when to use
2258 the cost to determine a versioning threshold. */
2259 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2260 && ordered_p (th, niters_th))
2261 niters_th = ordered_max (poly_uint64 (th), niters_th);
2262
2263 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2264 }
2265
2266 gcc_assert (known_eq (vectorization_factor,
2267 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2268
2269 /* Ok to vectorize! */
2270 return opt_result::success ();
2271
2272 again:
2273 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2274 gcc_assert (!ok);
2275
2276 /* Try again with SLP forced off but if we didn't do any SLP there is
2277 no point in re-trying. */
2278 if (!slp)
2279 return ok;
2280
2281 /* If there are reduction chains re-trying will fail anyway. */
2282 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2283 return ok;
2284
2285 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2286 via interleaving or lane instructions. */
2287 slp_instance instance;
2288 slp_tree node;
2289 unsigned i, j;
2290 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2291 {
2292 stmt_vec_info vinfo;
2293 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2294 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2295 continue;
2296 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2297 unsigned int size = DR_GROUP_SIZE (vinfo);
2298 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2299 if (! vect_store_lanes_supported (vectype, size, false)
2300 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2301 && ! vect_grouped_store_supported (vectype, size))
2302 return opt_result::failure_at (vinfo->stmt,
2303 "unsupported grouped store\n");
2304 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2305 {
2306 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2307 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2308 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2309 size = DR_GROUP_SIZE (vinfo);
2310 vectype = STMT_VINFO_VECTYPE (vinfo);
2311 if (! vect_load_lanes_supported (vectype, size, false)
2312 && ! vect_grouped_load_supported (vectype, single_element_p,
2313 size))
2314 return opt_result::failure_at (vinfo->stmt,
2315 "unsupported grouped load\n");
2316 }
2317 }
2318
2319 if (dump_enabled_p ())
2320 dump_printf_loc (MSG_NOTE, vect_location,
2321 "re-trying with SLP disabled\n");
2322
2323 /* Roll back state appropriately. No SLP this time. */
2324 slp = false;
2325 /* Restore the vectorization factor as it was without SLP. */
2326 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2327 /* Free the SLP instances. */
2328 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2329 vect_free_slp_instance (instance, false);
2330 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2331 /* Reset SLP type to loop_vect on all stmts. */
2332 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2333 {
2334 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2335 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2336 !gsi_end_p (si); gsi_next (&si))
2337 {
2338 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2339 STMT_SLP_TYPE (stmt_info) = loop_vect;
2340 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2341 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2342 {
2343 /* vectorizable_reduction adjusts reduction stmt def-types,
2344 restore them to that of the PHI. */
2345 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2346 = STMT_VINFO_DEF_TYPE (stmt_info);
2347 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2348 (STMT_VINFO_REDUC_DEF (stmt_info)))
2349 = STMT_VINFO_DEF_TYPE (stmt_info);
2350 }
2351 }
2352 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2353 !gsi_end_p (si); gsi_next (&si))
2354 {
2355 if (is_gimple_debug (gsi_stmt (si)))
2356 continue;
2357 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2358 STMT_SLP_TYPE (stmt_info) = loop_vect;
2359 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2360 {
2361 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2362 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2363 STMT_SLP_TYPE (stmt_info) = loop_vect;
2364 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2365 !gsi_end_p (pi); gsi_next (&pi))
2366 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2367 = loop_vect;
2368 }
2369 }
2370 }
2371 /* Free optimized alias test DDRS. */
2372 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2373 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2374 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2375 /* Reset target cost data. */
2376 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2377 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2378 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2379 /* Reset accumulated rgroup information. */
2380 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2381 /* Reset assorted flags. */
2382 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2383 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2384 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2385 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2386 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2387
2388 goto start_over;
2389 }
2390
2391 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2392 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2393 OLD_LOOP_VINFO is better unless something specifically indicates
2394 otherwise.
2395
2396 Note that this deliberately isn't a partial order. */
2397
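/* A numeric sketch of the comparison done below (values are hypothetical):
   with new_inside_cost = 20 at new_vf = 8 and old_inside_cost = 12 at
   old_vf = 4, the cross products are 20 * 4 = 80 versus 12 * 8 = 96, so
   the new loop_vinfo has the lower cost per scalar iteration and is
   preferred, ignoring the simdlen and variable-VF special cases handled
   explicitly in the function. */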
2398 static bool
2399 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2400 loop_vec_info old_loop_vinfo)
2401 {
2402 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2403 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2404
2405 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2406 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2407
2408 /* Always prefer a VF of loop->simdlen over any other VF. */
2409 if (loop->simdlen)
2410 {
2411 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2412 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2413 if (new_simdlen_p != old_simdlen_p)
2414 return new_simdlen_p;
2415 }
2416
2417 /* Limit the VFs to what is likely to be the maximum number of iterations,
2418 to handle cases in which at least one loop_vinfo is fully-masked. */
2419 HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop);
2420 if (estimated_max_niter != -1)
2421 {
2422 if (known_le (estimated_max_niter, new_vf))
2423 new_vf = estimated_max_niter;
2424 if (known_le (estimated_max_niter, old_vf))
2425 old_vf = estimated_max_niter;
2426 }
2427
2428 /* Check whether the (fractional) cost per scalar iteration is lower
2429 or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */
2430 poly_widest_int rel_new = (new_loop_vinfo->vec_inside_cost
2431 * poly_widest_int (old_vf));
2432 poly_widest_int rel_old = (old_loop_vinfo->vec_inside_cost
2433 * poly_widest_int (new_vf));
2434 if (maybe_lt (rel_old, rel_new))
2435 {
2436 /* When old_loop_vinfo uses a variable vectorization factor,
2437 we know that it has a lower cost for at least one runtime VF.
2438 However, we don't know how likely that VF is.
2439
2440 One option would be to compare the costs for the estimated VFs.
2441 The problem is that that can put too much pressure on the cost
2442 model. E.g. if the estimated VF is also the lowest possible VF,
2443 and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
2444 for the estimated VF, we'd then choose new_loop_vinfo even
2445 though (a) new_loop_vinfo might not actually be better than
2446 old_loop_vinfo for that VF and (b) it would be significantly
2447 worse at larger VFs.
2448
2449 Here we go for a hacky compromise: pick new_loop_vinfo if it is
2450 no more expensive than old_loop_vinfo even after doubling the
2451 estimated old_loop_vinfo VF. For all but trivial loops, this
2452 ensures that we only pick new_loop_vinfo if it is significantly
2453 better than old_loop_vinfo at the estimated VF. */
2454 if (rel_new.is_constant ())
2455 return false;
2456
2457 HOST_WIDE_INT new_estimated_vf = estimated_poly_value (new_vf);
2458 HOST_WIDE_INT old_estimated_vf = estimated_poly_value (old_vf);
2459 widest_int estimated_rel_new = (new_loop_vinfo->vec_inside_cost
2460 * widest_int (old_estimated_vf));
2461 widest_int estimated_rel_old = (old_loop_vinfo->vec_inside_cost
2462 * widest_int (new_estimated_vf));
2463 return estimated_rel_new * 2 <= estimated_rel_old;
2464 }
2465 if (known_lt (rel_new, rel_old))
2466 return true;
2467
2468 /* If there's nothing to choose between the loop bodies, see whether
2469 there's a difference in the prologue and epilogue costs. */
2470 if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
2471 return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;
2472
2473 return false;
2474 }
2475
2476 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2477 true if we should. */
2478
2479 static bool
2480 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2481 loop_vec_info old_loop_vinfo)
2482 {
2483 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2484 return false;
2485
2486 if (dump_enabled_p ())
2487 dump_printf_loc (MSG_NOTE, vect_location,
2488 "***** Preferring vector mode %s to vector mode %s\n",
2489 GET_MODE_NAME (new_loop_vinfo->vector_mode),
2490 GET_MODE_NAME (old_loop_vinfo->vector_mode));
2491 return true;
2492 }
2493
2494 /* Function vect_analyze_loop.
2495
2496 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2497 for it. The different analyses will record information in the
2498 loop_vec_info struct. */
2499 opt_loop_vec_info
2500 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2501 {
2502 auto_vector_modes vector_modes;
2503
2504 /* Autodetect first vector size we try. */
2505 unsigned int autovec_flags
2506 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2507 loop->simdlen != 0);
2508 unsigned int mode_i = 0;
2509
2510 DUMP_VECT_SCOPE ("analyze_loop_nest");
2511
2512 if (loop_outer (loop)
2513 && loop_vec_info_for_loop (loop_outer (loop))
2514 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2515 return opt_loop_vec_info::failure_at (vect_location,
2516 "outer-loop already vectorized.\n");
2517
2518 if (!find_loop_nest (loop, &shared->loop_nest))
2519 return opt_loop_vec_info::failure_at
2520 (vect_location,
2521 "not vectorized: loop nest containing two or more consecutive inner"
2522 " loops cannot be vectorized\n");
2523
2524 unsigned n_stmts = 0;
2525 machine_mode autodetected_vector_mode = VOIDmode;
2526 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2527 machine_mode next_vector_mode = VOIDmode;
2528 poly_uint64 lowest_th = 0;
2529 unsigned vectorized_loops = 0;
2530 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2531 && !unlimited_cost_model (loop));
2532
2533 bool vect_epilogues = false;
2534 opt_result res = opt_result::success ();
2535 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2536 while (1)
2537 {
2538 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2539 opt_loop_vec_info loop_vinfo = vect_analyze_loop_form (loop, shared);
2540 if (!loop_vinfo)
2541 {
2542 if (dump_enabled_p ())
2543 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2544 "bad loop form.\n");
2545 gcc_checking_assert (first_loop_vinfo == NULL);
2546 return loop_vinfo;
2547 }
2548 loop_vinfo->vector_mode = next_vector_mode;
2549
2550 bool fatal = false;
2551
2552 /* When pick_lowest_cost_p is true, we should in principle iterate
2553 over all the loop_vec_infos that LOOP_VINFO could replace and
2554 try to vectorize LOOP_VINFO under the same conditions.
2555 E.g. when trying to replace an epilogue loop, we should vectorize
2556 LOOP_VINFO as an epilogue loop with the same VF limit. When trying
2557 to replace the main loop, we should vectorize LOOP_VINFO as a main
2558 loop too.
2559
2560 However, autovectorize_vector_modes is usually sorted as follows:
2561
2562 - Modes that naturally produce lower VFs usually follow modes that
2563 naturally produce higher VFs.
2564
2565 - When modes naturally produce the same VF, maskable modes
2566 usually follow unmaskable ones, so that the maskable mode
2567 can be used to vectorize the epilogue of the unmaskable mode.
2568
2569 This order is preferred because it leads to the maximum
2570 epilogue vectorization opportunities. Targets should only use
2571 a different order if they want to make wide modes available while
2572 disparaging them relative to earlier, smaller modes. The assumption
2573 in that case is that the wider modes are more expensive in some
2574 way that isn't reflected directly in the costs.
2575
2576 There should therefore be few interesting cases in which
2577 LOOP_VINFO fails when treated as an epilogue loop, succeeds when
2578 treated as a standalone loop, and ends up being genuinely cheaper
2579 than FIRST_LOOP_VINFO. */
2580 if (vect_epilogues)
2581 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
2582
2583 res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2584 if (mode_i == 0)
2585 autodetected_vector_mode = loop_vinfo->vector_mode;
2586 if (dump_enabled_p ())
2587 {
2588 if (res)
2589 dump_printf_loc (MSG_NOTE, vect_location,
2590 "***** Analysis succeeded with vector mode %s\n",
2591 GET_MODE_NAME (loop_vinfo->vector_mode));
2592 else
2593 dump_printf_loc (MSG_NOTE, vect_location,
2594 "***** Analysis failed with vector mode %s\n",
2595 GET_MODE_NAME (loop_vinfo->vector_mode));
2596 }
2597
2598 loop->aux = NULL;
2599
2600 if (!fatal)
2601 while (mode_i < vector_modes.length ()
2602 && vect_chooses_same_modes_p (loop_vinfo, vector_modes[mode_i]))
2603 {
2604 if (dump_enabled_p ())
2605 dump_printf_loc (MSG_NOTE, vect_location,
2606 "***** The result for vector mode %s would"
2607 " be the same\n",
2608 GET_MODE_NAME (vector_modes[mode_i]));
2609 mode_i += 1;
2610 }
2611
2612 if (res)
2613 {
2614 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2615 vectorized_loops++;
2616
2617 /* Once we hit the desired simdlen for the first time,
2618 discard any previous attempts. */
2619 if (simdlen
2620 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
2621 {
2622 delete first_loop_vinfo;
2623 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2624 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2625 simdlen = 0;
2626 }
2627 else if (pick_lowest_cost_p && first_loop_vinfo)
2628 {
2629 /* Keep trying to roll back vectorization attempts while the
2630 loop_vec_infos they produced were worse than this one. */
2631 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
2632 while (!vinfos.is_empty ()
2633 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
2634 {
2635 gcc_assert (vect_epilogues);
2636 delete vinfos.pop ();
2637 }
2638 if (vinfos.is_empty ()
2639 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
2640 {
2641 delete first_loop_vinfo;
2642 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2643 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2644 }
2645 }
2646
2647 if (first_loop_vinfo == NULL)
2648 {
2649 first_loop_vinfo = loop_vinfo;
2650 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
2651 }
2652 else if (vect_epilogues
2653 /* For now only allow one epilogue loop. */
2654 && first_loop_vinfo->epilogue_vinfos.is_empty ())
2655 {
2656 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
2657 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
2658 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2659 || maybe_ne (lowest_th, 0U));
2660 /* Keep track of the known smallest versioning
2661 threshold. */
2662 if (ordered_p (lowest_th, th))
2663 lowest_th = ordered_min (lowest_th, th);
2664 }
2665 else
2666 delete loop_vinfo;
2667
2668 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
2669 enabled, SIMDUID is not set, it is the innermost loop and we have
2670 either already found the loop's SIMDLEN or there was no SIMDLEN to
2671 begin with.
2672 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
2673 vect_epilogues = (!simdlen
2674 && loop->inner == NULL
2675 && param_vect_epilogues_nomask
2676 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
2677 && !loop->simduid
2678 /* For now only allow one epilogue loop, but allow
2679 pick_lowest_cost_p to replace it. */
2680 && (first_loop_vinfo->epilogue_vinfos.is_empty ()
2681 || pick_lowest_cost_p));
2682
2683 /* Commit to first_loop_vinfo if we have no reason to try
2684 alternatives. */
2685 if (!simdlen && !vect_epilogues && !pick_lowest_cost_p)
2686 break;
2687 }
2688 else
2689 {
2690 delete loop_vinfo;
2691 if (fatal)
2692 {
2693 gcc_checking_assert (first_loop_vinfo == NULL);
2694 break;
2695 }
2696 }
2697
2698 if (mode_i < vector_modes.length ()
2699 && VECTOR_MODE_P (autodetected_vector_mode)
2700 && (related_vector_mode (vector_modes[mode_i],
2701 GET_MODE_INNER (autodetected_vector_mode))
2702 == autodetected_vector_mode)
2703 && (related_vector_mode (autodetected_vector_mode,
2704 GET_MODE_INNER (vector_modes[mode_i]))
2705 == vector_modes[mode_i]))
2706 {
2707 if (dump_enabled_p ())
2708 dump_printf_loc (MSG_NOTE, vect_location,
2709 "***** Skipping vector mode %s, which would"
2710 " repeat the analysis for %s\n",
2711 GET_MODE_NAME (vector_modes[mode_i]),
2712 GET_MODE_NAME (autodetected_vector_mode));
2713 mode_i += 1;
2714 }
2715
2716 if (mode_i == vector_modes.length ()
2717 || autodetected_vector_mode == VOIDmode)
2718 break;
2719
2720 /* Try the next biggest vector size. */
2721 next_vector_mode = vector_modes[mode_i++];
2722 if (dump_enabled_p ())
2723 dump_printf_loc (MSG_NOTE, vect_location,
2724 "***** Re-trying analysis with vector mode %s\n",
2725 GET_MODE_NAME (next_vector_mode));
2726 }
2727
2728 if (first_loop_vinfo)
2729 {
2730 loop->aux = (loop_vec_info) first_loop_vinfo;
2731 if (dump_enabled_p ())
2732 dump_printf_loc (MSG_NOTE, vect_location,
2733 "***** Choosing vector mode %s\n",
2734 GET_MODE_NAME (first_loop_vinfo->vector_mode));
2735 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
2736 return first_loop_vinfo;
2737 }
2738
2739 return opt_loop_vec_info::propagate_failure (res);
2740 }
2741
2742 /* Return true if there is an in-order reduction function for CODE, storing
2743 it in *REDUC_FN if so. */
2744
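/* For example, a source loop such as

     double s = 0.0;
     for (int i = 0; i < n; i++)
       s += a[i];

   compiled without -fassociative-math must preserve the left-to-right
   order of the additions, so its PLUS_EXPR reduction maps to
   IFN_FOLD_LEFT_PLUS here rather than to a tree-wise IFN_REDUC_PLUS. */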
2745 static bool
2746 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2747 {
2748 switch (code)
2749 {
2750 case PLUS_EXPR:
2751 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2752 return true;
2753
2754 default:
2755 return false;
2756 }
2757 }
2758
2759 /* Function reduction_fn_for_scalar_code
2760
2761 Input:
2762 CODE - tree_code of a reduction operation.
2763
2764 Output:
2765 REDUC_FN - the corresponding internal function to be used to reduce the
2766 vector of partial results into a single scalar result, or IFN_LAST
2767 if the operation is a supported reduction operation, but does not have
2768 such an internal function.
2769
2770 Return FALSE if CODE currently cannot be vectorized as reduction. */
2771
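/* For example, an integer maximum reduction such as

     int m = INT_MIN;
     for (int i = 0; i < n; i++)
       m = m > a[i] ? m : a[i];

   is a MAX_EXPR reduction, and the vector of partial maxima is reduced
   to a scalar with IFN_REDUC_MAX. MULT_EXPR and MINUS_EXPR are accepted
   as reductions but report IFN_LAST, meaning the final reduction must be
   open-coded instead of using a single internal function. */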
2772 static bool
2773 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2774 {
2775 switch (code)
2776 {
2777 case MAX_EXPR:
2778 *reduc_fn = IFN_REDUC_MAX;
2779 return true;
2780
2781 case MIN_EXPR:
2782 *reduc_fn = IFN_REDUC_MIN;
2783 return true;
2784
2785 case PLUS_EXPR:
2786 *reduc_fn = IFN_REDUC_PLUS;
2787 return true;
2788
2789 case BIT_AND_EXPR:
2790 *reduc_fn = IFN_REDUC_AND;
2791 return true;
2792
2793 case BIT_IOR_EXPR:
2794 *reduc_fn = IFN_REDUC_IOR;
2795 return true;
2796
2797 case BIT_XOR_EXPR:
2798 *reduc_fn = IFN_REDUC_XOR;
2799 return true;
2800
2801 case MULT_EXPR:
2802 case MINUS_EXPR:
2803 *reduc_fn = IFN_LAST;
2804 return true;
2805
2806 default:
2807 return false;
2808 }
2809 }
2810
2811 /* If there is a neutral value X such that SLP reduction NODE would not
2812 be affected by the introduction of additional X elements, return that X,
2813 otherwise return null. CODE is the code of the reduction and VECTOR_TYPE
2814 is the vector type that would hold element X. REDUC_CHAIN is true if
2815 the SLP statements perform a single reduction, false if each statement
2816 performs an independent reduction. */
2817
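/* For illustration: padding a PLUS_EXPR or MINUS_EXPR reduction with extra
   zeros, a MULT_EXPR reduction with extra ones, or a BIT_AND_EXPR
   reduction with extra all-ones elements leaves the result unchanged, so
   those constants are the neutral values returned below. MIN_EXPR and
   MAX_EXPR have no universal neutral value; for a reduction chain the
   single initial value itself is used. */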
2818 static tree
2819 neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type,
2820 tree_code code, bool reduc_chain)
2821 {
2822 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2823 stmt_vec_info stmt_vinfo = stmts[0];
2824 tree scalar_type = TREE_TYPE (vector_type);
2825 class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2826 gcc_assert (loop);
2827
2828 switch (code)
2829 {
2830 case WIDEN_SUM_EXPR:
2831 case DOT_PROD_EXPR:
2832 case SAD_EXPR:
2833 case PLUS_EXPR:
2834 case MINUS_EXPR:
2835 case BIT_IOR_EXPR:
2836 case BIT_XOR_EXPR:
2837 return build_zero_cst (scalar_type);
2838
2839 case MULT_EXPR:
2840 return build_one_cst (scalar_type);
2841
2842 case BIT_AND_EXPR:
2843 return build_all_ones_cst (scalar_type);
2844
2845 case MAX_EXPR:
2846 case MIN_EXPR:
2847 /* For MIN/MAX the initial values are neutral. A reduction chain
2848 has only a single initial value, so that value is neutral for
2849 all statements. */
2850 if (reduc_chain)
2851 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2852 loop_preheader_edge (loop));
2853 return NULL_TREE;
2854
2855 default:
2856 return NULL_TREE;
2857 }
2858 }
2859
2860 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2861 STMT is printed with a message MSG. */
2862
2863 static void
2864 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2865 {
2866 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
2867 }
2868
2869 /* Return true if we need an in-order (fold-left) reduction for
2870 operation CODE on type TYPE, i.e. if reassociating the operation
2871 could change the result. */
2872
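/* For example, in IEEE float arithmetic (1e30f + -1e30f) + 1.0f is 1.0f
   while 1e30f + (-1e30f + 1.0f) is 0.0f, so a float summation must be
   kept in order unless -fassociative-math allows reassociation; MIN and
   MAX are safe regardless. Similarly, integral types whose overflow can
   trap and saturating fixed-point types force an in-order reduction
   below. */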
2873 bool
2874 needs_fold_left_reduction_p (tree type, tree_code code)
2875 {
2876 /* CHECKME: check for !flag_finite_math_only too? */
2877 if (SCALAR_FLOAT_TYPE_P (type))
2878 switch (code)
2879 {
2880 case MIN_EXPR:
2881 case MAX_EXPR:
2882 return false;
2883
2884 default:
2885 return !flag_associative_math;
2886 }
2887
2888 if (INTEGRAL_TYPE_P (type))
2889 {
2890 if (!operation_no_trapping_overflow (type, code))
2891 return true;
2892 return false;
2893 }
2894
2895 if (SAT_FIXED_POINT_TYPE_P (type))
2896 return true;
2897
2898 return false;
2899 }
2900
2901 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2902 has a handled computation expression. Store the main reduction
2903 operation in *CODE. */
2904
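/* A rough sketch of what the walk below collects: for

     s_1 = PHI <s_0 (preheader), s_3 (latch)>
     s_2 = s_1 + a[i];
     s_3 = s_2 + b[i];

   starting from the latch argument s_3 it records the use chain
   s_3 -> s_2 -> s_1 back to the PHI result, checks that each
   intermediate value is used in exactly one statement inside the loop
   and that all statements use essentially the same code (allowing for
   conversions and for MINUS_EXPR treated as PLUS_EXPR), and reports
   PLUS_EXPR in *CODE. */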
2905 static bool
2906 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2907 tree loop_arg, enum tree_code *code,
2908 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
2909 {
2910 auto_bitmap visited;
2911 tree lookfor = PHI_RESULT (phi);
2912 ssa_op_iter curri;
2913 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2914 while (USE_FROM_PTR (curr) != loop_arg)
2915 curr = op_iter_next_use (&curri);
2916 curri.i = curri.numops;
2917 do
2918 {
2919 path.safe_push (std::make_pair (curri, curr));
2920 tree use = USE_FROM_PTR (curr);
2921 if (use == lookfor)
2922 break;
2923 gimple *def = SSA_NAME_DEF_STMT (use);
2924 if (gimple_nop_p (def)
2925 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2926 {
2927 pop:
2928 do
2929 {
2930 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2931 curri = x.first;
2932 curr = x.second;
2933 do
2934 curr = op_iter_next_use (&curri);
2935 /* Skip already visited or non-SSA operands (from iterating
2936 over PHI args). */
2937 while (curr != NULL_USE_OPERAND_P
2938 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2939 || ! bitmap_set_bit (visited,
2940 SSA_NAME_VERSION
2941 (USE_FROM_PTR (curr)))));
2942 }
2943 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2944 if (curr == NULL_USE_OPERAND_P)
2945 break;
2946 }
2947 else
2948 {
2949 if (gimple_code (def) == GIMPLE_PHI)
2950 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2951 else
2952 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2953 while (curr != NULL_USE_OPERAND_P
2954 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2955 || ! bitmap_set_bit (visited,
2956 SSA_NAME_VERSION
2957 (USE_FROM_PTR (curr)))))
2958 curr = op_iter_next_use (&curri);
2959 if (curr == NULL_USE_OPERAND_P)
2960 goto pop;
2961 }
2962 }
2963 while (1);
2964 if (dump_file && (dump_flags & TDF_DETAILS))
2965 {
2966 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2967 unsigned i;
2968 std::pair<ssa_op_iter, use_operand_p> *x;
2969 FOR_EACH_VEC_ELT (path, i, x)
2970 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
2971 dump_printf (MSG_NOTE, "\n");
2972 }
2973
2974 /* Check whether the reduction path detected is valid. */
2975 bool fail = path.length () == 0;
2976 bool neg = false;
2977 int sign = -1;
2978 *code = ERROR_MARK;
2979 for (unsigned i = 1; i < path.length (); ++i)
2980 {
2981 gimple *use_stmt = USE_STMT (path[i].second);
2982 tree op = USE_FROM_PTR (path[i].second);
2983 if (! is_gimple_assign (use_stmt)
2984 /* The following makes sure we can compute the operand index
2985 easily, and it mostly disallows chaining via COND_EXPR condition
2986 operands. */
2987 || (gimple_assign_rhs1_ptr (use_stmt) != path[i].second->use
2988 && (gimple_num_ops (use_stmt) <= 2
2989 || gimple_assign_rhs2_ptr (use_stmt) != path[i].second->use)
2990 && (gimple_num_ops (use_stmt) <= 3
2991 || gimple_assign_rhs3_ptr (use_stmt) != path[i].second->use)))
2992 {
2993 fail = true;
2994 break;
2995 }
2996 /* Check that there is only a single stmt in which the op is used
2997 inside the loop. */
2998 imm_use_iterator imm_iter;
2999 gimple *op_use_stmt;
3000 unsigned cnt = 0;
3001 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
3002 if (!is_gimple_debug (op_use_stmt)
3003 && flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt)))
3004 {
3005 /* We want to allow x + x but not x < 1 ? x : 2. */
3006 if (is_gimple_assign (op_use_stmt)
3007 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3008 {
3009 use_operand_p use_p;
3010 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3011 cnt++;
3012 }
3013 else
3014 cnt++;
3015 }
3016 if (cnt != 1)
3017 {
3018 fail = true;
3019 break;
3020 }
3021 tree_code use_code = gimple_assign_rhs_code (use_stmt);
3022 if (use_code == MINUS_EXPR)
3023 {
3024 use_code = PLUS_EXPR;
3025 /* Track whether we negate the reduction value each iteration. */
3026 if (gimple_assign_rhs2 (use_stmt) == op)
3027 neg = ! neg;
3028 }
3029 if (CONVERT_EXPR_CODE_P (use_code)
3030 && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)),
3031 TREE_TYPE (gimple_assign_rhs1 (use_stmt))))
3032 ;
3033 else if (*code == ERROR_MARK)
3034 {
3035 *code = use_code;
3036 sign = TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt)));
3037 }
3038 else if (use_code != *code)
3039 {
3040 fail = true;
3041 break;
3042 }
3043 else if ((use_code == MIN_EXPR
3044 || use_code == MAX_EXPR)
3045 && sign != TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt))))
3046 {
3047 fail = true;
3048 break;
3049 }
3050 }
3051 return ! fail && ! neg && *code != ERROR_MARK;
3052 }
3053
3054 bool
3055 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3056 tree loop_arg, enum tree_code code)
3057 {
3058 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3059 enum tree_code code_;
3060 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3061 && code_ == code);
3062 }
3063
3064
3065
3066 /* Function vect_is_simple_reduction
3067
3068 (1) Detect a cross-iteration def-use cycle that represents a simple
3069 reduction computation. We look for the following pattern:
3070
3071 loop_header:
3072 a1 = phi < a0, a2 >
3073 a3 = ...
3074 a2 = operation (a3, a1)
3075
3076 or
3077
3078 a3 = ...
3079 loop_header:
3080 a1 = phi < a0, a2 >
3081 a2 = operation (a3, a1)
3082
3083 such that:
3084 1. operation is commutative and associative and it is safe to
3085 change the order of the computation
3086 2. no uses for a2 in the loop (a2 is used out of the loop)
3087 3. no uses of a1 in the loop besides the reduction operation
3088 4. no uses of a1 outside the loop.
3089
3090 Conditions 1,4 are tested here.
3091 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3092
3093 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3094 nested cycles.
3095
3096 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3097 reductions:
3098
3099 a1 = phi < a0, a2 >
3100 inner loop (def of a3)
3101 a2 = phi < a3 >
3102
3103 (4) Detect condition expressions, ie:
3104 for (int i = 0; i < N; i++)
3105 if (a[i] < val)
3106 ret_val = a[i];
3107
3108 */
3109
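/* For instance, pattern (1) above corresponds to source code such as

     int sum = init;
     for (int i = 0; i < n; i++)
       sum += a[i];

   where a1 is the loop-header PHI for sum, a3 is the loaded a[i] and
   a2 is the updated value of sum. */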
3110 static stmt_vec_info
3111 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3112 bool *double_reduc, bool *reduc_chain_p)
3113 {
3114 gphi *phi = as_a <gphi *> (phi_info->stmt);
3115 gimple *phi_use_stmt = NULL;
3116 imm_use_iterator imm_iter;
3117 use_operand_p use_p;
3118
3119 *double_reduc = false;
3120 *reduc_chain_p = false;
3121 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3122
3123 tree phi_name = PHI_RESULT (phi);
3124 /* ??? If there are no uses of the PHI result the inner loop reduction
3125 won't be detected as possibly double-reduction by vectorizable_reduction
3126 because that tries to walk the PHI arg from the preheader edge which
3127 can be constant. See PR60382. */
3128 if (has_zero_uses (phi_name))
3129 return NULL;
3130 class loop *loop = (gimple_bb (phi))->loop_father;
3131 unsigned nphi_def_loop_uses = 0;
3132 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3133 {
3134 gimple *use_stmt = USE_STMT (use_p);
3135 if (is_gimple_debug (use_stmt))
3136 continue;
3137
3138 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3139 {
3140 if (dump_enabled_p ())
3141 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3142 "intermediate value used outside loop.\n");
3143
3144 return NULL;
3145 }
3146
3147 nphi_def_loop_uses++;
3148 phi_use_stmt = use_stmt;
3149 }
3150
3151 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3152 if (TREE_CODE (latch_def) != SSA_NAME)
3153 {
3154 if (dump_enabled_p ())
3155 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3156 "reduction: not ssa_name: %T\n", latch_def);
3157 return NULL;
3158 }
3159
3160 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3161 if (!def_stmt_info
3162 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3163 return NULL;
3164
3165 bool nested_in_vect_loop
3166 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3167 unsigned nlatch_def_loop_uses = 0;
3168 auto_vec<gphi *, 3> lcphis;
3169 bool inner_loop_of_double_reduc = false;
3170 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3171 {
3172 gimple *use_stmt = USE_STMT (use_p);
3173 if (is_gimple_debug (use_stmt))
3174 continue;
3175 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3176 nlatch_def_loop_uses++;
3177 else
3178 {
3179 /* We can have more than one loop-closed PHI. */
3180 lcphis.safe_push (as_a <gphi *> (use_stmt));
3181 if (nested_in_vect_loop
3182 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3183 == vect_double_reduction_def))
3184 inner_loop_of_double_reduc = true;
3185 }
3186 }
3187
3188 /* If we are vectorizing an inner reduction, we execute it in the
3189 original order only when we are not dealing with a double
3190 reduction. */
3191 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3192 {
3193 if (dump_enabled_p ())
3194 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3195 "detected nested cycle: ");
3196 return def_stmt_info;
3197 }
3198
3199 /* If this isn't a nested cycle, or if the nested cycle reduction value
3200 is used outside of the inner loop, we cannot handle uses of the reduction
3201 value. */
3202 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3203 {
3204 if (dump_enabled_p ())
3205 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3206 "reduction used in loop.\n");
3207 return NULL;
3208 }
3209
3210 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3211 defined in the inner loop. */
3212 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3213 {
3214 tree op1 = PHI_ARG_DEF (def_stmt, 0);
3215 if (gimple_phi_num_args (def_stmt) != 1
3216 || TREE_CODE (op1) != SSA_NAME)
3217 {
3218 if (dump_enabled_p ())
3219 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3220 "unsupported phi node definition.\n");
3221
3222 return NULL;
3223 }
3224
3225 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3226 if (gimple_bb (def1)
3227 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3228 && loop->inner
3229 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3230 && is_gimple_assign (def1)
3231 && is_a <gphi *> (phi_use_stmt)
3232 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3233 {
3234 if (dump_enabled_p ())
3235 report_vect_op (MSG_NOTE, def_stmt,
3236 "detected double reduction: ");
3237
3238 *double_reduc = true;
3239 return def_stmt_info;
3240 }
3241
3242 return NULL;
3243 }
3244
3245 /* Look for the expression computing latch_def from the loop PHI result. */
3246 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3247 enum tree_code code;
3248 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3249 path))
3250 {
3251 STMT_VINFO_REDUC_CODE (phi_info) = code;
3252 if (code == COND_EXPR && !nested_in_vect_loop)
3253 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3254
3255 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3256 reduction chain for which the additional restriction is that
3257 all operations in the chain are the same. */
3258 auto_vec<stmt_vec_info, 8> reduc_chain;
3259 unsigned i;
3260 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3261 for (i = path.length () - 1; i >= 1; --i)
3262 {
3263 gimple *stmt = USE_STMT (path[i].second);
3264 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3265 STMT_VINFO_REDUC_IDX (stmt_info)
3266 = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
3267 enum tree_code stmt_code = gimple_assign_rhs_code (stmt);
3268 bool leading_conversion = (CONVERT_EXPR_CODE_P (stmt_code)
3269 && (i == 1 || i == path.length () - 1));
3270 if ((stmt_code != code && !leading_conversion)
3271 /* We can only handle the final value in epilogue
3272 generation for reduction chains. */
3273 || (i != 1 && !has_single_use (gimple_assign_lhs (stmt))))
3274 is_slp_reduc = false;
3275 /* For reduction chains we support trailing/leading
3276 conversions. We do not store those in the actual chain. */
3277 if (leading_conversion)
3278 continue;
3279 reduc_chain.safe_push (stmt_info);
3280 }
3281 if (is_slp_reduc && reduc_chain.length () > 1)
3282 {
3283 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3284 {
3285 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3286 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3287 }
3288 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3289 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3290
3291 /* Save the chain for further analysis in SLP detection. */
3292 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3293 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3294
3295 *reduc_chain_p = true;
3296 if (dump_enabled_p ())
3297 dump_printf_loc (MSG_NOTE, vect_location,
3298 "reduction: detected reduction chain\n");
3299 }
3300 else if (dump_enabled_p ())
3301 dump_printf_loc (MSG_NOTE, vect_location,
3302 "reduction: detected reduction\n");
3303
3304 return def_stmt_info;
3305 }
3306
3307 if (dump_enabled_p ())
3308 dump_printf_loc (MSG_NOTE, vect_location,
3309 "reduction: unknown pattern\n");
3310
3311 return NULL;
3312 }
3313
3314 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
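/* An example with made-up numbers: for a loop with known niters = 100,
   assumed_vf = 8 and peel_iters_prologue = 3, the epilogue peel count
   computed below is (100 - 3) % 8 = 1, and the scalar iteration costs are
   charged 3 times into the prologue cost vector and once into the
   epilogue cost vector. When niters is unknown, the epilogue peel count
   is instead assumed to be vf/2. */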
3315 int
3316 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3317 int *peel_iters_epilogue,
3318 stmt_vector_for_cost *scalar_cost_vec,
3319 stmt_vector_for_cost *prologue_cost_vec,
3320 stmt_vector_for_cost *epilogue_cost_vec)
3321 {
3322 int retval = 0;
3323 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3324
3325 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3326 {
3327 *peel_iters_epilogue = assumed_vf / 2;
3328 if (dump_enabled_p ())
3329 dump_printf_loc (MSG_NOTE, vect_location,
3330 "cost model: epilogue peel iters set to vf/2 "
3331 "because loop iterations are unknown .\n");
3332
3333 /* If peeled iterations are known but the number of scalar loop
3334 iterations is unknown, count a taken branch per peeled loop. */
3335 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3336 NULL, NULL_TREE, 0, vect_prologue);
3337 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3338 NULL, NULL_TREE, 0, vect_epilogue);
3339 }
3340 else
3341 {
3342 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3343 peel_iters_prologue = niters < peel_iters_prologue ?
3344 niters : peel_iters_prologue;
3345 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3346 /* If we need to peel for gaps but no epilogue peeling is otherwise
3347 required, we have to peel VF iterations. */
3348 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3349 *peel_iters_epilogue = assumed_vf;
3350 }
3351
3352 stmt_info_for_cost *si;
3353 int j;
3354 if (peel_iters_prologue)
3355 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3356 retval += record_stmt_cost (prologue_cost_vec,
3357 si->count * peel_iters_prologue,
3358 si->kind, si->stmt_info, si->misalign,
3359 vect_prologue);
3360 if (*peel_iters_epilogue)
3361 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3362 retval += record_stmt_cost (epilogue_cost_vec,
3363 si->count * *peel_iters_epilogue,
3364 si->kind, si->stmt_info, si->misalign,
3365 vect_epilogue);
3366
3367 return retval;
3368 }
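
/* A worked example with purely illustrative numbers: for ASSUMED_VF == 8,
   NITERS == 23 and PEEL_ITERS_PROLOGUE == 3 the code above computes
   *PEEL_ITERS_EPILOGUE = (23 - 3) % 8 == 4, so every statement in
   SCALAR_COST_VEC is accounted 3 times in PROLOGUE_COST_VEC and 4 times
   in EPILOGUE_COST_VEC.  */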
3369
3370 /* Function vect_estimate_min_profitable_iters
3371
3372 Return the number of iterations required for the vector version of the
3373 loop to be profitable relative to the cost of the scalar version of the
3374 loop.
3375
3376 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3377 of iterations for vectorization. A value of -1 means loop vectorization
3378 is not profitable. This returned value may be used for dynamic
3379 profitability check.
3380
3381 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3382 for static check against estimated number of iterations. */
3383
3384 static void
3385 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3386 int *ret_min_profitable_niters,
3387 int *ret_min_profitable_estimate)
3388 {
3389 int min_profitable_iters;
3390 int min_profitable_estimate;
3391 int peel_iters_prologue;
3392 int peel_iters_epilogue;
3393 unsigned vec_inside_cost = 0;
3394 int vec_outside_cost = 0;
3395 unsigned vec_prologue_cost = 0;
3396 unsigned vec_epilogue_cost = 0;
3397 int scalar_single_iter_cost = 0;
3398 int scalar_outside_cost = 0;
3399 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3400 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3401 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3402
3403 /* Cost model disabled. */
3404 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3405 {
3406 if (dump_enabled_p ())
3407 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3408 *ret_min_profitable_niters = 0;
3409 *ret_min_profitable_estimate = 0;
3410 return;
3411 }
3412
3413 /* Requires loop versioning tests to handle misalignment. */
3414 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3415 {
3416 /* FIXME: Make cost depend on complexity of individual check. */
3417 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3418 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3419 NULL, NULL_TREE, 0, vect_prologue);
3420 if (dump_enabled_p ())
3421 dump_printf (MSG_NOTE,
3422 "cost model: Adding cost of checks for loop "
3423 "versioning to treat misalignment.\n");
3424 }
3425
3426 /* Requires loop versioning with alias checks. */
3427 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3428 {
3429 /* FIXME: Make cost depend on complexity of individual check. */
3430 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3431 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3432 NULL, NULL_TREE, 0, vect_prologue);
3433 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3434 if (len)
3435 /* Count LEN - 1 ANDs and LEN comparisons. */
3436 (void) add_stmt_cost (loop_vinfo, target_cost_data, len * 2 - 1,
3437 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3438 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3439 if (len)
3440 {
3441 /* Count LEN - 1 ANDs and LEN comparisons. */
3442 unsigned int nstmts = len * 2 - 1;
3443 /* +1 for each bias that needs adding. */
3444 for (unsigned int i = 0; i < len; ++i)
3445 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3446 nstmts += 1;
3447 (void) add_stmt_cost (loop_vinfo, target_cost_data, nstmts,
3448 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3449 }
3450 if (dump_enabled_p ())
3451 dump_printf (MSG_NOTE,
3452 "cost model: Adding cost of checks for loop "
3453 "versioning aliasing.\n");
3454 }
3455
3456 /* Requires loop versioning with niter checks. */
3457 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3458 {
3459 /* FIXME: Make cost depend on complexity of individual check. */
3460 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, vector_stmt,
3461 NULL, NULL_TREE, 0, vect_prologue);
3462 if (dump_enabled_p ())
3463 dump_printf (MSG_NOTE,
3464 "cost model: Adding cost of checks for loop "
3465 "versioning niters.\n");
3466 }
3467
3468 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3469 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3470 NULL, NULL_TREE, 0, vect_prologue);
3471
3472 /* Count statements in scalar loop. Using this as scalar cost for a single
3473 iteration for now.
3474
3475 TODO: Add outer loop support.
3476
3477 TODO: Consider assigning different costs to different scalar
3478 statements. */
3479
3480 scalar_single_iter_cost
3481 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3482
3483 /* Add additional cost for the peeled instructions in prologue and epilogue
3484 loop. (For fully-masked loops there will be no peeling.)
3485
3486 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3487 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3488
3489 TODO: Build an expression that represents peel_iters for prologue and
3490 epilogue to be used in a run-time test. */
3491
3492 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3493 {
3494 peel_iters_prologue = 0;
3495 peel_iters_epilogue = 0;
3496
3497 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3498 {
3499 /* We need to peel exactly one iteration. */
3500 peel_iters_epilogue += 1;
3501 stmt_info_for_cost *si;
3502 int j;
3503 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3504 j, si)
3505 (void) add_stmt_cost (loop_vinfo, target_cost_data, si->count,
3506 si->kind, si->stmt_info, si->vectype,
3507 si->misalign, vect_epilogue);
3508 }
3509
3510 /* Calculate how many masks we need to generate. */
3511 unsigned int num_masks = 0;
3512 rgroup_masks *rgm;
3513 unsigned int num_vectors_m1;
3514 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
3515 if (rgm->mask_type)
3516 num_masks += num_vectors_m1 + 1;
3517 gcc_assert (num_masks > 0);
3518
3519 /* In the worst case, we need to generate each mask in the prologue
3520 and in the loop body. One of the loop body mask instructions
3521 replaces the comparison in the scalar loop, and since we don't
3522 count the scalar comparison against the scalar body, we shouldn't
3523 count that vector instruction against the vector body either.
3524
3525 Sometimes we can use unpacks instead of generating prologue
3526 masks and sometimes the prologue mask will fold to a constant,
3527 so the actual prologue cost might be smaller. However, it's
3528 simpler and safer to use the worst-case cost; if this ends up
3529 being the tie-breaker between vectorizing or not, then it's
3530 probably better not to vectorize. */
3531 (void) add_stmt_cost (loop_vinfo,
3532 target_cost_data, num_masks, vector_stmt,
3533 NULL, NULL_TREE, 0, vect_prologue);
3534 (void) add_stmt_cost (loop_vinfo,
3535 target_cost_data, num_masks - 1, vector_stmt,
3536 NULL, NULL_TREE, 0, vect_body);
3537 }
3538 else if (npeel < 0)
3539 {
3540 peel_iters_prologue = assumed_vf / 2;
3541 if (dump_enabled_p ())
3542 dump_printf (MSG_NOTE, "cost model: "
3543 "prologue peel iters set to vf/2.\n");
3544
3545 /* If peeling for alignment is unknown, loop bound of main loop becomes
3546 unknown. */
3547 peel_iters_epilogue = assumed_vf / 2;
3548 if (dump_enabled_p ())
3549 dump_printf (MSG_NOTE, "cost model: "
3550 "epilogue peel iters set to vf/2 because "
3551 "peeling for alignment is unknown.\n");
3552
3553 /* If peeled iterations are unknown, count a taken branch and a not taken
3554 branch per peeled loop. Even if scalar loop iterations are known,
3555 vector iterations are not known since peeled prologue iterations are
3556 not known. Hence guards remain the same. */
3557 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3558 NULL, NULL_TREE, 0, vect_prologue);
3559 (void) add_stmt_cost (loop_vinfo,
3560 target_cost_data, 1, cond_branch_not_taken,
3561 NULL, NULL_TREE, 0, vect_prologue);
3562 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3563 NULL, NULL_TREE, 0, vect_epilogue);
3564 (void) add_stmt_cost (loop_vinfo,
3565 target_cost_data, 1, cond_branch_not_taken,
3566 NULL, NULL_TREE, 0, vect_epilogue);
3567 stmt_info_for_cost *si;
3568 int j;
3569 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3570 {
3571 (void) add_stmt_cost (loop_vinfo, target_cost_data,
3572 si->count * peel_iters_prologue,
3573 si->kind, si->stmt_info, si->vectype,
3574 si->misalign,
3575 vect_prologue);
3576 (void) add_stmt_cost (loop_vinfo, target_cost_data,
3577 si->count * peel_iters_epilogue,
3578 si->kind, si->stmt_info, si->vectype,
3579 si->misalign,
3580 vect_epilogue);
3581 }
3582 }
3583 else
3584 {
3585 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3586 stmt_info_for_cost *si;
3587 int j;
3588 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3589
3590 prologue_cost_vec.create (2);
3591 epilogue_cost_vec.create (2);
3592 peel_iters_prologue = npeel;
3593
3594 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3595 &peel_iters_epilogue,
3596 &LOOP_VINFO_SCALAR_ITERATION_COST
3597 (loop_vinfo),
3598 &prologue_cost_vec,
3599 &epilogue_cost_vec);
3600
3601 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3602 (void) add_stmt_cost (loop_vinfo,
3603 data, si->count, si->kind, si->stmt_info,
3604 si->vectype, si->misalign, vect_prologue);
3605
3606 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3607 (void) add_stmt_cost (loop_vinfo,
3608 data, si->count, si->kind, si->stmt_info,
3609 si->vectype, si->misalign, vect_epilogue);
3610
3611 prologue_cost_vec.release ();
3612 epilogue_cost_vec.release ();
3613 }
3614
3615 /* FORNOW: The scalar outside cost is incremented in one of the
3616 following ways:
3617
3618 1. The vectorizer checks for alignment and aliasing and generates
3619 a condition that allows dynamic vectorization. A cost model
3620 check is ANDED with the versioning condition. Hence scalar code
3621 path now has the added cost of the versioning check.
3622
3623 if (cost > th & versioning_check)
3624 jmp to vector code
3625
3626 Hence the run-time scalar cost is incremented by the not-taken branch cost.
3627
3628 2. The vectorizer then checks if a prologue is required. If the
3629 cost model check was not done before during versioning, it has to
3630 be done before the prologue check.
3631
3632 if (cost <= th)
3633 prologue = scalar_iters
3634 if (prologue == 0)
3635 jmp to vector code
3636 else
3637 execute prologue
3638 if (prologue == num_iters)
3639 go to exit
3640
3641 Hence the run-time scalar cost is incremented by a taken branch,
3642 plus a not-taken branch, plus a taken branch cost.
3643
3644 3. The vectorizer then checks if an epilogue is required. If the
3645 cost model check was not done before during prologue check, it
3646 has to be done with the epilogue check.
3647
3648 if (prologue == 0)
3649 jmp to vector code
3650 else
3651 execute prologue
3652 if (prologue == num_iters)
3653 go to exit
3654 vector code:
3655 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3656 jmp to epilogue
3657
3658 Hence the run-time scalar cost should be incremented by 2 taken
3659 branches.
3660
3661 TODO: The back end may reorder the BBs differently and reverse
3662 conditions/branch directions. Change the estimates below to
3663 something more reasonable. */
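
     As an illustration only (branch costs are target-dependent): with a
     taken branch costing 3 and a not-taken branch costing 1, case 1 adds
     1, case 2 adds 3 + 1 + 3 == 7 and case 3 adds 3 + 3 == 6 to the
     scalar outside cost.  */
  /* (See the case analysis above for how these amounts arise.)  */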
3664
3665 /* If the number of iterations is known and we do not do versioning, we can
3666 decide whether to vectorize at compile time. Hence the scalar version
3667 does not carry cost model guard costs. */
3668 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3669 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3670 {
3671 /* Cost model check occurs at versioning. */
3672 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3673 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3674 else
3675 {
3676 /* Cost model check occurs at prologue generation. */
3677 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3678 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3679 + vect_get_stmt_cost (cond_branch_not_taken);
3680 /* Cost model check occurs at epilogue generation. */
3681 else
3682 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3683 }
3684 }
3685
3686 /* Complete the target-specific cost calculations. */
3687 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3688 &vec_inside_cost, &vec_epilogue_cost);
3689
3690 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3691
3692 /* Stash the costs so that we can compare two loop_vec_infos. */
3693 loop_vinfo->vec_inside_cost = vec_inside_cost;
3694 loop_vinfo->vec_outside_cost = vec_outside_cost;
3695
3696 if (dump_enabled_p ())
3697 {
3698 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3699 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3700 vec_inside_cost);
3701 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3702 vec_prologue_cost);
3703 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3704 vec_epilogue_cost);
3705 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3706 scalar_single_iter_cost);
3707 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3708 scalar_outside_cost);
3709 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3710 vec_outside_cost);
3711 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3712 peel_iters_prologue);
3713 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3714 peel_iters_epilogue);
3715 }
3716
3717 /* Calculate number of iterations required to make the vector version
3718 profitable, relative to the loop bodies only. The following condition
3719 must hold true:
3720 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
3721 where
3722 SIC = scalar iteration cost, VIC = vector iteration cost,
3723 VOC = vector outside cost, VF = vectorization factor,
3724 NPEEL = prologue iterations + epilogue iterations,
3725 SOC = scalar outside cost for run time cost model check. */
3726
3727 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
3728 - vec_inside_cost);
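  /* As an illustration only: with a scalar iteration cost of 4, an assumed
     VF of 8 and a vector body cost of 20, each vector iteration saves
     4 * 8 - 20 == 12 cost units.  A non-positive saving means no number of
     iterations can amortize the outside costs, which is handled below.  */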
3729 if (saving_per_viter <= 0)
3730 {
3731 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3732 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3733 "vectorization did not happen for a simd loop");
3734
3735 if (dump_enabled_p ())
3736 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3737 "cost model: the vector iteration cost = %d "
3738 "divided by the scalar iteration cost = %d "
3739 "is greater or equal to the vectorization factor = %d"
3740 ".\n",
3741 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3742 *ret_min_profitable_niters = -1;
3743 *ret_min_profitable_estimate = -1;
3744 return;
3745 }
3746
3747 /* ??? The "if" arm is written to handle all cases; see below for what
3748 we would do for !LOOP_VINFO_FULLY_MASKED_P. */
3749 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3750 {
3751 /* Rewriting the condition above in terms of the number of
3752 vector iterations (vniters) rather than the number of
3753 scalar iterations (niters) gives:
3754
3755 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
3756
3757 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
3758
3759 For integer N, X and Y when X > 0:
3760
3761 N * X > Y <==> N >= (Y /[floor] X) + 1. */
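      /* As an illustration only: with VOC == 50, SOC == 0, SIC == 4, no
	 peeling and a saving of 4 * 8 - 20 == 12 per vector iteration, the
	 outside overhead is 50 and MIN_VEC_NITERS becomes 50 / 12 + 1 == 5.  */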
3762 int outside_overhead = (vec_outside_cost
3763 - scalar_single_iter_cost * peel_iters_prologue
3764 - scalar_single_iter_cost * peel_iters_epilogue
3765 - scalar_outside_cost);
3766 /* We're only interested in cases that require at least one
3767 vector iteration. */
3768 int min_vec_niters = 1;
3769 if (outside_overhead > 0)
3770 min_vec_niters = outside_overhead / saving_per_viter + 1;
3771
3772 if (dump_enabled_p ())
3773 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
3774 min_vec_niters);
3775
3776 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3777 {
3778 /* Now that we know the minimum number of vector iterations,
3779 find the minimum niters for which the scalar cost is larger:
3780
3781 SIC * niters > VIC * vniters + VOC - SOC
3782
3783 We know that the minimum niters is no more than
3784 vniters * VF + NPEEL, but it might be (and often is) less
3785 than that if a partial vector iteration is cheaper than the
3786 equivalent scalar code. */
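	  /* As an illustration only: VIC == 20, five vector iterations,
	     VOC == 50 and SOC == 0 give a threshold of 20 * 5 + 50 == 150
	     and, with SIC == 4, a minimum of 150 / 4 + 1 == 38 profitable
	     scalar iterations.  */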
3787 int threshold = (vec_inside_cost * min_vec_niters
3788 + vec_outside_cost
3789 - scalar_outside_cost);
3790 if (threshold <= 0)
3791 min_profitable_iters = 1;
3792 else
3793 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
3794 }
3795 else
3796 /* Convert the number of vector iterations into a number of
3797 scalar iterations. */
3798 min_profitable_iters = (min_vec_niters * assumed_vf
3799 + peel_iters_prologue
3800 + peel_iters_epilogue);
3801 }
3802 else
3803 {
3804 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3805 * assumed_vf
3806 - vec_inside_cost * peel_iters_prologue
3807 - vec_inside_cost * peel_iters_epilogue);
3808 if (min_profitable_iters <= 0)
3809 min_profitable_iters = 0;
3810 else
3811 {
3812 min_profitable_iters /= saving_per_viter;
3813
3814 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3815 <= (((int) vec_inside_cost * min_profitable_iters)
3816 + (((int) vec_outside_cost - scalar_outside_cost)
3817 * assumed_vf)))
3818 min_profitable_iters++;
3819 }
3820 }
3821
3822 if (dump_enabled_p ())
3823 dump_printf (MSG_NOTE,
3824 " Calculated minimum iters for profitability: %d\n",
3825 min_profitable_iters);
3826
3827 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3828 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3829 /* We want the vectorized loop to execute at least once. */
3830 min_profitable_iters = assumed_vf + peel_iters_prologue;
3831
3832 if (dump_enabled_p ())
3833 dump_printf_loc (MSG_NOTE, vect_location,
3834 " Runtime profitability threshold = %d\n",
3835 min_profitable_iters);
3836
3837 *ret_min_profitable_niters = min_profitable_iters;
3838
3839 /* Calculate number of iterations required to make the vector version
3840 profitable, relative to the loop bodies only.
3841
3842 Non-vectorized variant is SIC * niters and it must win over vector
3843 variant on the expected loop trip count. The following condition must hold true:
3844 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
3845
3846 if (vec_outside_cost <= 0)
3847 min_profitable_estimate = 0;
3848 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3849 {
3850 /* This is a repeat of the code above, but with + SOC rather
3851 than - SOC. */
3852 int outside_overhead = (vec_outside_cost
3853 - scalar_single_iter_cost * peel_iters_prologue
3854 - scalar_single_iter_cost * peel_iters_epilogue
3855 + scalar_outside_cost);
3856 int min_vec_niters = 1;
3857 if (outside_overhead > 0)
3858 min_vec_niters = outside_overhead / saving_per_viter + 1;
3859
3860 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3861 {
3862 int threshold = (vec_inside_cost * min_vec_niters
3863 + vec_outside_cost
3864 + scalar_outside_cost);
3865 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
3866 }
3867 else
3868 min_profitable_estimate = (min_vec_niters * assumed_vf
3869 + peel_iters_prologue
3870 + peel_iters_epilogue);
3871 }
3872 else
3873 {
3874 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3875 * assumed_vf
3876 - vec_inside_cost * peel_iters_prologue
3877 - vec_inside_cost * peel_iters_epilogue)
3878 / ((scalar_single_iter_cost * assumed_vf)
3879 - vec_inside_cost);
3880 }
3881 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3882 if (dump_enabled_p ())
3883 dump_printf_loc (MSG_NOTE, vect_location,
3884 " Static estimate profitability threshold = %d\n",
3885 min_profitable_estimate);
3886
3887 *ret_min_profitable_estimate = min_profitable_estimate;
3888 }
3889
3890 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3891 vector elements (not bits) for a vector with NELT elements. */
3892 static void
3893 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3894 vec_perm_builder *sel)
3895 {
3896 /* The encoding is a single stepped pattern. Any wrap-around is handled
3897 by vec_perm_indices. */
3898 sel->new_vector (nelt, 1, 3);
3899 for (unsigned int i = 0; i < 3; i++)
3900 sel->quick_push (i + offset);
3901 }
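
/* For example, OFFSET == 2 and NELT == 8 yield the stepped selector
   { 2, 3, 4, ... }, i.e. element I of the result is element I + 2 of the
   (concatenated) input.  */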
3902
3903 /* Checks whether the target supports whole-vector shifts for vectors of mode
3904 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3905 it supports vec_perm_const with masks for all necessary shift amounts. */
3906 static bool
3907 have_whole_vector_shift (machine_mode mode)
3908 {
3909 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3910 return true;
3911
3912 /* Variable-length vectors should be handled via the optab. */
3913 unsigned int nelt;
3914 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3915 return false;
3916
3917 vec_perm_builder sel;
3918 vec_perm_indices indices;
3919 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3920 {
3921 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3922 indices.new_vector (sel, 2, nelt);
3923 if (!can_vec_perm_const_p (mode, indices, false))
3924 return false;
3925 }
3926 return true;
3927 }
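
/* For example, for an 8-element vector the loop above checks shifts by
   4, 2 and 1 elements, the amounts used when reducing a vector by
   repeated halving.  */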
3928
3929 /* TODO: There is a close dependency between the vect_model_*_cost and the
3930 vectorizable_* functions. Improve the design to avoid maintenance issues. */
3931
3932 /* Function vect_model_reduction_cost.
3933
3934 Models cost for a reduction operation, including the vector ops
3935 generated within the strip-mine loop, the initial definition before
3936 the loop, and the epilogue code that must be generated. */
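
   As a sketch of the accounting below, for a plain ncopies == 1 sum
   reduction with a direct internal function available: one scalar_to_vec
   in the prologue (the initial vector), one vector_stmt in the body
   (the vector add) and one vector_stmt plus one vec_to_scalar in the
   epilogue (the reduction and the extract of the scalar result).  */
/* (The exact costs recorded depend on the reduction type handled below.)  */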
3937
3938 static void
3939 vect_model_reduction_cost (loop_vec_info loop_vinfo,
3940 stmt_vec_info stmt_info, internal_fn reduc_fn,
3941 vect_reduction_type reduction_type,
3942 int ncopies, stmt_vector_for_cost *cost_vec)
3943 {
3944 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3945 enum tree_code code;
3946 optab optab;
3947 tree vectype;
3948 machine_mode mode;
3949 class loop *loop = NULL;
3950
3951 if (loop_vinfo)
3952 loop = LOOP_VINFO_LOOP (loop_vinfo);
3953
3954 /* Condition reductions generate two reductions in the loop. */
3955 if (reduction_type == COND_REDUCTION)
3956 ncopies *= 2;
3957
3958 vectype = STMT_VINFO_VECTYPE (stmt_info);
3959 mode = TYPE_MODE (vectype);
3960 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3961
3962 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3963
3964 if (reduction_type == EXTRACT_LAST_REDUCTION)
3965 /* No extra instructions are needed in the prologue. The loop body
3966 operations are costed in vectorizable_condition. */
3967 inside_cost = 0;
3968 else if (reduction_type == FOLD_LEFT_REDUCTION)
3969 {
3970 /* No extra instructions needed in the prologue. */
3971 prologue_cost = 0;
3972
3973 if (reduc_fn != IFN_LAST)
3974 /* Count one reduction-like operation per vector. */
3975 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3976 stmt_info, 0, vect_body);
3977 else
3978 {
3979 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3980 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3981 inside_cost = record_stmt_cost (cost_vec, nelements,
3982 vec_to_scalar, stmt_info, 0,
3983 vect_body);
3984 inside_cost += record_stmt_cost (cost_vec, nelements,
3985 scalar_stmt, stmt_info, 0,
3986 vect_body);
3987 }
3988 }
3989 else
3990 {
3991 /* Add in cost for initial definition.
3992 For cond reduction we have four vectors: initial index, step,
3993 initial result of the data reduction, initial value of the index
3994 reduction. */
3995 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3996 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3997 scalar_to_vec, stmt_info, 0,
3998 vect_prologue);
3999
4000 /* Cost of reduction op inside loop. */
4001 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4002 stmt_info, 0, vect_body);
4003 }
4004
4005 /* Determine cost of epilogue code.
4006
4007 We have a reduction operator that will reduce the vector in one statement.
4008 Also requires scalar extract. */
4009
4010 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4011 {
4012 if (reduc_fn != IFN_LAST)
4013 {
4014 if (reduction_type == COND_REDUCTION)
4015 {
4016 /* An EQ stmt and a COND_EXPR stmt. */
4017 epilogue_cost += record_stmt_cost (cost_vec, 2,
4018 vector_stmt, stmt_info, 0,
4019 vect_epilogue);
4020 /* Reduction of the max index and a reduction of the found
4021 values. */
4022 epilogue_cost += record_stmt_cost (cost_vec, 2,
4023 vec_to_scalar, stmt_info, 0,
4024 vect_epilogue);
4025 /* A broadcast of the max value. */
4026 epilogue_cost += record_stmt_cost (cost_vec, 1,
4027 scalar_to_vec, stmt_info, 0,
4028 vect_epilogue);
4029 }
4030 else
4031 {
4032 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4033 stmt_info, 0, vect_epilogue);
4034 epilogue_cost += record_stmt_cost (cost_vec, 1,
4035 vec_to_scalar, stmt_info, 0,
4036 vect_epilogue);
4037 }
4038 }
4039 else if (reduction_type == COND_REDUCTION)
4040 {
4041 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4042 /* Extraction of scalar elements. */
4043 epilogue_cost += record_stmt_cost (cost_vec,
4044 2 * estimated_nunits,
4045 vec_to_scalar, stmt_info, 0,
4046 vect_epilogue);
4047 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4048 epilogue_cost += record_stmt_cost (cost_vec,
4049 2 * estimated_nunits - 3,
4050 scalar_stmt, stmt_info, 0,
4051 vect_epilogue);
4052 }
4053 else if (reduction_type == EXTRACT_LAST_REDUCTION
4054 || reduction_type == FOLD_LEFT_REDUCTION)
4055 /* No extra instructions are needed in the epilogue. */
4056 ;
4057 else
4058 {
4059 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4060 tree bitsize =
4061 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4062 int element_bitsize = tree_to_uhwi (bitsize);
4063 int nelements = vec_size_in_bits / element_bitsize;
4064
4065 if (code == COND_EXPR)
4066 code = MAX_EXPR;
4067
4068 optab = optab_for_tree_code (code, vectype, optab_default);
4069
4070 /* We have a whole vector shift available. */
4071 if (optab != unknown_optab
4072 && VECTOR_MODE_P (mode)
4073 && optab_handler (optab, mode) != CODE_FOR_nothing
4074 && have_whole_vector_shift (mode))
4075 {
4076 /* Final reduction via vector shifts and the reduction operator.
4077 Also requires scalar extract. */
4078 epilogue_cost += record_stmt_cost (cost_vec,
4079 exact_log2 (nelements) * 2,
4080 vector_stmt, stmt_info, 0,
4081 vect_epilogue);
4082 epilogue_cost += record_stmt_cost (cost_vec, 1,
4083 vec_to_scalar, stmt_info, 0,
4084 vect_epilogue);
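	      /* For example (illustrative): 8 elements need
		 exact_log2 (8) * 2 == 6 vector statements (three
		 shift/operate pairs) plus the final extract.  */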
4085 }
4086 else
4087 /* Use extracts and reduction op for final reduction. For N
4088 elements, we have N extracts and N-1 reduction ops. */
4089 epilogue_cost += record_stmt_cost (cost_vec,
4090 nelements + nelements - 1,
4091 vector_stmt, stmt_info, 0,
4092 vect_epilogue);
4093 }
4094 }
4095
4096 if (dump_enabled_p ())
4097 dump_printf (MSG_NOTE,
4098 "vect_model_reduction_cost: inside_cost = %d, "
4099 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4100 prologue_cost, epilogue_cost);
4101 }
4102
4103
4104 /* Function vect_model_induction_cost.
4105
4106 Models cost for induction operations. */
4107
4108 static void
4109 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4110 stmt_vector_for_cost *cost_vec)
4111 {
4112 unsigned inside_cost, prologue_cost;
4113
4114 if (PURE_SLP_STMT (stmt_info))
4115 return;
4116
4117 /* loop cost for vec_loop. */
4118 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4119 stmt_info, 0, vect_body);
4120
4121 /* prologue cost for vec_init and vec_step. */
4122 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4123 stmt_info, 0, vect_prologue);
4124
4125 if (dump_enabled_p ())
4126 dump_printf_loc (MSG_NOTE, vect_location,
4127 "vect_model_induction_cost: inside_cost = %d, "
4128 "prologue_cost = %d .\n", inside_cost, prologue_cost);
4129 }
4130
4131
4132
4133 /* Function get_initial_def_for_reduction
4134
4135 Input:
4136 STMT_VINFO - a stmt that performs a reduction operation in the loop.
4137 INIT_VAL - the initial value of the reduction variable
4138
4139 Output:
4140 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4141 of the reduction (used for adjusting the epilog - see below).
4142 Return a vector variable, initialized according to the operation that
4143 STMT_VINFO performs. This vector will be used as the initial value
4144 of the vector of partial results.
4145
4146 Option1 (adjust in epilog): Initialize the vector as follows:
4147 add/bit or/xor: [0,0,...,0,0]
4148 mult/bit and: [1,1,...,1,1]
4149 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4150 and when necessary (e.g. add/mult case) let the caller know
4151 that it needs to adjust the result by init_val.
4152
4153 Option2: Initialize the vector as follows:
4154 add/bit or/xor: [init_val,0,0,...,0]
4155 mult/bit and: [init_val,1,1,...,1]
4156 min/max/cond_expr: [init_val,init_val,...,init_val]
4157 and no adjustments are needed.
4158
4159 For example, for the following code:
4160
4161 s = init_val;
4162 for (i=0;i<n;i++)
4163 s = s + a[i];
4164
4165 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4166 For a vector of 4 units, we want to return either [0,0,0,init_val],
4167 or [0,0,0,0] and let the caller know that it needs to adjust
4168 the result at the end by 'init_val'.
4169
4170 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
4171 is not NULL, because this way the initialization vector is simpler (the
4172 same element in all entries), and Option2 otherwise.
4173
4174 A cost model should help decide between these two schemes. */
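
/* A sketch for a V4SI PLUS_EXPR reduction with init_val 5:
   Option1 builds { 0, 0, 0, 0 } and reports 5 through ADJUSTMENT_DEF;
   Option2 builds { 5, 0, 0, 0 }.  For variable-length vectors Option2
   is built as a splat of the neutral value with init_val shifted in
   via .VEC_SHL_INSERT.  */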
4175
4176 static tree
4177 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4178 stmt_vec_info stmt_vinfo,
4179 enum tree_code code, tree init_val,
4180 tree *adjustment_def)
4181 {
4182 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4183 tree scalar_type = TREE_TYPE (init_val);
4184 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4185 tree def_for_init;
4186 tree init_def;
4187 REAL_VALUE_TYPE real_init_val = dconst0;
4188 int int_init_val = 0;
4189 gimple_seq stmts = NULL;
4190
4191 gcc_assert (vectype);
4192
4193 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4194 || SCALAR_FLOAT_TYPE_P (scalar_type));
4195
4196 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4197 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4198
4199 /* ADJUSTMENT_DEF is NULL when called from
4200 vect_create_epilog_for_reduction to vectorize double reduction. */
4201 if (adjustment_def)
4202 *adjustment_def = NULL;
4203
4204 switch (code)
4205 {
4206 case WIDEN_SUM_EXPR:
4207 case DOT_PROD_EXPR:
4208 case SAD_EXPR:
4209 case PLUS_EXPR:
4210 case MINUS_EXPR:
4211 case BIT_IOR_EXPR:
4212 case BIT_XOR_EXPR:
4213 case MULT_EXPR:
4214 case BIT_AND_EXPR:
4215 {
4216 if (code == MULT_EXPR)
4217 {
4218 real_init_val = dconst1;
4219 int_init_val = 1;
4220 }
4221
4222 if (code == BIT_AND_EXPR)
4223 int_init_val = -1;
4224
4225 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4226 def_for_init = build_real (scalar_type, real_init_val);
4227 else
4228 def_for_init = build_int_cst (scalar_type, int_init_val);
4229
4230 if (adjustment_def || operand_equal_p (def_for_init, init_val, 0))
4231 {
4232 /* Option1: the first element is '0' or '1' as well. */
4233 if (!operand_equal_p (def_for_init, init_val, 0))
4234 *adjustment_def = init_val;
4235 init_def = gimple_build_vector_from_val (&stmts, vectype,
4236 def_for_init);
4237 }
4238 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4239 {
4240 /* Option2 (variable length): the first element is INIT_VAL. */
4241 init_def = gimple_build_vector_from_val (&stmts, vectype,
4242 def_for_init);
4243 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4244 vectype, init_def, init_val);
4245 }
4246 else
4247 {
4248 /* Option2: the first element is INIT_VAL. */
4249 tree_vector_builder elts (vectype, 1, 2);
4250 elts.quick_push (init_val);
4251 elts.quick_push (def_for_init);
4252 init_def = gimple_build_vector (&stmts, &elts);
4253 }
4254 }
4255 break;
4256
4257 case MIN_EXPR:
4258 case MAX_EXPR:
4259 case COND_EXPR:
4260 {
4261 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4262 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4263 }
4264 break;
4265
4266 default:
4267 gcc_unreachable ();
4268 }
4269
4270 if (stmts)
4271 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4272 return init_def;
4273 }
4274
4275 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4276 NUMBER_OF_VECTORS is the number of vector defs to create.
4277 If NEUTRAL_OP is nonnull, introducing extra elements of that
4278 value will not change the result. */
4279
4280 static void
4281 get_initial_defs_for_reduction (vec_info *vinfo,
4282 slp_tree slp_node,
4283 vec<tree> *vec_oprnds,
4284 unsigned int number_of_vectors,
4285 bool reduc_chain, tree neutral_op)
4286 {
4287 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4288 stmt_vec_info stmt_vinfo = stmts[0];
4289 unsigned HOST_WIDE_INT nunits;
4290 unsigned j, number_of_places_left_in_vector;
4291 tree vector_type;
4292 unsigned int group_size = stmts.length ();
4293 unsigned int i;
4294 class loop *loop;
4295
4296 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4297
4298 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4299
4300 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4301 gcc_assert (loop);
4302 edge pe = loop_preheader_edge (loop);
4303
4304 gcc_assert (!reduc_chain || neutral_op);
4305
4306 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4307 created vectors. It is greater than 1 if unrolling is performed.
4308
4309 For example, we have two scalar operands, s1 and s2 (e.g., group of
4310 strided accesses of size two), while NUNITS is four (i.e., four scalars
4311 of this type can be packed in a vector). The output vector will contain
4312 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4313 will be 2).
4314
4315 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4316 vectors containing the operands.
4317
4318 For example, NUNITS is four as before, and the group size is 8
4319 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4320 {s5, s6, s7, s8}. */
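
/* A sketch for the reduction-chain case with NUNITS == 4 and a PLUS
   reduction: the single initial value a0 yields { a0, 0, 0, 0 }, with
   the neutral value 0 filling all remaining lanes (and any further
   vectors).  */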
4321
4322 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4323 nunits = group_size;
4324
4325 number_of_places_left_in_vector = nunits;
4326 bool constant_p = true;
4327 tree_vector_builder elts (vector_type, nunits, 1);
4328 elts.quick_grow (nunits);
4329 gimple_seq ctor_seq = NULL;
4330 for (j = 0; j < nunits * number_of_vectors; ++j)
4331 {
4332 tree op;
4333 i = j % group_size;
4334 stmt_vinfo = stmts[i];
4335
4336 /* Get the def before the loop. In a reduction chain we have only
4337 one initial value. Otherwise there are as many as PHIs in the group. */
4338 if (reduc_chain)
4339 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4340 else if (((vec_oprnds->length () + 1) * nunits
4341 - number_of_places_left_in_vector >= group_size)
4342 && neutral_op)
4343 op = neutral_op;
4344 else
4345 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4346
4347 /* Create 'vect_ = {op0,op1,...,opn}'. */
4348 number_of_places_left_in_vector--;
4349 elts[nunits - number_of_places_left_in_vector - 1] = op;
4350 if (!CONSTANT_CLASS_P (op))
4351 constant_p = false;
4352
4353 if (number_of_places_left_in_vector == 0)
4354 {
4355 tree init;
4356 if (constant_p && !neutral_op
4357 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4358 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4359 /* Build the vector directly from ELTS. */
4360 init = gimple_build_vector (&ctor_seq, &elts);
4361 else if (neutral_op)
4362 {
4363 /* Build a vector of the neutral value and shift the
4364 other elements into place. */
4365 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4366 neutral_op);
4367 int k = nunits;
4368 while (k > 0 && elts[k - 1] == neutral_op)
4369 k -= 1;
4370 while (k > 0)
4371 {
4372 k -= 1;
4373 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4374 vector_type, init, elts[k]);
4375 }
4376 }
4377 else
4378 {
4379 /* First time round, duplicate ELTS to fill the
4380 required number of vectors. */
4381 duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts,
4382 number_of_vectors, *vec_oprnds);
4383 break;
4384 }
4385 vec_oprnds->quick_push (init);
4386
4387 number_of_places_left_in_vector = nunits;
4388 elts.new_vector (vector_type, nunits, 1);
4389 elts.quick_grow (nunits);
4390 constant_p = true;
4391 }
4392 }
4393 if (ctor_seq != NULL)
4394 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4395 }
4396
4397 /* For a statement STMT_INFO taking part in a reduction operation return
4398 the stmt_vec_info the meta information is stored on. */
4399
4400 stmt_vec_info
4401 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
4402 {
4403 stmt_info = vect_orig_stmt (stmt_info);
4404 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4405 if (!is_a <gphi *> (stmt_info->stmt))
4406 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4407 gphi *phi = as_a <gphi *> (stmt_info->stmt);
4408 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4409 {
4410 if (gimple_phi_num_args (phi) == 1)
4411 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4412 }
4413 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4414 {
4415 edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father);
4416 stmt_vec_info info
4417 = vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
4418 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4419 stmt_info = info;
4420 }
4421 return stmt_info;
4422 }
4423
4424 /* Function vect_create_epilog_for_reduction
4425
4426 Create code at the loop-epilog to finalize the result of a reduction
4427 computation.
4428
4429 STMT_INFO is the scalar reduction stmt that is being vectorized.
4430 SLP_NODE is an SLP node containing a group of reduction statements. The
4431 first one in this group is STMT_INFO.
4432 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
4433 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
4434 (counting from 0)
4435
4436 This function:
4437 1. Completes the reduction def-use cycles.
4438 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4439 by calling the function specified by REDUC_FN if available, or by
4440 other means (whole-vector shifts or a scalar loop).
4441 The function also creates a new phi node at the loop exit to preserve
4442 loop-closed form, as illustrated below.
4443
4444 The flow at the entry to this function:
4445
4446 loop:
4447 vec_def = phi <vec_init, null> # REDUCTION_PHI
4448 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4449 s_loop = scalar_stmt # (scalar) STMT_INFO
4450 loop_exit:
4451 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4452 use <s_out0>
4453 use <s_out0>
4454
4455 The above is transformed by this function into:
4456
4457 loop:
4458 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4459 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4460 s_loop = scalar_stmt # (scalar) STMT_INFO
4461 loop_exit:
4462 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4463 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4464 v_out2 = reduce <v_out1>
4465 s_out3 = extract_field <v_out2, 0>
4466 s_out4 = adjust_result <s_out3>
4467 use <s_out4>
4468 use <s_out4>
4469 */
4470
4471 static void
4472 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
4473 stmt_vec_info stmt_info,
4474 slp_tree slp_node,
4475 slp_instance slp_node_instance)
4476 {
4477 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
4478 gcc_assert (reduc_info->is_reduc_info);
4479 /* For double reductions we need to get at the inner loop reduction
4480 stmt which has the meta info attached. Our stmt_info is that of the
4481 loop-closed PHI of the inner loop which we remember as
4482 def for the reduction PHI generation. */
4483 bool double_reduc = false;
4484 stmt_vec_info rdef_info = stmt_info;
4485 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4486 {
4487 gcc_assert (!slp_node);
4488 double_reduc = true;
4489 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
4490 (stmt_info->stmt, 0));
4491 stmt_info = vect_stmt_to_vectorize (stmt_info);
4492 }
4493 gphi *reduc_def_stmt
4494 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
4495 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
4496 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
4497 tree vectype;
4498 machine_mode mode;
4499 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4500 basic_block exit_bb;
4501 tree scalar_dest;
4502 tree scalar_type;
4503 gimple *new_phi = NULL, *phi;
4504 gimple_stmt_iterator exit_gsi;
4505 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
4506 gimple *epilog_stmt = NULL;
4507 gimple *exit_phi;
4508 tree bitsize;
4509 tree def;
4510 tree orig_name, scalar_result;
4511 imm_use_iterator imm_iter, phi_imm_iter;
4512 use_operand_p use_p, phi_use_p;
4513 gimple *use_stmt;
4514 bool nested_in_vect_loop = false;
4515 auto_vec<gimple *> new_phis;
4516 int j, i;
4517 auto_vec<tree> scalar_results;
4518 unsigned int group_size = 1, k;
4519 auto_vec<gimple *> phis;
4520 bool slp_reduc = false;
4521 bool direct_slp_reduc;
4522 tree new_phi_result;
4523 tree induction_index = NULL_TREE;
4524
4525 if (slp_node)
4526 group_size = SLP_TREE_LANES (slp_node);
4527
4528 if (nested_in_vect_loop_p (loop, stmt_info))
4529 {
4530 outer_loop = loop;
4531 loop = loop->inner;
4532 nested_in_vect_loop = true;
4533 gcc_assert (!slp_node);
4534 }
4535 gcc_assert (!nested_in_vect_loop || double_reduc);
4536
4537 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
4538 gcc_assert (vectype);
4539 mode = TYPE_MODE (vectype);
4540
4541 tree initial_def = NULL;
4542 tree induc_val = NULL_TREE;
4543 tree adjustment_def = NULL;
4544 if (slp_node)
4545 ;
4546 else
4547 {
4548 /* Get at the scalar def before the loop, that defines the initial value
4549 of the reduction variable. */
4550 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4551 loop_preheader_edge (loop));
4552 /* Optimize: for induction condition reduction, if we can't use zero
4553 for induc_val, use initial_def. */
4554 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
4555 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
4556 else if (double_reduc)
4557 ;
4558 else if (nested_in_vect_loop)
4559 ;
4560 else
4561 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
4562 }
4563
4564 unsigned vec_num;
4565 int ncopies;
4566 if (slp_node)
4567 {
4568 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
4569 ncopies = 1;
4570 }
4571 else
4572 {
4573 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
4574 vec_num = 1;
4575 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
4576 }
4577
4578 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4579 which is updated with the current index of the loop for every match of
4580 the original loop's cond_expr (VEC_STMT). This results in a vector
4581 containing the last time the condition passed for that vector lane.
4582 The first match will be a 1 to allow 0 to be used for non-matching
4583 indexes. If there are no matches at all then the vector will be all
4584 zeroes.
4585
4586 PR92772: This algorithm is broken for architectures that support
4587 masked vectors, but do not provide fold_extract_last. */
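      /* As an illustration with NUNITS == 4 and two vector iterations:
	 the index series starts at { 1, 2, 3, 4 } and steps by 4, so lane
	 L in vector iteration K carries index K * 4 + L + 1.  If lane 0
	 matched only in the first iteration and lane 2 last matched in the
	 second, INDUCTION_INDEX ends up as { 1, 0, 7, 0 }, lanes that
	 never matched staying 0.  */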
4588 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
4589 {
4590 auto_vec<std::pair<tree, bool>, 2> ccompares;
4591 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
4592 cond_info = vect_stmt_to_vectorize (cond_info);
4593 while (cond_info != reduc_info)
4594 {
4595 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
4596 {
4597 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
4598 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4599 ccompares.safe_push
4600 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
4601 STMT_VINFO_REDUC_IDX (cond_info) == 2));
4602 }
4603 cond_info
4604 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
4605 1 + STMT_VINFO_REDUC_IDX
4606 (cond_info)));
4607 cond_info = vect_stmt_to_vectorize (cond_info);
4608 }
4609 gcc_assert (ccompares.length () != 0);
4610
4611 tree indx_before_incr, indx_after_incr;
4612 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4613 int scalar_precision
4614 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4615 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4616 tree cr_index_vector_type = get_related_vectype_for_scalar_type
4617 (TYPE_MODE (vectype), cr_index_scalar_type,
4618 TYPE_VECTOR_SUBPARTS (vectype));
4619
4620 /* First we create a simple vector induction variable which starts
4621 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4622 vector size (STEP). */
4623
4624 /* Create a {1,2,3,...} vector. */
4625 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4626
4627 /* Create a vector of the step value. */
4628 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4629 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4630
4631 /* Create an induction variable. */
4632 gimple_stmt_iterator incr_gsi;
4633 bool insert_after;
4634 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4635 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4636 insert_after, &indx_before_incr, &indx_after_incr);
4637
4638 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4639 filled with zeros (VEC_ZERO). */
4640
4641 /* Create a vector of 0s. */
4642 tree zero = build_zero_cst (cr_index_scalar_type);
4643 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4644
4645 /* Create a vector phi node. */
4646 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4647 new_phi = create_phi_node (new_phi_tree, loop->header);
4648 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4649 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4650
4651 /* Now take the condition from the loop's original cond_exprs
4652 and produce a new cond_expr (INDEX_COND_EXPR) which for
4653 every match uses values from the induction variable
4654 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4655 (NEW_PHI_TREE).
4656 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4657 the new cond_expr (INDEX_COND_EXPR). */
4658 gimple_seq stmts = NULL;
4659 for (int i = ccompares.length () - 1; i != -1; --i)
4660 {
4661 tree ccompare = ccompares[i].first;
4662 if (ccompares[i].second)
4663 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
4664 cr_index_vector_type,
4665 ccompare,
4666 indx_before_incr, new_phi_tree);
4667 else
4668 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
4669 cr_index_vector_type,
4670 ccompare,
4671 new_phi_tree, indx_before_incr);
4672 }
4673 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
4674
4675 /* Update the phi with the vec cond. */
4676 induction_index = new_phi_tree;
4677 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4678 loop_latch_edge (loop), UNKNOWN_LOCATION);
4679 }
4680
4681 /* 2. Create epilog code.
4682 The reduction epilog code operates across the elements of the vector
4683 of partial results computed by the vectorized loop.
4684 The reduction epilog code consists of:
4685
4686 step 1: compute the scalar result in a vector (v_out2)
4687 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4688 step 3: adjust the scalar result (s_out3) if needed.
4689
4690 Step 1 can be accomplished using one of the following three schemes:
4691 (scheme 1) using reduc_fn, if available.
4692 (scheme 2) using whole-vector shifts, if available.
4693 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4694 combined.
4695
4696 The overall epilog code looks like this:
4697
4698 s_out0 = phi <s_loop> # original EXIT_PHI
4699 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4700 v_out2 = reduce <v_out1> # step 1
4701 s_out3 = extract_field <v_out2, 0> # step 2
4702 s_out4 = adjust_result <s_out3> # step 3
4703
4704 (step 3 is optional, and steps 1 and 2 may be combined).
4705 Lastly, the uses of s_out0 are replaced by s_out4. */
4706
4707
4708 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4709 v_out1 = phi <VECT_DEF>
4710 Store them in NEW_PHIS. */
4711 if (double_reduc)
4712 loop = outer_loop;
4713 exit_bb = single_exit (loop)->dest;
4714 new_phis.create (slp_node ? vec_num : ncopies);
4715 for (unsigned i = 0; i < vec_num; i++)
4716 {
4717 if (slp_node)
4718 def = vect_get_slp_vect_def (slp_node, i);
4719 else
4720 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
4721 for (j = 0; j < ncopies; j++)
4722 {
4723 tree new_def = copy_ssa_name (def);
4724 phi = create_phi_node (new_def, exit_bb);
4725 if (j == 0)
4726 new_phis.quick_push (phi);
4727 else
4728 {
4729 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
4730 new_phis.quick_push (phi);
4731 }
4732
4733 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4734 }
4735 }
4736
4737 exit_gsi = gsi_after_labels (exit_bb);
4738
4739 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4740 (i.e. when reduc_fn is not available) and in the final adjustment
4741 code (if needed). Also get the original scalar reduction variable as
4742 defined in the loop. In case STMT is a "pattern-stmt" (i.e. it
4743 represents a reduction pattern), the tree-code and scalar-def are
4744 taken from the original stmt that the pattern-stmt (STMT) replaces.
4745 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4746 are taken from STMT. */
4747
4748 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4749 if (orig_stmt_info != stmt_info)
4750 {
4751 /* Reduction pattern */
4752 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4753 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4754 }
4755
4756 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4757 scalar_type = TREE_TYPE (scalar_dest);
4758 scalar_results.create (group_size);
4759 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4760 bitsize = TYPE_SIZE (scalar_type);
4761
4762 /* SLP reduction without reduction chain, e.g.,
4763 # a1 = phi <a2, a0>
4764 # b1 = phi <b2, b0>
4765 a2 = operation (a1)
4766 b2 = operation (b1) */
4767 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4768
4769 /* True if we should implement SLP_REDUC using native reduction operations
4770 instead of scalar operations. */
4771 direct_slp_reduc = (reduc_fn != IFN_LAST
4772 && slp_reduc
4773 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4774
4775 /* In case of reduction chain, e.g.,
4776 # a1 = phi <a3, a0>
4777 a2 = operation (a1)
4778 a3 = operation (a2),
4779
4780 we may end up with more than one vector result. Here we reduce them to
4781 one vector. */
4782 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
4783 {
4784 gimple_seq stmts = NULL;
4785 tree first_vect = PHI_RESULT (new_phis[0]);
4786 first_vect = gimple_convert (&stmts, vectype, first_vect);
4787 for (k = 1; k < new_phis.length (); k++)
4788 {
4789 gimple *next_phi = new_phis[k];
4790 tree second_vect = PHI_RESULT (next_phi);
4791 second_vect = gimple_convert (&stmts, vectype, second_vect);
4792 first_vect = gimple_build (&stmts, code, vectype,
4793 first_vect, second_vect);
4794 }
4795 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4796
4797 new_phi_result = first_vect;
4798 new_phis.truncate (0);
4799 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
4800 }
4801 /* Likewise if we couldn't use a single def-use cycle. */
4802 else if (ncopies > 1)
4803 {
4804 gimple_seq stmts = NULL;
4805 tree first_vect = PHI_RESULT (new_phis[0]);
4806 first_vect = gimple_convert (&stmts, vectype, first_vect);
4807 for (int k = 1; k < ncopies; ++k)
4808 {
4809 tree second_vect = PHI_RESULT (new_phis[k]);
4810 second_vect = gimple_convert (&stmts, vectype, second_vect);
4811 first_vect = gimple_build (&stmts, code, vectype,
4812 first_vect, second_vect);
4813 }
4814 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4815 new_phi_result = first_vect;
4816 new_phis.truncate (0);
4817 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
4818 }
4819 else
4820 new_phi_result = PHI_RESULT (new_phis[0]);
4821
4822 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
4823 && reduc_fn != IFN_LAST)
4824 {
4825 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4826 various data values where the condition matched and another vector
4827 (INDUCTION_INDEX) containing all the indexes of those matches. We
4828 need to extract the last matching index (which will be the index with
4829 highest value) and use this to index into the data vector.
4830 For the case where there were no matches, the data vector will contain
4831 all default values and the index vector will be all zeros. */
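
      /* As an illustration with hypothetical values: NEW_PHI_RESULT
	 == { 9, 7, 4, 2 } and INDUCTION_INDEX == { 1, 0, 7, 0 } give a
	 max index of 7; the VEC_COND below then selects { 0, 0, 4, 0 }
	 and the final unsigned max reduction extracts 4 as the scalar
	 result.  */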
4832
4833 /* Get various versions of the type of the vector of indexes. */
4834 tree index_vec_type = TREE_TYPE (induction_index);
4835 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4836 tree index_scalar_type = TREE_TYPE (index_vec_type);
4837 tree index_vec_cmp_type = truth_type_for (index_vec_type);
4838
4839 /* Get an unsigned integer version of the type of the data vector. */
4840 int scalar_precision
4841 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4842 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4843 tree vectype_unsigned = build_vector_type
4844 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4845
4846 /* First we need to create a vector (ZERO_VEC) of zeros and another
4847 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4848 can create using a MAX reduction and then expanding.
4849 In the case where the loop never made any matches, the max index will
4850 be zero. */
4851
4852 /* Vector of {0, 0, 0,...}. */
4853 tree zero_vec = build_zero_cst (vectype);
4854
4855 gimple_seq stmts = NULL;
4856 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
4857 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4858
4859 /* Find maximum value from the vector of found indexes. */
4860 tree max_index = make_ssa_name (index_scalar_type);
4861 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4862 1, induction_index);
4863 gimple_call_set_lhs (max_index_stmt, max_index);
4864 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4865
4866 /* Vector of {max_index, max_index, max_index,...}. */
4867 tree max_index_vec = make_ssa_name (index_vec_type);
4868 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4869 max_index);
4870 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4871 max_index_vec_rhs);
4872 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4873
4874 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4875 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4876 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4877 otherwise. Only one value should match, resulting in a vector
4878 (VEC_COND) with one data value and the rest zeros.
4879 In the case where the loop never made any matches, every index will
4880 match, resulting in a vector with all data values (which will all be
4881 the default value). */
4882
4883 /* Compare the max index vector to the vector of found indexes to find
4884 the position of the max value. */
4885 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4886 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4887 induction_index,
4888 max_index_vec);
4889 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4890
4891 /* Use the compare to choose either values from the data vector or
4892 zero. */
4893 tree vec_cond = make_ssa_name (vectype);
4894 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4895 vec_compare, new_phi_result,
4896 zero_vec);
4897 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4898
4899 /* Finally we need to extract the data value from the vector (VEC_COND)
4900 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
4901 reduction, but because this doesn't exist, we can use a MAX reduction
4902 instead. The data value might be signed or a float so we need to cast
4903 it first.
4904 In the case where the loop never made any matches, the data values are
4905 all identical, and so will reduce down correctly. */
4906
4907 /* Make the matched data values unsigned. */
4908 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4909 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4910 vec_cond);
4911 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4912 VIEW_CONVERT_EXPR,
4913 vec_cond_cast_rhs);
4914 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4915
4916 /* Reduce down to a scalar value. */
4917 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4918 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4919 1, vec_cond_cast);
4920 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4921 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4922
4923 /* Convert the reduced value back to the result type and set as the
4924 result. */
4925 stmts = NULL;
4926 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4927 data_reduc);
4928 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4929 scalar_results.safe_push (new_temp);
4930 }
4931 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
4932 && reduc_fn == IFN_LAST)
4933 {
4934 /* Condition reduction without supported IFN_REDUC_MAX. Generate:
4935
4936 idx_val = induction_index[0];
4937 val = data_reduc[0];
4938 for (i = 1; i < nelts; ++i)
4939 if (induction_index[i] > idx_val)
4940 val = data_reduc[i], idx_val = induction_index[i];
4941 return val; */
4942
4943 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4944 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4945 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4946 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4947 /* Enforced by vectorizable_reduction, which ensures we have target
4948 support before allowing a conditional reduction on variable-length
4949 vectors. */
4950 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
4951 tree idx_val = NULL_TREE, val = NULL_TREE;
4952 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4953 {
4954 tree old_idx_val = idx_val;
4955 tree old_val = val;
4956 idx_val = make_ssa_name (idx_eltype);
4957 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4958 build3 (BIT_FIELD_REF, idx_eltype,
4959 induction_index,
4960 bitsize_int (el_size),
4961 bitsize_int (off)));
4962 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4963 val = make_ssa_name (data_eltype);
4964 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4965 build3 (BIT_FIELD_REF,
4966 data_eltype,
4967 new_phi_result,
4968 bitsize_int (el_size),
4969 bitsize_int (off)));
4970 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4971 if (off != 0)
4972 {
4973 tree new_idx_val = idx_val;
4974 if (off != v_size - el_size)
4975 {
4976 new_idx_val = make_ssa_name (idx_eltype);
4977 epilog_stmt = gimple_build_assign (new_idx_val,
4978 MAX_EXPR, idx_val,
4979 old_idx_val);
4980 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4981 }
4982 tree new_val = make_ssa_name (data_eltype);
4983 epilog_stmt = gimple_build_assign (new_val,
4984 COND_EXPR,
4985 build2 (GT_EXPR,
4986 boolean_type_node,
4987 idx_val,
4988 old_idx_val),
4989 val, old_val);
4990 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4991 idx_val = new_idx_val;
4992 val = new_val;
4993 }
4994 }
4995 /* Convert the reduced value back to the result type and set as the
4996 result. */
4997 gimple_seq stmts = NULL;
4998 val = gimple_convert (&stmts, scalar_type, val);
4999 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5000 scalar_results.safe_push (val);
5001 }
5002
5003 /* 2.3 Create the reduction code, using one of the three schemes described
5004 above. In SLP we simply need to extract all the elements from the
5005 vector (without reducing them), so we use scalar shifts. */
5006 else if (reduc_fn != IFN_LAST && !slp_reduc)
5007 {
5008 tree tmp;
5009 tree vec_elem_type;
5010
5011 /* Case 1: Create:
5012 v_out2 = reduc_expr <v_out1> */
5013
5014 if (dump_enabled_p ())
5015 dump_printf_loc (MSG_NOTE, vect_location,
5016 "Reduce using direct vector reduction.\n");
5017
5018 gimple_seq stmts = NULL;
5019 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5020 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5021 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5022 vec_elem_type, new_phi_result);
5023 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5024 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5025
5026 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5027 && induc_val)
5028 {
5029 /* Earlier we set the initial value to be a vector of induc_val
5030 values. Check the result and if it is induc_val then replace
5031 with the original initial value, unless induc_val is
5032 the same as initial_def already. */
5033 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5034 induc_val);
5035
5036 tmp = make_ssa_name (new_scalar_dest);
5037 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5038 initial_def, new_temp);
5039 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5040 new_temp = tmp;
5041 }
5042
5043 scalar_results.safe_push (new_temp);
5044 }
5045 else if (direct_slp_reduc)
5046 {
5047 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5048 with the elements for other SLP statements replaced with the
5049 neutral value. We can then do a normal reduction on each vector. */
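 /* For instance (a sketch with hypothetical lanes): with REDUC_GROUP_SIZE 2
 and a vector {a0, b0, a1, b1}, lane j belongs to SLP result j & 1, so we
 build {a0, N, a1, N} for result 0 and {N, b0, N, b1} for result 1, where
 N is the neutral (or initial) value, and reduce each with REDUC_FN. */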
5050
5051 /* Enforced by vectorizable_reduction. */
5052 gcc_assert (new_phis.length () == 1);
5053 gcc_assert (pow2p_hwi (group_size));
5054
5055 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5056 vec<stmt_vec_info> orig_phis
5057 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5058 gimple_seq seq = NULL;
5059
5060 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5061 and the same element size as VECTYPE. */
5062 tree index = build_index_vector (vectype, 0, 1);
5063 tree index_type = TREE_TYPE (index);
5064 tree index_elt_type = TREE_TYPE (index_type);
5065 tree mask_type = truth_type_for (index_type);
5066
5067 /* Create a vector that, for each element, identifies which of
5068 the REDUC_GROUP_SIZE results should use it. */
5069 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5070 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5071 build_vector_from_val (index_type, index_mask));
5072
5073 /* Get a neutral vector value. This is simply a splat of the neutral
5074 scalar value if we have one, otherwise the initial scalar value
5075 is itself a neutral value. */
5076 tree vector_identity = NULL_TREE;
5077 tree neutral_op = NULL_TREE;
5078 if (slp_node)
5079 {
5080 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
5081 neutral_op
5082 = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis,
5083 vectype, code, first != NULL);
5084 }
5085 if (neutral_op)
5086 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5087 neutral_op);
5088 for (unsigned int i = 0; i < group_size; ++i)
5089 {
5090 /* If there's no universal neutral value, we can use the
5091 initial scalar value from the original PHI. This is used
5092 for MIN and MAX reduction, for example. */
5093 if (!neutral_op)
5094 {
5095 tree scalar_value
5096 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5097 loop_preheader_edge (loop));
5098 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5099 scalar_value);
5100 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5101 scalar_value);
5102 }
5103
5104 /* Calculate the equivalent of:
5105
5106 sel[j] = (index[j] == i);
5107
5108 which selects the elements of NEW_PHI_RESULT that should
5109 be included in the result. */
5110 tree compare_val = build_int_cst (index_elt_type, i);
5111 compare_val = build_vector_from_val (index_type, compare_val);
5112 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5113 index, compare_val);
5114
5115 /* Calculate the equivalent of:
5116
5117 vec = sel ? new_phi_result : vector_identity;
5118
5119 VEC is now suitable for a full vector reduction. */
5120 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5121 sel, new_phi_result, vector_identity);
5122
5123 /* Do the reduction and convert it to the appropriate type. */
5124 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5125 TREE_TYPE (vectype), vec);
5126 scalar = gimple_convert (&seq, scalar_type, scalar);
5127 scalar_results.safe_push (scalar);
5128 }
5129 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5130 }
5131 else
5132 {
5133 bool reduce_with_shift;
5134 tree vec_temp;
5135
5136 gcc_assert (slp_reduc || new_phis.length () == 1);
5137
5138 /* See if the target wants to do the final (shift) reduction
5139 in a vector mode of smaller size and first reduce upper/lower
5140 halves against each other. */
5141 enum machine_mode mode1 = mode;
5142 tree stype = TREE_TYPE (vectype);
5143 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5144 unsigned nunits1 = nunits;
5145 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5146 && new_phis.length () == 1)
5147 {
5148 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5149 /* For SLP reductions we have to make sure lanes match up, but
5150 since we're doing an individual-element final reduction, reducing
5151 the vector width here is even more important.
5152 ??? We can also separate lanes with permutes; for the common
5153 case of a power-of-two group size, odd/even extracts would work. */
5154 if (slp_reduc && nunits != nunits1)
5155 {
5156 nunits1 = least_common_multiple (nunits1, group_size);
5157 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5158 }
5159 }
5160 if (!slp_reduc
5161 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5162 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5163
5164 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5165 stype, nunits1);
5166 reduce_with_shift = have_whole_vector_shift (mode1);
5167 if (!VECTOR_MODE_P (mode1))
5168 reduce_with_shift = false;
5169 else
5170 {
5171 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5172 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5173 reduce_with_shift = false;
5174 }
5175
5176 /* First reduce the vector to the desired vector size on which we
5177 should do the shift reduction, by combining upper and lower halves. */
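 /* E.g. (hypothetical types): reducing a V8SI accumulator to V4SI extracts
 the low and high V4SI halves, combines them with CODE, and repeats until
 NUNITS1 lanes remain; the final reduction then operates on the narrower
 vector. */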
5178 new_temp = new_phi_result;
5179 while (nunits > nunits1)
5180 {
5181 nunits /= 2;
5182 vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5183 stype, nunits);
5184 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5185
5186 /* The target has to make sure we support lowpart/highpart
5187 extraction, either via direct vector extract or through
5188 punning to an integer mode vector. */
5189 tree dst1, dst2;
5190 if (convert_optab_handler (vec_extract_optab,
5191 TYPE_MODE (TREE_TYPE (new_temp)),
5192 TYPE_MODE (vectype1))
5193 != CODE_FOR_nothing)
5194 {
5195 /* Extract sub-vectors directly once vec_extract becomes
5196 a conversion optab. */
5197 dst1 = make_ssa_name (vectype1);
5198 epilog_stmt
5199 = gimple_build_assign (dst1, BIT_FIELD_REF,
5200 build3 (BIT_FIELD_REF, vectype1,
5201 new_temp, TYPE_SIZE (vectype1),
5202 bitsize_int (0)));
5203 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5204 dst2 = make_ssa_name (vectype1);
5205 epilog_stmt
5206 = gimple_build_assign (dst2, BIT_FIELD_REF,
5207 build3 (BIT_FIELD_REF, vectype1,
5208 new_temp, TYPE_SIZE (vectype1),
5209 bitsize_int (bitsize)));
5210 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5211 }
5212 else
5213 {
5214 /* Extract via punning to an appropriately sized integer mode
5215 vector. */
5216 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5217 tree etype = build_vector_type (eltype, 2);
5218 gcc_assert (convert_optab_handler (vec_extract_optab,
5219 TYPE_MODE (etype),
5220 TYPE_MODE (eltype))
5221 != CODE_FOR_nothing);
5222 tree tem = make_ssa_name (etype);
5223 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5224 build1 (VIEW_CONVERT_EXPR,
5225 etype, new_temp));
5226 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5227 new_temp = tem;
5228 tem = make_ssa_name (eltype);
5229 epilog_stmt
5230 = gimple_build_assign (tem, BIT_FIELD_REF,
5231 build3 (BIT_FIELD_REF, eltype,
5232 new_temp, TYPE_SIZE (eltype),
5233 bitsize_int (0)));
5234 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5235 dst1 = make_ssa_name (vectype1);
5236 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5237 build1 (VIEW_CONVERT_EXPR,
5238 vectype1, tem));
5239 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5240 tem = make_ssa_name (eltype);
5241 epilog_stmt
5242 = gimple_build_assign (tem, BIT_FIELD_REF,
5243 build3 (BIT_FIELD_REF, eltype,
5244 new_temp, TYPE_SIZE (eltype),
5245 bitsize_int (bitsize)));
5246 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5247 dst2 = make_ssa_name (vectype1);
5248 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5249 build1 (VIEW_CONVERT_EXPR,
5250 vectype1, tem));
5251 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5252 }
5253
5254 new_temp = make_ssa_name (vectype1);
5255 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5256 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5257 new_phis[0] = epilog_stmt;
5258 }
5259
5260 if (reduce_with_shift && !slp_reduc)
5261 {
5262 int element_bitsize = tree_to_uhwi (bitsize);
5263 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5264 for variable-length vectors and also requires direct target support
5265 for loop reductions. */
5266 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5267 int nelements = vec_size_in_bits / element_bitsize;
5268 vec_perm_builder sel;
5269 vec_perm_indices indices;
5270
5271 int elt_offset;
5272
5273 tree zero_vec = build_zero_cst (vectype1);
5274 /* Case 2: Create:
5275 for (offset = nelements/2; offset >= 1; offset/=2)
5276 {
5277 Create: va' = vec_shift <va, offset>
5278 Create: va = vop <va, va'>
5279 } */
5280
5281 tree rhs;
5282
5283 if (dump_enabled_p ())
5284 dump_printf_loc (MSG_NOTE, vect_location,
5285 "Reduce using vector shifts\n");
5286
5287 gimple_seq stmts = NULL;
5288 new_temp = gimple_convert (&stmts, vectype1, new_temp);
5289 for (elt_offset = nelements / 2;
5290 elt_offset >= 1;
5291 elt_offset /= 2)
5292 {
5293 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5294 indices.new_vector (sel, 2, nelements);
5295 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5296 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5297 new_temp, zero_vec, mask);
5298 new_temp = gimple_build (&stmts, code,
5299 vectype1, new_name, new_temp);
5300 }
5301 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5302
5303 /* 2.4 Extract the final scalar result. Create:
5304 s_out3 = extract_field <v_out2, bitpos> */
5305
5306 if (dump_enabled_p ())
5307 dump_printf_loc (MSG_NOTE, vect_location,
5308 "extract scalar result\n");
5309
5310 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5311 bitsize, bitsize_zero_node);
5312 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5313 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5314 gimple_assign_set_lhs (epilog_stmt, new_temp);
5315 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5316 scalar_results.safe_push (new_temp);
5317 }
5318 else
5319 {
5320 /* Case 3: Create:
5321 s = extract_field <v_out2, 0>
5322 for (offset = element_size;
5323 offset < vector_size;
5324 offset += element_size;)
5325 {
5326 Create: s' = extract_field <v_out2, offset>
5327 Create: s = op <s, s'> // For non SLP cases
5328 } */
5329
5330 if (dump_enabled_p ())
5331 dump_printf_loc (MSG_NOTE, vect_location,
5332 "Reduce using scalar code.\n");
5333
5334 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5335 int element_bitsize = tree_to_uhwi (bitsize);
5336 tree compute_type = TREE_TYPE (vectype);
5337 gimple_seq stmts = NULL;
5338 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5339 {
5340 int bit_offset;
5341 if (gimple_code (new_phi) == GIMPLE_PHI)
5342 vec_temp = PHI_RESULT (new_phi);
5343 else
5344 vec_temp = gimple_assign_lhs (new_phi);
5345 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5346 vec_temp, bitsize, bitsize_zero_node);
5347
5348 /* In SLP we don't need to apply the reduction operation, so we just
5349 collect the s' values in SCALAR_RESULTS. */
5350 if (slp_reduc)
5351 scalar_results.safe_push (new_temp);
5352
5353 for (bit_offset = element_bitsize;
5354 bit_offset < vec_size_in_bits;
5355 bit_offset += element_bitsize)
5356 {
5357 tree bitpos = bitsize_int (bit_offset);
5358 new_name = gimple_build (&stmts, BIT_FIELD_REF,
5359 compute_type, vec_temp,
5360 bitsize, bitpos);
5361 if (slp_reduc)
5362 {
5363 /* In SLP we don't need to apply the reduction operation, so
5364 we just collect the s' values in SCALAR_RESULTS. */
5365 new_temp = new_name;
5366 scalar_results.safe_push (new_name);
5367 }
5368 else
5369 new_temp = gimple_build (&stmts, code, compute_type,
5370 new_name, new_temp);
5371 }
5372 }
5373
5374 /* The only case where we need to reduce scalar results in SLP is
5375 unrolling. If the size of SCALAR_RESULTS is greater than
5376 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5377 REDUC_GROUP_SIZE. */
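 /* For example (a hypothetical unrolled SLP reduction): with
 REDUC_GROUP_SIZE 2 and SCALAR_RESULTS {s0, s1, s2, s3}, the loop below
 computes s0 = s0 CODE s2 and s1 = s1 CODE s3 and keeps the first
 REDUC_GROUP_SIZE entries. */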
5378 if (slp_reduc)
5379 {
5380 tree res, first_res, new_res;
5381
5382 /* Reduce multiple scalar results in case of SLP unrolling. */
5383 for (j = group_size; scalar_results.iterate (j, &res);
5384 j++)
5385 {
5386 first_res = scalar_results[j % group_size];
5387 new_res = gimple_build (&stmts, code, compute_type,
5388 first_res, res);
5389 scalar_results[j % group_size] = new_res;
5390 }
5391 for (k = 0; k < group_size; k++)
5392 scalar_results[k] = gimple_convert (&stmts, scalar_type,
5393 scalar_results[k]);
5394 }
5395 else
5396 {
5397 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5398 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5399 scalar_results.safe_push (new_temp);
5400 }
5401
5402 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5403 }
5404
5405 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5406 && induc_val)
5407 {
5408 /* Earlier we set the initial value to be a vector of induc_val
5409 values. Check the result and if it is induc_val then replace
5410 with the original initial value, unless induc_val is
5411 the same as initial_def already. */
5412 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5413 induc_val);
5414
5415 tree tmp = make_ssa_name (new_scalar_dest);
5416 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5417 initial_def, new_temp);
5418 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5419 scalar_results[0] = tmp;
5420 }
5421 }
5422
5423 /* 2.5 Adjust the final result by the initial value of the reduction
5424 variable. (When such adjustment is not needed, then
5425 'adjustment_def' is zero). For example, if code is PLUS we create:
5426 new_temp = loop_exit_def + adjustment_def */
5427
5428 if (adjustment_def)
5429 {
5430 gcc_assert (!slp_reduc);
5431 gimple_seq stmts = NULL;
5432 if (nested_in_vect_loop)
5433 {
5434 new_phi = new_phis[0];
5435 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
5436 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
5437 new_temp = gimple_build (&stmts, code, vectype,
5438 PHI_RESULT (new_phi), adjustment_def);
5439 }
5440 else
5441 {
5442 new_temp = scalar_results[0];
5443 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5444 adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
5445 new_temp = gimple_build (&stmts, code, scalar_type,
5446 new_temp, adjustment_def);
5447 }
5448
5449 epilog_stmt = gimple_seq_last_stmt (stmts);
5450 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5451 if (nested_in_vect_loop)
5452 {
5453 if (!double_reduc)
5454 scalar_results.quick_push (new_temp);
5455 else
5456 scalar_results[0] = new_temp;
5457 }
5458 else
5459 scalar_results[0] = new_temp;
5460
5461 new_phis[0] = epilog_stmt;
5462 }
5463
5464 if (double_reduc)
5465 loop = loop->inner;
5466
5467 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5468 phis with new adjusted scalar results, i.e., replace use <s_out0>
5469 with use <s_out4>.
5470
5471 Transform:
5472 loop_exit:
5473 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5474 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5475 v_out2 = reduce <v_out1>
5476 s_out3 = extract_field <v_out2, 0>
5477 s_out4 = adjust_result <s_out3>
5478 use <s_out0>
5479 use <s_out0>
5480
5481 into:
5482
5483 loop_exit:
5484 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5485 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5486 v_out2 = reduce <v_out1>
5487 s_out3 = extract_field <v_out2, 0>
5488 s_out4 = adjust_result <s_out3>
5489 use <s_out4>
5490 use <s_out4> */
5491
5492
5493 /* In an SLP reduction chain we reduce the vector results into one vector
5494 if necessary; hence we set REDUC_GROUP_SIZE to 1 here. SCALAR_DEST is
5495 the LHS of the last stmt in the reduction chain, since we are looking
5496 for the loop exit phi node. */
5497 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5498 {
5499 stmt_vec_info dest_stmt_info
5500 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5501 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5502 group_size = 1;
5503 }
5504
5505 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5506 case REDUC_GROUP_SIZE is greater than the vectorization factor).
5507 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5508 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5509 correspond to the first vector stmt, etc.
5510 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
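 /* E.g. with a (hypothetical) REDUC_GROUP_SIZE of 4 and two new vector
 stmts, RATIO is 2: scalar results 0-1 belong to the first vector stmt
 and results 2-3 to the second. */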
5511 if (group_size > new_phis.length ())
5512 gcc_assert (!(group_size % new_phis.length ()));
5513
5514 for (k = 0; k < group_size; k++)
5515 {
5516 if (slp_reduc)
5517 {
5518 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5519
5520 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5521 /* SLP statements can't participate in patterns. */
5522 gcc_assert (!orig_stmt_info);
5523 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5524 }
5525
5526 if (nested_in_vect_loop)
5527 {
5528 if (double_reduc)
5529 loop = outer_loop;
5530 else
5531 gcc_unreachable ();
5532 }
5533
5534 phis.create (3);
5535 /* Find the loop-closed-use at the loop exit of the original scalar
5536 result. (The reduction result is expected to have two immediate uses,
5537 one at the latch block, and one at the loop exit). For double
5538 reductions we are looking for exit phis of the outer loop. */
5539 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5540 {
5541 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5542 {
5543 if (!is_gimple_debug (USE_STMT (use_p)))
5544 phis.safe_push (USE_STMT (use_p));
5545 }
5546 else
5547 {
5548 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5549 {
5550 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5551
5552 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5553 {
5554 if (!flow_bb_inside_loop_p (loop,
5555 gimple_bb (USE_STMT (phi_use_p)))
5556 && !is_gimple_debug (USE_STMT (phi_use_p)))
5557 phis.safe_push (USE_STMT (phi_use_p));
5558 }
5559 }
5560 }
5561 }
5562
5563 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5564 {
5565 /* Replace the uses: */
5566 orig_name = PHI_RESULT (exit_phi);
5567 scalar_result = scalar_results[k];
5568 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5569 {
5570 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5571 SET_USE (use_p, scalar_result);
5572 update_stmt (use_stmt);
5573 }
5574 }
5575
5576 phis.release ();
5577 }
5578 }
5579
5580 /* Return a vector of type VECTYPE that is equal to the vector select
5581 operation "MASK ? VEC : IDENTITY". Insert the select statements
5582 before GSI. */
5583
5584 static tree
5585 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5586 tree vec, tree identity)
5587 {
5588 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5589 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5590 mask, vec, identity);
5591 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5592 return cond;
5593 }
5594
5595 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5596 order, starting with LHS. Insert the extraction statements before GSI and
5597 associate the new scalar SSA names with variable SCALAR_DEST.
5598 Return the SSA name for the result. */
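 /* E.g. for a four-element VECTOR_RHS the emitted gimple computes, in
 effect, (((LHS CODE v[0]) CODE v[1]) CODE v[2]) CODE v[3], using one
 BIT_FIELD_REF extraction and one scalar CODE statement per element
 (a sketch; v[i] denotes element i for illustration only). */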
5599
5600 static tree
5601 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5602 tree_code code, tree lhs, tree vector_rhs)
5603 {
5604 tree vectype = TREE_TYPE (vector_rhs);
5605 tree scalar_type = TREE_TYPE (vectype);
5606 tree bitsize = TYPE_SIZE (scalar_type);
5607 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5608 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5609
5610 for (unsigned HOST_WIDE_INT bit_offset = 0;
5611 bit_offset < vec_size_in_bits;
5612 bit_offset += element_bitsize)
5613 {
5614 tree bitpos = bitsize_int (bit_offset);
5615 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5616 bitsize, bitpos);
5617
5618 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5619 rhs = make_ssa_name (scalar_dest, stmt);
5620 gimple_assign_set_lhs (stmt, rhs);
5621 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5622
5623 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5624 tree new_name = make_ssa_name (scalar_dest, stmt);
5625 gimple_assign_set_lhs (stmt, new_name);
5626 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5627 lhs = new_name;
5628 }
5629 return lhs;
5630 }
5631
5632 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
5633 type of the vector input. */
5634
5635 static internal_fn
5636 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
5637 {
5638 internal_fn mask_reduc_fn;
5639
5640 switch (reduc_fn)
5641 {
5642 case IFN_FOLD_LEFT_PLUS:
5643 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
5644 break;
5645
5646 default:
5647 return IFN_LAST;
5648 }
5649
5650 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
5651 OPTIMIZE_FOR_SPEED))
5652 return mask_reduc_fn;
5653 return IFN_LAST;
5654 }
5655
5656 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5657 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5658 statement. CODE is the operation performed by STMT_INFO and OPS are
5659 its scalar operands. REDUC_INDEX is the index of the operand in
5660 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5661 implements in-order reduction, or IFN_LAST if we should open-code it.
5662 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5663 that should be used to control the operation in a fully-masked loop. */
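 /* A sketch of the emitted code for the common unmasked case with
 REDUC_FN = IFN_FOLD_LEFT_PLUS (names are illustrative only):

 reduc_1 = .FOLD_LEFT_PLUS (reduc_0, vdef_0);
 ...
 scalar_dest = .FOLD_LEFT_PLUS (reduc_n, vdef_n);

 which preserves the scalar, in-order evaluation of the reduction. */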
5664
5665 static bool
5666 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
5667 stmt_vec_info stmt_info,
5668 gimple_stmt_iterator *gsi,
5669 gimple **vec_stmt, slp_tree slp_node,
5670 gimple *reduc_def_stmt,
5671 tree_code code, internal_fn reduc_fn,
5672 tree ops[3], tree vectype_in,
5673 int reduc_index, vec_loop_masks *masks)
5674 {
5675 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5676 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5677 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
5678
5679 int ncopies;
5680 if (slp_node)
5681 ncopies = 1;
5682 else
5683 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5684
5685 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5686 gcc_assert (ncopies == 1);
5687 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5688
5689 if (slp_node)
5690 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5691 TYPE_VECTOR_SUBPARTS (vectype_in)));
5692
5693 tree op0 = ops[1 - reduc_index];
5694
5695 int group_size = 1;
5696 stmt_vec_info scalar_dest_def_info;
5697 auto_vec<tree> vec_oprnds0;
5698 if (slp_node)
5699 {
5700 auto_vec<vec<tree> > vec_defs (2);
5701 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
5702 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
5703 vec_defs[0].release ();
5704 vec_defs[1].release ();
5705 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5706 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5707 }
5708 else
5709 {
5710 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
5711 op0, &vec_oprnds0);
5712 scalar_dest_def_info = stmt_info;
5713 }
5714
5715 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5716 tree scalar_type = TREE_TYPE (scalar_dest);
5717 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5718
5719 int vec_num = vec_oprnds0.length ();
5720 gcc_assert (vec_num == 1 || slp_node);
5721 tree vec_elem_type = TREE_TYPE (vectype_out);
5722 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5723
5724 tree vector_identity = NULL_TREE;
5725 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5726 vector_identity = build_zero_cst (vectype_out);
5727
5728 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5729 int i;
5730 tree def0;
5731 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5732 {
5733 gimple *new_stmt;
5734 tree mask = NULL_TREE;
5735 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5736 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5737
5738 /* Handle MINUS by adding the negative. */
5739 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5740 {
5741 tree negated = make_ssa_name (vectype_out);
5742 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5743 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5744 def0 = negated;
5745 }
5746
5747 if (mask && mask_reduc_fn == IFN_LAST)
5748 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5749 vector_identity);
5750
5751 /* On the first iteration the input is simply the scalar phi
5752 result, and for subsequent iterations it is the output of
5753 the preceding operation. */
5754 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
5755 {
5756 if (mask && mask_reduc_fn != IFN_LAST)
5757 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
5758 def0, mask);
5759 else
5760 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
5761 def0);
5762 /* For chained SLP reductions the output of the previous reduction
5763 operation serves as the input of the next. For the final statement
5764 the output cannot be a temporary - we reuse the original
5765 scalar destination of the last statement. */
5766 if (i != vec_num - 1)
5767 {
5768 gimple_set_lhs (new_stmt, scalar_dest_var);
5769 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5770 gimple_set_lhs (new_stmt, reduc_var);
5771 }
5772 }
5773 else
5774 {
5775 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5776 reduc_var, def0);
5777 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5778 /* Remove the statement, so that we can use the same code paths
5779 as for statements that we've just created. */
5780 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5781 gsi_remove (&tmp_gsi, true);
5782 }
5783
5784 if (i == vec_num - 1)
5785 {
5786 gimple_set_lhs (new_stmt, scalar_dest);
5787 vect_finish_replace_stmt (loop_vinfo,
5788 scalar_dest_def_info,
5789 new_stmt);
5790 }
5791 else
5792 vect_finish_stmt_generation (loop_vinfo,
5793 scalar_dest_def_info,
5794 new_stmt, gsi);
5795
5796 if (slp_node)
5797 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5798 else
5799 {
5800 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
5801 *vec_stmt = new_stmt;
5802 }
5803 }
5804
5805 return true;
5806 }
5807
5808 /* Function is_nonwrapping_integer_induction.
5809
5810 Check that STMT_VINFO (which is part of loop LOOP) describes an integer
5811 induction that increments and does not cause overflow. */
5812
5813 static bool
5814 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
5815 {
5816 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
5817 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5818 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5819 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
5820 widest_int ni, max_loop_value, lhs_max;
5821 wi::overflow_type overflow = wi::OVF_NONE;
5822
5823 /* Make sure the loop is integer based. */
5824 if (TREE_CODE (base) != INTEGER_CST
5825 || TREE_CODE (step) != INTEGER_CST)
5826 return false;
5827
5828 /* Check that the max size of the loop will not wrap. */
5829
5830 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5831 return true;
5832
5833 if (! max_stmt_executions (loop, &ni))
5834 return false;
5835
5836 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5837 &overflow);
5838 if (overflow)
5839 return false;
5840
5841 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5842 TYPE_SIGN (lhs_type), &overflow);
5843 if (overflow)
5844 return false;
5845
5846 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5847 <= TYPE_PRECISION (lhs_type));
5848 }
5849
5850 /* Check if masking can be supported by inserting a conditional expression.
5851 CODE is the code for the operation. COND_FN is the conditional internal
5852 function, if it exists. VECTYPE_IN is the type of the vector input. */
5853 static bool
5854 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
5855 tree vectype_in)
5856 {
5857 if (cond_fn != IFN_LAST
5858 && direct_internal_fn_supported_p (cond_fn, vectype_in,
5859 OPTIMIZE_FOR_SPEED))
5860 return false;
5861
5862 switch (code)
5863 {
5864 case DOT_PROD_EXPR:
5865 case SAD_EXPR:
5866 return true;
5867
5868 default:
5869 return false;
5870 }
5871 }
5872
5873 /* Insert a conditional expression to enable masked vectorization. CODE is the
5874 code for the operation. VOP is the array of operands. MASK is the loop
5875 mask. GSI is a statement iterator used to place the new conditional
5876 expression. */
5877 static void
5878 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
5879 gimple_stmt_iterator *gsi)
5880 {
5881 switch (code)
5882 {
5883 case DOT_PROD_EXPR:
5884 {
5885 tree vectype = TREE_TYPE (vop[1]);
5886 tree zero = build_zero_cst (vectype);
5887 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
5888 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
5889 mask, vop[1], zero);
5890 gsi_insert_before (gsi, select, GSI_SAME_STMT);
5891 vop[1] = masked_op1;
5892 break;
5893 }
5894
5895 case SAD_EXPR:
5896 {
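 /* Selecting VOP[0] (rather than zero) for the inactive lanes makes
 the absolute difference in those lanes zero, so they contribute
 nothing to the accumulated sum. */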
5897 tree vectype = TREE_TYPE (vop[1]);
5898 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
5899 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
5900 mask, vop[1], vop[0]);
5901 gsi_insert_before (gsi, select, GSI_SAME_STMT);
5902 vop[1] = masked_op1;
5903 break;
5904 }
5905
5906 default:
5907 gcc_unreachable ();
5908 }
5909 }
5910
5911 /* Function vectorizable_reduction.
5912
5913 Check if STMT_INFO performs a reduction operation that can be vectorized.
5914 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
5915 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5916 Return true if STMT_INFO is vectorizable in this way.
5917
5918 This function also handles reduction idioms (patterns) that have been
5919 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
5920 may be of this form:
5921 X = pattern_expr (arg0, arg1, ..., X)
5922 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5923 sequence that had been detected and replaced by the pattern-stmt
5924 (STMT_INFO).
5925
5926 This function also handles reduction of condition expressions, for example:
5927 for (int i = 0; i < N; i++)
5928 if (a[i] < value)
5929 last = a[i];
5930 This is handled by vectorizing the loop and creating an additional vector
5931 containing the loop indexes for which "a[i] < value" was true. In the
5932 function epilogue this is reduced to a single max value and then used to
5933 index into the vector of results.
5934
5935 In some cases of reduction patterns, the type of the reduction variable X is
5936 different than the type of the other arguments of STMT_INFO.
5937 In such cases, the vectype that is used when transforming STMT_INFO into
5938 a vector stmt is different than the vectype that is used to determine the
5939 vectorization factor, because it consists of a different number of elements
5940 than the actual number of elements that are being operated upon in parallel.
5941
5942 For example, consider an accumulation of shorts into an int accumulator.
5943 On some targets it's possible to vectorize this pattern operating on 8
5944 shorts at a time (hence, the vectype for purposes of determining the
5945 vectorization factor should be V8HI); on the other hand, the vectype that
5946 is used to create the vector form is actually V4SI (the type of the result).
5947
5948 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5949 indicates the actual level of parallelism (V8HI in the example), so
5950 that the right vectorization factor is derived. This vectype
5951 corresponds to the type of arguments to the reduction stmt, and should *NOT*
5952 be used to create the vectorized stmt. The right vectype for the vectorized
5953 stmt is obtained from the type of the result X:
5954 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
5955
5956 This means that, contrary to "regular" reductions (or "regular" stmts in
5957 general), the following equation:
5958 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
5959 does *NOT* necessarily hold for reduction patterns. */
5960
5961 bool
5962 vectorizable_reduction (loop_vec_info loop_vinfo,
5963 stmt_vec_info stmt_info, slp_tree slp_node,
5964 slp_instance slp_node_instance,
5965 stmt_vector_for_cost *cost_vec)
5966 {
5967 tree scalar_dest;
5968 tree vectype_in = NULL_TREE;
5969 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5970 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
5971 stmt_vec_info cond_stmt_vinfo = NULL;
5972 tree scalar_type;
5973 int i;
5974 int ncopies;
5975 bool single_defuse_cycle = false;
5976 bool nested_cycle = false;
5977 bool double_reduc = false;
5978 int vec_num;
5979 tree tem;
5980 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5981 tree cond_reduc_val = NULL_TREE;
5982
5983 /* Make sure it was already recognized as a reduction computation. */
5984 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
5985 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
5986 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
5987 return false;
5988
5989 /* The stmt we store reduction analysis meta on. */
5990 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5991 reduc_info->is_reduc_info = true;
5992
5993 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5994 {
5995 if (is_a <gphi *> (stmt_info->stmt))
5996 /* Analysis for double-reduction is done on the outer
5997 loop PHI; nested cycles have no further restrictions. */
5998 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
5999 else
6000 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6001 return true;
6002 }
6003
6004 stmt_vec_info orig_stmt_of_analysis = stmt_info;
6005 stmt_vec_info phi_info = stmt_info;
6006 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
6007 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6008 {
6009 if (!is_a <gphi *> (stmt_info->stmt))
6010 {
6011 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6012 return true;
6013 }
6014 if (slp_node)
6015 {
6016 slp_node_instance->reduc_phis = slp_node;
6017 /* ??? We're leaving slp_node to point to the PHIs; we only
6018 need it to get at the number of vector stmts, which wasn't
6019 yet initialized for the instance root. */
6020 }
6021 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6022 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
6023 else /* STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def */
6024 {
6025 use_operand_p use_p;
6026 gimple *use_stmt;
6027 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6028 &use_p, &use_stmt);
6029 gcc_assert (res);
6030 phi_info = loop_vinfo->lookup_stmt (use_stmt);
6031 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6032 }
6033 }
6034
6035 /* PHIs should not participate in patterns. */
6036 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6037 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6038
6039 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6040 and compute the reduction chain length. */
6041 tree reduc_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6042 loop_latch_edge (loop));
6043 unsigned reduc_chain_length = 0;
6044 bool only_slp_reduc_chain = true;
6045 stmt_info = NULL;
6046 while (reduc_def != PHI_RESULT (reduc_def_phi))
6047 {
6048 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6049 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6050 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6051 {
6052 if (dump_enabled_p ())
6053 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6054 "reduction chain broken by patterns.\n");
6055 return false;
6056 }
6057 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6058 only_slp_reduc_chain = false;
6059 /* ??? For epilogue generation live members of the chain need
6060 to point back to the PHI via their original stmt for
6061 info_for_reduction to work. */
6062 if (STMT_VINFO_LIVE_P (vdef))
6063 STMT_VINFO_REDUC_DEF (def) = phi_info;
6064 gassign *assign = dyn_cast <gassign *> (vdef->stmt);
6065 if (!assign)
6066 {
6067 if (dump_enabled_p ())
6068 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6069 "reduction chain includes calls.\n");
6070 return false;
6071 }
6072 if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
6073 {
6074 if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (assign)),
6075 TREE_TYPE (gimple_assign_rhs1 (assign))))
6076 {
6077 if (dump_enabled_p ())
6078 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6079 "conversion in the reduction chain.\n");
6080 return false;
6081 }
6082 }
6083 else if (!stmt_info)
6084 /* First non-conversion stmt. */
6085 stmt_info = vdef;
6086 reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
6087 reduc_chain_length++;
6088 }
6089 /* PHIs should not participate in patterns. */
6090 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6091
6092 if (nested_in_vect_loop_p (loop, stmt_info))
6093 {
6094 loop = loop->inner;
6095 nested_cycle = true;
6096 }
6097
6098 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6099 element. */
6100 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6101 {
6102 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6103 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6104 }
6105 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6106 gcc_assert (slp_node
6107 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6108
6109 /* 1. Is vectorizable reduction? */
6110 /* Not supportable if the reduction variable is used in the loop, unless
6111 it's a reduction chain. */
6112 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6113 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6114 return false;
6115
6116 /* Reductions that are not used even in an enclosing outer-loop
6117 are expected to be "live" (used out of the loop). */
6118 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6119 && !STMT_VINFO_LIVE_P (stmt_info))
6120 return false;
6121
6122 /* 2. Has this been recognized as a reduction pattern?
6123
6124 Check if STMT represents a pattern that has been recognized
6125 in earlier analysis stages. For stmts that represent a pattern,
6126 the STMT_VINFO_RELATED_STMT field records the last stmt in
6127 the original sequence that constitutes the pattern. */
6128
6129 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6130 if (orig_stmt_info)
6131 {
6132 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6133 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6134 }
6135
6136 /* 3. Check the operands of the operation. The first operands are defined
6137 inside the loop body. The last operand is the reduction variable,
6138 which is defined by the loop-header-phi. */
6139
6140 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6141 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6142 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6143 enum tree_code code = gimple_assign_rhs_code (stmt);
6144 bool lane_reduc_code_p
6145 = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR);
6146 int op_type = TREE_CODE_LENGTH (code);
6147
6148 scalar_dest = gimple_assign_lhs (stmt);
6149 scalar_type = TREE_TYPE (scalar_dest);
6150 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6151 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6152 return false;
6153
6154 /* Do not try to vectorize bit-precision reductions. */
6155 if (!type_has_mode_precision_p (scalar_type))
6156 return false;
6157
6158 /* For lane-reducing ops we're reducing the number of reduction PHIs,
6159 which means their only use may be in the lane-reducing operation. */
6160 if (lane_reduc_code_p
6161 && reduc_chain_length != 1
6162 && !only_slp_reduc_chain)
6163 {
6164 if (dump_enabled_p ())
6165 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6166 "lane-reducing reduction with extra stmts.\n");
6167 return false;
6168 }
6169
6170 /* All uses but the last are expected to be defined in the loop.
6171 The last use is the reduction variable. In case of nested cycle this
6172 assumption is not true: we use reduc_index to record the index of the
6173 reduction variable. */
6174 /* ??? To get at invariant/constant uses on the SLP node we have to
6175 get to it here, slp_node is still the reduction PHI. */
6176 slp_tree slp_for_stmt_info = NULL;
6177 if (slp_node)
6178 {
6179 slp_for_stmt_info = slp_node_instance->root;
6180 /* And then there's reduction chain with a conversion ... */
6181 if (SLP_TREE_REPRESENTATIVE (slp_for_stmt_info) != stmt_info)
6182 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6183 gcc_assert (SLP_TREE_REPRESENTATIVE (slp_for_stmt_info) == stmt_info);
6184 }
6185 slp_tree *slp_op = XALLOCAVEC (slp_tree, op_type);
6186 /* We need to skip an extra operand for COND_EXPRs with embedded
6187 comparison. */
6188 unsigned opno_adjust = 0;
6189 if (code == COND_EXPR
6190 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
6191 opno_adjust = 1;
6192 for (i = 0; i < op_type; i++)
6193 {
6194 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6195 if (i == 0 && code == COND_EXPR)
6196 continue;
6197
6198 stmt_vec_info def_stmt_info;
6199 enum vect_def_type dt;
6200 tree op;
6201 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
6202 i + opno_adjust, &op, &slp_op[i], &dt, &tem,
6203 &def_stmt_info))
6204 {
6205 if (dump_enabled_p ())
6206 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6207 "use not simple.\n");
6208 return false;
6209 }
6210 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6211 continue;
6212
6213 /* There should be only one cycle def in the stmt, the one
6214 leading to reduc_def. */
6215 if (VECTORIZABLE_CYCLE_DEF (dt))
6216 return false;
6217
6218 /* To properly compute ncopies we are interested in the widest
6219 non-reduction input type in case we're looking at a widening
6220 accumulation that we later handle in vect_transform_reduction. */
6221 if (lane_reduc_code_p
6222 && tem
6223 && (!vectype_in
6224 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6225 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6226 vectype_in = tem;
6227
6228 if (code == COND_EXPR)
6229 {
6230 /* Record how the non-reduction-def value of COND_EXPR is defined. */
6231 if (dt == vect_constant_def)
6232 {
6233 cond_reduc_dt = dt;
6234 cond_reduc_val = op;
6235 }
6236 if (dt == vect_induction_def
6237 && def_stmt_info
6238 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6239 {
6240 cond_reduc_dt = dt;
6241 cond_stmt_vinfo = def_stmt_info;
6242 }
6243 }
6244 }
6245 if (!vectype_in)
6246 vectype_in = STMT_VINFO_VECTYPE (phi_info);
6247 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6248
6249 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6250 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
6251 /* If we have a condition reduction, see if we can simplify it further. */
6252 if (v_reduc_type == COND_REDUCTION)
6253 {
6254 if (slp_node)
6255 return false;
6256
6257 /* If the reduction value is used in the condition itself, fail. */
6258 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6259 {
6260 if (dump_enabled_p ())
6261 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6262 "condition depends on previous iteration\n");
6263 return false;
6264 }
6265
6266 if (reduc_chain_length == 1
6267 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6268 vectype_in, OPTIMIZE_FOR_SPEED))
6269 {
6270 if (dump_enabled_p ())
6271 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6272 "optimizing condition reduction with"
6273 " FOLD_EXTRACT_LAST.\n");
6274 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6275 }
6276 else if (cond_reduc_dt == vect_induction_def)
6277 {
6278 tree base
6279 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6280 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6281
6282 gcc_assert (TREE_CODE (base) == INTEGER_CST
6283 && TREE_CODE (step) == INTEGER_CST);
6284 cond_reduc_val = NULL_TREE;
6285 enum tree_code cond_reduc_op_code = ERROR_MARK;
6286 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6287 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6288 ;
6289 /* Find a suitable value: below BASE for MAX_EXPR, above BASE for
6290 MIN_EXPR; punt for now if BASE is the minimum value of the type
6291 for MAX_EXPR or the maximum value of the type for MIN_EXPR. */
6292 else if (tree_int_cst_sgn (step) == -1)
6293 {
6294 cond_reduc_op_code = MIN_EXPR;
6295 if (tree_int_cst_sgn (base) == -1)
6296 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6297 else if (tree_int_cst_lt (base,
6298 TYPE_MAX_VALUE (TREE_TYPE (base))))
6299 cond_reduc_val
6300 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6301 }
6302 else
6303 {
6304 cond_reduc_op_code = MAX_EXPR;
6305 if (tree_int_cst_sgn (base) == 1)
6306 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6307 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6308 base))
6309 cond_reduc_val
6310 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6311 }
6312 if (cond_reduc_val)
6313 {
6314 if (dump_enabled_p ())
6315 dump_printf_loc (MSG_NOTE, vect_location,
6316 "condition expression based on "
6317 "integer induction.\n");
6318 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6319 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6320 = cond_reduc_val;
6321 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6322 }
6323 }
6324 else if (cond_reduc_dt == vect_constant_def)
6325 {
6326 enum vect_def_type cond_initial_dt;
6327 tree cond_initial_val
6328 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, loop_preheader_edge (loop));
6329
6330 gcc_assert (cond_reduc_val != NULL_TREE);
6331 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6332 if (cond_initial_dt == vect_constant_def
6333 && types_compatible_p (TREE_TYPE (cond_initial_val),
6334 TREE_TYPE (cond_reduc_val)))
6335 {
6336 tree e = fold_binary (LE_EXPR, boolean_type_node,
6337 cond_initial_val, cond_reduc_val);
6338 if (e && (integer_onep (e) || integer_zerop (e)))
6339 {
6340 if (dump_enabled_p ())
6341 dump_printf_loc (MSG_NOTE, vect_location,
6342 "condition expression based on "
6343 "compile time constant.\n");
6344 /* Record reduction code at analysis stage. */
6345 STMT_VINFO_REDUC_CODE (reduc_info)
6346 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6347 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6348 }
6349 }
6350 }
6351 }
6352
6353 if (STMT_VINFO_LIVE_P (phi_info))
6354 return false;
6355
6356 if (slp_node)
6357 ncopies = 1;
6358 else
6359 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6360
6361 gcc_assert (ncopies >= 1);
6362
6363 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6364
6365 if (nested_cycle)
6366 {
6367 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6368 == vect_double_reduction_def);
6369 double_reduc = true;
6370 }
6371
6372 /* 4.2. Check support for the epilog operation.
6373
6374 If STMT represents a reduction pattern, then the type of the
6375 reduction variable may be different than the type of the rest
6376 of the arguments. For example, consider the case of accumulation
6377 of shorts into an int accumulator; the original code:
6378 S1: int_a = (int) short_a;
6379 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6380
6381 was replaced with:
6382 STMT: int_acc = widen_sum <short_a, int_acc>
6383
6384 This means that:
6385 1. The tree-code that is used to create the vector operation in the
6386 epilog code (that reduces the partial results) is not the
6387 tree-code of STMT, but is rather the tree-code of the original
6388 stmt from the pattern that STMT is replacing. I.e, in the example
6389 above we want to use 'widen_sum' in the loop, but 'plus' in the
6390 epilog.
6391 2. The type (mode) we use to check available target support
6392 for the vector operation to be created in the *epilog*, is
6393 determined by the type of the reduction variable (in the example
6394 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6395 However the type (mode) we use to check available target support
6396 for the vector operation to be created *inside the loop*, is
6397 determined by the type of the other arguments to STMT (in the
6398 example we'd check this: optab_handler (widen_sum_optab,
6399 vect_short_mode)).
6400
6401 This is contrary to "regular" reductions, in which the types of all
6402 the arguments are the same as the type of the reduction variable.
6403 For "regular" reductions we can therefore use the same vector type
6404 (and also the same tree-code) when generating the epilog code and
6405 when generating the code inside the loop. */
6406
6407 enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info);
6408 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6409
6410 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6411 if (reduction_type == TREE_CODE_REDUCTION)
6412 {
6413 /* Check whether it's ok to change the order of the computation.
6414 Generally, when vectorizing a reduction we change the order of the
6415 computation. This may change the behavior of the program in some
6416 cases, so we need to check that this is ok. One exception is when
6417 vectorizing an outer-loop: the inner-loop is executed sequentially,
6418 and therefore vectorizing reductions in the inner-loop during
6419 outer-loop vectorization is safe. */
6420 if (needs_fold_left_reduction_p (scalar_type, orig_code))
6421 {
6422 /* When vectorizing a reduction chain w/o SLP the reduction PHI
6423 is not directly used in the stmt. */
6424 if (!only_slp_reduc_chain
6425 && reduc_chain_length != 1)
6426 {
6427 if (dump_enabled_p ())
6428 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6429 "in-order reduction chain without SLP.\n");
6430 return false;
6431 }
6432 STMT_VINFO_REDUC_TYPE (reduc_info)
6433 = reduction_type = FOLD_LEFT_REDUCTION;
6434 }
6435 else if (!commutative_tree_code (orig_code)
6436 || !associative_tree_code (orig_code))
6437 {
6438 if (dump_enabled_p ())
6439 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6440 "reduction: not commutative/associative");
6441 return false;
6442 }
6443 }
6444
6445 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6446 && ncopies > 1)
6447 {
6448 if (dump_enabled_p ())
6449 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6450 "multiple types in double reduction or condition "
6451 "reduction or fold-left reduction.\n");
6452 return false;
6453 }
6454
6455 internal_fn reduc_fn = IFN_LAST;
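 /* Pick the internal function that performs the epilogue reduction,
 e.g. IFN_REDUC_PLUS for a PLUS_EXPR reduction, or IFN_FOLD_LEFT_PLUS
 for an in-order (fold-left) one. */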
6456 if (reduction_type == TREE_CODE_REDUCTION
6457 || reduction_type == FOLD_LEFT_REDUCTION
6458 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6459 || reduction_type == CONST_COND_REDUCTION)
6460 {
6461 if (reduction_type == FOLD_LEFT_REDUCTION
6462 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6463 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6464 {
6465 if (reduc_fn != IFN_LAST
6466 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6467 OPTIMIZE_FOR_SPEED))
6468 {
6469 if (dump_enabled_p ())
6470 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6471 "reduc op not supported by target.\n");
6472
6473 reduc_fn = IFN_LAST;
6474 }
6475 }
6476 else
6477 {
6478 if (!nested_cycle || double_reduc)
6479 {
6480 if (dump_enabled_p ())
6481 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6482 "no reduc code for scalar code.\n");
6483
6484 return false;
6485 }
6486 }
6487 }
6488 else if (reduction_type == COND_REDUCTION)
6489 {
6490 int scalar_precision
6491 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6492 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6493 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6494 nunits_out);
6495
6496 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6497 OPTIMIZE_FOR_SPEED))
6498 reduc_fn = IFN_REDUC_MAX;
6499 }
6500 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
6501
6502 if (reduction_type != EXTRACT_LAST_REDUCTION
6503 && (!nested_cycle || double_reduc)
6504 && reduc_fn == IFN_LAST
6505 && !nunits_out.is_constant ())
6506 {
6507 if (dump_enabled_p ())
6508 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6509 "missing target support for reduction on"
6510 " variable-length vectors.\n");
6511 return false;
6512 }
6513
6514 /* For SLP reductions, see if there is a neutral value we can use. */
6515 tree neutral_op = NULL_TREE;
6516 if (slp_node)
6517 neutral_op = neutral_op_for_slp_reduction
6518 (slp_node_instance->reduc_phis, vectype_out, orig_code,
6519 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6520
6521 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6522 {
6523 /* We can't support in-order reductions of code such as this:
6524
6525 for (int i = 0; i < n1; ++i)
6526 for (int j = 0; j < n2; ++j)
6527 l += a[j];
6528
6529 since GCC effectively transforms the loop when vectorizing:
6530
6531 for (int i = 0; i < n1 / VF; ++i)
6532 for (int j = 0; j < n2; ++j)
6533 for (int k = 0; k < VF; ++k)
6534 l += a[j];
6535
6536 which is a reassociation of the original operation. */
6537 if (dump_enabled_p ())
6538 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6539 "in-order double reduction not supported.\n");
6540
6541 return false;
6542 }
6543
6544 if (reduction_type == FOLD_LEFT_REDUCTION
6545 && slp_node
6546 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6547 {
6548 /* We cannot use in-order reductions in this case because there is
6549 an implicit reassociation of the operations involved. */
6550 if (dump_enabled_p ())
6551 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6552 "in-order unchained SLP reductions not supported.\n");
6553 return false;
6554 }
6555
6556 /* For double reductions, and for SLP reductions with a neutral value,
6557 we construct a variable-length initial vector by loading a vector
6558 full of the neutral value and then shift-and-inserting the start
6559 values into the low-numbered elements. */
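 /* E.g. for a sum with start value INIT this builds the vector
 { INIT, 0, ..., 0 }: a vector of the neutral value 0 with INIT
 shifted into element 0. */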
6560 if ((double_reduc || neutral_op)
6561 && !nunits_out.is_constant ()
6562 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6563 vectype_out, OPTIMIZE_FOR_SPEED))
6564 {
6565 if (dump_enabled_p ())
6566 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6567 "reduction on variable-length vectors requires"
6568 " target support for a vector-shift-and-insert"
6569 " operation.\n");
6570 return false;
6571 }
6572
6573 /* Check extra constraints for variable-length unchained SLP reductions. */
6574 if (STMT_SLP_TYPE (stmt_info)
6575 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6576 && !nunits_out.is_constant ())
6577 {
6578 /* We checked above that we could build the initial vector when
6579 there's a neutral element value. Check here for the case in
6580 which each SLP statement has its own initial value and in which
6581 that value needs to be repeated for every instance of the
6582 statement within the initial vector. */
6583 unsigned int group_size = SLP_TREE_LANES (slp_node);
6584 if (!neutral_op
6585 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
6586 TREE_TYPE (vectype_out)))
6587 {
6588 if (dump_enabled_p ())
6589 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6590 "unsupported form of SLP reduction for"
6591 " variable-length vectors: cannot build"
6592 " initial vector.\n");
6593 return false;
6594 }
6595 /* The epilogue code relies on the number of elements being a multiple
6596 of the group size. The duplicate-and-interleave approach to setting
6597 up the initial vector does too. */
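 /* E.g. for a group of two reduction statements the lanes are laid
 out as r0, r1, r0, r1, ... so the number of vector elements must
 be even for each vector to hold complete pairs. */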
6598 if (!multiple_p (nunits_out, group_size))
6599 {
6600 if (dump_enabled_p ())
6601 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6602 "unsupported form of SLP reduction for"
6603 " variable-length vectors: the vector size"
6604 " is not a multiple of the number of results.\n");
6605 return false;
6606 }
6607 }
6608
6609 if (reduction_type == COND_REDUCTION)
6610 {
6611 widest_int ni;
6612
6613 if (! max_loop_iterations (loop, &ni))
6614 {
6615 if (dump_enabled_p ())
6616 dump_printf_loc (MSG_NOTE, vect_location,
6617 "loop count not known, cannot create cond "
6618 "reduction.\n");
6619 return false;
6620 }
6621 /* Convert backedges to iterations. */
6622 ni += 1;
6623
6624 /* The additional index will be the same type as the condition. Check
6625 that the loop can fit into this less one (because we'll use up the
6626 zero slot for when there are no matches). */
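 /* E.g. with an 8-bit index type at most 254 iterations can be
 handled: value 0 is reserved for "no match", so NI must be
 strictly below the maximum index value of 255. */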
6627 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6628 if (wi::geu_p (ni, wi::to_widest (max_index)))
6629 {
6630 if (dump_enabled_p ())
6631 dump_printf_loc (MSG_NOTE, vect_location,
6632 "loop size is greater than data size.\n");
6633 return false;
6634 }
6635 }
6636
6637 /* In case the vectorization factor (VF) is bigger than the number
6638 of elements that we can fit in a vectype (nunits), we have to generate
6639 more than one vector stmt - i.e - we need to "unroll" the
6640 vector stmt by a factor VF/nunits. For more details see documentation
6641 in vectorizable_operation. */
6642
6643 /* If the reduction is used in an outer loop we need to generate
6644 VF intermediate results, like so (e.g. for ncopies=2):
6645 r0 = phi (init, r0)
6646 r1 = phi (init, r1)
6647 r0 = x0 + r0;
6648 r1 = x1 + r1;
6649 (i.e. we generate VF results in 2 registers).
6650 In this case we have a separate def-use cycle for each copy, and therefore
6651 for each copy we get the vector def for the reduction variable from the
6652 respective phi node created for this copy.
6653
6654 Otherwise (the reduction is unused in the loop nest), we can combine
6655 together intermediate results, like so (e.g. for ncopies=2):
6656 r = phi (init, r)
6657 r = x0 + r;
6658 r = x1 + r;
6659 (i.e. we generate VF/2 results in a single register).
6660 In this case for each copy we get the vector def for the reduction variable
6661 from the vectorized reduction operation generated in the previous iteration.
6662
6663 This only works when we see both the reduction PHI and its only consumer
6664 in vectorizable_reduction and there are no intermediate stmts
6665 participating. */
6666 if (ncopies > 1
6667 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6668 && reduc_chain_length == 1)
6669 single_defuse_cycle = true;
6670
6671 if (single_defuse_cycle || lane_reduc_code_p)
6672 {
6673 gcc_assert (code != COND_EXPR);
6674
6675 /* 4. Supportable by target? */
6676 bool ok = true;
6677
6678 /* 4.1. check support for the operation in the loop */
6679 optab optab = optab_for_tree_code (code, vectype_in, optab_vector);
6680 if (!optab)
6681 {
6682 if (dump_enabled_p ())
6683 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6684 "no optab.\n");
6685 ok = false;
6686 }
6687
6688 machine_mode vec_mode = TYPE_MODE (vectype_in);
6689 if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6690 {
6691 if (dump_enabled_p ())
6692 dump_printf (MSG_NOTE, "op not supported by target.\n");
6693 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6694 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6695 ok = false;
6696 else
6697 if (dump_enabled_p ())
6698 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6699 }
6700
6701 /* Worthwhile without SIMD support? */
6702 if (ok
6703 && !VECTOR_MODE_P (TYPE_MODE (vectype_in))
6704 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6705 {
6706 if (dump_enabled_p ())
6707 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6708 "not worthwhile without SIMD support.\n");
6709 ok = false;
6710 }
6711
6712 /* Lane-reducing operations have to go through vect_transform_reduction.
6713 For the other cases try without the single cycle optimization. */
6714 if (!ok)
6715 {
6716 if (lane_reduc_code_p)
6717 return false;
6718 else
6719 single_defuse_cycle = false;
6720 }
6721 }
6722 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
6723
6724 /* If the reduction stmt is one of the patterns that have lane
6725 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
6726 if ((ncopies > 1 && ! single_defuse_cycle)
6727 && lane_reduc_code_p)
6728 {
6729 if (dump_enabled_p ())
6730 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6731 "multi def-use cycle not possible for lane-reducing "
6732 "reduction operation\n");
6733 return false;
6734 }
6735
6736 if (slp_node
6737 && !(!single_defuse_cycle
6738 && code != DOT_PROD_EXPR
6739 && code != WIDEN_SUM_EXPR
6740 && code != SAD_EXPR
6741 && reduction_type != FOLD_LEFT_REDUCTION))
6742 for (i = 0; i < op_type; i++)
6743 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_in))
6744 {
6745 if (dump_enabled_p ())
6746 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6747 "incompatible vector types for invariants\n");
6748 return false;
6749 }
6750
6751 if (slp_node)
6752 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6753 else
6754 vec_num = 1;
6755
6756 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
6757 reduction_type, ncopies, cost_vec);
6758 if (dump_enabled_p ()
6759 && reduction_type == FOLD_LEFT_REDUCTION)
6760 dump_printf_loc (MSG_NOTE, vect_location,
6761 "using an in-order (fold-left) reduction.\n");
6762 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
6763 /* All but single def-use cycle optimized, lane-reducing and fold-left
6764 reductions go through their own vectorizable_* routines. */
6765 if (!single_defuse_cycle
6766 && code != DOT_PROD_EXPR
6767 && code != WIDEN_SUM_EXPR
6768 && code != SAD_EXPR
6769 && reduction_type != FOLD_LEFT_REDUCTION)
6770 {
6771 stmt_vec_info tem
6772 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6773 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
6774 {
6775 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
6776 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
6777 }
6778 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
6779 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
6780 }
6781 else if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
6782 {
6783 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6784 internal_fn cond_fn = get_conditional_internal_fn (code);
6785
6786 if (reduction_type != FOLD_LEFT_REDUCTION
6787 && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
6788 && (cond_fn == IFN_LAST
6789 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6790 OPTIMIZE_FOR_SPEED)))
6791 {
6792 if (dump_enabled_p ())
6793 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6794 "can't use a fully-masked loop because no"
6795 " conditional operation is available.\n");
6796 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6797 }
6798 else if (reduction_type == FOLD_LEFT_REDUCTION
6799 && reduc_fn == IFN_LAST
6800 && !expand_vec_cond_expr_p (vectype_in,
6801 truth_type_for (vectype_in),
6802 SSA_NAME))
6803 {
6804 if (dump_enabled_p ())
6805 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6806 "can't use a fully-masked loop because no"
6807 " conditional operation is available.\n");
6808 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6809 }
6810 else
6811 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
6812 vectype_in, NULL);
6813 }
6814 return true;
6815 }
6816
6817 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
6818 value. */
6819
6820 bool
6821 vect_transform_reduction (loop_vec_info loop_vinfo,
6822 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
6823 gimple **vec_stmt, slp_tree slp_node)
6824 {
6825 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6826 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6827 int i;
6828 int ncopies;
6829 int vec_num;
6830
6831 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6832 gcc_assert (reduc_info->is_reduc_info);
6833
6834 if (nested_in_vect_loop_p (loop, stmt_info))
6835 {
6836 loop = loop->inner;
6837 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
6838 }
6839
6840 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6841 enum tree_code code = gimple_assign_rhs_code (stmt);
6842 int op_type = TREE_CODE_LENGTH (code);
6843
6844 /* Flatten RHS. */
6845 tree ops[3];
6846 switch (get_gimple_rhs_class (code))
6847 {
6848 case GIMPLE_TERNARY_RHS:
6849 ops[2] = gimple_assign_rhs3 (stmt);
6850 /* Fall thru. */
6851 case GIMPLE_BINARY_RHS:
6852 ops[0] = gimple_assign_rhs1 (stmt);
6853 ops[1] = gimple_assign_rhs2 (stmt);
6854 break;
6855 default:
6856 gcc_unreachable ();
6857 }
6858
6859 /* All uses but the last are expected to be defined in the loop.
6860 The last use is the reduction variable. In case of a nested cycle this
6861 assumption is not true: we use reduc_index to record the index of the
6862 reduction variable. */
6863 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
6864 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6865 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
6866 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
6867
6868 if (slp_node)
6869 {
6870 ncopies = 1;
6871 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6872 }
6873 else
6874 {
6875 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6876 vec_num = 1;
6877 }
6878
6879 internal_fn cond_fn = get_conditional_internal_fn (code);
6880 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6881 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
6882
6883 /* Transform. */
6884 tree new_temp = NULL_TREE;
6885 auto_vec<tree> vec_oprnds0;
6886 auto_vec<tree> vec_oprnds1;
6887 auto_vec<tree> vec_oprnds2;
6888 tree def0;
6889
6890 if (dump_enabled_p ())
6891 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6892
6893 /* FORNOW: Multiple types are not supported for condition. */
6894 if (code == COND_EXPR)
6895 gcc_assert (ncopies == 1);
6896
6897 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
6898
6899 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6900 if (reduction_type == FOLD_LEFT_REDUCTION)
6901 {
6902 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
6903 return vectorize_fold_left_reduction
6904 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
6905 reduc_fn, ops, vectype_in, reduc_index, masks);
6906 }
6907
6908 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
6909 gcc_assert (single_defuse_cycle
6910 || code == DOT_PROD_EXPR
6911 || code == WIDEN_SUM_EXPR
6912 || code == SAD_EXPR);
6913
6914 /* Create the destination vector */
6915 tree scalar_dest = gimple_assign_lhs (stmt);
6916 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6917
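 /* Gather the vector defs for all operands; with the single def-use
 cycle optimization the reduction operand is skipped here and
 seeded from its PHI below. */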
6918 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
6919 single_defuse_cycle && reduc_index == 0
6920 ? NULL_TREE : ops[0], &vec_oprnds0,
6921 single_defuse_cycle && reduc_index == 1
6922 ? NULL_TREE : ops[1], &vec_oprnds1,
6923 op_type == ternary_op
6924 && !(single_defuse_cycle && reduc_index == 2)
6925 ? ops[2] : NULL_TREE, &vec_oprnds2);
6926 if (single_defuse_cycle)
6927 {
6928 gcc_assert (!slp_node);
6929 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6930 ops[reduc_index],
6931 reduc_index == 0 ? &vec_oprnds0
6932 : (reduc_index == 1 ? &vec_oprnds1
6933 : &vec_oprnds2));
6934 }
6935
6936 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6937 {
6938 gimple *new_stmt;
6939 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
6940 if (masked_loop_p && !mask_by_cond_expr)
6941 {
6942 /* Make sure that the reduction accumulator is vop[0]. */
6943 if (reduc_index == 1)
6944 {
6945 gcc_assert (commutative_tree_code (code));
6946 std::swap (vop[0], vop[1]);
6947 }
6948 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
6949 vectype_in, i);
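 /* Emit COND_<OP> <MASK, ACC, VOP1, ACC> so that inactive lanes
 simply keep the accumulator value. */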
6950 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
6951 vop[0], vop[1], vop[0]);
6952 new_temp = make_ssa_name (vec_dest, call);
6953 gimple_call_set_lhs (call, new_temp);
6954 gimple_call_set_nothrow (call, true);
6955 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
6956 new_stmt = call;
6957 }
6958 else
6959 {
6960 if (op_type == ternary_op)
6961 vop[2] = vec_oprnds2[i];
6962
6963 if (masked_loop_p && mask_by_cond_expr)
6964 {
6965 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
6966 vectype_in, i);
6967 build_vect_cond_expr (code, vop, mask, gsi);
6968 }
6969
6970 new_stmt = gimple_build_assign (vec_dest, code,
6971 vop[0], vop[1], vop[2]);
6972 new_temp = make_ssa_name (vec_dest, new_stmt);
6973 gimple_assign_set_lhs (new_stmt, new_temp);
6974 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
6975 }
6976
6977 if (slp_node)
6978 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6979 else if (single_defuse_cycle
6980 && i < ncopies - 1)
6981 {
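 /* With the single def-use cycle optimization feed the result of
 this copy back in as the reduction operand of the next copy. */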
6982 if (reduc_index == 0)
6983 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
6984 else if (reduc_index == 1)
6985 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
6986 else if (reduc_index == 2)
6987 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
6988 }
6989 else
6990 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6991 }
6992
6993 if (!slp_node)
6994 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
6995
6996 return true;
6997 }
6998
6999 /* Transform phase of a cycle PHI. */
7000
7001 bool
7002 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7003 stmt_vec_info stmt_info, gimple **vec_stmt,
7004 slp_tree slp_node, slp_instance slp_node_instance)
7005 {
7006 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7007 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7008 int i;
7009 int ncopies;
7010 int j;
7011 bool nested_cycle = false;
7012 int vec_num;
7013
7014 if (nested_in_vect_loop_p (loop, stmt_info))
7015 {
7016 loop = loop->inner;
7017 nested_cycle = true;
7018 }
7019
7020 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7021 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7022 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7023 gcc_assert (reduc_info->is_reduc_info);
7024
7025 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7026 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7027 /* Leave the scalar phi in place. */
7028 return true;
7029
7030 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7031 /* For a nested cycle we do not fill the above. */
7032 if (!vectype_in)
7033 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7034 gcc_assert (vectype_in);
7035
7036 if (slp_node)
7037 {
7038 /* The size vect_schedule_slp_instance computes is off for us. */
7039 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7040 * SLP_TREE_LANES (slp_node), vectype_in);
7041 ncopies = 1;
7042 }
7043 else
7044 {
7045 vec_num = 1;
7046 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7047 }
7048
7049 /* Check whether we should use a single PHI node and accumulate
7050 vectors to one before the backedge. */
7051 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7052 ncopies = 1;
7053
7054 /* Create the destination vector */
7055 gphi *phi = as_a <gphi *> (stmt_info->stmt);
7056 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7057 vectype_out);
7058
7059 /* Get the loop-entry arguments. */
7060 tree vec_initial_def;
7061 auto_vec<tree> vec_initial_defs;
7062 if (slp_node)
7063 {
7064 vec_initial_defs.reserve (vec_num);
7065 gcc_assert (slp_node == slp_node_instance->reduc_phis);
7066 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
7067 tree neutral_op
7068 = neutral_op_for_slp_reduction (slp_node, vectype_out,
7069 STMT_VINFO_REDUC_CODE (reduc_info),
7070 first != NULL);
7071 get_initial_defs_for_reduction (loop_vinfo, slp_node_instance->reduc_phis,
7072 &vec_initial_defs, vec_num,
7073 first != NULL, neutral_op);
7074 }
7075 else
7076 {
7077 /* Get at the scalar def before the loop, that defines the initial
7078 value of the reduction variable. */
7079 tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi,
7080 loop_preheader_edge (loop));
7081 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7082 and we can't use zero for induc_val, use initial_def. Similarly
7083 for REDUC_MIN and initial_def larger than the base. */
7084 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7085 {
7086 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7087 if (TREE_CODE (initial_def) == INTEGER_CST
7088 && !integer_zerop (induc_val)
7089 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7090 && tree_int_cst_lt (initial_def, induc_val))
7091 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7092 && tree_int_cst_lt (induc_val, initial_def))))
7093 {
7094 induc_val = initial_def;
7095 /* Communicate we used the initial_def to epilogue
7096 generation. */
7097 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7098 }
7099 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7100 vec_initial_defs.create (ncopies);
7101 for (i = 0; i < ncopies; ++i)
7102 vec_initial_defs.quick_push (vec_initial_def);
7103 }
7104 else if (nested_cycle)
7105 {
7106 /* Do not use an adjustment def as that case is not supported
7107 correctly if ncopies is not one. */
7108 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
7109 ncopies, initial_def,
7110 &vec_initial_defs);
7111 }
7112 else
7113 {
7114 tree adjustment_def = NULL_TREE;
7115 tree *adjustment_defp = &adjustment_def;
7116 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7117 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7118 adjustment_defp = NULL;
7119 vec_initial_def
7120 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info, code,
7121 initial_def, adjustment_defp);
7122 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def;
7123 vec_initial_defs.create (ncopies);
7124 for (i = 0; i < ncopies; ++i)
7125 vec_initial_defs.quick_push (vec_initial_def);
7126 }
7127 }
7128
7129 /* Generate the reduction PHIs upfront. */
7130 for (i = 0; i < vec_num; i++)
7131 {
7132 tree vec_init_def = vec_initial_defs[i];
7133 for (j = 0; j < ncopies; j++)
7134 {
7135 /* Create the reduction-phi that defines the reduction
7136 operand. */
7137 gphi *new_phi = create_phi_node (vec_dest, loop->header);
7138
7139 /* Set the loop-entry arg of the reduction-phi. */
7140 if (j != 0 && nested_cycle)
7141 vec_init_def = vec_initial_defs[j];
7142 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7143 UNKNOWN_LOCATION);
7144
7145 /* The loop-latch arg is set in epilogue processing. */
7146
7147 if (slp_node)
7148 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7149 else
7150 {
7151 if (j == 0)
7152 *vec_stmt = new_phi;
7153 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7154 }
7155 }
7156 }
7157
7158 return true;
7159 }
7160
7161 /* Vectorizes LC PHIs (single-argument loop-closed PHIs). */
7162
7163 bool
7164 vectorizable_lc_phi (loop_vec_info loop_vinfo,
7165 stmt_vec_info stmt_info, gimple **vec_stmt,
7166 slp_tree slp_node)
7167 {
7168 if (!loop_vinfo
7169 || !is_a <gphi *> (stmt_info->stmt)
7170 || gimple_phi_num_args (stmt_info->stmt) != 1)
7171 return false;
7172
7173 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7174 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7175 return false;
7176
7177 if (!vec_stmt) /* transformation not required. */
7178 {
7179 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7180 return true;
7181 }
7182
7183 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7184 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7185 basic_block bb = gimple_bb (stmt_info->stmt);
7186 edge e = single_pred_edge (bb);
7187 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7188 auto_vec<tree> vec_oprnds;
7189 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
7190 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
7191 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
7192 for (unsigned i = 0; i < vec_oprnds.length (); i++)
7193 {
7194 /* Create the vectorized LC PHI node. */
7195 gphi *new_phi = create_phi_node (vec_dest, bb);
7196 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7197 if (slp_node)
7198 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7199 else
7200 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7201 }
7202 if (!slp_node)
7203 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7204
7205 return true;
7206 }
7207
7208
7209 /* Function vect_min_worthwhile_factor.
7210
7211 For a loop where we could vectorize the operation indicated by CODE,
7212 return the minimum vectorization factor that makes it worthwhile
7213 to use generic vectors. */
7214 static unsigned int
7215 vect_min_worthwhile_factor (enum tree_code code)
7216 {
7217 switch (code)
7218 {
7219 case PLUS_EXPR:
7220 case MINUS_EXPR:
7221 case NEGATE_EXPR:
7222 return 4;
7223
7224 case BIT_AND_EXPR:
7225 case BIT_IOR_EXPR:
7226 case BIT_XOR_EXPR:
7227 case BIT_NOT_EXPR:
7228 return 2;
7229
7230 default:
7231 return INT_MAX;
7232 }
7233 }
7234
7235 /* Return true if VINFO indicates we are doing loop vectorization and if
7236 it is worth decomposing CODE operations into scalar operations for
7237 that loop's vectorization factor. */
7238
7239 bool
7240 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7241 {
7242 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7243 unsigned HOST_WIDE_INT value;
7244 return (loop_vinfo
7245 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7246 && value >= vect_min_worthwhile_factor (code));
7247 }
7248
7249 /* Function vectorizable_induction
7250
7251 Check if STMT_INFO performs an induction computation that can be vectorized.
7252 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7253 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7254 Return true if STMT_INFO is vectorizable in this way. */
7255
7256 bool
7257 vectorizable_induction (loop_vec_info loop_vinfo,
7258 stmt_vec_info stmt_info,
7259 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7260 gimple **vec_stmt, slp_tree slp_node,
7261 stmt_vector_for_cost *cost_vec)
7262 {
7263 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7264 unsigned ncopies;
7265 bool nested_in_vect_loop = false;
7266 class loop *iv_loop;
7267 tree vec_def;
7268 edge pe = loop_preheader_edge (loop);
7269 basic_block new_bb;
7270 tree new_vec, vec_init, vec_step, t;
7271 tree new_name;
7272 gimple *new_stmt;
7273 gphi *induction_phi;
7274 tree induc_def, vec_dest;
7275 tree init_expr, step_expr;
7276 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7277 unsigned i;
7278 tree expr;
7279 gimple_seq stmts;
7280 gimple_stmt_iterator si;
7281
7282 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7283 if (!phi)
7284 return false;
7285
7286 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7287 return false;
7288
7289 /* Make sure it was recognized as induction computation. */
7290 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7291 return false;
7292
7293 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7294 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7295
7296 if (slp_node)
7297 ncopies = 1;
7298 else
7299 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7300 gcc_assert (ncopies >= 1);
7301
7302 /* FORNOW. These restrictions should be relaxed. */
7303 if (nested_in_vect_loop_p (loop, stmt_info))
7304 {
7305 imm_use_iterator imm_iter;
7306 use_operand_p use_p;
7307 gimple *exit_phi;
7308 edge latch_e;
7309 tree loop_arg;
7310
7311 if (ncopies > 1)
7312 {
7313 if (dump_enabled_p ())
7314 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7315 "multiple types in nested loop.\n");
7316 return false;
7317 }
7318
7319 /* FORNOW: outer loop induction with SLP not supported. */
7320 if (STMT_SLP_TYPE (stmt_info))
7321 return false;
7322
7323 exit_phi = NULL;
7324 latch_e = loop_latch_edge (loop->inner);
7325 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7326 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7327 {
7328 gimple *use_stmt = USE_STMT (use_p);
7329 if (is_gimple_debug (use_stmt))
7330 continue;
7331
7332 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7333 {
7334 exit_phi = use_stmt;
7335 break;
7336 }
7337 }
7338 if (exit_phi)
7339 {
7340 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7341 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7342 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7343 {
7344 if (dump_enabled_p ())
7345 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7346 "inner-loop induction only used outside "
7347 "of the outer vectorized loop.\n");
7348 return false;
7349 }
7350 }
7351
7352 nested_in_vect_loop = true;
7353 iv_loop = loop->inner;
7354 }
7355 else
7356 iv_loop = loop;
7357 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7358
7359 if (slp_node && !nunits.is_constant ())
7360 {
7361 /* The current SLP code creates the initial value element-by-element. */
7362 if (dump_enabled_p ())
7363 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7364 "SLP induction not supported for variable-length"
7365 " vectors.\n");
7366 return false;
7367 }
7368
7369 if (!vec_stmt) /* transformation not required. */
7370 {
7371 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7372 DUMP_VECT_SCOPE ("vectorizable_induction");
7373 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7374 return true;
7375 }
7376
7377 /* Transform. */
7378
7379 /* Compute a vector variable, initialized with the first VF values of
7380 the induction variable. E.g., for an iv with IV_PHI='X' and
7381 evolution S, for a vector of 4 units, we want to compute:
7382 [X, X + S, X + 2*S, X + 3*S]. */
7383
7384 if (dump_enabled_p ())
7385 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7386
7387 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7388 gcc_assert (step_expr != NULL_TREE);
7389 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
7390
7391 pe = loop_preheader_edge (iv_loop);
7392 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7393 loop_preheader_edge (iv_loop));
7394
7395 stmts = NULL;
7396 if (!nested_in_vect_loop)
7397 {
7398 /* Convert the initial value to the IV update type. */
7399 tree new_type = TREE_TYPE (step_expr);
7400 init_expr = gimple_convert (&stmts, new_type, init_expr);
7401
7402 /* If we are using the loop mask to "peel" for alignment then we need
7403 to adjust the start value here. */
7404 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7405 if (skip_niters != NULL_TREE)
7406 {
7407 if (FLOAT_TYPE_P (vectype))
7408 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7409 skip_niters);
7410 else
7411 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7412 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7413 skip_niters, step_expr);
7414 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7415 init_expr, skip_step);
7416 }
7417 }
7418
7419 if (stmts)
7420 {
7421 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7422 gcc_assert (!new_bb);
7423 }
7424
7425 /* Find the first insertion point in the BB. */
7426 basic_block bb = gimple_bb (phi);
7427 si = gsi_after_labels (bb);
7428
7429 /* For SLP induction we have to generate several IVs as for example
7430 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7431 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7432 [VF*S, VF*S, VF*S, VF*S] for all. */
7433 if (slp_node)
7434 {
7435 /* Enforced above. */
7436 unsigned int const_nunits = nunits.to_constant ();
7437
7438 /* Generate [VF*S, VF*S, ... ]. */
7439 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7440 {
7441 expr = build_int_cst (integer_type_node, vf);
7442 expr = fold_convert (TREE_TYPE (step_expr), expr);
7443 }
7444 else
7445 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7446 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7447 expr, step_expr);
7448 if (! CONSTANT_CLASS_P (new_name))
7449 new_name = vect_init_vector (loop_vinfo, stmt_info, new_name,
7450 TREE_TYPE (step_expr), NULL);
7451 new_vec = build_vector_from_val (step_vectype, new_name);
7452 vec_step = vect_init_vector (loop_vinfo, stmt_info,
7453 new_vec, step_vectype, NULL);
7454
7455 /* Now generate the IVs. */
7456 unsigned group_size = SLP_TREE_LANES (slp_node);
7457 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7458 unsigned elts = const_nunits * nvects;
7459 /* Compute the number of distinct IVs we need. First reduce
7460 group_size if it is a multiple of const_nunits so we get
7461 one IV for a group_size of 4 but const_nunits 2. */
7462 unsigned group_sizep = group_size;
7463 if (group_sizep % const_nunits == 0)
7464 group_sizep = group_sizep / const_nunits;
7465 unsigned nivs = least_common_multiple (group_sizep,
7466 const_nunits) / const_nunits;
7467 gcc_assert (elts % group_size == 0);
7468 tree elt = init_expr;
7469 unsigned ivn;
7470 for (ivn = 0; ivn < nivs; ++ivn)
7471 {
7472 tree_vector_builder elts (step_vectype, const_nunits, 1);
7473 stmts = NULL;
7474 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7475 {
7476 if (ivn*const_nunits + eltn >= group_size
7477 && (ivn * const_nunits + eltn) % group_size == 0)
7478 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7479 elt, step_expr);
7480 elts.quick_push (elt);
7481 }
7482 vec_init = gimple_build_vector (&stmts, &elts);
7483 vec_init = gimple_convert (&stmts, vectype, vec_init);
7484 if (stmts)
7485 {
7486 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7487 gcc_assert (!new_bb);
7488 }
7489
7490 /* Create the induction-phi that defines the induction-operand. */
7491 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7492 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7493 induc_def = PHI_RESULT (induction_phi);
7494
7495 /* Create the iv update inside the loop */
7496 gimple_seq stmts = NULL;
7497 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7498 vec_def = gimple_build (&stmts,
7499 PLUS_EXPR, step_vectype, vec_def, vec_step);
7500 vec_def = gimple_convert (&stmts, vectype, vec_def);
7501 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7502
7503 /* Set the arguments of the phi node: */
7504 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7505 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7506 UNKNOWN_LOCATION);
7507
7508 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
7509 }
7510 /* Fill up to the number of vectors we need for the whole group. */
7511 nivs = least_common_multiple (group_size,
7512 const_nunits) / const_nunits;
7513 for (; ivn < nivs; ++ivn)
7514 SLP_TREE_VEC_STMTS (slp_node)
7515 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
7516
7517 /* Re-use IVs when we can. */
7518 if (ivn < nvects)
7519 {
7520 unsigned vfp
7521 = least_common_multiple (group_size, const_nunits) / group_size;
7522 /* Generate [VF'*S, VF'*S, ... ]. */
7523 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7524 {
7525 expr = build_int_cst (integer_type_node, vfp);
7526 expr = fold_convert (TREE_TYPE (step_expr), expr);
7527 }
7528 else
7529 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7530 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7531 expr, step_expr);
7532 if (! CONSTANT_CLASS_P (new_name))
7533 new_name = vect_init_vector (loop_vinfo, stmt_info, new_name,
7534 TREE_TYPE (step_expr), NULL);
7535 new_vec = build_vector_from_val (step_vectype, new_name);
7536 vec_step = vect_init_vector (loop_vinfo, stmt_info, new_vec,
7537 step_vectype, NULL);
7538 for (; ivn < nvects; ++ivn)
7539 {
7540 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
7541 tree def;
7542 if (gimple_code (iv) == GIMPLE_PHI)
7543 def = gimple_phi_result (iv);
7544 else
7545 def = gimple_assign_lhs (iv);
7546 gimple_seq stmts = NULL;
7547 def = gimple_convert (&stmts, step_vectype, def);
7548 def = gimple_build (&stmts,
7549 PLUS_EXPR, step_vectype, def, vec_step);
7550 def = gimple_convert (&stmts, vectype, def);
7551 if (gimple_code (iv) == GIMPLE_PHI)
7552 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7553 else
7554 {
7555 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7556 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
7557 }
7558 SLP_TREE_VEC_STMTS (slp_node)
7559 .quick_push (SSA_NAME_DEF_STMT (def));
7560 }
7561 }
7562
7563 return true;
7564 }
7565
7566 /* Create the vector that holds the initial_value of the induction. */
7567 if (nested_in_vect_loop)
7568 {
7569 /* iv_loop is nested in the loop to be vectorized. init_expr has already
7570 been created during vectorization of previous stmts. We obtain it
7571 from the STMT_VINFO_VEC_STMTS of the defining stmt. */
7572 auto_vec<tree> vec_inits;
7573 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7574 init_expr, &vec_inits);
7575 vec_init = vec_inits[0];
7576 /* If the initial value is not of proper type, convert it. */
7577 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7578 {
7579 new_stmt
7580 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7581 vect_simple_var,
7582 "vec_iv_"),
7583 VIEW_CONVERT_EXPR,
7584 build1 (VIEW_CONVERT_EXPR, vectype,
7585 vec_init));
7586 vec_init = gimple_assign_lhs (new_stmt);
7587 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7588 new_stmt);
7589 gcc_assert (!new_bb);
7590 }
7591 }
7592 else
7593 {
7594 /* iv_loop is the loop to be vectorized. Create:
7595 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7596 stmts = NULL;
7597 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
7598
7599 unsigned HOST_WIDE_INT const_nunits;
7600 if (nunits.is_constant (&const_nunits))
7601 {
7602 tree_vector_builder elts (step_vectype, const_nunits, 1);
7603 elts.quick_push (new_name);
7604 for (i = 1; i < const_nunits; i++)
7605 {
7606 /* Create: new_name_i = new_name + step_expr */
7607 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7608 new_name, step_expr);
7609 elts.quick_push (new_name);
7610 }
7611 /* Create a vector from [new_name_0, new_name_1, ...,
7612 new_name_nunits-1] */
7613 vec_init = gimple_build_vector (&stmts, &elts);
7614 }
7615 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7616 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7617 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
7618 new_name, step_expr);
7619 else
7620 {
7621 /* Build:
7622 [base, base, base, ...]
7623 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7624 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7625 gcc_assert (flag_associative_math);
7626 tree index = build_index_vector (step_vectype, 0, 1);
7627 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7628 new_name);
7629 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7630 step_expr);
7631 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
7632 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
7633 vec_init, step_vec);
7634 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
7635 vec_init, base_vec);
7636 }
7637 vec_init = gimple_convert (&stmts, vectype, vec_init);
7638
7639 if (stmts)
7640 {
7641 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7642 gcc_assert (!new_bb);
7643 }
7644 }
7645
7646
7647 /* Create the vector that holds the step of the induction. */
7648 if (nested_in_vect_loop)
7649 /* iv_loop is nested in the loop to be vectorized. Generate:
7650 vec_step = [S, S, S, S] */
7651 new_name = step_expr;
7652 else
7653 {
7654 /* iv_loop is the loop to be vectorized. Generate:
7655 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7656 gimple_seq seq = NULL;
7657 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7658 {
7659 expr = build_int_cst (integer_type_node, vf);
7660 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7661 }
7662 else
7663 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7664 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7665 expr, step_expr);
7666 if (seq)
7667 {
7668 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7669 gcc_assert (!new_bb);
7670 }
7671 }
7672
7673 t = unshare_expr (new_name);
7674 gcc_assert (CONSTANT_CLASS_P (new_name)
7675 || TREE_CODE (new_name) == SSA_NAME);
7676 new_vec = build_vector_from_val (step_vectype, t);
7677 vec_step = vect_init_vector (loop_vinfo, stmt_info,
7678 new_vec, step_vectype, NULL);
7679
7680
7681 /* Create the following def-use cycle:
7682 loop prolog:
7683 vec_init = ...
7684 vec_step = ...
7685 loop:
7686 vec_iv = PHI <vec_init, vec_loop>
7687 ...
7688 STMT
7689 ...
7690 vec_loop = vec_iv + vec_step; */
7691
7692 /* Create the induction-phi that defines the induction-operand. */
7693 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7694 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7695 induc_def = PHI_RESULT (induction_phi);
7696
7697 /* Create the iv update inside the loop */
7698 stmts = NULL;
7699 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7700 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
7701 vec_def = gimple_convert (&stmts, vectype, vec_def);
7702 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7703 new_stmt = SSA_NAME_DEF_STMT (vec_def);
7704
7705 /* Set the arguments of the phi node: */
7706 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7707 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7708 UNKNOWN_LOCATION);
7709
7710 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
7711 *vec_stmt = induction_phi;
7712
7713 /* In case the vectorization factor (VF) is bigger than the number
7714 of elements that we can fit in a vectype (nunits), we have to generate
7715 more than one vector stmt - i.e - we need to "unroll" the
7716 vector stmt by a factor VF/nunits. For more details see documentation
7717 in vectorizable_operation. */
7718
7719 if (ncopies > 1)
7720 {
7721 gimple_seq seq = NULL;
7722 /* FORNOW. This restriction should be relaxed. */
7723 gcc_assert (!nested_in_vect_loop);
7724
7725 /* Create the vector that holds the step of the induction. */
7726 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7727 {
7728 expr = build_int_cst (integer_type_node, nunits);
7729 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7730 }
7731 else
7732 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7733 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7734 expr, step_expr);
7735 if (seq)
7736 {
7737 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7738 gcc_assert (!new_bb);
7739 }
7740
7741 t = unshare_expr (new_name);
7742 gcc_assert (CONSTANT_CLASS_P (new_name)
7743 || TREE_CODE (new_name) == SSA_NAME);
7744 new_vec = build_vector_from_val (step_vectype, t);
7745 vec_step = vect_init_vector (loop_vinfo, stmt_info,
7746 new_vec, step_vectype, NULL);
7747
7748 vec_def = induc_def;
7749 for (i = 1; i < ncopies; i++)
7750 {
7751 /* vec_i = vec_prev + vec_step */
7752 gimple_seq stmts = NULL;
7753 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
7754 vec_def = gimple_build (&stmts,
7755 PLUS_EXPR, step_vectype, vec_def, vec_step);
7756 vec_def = gimple_convert (&stmts, vectype, vec_def);
7757
7758 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7759 new_stmt = SSA_NAME_DEF_STMT (vec_def);
7760 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7761 }
7762 }
7763
7764 if (dump_enabled_p ())
7765 dump_printf_loc (MSG_NOTE, vect_location,
7766 "transform induction: created def-use cycle: %G%G",
7767 induction_phi, SSA_NAME_DEF_STMT (vec_def));
7768
7769 return true;
7770 }
7771
7772 /* Function vectorizable_live_operation.
7773
7774 STMT_INFO computes a value that is used outside the loop. Check if
7775 it can be supported. */
7776
7777 bool
7778 vectorizable_live_operation (loop_vec_info loop_vinfo,
7779 stmt_vec_info stmt_info,
7780 gimple_stmt_iterator *gsi,
7781 slp_tree slp_node, slp_instance slp_node_instance,
7782 int slp_index, bool vec_stmt_p,
7783 stmt_vector_for_cost *)
7784 {
7785 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7786 imm_use_iterator imm_iter;
7787 tree lhs, lhs_type, bitsize, vec_bitsize;
7788 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7789 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7790 int ncopies;
7791 gimple *use_stmt;
7792 auto_vec<tree> vec_oprnds;
7793 int vec_entry = 0;
7794 poly_uint64 vec_index = 0;
7795
7796 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7797
7798 /* If a stmt of a reduction is live, vectorize it via
7799 vect_create_epilog_for_reduction. vectorizable_reduction assessed
7800 validity so just trigger the transform here. */
7801 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
7802 {
7803 if (!vec_stmt_p)
7804 return true;
7805 if (slp_node)
7806 {
7807 /* For reduction chains the meta-info is attached to
7808 the group leader. */
7809 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7810 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7811 /* For SLP reductions we vectorize the epilogue for
7812 all involved stmts together. */
7813 else if (slp_index != 0)
7814 return true;
7815 else
7816 /* For SLP reductions the meta-info is attached to
7817 the representative. */
7818 stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
7819 }
7820 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7821 gcc_assert (reduc_info->is_reduc_info);
7822 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
7823 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
7824 return true;
7825 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
7826 slp_node_instance);
7827 return true;
7828 }
7829
7830 /* FORNOW. CHECKME. */
7831 if (nested_in_vect_loop_p (loop, stmt_info))
7832 return false;
7833
7834 /* If STMT is not relevant and it is a simple assignment and its inputs are
7835 invariant then it can remain in place, unvectorized. The original last
7836 scalar value that it computes will be used. */
7837 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7838 {
7839 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
7840 if (dump_enabled_p ())
7841 dump_printf_loc (MSG_NOTE, vect_location,
7842 "statement is simple and uses invariant. Leaving in "
7843 "place.\n");
7844 return true;
7845 }
7846
7847 if (slp_node)
7848 ncopies = 1;
7849 else
7850 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7851
7852 if (slp_node)
7853 {
7854 gcc_assert (slp_index >= 0);
7855
7856 int num_scalar = SLP_TREE_LANES (slp_node);
7857 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7858
7859 /* Get the last occurrence of the scalar index from the concatenation of
7860 all the slp vectors. Calculate which slp vector it is and the index
7861 within. */
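 /* For example, with two 4-element vectors and two scalar lanes the
 last occurrence of lane 1 sits at position 2*4 - 2 + 1 = 7, i.e.
 in the last element of the second vector. */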
7862 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
7863
7864 /* Calculate which vector contains the result, and which lane of
7865 that vector we need. */
7866 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7867 {
7868 if (dump_enabled_p ())
7869 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7870 "Cannot determine which vector holds the"
7871 " final result.\n");
7872 return false;
7873 }
7874 }
7875
7876 if (!vec_stmt_p)
7877 {
7878 /* No transformation required. */
7879 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7880 {
7881 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7882 OPTIMIZE_FOR_SPEED))
7883 {
7884 if (dump_enabled_p ())
7885 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7886 "can't use a fully-masked loop because "
7887 "the target doesn't support extract last "
7888 "reduction.\n");
7889 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7890 }
7891 else if (slp_node)
7892 {
7893 if (dump_enabled_p ())
7894 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7895 "can't use a fully-masked loop because an "
7896 "SLP statement is live after the loop.\n");
7897 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7898 }
7899 else if (ncopies > 1)
7900 {
7901 if (dump_enabled_p ())
7902 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7903 "can't use a fully-masked loop because"
7904 " ncopies is greater than 1.\n");
7905 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7906 }
7907 else
7908 {
7909 gcc_assert (ncopies == 1 && !slp_node);
7910 vect_record_loop_mask (loop_vinfo,
7911 &LOOP_VINFO_MASKS (loop_vinfo),
7912 1, vectype, NULL);
7913 }
7914 }
7915 return true;
7916 }
7917
7918 /* Use the lhs of the original scalar statement. */
7919 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
7920
7921 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7922 : gimple_get_lhs (stmt);
7923 lhs_type = TREE_TYPE (lhs);
7924
7925 bitsize = vector_element_bits_tree (vectype);
7926 vec_bitsize = TYPE_SIZE (vectype);
7927
7928 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
7929 tree vec_lhs, bitstart;
7930 if (slp_node)
7931 {
7932 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7933
7934 /* Get the correct slp vectorized stmt. */
7935 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
7936 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
7937 vec_lhs = gimple_phi_result (phi);
7938 else
7939 vec_lhs = gimple_get_lhs (vec_stmt);
7940
7941 /* Get entry to use. */
7942 bitstart = bitsize_int (vec_index);
7943 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7944 }
7945 else
7946 {
7947 /* For multiple copies, get the last copy. */
7948 vec_lhs = gimple_get_lhs (STMT_VINFO_VEC_STMTS (stmt_info).last ());
7949
7950 /* Get the last lane in the vector. */
7951 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7952 }
7953
7954 /* To ensure that VEC_LHS satisfies the loop-closed PHI requirement for
7955 the lane extraction stmts, insert one PHI node for it. It looks like:
7956 loop;
7957 BB:
7958 # lhs' = PHI <lhs>
7959 ==>
7960 loop;
7961 BB:
7962 # vec_lhs' = PHI <vec_lhs>
7963 new_tree = lane_extract <vec_lhs', ...>;
7964 lhs' = new_tree; */
7965
7966 basic_block exit_bb = single_exit (loop)->dest;
7967 gcc_assert (single_pred_p (exit_bb));
7968
7969 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
7970 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
7971 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
7972
7973 gimple_seq stmts = NULL;
7974 tree new_tree;
7975 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7976 {
7977 /* Emit:
7978
7979 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
7980
7981 where VEC_LHS is the vectorized live-out result and MASK is
7982 the loop mask for the final iteration. */
7983 gcc_assert (ncopies == 1 && !slp_node);
7984 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
7985 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo), 1,
7986 vectype, 0);
7987 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
7988 mask, vec_lhs_phi);
7989
7990 /* Convert the extracted vector element to the required scalar type. */
7991 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
7992 }
7993 else
7994 {
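 /* Extract the required lane with a BIT_FIELD_REF at bit position
 BITSTART and convert the result to the type of the original
 scalar lhs. */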
7995 tree bftype = TREE_TYPE (vectype);
7996 if (VECTOR_BOOLEAN_TYPE_P (vectype))
7997 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7998 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
7999 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8000 &stmts, true, NULL_TREE);
8001 }
8002
8003 if (stmts)
8004 {
8005 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
8006 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8007
8008 /* Remove the existing PHI fed by LHS and assign NEW_TREE to its result instead. */
8009 tree lhs_phi = NULL_TREE;
8010 gimple_stmt_iterator gsi;
8011 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
8012 {
8013 gimple *phi = gsi_stmt (gsi);
8014 if ((gimple_phi_arg_def (phi, 0) == lhs))
8015 {
8016 remove_phi_node (&gsi, false);
8017 lhs_phi = gimple_phi_result (phi);
8018 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8019 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8020 break;
8021 }
8022 }
8023 }
8024
8025 /* Replace uses of LHS with the newly computed result. If the use stmt is
8026 a single-argument PHI, just replace all uses of the PHI result. This is
8027 necessary because the LCSSA PHI defining LHS may appear before the newly inserted stmt. */
8028 use_operand_p use_p;
8029 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8030 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8031 && !is_gimple_debug (use_stmt))
8032 {
8033 if (gimple_code (use_stmt) == GIMPLE_PHI
8034 && gimple_phi_num_args (use_stmt) == 1)
8035 {
8036 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8037 }
8038 else
8039 {
8040 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8041 SET_USE (use_p, new_tree);
8042 }
8043 update_stmt (use_stmt);
8044 }
8045
8046 return true;
8047 }
8048
8049 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8050
8051 static void
8052 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
8053 {
8054 ssa_op_iter op_iter;
8055 imm_use_iterator imm_iter;
8056 def_operand_p def_p;
8057 gimple *ustmt;
8058
8059 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8060 {
8061 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8062 {
8063 basic_block bb;
8064
8065 if (!is_gimple_debug (ustmt))
8066 continue;
8067
8068 bb = gimple_bb (ustmt);
8069
8070 if (!flow_bb_inside_loop_p (loop, bb))
8071 {
8072 if (gimple_debug_bind_p (ustmt))
8073 {
8074 if (dump_enabled_p ())
8075 dump_printf_loc (MSG_NOTE, vect_location,
8076 "killing debug use\n");
8077
8078 gimple_debug_bind_reset_value (ustmt);
8079 update_stmt (ustmt);
8080 }
8081 else
8082 gcc_unreachable ();
8083 }
8084 }
8085 }
8086 }
8087
8088 /* Given loop represented by LOOP_VINFO, return true if computation of
8089 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8090 otherwise. */
8091
8092 static bool
8093 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8094 {
8095 /* Constant case. */
8096 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8097 {
8098 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8099 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8100
8101 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8102 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
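 /* NITERS was computed as NITERSM1 + 1; if that addition wrapped,
 NITERS is no longer greater than NITERSM1 and this check fails. */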
8103 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8104 return true;
8105 }
8106
8107 widest_int max;
8108 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8109 /* Check the upper bound of loop niters. */
8110 if (get_max_loop_iterations (loop, &max))
8111 {
8112 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8113 signop sgn = TYPE_SIGN (type);
8114 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8115 if (max < type_max)
8116 return true;
8117 }
8118 return false;
8119 }
8120
8121 /* Return a mask type with half the number of elements as OLD_TYPE,
8122 given that it should have mode NEW_MODE. */
8123
8124 tree
8125 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
8126 {
8127 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
8128 return build_truth_vector_type_for_mode (nunits, new_mode);
8129 }
8130
8131 /* Return a mask type with twice as many elements as OLD_TYPE,
8132 given that it should have mode NEW_MODE. */
8133
8134 tree
8135 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
8136 {
8137 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
8138 return build_truth_vector_type_for_mode (nunits, new_mode);
8139 }
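/* For example (illustrative element counts): applied to a 16-element mask
   type, vect_halve_mask_nunits returns an 8-element mask type with mode
   NEW_MODE, and vect_double_mask_nunits applied to an 8-element mask type
   returns a 16-element one.  Only the element count comes from OLD_TYPE;
   the representation is determined by NEW_MODE.  */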
8140
8141 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8142 contain a sequence of NVECTORS masks that each control a vector of type
8143 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
8144 these vector masks with the vector version of SCALAR_MASK. */
8145
8146 void
8147 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8148 unsigned int nvectors, tree vectype, tree scalar_mask)
8149 {
8150 gcc_assert (nvectors != 0);
8151 if (masks->length () < nvectors)
8152 masks->safe_grow_cleared (nvectors);
8153 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8154 /* The number of scalars per iteration and the number of vectors are
8155 both compile-time constants. */
8156 unsigned int nscalars_per_iter
8157 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8158 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8159
8160 if (scalar_mask)
8161 {
8162 scalar_cond_masked_key cond (scalar_mask, nvectors);
8163 loop_vinfo->scalar_cond_masked_set.add (cond);
8164 }
8165
8166 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8167 {
8168 rgm->max_nscalars_per_iter = nscalars_per_iter;
8169 rgm->mask_type = truth_type_for (vectype);
8170 }
8171 }
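/* A small worked example with made-up numbers: with a vectorization factor
   of 16 and VECTYPE having 8 elements, a statement needing NVECTORS == 2
   masks is recorded in rgroup (*MASKS)[1] and has
   nscalars_per_iter == 2 * 8 / 16 == 1.  A later request for the same
   rgroup only bumps max_nscalars_per_iter (and hence mask_type) if it needs
   more scalars per iteration, so each rgroup ends up sized for its most
   demanding user.  */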
8172
8173 /* Given a complete set of masks MASKS, extract mask number INDEX
8174 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8175 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8176
8177 See the comment above vec_loop_masks for more details about the mask
8178 arrangement. */
8179
8180 tree
8181 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8182 unsigned int nvectors, tree vectype, unsigned int index)
8183 {
8184 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8185 tree mask_type = rgm->mask_type;
8186
8187 /* Populate the rgroup's mask array, if this is the first time we've
8188 used it. */
8189 if (rgm->masks.is_empty ())
8190 {
8191 rgm->masks.safe_grow_cleared (nvectors);
8192 for (unsigned int i = 0; i < nvectors; ++i)
8193 {
8194 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8195 /* Provide a dummy definition until the real one is available. */
8196 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8197 rgm->masks[i] = mask;
8198 }
8199 }
8200
8201 tree mask = rgm->masks[index];
8202 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8203 TYPE_VECTOR_SUBPARTS (vectype)))
8204 {
8205 /* A loop mask for data type X can be reused for data type Y
8206 if X has N times more elements than Y and if Y's elements
8207 are N times bigger than X's. In this case each sequence
8208 of N elements in the loop mask will be all-zero or all-one.
8209 We can then view-convert the mask so that each sequence of
8210 N elements is replaced by a single element. */
8211 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8212 TYPE_VECTOR_SUBPARTS (vectype)));
8213 gimple_seq seq = NULL;
8214 mask_type = truth_type_for (vectype);
8215 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8216 if (seq)
8217 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8218 }
8219 return mask;
8220 }
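/* Sketch of the reuse case handled above (illustrative element counts):
   suppose the rgroup's masks were created for 16-element vectors while
   VECTYPE has 8 elements that are each twice as wide.  Every aligned pair
   of lanes in the stored mask is then all-false or all-true, so the
   16-element mask can simply be VIEW_CONVERTed to the 8-element mask type
   returned by truth_type_for (VECTYPE), with any conversion statements
   inserted before GSI.  */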
8221
8222 /* Scale the profile counters of LOOP, which is vectorized by factor VF,
8223 according to the estimated number of iterations. */
8224
8225 static void
8226 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
8227 {
8228 edge preheader = loop_preheader_edge (loop);
8229 /* Reduce loop iterations by the vectorization factor. */
8230 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8231 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8232
8233 if (freq_h.nonzero_p ())
8234 {
8235 profile_probability p;
8236
8237 /* Avoid dropping loop body profile counter to 0 because of zero count
8238 in loop's preheader. */
8239 if (!(freq_e == profile_count::zero ()))
8240 freq_e = freq_e.force_nonzero ();
8241 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8242 scale_loop_frequencies (loop, p);
8243 }
8244
8245 edge exit_e = single_exit (loop);
8246 exit_e->probability = profile_probability::always ()
8247 .apply_scale (1, new_est_niter + 1);
8248
8249 edge exit_l = single_pred_edge (loop->latch);
8250 profile_probability prob = exit_l->probability;
8251 exit_l->probability = exit_e->probability.invert ();
8252 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8253 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8254 }
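/* A rough numeric sketch with made-up profile data: if the scalar loop was
   estimated to run ~40 iterations per entry and VF == 4,
   niter_for_unrolled_loop yields an estimate of about 10, so the loop body
   is rescaled so that the header count is roughly
   (new_est_niter + 1) * preheader count, and the exit edge gets probability
   1 / (new_est_niter + 1), i.e. about 1/11 - consistent with executing
   about 10 vector iterations per entry into the loop.  */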
8255
8256 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8257 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8258 stmt_vec_info. */
8259
8260 static void
8261 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8262 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8263 {
8264 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8265 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8266
8267 if (dump_enabled_p ())
8268 dump_printf_loc (MSG_NOTE, vect_location,
8269 "------>vectorizing statement: %G", stmt_info->stmt);
8270
8271 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8272 vect_loop_kill_debug_uses (loop, stmt_info);
8273
8274 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8275 && !STMT_VINFO_LIVE_P (stmt_info))
8276 return;
8277
8278 if (STMT_VINFO_VECTYPE (stmt_info))
8279 {
8280 poly_uint64 nunits
8281 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8282 if (!STMT_SLP_TYPE (stmt_info)
8283 && maybe_ne (nunits, vf)
8284 && dump_enabled_p ())
8285 /* For SLP, VF is set according to the unrolling factor, not the
8286 vector size, hence this message is not valid for SLP. */
8287 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8288 }
8289
8290 /* Pure SLP statements have already been vectorized. We still need
8291 to apply loop vectorization to hybrid SLP statements. */
8292 if (PURE_SLP_STMT (stmt_info))
8293 return;
8294
8295 if (dump_enabled_p ())
8296 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8297
8298 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
8299 *seen_store = stmt_info;
8300 }
8301
8302 /* Helper function to pass to simplify_replace_tree so that trees recorded
8303 in the hash_map are replaced with their corresponding values. */
8304
8305 static tree
8306 find_in_mapping (tree t, void *context)
8307 {
8308 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
8309
8310 tree *value = mapping->get (t);
8311 return value ? *value : t;
8312 }
8313
8314 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
8315 original loop that has now been vectorized.
8316
8317 The inits of the data_references need to be advanced with the number of
8318 iterations of the main loop. This has been computed in vect_do_peeling and
8319 is stored in parameter ADVANCE. We first restore the data_references
8320 initial offset with the values recorded in ORIG_DRS_INIT.
8321
8322 Since the loop_vec_info of this EPILOGUE was constructed for the original
8323 loop, its stmt_vec_infos all point to the original statements. These need
8324 to be updated to point to their corresponding copies as well as the SSA_NAMES
8325 in their PATTERN_DEF_SEQs and RELATED_STMTs.
8326
8327 The data_references' connections also need to be updated: their
8328 corresponding dr_vec_infos need to be reconnected to the EPILOGUE's
8329 stmt_vec_infos and their statements need to point to their corresponding
8330 copies. If they are gather loads or scatter stores, their references
8331 need to be updated to point to the corresponding copies as well.
8332 Finally, we set 'base_misaligned' to false, as we have already peeled
8333 for alignment in the prologue of the main loop. */
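/* As a concrete (hypothetical) illustration of the mapping built below: if
   the main loop contains _23 = a[i_5] and its epilogue copy contains the
   corresponding _57 = a[i_41], the mapping records _23 -> _57 (and
   i_5 -> i_41 via the PHI results).  Any pattern statement or
   gather/scatter DR_REF that still mentions _23 is then rewritten through
   find_in_mapping / simplify_replace_tree to use _57 instead.  */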
8334
8335 static void
8336 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
8337 {
8338 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
8339 auto_vec<gimple *> stmt_worklist;
8340 hash_map<tree,tree> mapping;
8341 gimple *orig_stmt, *new_stmt;
8342 gimple_stmt_iterator epilogue_gsi;
8343 gphi_iterator epilogue_phi_gsi;
8344 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
8345 basic_block *epilogue_bbs = get_loop_body (epilogue);
8346 unsigned i;
8347
8348 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
8349
8350 /* Advance the data_references' inits by the number of iterations of the
8351 previous loop and its prologue. */
8352 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
8353
8354
8355 /* The EPILOGUE loop is a copy of the original loop so they share the same
8356 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
8357 point to the copied statements. We also create a mapping from the LHSs in
8358 the original loop to the LHSs in the EPILOGUE and build worklists to
8359 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
8360 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
8361 {
8362 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
8363 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
8364 {
8365 new_stmt = epilogue_phi_gsi.phi ();
8366
8367 gcc_assert (gimple_uid (new_stmt) > 0);
8368 stmt_vinfo
8369 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
8370
8371 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
8372 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
8373
8374 mapping.put (gimple_phi_result (orig_stmt),
8375 gimple_phi_result (new_stmt));
8376 /* PHI nodes cannot have patterns or related statements. */
8377 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
8378 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
8379 }
8380
8381 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
8382 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
8383 {
8384 new_stmt = gsi_stmt (epilogue_gsi);
8385 if (is_gimple_debug (new_stmt))
8386 continue;
8387
8388 gcc_assert (gimple_uid (new_stmt) > 0);
8389 stmt_vinfo
8390 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
8391
8392 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
8393 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
8394
8395 if (tree old_lhs = gimple_get_lhs (orig_stmt))
8396 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
8397
8398 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
8399 {
8400 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
8401 for (gimple_stmt_iterator gsi = gsi_start (seq);
8402 !gsi_end_p (gsi); gsi_next (&gsi))
8403 stmt_worklist.safe_push (gsi_stmt (gsi));
8404 }
8405
8406 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
8407 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
8408 {
8409 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
8410 stmt_worklist.safe_push (stmt);
8411 /* Set BB such that the assert in
8412 'get_initial_def_for_reduction' is able to determine that
8413 the BB of the related stmt is inside this loop. */
8414 gimple_set_bb (stmt,
8415 gimple_bb (new_stmt));
8416 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
8417 gcc_assert (related_vinfo == NULL
8418 || related_vinfo == stmt_vinfo);
8419 }
8420 }
8421 }
8422
8423 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
8424 using the original main loop and thus need to be updated to refer to the
8425 cloned variables used in the epilogue. */
8426 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
8427 {
8428 gimple *stmt = stmt_worklist[i];
8429 tree *new_op;
8430
8431 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
8432 {
8433 tree op = gimple_op (stmt, j);
8434 if ((new_op = mapping.get(op)))
8435 gimple_set_op (stmt, j, *new_op);
8436 else
8437 {
8438 /* PR92429: The last argument of simplify_replace_tree disables
8439 folding when replacing arguments. This is required as
8440 otherwise you might end up with different statements than the
8441 ones analyzed in vect_loop_analyze, leading to different
8442 vectorization. */
8443 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
8444 &find_in_mapping, &mapping, false);
8445 gimple_set_op (stmt, j, op);
8446 }
8447 }
8448 }
8449
8450 struct data_reference *dr;
8451 vec<data_reference_p> datarefs = epilogue_vinfo->shared->datarefs;
8452 FOR_EACH_VEC_ELT (datarefs, i, dr)
8453 {
8454 orig_stmt = DR_STMT (dr);
8455 gcc_assert (gimple_uid (orig_stmt) > 0);
8456 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
8457 /* Data references for gather loads and scatter stores do not use the
8458 updated offset we set using ADVANCE. Instead we have to make sure the
8459 references in the data-references point to the corresponding copies of
8460 the originals in the epilogue. */
8461 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
8462 == VMAT_GATHER_SCATTER)
8463 {
8464 DR_REF (dr)
8465 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
8466 &find_in_mapping, &mapping);
8467 DR_BASE_ADDRESS (dr)
8468 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
8469 &find_in_mapping, &mapping);
8470 }
8471 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
8472 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
8473 /* The vector size of the epilogue is smaller than that of the main loop,
8474 so the alignment is either the same or lower. This means the DR will
8475 by definition be aligned. */
8476 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
8477 }
8478
8479 epilogue_vinfo->shared->datarefs_copy.release ();
8480 epilogue_vinfo->shared->save_datarefs ();
8481 }
8482
8483 /* Function vect_transform_loop.
8484
8485 The analysis phase has determined that the loop is vectorizable.
8486 Vectorize the loop - create vectorized stmts to replace the scalar
8487 stmts in the loop, and update the loop exit condition.
8488 Returns the scalar epilogue loop if any. */
8489
8490 class loop *
8491 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
8492 {
8493 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8494 class loop *epilogue = NULL;
8495 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8496 int nbbs = loop->num_nodes;
8497 int i;
8498 tree niters_vector = NULL_TREE;
8499 tree step_vector = NULL_TREE;
8500 tree niters_vector_mult_vf = NULL_TREE;
8501 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8502 unsigned int lowest_vf = constant_lower_bound (vf);
8503 gimple *stmt;
8504 bool check_profitability = false;
8505 unsigned int th;
8506
8507 DUMP_VECT_SCOPE ("vec_transform_loop");
8508
8509 loop_vinfo->shared->check_datarefs ();
8510
8511 /* Use the more conservative vectorization threshold. If the number
8512 of iterations is constant, assume the cost check has been performed
8513 by our caller. If the threshold makes all loops profitable that
8514 run at least the (estimated) vectorization factor number of times,
8515 checking is pointless too. */
8516 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8517 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
8518 {
8519 if (dump_enabled_p ())
8520 dump_printf_loc (MSG_NOTE, vect_location,
8521 "Profitability threshold is %d loop iterations.\n",
8522 th);
8523 check_profitability = true;
8524 }
8525
8526 /* Make sure there exists a single-predecessor exit bb. Do this before
8527 versioning. */
8528 edge e = single_exit (loop);
8529 if (! single_pred_p (e->dest))
8530 {
8531 split_loop_exit_edge (e, true);
8532 if (dump_enabled_p ())
8533 dump_printf (MSG_NOTE, "split exit edge\n");
8534 }
8535
8536 /* Version the loop first, if required, so the profitability check
8537 comes first. */
8538
8539 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8540 {
8541 class loop *sloop
8542 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
8543 sloop->force_vectorize = false;
8544 check_profitability = false;
8545 }
8546
8547 /* Make sure there exists a single-predecessor exit bb also on the
8548 scalar loop copy. Do this after versioning but before peeling,
8549 so the CFG structure is fine for both the scalar and the if-converted
8550 loop, and slpeel_duplicate_current_defs_from_edges sees matched
8551 loop-closed PHI nodes on the exit. */
8552 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8553 {
8554 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8555 if (! single_pred_p (e->dest))
8556 {
8557 split_loop_exit_edge (e, true);
8558 if (dump_enabled_p ())
8559 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8560 }
8561 }
8562
8563 tree niters = vect_build_loop_niters (loop_vinfo);
8564 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8565 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8566 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8567 tree advance;
8568 drs_init_vec orig_drs_init;
8569
8570 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8571 &step_vector, &niters_vector_mult_vf, th,
8572 check_profitability, niters_no_overflow,
8573 &advance);
8574
8575 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
8576 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
8577 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
8578 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
8579
8580 if (niters_vector == NULL_TREE)
8581 {
8582 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8583 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8584 && known_eq (lowest_vf, vf))
8585 {
8586 niters_vector
8587 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8588 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8589 step_vector = build_one_cst (TREE_TYPE (niters));
8590 }
8591 else
8592 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8593 &step_vector, niters_no_overflow);
8594 }
8595
8596 /* 1) Make sure the loop header has exactly two entries
8597 2) Make sure we have a preheader basic block. */
8598
8599 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8600
8601 split_edge (loop_preheader_edge (loop));
8602
8603 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8604 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8605 /* This will deal with any possible peeling. */
8606 vect_prepare_for_masked_peels (loop_vinfo);
8607
8608 /* Schedule the SLP instances first, then handle loop vectorization
8609 below. */
8610 if (!loop_vinfo->slp_instances.is_empty ())
8611 {
8612 DUMP_VECT_SCOPE ("scheduling SLP instances");
8613 vect_schedule_slp (loop_vinfo);
8614 }
8615
8616 /* FORNOW: the vectorizer supports only loops whose body consists
8617 of one basic block (header + empty latch). When the vectorizer
8618 supports more involved loop forms, the order in which the BBs are
8619 traversed will need to be reconsidered. */
8620
8621 for (i = 0; i < nbbs; i++)
8622 {
8623 basic_block bb = bbs[i];
8624 stmt_vec_info stmt_info;
8625
8626 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8627 gsi_next (&si))
8628 {
8629 gphi *phi = si.phi ();
8630 if (dump_enabled_p ())
8631 dump_printf_loc (MSG_NOTE, vect_location,
8632 "------>vectorizing phi: %G", phi);
8633 stmt_info = loop_vinfo->lookup_stmt (phi);
8634 if (!stmt_info)
8635 continue;
8636
8637 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8638 vect_loop_kill_debug_uses (loop, stmt_info);
8639
8640 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8641 && !STMT_VINFO_LIVE_P (stmt_info))
8642 continue;
8643
8644 if (STMT_VINFO_VECTYPE (stmt_info)
8645 && (maybe_ne
8646 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8647 && dump_enabled_p ())
8648 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8649
8650 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8651 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8652 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
8653 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
8654 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
8655 && ! PURE_SLP_STMT (stmt_info))
8656 {
8657 if (dump_enabled_p ())
8658 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8659 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
8660 }
8661 }
8662
8663 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8664 !gsi_end_p (si);)
8665 {
8666 stmt = gsi_stmt (si);
8667 /* During vectorization remove existing clobber stmts. */
8668 if (gimple_clobber_p (stmt))
8669 {
8670 unlink_stmt_vdef (stmt);
8671 gsi_remove (&si, true);
8672 release_defs (stmt);
8673 }
8674 else
8675 {
8676 /* Ignore vector stmts created in the outer loop. */
8677 stmt_info = loop_vinfo->lookup_stmt (stmt);
8678
8679 /* vector stmts created in the outer-loop during vectorization of
8680 stmts in an inner-loop may not have a stmt_info, and do not
8681 need to be vectorized. */
8682 stmt_vec_info seen_store = NULL;
8683 if (stmt_info)
8684 {
8685 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8686 {
8687 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8688 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8689 !gsi_end_p (subsi); gsi_next (&subsi))
8690 {
8691 stmt_vec_info pat_stmt_info
8692 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
8693 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
8694 &si, &seen_store);
8695 }
8696 stmt_vec_info pat_stmt_info
8697 = STMT_VINFO_RELATED_STMT (stmt_info);
8698 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
8699 &seen_store);
8700 }
8701 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
8702 &seen_store);
8703 }
8704 gsi_next (&si);
8705 if (seen_store)
8706 {
8707 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8708 /* Interleaving. The vectorization of the
8709 interleaving chain was completed - free
8710 all the stores in the chain. */
8711 vect_remove_stores (loop_vinfo,
8712 DR_GROUP_FIRST_ELEMENT (seen_store));
8713 else
8714 /* Free the attached stmt_vec_info and remove the stmt. */
8715 loop_vinfo->remove_stmt (stmt_info);
8716 }
8717 }
8718 }
8719
8720 /* Stub out scalar statements that must not survive vectorization.
8721 Doing this here helps with grouped statements, or statements that
8722 are involved in patterns. */
8723 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8724 !gsi_end_p (gsi); gsi_next (&gsi))
8725 {
8726 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8727 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8728 {
8729 tree lhs = gimple_get_lhs (call);
8730 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8731 {
8732 tree zero = build_zero_cst (TREE_TYPE (lhs));
8733 gimple *new_stmt = gimple_build_assign (lhs, zero);
8734 gsi_replace (&gsi, new_stmt, true);
8735 }
8736 }
8737 }
8738 } /* BBs in loop */
8739
8740 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8741 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8742 if (integer_onep (step_vector))
8743 niters_no_overflow = true;
8744 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8745 niters_vector_mult_vf, !niters_no_overflow);
8746
8747 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8748 scale_profile_for_vect_loop (loop, assumed_vf);
8749
8750 /* True if the final iteration might not handle a full vector's
8751 worth of scalar iterations. */
8752 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8753 /* The minimum number of iterations performed by the epilogue. This
8754 is 1 when peeling for gaps because we always need a final scalar
8755 iteration. */
8756 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8757 /* +1 to convert latch counts to loop iteration counts,
8758 -min_epilogue_iters to remove iterations that cannot be performed
8759 by the vector code. */
8760 int bias_for_lowest = 1 - min_epilogue_iters;
8761 int bias_for_assumed = bias_for_lowest;
8762 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8763 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8764 {
8765 /* When the amount of peeling is known at compile time, the first
8766 iteration will have exactly alignment_npeels active elements.
8767 In the worst case it will have at least one. */
8768 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8769 bias_for_lowest += lowest_vf - min_first_active;
8770 bias_for_assumed += assumed_vf - min_first_active;
8771 }
8772 /* In these calculations the "- 1" converts loop iteration counts
8773 back to latch counts. */
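  /* Worked example with made-up numbers: min_epilogue_iters == 0 gives
     bias_for_lowest == 1.  If the scalar latch bound is 102 (at most 103
     iterations) and lowest_vf == 4, the non-partial case below computes
     udiv_floor (102 + 1, 4) - 1 == 24, i.e. at most 25 vector iterations,
     expressed again as a latch count.  A fully-masked loop uses udiv_ceil
     instead, since its final vector iteration may be partial.  */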
8774 if (loop->any_upper_bound)
8775 loop->nb_iterations_upper_bound
8776 = (final_iter_may_be_partial
8777 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8778 lowest_vf) - 1
8779 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8780 lowest_vf) - 1);
8781 if (loop->any_likely_upper_bound)
8782 loop->nb_iterations_likely_upper_bound
8783 = (final_iter_may_be_partial
8784 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8785 + bias_for_lowest, lowest_vf) - 1
8786 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8787 + bias_for_lowest, lowest_vf) - 1);
8788 if (loop->any_estimate)
8789 loop->nb_iterations_estimate
8790 = (final_iter_may_be_partial
8791 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8792 assumed_vf) - 1
8793 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8794 assumed_vf) - 1);
8795
8796 if (dump_enabled_p ())
8797 {
8798 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8799 {
8800 dump_printf_loc (MSG_NOTE, vect_location,
8801 "LOOP VECTORIZED\n");
8802 if (loop->inner)
8803 dump_printf_loc (MSG_NOTE, vect_location,
8804 "OUTER LOOP VECTORIZED\n");
8805 dump_printf (MSG_NOTE, "\n");
8806 }
8807 else
8808 dump_printf_loc (MSG_NOTE, vect_location,
8809 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
8810 GET_MODE_NAME (loop_vinfo->vector_mode));
8811 }
8812
8813 /* Loops vectorized with a variable factor won't benefit from
8814 unrolling/peeling. */
8815 if (!vf.is_constant ())
8816 {
8817 loop->unroll = 1;
8818 if (dump_enabled_p ())
8819 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
8820 " variable-length vectorization factor\n");
8821 }
8822 /* Free SLP instances here because otherwise stmt reference counting
8823 won't work. */
8824 slp_instance instance;
8825 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8826 vect_free_slp_instance (instance, true);
8827 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8828 /* Clear the safelen field since its value is invalid after vectorization:
8829 the vectorized loop can have loop-carried dependencies. */
8830 loop->safelen = 0;
8831
8832 if (epilogue)
8833 {
8834 update_epilogue_loop_vinfo (epilogue, advance);
8835
8836 epilogue->simduid = loop->simduid;
8837 epilogue->force_vectorize = loop->force_vectorize;
8838 epilogue->dont_vectorize = false;
8839 }
8840
8841 return epilogue;
8842 }
8843
8844 /* The code below performs a simple optimization - it reverts
8845 if-conversion for masked stores, i.e. if the mask of a store is zero,
8846 the store is skipped and, if possible, so are the producers of the
8847 stored values. For example,
8848 for (i=0; i<n; i++)
8849 if (c[i])
8850 {
8851 p1[i] += 1;
8852 p2[i] = p3[i] +2;
8853 }
8854 this transformation will produce the following semi-hammock:
8855
8856 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
8857 {
8858 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8859 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8860 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8861 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8862 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8863 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8864 }
8865 */
8866
8867 void
8868 optimize_mask_stores (class loop *loop)
8869 {
8870 basic_block *bbs = get_loop_body (loop);
8871 unsigned nbbs = loop->num_nodes;
8872 unsigned i;
8873 basic_block bb;
8874 class loop *bb_loop;
8875 gimple_stmt_iterator gsi;
8876 gimple *stmt;
8877 auto_vec<gimple *> worklist;
8878 auto_purge_vect_location sentinel;
8879
8880 vect_location = find_loop_location (loop);
8881 /* Pick up all masked stores in loop if any. */
8882 for (i = 0; i < nbbs; i++)
8883 {
8884 bb = bbs[i];
8885 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8886 gsi_next (&gsi))
8887 {
8888 stmt = gsi_stmt (gsi);
8889 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8890 worklist.safe_push (stmt);
8891 }
8892 }
8893
8894 free (bbs);
8895 if (worklist.is_empty ())
8896 return;
8897
8898 /* Loop has masked stores. */
8899 while (!worklist.is_empty ())
8900 {
8901 gimple *last, *last_store;
8902 edge e, efalse;
8903 tree mask;
8904 basic_block store_bb, join_bb;
8905 gimple_stmt_iterator gsi_to;
8906 tree vdef, new_vdef;
8907 gphi *phi;
8908 tree vectype;
8909 tree zero;
8910
8911 last = worklist.pop ();
8912 mask = gimple_call_arg (last, 2);
8913 bb = gimple_bb (last);
8914 /* Create then_bb and an if-then structure in the CFG; then_bb belongs
8915 to the same loop as if_bb. That loop can differ from LOOP when a
8916 two-level loop nest is vectorized and the mask_store belongs to the
8917 inner one. */
8918 e = split_block (bb, last);
8919 bb_loop = bb->loop_father;
8920 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8921 join_bb = e->dest;
8922 store_bb = create_empty_bb (bb);
8923 add_bb_to_loop (store_bb, bb_loop);
8924 e->flags = EDGE_TRUE_VALUE;
8925 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8926 /* Mark the edge into STORE_BB as unlikely. */
8927 efalse->probability = profile_probability::unlikely ();
8928 store_bb->count = efalse->count ();
8929 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8930 if (dom_info_available_p (CDI_DOMINATORS))
8931 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8932 if (dump_enabled_p ())
8933 dump_printf_loc (MSG_NOTE, vect_location,
8934 "Create new block %d to sink mask stores.",
8935 store_bb->index);
8936 /* Create vector comparison with boolean result. */
8937 vectype = TREE_TYPE (mask);
8938 zero = build_zero_cst (vectype);
8939 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8940 gsi = gsi_last_bb (bb);
8941 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
8942 /* Create new PHI node for vdef of the last masked store:
8943 .MEM_2 = VDEF <.MEM_1>
8944 will be converted to
8945 .MEM.3 = VDEF <.MEM_1>
8946 and new PHI node will be created in join bb
8947 .MEM_2 = PHI <.MEM_1, .MEM_3>
8948 */
8949 vdef = gimple_vdef (last);
8950 new_vdef = make_ssa_name (gimple_vop (cfun), last);
8951 gimple_set_vdef (last, new_vdef);
8952 phi = create_phi_node (vdef, join_bb);
8953 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8954
8955 /* Put all masked stores with the same mask to STORE_BB if possible. */
8956 while (true)
8957 {
8958 gimple_stmt_iterator gsi_from;
8959 gimple *stmt1 = NULL;
8960
8961 /* Move masked store to STORE_BB. */
8962 last_store = last;
8963 gsi = gsi_for_stmt (last);
8964 gsi_from = gsi;
8965 /* Shift GSI to the previous stmt for further traversal. */
8966 gsi_prev (&gsi);
8967 gsi_to = gsi_start_bb (store_bb);
8968 gsi_move_before (&gsi_from, &gsi_to);
8969 /* Set GSI_TO to the start of the now non-empty STORE_BB. */
8970 gsi_to = gsi_start_bb (store_bb);
8971 if (dump_enabled_p ())
8972 dump_printf_loc (MSG_NOTE, vect_location,
8973 "Move stmt to created bb\n%G", last);
8974 /* Move all stored value producers if possible. */
8975 while (!gsi_end_p (gsi))
8976 {
8977 tree lhs;
8978 imm_use_iterator imm_iter;
8979 use_operand_p use_p;
8980 bool res;
8981
8982 /* Skip debug statements. */
8983 if (is_gimple_debug (gsi_stmt (gsi)))
8984 {
8985 gsi_prev (&gsi);
8986 continue;
8987 }
8988 stmt1 = gsi_stmt (gsi);
8989 /* Do not consider statements writing to memory or having
8990 volatile operand. */
8991 if (gimple_vdef (stmt1)
8992 || gimple_has_volatile_ops (stmt1))
8993 break;
8994 gsi_from = gsi;
8995 gsi_prev (&gsi);
8996 lhs = gimple_get_lhs (stmt1);
8997 if (!lhs)
8998 break;
8999
9000 /* LHS of vectorized stmt must be SSA_NAME. */
9001 if (TREE_CODE (lhs) != SSA_NAME)
9002 break;
9003
9004 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9005 {
9006 /* Remove dead scalar statement. */
9007 if (has_zero_uses (lhs))
9008 {
9009 gsi_remove (&gsi_from, true);
9010 continue;
9011 }
9012 }
9013
9014 /* Check that LHS does not have uses outside of STORE_BB. */
9015 res = true;
9016 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9017 {
9018 gimple *use_stmt;
9019 use_stmt = USE_STMT (use_p);
9020 if (is_gimple_debug (use_stmt))
9021 continue;
9022 if (gimple_bb (use_stmt) != store_bb)
9023 {
9024 res = false;
9025 break;
9026 }
9027 }
9028 if (!res)
9029 break;
9030
9031 if (gimple_vuse (stmt1)
9032 && gimple_vuse (stmt1) != gimple_vuse (last_store))
9033 break;
9034
9035 /* Can move STMT1 to STORE_BB. */
9036 if (dump_enabled_p ())
9037 dump_printf_loc (MSG_NOTE, vect_location,
9038 "Move stmt to created bb\n%G", stmt1);
9039 gsi_move_before (&gsi_from, &gsi_to);
9040 /* Shift GSI_TO for further insertion. */
9041 gsi_prev (&gsi_to);
9042 }
9043 /* Put other masked stores with the same mask to STORE_BB. */
9044 if (worklist.is_empty ()
9045 || gimple_call_arg (worklist.last (), 2) != mask
9046 || worklist.last () != stmt1)
9047 break;
9048 last = worklist.pop ();
9049 }
9050 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
9051 }
9052 }
9053
9054 /* Decide whether it is possible to use a zero-based induction variable
9055 when vectorizing LOOP_VINFO with a fully-masked loop. If it is,
9056 return the value that the induction variable must be able to hold
9057 in order to ensure that the loop ends with an all-false mask.
9058 Return -1 otherwise. */
9059 widest_int
9060 vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo)
9061 {
9062 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9063 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9064 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
9065
9066 /* Calculate the value that the induction variable must be able
9067 to hit in order to ensure that we end the loop with an all-false mask.
9068 This involves adding the maximum number of inactive trailing scalar
9069 iterations. */
9070 widest_int iv_limit = -1;
9071 if (max_loop_iterations (loop, &iv_limit))
9072 {
9073 if (niters_skip)
9074 {
9075 /* Add the maximum number of skipped iterations to the
9076 maximum iteration count. */
9077 if (TREE_CODE (niters_skip) == INTEGER_CST)
9078 iv_limit += wi::to_widest (niters_skip);
9079 else
9080 iv_limit += max_vf - 1;
9081 }
9082 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
9083 /* Make a conservatively-correct assumption. */
9084 iv_limit += max_vf - 1;
9085
9086 /* IV_LIMIT is the maximum number of latch iterations, which is also
9087 the maximum in-range IV value. Round this value down to the previous
9088 vector alignment boundary and then add an extra full iteration. */
9089 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9090 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
9091 }
9092 return iv_limit;
9093 }
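/* Illustrative example with made-up values: if the loop's maximum latch
   count is 1000, there is no mask skip and no peeling for alignment, and
   VF == MAX_VF == 16, the code above rounds 1000 down to the previous
   multiple of 16 (992) and adds one full vector iteration, giving an
   iv_limit of 1008.  An IV that can hold 1008 is thus guaranteed to reach
   a value at which the loop mask becomes all-false.  */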
9094