1 /* Loop Vectorization
2 Copyright (C) 2003-2019 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57
58 /* Loop Vectorization Pass.
59
60 This pass tries to vectorize loops.
61
62 For example, the vectorizer transforms the following simple loop:
63
64 short a[N]; short b[N]; short c[N]; int i;
65
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
68 }
69
70 as if it were manually vectorized by rewriting the source code into:
71
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
76
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
82 }
83
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
95
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
101
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
106
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
117
118 For example, say stmt S1 was vectorized into stmt VS1:
119
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
123
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
128
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
133
134 Operands that are not SSA_NAMEs are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
136
137 Target modeling:
138 =================
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that can support different sizes of vectors will, for now, need
142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
144
145 Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
151
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
154 */
155
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
157 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
158 bool *);
159
160 /* Subroutine of vect_determine_vf_for_stmt that handles only one
161 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
162 may already be set for general statements (not just data refs). */
163
164 static opt_result
165 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
166 bool vectype_maybe_set_p,
167 poly_uint64 *vf,
168 vec<stmt_vec_info > *mask_producers)
169 {
170 gimple *stmt = stmt_info->stmt;
171
172 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
173 && !STMT_VINFO_LIVE_P (stmt_info))
174 || gimple_clobber_p (stmt))
175 {
176 if (dump_enabled_p ())
177 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
178 return opt_result::success ();
179 }
180
181 tree stmt_vectype, nunits_vectype;
182 opt_result res = vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
183 &nunits_vectype);
184 if (!res)
185 return res;
186
187 if (stmt_vectype)
188 {
189 if (STMT_VINFO_VECTYPE (stmt_info))
190 /* The only case when a vectype had been already set is for stmts
191 that contain a data ref, or for "pattern-stmts" (stmts generated
192 by the vectorizer to represent/replace a certain idiom). */
193 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
194 || vectype_maybe_set_p)
195 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
196 else if (stmt_vectype == boolean_type_node)
197 mask_producers->safe_push (stmt_info);
198 else
199 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
200 }
201
202 if (nunits_vectype)
203 vect_update_max_nunits (vf, nunits_vectype);
204
205 return opt_result::success ();
206 }
207
208 /* Subroutine of vect_determine_vectorization_factor. Set the vector
209 types of STMT_INFO and all attached pattern statements and update
210 the vectorization factor VF accordingly. If some of the statements
211 produce a mask result whose vector type can only be calculated later,
212 add them to MASK_PRODUCERS. Return true on success or false if
213 something prevented vectorization. */
214
215 static opt_result
216 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
217 vec<stmt_vec_info > *mask_producers)
218 {
219 vec_info *vinfo = stmt_info->vinfo;
220 if (dump_enabled_p ())
221 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
222 stmt_info->stmt);
223 opt_result res
224 = vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers);
225 if (!res)
226 return res;
227
228 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
229 && STMT_VINFO_RELATED_STMT (stmt_info))
230 {
231 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
232 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
233
234 /* If a pattern statement has def stmts, analyze them too. */
235 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
236 !gsi_end_p (si); gsi_next (&si))
237 {
238 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
239 if (dump_enabled_p ())
240 dump_printf_loc (MSG_NOTE, vect_location,
241 "==> examining pattern def stmt: %G",
242 def_stmt_info->stmt);
245 res = vect_determine_vf_for_stmt_1 (def_stmt_info, true,
246 vf, mask_producers);
247 if (!res)
248 return res;
249 }
250
251 if (dump_enabled_p ())
252 dump_printf_loc (MSG_NOTE, vect_location,
253 "==> examining pattern statement: %G",
254 stmt_info->stmt);
255 res = vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers);
256 if (!res)
257 return res;
258 }
259
260 return opt_result::success ();
261 }
262
263 /* Function vect_determine_vectorization_factor
264
265 Determine the vectorization factor (VF). VF is the number of data elements
266 that are operated upon in parallel in a single iteration of the vectorized
267 loop. For example, when vectorizing a loop that operates on 4-byte elements,
268 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
269 elements can fit in a single vector register.
270
271 We currently support vectorization of loops in which all types operated upon
272 are of the same size. Therefore this function currently sets VF according to
273 the size of the types operated upon, and fails if there are multiple sizes
274 in the loop.
275
276 VF is also the factor by which the loop iterations are strip-mined, e.g.:
277 original loop:
278 for (i=0; i<N; i++){
279 a[i] = b[i] + c[i];
280 }
281
282 vectorized loop:
283 for (i=0; i<N; i+=VF){
284 a[i:VF] = b[i:VF] + c[i:VF];
285 }
286 */
287
288 static opt_result
289 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
290 {
291 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
292 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
293 unsigned nbbs = loop->num_nodes;
294 poly_uint64 vectorization_factor = 1;
295 tree scalar_type = NULL_TREE;
296 gphi *phi;
297 tree vectype;
298 stmt_vec_info stmt_info;
299 unsigned i;
300 auto_vec<stmt_vec_info> mask_producers;
301
302 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
303
304 for (i = 0; i < nbbs; i++)
305 {
306 basic_block bb = bbs[i];
307
308 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
309 gsi_next (&si))
310 {
311 phi = si.phi ();
312 stmt_info = loop_vinfo->lookup_stmt (phi);
313 if (dump_enabled_p ())
314 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
315 phi);
316
317 gcc_assert (stmt_info);
318
319 if (STMT_VINFO_RELEVANT_P (stmt_info)
320 || STMT_VINFO_LIVE_P (stmt_info))
321 {
322 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
323 scalar_type = TREE_TYPE (PHI_RESULT (phi));
324
325 if (dump_enabled_p ())
326 dump_printf_loc (MSG_NOTE, vect_location,
327 "get vectype for scalar type: %T\n",
328 scalar_type);
329
330 vectype = get_vectype_for_scalar_type (scalar_type);
331 if (!vectype)
332 return opt_result::failure_at (phi,
333 "not vectorized: unsupported "
334 "data-type %T\n",
335 scalar_type);
336 STMT_VINFO_VECTYPE (stmt_info) = vectype;
337
338 if (dump_enabled_p ())
339 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
340 vectype);
341
342 if (dump_enabled_p ())
343 {
344 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
345 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
346 dump_printf (MSG_NOTE, "\n");
347 }
348
349 vect_update_max_nunits (&vectorization_factor, vectype);
350 }
351 }
352
353 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
354 gsi_next (&si))
355 {
356 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
357 opt_result res
358 = vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
359 &mask_producers);
360 if (!res)
361 return res;
362 }
363 }
364
365 /* TODO: Analyze cost. Decide if worth while to vectorize. */
366 if (dump_enabled_p ())
367 {
368 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
369 dump_dec (MSG_NOTE, vectorization_factor);
370 dump_printf (MSG_NOTE, "\n");
371 }
372
373 if (known_le (vectorization_factor, 1U))
374 return opt_result::failure_at (vect_location,
375 "not vectorized: unsupported data-type\n");
376 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
377
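  /* Stmts that produce a boolean result could not be given a vectype
     above; their vector mask type is computed only now, once the
     vectorization factor and the vector types of the other stmts are
     known.  */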
378 for (i = 0; i < mask_producers.length (); i++)
379 {
380 stmt_info = mask_producers[i];
381 opt_tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
382 if (!mask_type)
383 return opt_result::propagate_failure (mask_type);
384 STMT_VINFO_VECTYPE (stmt_info) = mask_type;
385 }
386
387 return opt_result::success ();
388 }
389
390
391 /* Function vect_is_simple_iv_evolution.
392
393 FORNOW: A simple evolution of an induction variable in the loop is
394 considered a polynomial evolution. */
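
/* For example, given "for (i = 0; i < n; i++)", the access function of i
   computed by scev is the chrec {0, +, 1}_1 (assuming the loop has
   number 1): initial_condition_in_loop_num yields the init 0 and
   evolution_part_in_loop_num yields the step 1.  A step that is itself a
   chrec (a polynomial evolution of degree >= 2) is not "simple" and is
   rejected below.  */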
395
396 static bool
397 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
398 tree * step)
399 {
400 tree init_expr;
401 tree step_expr;
402 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
403 basic_block bb;
404
405 /* When there is no evolution in this loop, the evolution function
406 is not "simple". */
407 if (evolution_part == NULL_TREE)
408 return false;
409
410 /* When the evolution is a polynomial of degree >= 2
411 the evolution function is not "simple". */
412 if (tree_is_chrec (evolution_part))
413 return false;
414
415 step_expr = evolution_part;
416 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
417
418 if (dump_enabled_p ())
419 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
420 step_expr, init_expr);
421
422 *init = init_expr;
423 *step = step_expr;
424
425 if (TREE_CODE (step_expr) != INTEGER_CST
426 && (TREE_CODE (step_expr) != SSA_NAME
427 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
428 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
429 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
430 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
431 || !flag_associative_math)))
432 && (TREE_CODE (step_expr) != REAL_CST
433 || !flag_associative_math))
434 {
435 if (dump_enabled_p ())
436 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
437 "step unknown.\n");
438 return false;
439 }
440
441 return true;
442 }
443
444 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
445 what we are assuming is a double reduction. For example, given
446 a structure like this:
447
448 outer1:
449 x_1 = PHI <x_4(outer2), ...>;
450 ...
451
452 inner:
453 x_2 = PHI <x_1(outer1), ...>;
454 ...
455 x_3 = ...;
456 ...
457
458 outer2:
459 x_4 = PHI <x_3(inner)>;
460 ...
461
462 outer loop analysis would treat x_1 as a double reduction phi and
463 this function would then return true for x_2. */
464
465 static bool
466 vect_inner_phi_in_double_reduction_p (stmt_vec_info stmt_info, gphi *phi)
467 {
468 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
469 use_operand_p use_p;
470 ssa_op_iter op_iter;
471 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
472 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
473 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
474 return true;
475 return false;
476 }
477
478 /* Function vect_analyze_scalar_cycles_1.
479
480 Examine the cross iteration def-use cycles of scalar variables
481 in LOOP. LOOP_VINFO represents the loop that is now being
482 considered for vectorization (can be LOOP, or an outer-loop
483 enclosing LOOP). */
484
485 static void
486 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
487 {
488 basic_block bb = loop->header;
489 tree init, step;
490 auto_vec<stmt_vec_info, 64> worklist;
491 gphi_iterator gsi;
492 bool double_reduc;
493
494 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
495
496 /* First - identify all inductions. Reduction detection assumes that all the
497 inductions have been identified, therefore, this order must not be
498 changed. */
499 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
500 {
501 gphi *phi = gsi.phi ();
502 tree access_fn = NULL;
503 tree def = PHI_RESULT (phi);
504 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
505
506 if (dump_enabled_p ())
507 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
508
509 /* Skip virtual phis. The data dependences that are associated with
510 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
511 if (virtual_operand_p (def))
512 continue;
513
514 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
515
516 /* Analyze the evolution function. */
517 access_fn = analyze_scalar_evolution (loop, def);
518 if (access_fn)
519 {
520 STRIP_NOPS (access_fn);
521 if (dump_enabled_p ())
522 dump_printf_loc (MSG_NOTE, vect_location,
523 "Access function of PHI: %T\n", access_fn);
524 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
525 = initial_condition_in_loop_num (access_fn, loop->num);
526 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
527 = evolution_part_in_loop_num (access_fn, loop->num);
528 }
529
530 if (!access_fn
531 || vect_inner_phi_in_double_reduction_p (stmt_vinfo, phi)
532 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
533 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
534 && TREE_CODE (step) != INTEGER_CST))
535 {
536 worklist.safe_push (stmt_vinfo);
537 continue;
538 }
539
540 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
541 != NULL_TREE);
542 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
543
544 if (dump_enabled_p ())
545 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
546 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
547 }
548
549
550 /* Second - identify all reductions and nested cycles. */
551 while (worklist.length () > 0)
552 {
553 stmt_vec_info stmt_vinfo = worklist.pop ();
554 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
555 tree def = PHI_RESULT (phi);
556
557 if (dump_enabled_p ())
558 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
559
560 gcc_assert (!virtual_operand_p (def)
561 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
562
563 stmt_vec_info reduc_stmt_info
564 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc);
565 if (reduc_stmt_info)
566 {
567 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
568 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
569 if (double_reduc)
570 {
571 if (dump_enabled_p ())
572 dump_printf_loc (MSG_NOTE, vect_location,
573 "Detected double reduction.\n");
574
575 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
576 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
577 }
578 else
579 {
580 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
581 {
582 if (dump_enabled_p ())
583 dump_printf_loc (MSG_NOTE, vect_location,
584 "Detected vectorizable nested cycle.\n");
585
586 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
587 }
588 else
589 {
590 if (dump_enabled_p ())
591 dump_printf_loc (MSG_NOTE, vect_location,
592 "Detected reduction.\n");
593
594 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
595 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
596 /* Store the reduction cycles for possible vectorization in
597 loop-aware SLP if it was not detected as reduction
598 chain. */
599 if (! REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
600 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
601 (reduc_stmt_info);
602 }
603 }
604 }
605 else
606 if (dump_enabled_p ())
607 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
608 "Unknown def-use cycle pattern.\n");
609 }
610 }
611
612
613 /* Function vect_analyze_scalar_cycles.
614
615 Examine the cross iteration def-use cycles of scalar variables, by
616 analyzing the loop-header PHIs of scalar variables. Classify each
617 cycle as one of the following: invariant, induction, reduction, unknown.
618 We do that for the loop represented by LOOP_VINFO, and also for its
619 inner-loop, if it exists.
620 Examples for scalar cycles:
621
622 Example1: reduction:
623
624 loop1:
625 for (i=0; i<N; i++)
626 sum += a[i];
627
628 Example2: induction:
629
630 loop2:
631 for (i=0; i<N; i++)
632 a[i] = i; */
633
634 static void
635 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
636 {
637 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
638
639 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
640
641 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
642 Reductions in such an inner-loop therefore have different properties than
643 the reductions in the nest that gets vectorized:
644 1. When vectorized, they are executed in the same order as in the original
645 scalar loop, so we can't change the order of computation when
646 vectorizing them.
647 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
648 current checks are too strict. */
649
650 if (loop->inner)
651 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
652 }
653
654 /* Transfer group and reduction information from STMT_INFO to its
655 pattern stmt. */
656
657 static void
658 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
659 {
660 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
661 stmt_vec_info stmtp;
662 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
663 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
664 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
665 do
666 {
667 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
668 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
669 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
670 if (stmt_info)
671 REDUC_GROUP_NEXT_ELEMENT (stmtp)
672 = STMT_VINFO_RELATED_STMT (stmt_info);
673 }
674 while (stmt_info);
675 STMT_VINFO_DEF_TYPE (stmtp) = vect_reduction_def;
676 }
677
678 /* Fixup scalar cycles that now have their stmts detected as patterns. */
679
680 static void
681 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
682 {
683 stmt_vec_info first;
684 unsigned i;
685
686 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
687 if (STMT_VINFO_IN_PATTERN_P (first))
688 {
689 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
690 while (next)
691 {
692 if (! STMT_VINFO_IN_PATTERN_P (next))
693 break;
694 next = REDUC_GROUP_NEXT_ELEMENT (next);
695 }
696 /* If not all stmts in the chain are patterns, try to handle
697 the chain without patterns. */
698 if (! next)
699 {
700 vect_fixup_reduc_chain (first);
701 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
702 = STMT_VINFO_RELATED_STMT (first);
703 }
704 }
705 }
706
707 /* Function vect_get_loop_niters.
708
709 Determine how many times the loop is executed and place the count
710 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
711 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
712 niter information holds in ASSUMPTIONS.
713
714 Return the loop exit condition. */
715
716
717 static gcond *
718 vect_get_loop_niters (class loop *loop, tree *assumptions,
719 tree *number_of_iterations, tree *number_of_iterationsm1)
720 {
721 edge exit = single_exit (loop);
722 class tree_niter_desc niter_desc;
723 tree niter_assumptions, niter, may_be_zero;
724 gcond *cond = get_loop_exit_condition (loop);
725
726 *assumptions = boolean_true_node;
727 *number_of_iterationsm1 = chrec_dont_know;
728 *number_of_iterations = chrec_dont_know;
729 DUMP_VECT_SCOPE ("get_loop_niters");
730
731 if (!exit)
732 return cond;
733
734 may_be_zero = NULL_TREE;
735 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
736 || chrec_contains_undetermined (niter_desc.niter))
737 return cond;
738
739 niter_assumptions = niter_desc.assumptions;
740 may_be_zero = niter_desc.may_be_zero;
741 niter = niter_desc.niter;
742
743 if (may_be_zero && integer_zerop (may_be_zero))
744 may_be_zero = NULL_TREE;
745
746 if (may_be_zero)
747 {
748 if (COMPARISON_CLASS_P (may_be_zero))
749 {
750 /* Try to combine may_be_zero with assumptions, this can simplify
751 computation of niter expression. */
752 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
753 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
754 niter_assumptions,
755 fold_build1 (TRUTH_NOT_EXPR,
756 boolean_type_node,
757 may_be_zero));
758 else
759 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
760 build_int_cst (TREE_TYPE (niter), 0),
761 rewrite_to_non_trapping_overflow (niter));
762
763 may_be_zero = NULL_TREE;
764 }
765 else if (integer_nonzerop (may_be_zero))
766 {
767 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
768 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
769 return cond;
770 }
771 else
772 return cond;
773 }
774
775 *assumptions = niter_assumptions;
776 *number_of_iterationsm1 = niter;
777
778 /* We want the number of loop header executions which is the number
779 of latch executions plus one.
780 ??? For UINT_MAX latch executions this number overflows to zero
781 for loops like do { n++; } while (n != 0); */
782 if (niter && !chrec_contains_undetermined (niter))
783 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
784 build_int_cst (TREE_TYPE (niter), 1));
785 *number_of_iterations = niter;
786
787 return cond;
788 }
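
/* For illustration: for a loop like "for (i = 0; i < n; i++)" with n > 0,
   the latch executes n - 1 times, so NUMBER_OF_ITERATIONSM1 is n - 1 and
   NUMBER_OF_ITERATIONS (the number of header executions) is n.  The "+ 1"
   above is the addition that can overflow to zero when the latch executes
   UINT_MAX times, as the ??? comment notes.  */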
789
790 /* Function bb_in_loop_p
791
792 Used as predicate for dfs order traversal of the loop bbs. */
793
794 static bool
795 bb_in_loop_p (const_basic_block bb, const void *data)
796 {
797 const class loop *const loop = (const class loop *)data;
798 if (flow_bb_inside_loop_p (loop, bb))
799 return true;
800 return false;
801 }
802
803
804 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
805 stmt_vec_info structs for all the stmts in LOOP_IN. */
806
807 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
808 : vec_info (vec_info::loop, init_cost (loop_in), shared),
809 loop (loop_in),
810 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
811 num_itersm1 (NULL_TREE),
812 num_iters (NULL_TREE),
813 num_iters_unchanged (NULL_TREE),
814 num_iters_assumptions (NULL_TREE),
815 th (0),
816 versioning_threshold (0),
817 vectorization_factor (0),
818 max_vectorization_factor (0),
819 mask_skip_niters (NULL_TREE),
820 mask_compare_type (NULL_TREE),
821 simd_if_cond (NULL_TREE),
822 unaligned_dr (NULL),
823 peeling_for_alignment (0),
824 ptr_mask (0),
825 ivexpr_map (NULL),
826 scan_map (NULL),
827 slp_unrolling_factor (1),
828 single_scalar_iteration_cost (0),
829 vectorizable (false),
830 can_fully_mask_p (true),
831 fully_masked_p (false),
832 peeling_for_gaps (false),
833 peeling_for_niter (false),
834 no_data_dependencies (false),
835 has_mask_store (false),
836 scalar_loop_scaling (profile_probability::uninitialized ()),
837 scalar_loop (NULL),
838 orig_loop_info (NULL)
839 {
840 /* CHECKME: We want to visit all BBs before their successors (except for
841 latch blocks, for which this assertion wouldn't hold). In the simple
842 case of the loop forms we allow, a dfs order of the BBs would be the same
843 as reversed postorder traversal, so we are safe. */
844
845 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
846 bbs, loop->num_nodes, loop);
847 gcc_assert (nbbs == loop->num_nodes);
848
849 for (unsigned int i = 0; i < nbbs; i++)
850 {
851 basic_block bb = bbs[i];
852 gimple_stmt_iterator si;
853
854 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
855 {
856 gimple *phi = gsi_stmt (si);
857 gimple_set_uid (phi, 0);
858 add_stmt (phi);
859 }
860
861 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
862 {
863 gimple *stmt = gsi_stmt (si);
864 gimple_set_uid (stmt, 0);
865 add_stmt (stmt);
866 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
867 third argument is the #pragma omp simd if (x) condition: when it is 0,
868 the loop shouldn't be vectorized; when it is a non-zero constant, it
869 should be vectorized normally; otherwise the loop is versioned, with
870 the vectorized copy taken when the condition is non-zero at runtime.
871 if (loop_in->simduid
872 && is_gimple_call (stmt)
873 && gimple_call_internal_p (stmt)
874 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
875 && gimple_call_num_args (stmt) >= 3
876 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
877 && (loop_in->simduid
878 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
879 {
880 tree arg = gimple_call_arg (stmt, 2);
881 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
882 simd_if_cond = arg;
883 else
884 gcc_assert (integer_nonzerop (arg));
885 }
886 }
887 }
888 }
889
890 /* Free all levels of MASKS. */
891
892 void
893 release_vec_loop_masks (vec_loop_masks *masks)
894 {
895 rgroup_masks *rgm;
896 unsigned int i;
897 FOR_EACH_VEC_ELT (*masks, i, rgm)
898 rgm->masks.release ();
899 masks->release ();
900 }
901
902 /* Free all memory used by the _loop_vec_info, as well as all the
903 stmt_vec_info structs of all the stmts in the loop. */
904
905 _loop_vec_info::~_loop_vec_info ()
906 {
907 free (bbs);
908
909 release_vec_loop_masks (&masks);
910 delete ivexpr_map;
911 delete scan_map;
912
913 loop->aux = NULL;
914 }
915
916 /* Return an invariant or register for EXPR and emit necessary
917 computations in the LOOP_VINFO loop preheader. */
918
919 tree
920 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
921 {
922 if (is_gimple_reg (expr)
923 || is_gimple_min_invariant (expr))
924 return expr;
925
926 if (! loop_vinfo->ivexpr_map)
927 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
928 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
929 if (! cached)
930 {
931 gimple_seq stmts = NULL;
932 cached = force_gimple_operand (unshare_expr (expr),
933 &stmts, true, NULL_TREE);
934 if (stmts)
935 {
936 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
937 gsi_insert_seq_on_edge_immediate (e, stmts);
938 }
939 }
940 return cached;
941 }
942
943 /* Return true if we can use CMP_TYPE as the comparison type to produce
944 all masks required to mask LOOP_VINFO. */
945
946 static bool
947 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
948 {
949 rgroup_masks *rgm;
950 unsigned int i;
951 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
952 if (rgm->mask_type != NULL_TREE
953 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
954 cmp_type, rgm->mask_type,
955 OPTIMIZE_FOR_SPEED))
956 return false;
957 return true;
958 }
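
/* Roughly speaking, IFN_WHILE_ULT (INDEX, LIMIT) computes a mask whose
   element I is true iff INDEX + I < LIMIT, which is exactly the shape of
   mask needed to switch off the excess lanes in the final iterations of a
   fully-masked loop; the check above simply asks whether the target can do
   that comparison in CMP_TYPE for each required mask type.  */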
959
960 /* Return the maximum number of scalars per iteration across all the
961 rgroups in LOOP_VINFO. */
962
963 static unsigned int
964 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
965 {
966 unsigned int res = 1;
967 unsigned int i;
968 rgroup_masks *rgm;
969 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
970 res = MAX (res, rgm->max_nscalars_per_iter);
971 return res;
972 }
973
974 /* Each statement in LOOP_VINFO can be masked where necessary. Check
975 whether we can actually generate the masks required. Return true if so,
976 storing the chosen comparison type in LOOP_VINFO_MASK_COMPARE_TYPE and the IV type in LOOP_VINFO_MASK_IV_TYPE. */
977
978 static bool
979 vect_verify_full_masking (loop_vec_info loop_vinfo)
980 {
981 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
982 unsigned int min_ni_width;
983 unsigned int max_nscalars_per_iter
984 = vect_get_max_nscalars_per_iter (loop_vinfo);
985
986 /* Use a normal loop if there are no statements that need masking.
987 This only happens in rare degenerate cases: it means that the loop
988 has no loads, no stores, and no live-out values. */
989 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
990 return false;
991
992 /* Get the maximum number of iterations that is representable
993 in the counter type. */
994 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
995 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
996
997 /* Get a more refined estimate for the number of iterations. */
998 widest_int max_back_edges;
999 if (max_loop_iterations (loop, &max_back_edges))
1000 max_ni = wi::smin (max_ni, max_back_edges + 1);
1001
1002 /* Account for rgroup masks, in which each bit is replicated N times. */
1003 max_ni *= max_nscalars_per_iter;
1004
1005 /* Work out how many bits we need to represent the limit. */
1006 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1007
1008 /* Find a scalar mode for which WHILE_ULT is supported. */
1009 opt_scalar_int_mode cmp_mode_iter;
1010 tree cmp_type = NULL_TREE;
1011 tree iv_type = NULL_TREE;
1012 widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
1013 unsigned int iv_precision = UINT_MAX;
1014
1015 if (iv_limit != -1)
1016 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1017 UNSIGNED);
1018
1019 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1020 {
1021 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1022 if (cmp_bits >= min_ni_width
1023 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1024 {
1025 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1026 if (this_type
1027 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1028 {
1029 /* Although we could stop as soon as we find a valid mode,
1030 there are at least two reasons why that's not always the
1031 best choice:
1032
1033 - An IV that's Pmode or wider is more likely to be reusable
1034 in address calculations than an IV that's narrower than
1035 Pmode.
1036
1037 - Doing the comparison in IV_PRECISION or wider allows
1038 a natural 0-based IV, whereas using a narrower comparison
1039 type requires mitigations against wrap-around.
1040
1041 Conversely, if the IV limit is variable, doing the comparison
1042 in a wider type than the original type can introduce
1043 unnecessary extensions, so picking the widest valid mode
1044 is not always a good choice either.
1045
1046 Here we prefer the first IV type that's Pmode or wider,
1047 and the first comparison type that's IV_PRECISION or wider.
1048 (The comparison type must be no wider than the IV type,
1049 to avoid extensions in the vector loop.)
1050
1051 ??? We might want to try continuing beyond Pmode for ILP32
1052 targets if CMP_BITS < IV_PRECISION. */
1053 iv_type = this_type;
1054 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1055 cmp_type = this_type;
1056 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1057 break;
1058 }
1059 }
1060 }
1061
1062 if (!cmp_type)
1063 return false;
1064
1065 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1066 LOOP_VINFO_MASK_IV_TYPE (loop_vinfo) = iv_type;
1067 return true;
1068 }
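
/* A small worked example: if the loop is known to run at most 1000
   iterations and the largest rgroup needs 2 scalars per iteration, then
   max_ni is scaled to 2000 and min_ni_width becomes 11 bits (since
   2^10 < 2000 <= 2^11), so the candidate comparison modes considered
   above are the integer modes of at least 11 bits.  */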
1069
1070 /* Calculate the cost of one scalar iteration of the loop. */
1071 static void
1072 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1073 {
1074 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1075 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1076 int nbbs = loop->num_nodes, factor;
1077 int innerloop_iters, i;
1078
1079 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1080
1081 /* Gather costs for statements in the scalar loop. */
1082
1083 /* FORNOW. */
1084 innerloop_iters = 1;
1085 if (loop->inner)
1086 innerloop_iters = 50; /* FIXME */
1087
1088 for (i = 0; i < nbbs; i++)
1089 {
1090 gimple_stmt_iterator si;
1091 basic_block bb = bbs[i];
1092
1093 if (bb->loop_father == loop->inner)
1094 factor = innerloop_iters;
1095 else
1096 factor = 1;
1097
1098 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1099 {
1100 gimple *stmt = gsi_stmt (si);
1101 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1102
1103 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1104 continue;
1105
1106 /* Skip stmts that are not vectorized inside the loop. */
1107 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1108 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1109 && (!STMT_VINFO_LIVE_P (vstmt_info)
1110 || !VECTORIZABLE_CYCLE_DEF
1111 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1112 continue;
1113
1114 vect_cost_for_stmt kind;
1115 if (STMT_VINFO_DATA_REF (stmt_info))
1116 {
1117 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1118 kind = scalar_load;
1119 else
1120 kind = scalar_store;
1121 }
1122 else
1123 kind = scalar_stmt;
1124
1125 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1126 factor, kind, stmt_info, 0, vect_prologue);
1127 }
1128 }
1129
1130 /* Now accumulate cost. */
1131 void *target_cost_data = init_cost (loop);
1132 stmt_info_for_cost *si;
1133 int j;
1134 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1135 j, si)
1136 (void) add_stmt_cost (target_cost_data, si->count,
1137 si->kind, si->stmt_info, si->misalign,
1138 vect_body);
1139 unsigned dummy, body_cost = 0;
1140 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1141 destroy_cost_data (target_cost_data);
1142 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1143 }
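
/* As an illustration: for a single-block loop body containing one load,
   one addition and one store, the code above records one scalar_load, one
   scalar_stmt and one scalar_store entry, each with count 1; with the
   default target cost hooks each of those costs 1, so the recorded single
   scalar iteration cost would be 3.  Statements in an inner loop are
   weighted by the FIXME factor of 50.  */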
1144
1145
1146 /* Function vect_analyze_loop_form_1.
1147
1148 Verify that certain CFG restrictions hold, including:
1149 - the loop has a pre-header
1150 - the loop has a single entry and exit
1151 - the loop exit condition is simple enough
1152 - the number of iterations can be analyzed, i.e., a countable loop. The
1153 niter could be analyzed under some assumptions. */
1154
1155 opt_result
1156 vect_analyze_loop_form_1 (class loop *loop, gcond **loop_cond,
1157 tree *assumptions, tree *number_of_iterationsm1,
1158 tree *number_of_iterations, gcond **inner_loop_cond)
1159 {
1160 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1161
1162 /* Different restrictions apply when we are considering an inner-most loop,
1163 vs. an outer (nested) loop.
1164 (FORNOW. May want to relax some of these restrictions in the future). */
1165
1166 if (!loop->inner)
1167 {
1168 /* Inner-most loop. We currently require that the number of BBs is
1169 exactly 2 (the header and latch). Vectorizable inner-most loops
1170 look like this:
1171
1172 (pre-header)
1173 |
1174 header <--------+
1175 | | |
1176 | +--> latch --+
1177 |
1178 (exit-bb) */
1179
1180 if (loop->num_nodes != 2)
1181 return opt_result::failure_at (vect_location,
1182 "not vectorized:"
1183 " control flow in loop.\n");
1184
1185 if (empty_block_p (loop->header))
1186 return opt_result::failure_at (vect_location,
1187 "not vectorized: empty loop.\n");
1188 }
1189 else
1190 {
1191 class loop *innerloop = loop->inner;
1192 edge entryedge;
1193
1194 /* Nested loop. We currently require that the loop is doubly-nested,
1195 contains a single inner loop, and the number of BBs is exactly 5.
1196 Vectorizable outer-loops look like this:
1197
1198 (pre-header)
1199 |
1200 header <---+
1201 | |
1202 inner-loop |
1203 | |
1204 tail ------+
1205 |
1206 (exit-bb)
1207
1208 The inner-loop has the properties expected of inner-most loops
1209 as described above. */
1210
1211 if ((loop->inner)->inner || (loop->inner)->next)
1212 return opt_result::failure_at (vect_location,
1213 "not vectorized:"
1214 " multiple nested loops.\n");
1215
1216 if (loop->num_nodes != 5)
1217 return opt_result::failure_at (vect_location,
1218 "not vectorized:"
1219 " control flow in loop.\n");
1220
1221 entryedge = loop_preheader_edge (innerloop);
1222 if (entryedge->src != loop->header
1223 || !single_exit (innerloop)
1224 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1225 return opt_result::failure_at (vect_location,
1226 "not vectorized:"
1227 " unsupported outerloop form.\n");
1228
1229 /* Analyze the inner-loop. */
1230 tree inner_niterm1, inner_niter, inner_assumptions;
1231 opt_result res
1232 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1233 &inner_assumptions, &inner_niterm1,
1234 &inner_niter, NULL);
1235 if (!res)
1236 {
1237 if (dump_enabled_p ())
1238 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1239 "not vectorized: Bad inner loop.\n");
1240 return res;
1241 }
1242
1243 /* Don't support analyzing niter under assumptions for inner
1244 loop. */
1245 if (!integer_onep (inner_assumptions))
1246 return opt_result::failure_at (vect_location,
1247 "not vectorized: Bad inner loop.\n");
1248
1249 if (!expr_invariant_in_loop_p (loop, inner_niter))
1250 return opt_result::failure_at (vect_location,
1251 "not vectorized: inner-loop count not"
1252 " invariant.\n");
1253
1254 if (dump_enabled_p ())
1255 dump_printf_loc (MSG_NOTE, vect_location,
1256 "Considering outer-loop vectorization.\n");
1257 }
1258
1259 if (!single_exit (loop))
1260 return opt_result::failure_at (vect_location,
1261 "not vectorized: multiple exits.\n");
1262 if (EDGE_COUNT (loop->header->preds) != 2)
1263 return opt_result::failure_at (vect_location,
1264 "not vectorized:"
1265 " too many incoming edges.\n");
1266
1267 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1268 that the loop is represented as a do-while (with a proper if-guard
1269 before the loop if needed), where the loop header contains all the
1270 executable statements, and the latch is empty. */
1271 if (!empty_block_p (loop->latch)
1272 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1273 return opt_result::failure_at (vect_location,
1274 "not vectorized: latch block not empty.\n");
1275
1276 /* Make sure the exit is not abnormal. */
1277 edge e = single_exit (loop);
1278 if (e->flags & EDGE_ABNORMAL)
1279 return opt_result::failure_at (vect_location,
1280 "not vectorized:"
1281 " abnormal loop exit edge.\n");
1282
1283 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1284 number_of_iterationsm1);
1285 if (!*loop_cond)
1286 return opt_result::failure_at
1287 (vect_location,
1288 "not vectorized: complicated exit condition.\n");
1289
1290 if (integer_zerop (*assumptions)
1291 || !*number_of_iterations
1292 || chrec_contains_undetermined (*number_of_iterations))
1293 return opt_result::failure_at
1294 (*loop_cond,
1295 "not vectorized: number of iterations cannot be computed.\n");
1296
1297 if (integer_zerop (*number_of_iterations))
1298 return opt_result::failure_at
1299 (*loop_cond,
1300 "not vectorized: number of iterations = 0.\n");
1301
1302 return opt_result::success ();
1303 }
1304
1305 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1306
1307 opt_loop_vec_info
1308 vect_analyze_loop_form (class loop *loop, vec_info_shared *shared)
1309 {
1310 tree assumptions, number_of_iterations, number_of_iterationsm1;
1311 gcond *loop_cond, *inner_loop_cond = NULL;
1312
1313 opt_result res
1314 = vect_analyze_loop_form_1 (loop, &loop_cond,
1315 &assumptions, &number_of_iterationsm1,
1316 &number_of_iterations, &inner_loop_cond);
1317 if (!res)
1318 return opt_loop_vec_info::propagate_failure (res);
1319
1320 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1321 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1322 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1323 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1324 if (!integer_onep (assumptions))
1325 {
1326 /* We consider vectorizing this loop by versioning it under
1327 some assumptions. In order to do this, we need to clear
1328 existing information computed by scev and niter analyzer. */
1329 scev_reset_htab ();
1330 free_numbers_of_iterations_estimates (loop);
1331 /* Also set a flag for this loop so that subsequent scev and niter
1332 analyses are done under the assumptions.
1333 loop_constraint_set (loop, LOOP_C_FINITE);
1334 /* Also record the assumptions for versioning. */
1335 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1336 }
1337
1338 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1339 {
1340 if (dump_enabled_p ())
1341 {
1342 dump_printf_loc (MSG_NOTE, vect_location,
1343 "Symbolic number of iterations is ");
1344 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1345 dump_printf (MSG_NOTE, "\n");
1346 }
1347 }
1348
1349 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1350 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1351 if (inner_loop_cond)
1352 {
1353 stmt_vec_info inner_loop_cond_info
1354 = loop_vinfo->lookup_stmt (inner_loop_cond);
1355 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1356 }
1357
1358 gcc_assert (!loop->aux);
1359 loop->aux = loop_vinfo;
1360 return opt_loop_vec_info::success (loop_vinfo);
1361 }
1362
1363
1364
1365 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1366 statements update the vectorization factor. */
1367
1368 static void
1369 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1370 {
1371 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1372 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1373 int nbbs = loop->num_nodes;
1374 poly_uint64 vectorization_factor;
1375 int i;
1376
1377 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1378
1379 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1380 gcc_assert (known_ne (vectorization_factor, 0U));
1381
1382 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1383 vectorization factor of the loop is the unrolling factor required by
1384 the SLP instances. If that unrolling factor is 1, we say that we
1385 perform pure SLP on the loop - cross-iteration parallelism is not
1386 exploited. */
1387 bool only_slp_in_loop = true;
1388 for (i = 0; i < nbbs; i++)
1389 {
1390 basic_block bb = bbs[i];
1391 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1392 gsi_next (&si))
1393 {
1394 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1395 stmt_info = vect_stmt_to_vectorize (stmt_info);
1396 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1397 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1398 && !PURE_SLP_STMT (stmt_info))
1399 /* STMT needs both SLP and loop-based vectorization. */
1400 only_slp_in_loop = false;
1401 }
1402 }
1403
1404 if (only_slp_in_loop)
1405 {
1406 if (dump_enabled_p ())
1407 dump_printf_loc (MSG_NOTE, vect_location,
1408 "Loop contains only SLP stmts\n");
1409 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1410 }
1411 else
1412 {
1413 if (dump_enabled_p ())
1414 dump_printf_loc (MSG_NOTE, vect_location,
1415 "Loop contains SLP and non-SLP stmts\n");
1416 /* Both the vectorization factor and unroll factor have the form
1417 current_vector_size * X for some rational X, so they must have
1418 a common multiple. */
1419 vectorization_factor
1420 = force_common_multiple (vectorization_factor,
1421 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1422 }
1423
1424 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1425 if (dump_enabled_p ())
1426 {
1427 dump_printf_loc (MSG_NOTE, vect_location,
1428 "Updating vectorization factor to ");
1429 dump_dec (MSG_NOTE, vectorization_factor);
1430 dump_printf (MSG_NOTE, ".\n");
1431 }
1432 }
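
/* For example, if the loop-based analysis chose a vectorization factor of
   4 but the SLP instances require an unrolling factor of 8, the common
   multiple is 8 and the loop is vectorized with VF 8; for a pure-SLP loop
   the SLP unrolling factor is simply taken as the new VF.  */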
1433
1434 /* Return true if STMT_INFO describes a double reduction phi and if
1435 the other phi in the reduction is also relevant for vectorization.
1436 This rejects cases such as:
1437
1438 outer1:
1439 x_1 = PHI <x_3(outer2), ...>;
1440 ...
1441
1442 inner:
1443 x_2 = ...;
1444 ...
1445
1446 outer2:
1447 x_3 = PHI <x_2(inner)>;
1448
1449 if nothing in x_2 or elsewhere makes x_1 relevant. */
1450
1451 static bool
1452 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1453 {
1454 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1455 return false;
1456
1457 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1458 }
1459
1460 /* Function vect_analyze_loop_operations.
1461
1462 Scan the loop stmts and make sure they are all vectorizable. */
1463
1464 static opt_result
1465 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1466 {
1467 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1468 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1469 int nbbs = loop->num_nodes;
1470 int i;
1471 stmt_vec_info stmt_info;
1472 bool need_to_vectorize = false;
1473 bool ok;
1474
1475 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1476
1477 auto_vec<stmt_info_for_cost> cost_vec;
1478
1479 for (i = 0; i < nbbs; i++)
1480 {
1481 basic_block bb = bbs[i];
1482
1483 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1484 gsi_next (&si))
1485 {
1486 gphi *phi = si.phi ();
1487 ok = true;
1488
1489 stmt_info = loop_vinfo->lookup_stmt (phi);
1490 if (dump_enabled_p ())
1491 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1492 if (virtual_operand_p (gimple_phi_result (phi)))
1493 continue;
1494
1495 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1496 (i.e., a phi in the tail of the outer-loop). */
1497 if (! is_loop_header_bb_p (bb))
1498 {
1499 /* FORNOW: we currently don't support the case that these phis
1500 are not used in the outer loop (unless it is a double reduction,
1501 i.e., this phi is vect_reduction_def), because this case
1502 requires us to actually do something here. */
1503 if (STMT_VINFO_LIVE_P (stmt_info)
1504 && !vect_active_double_reduction_p (stmt_info))
1505 return opt_result::failure_at (phi,
1506 "Unsupported loop-closed phi"
1507 " in outer-loop.\n");
1508
1509 /* If PHI is used in the outer loop, we check that its operand
1510 is defined in the inner loop. */
1511 if (STMT_VINFO_RELEVANT_P (stmt_info))
1512 {
1513 tree phi_op;
1514
1515 if (gimple_phi_num_args (phi) != 1)
1516 return opt_result::failure_at (phi, "unsupported phi");
1517
1518 phi_op = PHI_ARG_DEF (phi, 0);
1519 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1520 if (!op_def_info)
1521 return opt_result::failure_at (phi, "unsupported phi\n");
1522
1523 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1524 && (STMT_VINFO_RELEVANT (op_def_info)
1525 != vect_used_in_outer_by_reduction))
1526 return opt_result::failure_at (phi, "unsupported phi\n");
1527
1528 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1529 || (STMT_VINFO_DEF_TYPE (stmt_info)
1530 == vect_double_reduction_def))
1531 && !vectorizable_lc_phi (stmt_info, NULL, NULL))
1532 return opt_result::failure_at (phi, "unsupported phi\n");
1533 }
1534
1535 continue;
1536 }
1537
1538 gcc_assert (stmt_info);
1539
1540 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1541 || STMT_VINFO_LIVE_P (stmt_info))
1542 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1543 /* A scalar-dependence cycle that we don't support. */
1544 return opt_result::failure_at (phi,
1545 "not vectorized:"
1546 " scalar dependence cycle.\n");
1547
1548 if (STMT_VINFO_RELEVANT_P (stmt_info))
1549 {
1550 need_to_vectorize = true;
1551 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1552 && ! PURE_SLP_STMT (stmt_info))
1553 ok = vectorizable_induction (stmt_info, NULL, NULL, NULL,
1554 &cost_vec);
1555 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1556 || (STMT_VINFO_DEF_TYPE (stmt_info)
1557 == vect_double_reduction_def)
1558 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1559 && ! PURE_SLP_STMT (stmt_info))
1560 ok = vectorizable_reduction (stmt_info, NULL, NULL, &cost_vec);
1561 }
1562
1563 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1564 if (ok
1565 && STMT_VINFO_LIVE_P (stmt_info)
1566 && !PURE_SLP_STMT (stmt_info))
1567 ok = vectorizable_live_operation (stmt_info, NULL, NULL, NULL,
1568 -1, false, &cost_vec);
1569
1570 if (!ok)
1571 return opt_result::failure_at (phi,
1572 "not vectorized: relevant phi not "
1573 "supported: %G",
1574 static_cast <gimple *> (phi));
1575 }
1576
1577 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1578 gsi_next (&si))
1579 {
1580 gimple *stmt = gsi_stmt (si);
1581 if (!gimple_clobber_p (stmt))
1582 {
1583 opt_result res
1584 = vect_analyze_stmt (loop_vinfo->lookup_stmt (stmt),
1585 &need_to_vectorize,
1586 NULL, NULL, &cost_vec);
1587 if (!res)
1588 return res;
1589 }
1590 }
1591 } /* bbs */
1592
1593 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1594
1595 /* All operations in the loop are either irrelevant (they deal with loop
1596 control, or are dead), or are only used outside the loop and can be moved
1597 out of the loop (e.g. invariants, inductions). The loop can be
1598 optimized away by scalar optimizations. We're better off not
1599 touching this loop. */
1600 if (!need_to_vectorize)
1601 {
1602 if (dump_enabled_p ())
1603 dump_printf_loc (MSG_NOTE, vect_location,
1604 "All the computation can be taken out of the loop.\n");
1605 return opt_result::failure_at
1606 (vect_location,
1607 "not vectorized: redundant loop. no profit to vectorize.\n");
1608 }
1609
1610 return opt_result::success ();
1611 }
1612
1613 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1614 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1615 definitely no, or -1 if it's worth retrying. */
1616
1617 static int
1618 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1619 {
1620 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1621 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1622
1623 /* Only fully-masked loops can have iteration counts less than the
1624 vectorization factor. */
1625 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1626 {
1627 HOST_WIDE_INT max_niter;
1628
1629 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1630 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1631 else
1632 max_niter = max_stmt_executions_int (loop);
1633
1634 if (max_niter != -1
1635 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1636 {
1637 if (dump_enabled_p ())
1638 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1639 "not vectorized: iteration count smaller than "
1640 "vectorization factor.\n");
1641 return 0;
1642 }
1643 }
1644
1645 int min_profitable_iters, min_profitable_estimate;
1646 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1647 &min_profitable_estimate);
1648
1649 if (min_profitable_iters < 0)
1650 {
1651 if (dump_enabled_p ())
1652 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1653 "not vectorized: vectorization not profitable.\n");
1654 if (dump_enabled_p ())
1655 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1656 "not vectorized: vector version will never be "
1657 "profitable.\n");
1658 return -1;
1659 }
1660
1661 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1662 * assumed_vf);
1663
1664 /* Use the cost model only if it is more conservative than the user-specified
1665 threshold. */
1666 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1667 min_profitable_iters);
1668
1669 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1670
1671 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1672 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1673 {
1674 if (dump_enabled_p ())
1675 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1676 "not vectorized: vectorization not profitable.\n");
1677 if (dump_enabled_p ())
1678 dump_printf_loc (MSG_NOTE, vect_location,
1679 "not vectorized: iteration count smaller than user "
1680 "specified loop bound parameter or minimum profitable "
1681 "iterations (whichever is more conservative).\n");
1682 return 0;
1683 }
1684
1685 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1686 if (estimated_niter == -1)
1687 estimated_niter = likely_max_stmt_executions_int (loop);
1688 if (estimated_niter != -1
1689 && ((unsigned HOST_WIDE_INT) estimated_niter
1690 < MAX (th, (unsigned) min_profitable_estimate)))
1691 {
1692 if (dump_enabled_p ())
1693 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1694 "not vectorized: estimated iteration count too "
1695 "small.\n");
1696 if (dump_enabled_p ())
1697 dump_printf_loc (MSG_NOTE, vect_location,
1698 "not vectorized: estimated iteration count smaller "
1699 "than specified loop bound parameter or minimum "
1700 "profitable iterations (whichever is more "
1701 "conservative).\n");
1702 return -1;
1703 }
1704
1705 return 1;
1706 }
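
/* As a numeric sketch: with an assumed VF of 4, a min-vect-loop-bound
   parameter of 2 and a computed min_profitable_iters of 12, the threshold
   becomes MAX (2 * 4, 12) = 12, so a loop with a known iteration count of
   10 is rejected as unprofitable, while one iterating 16 times passes this
   particular check.  */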
1707
1708 static opt_result
1709 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1710 vec<data_reference_p> *datarefs,
1711 unsigned int *n_stmts)
1712 {
1713 *n_stmts = 0;
1714 for (unsigned i = 0; i < loop->num_nodes; i++)
1715 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1716 !gsi_end_p (gsi); gsi_next (&gsi))
1717 {
1718 gimple *stmt = gsi_stmt (gsi);
1719 if (is_gimple_debug (stmt))
1720 continue;
1721 ++(*n_stmts);
1722 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs);
1723 if (!res)
1724 {
1725 if (is_gimple_call (stmt) && loop->safelen)
1726 {
1727 tree fndecl = gimple_call_fndecl (stmt), op;
1728 if (fndecl != NULL_TREE)
1729 {
1730 cgraph_node *node = cgraph_node::get (fndecl);
1731 if (node != NULL && node->simd_clones != NULL)
1732 {
1733 unsigned int j, n = gimple_call_num_args (stmt);
1734 for (j = 0; j < n; j++)
1735 {
1736 op = gimple_call_arg (stmt, j);
1737 if (DECL_P (op)
1738 || (REFERENCE_CLASS_P (op)
1739 && get_base_address (op)))
1740 break;
1741 }
1742 op = gimple_call_lhs (stmt);
1743 /* Ignore #pragma omp declare simd functions
1744 if they don't have data references in the
1745 call stmt itself. */
1746 if (j == n
1747 && !(op
1748 && (DECL_P (op)
1749 || (REFERENCE_CLASS_P (op)
1750 && get_base_address (op)))))
1751 continue;
1752 }
1753 }
1754 }
1755 return res;
1756 }
1757 /* If dependence analysis will give up due to the limit on the
1758 number of datarefs, stop here and fail fatally. */
1759 if (datarefs->length ()
1760 > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
1761 return opt_result::failure_at (stmt, "exceeded param "
1762 "loop-max-datarefs-for-datadeps\n");
1763 }
1764 return opt_result::success ();
1765 }
1766
1767 /* Look for SLP-only access groups and turn each individual access into its own
1768 group. */
1769 static void
1770 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
1771 {
1772 unsigned int i;
1773 struct data_reference *dr;
1774
1775 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
1776
1777 vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs;
1778 FOR_EACH_VEC_ELT (datarefs, i, dr)
1779 {
1780 gcc_assert (DR_REF (dr));
1781 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
1782
1783 /* Check if the load is a part of an interleaving chain. */
1784 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1785 {
1786 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
1787 unsigned int group_size = DR_GROUP_SIZE (first_element);
1788
1789 /* Check whether this is an SLP-only group that is not being SLPed. */
1790 if (!STMT_SLP_TYPE (stmt_info)
1791 && STMT_VINFO_SLP_VECT_ONLY (first_element))
1792 {
1793 /* Dissolve the group. */
1794 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
1795
1796 stmt_vec_info vinfo = first_element;
1797 while (vinfo)
1798 {
1799 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
1800 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
1801 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
1802 DR_GROUP_SIZE (vinfo) = 1;
1803 DR_GROUP_GAP (vinfo) = group_size - 1;
1804 vinfo = next;
1805 }
1806 }
1807 }
1808 }
1809 }
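/* Illustrative example only (hypothetical access pattern): for a grouped
   access of size 4, say loads of a[4*i+0] ... a[4*i+3], that was marked
   SLP-only but whose statements did not end up pure SLP, the loop above
   splits the group into four single-element groups, each with
   DR_GROUP_SIZE == 1 and DR_GROUP_GAP == 3, so that loop vectorization
   can handle them as individual accesses.  */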
1810
1811
1812 /* Decides whether we need to create an epilogue loop to handle
1813 remaining scalar iterations and sets PEELING_FOR_NITER accordingly. */
1814
1815 void
1816 determine_peel_for_niter (loop_vec_info loop_vinfo)
1817 {
1818 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
1819
1820 unsigned HOST_WIDE_INT const_vf;
1821 HOST_WIDE_INT max_niter
1822 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1823
1824 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1825 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1826 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1827 (loop_vinfo));
1828
1829 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1830 /* The main loop handles all iterations. */
1831 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
1832 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1833 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1834 {
1835 /* Work out the (constant) number of iterations that need to be
1836 peeled for reasons other than niters. */
1837 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1838 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1839 peel_niter += 1;
1840 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1841 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1842 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1843 }
1844 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1845 /* ??? When peeling for gaps but not alignment, we could
1846 try to check whether the (variable) niters is known to be
1847 VF * N + 1. That's something of a niche case though. */
1848 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1849 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1850 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1851 < (unsigned) exact_log2 (const_vf))
1852 /* In case of versioning, check if the maximum number of
1853 iterations is greater than th. If they are identical,
1854 the epilogue is unnecessary. */
1855 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1856 || ((unsigned HOST_WIDE_INT) max_niter
1857 > (th / const_vf) * const_vf))))
1858 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1859 }
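/* Illustrative example only (hypothetical counts): with known niters == 10,
   a constant vectorization factor of 4, one iteration peeled for alignment
   and no peeling for gaps, peel_niter == 1 and 10 - 1 == 9 is not a
   multiple of 4, so PEELING_FOR_NITER is set and an epilogue loop handles
   the remaining 9 % 4 == 1 scalar iteration.  */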
1860
1861
1862 /* Function vect_analyze_loop_2.
1863
1864 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1865 for it. The different analyses will record information in the
1866 loop_vec_info struct. */
1867 static opt_result
1868 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1869 {
1870 opt_result ok = opt_result::success ();
1871 int res;
1872 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1873 poly_uint64 min_vf = 2;
1874
1875 /* The first group of checks is independent of the vector size. */
1876 fatal = true;
1877
1878 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
1879 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
1880 return opt_result::failure_at (vect_location,
1881 "not vectorized: simd if(0)\n");
1882
1883 /* Find all data references in the loop (which correspond to vdefs/vuses)
1884 and analyze their evolution in the loop. */
1885
1886 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1887
1888 /* Gather the data references and count stmts in the loop. */
1889 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1890 {
1891 opt_result res
1892 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1893 &LOOP_VINFO_DATAREFS (loop_vinfo),
1894 n_stmts);
1895 if (!res)
1896 {
1897 if (dump_enabled_p ())
1898 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1899 "not vectorized: loop contains function "
1900 "calls or data references that cannot "
1901 "be analyzed\n");
1902 return res;
1903 }
1904 loop_vinfo->shared->save_datarefs ();
1905 }
1906 else
1907 loop_vinfo->shared->check_datarefs ();
1908
1909 /* Analyze the data references and also adjust the minimal
1910 vectorization factor according to the loads and stores. */
1911
1912 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
1913 if (!ok)
1914 {
1915 if (dump_enabled_p ())
1916 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1917 "bad data references.\n");
1918 return ok;
1919 }
1920
1921 /* Classify all cross-iteration scalar data-flow cycles.
1922 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1923 vect_analyze_scalar_cycles (loop_vinfo);
1924
1925 vect_pattern_recog (loop_vinfo);
1926
1927 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1928
1929 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1930 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1931
1932 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1933 if (!ok)
1934 {
1935 if (dump_enabled_p ())
1936 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1937 "bad data access.\n");
1938 return ok;
1939 }
1940
1941 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1942
1943 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
1944 if (!ok)
1945 {
1946 if (dump_enabled_p ())
1947 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1948 "unexpected pattern.\n");
1949 return ok;
1950 }
1951
1952 /* The rest of the analysis below depends on the vector size in some way, so from here on failures are not necessarily fatal. */
1953 fatal = false;
1954
1955 /* Analyze data dependences between the data-refs in the loop
1956 and adjust the maximum vectorization factor according to
1957 the dependences.
1958 FORNOW: fail at the first data dependence that we encounter. */
1959
1960 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1961 if (!ok)
1962 {
1963 if (dump_enabled_p ())
1964 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1965 "bad data dependence.\n");
1966 return ok;
1967 }
1968 if (max_vf != MAX_VECTORIZATION_FACTOR
1969 && maybe_lt (max_vf, min_vf))
1970 return opt_result::failure_at (vect_location, "bad data dependence.\n");
1971 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1972
1973 ok = vect_determine_vectorization_factor (loop_vinfo);
1974 if (!ok)
1975 {
1976 if (dump_enabled_p ())
1977 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1978 "can't determine vectorization factor.\n");
1979 return ok;
1980 }
1981 if (max_vf != MAX_VECTORIZATION_FACTOR
1982 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1983 return opt_result::failure_at (vect_location, "bad data dependence.\n");
1984
1985 /* Compute the scalar iteration cost. */
1986 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1987
1988 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1989
1990 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1991 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
1992 if (!ok)
1993 return ok;
1994
1995 /* If there are any SLP instances mark them as pure_slp. */
1996 bool slp = vect_make_slp_decision (loop_vinfo);
1997 if (slp)
1998 {
1999 /* Find stmts that need to be both vectorized and SLPed. */
2000 vect_detect_hybrid_slp (loop_vinfo);
2001
2002 /* Update the vectorization factor based on the SLP decision. */
2003 vect_update_vf_for_slp (loop_vinfo);
2004 }
2005
2006 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
2007
2008 /* We don't expect to have to roll back to anything other than an empty
2009 set of rgroups. */
2010 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2011
2012 /* This is the point where we can re-start analysis with SLP forced off. */
2013 start_over:
2014
2015 /* Now the vectorization factor is final. */
2016 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2017 gcc_assert (known_ne (vectorization_factor, 0U));
2018
2019 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2020 {
2021 dump_printf_loc (MSG_NOTE, vect_location,
2022 "vectorization_factor = ");
2023 dump_dec (MSG_NOTE, vectorization_factor);
2024 dump_printf (MSG_NOTE, ", niters = %wd\n",
2025 LOOP_VINFO_INT_NITERS (loop_vinfo));
2026 }
2027
2028 /* Analyze the alignment of the data-refs in the loop.
2029 Fail if a data reference is found that cannot be vectorized. */
2030
2031 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2032 if (!ok)
2033 {
2034 if (dump_enabled_p ())
2035 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2036 "bad data alignment.\n");
2037 return ok;
2038 }
2039
2040 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2041 It is important to call pruning after vect_analyze_data_ref_accesses,
2042 since we use grouping information gathered by interleaving analysis. */
2043 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2044 if (!ok)
2045 return ok;
2046
2047 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2048 vectorization, since we do not want to add extra peeling or
2049 add versioning for alignment. */
2050 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2051 /* This pass will decide on using loop versioning and/or loop peeling in
2052 order to enhance the alignment of data references in the loop. */
2053 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2054 else
2055 ok = vect_verify_datarefs_alignment (loop_vinfo);
2056 if (!ok)
2057 return ok;
2058
2059 if (slp)
2060 {
2061 /* Analyze operations in the SLP instances. Note this may
2062 remove unsupported SLP instances which makes the above
2063 SLP kind detection invalid. */
2064 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2065 vect_slp_analyze_operations (loop_vinfo);
2066 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2067 {
2068 ok = opt_result::failure_at (vect_location,
2069 "unsupported SLP instances\n");
2070 goto again;
2071 }
2072 }
2073
2074 /* Dissolve SLP-only groups. */
2075 vect_dissolve_slp_only_groups (loop_vinfo);
2076
2077 /* Scan all the remaining operations in the loop that are not subject
2078 to SLP and make sure they are vectorizable. */
2079 ok = vect_analyze_loop_operations (loop_vinfo);
2080 if (!ok)
2081 {
2082 if (dump_enabled_p ())
2083 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2084 "bad operation or unsupported loop bound.\n");
2085 return ok;
2086 }
2087
2088 /* Decide whether to use a fully-masked loop for this vectorization
2089 factor. */
2090 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2091 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2092 && vect_verify_full_masking (loop_vinfo));
2093 if (dump_enabled_p ())
2094 {
2095 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2096 dump_printf_loc (MSG_NOTE, vect_location,
2097 "using a fully-masked loop.\n");
2098 else
2099 dump_printf_loc (MSG_NOTE, vect_location,
2100 "not using a fully-masked loop.\n");
2101 }
2102
2103 /* If an epilogue loop is required because of data accesses with gaps,
2104 one additional iteration needs to be peeled. Check whether there are
2105 enough iterations for vectorization. */
2106 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2107 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2108 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2109 {
2110 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2111 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2112
2113 if (known_lt (wi::to_widest (scalar_niters), vf))
2114 return opt_result::failure_at (vect_location,
2115 "loop has not enough iterations to"
2116 " support peeling for gaps.\n");
2117 }
2118
2119 /* Check the costings of the loop make vectorizing worthwhile. */
2120 res = vect_analyze_loop_costing (loop_vinfo);
2121 if (res < 0)
2122 {
2123 ok = opt_result::failure_at (vect_location,
2124 "Loop costings may not be worthwhile.\n");
2125 goto again;
2126 }
2127 if (!res)
2128 return opt_result::failure_at (vect_location,
2129 "Loop costings not worthwhile.\n");
2130
2131 determine_peel_for_niter (loop_vinfo);
2132 /* If an epilogue loop is required make sure we can create one. */
2133 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2134 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2135 {
2136 if (dump_enabled_p ())
2137 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2138 if (!vect_can_advance_ivs_p (loop_vinfo)
2139 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2140 single_exit (LOOP_VINFO_LOOP
2141 (loop_vinfo))))
2142 {
2143 ok = opt_result::failure_at (vect_location,
2144 "not vectorized: can't create required "
2145 "epilog loop\n");
2146 goto again;
2147 }
2148 }
2149
2150 /* During peeling, we need to check whether the number of loop iterations
2151 is enough for both the peeled prologue loop and the vector loop. This
2152 check can be merged with the threshold check of loop versioning, so
2153 increase the threshold for this case if necessary. */
2154 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2155 {
2156 poly_uint64 niters_th = 0;
2157 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2158
2159 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2160 {
2161 /* Niters for peeled prolog loop. */
2162 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2163 {
2164 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2165 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2166 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2167 }
2168 else
2169 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2170 }
2171
2172 /* Niters for at least one iteration of vectorized loop. */
2173 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2174 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2175 /* One additional iteration because of peeling for gap. */
2176 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2177 niters_th += 1;
2178
2179 /* Use the same condition as vect_transform_loop to decide when to use
2180 the cost to determine a versioning threshold. */
2181 if (th >= vect_vf_for_cost (loop_vinfo)
2182 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2183 && ordered_p (th, niters_th))
2184 niters_th = ordered_max (poly_uint64 (th), niters_th);
2185
2186 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2187 }
2188
2189 gcc_assert (known_eq (vectorization_factor,
2190 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2191
2192 /* Ok to vectorize! */
2193 return opt_result::success ();
2194
2195 again:
2196 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2197 gcc_assert (!ok);
2198
2199 /* Try again with SLP forced off, but if we didn't do any SLP there is
2200 no point in re-trying. */
2201 if (!slp)
2202 return ok;
2203
2204 /* If there are reduction chains re-trying will fail anyway. */
2205 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2206 return ok;
2207
2208 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2209 via interleaving or lane instructions. */
2210 slp_instance instance;
2211 slp_tree node;
2212 unsigned i, j;
2213 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2214 {
2215 stmt_vec_info vinfo;
2216 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2217 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2218 continue;
2219 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2220 unsigned int size = DR_GROUP_SIZE (vinfo);
2221 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2222 if (! vect_store_lanes_supported (vectype, size, false)
2223 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2224 && ! vect_grouped_store_supported (vectype, size))
2225 return opt_result::failure_at (vinfo->stmt,
2226 "unsupported grouped store\n");
2227 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2228 {
2229 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2230 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2231 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2232 size = DR_GROUP_SIZE (vinfo);
2233 vectype = STMT_VINFO_VECTYPE (vinfo);
2234 if (! vect_load_lanes_supported (vectype, size, false)
2235 && ! vect_grouped_load_supported (vectype, single_element_p,
2236 size))
2237 return opt_result::failure_at (vinfo->stmt,
2238 "unsupported grouped load\n");
2239 }
2240 }
2241
2242 if (dump_enabled_p ())
2243 dump_printf_loc (MSG_NOTE, vect_location,
2244 "re-trying with SLP disabled\n");
2245
2246 /* Roll back state appropriately. No SLP this time. */
2247 slp = false;
2248 /* Restore the vectorization factor as it was without SLP. */
2249 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2250 /* Free the SLP instances. */
2251 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2252 vect_free_slp_instance (instance, false);
2253 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2254 /* Reset SLP type to loop_vect on all stmts. */
2255 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2256 {
2257 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2258 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2259 !gsi_end_p (si); gsi_next (&si))
2260 {
2261 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2262 STMT_SLP_TYPE (stmt_info) = loop_vect;
2263 }
2264 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2265 !gsi_end_p (si); gsi_next (&si))
2266 {
2267 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2268 STMT_SLP_TYPE (stmt_info) = loop_vect;
2269 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2270 {
2271 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2272 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2273 STMT_SLP_TYPE (stmt_info) = loop_vect;
2274 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2275 !gsi_end_p (pi); gsi_next (&pi))
2276 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2277 = loop_vect;
2278 }
2279 }
2280 }
2281 /* Free optimized alias test DDRS. */
2282 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2283 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2284 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2285 /* Reset target cost data. */
2286 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2287 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2288 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2289 /* Reset accumulated rgroup information. */
2290 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2291 /* Reset assorted flags. */
2292 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2293 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2294 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2295 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2296 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2297
2298 goto start_over;
2299 }
2300
2301 /* Function vect_analyze_loop.
2302
2303 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2304 for it. The different analyses will record information in the
2305 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL, LOOP is an epilogue
2306 of the loop described by ORIG_LOOP_VINFO and must itself be vectorized. */
2307 opt_loop_vec_info
2308 vect_analyze_loop (class loop *loop, loop_vec_info orig_loop_vinfo,
2309 vec_info_shared *shared)
2310 {
2311 auto_vector_sizes vector_sizes;
2312
2313 /* Autodetect first vector size we try. */
2314 current_vector_size = 0;
2315 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes,
2316 loop->simdlen != 0);
2317 unsigned int next_size = 0;
2318
2319 DUMP_VECT_SCOPE ("analyze_loop_nest");
2320
2321 if (loop_outer (loop)
2322 && loop_vec_info_for_loop (loop_outer (loop))
2323 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2324 return opt_loop_vec_info::failure_at (vect_location,
2325 "outer-loop already vectorized.\n");
2326
2327 if (!find_loop_nest (loop, &shared->loop_nest))
2328 return opt_loop_vec_info::failure_at
2329 (vect_location,
2330 "not vectorized: loop nest containing two or more consecutive inner"
2331 " loops cannot be vectorized\n");
2332
2333 unsigned n_stmts = 0;
2334 poly_uint64 autodetected_vector_size = 0;
2335 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2336 poly_uint64 first_vector_size = 0;
2337 while (1)
2338 {
2339 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2340 opt_loop_vec_info loop_vinfo
2341 = vect_analyze_loop_form (loop, shared);
2342 if (!loop_vinfo)
2343 {
2344 if (dump_enabled_p ())
2345 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2346 "bad loop form.\n");
2347 gcc_checking_assert (first_loop_vinfo == NULL);
2348 return loop_vinfo;
2349 }
2350
2351 bool fatal = false;
2352
2353 if (orig_loop_vinfo)
2354 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2355
2356 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2357 if (res)
2358 {
2359 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2360
2361 if (loop->simdlen
2362 && maybe_ne (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2363 (unsigned HOST_WIDE_INT) loop->simdlen))
2364 {
2365 if (first_loop_vinfo == NULL)
2366 {
2367 first_loop_vinfo = loop_vinfo;
2368 first_vector_size = current_vector_size;
2369 loop->aux = NULL;
2370 }
2371 else
2372 delete loop_vinfo;
2373 }
2374 else
2375 {
2376 delete first_loop_vinfo;
2377 return loop_vinfo;
2378 }
2379 }
2380 else
2381 delete loop_vinfo;
2382
2383 if (next_size == 0)
2384 autodetected_vector_size = current_vector_size;
2385
2386 if (next_size < vector_sizes.length ()
2387 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2388 next_size += 1;
2389
2390 if (fatal)
2391 {
2392 gcc_checking_assert (first_loop_vinfo == NULL);
2393 return opt_loop_vec_info::propagate_failure (res);
2394 }
2395
2396 if (next_size == vector_sizes.length ()
2397 || known_eq (current_vector_size, 0U))
2398 {
2399 if (first_loop_vinfo)
2400 {
2401 current_vector_size = first_vector_size;
2402 loop->aux = (loop_vec_info) first_loop_vinfo;
2403 if (dump_enabled_p ())
2404 {
2405 dump_printf_loc (MSG_NOTE, vect_location,
2406 "***** Choosing vector size ");
2407 dump_dec (MSG_NOTE, current_vector_size);
2408 dump_printf (MSG_NOTE, "\n");
2409 }
2410 return first_loop_vinfo;
2411 }
2412 else
2413 return opt_loop_vec_info::propagate_failure (res);
2414 }
2415
2416 /* Try the next biggest vector size. */
2417 current_vector_size = vector_sizes[next_size++];
2418 if (dump_enabled_p ())
2419 {
2420 dump_printf_loc (MSG_NOTE, vect_location,
2421 "***** Re-trying analysis with "
2422 "vector size ");
2423 dump_dec (MSG_NOTE, current_vector_size);
2424 dump_printf (MSG_NOTE, "\n");
2425 }
2426 }
2427 }
2428
2429 /* Return true if there is an in-order reduction function for CODE, storing
2430 it in *REDUC_FN if so. */
2431
2432 static bool
2433 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2434 {
2435 switch (code)
2436 {
2437 case PLUS_EXPR:
2438 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2439 return true;
2440
2441 default:
2442 return false;
2443 }
2444 }
2445
2446 /* Function reduction_fn_for_scalar_code
2447
2448 Input:
2449 CODE - tree_code of a reduction operation.
2450
2451 Output:
2452 REDUC_FN - the corresponding internal function to be used to reduce the
2453 vector of partial results into a single scalar result, or IFN_LAST
2454 if the operation is a supported reduction operation, but does not have
2455 such an internal function.
2456
2457 Return FALSE if CODE currently cannot be vectorized as reduction. */
2458
2459 static bool
2460 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2461 {
2462 switch (code)
2463 {
2464 case MAX_EXPR:
2465 *reduc_fn = IFN_REDUC_MAX;
2466 return true;
2467
2468 case MIN_EXPR:
2469 *reduc_fn = IFN_REDUC_MIN;
2470 return true;
2471
2472 case PLUS_EXPR:
2473 *reduc_fn = IFN_REDUC_PLUS;
2474 return true;
2475
2476 case BIT_AND_EXPR:
2477 *reduc_fn = IFN_REDUC_AND;
2478 return true;
2479
2480 case BIT_IOR_EXPR:
2481 *reduc_fn = IFN_REDUC_IOR;
2482 return true;
2483
2484 case BIT_XOR_EXPR:
2485 *reduc_fn = IFN_REDUC_XOR;
2486 return true;
2487
2488 case MULT_EXPR:
2489 case MINUS_EXPR:
2490 *reduc_fn = IFN_LAST;
2491 return true;
2492
2493 default:
2494 return false;
2495 }
2496 }
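/* Illustrative example only: for the hypothetical source loop

     int m = a[0];
     for (int i = 1; i < n; i++)
       m = m > a[i] ? m : a[i];

   the reduction operation is MAX_EXPR, so the vector of partial maxima is
   reduced to a scalar with IFN_REDUC_MAX.  A MULT_EXPR reduction is still
   accepted but gets IFN_LAST, meaning the final reduction has to be
   open-coded instead of using a single internal function.  */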
2497
2498 /* If there is a neutral value X such that SLP reduction NODE would not
2499 be affected by the introduction of additional X elements, return that X,
2500 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2501 is true if the SLP statements perform a single reduction, false if each
2502 statement performs an independent reduction. */
2503
2504 static tree
2505 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2506 bool reduc_chain)
2507 {
2508 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2509 stmt_vec_info stmt_vinfo = stmts[0];
2510 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2511 tree scalar_type = TREE_TYPE (vector_type);
2512 class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2513 gcc_assert (loop);
2514
2515 switch (code)
2516 {
2517 case WIDEN_SUM_EXPR:
2518 case DOT_PROD_EXPR:
2519 case SAD_EXPR:
2520 case PLUS_EXPR:
2521 case MINUS_EXPR:
2522 case BIT_IOR_EXPR:
2523 case BIT_XOR_EXPR:
2524 return build_zero_cst (scalar_type);
2525
2526 case MULT_EXPR:
2527 return build_one_cst (scalar_type);
2528
2529 case BIT_AND_EXPR:
2530 return build_all_ones_cst (scalar_type);
2531
2532 case MAX_EXPR:
2533 case MIN_EXPR:
2534 /* For MIN/MAX the initial values are neutral. A reduction chain
2535 has only a single initial value, so that value is neutral for
2536 all statements. */
2537 if (reduc_chain)
2538 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2539 loop_preheader_edge (loop));
2540 return NULL_TREE;
2541
2542 default:
2543 return NULL_TREE;
2544 }
2545 }
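/* Illustrative example only: for a hypothetical two-lane SLP reduction

     s0 += a[2*i];
     s1 += a[2*i+1];

   padding the vector with the neutral value 0 leaves both partial sums
   unchanged.  Likewise 1 is neutral for MULT_EXPR and all-ones for
   BIT_AND_EXPR, whereas MIN/MAX have no general neutral value and can
   only reuse the single initial value of a reduction chain.  */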
2546
2547 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2548 STMT is printed with a message MSG. */
2549
2550 static void
2551 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2552 {
2553 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
2554 }
2555
2556 /* Return true if we need an in-order reduction for operation CODE
2557 on type TYPE, i.e. whether the reduction must be computed strictly
2558 in the original (left-to-right) evaluation order. */
2559
2560 bool
2561 needs_fold_left_reduction_p (tree type, tree_code code)
2562 {
2563 /* CHECKME: check for !flag_finite_math_only too? */
2564 if (SCALAR_FLOAT_TYPE_P (type))
2565 switch (code)
2566 {
2567 case MIN_EXPR:
2568 case MAX_EXPR:
2569 return false;
2570
2571 default:
2572 return !flag_associative_math;
2573 }
2574
2575 if (INTEGRAL_TYPE_P (type))
2576 {
2577 if (!operation_no_trapping_overflow (type, code))
2578 return true;
2579 return false;
2580 }
2581
2582 if (SAT_FIXED_POINT_TYPE_P (type))
2583 return true;
2584
2585 return false;
2586 }
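/* Illustrative example only: unless -fassociative-math is in effect, the
   hypothetical accumulation

     double s = 0.0;
     for (int i = 0; i < n; i++)
       s += x[i];

   must preserve the original left-to-right evaluation order, so this
   function returns true and the reduction can only be vectorized with the
   in-order strategy selected by fold_left_reduction_fn above
   (IFN_FOLD_LEFT_PLUS for PLUS_EXPR).  */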
2587
2588 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2589 reduction operation CODE has a handled computation expression. */
2590
2591 static bool
2592 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2593 tree loop_arg, enum tree_code code,
2594 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
2595 {
2596 auto_bitmap visited;
2597 tree lookfor = PHI_RESULT (phi);
2598 ssa_op_iter curri;
2599 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2600 while (USE_FROM_PTR (curr) != loop_arg)
2601 curr = op_iter_next_use (&curri);
2602 curri.i = curri.numops;
2603 do
2604 {
2605 path.safe_push (std::make_pair (curri, curr));
2606 tree use = USE_FROM_PTR (curr);
2607 if (use == lookfor)
2608 break;
2609 gimple *def = SSA_NAME_DEF_STMT (use);
2610 if (gimple_nop_p (def)
2611 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2612 {
2613 pop:
2614 do
2615 {
2616 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2617 curri = x.first;
2618 curr = x.second;
2619 do
2620 curr = op_iter_next_use (&curri);
2621 /* Skip already visited or non-SSA operands (from iterating
2622 over PHI args). */
2623 while (curr != NULL_USE_OPERAND_P
2624 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2625 || ! bitmap_set_bit (visited,
2626 SSA_NAME_VERSION
2627 (USE_FROM_PTR (curr)))));
2628 }
2629 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2630 if (curr == NULL_USE_OPERAND_P)
2631 break;
2632 }
2633 else
2634 {
2635 if (gimple_code (def) == GIMPLE_PHI)
2636 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2637 else
2638 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2639 while (curr != NULL_USE_OPERAND_P
2640 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2641 || ! bitmap_set_bit (visited,
2642 SSA_NAME_VERSION
2643 (USE_FROM_PTR (curr)))))
2644 curr = op_iter_next_use (&curri);
2645 if (curr == NULL_USE_OPERAND_P)
2646 goto pop;
2647 }
2648 }
2649 while (1);
2650 if (dump_file && (dump_flags & TDF_DETAILS))
2651 {
2652 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2653 unsigned i;
2654 std::pair<ssa_op_iter, use_operand_p> *x;
2655 FOR_EACH_VEC_ELT (path, i, x)
2656 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
2657 dump_printf (MSG_NOTE, "\n");
2658 }
2659
2660 /* Check whether the reduction path detected is valid. */
2661 bool fail = path.length () == 0;
2662 bool neg = false;
2663 for (unsigned i = 1; i < path.length (); ++i)
2664 {
2665 gimple *use_stmt = USE_STMT (path[i].second);
2666 tree op = USE_FROM_PTR (path[i].second);
2667 if (! has_single_use (op)
2668 || ! is_gimple_assign (use_stmt)
2669 /* The following makes sure we can compute the operand index
2670 easily, and it mostly disallows chaining via COND_EXPR condition
2671 operands. */
2672 || (gimple_assign_rhs1 (use_stmt) != op
2673 && gimple_assign_rhs2 (use_stmt) != op
2674 && gimple_assign_rhs3 (use_stmt) != op))
2675 {
2676 fail = true;
2677 break;
2678 }
2679 if (gimple_assign_rhs_code (use_stmt) != code)
2680 {
2681 if (code == PLUS_EXPR
2682 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2683 {
2684 /* Track whether we negate the reduction value each iteration. */
2685 if (gimple_assign_rhs2 (use_stmt) == op)
2686 neg = ! neg;
2687 }
2688 else
2689 {
2690 fail = true;
2691 break;
2692 }
2693 }
2694 }
2695 return ! fail && ! neg;
2696 }
2697
2698 bool
2699 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2700 tree loop_arg, enum tree_code code)
2701 {
2702 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2703 return check_reduction_path (loc, loop, phi, loop_arg, code, path);
2704 }
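/* Illustrative example only: for the hypothetical loop body

     tmp = sum + a[i];
     sum = tmp + b[i];

   the path walked above leads from the latch definition of sum through
   tmp back to the loop PHI result, and consists of two PLUS_EXPR
   statements each using the previous value exactly once, so it is
   accepted as a valid multi-statement reduction.  */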
2705
2706
2707
2708 /* Function vect_is_simple_reduction
2709
2710 (1) Detect a cross-iteration def-use cycle that represents a simple
2711 reduction computation. We look for the following pattern:
2712
2713 loop_header:
2714 a1 = phi < a0, a2 >
2715 a3 = ...
2716 a2 = operation (a3, a1)
2717
2718 or
2719
2720 a3 = ...
2721 loop_header:
2722 a1 = phi < a0, a2 >
2723 a2 = operation (a3, a1)
2724
2725 such that:
2726 1. operation is commutative and associative and it is safe to
2727 change the order of the computation
2728 2. no uses for a2 in the loop (a2 is used out of the loop)
2729 3. no uses of a1 in the loop besides the reduction operation
2730 4. no uses of a1 outside the loop.
2731
2732 Conditions 1,4 are tested here.
2733 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2734
2735 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2736 nested cycles.
2737
2738 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2739 reductions:
2740
2741 a1 = phi < a0, a2 >
2742 inner loop (def of a3)
2743 a2 = phi < a3 >
2744
2745 (4) Detect condition expressions, i.e.:
2746 for (int i = 0; i < N; i++)
2747 if (a[i] < val)
2748 ret_val = a[i];
2749
2750 */
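/* Illustrative example only: a hypothetical double reduction as in (3),
   written in source form, is

     int sum = 0;
     for (int j = 0; j < m; j++)
       for (int i = 0; i < n; i++)
         sum += a[j][i];

   where the outer-loop PHI for sum receives the inner-loop reduction
   result through a loop-closed PHI of the inner loop.  */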
2751
2752 static stmt_vec_info
2753 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
2754 bool *double_reduc)
2755 {
2756 gphi *phi = as_a <gphi *> (phi_info->stmt);
2757 gimple *phi_use_stmt = NULL;
2758 imm_use_iterator imm_iter;
2759 use_operand_p use_p;
2760
2761 *double_reduc = false;
2762 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
2763
2764 tree phi_name = PHI_RESULT (phi);
2765 /* ??? If there are no uses of the PHI result the inner loop reduction
2766 won't be detected as possibly double-reduction by vectorizable_reduction
2767 because that tries to walk the PHI arg from the preheader edge which
2768 can be constant. See PR60382. */
2769 if (has_zero_uses (phi_name))
2770 return NULL;
2771 class loop *loop = (gimple_bb (phi))->loop_father;
2772 unsigned nphi_def_loop_uses = 0;
2773 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2774 {
2775 gimple *use_stmt = USE_STMT (use_p);
2776 if (is_gimple_debug (use_stmt))
2777 continue;
2778
2779 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2780 {
2781 if (dump_enabled_p ())
2782 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2783 "intermediate value used outside loop.\n");
2784
2785 return NULL;
2786 }
2787
2788 nphi_def_loop_uses++;
2789 phi_use_stmt = use_stmt;
2790 }
2791
2792 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
2793 if (TREE_CODE (latch_def) != SSA_NAME)
2794 {
2795 if (dump_enabled_p ())
2796 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2797 "reduction: not ssa_name: %T\n", latch_def);
2798 return NULL;
2799 }
2800
2801 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
2802 if (!def_stmt_info
2803 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
2804 return NULL;
2805
2806 bool nested_in_vect_loop
2807 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
2808 unsigned nlatch_def_loop_uses = 0;
2809 auto_vec<gphi *, 3> lcphis;
2810 bool inner_loop_of_double_reduc = false;
2811 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
2812 {
2813 gimple *use_stmt = USE_STMT (use_p);
2814 if (is_gimple_debug (use_stmt))
2815 continue;
2816 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2817 nlatch_def_loop_uses++;
2818 else
2819 {
2820 /* We can have more than one loop-closed PHI. */
2821 lcphis.safe_push (as_a <gphi *> (use_stmt));
2822 if (nested_in_vect_loop
2823 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
2824 == vect_double_reduction_def))
2825 inner_loop_of_double_reduc = true;
2826 }
2827 }
2828
2829 /* If we are vectorizing an inner reduction, we execute it in the
2830 original order only when we are not dealing with a double
2831 reduction. */
2832 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
2833 {
2834 if (dump_enabled_p ())
2835 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
2836 "detected nested cycle: ");
2837 return def_stmt_info;
2838 }
2839
2840 /* If this isn't a nested cycle, or if the nested cycle reduction value
2841 is used outside of the inner loop, we cannot handle uses of the
2842 reduction value. */
2843 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
2844 {
2845 if (dump_enabled_p ())
2846 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2847 "reduction used in loop.\n");
2848 return NULL;
2849 }
2850
2851 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2852 defined in the inner loop. */
2853 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
2854 {
2855 tree op1 = PHI_ARG_DEF (def_stmt, 0);
2856 if (gimple_phi_num_args (def_stmt) != 1
2857 || TREE_CODE (op1) != SSA_NAME)
2858 {
2859 if (dump_enabled_p ())
2860 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2861 "unsupported phi node definition.\n");
2862
2863 return NULL;
2864 }
2865
2866 gimple *def1 = SSA_NAME_DEF_STMT (op1);
2867 if (gimple_bb (def1)
2868 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2869 && loop->inner
2870 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2871 && is_gimple_assign (def1)
2872 && is_a <gphi *> (phi_use_stmt)
2873 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2874 {
2875 if (dump_enabled_p ())
2876 report_vect_op (MSG_NOTE, def_stmt,
2877 "detected double reduction: ");
2878
2879 *double_reduc = true;
2880 return def_stmt_info;
2881 }
2882
2883 return NULL;
2884 }
2885
2886 gassign *def_stmt = dyn_cast <gassign *> (def_stmt_info->stmt);
2887 if (!def_stmt)
2888 {
2889 if (dump_enabled_p ())
2890 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2891 "reduction: unhandled reduction operation: %G",
2892 def_stmt_info->stmt);
2893 return NULL;
2894 }
2895 enum tree_code code = gimple_assign_rhs_code (def_stmt);
2896
2897 /* We can handle "res -= x[i]", which is non-associative, by
2898 simply rewriting it as "res += -x[i]". Avoid changing the
2899 gimple instruction for the first simple tests and only do this
2900 if we're allowed to change the code at all. */
2901 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
2902 code = PLUS_EXPR;
2903
2904 tree op1, op2;
2905 if (code == COND_EXPR)
2906 {
2907 if (! nested_in_vect_loop)
2908 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
2909 op1 = gimple_assign_rhs2 (def_stmt);
2910 op2 = gimple_assign_rhs3 (def_stmt);
2911 }
2912 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
2913 {
2914 op1 = gimple_assign_rhs1 (def_stmt);
2915 op2 = gimple_assign_rhs2 (def_stmt);
2916 }
2917 else
2918 {
2919 if (dump_enabled_p ())
2920 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2921 "reduction: not handled operation: ");
2922 return NULL;
2923 }
2924
2925 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2926 {
2927 if (dump_enabled_p ())
2928 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2929 "reduction: both uses not ssa_names: ");
2930
2931 return NULL;
2932 }
2933
2934 /* Reduction is safe. We're dealing with one of the following:
2935 1) integer arithmetic and no trapv
2936 2) floating point arithmetic, and special flags permit this optimization
2937 3) nested cycle (i.e., outer loop vectorization). */
2938
2939 /* Check for the simple case that one def is the reduction def,
2940 defined by the PHI node. */
2941 stmt_vec_info def1_info = loop_info->lookup_def (op1);
2942 stmt_vec_info def2_info = loop_info->lookup_def (op2);
2943 if (def2_info && def2_info->stmt == phi)
2944 {
2945 STMT_VINFO_REDUC_IDX (def_stmt_info) = 1 + (code == COND_EXPR ? 1 : 0);
2946 if (dump_enabled_p ())
2947 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2948 return def_stmt_info;
2949 }
2950 else if (def1_info && def1_info->stmt == phi)
2951 {
2952 STMT_VINFO_REDUC_IDX (def_stmt_info) = 0 + (code == COND_EXPR ? 1 : 0);
2953 if (dump_enabled_p ())
2954 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2955 return def_stmt_info;
2956 }
2957
2958 /* Look for the expression computing latch_def from the loop PHI result
2959 in a way involving more than one stmt. */
2960 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2961 if (check_reduction_path (vect_location, loop, phi, latch_def, code,
2962 path))
2963 {
2964 /* Try building an SLP reduction chain for which the additional
2965 restriction is that all operations in the chain are the same. */
2966 auto_vec<stmt_vec_info, 8> reduc_chain;
2967 unsigned i;
2968 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
2969 for (i = path.length () - 1; i >= 1; --i)
2970 {
2971 gimple *stmt = USE_STMT (path[i].second);
2972 if (gimple_assign_rhs_code (stmt) != code)
2973 is_slp_reduc = false;
2974 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
2975 STMT_VINFO_REDUC_IDX (stmt_info)
2976 = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
2977 reduc_chain.safe_push (stmt_info);
2978 }
2979 if (is_slp_reduc)
2980 {
2981 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
2982 {
2983 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
2984 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
2985 }
2986 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
2987 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
2988
2989 /* Save the chain for further analysis in SLP detection. */
2990 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
2991 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
2992
2993 if (dump_enabled_p ())
2994 report_vect_op (MSG_NOTE, def_stmt,
2995 "reduction: detected reduction chain: ");
2996 }
2997
2998 return def_stmt_info;
2999 }
3000
3001 if (dump_enabled_p ())
3002 {
3003 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3004 "reduction: unknown pattern: ");
3005 }
3006
3007 return NULL;
3008 }
3009
3010 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3011 int
3012 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3013 int *peel_iters_epilogue,
3014 stmt_vector_for_cost *scalar_cost_vec,
3015 stmt_vector_for_cost *prologue_cost_vec,
3016 stmt_vector_for_cost *epilogue_cost_vec)
3017 {
3018 int retval = 0;
3019 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3020
3021 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3022 {
3023 *peel_iters_epilogue = assumed_vf / 2;
3024 if (dump_enabled_p ())
3025 dump_printf_loc (MSG_NOTE, vect_location,
3026 "cost model: epilogue peel iters set to vf/2 "
3027 "because loop iterations are unknown.\n");
3028
3029 /* If peeled iterations are known but the number of scalar loop
3030 iterations is unknown, count a taken branch per peeled loop. */
3031 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3032 NULL, 0, vect_prologue);
3033 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3034 NULL, 0, vect_epilogue);
3035 }
3036 else
3037 {
3038 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3039 peel_iters_prologue = niters < peel_iters_prologue ?
3040 niters : peel_iters_prologue;
3041 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3042 /* If we need to peel for gaps but no epilogue peeling would otherwise
3043 be required, we have to peel VF iterations. */
3044 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3045 *peel_iters_epilogue = assumed_vf;
3046 }
3047
3048 stmt_info_for_cost *si;
3049 int j;
3050 if (peel_iters_prologue)
3051 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3052 retval += record_stmt_cost (prologue_cost_vec,
3053 si->count * peel_iters_prologue,
3054 si->kind, si->stmt_info, si->misalign,
3055 vect_prologue);
3056 if (*peel_iters_epilogue)
3057 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3058 retval += record_stmt_cost (epilogue_cost_vec,
3059 si->count * *peel_iters_epilogue,
3060 si->kind, si->stmt_info, si->misalign,
3061 vect_epilogue);
3062
3063 return retval;
3064 }
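/* Illustrative example only (hypothetical counts): with known niters == 100,
   peel_iters_prologue == 3 and an assumed vectorization factor of 8, the
   epilogue peel count computed above is (100 - 3) % 8 == 1, so the scalar
   single-iteration costs are charged three times to the prologue and once
   to the epilogue.  */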
3065
3066 /* Function vect_estimate_min_profitable_iters
3067
3068 Return the number of iterations required for the vector version of the
3069 loop to be profitable relative to the cost of the scalar version of the
3070 loop.
3071
3072 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3073 of iterations for vectorization. A value of -1 means loop vectorization
3074 is not profitable. This returned value may be used for a dynamic
3075 profitability check.
3076
3077 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3078 for static check against estimated number of iterations. */
3079
3080 static void
3081 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3082 int *ret_min_profitable_niters,
3083 int *ret_min_profitable_estimate)
3084 {
3085 int min_profitable_iters;
3086 int min_profitable_estimate;
3087 int peel_iters_prologue;
3088 int peel_iters_epilogue;
3089 unsigned vec_inside_cost = 0;
3090 int vec_outside_cost = 0;
3091 unsigned vec_prologue_cost = 0;
3092 unsigned vec_epilogue_cost = 0;
3093 int scalar_single_iter_cost = 0;
3094 int scalar_outside_cost = 0;
3095 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3096 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3097 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3098
3099 /* Cost model disabled. */
3100 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3101 {
3102 if (dump_enabled_p ())
3103 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3104 *ret_min_profitable_niters = 0;
3105 *ret_min_profitable_estimate = 0;
3106 return;
3107 }
3108
3109 /* Requires loop versioning tests to handle misalignment. */
3110 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3111 {
3112 /* FIXME: Make cost depend on complexity of individual check. */
3113 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3114 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3115 vect_prologue);
3116 if (dump_enabled_p ())
3117 dump_printf (MSG_NOTE,
3118 "cost model: Adding cost of checks for loop "
3119 "versioning to treat misalignment.\n");
3120 }
3121
3122 /* Requires loop versioning with alias checks. */
3123 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3124 {
3125 /* FIXME: Make cost depend on complexity of individual check. */
3126 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3127 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3128 vect_prologue);
3129 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3130 if (len)
3131 /* Count LEN - 1 ANDs and LEN comparisons. */
3132 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3133 NULL, 0, vect_prologue);
3134 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3135 if (len)
3136 {
3137 /* Count LEN - 1 ANDs and LEN comparisons. */
3138 unsigned int nstmts = len * 2 - 1;
3139 /* +1 for each bias that needs adding. */
3140 for (unsigned int i = 0; i < len; ++i)
3141 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3142 nstmts += 1;
3143 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3144 NULL, 0, vect_prologue);
3145 }
3146 if (dump_enabled_p ())
3147 dump_printf (MSG_NOTE,
3148 "cost model: Adding cost of checks for loop "
3149 "versioning aliasing.\n");
3150 }
3151
3152 /* Requires loop versioning with niter checks. */
3153 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3154 {
3155 /* FIXME: Make cost depend on complexity of individual check. */
3156 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3157 vect_prologue);
3158 if (dump_enabled_p ())
3159 dump_printf (MSG_NOTE,
3160 "cost model: Adding cost of checks for loop "
3161 "versioning niters.\n");
3162 }
3163
3164 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3165 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3166 vect_prologue);
3167
3168 /* Count statements in scalar loop. Using this as scalar cost for a single
3169 iteration for now.
3170
3171 TODO: Add outer loop support.
3172
3173 TODO: Consider assigning different costs to different scalar
3174 statements. */
3175
3176 scalar_single_iter_cost
3177 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3178
3179 /* Add additional cost for the peeled instructions in prologue and epilogue
3180 loop. (For fully-masked loops there will be no peeling.)
3181
3182 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3183 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3184
3185 TODO: Build an expression that represents peel_iters for prologue and
3186 epilogue to be used in a run-time test. */
3187
3188 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3189 {
3190 peel_iters_prologue = 0;
3191 peel_iters_epilogue = 0;
3192
3193 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3194 {
3195 /* We need to peel exactly one iteration. */
3196 peel_iters_epilogue += 1;
3197 stmt_info_for_cost *si;
3198 int j;
3199 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3200 j, si)
3201 (void) add_stmt_cost (target_cost_data, si->count,
3202 si->kind, si->stmt_info, si->misalign,
3203 vect_epilogue);
3204 }
3205 }
3206 else if (npeel < 0)
3207 {
3208 peel_iters_prologue = assumed_vf / 2;
3209 if (dump_enabled_p ())
3210 dump_printf (MSG_NOTE, "cost model: "
3211 "prologue peel iters set to vf/2.\n");
3212
3213 /* If peeling for alignment is unknown, the loop bound of the main loop
3214 becomes unknown. */
3215 peel_iters_epilogue = assumed_vf / 2;
3216 if (dump_enabled_p ())
3217 dump_printf (MSG_NOTE, "cost model: "
3218 "epilogue peel iters set to vf/2 because "
3219 "peeling for alignment is unknown.\n");
3220
3221 /* If peeled iterations are unknown, count a taken branch and a not taken
3222 branch per peeled loop. Even if scalar loop iterations are known,
3223 vector iterations are not known since peeled prologue iterations are
3224 not known. Hence guards remain the same. */
3225 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3226 NULL, 0, vect_prologue);
3227 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3228 NULL, 0, vect_prologue);
3229 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3230 NULL, 0, vect_epilogue);
3231 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3232 NULL, 0, vect_epilogue);
3233 stmt_info_for_cost *si;
3234 int j;
3235 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3236 {
3237 (void) add_stmt_cost (target_cost_data,
3238 si->count * peel_iters_prologue,
3239 si->kind, si->stmt_info, si->misalign,
3240 vect_prologue);
3241 (void) add_stmt_cost (target_cost_data,
3242 si->count * peel_iters_epilogue,
3243 si->kind, si->stmt_info, si->misalign,
3244 vect_epilogue);
3245 }
3246 }
3247 else
3248 {
3249 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3250 stmt_info_for_cost *si;
3251 int j;
3252 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3253
3254 prologue_cost_vec.create (2);
3255 epilogue_cost_vec.create (2);
3256 peel_iters_prologue = npeel;
3257
3258 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3259 &peel_iters_epilogue,
3260 &LOOP_VINFO_SCALAR_ITERATION_COST
3261 (loop_vinfo),
3262 &prologue_cost_vec,
3263 &epilogue_cost_vec);
3264
3265 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3266 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3267 si->misalign, vect_prologue);
3268
3269 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3270 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3271 si->misalign, vect_epilogue);
3272
3273 prologue_cost_vec.release ();
3274 epilogue_cost_vec.release ();
3275 }
3276
3277 /* FORNOW: The scalar outside cost is incremented in one of the
3278 following ways:
3279
3280 1. The vectorizer checks for alignment and aliasing and generates
3281 a condition that allows dynamic vectorization. A cost model
3282 check is ANDED with the versioning condition. Hence scalar code
3283 path now has the added cost of the versioning check.
3284
3285 if (cost > th & versioning_check)
3286 jmp to vector code
3287
3288 Hence the run-time scalar cost is incremented by a not-taken branch cost.
3289
3290 2. The vectorizer then checks if a prologue is required. If the
3291 cost model check was not done before during versioning, it has to
3292 be done before the prologue check.
3293
3294 if (cost <= th)
3295 prologue = scalar_iters
3296 if (prologue == 0)
3297 jmp to vector code
3298 else
3299 execute prologue
3300 if (prologue == num_iters)
3301 go to exit
3302
3303 Hence the run-time scalar cost is incremented by a taken branch,
3304 plus a not-taken branch, plus a taken branch cost.
3305
3306 3. The vectorizer then checks if an epilogue is required. If the
3307 cost model check was not done before during prologue check, it
3308 has to be done with the epilogue check.
3309
3310 if (prologue == 0)
3311 jmp to vector code
3312 else
3313 execute prologue
3314 if (prologue == num_iters)
3315 go to exit
3316 vector code:
3317 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3318 jmp to epilogue
3319
3320 Hence the run-time scalar cost should be incremented by 2 taken
3321 branches.
3322
3323 TODO: The back end may reorder the BBs differently and reverse
3324 conditions/branch directions. Change the estimates below to
3325 something more reasonable. */
3326
3327 /* If the number of iterations is known and we do not do versioning, we can
3328 decide whether to vectorize at compile time. Hence the scalar version
3329 does not carry cost model guard costs. */
3330 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3331 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3332 {
3333 /* Cost model check occurs at versioning. */
3334 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3335 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3336 else
3337 {
3338 /* Cost model check occurs at prologue generation. */
3339 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3340 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3341 + vect_get_stmt_cost (cond_branch_not_taken);
3342 /* Cost model check occurs at epilogue generation. */
3343 else
3344 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3345 }
3346 }
3347
3348 /* Complete the target-specific cost calculations. */
3349 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3350 &vec_inside_cost, &vec_epilogue_cost);
3351
3352 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3353
3354 if (dump_enabled_p ())
3355 {
3356 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3357 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3358 vec_inside_cost);
3359 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3360 vec_prologue_cost);
3361 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3362 vec_epilogue_cost);
3363 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3364 scalar_single_iter_cost);
3365 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3366 scalar_outside_cost);
3367 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3368 vec_outside_cost);
3369 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3370 peel_iters_prologue);
3371 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3372 peel_iters_epilogue);
3373 }
3374
3375 /* Calculate number of iterations required to make the vector version
3376 profitable, relative to the loop bodies only. The following condition
3377 must hold true:
3378 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
3379 where
3380 SIC = scalar iteration cost, VIC = vector iteration cost,
3381 VOC = vector outside cost, VF = vectorization factor,
3382 NPEEL = prologue iterations + epilogue iterations,
3383 SOC = scalar outside cost for run time cost model check. */
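/* Illustrative example only (hypothetical costs): with SIC == 4, VIC == 10,
   VF == 4, VOC == 30, SOC == 6 and NPEEL == 0, the condition above becomes
   4 * niters + 6 > 10 * (niters / 4) + 30, i.e. 1.5 * niters > 24, so the
   vector version starts to pay off at niters >= 17.  */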
3384
3385 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
3386 - vec_inside_cost);
3387 if (saving_per_viter <= 0)
3388 {
3389 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3390 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3391 "vectorization did not happen for a simd loop");
3392
3393 if (dump_enabled_p ())
3394 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3395 "cost model: the vector iteration cost = %d "
3396 "divided by the scalar iteration cost = %d "
3397 "is greater or equal to the vectorization factor = %d"
3398 ".\n",
3399 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3400 *ret_min_profitable_niters = -1;
3401 *ret_min_profitable_estimate = -1;
3402 return;
3403 }
3404
3405 /* ??? The "if" arm is written to handle all cases; see below for what
3406 we would do for !LOOP_VINFO_FULLY_MASKED_P. */
3407 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3408 {
3409 /* Rewriting the condition above in terms of the number of
3410 vector iterations (vniters) rather than the number of
3411 scalar iterations (niters) gives:
3412
3413 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
3414
3415 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
3416
3417 For integer N, X and Y when X > 0:
3418
3419 N * X > Y <==> N >= (Y /[floor] X) + 1. */
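/* For instance, with hypothetical values Y = 10 and X = 3 this gives
N >= 10 / 3 + 1 = 4: indeed 4 * 3 = 12 > 10 while 3 * 3 = 9 is not.  */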
3420 int outside_overhead = (vec_outside_cost
3421 - scalar_single_iter_cost * peel_iters_prologue
3422 - scalar_single_iter_cost * peel_iters_epilogue
3423 - scalar_outside_cost);
3424 /* We're only interested in cases that require at least one
3425 vector iteration. */
3426 int min_vec_niters = 1;
3427 if (outside_overhead > 0)
3428 min_vec_niters = outside_overhead / saving_per_viter + 1;
3429
3430 if (dump_enabled_p ())
3431 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
3432 min_vec_niters);
3433
3434 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3435 {
3436 /* Now that we know the minimum number of vector iterations,
3437 find the minimum niters for which the scalar cost is larger:
3438
3439 SIC * niters > VIC * vniters + VOC - SOC
3440
3441 We know that the minimum niters is no more than
3442 vniters * VF + NPEEL, but it might be (and often is) less
3443 than that if a partial vector iteration is cheaper than the
3444 equivalent scalar code. */
3445 int threshold = (vec_inside_cost * min_vec_niters
3446 + vec_outside_cost
3447 - scalar_outside_cost);
3448 if (threshold <= 0)
3449 min_profitable_iters = 1;
3450 else
3451 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
3452 }
3453 else
3454 /* Convert the number of vector iterations into a number of
3455 scalar iterations. */
3456 min_profitable_iters = (min_vec_niters * assumed_vf
3457 + peel_iters_prologue
3458 + peel_iters_epilogue);
3459 }
3460 else
3461 {
3462 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3463 * assumed_vf
3464 - vec_inside_cost * peel_iters_prologue
3465 - vec_inside_cost * peel_iters_epilogue);
3466 if (min_profitable_iters <= 0)
3467 min_profitable_iters = 0;
3468 else
3469 {
3470 min_profitable_iters /= saving_per_viter;
3471
3472 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3473 <= (((int) vec_inside_cost * min_profitable_iters)
3474 + (((int) vec_outside_cost - scalar_outside_cost)
3475 * assumed_vf)))
3476 min_profitable_iters++;
3477 }
3478 }
3479
3480 if (dump_enabled_p ())
3481 dump_printf (MSG_NOTE,
3482 " Calculated minimum iters for profitability: %d\n",
3483 min_profitable_iters);
3484
3485 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3486 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3487 /* We want the vectorized loop to execute at least once. */
3488 min_profitable_iters = assumed_vf + peel_iters_prologue;
3489
3490 if (dump_enabled_p ())
3491 dump_printf_loc (MSG_NOTE, vect_location,
3492 " Runtime profitability threshold = %d\n",
3493 min_profitable_iters);
3494
3495 *ret_min_profitable_niters = min_profitable_iters;
3496
3497 /* Calculate number of iterations required to make the vector version
3498 profitable, relative to the loop bodies only.
3499
3500 The cost of the non-vectorized variant is SIC * niters and it must
3501 exceed the vector variant's cost at the expected trip count, i.e.,
3502 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
3503
3504 if (vec_outside_cost <= 0)
3505 min_profitable_estimate = 0;
3506 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3507 {
3508 /* This is a repeat of the code above, but with + SOC rather
3509 than - SOC. */
3510 int outside_overhead = (vec_outside_cost
3511 - scalar_single_iter_cost * peel_iters_prologue
3512 - scalar_single_iter_cost * peel_iters_epilogue
3513 + scalar_outside_cost);
3514 int min_vec_niters = 1;
3515 if (outside_overhead > 0)
3516 min_vec_niters = outside_overhead / saving_per_viter + 1;
3517
3518 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3519 {
3520 int threshold = (vec_inside_cost * min_vec_niters
3521 + vec_outside_cost
3522 + scalar_outside_cost);
3523 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
3524 }
3525 else
3526 min_profitable_estimate = (min_vec_niters * assumed_vf
3527 + peel_iters_prologue
3528 + peel_iters_epilogue);
3529 }
3530 else
3531 {
3532 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3533 * assumed_vf
3534 - vec_inside_cost * peel_iters_prologue
3535 - vec_inside_cost * peel_iters_epilogue)
3536 / ((scalar_single_iter_cost * assumed_vf)
3537 - vec_inside_cost);
3538 }
3539 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3540 if (dump_enabled_p ())
3541 dump_printf_loc (MSG_NOTE, vect_location,
3542 " Static estimate profitability threshold = %d\n",
3543 min_profitable_estimate);
3544
3545 *ret_min_profitable_estimate = min_profitable_estimate;
3546 }
3547
3548 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3549 vector elements (not bits) for a vector with NELT elements. */
3550 static void
3551 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3552 vec_perm_builder *sel)
3553 {
3554 /* The encoding is a single stepped pattern. Any wrap-around is handled
3555 by vec_perm_indices. */
3556 sel->new_vector (nelt, 1, 3);
3557 for (unsigned int i = 0; i < 3; i++)
3558 sel->quick_push (i + offset);
3559 }
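/* For example, OFFSET = 2 and NELT = 8 (hypothetical values) encode the
stepped pattern {2, 3, 4}, which expands to the selector
{2, 3, 4, 5, 6, 7, 8, 9}; indices 8 and 9 select from the second
vec_perm operand, which the reduction epilogue below passes as a zero
vector to realize the shift.  */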
3560
3561 /* Checks whether the target supports whole-vector shifts for vectors of mode
3562 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3563 it supports vec_perm_const with masks for all necessary shift amounts. */
3564 static bool
3565 have_whole_vector_shift (machine_mode mode)
3566 {
3567 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3568 return true;
3569
3570 /* Variable-length vectors should be handled via the optab. */
3571 unsigned int nelt;
3572 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3573 return false;
3574
3575 vec_perm_builder sel;
3576 vec_perm_indices indices;
3577 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3578 {
3579 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3580 indices.new_vector (sel, 2, nelt);
3581 if (!can_vec_perm_const_p (mode, indices, false))
3582 return false;
3583 }
3584 return true;
3585 }
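/* For example, with NELT = 8 the loop above checks shifts by 4, 2 and 1
elements, exactly the offsets the shift-based reduction epilogue below
asks for.  */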
3586
3587 /* TODO: There is a close dependency between the vect_model_*_cost and
3588 vectorizable_* functions. Redesign to avoid maintenance issues. */
3589
3590 /* Function vect_model_reduction_cost.
3591
3592 Models cost for a reduction operation, including the vector ops
3593 generated within the strip-mine loop, the initial definition before
3594 the loop, and the epilogue code that must be generated. */
3595
3596 static void
3597 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3598 vect_reduction_type reduction_type,
3599 int ncopies, stmt_vector_for_cost *cost_vec)
3600 {
3601 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3602 enum tree_code code;
3603 optab optab;
3604 tree vectype;
3605 machine_mode mode;
3606 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3607 class loop *loop = NULL;
3608
3609 if (loop_vinfo)
3610 loop = LOOP_VINFO_LOOP (loop_vinfo);
3611
3612 /* Condition reductions generate two reductions in the loop. */
3613 if (reduction_type == COND_REDUCTION)
3614 ncopies *= 2;
3615
3616 vectype = STMT_VINFO_VECTYPE (stmt_info);
3617 mode = TYPE_MODE (vectype);
3618 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3619
3620 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3621
3622 if (reduction_type == EXTRACT_LAST_REDUCTION
3623 || reduction_type == FOLD_LEFT_REDUCTION)
3624 {
3625 /* No extra instructions needed in the prologue. */
3626 prologue_cost = 0;
3627
3628 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3629 /* Count one reduction-like operation per vector. */
3630 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3631 stmt_info, 0, vect_body);
3632 else
3633 {
3634 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3635 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3636 inside_cost = record_stmt_cost (cost_vec, nelements,
3637 vec_to_scalar, stmt_info, 0,
3638 vect_body);
3639 inside_cost += record_stmt_cost (cost_vec, nelements,
3640 scalar_stmt, stmt_info, 0,
3641 vect_body);
3642 }
3643 }
3644 else
3645 {
3646 /* Add in cost for initial definition.
3647 For cond reduction we have four vectors: initial index, step,
3648 initial result of the data reduction, initial value of the index
3649 reduction. */
3650 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3651 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3652 scalar_to_vec, stmt_info, 0,
3653 vect_prologue);
3654
3655 /* Cost of reduction op inside loop. */
3656 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3657 stmt_info, 0, vect_body);
3658 }
3659
3660 /* Determine cost of epilogue code.
3661
3662 We have a reduction operator that will reduce the vector in one statement.
3663 Also requires scalar extract. */
3664
3665 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
3666 {
3667 if (reduc_fn != IFN_LAST)
3668 {
3669 if (reduction_type == COND_REDUCTION)
3670 {
3671 /* An EQ stmt and a COND_EXPR stmt. */
3672 epilogue_cost += record_stmt_cost (cost_vec, 2,
3673 vector_stmt, stmt_info, 0,
3674 vect_epilogue);
3675 /* Reduction of the max index and a reduction of the found
3676 values. */
3677 epilogue_cost += record_stmt_cost (cost_vec, 2,
3678 vec_to_scalar, stmt_info, 0,
3679 vect_epilogue);
3680 /* A broadcast of the max value. */
3681 epilogue_cost += record_stmt_cost (cost_vec, 1,
3682 scalar_to_vec, stmt_info, 0,
3683 vect_epilogue);
3684 }
3685 else
3686 {
3687 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3688 stmt_info, 0, vect_epilogue);
3689 epilogue_cost += record_stmt_cost (cost_vec, 1,
3690 vec_to_scalar, stmt_info, 0,
3691 vect_epilogue);
3692 }
3693 }
3694 else if (reduction_type == COND_REDUCTION)
3695 {
3696 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3697 /* Extraction of scalar elements. */
3698 epilogue_cost += record_stmt_cost (cost_vec,
3699 2 * estimated_nunits,
3700 vec_to_scalar, stmt_info, 0,
3701 vect_epilogue);
3702 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
3703 epilogue_cost += record_stmt_cost (cost_vec,
3704 2 * estimated_nunits - 3,
3705 scalar_stmt, stmt_info, 0,
3706 vect_epilogue);
3707 }
3708 else if (reduction_type == EXTRACT_LAST_REDUCTION
3709 || reduction_type == FOLD_LEFT_REDUCTION)
3710 /* No extra instructions needed in the epilogue. */
3711 ;
3712 else
3713 {
3714 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3715 tree bitsize =
3716 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
3717 int element_bitsize = tree_to_uhwi (bitsize);
3718 int nelements = vec_size_in_bits / element_bitsize;
3719
3720 if (code == COND_EXPR)
3721 code = MAX_EXPR;
3722
3723 optab = optab_for_tree_code (code, vectype, optab_default);
3724
3725 /* We have a whole vector shift available. */
3726 if (optab != unknown_optab
3727 && VECTOR_MODE_P (mode)
3728 && optab_handler (optab, mode) != CODE_FOR_nothing
3729 && have_whole_vector_shift (mode))
3730 {
3731 /* Final reduction via vector shifts and the reduction operator.
3732 Also requires scalar extract. */
3733 epilogue_cost += record_stmt_cost (cost_vec,
3734 exact_log2 (nelements) * 2,
3735 vector_stmt, stmt_info, 0,
3736 vect_epilogue);
3737 epilogue_cost += record_stmt_cost (cost_vec, 1,
3738 vec_to_scalar, stmt_info, 0,
3739 vect_epilogue);
3740 }
3741 else
3742 /* Use extracts and reduction op for final reduction. For N
3743 elements, we have N extracts and N-1 reduction ops. */
3744 epilogue_cost += record_stmt_cost (cost_vec,
3745 nelements + nelements - 1,
3746 vector_stmt, stmt_info, 0,
3747 vect_epilogue);
3748 }
3749 }
3750
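/* Illustrative arithmetic for the epilogue costs above, assuming a
4-element vector: the whole-vector-shift path records
exact_log2 (4) * 2 = 4 vector statements plus one vec_to_scalar for the
final extract, whereas the fallback path records 4 + 4 - 1 = 7 vector
statements.  */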
3751 if (dump_enabled_p ())
3752 dump_printf (MSG_NOTE,
3753 "vect_model_reduction_cost: inside_cost = %d, "
3754 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3755 prologue_cost, epilogue_cost);
3756 }
3757
3758
3759 /* Function vect_model_induction_cost.
3760
3761 Models cost for induction operations. */
3762
3763 static void
3764 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
3765 stmt_vector_for_cost *cost_vec)
3766 {
3767 unsigned inside_cost, prologue_cost;
3768
3769 if (PURE_SLP_STMT (stmt_info))
3770 return;
3771
3772 /* loop cost for vec_loop. */
3773 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3774 stmt_info, 0, vect_body);
3775
3776 /* prologue cost for vec_init and vec_step. */
3777 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
3778 stmt_info, 0, vect_prologue);
3779
3780 if (dump_enabled_p ())
3781 dump_printf_loc (MSG_NOTE, vect_location,
3782 "vect_model_induction_cost: inside_cost = %d, "
3783 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3784 }
3785
3786
3787
3788 /* Function get_initial_def_for_reduction
3789
3790 Input:
3791 STMT_VINFO - a stmt that performs a reduction operation in the loop.
3792 INIT_VAL - the initial value of the reduction variable
3793
3794 Output:
3795 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3796 of the reduction (used for adjusting the epilog - see below).
3797 Return a vector variable, initialized according to the operation that
3798 STMT_VINFO performs. This vector will be used as the initial value
3799 of the vector of partial results.
3800
3801 Option1 (adjust in epilog): Initialize the vector as follows:
3802 add/bit or/xor: [0,0,...,0,0]
3803 mult/bit and: [1,1,...,1,1]
3804 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3805 and when necessary (e.g. add/mult case) let the caller know
3806 that it needs to adjust the result by init_val.
3807
3808 Option2: Initialize the vector as follows:
3809 add/bit or/xor: [init_val,0,0,...,0]
3810 mult/bit and: [init_val,1,1,...,1]
3811 min/max/cond_expr: [init_val,init_val,...,init_val]
3812 and no adjustments are needed.
3813
3814 For example, for the following code:
3815
3816 s = init_val;
3817 for (i=0;i<n;i++)
3818 s = s + a[i];
3819
3820 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
3821 For a vector of 4 units, we want to return either [0,0,0,init_val],
3822 or [0,0,0,0] and let the caller know that it needs to adjust
3823 the result at the end by 'init_val'.
3824
3825 FORNOW, we use Option1 (the 'adjust in epilog' scheme) when ADJUSTMENT_DEF
3826 is not NULL, because this way the initialization vector is simpler
3827 (same element in all entries), and Option2 otherwise.
3828
3829 A cost model should help decide between these two schemes. */
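/* Concretely, for the summation example above with init_val = 5 and a
4-element vector (illustrative values), Option1 returns [0,0,0,0] and
sets ADJUSTMENT_DEF to 5, while Option2 returns a vector with 5 in one
lane and 0 in the others, with no adjustment.  */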
3830
3831 static tree
3832 get_initial_def_for_reduction (stmt_vec_info stmt_vinfo,
3833 enum tree_code code, tree init_val,
3834 tree *adjustment_def)
3835 {
3836 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3837 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3838 tree scalar_type = TREE_TYPE (init_val);
3839 tree vectype = get_vectype_for_scalar_type (scalar_type);
3840 tree def_for_init;
3841 tree init_def;
3842 REAL_VALUE_TYPE real_init_val = dconst0;
3843 int int_init_val = 0;
3844 gimple_seq stmts = NULL;
3845
3846 gcc_assert (vectype);
3847
3848 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3849 || SCALAR_FLOAT_TYPE_P (scalar_type));
3850
3851 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
3852 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
3853
3854 /* ADJUSTMENT_DEF is NULL when called from
3855 vect_create_epilog_for_reduction to vectorize double reduction. */
3856 if (adjustment_def)
3857 *adjustment_def = NULL;
3858
3859 switch (code)
3860 {
3861 case WIDEN_SUM_EXPR:
3862 case DOT_PROD_EXPR:
3863 case SAD_EXPR:
3864 case PLUS_EXPR:
3865 case MINUS_EXPR:
3866 case BIT_IOR_EXPR:
3867 case BIT_XOR_EXPR:
3868 case MULT_EXPR:
3869 case BIT_AND_EXPR:
3870 {
3871 if (code == MULT_EXPR)
3872 {
3873 real_init_val = dconst1;
3874 int_init_val = 1;
3875 }
3876
3877 if (code == BIT_AND_EXPR)
3878 int_init_val = -1;
3879
3880 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3881 def_for_init = build_real (scalar_type, real_init_val);
3882 else
3883 def_for_init = build_int_cst (scalar_type, int_init_val);
3884
3885 if (adjustment_def || operand_equal_p (def_for_init, init_val, 0))
3886 {
3887 /* Option1: the first element is '0' or '1' as well. */
3888 if (!operand_equal_p (def_for_init, init_val, 0))
3889 *adjustment_def = init_val;
3890 init_def = gimple_build_vector_from_val (&stmts, vectype,
3891 def_for_init);
3892 }
3893 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
3894 {
3895 /* Option2 (variable length): the first element is INIT_VAL. */
3896 init_def = gimple_build_vector_from_val (&stmts, vectype,
3897 def_for_init);
3898 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
3899 vectype, init_def, init_val);
3900 }
3901 else
3902 {
3903 /* Option2: the first element is INIT_VAL. */
3904 tree_vector_builder elts (vectype, 1, 2);
3905 elts.quick_push (init_val);
3906 elts.quick_push (def_for_init);
3907 init_def = gimple_build_vector (&stmts, &elts);
3908 }
3909 }
3910 break;
3911
3912 case MIN_EXPR:
3913 case MAX_EXPR:
3914 case COND_EXPR:
3915 {
3916 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
3917 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
3918 }
3919 break;
3920
3921 default:
3922 gcc_unreachable ();
3923 }
3924
3925 if (stmts)
3926 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
3927 return init_def;
3928 }
3929
3930 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
3931 NUMBER_OF_VECTORS is the number of vector defs to create.
3932 If NEUTRAL_OP is nonnull, introducing extra elements of that
3933 value will not change the result. */
3934
3935 static void
3936 get_initial_defs_for_reduction (slp_tree slp_node,
3937 vec<tree> *vec_oprnds,
3938 unsigned int number_of_vectors,
3939 bool reduc_chain, tree neutral_op)
3940 {
3941 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
3942 stmt_vec_info stmt_vinfo = stmts[0];
3943 unsigned HOST_WIDE_INT nunits;
3944 unsigned j, number_of_places_left_in_vector;
3945 tree vector_type;
3946 unsigned int group_size = stmts.length ();
3947 unsigned int i;
3948 class loop *loop;
3949
3950 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
3951
3952 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
3953
3954 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
3955 gcc_assert (loop);
3956 edge pe = loop_preheader_edge (loop);
3957
3958 gcc_assert (!reduc_chain || neutral_op);
3959
3960 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
3961 created vectors. It is greater than 1 if unrolling is performed.
3962
3963 For example, we have two scalar operands, s1 and s2 (e.g., group of
3964 strided accesses of size two), while NUNITS is four (i.e., four scalars
3965 of this type can be packed in a vector). The output vector will contain
3966 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
3967 will be 2).
3968
3969 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
3970 vectors containing the operands.
3971
3972 For example, NUNITS is four as before, and the group size is 8
3973 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
3974 {s5, s6, s7, s8}. */
3975
3976 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
3977 nunits = group_size;
3978
3979 number_of_places_left_in_vector = nunits;
3980 bool constant_p = true;
3981 tree_vector_builder elts (vector_type, nunits, 1);
3982 elts.quick_grow (nunits);
3983 gimple_seq ctor_seq = NULL;
3984 for (j = 0; j < nunits * number_of_vectors; ++j)
3985 {
3986 tree op;
3987 i = j % group_size;
3988 stmt_vinfo = stmts[i];
3989
3990 /* Get the def before the loop. In a reduction chain we have only one
3991 initial value; otherwise we have as many initial values as PHIs in the group. */
3992 if (reduc_chain)
3993 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
3994 else if (((vec_oprnds->length () + 1) * nunits
3995 - number_of_places_left_in_vector >= group_size)
3996 && neutral_op)
3997 op = neutral_op;
3998 else
3999 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4000
4001 /* Create 'vect_ = {op0,op1,...,opn}'. */
4002 number_of_places_left_in_vector--;
4003 elts[nunits - number_of_places_left_in_vector - 1] = op;
4004 if (!CONSTANT_CLASS_P (op))
4005 constant_p = false;
4006
4007 if (number_of_places_left_in_vector == 0)
4008 {
4009 tree init;
4010 if (constant_p && !neutral_op
4011 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4012 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4013 /* Build the vector directly from ELTS. */
4014 init = gimple_build_vector (&ctor_seq, &elts);
4015 else if (neutral_op)
4016 {
4017 /* Build a vector of the neutral value and shift the
4018 other elements into place. */
4019 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4020 neutral_op);
4021 int k = nunits;
4022 while (k > 0 && elts[k - 1] == neutral_op)
4023 k -= 1;
4024 while (k > 0)
4025 {
4026 k -= 1;
4027 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4028 vector_type, init, elts[k]);
4029 }
4030 }
4031 else
4032 {
4033 /* First time round, duplicate ELTS to fill the
4034 required number of vectors. */
4035 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4036 number_of_vectors, *vec_oprnds);
4037 break;
4038 }
4039 vec_oprnds->quick_push (init);
4040
4041 number_of_places_left_in_vector = nunits;
4042 elts.new_vector (vector_type, nunits, 1);
4043 elts.quick_grow (nunits);
4044 constant_p = true;
4045 }
4046 }
4047 if (ctor_seq != NULL)
4048 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4049 }
4050
4051 /* For a statement STMT_INFO taking part in a reduction operation return
4052 the stmt_vec_info the meta information is stored on. */
4053
4054 stmt_vec_info
4055 info_for_reduction (stmt_vec_info stmt_info)
4056 {
4057 stmt_info = vect_orig_stmt (stmt_info);
4058 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4059 if (!is_a <gphi *> (stmt_info->stmt))
4060 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4061 gphi *phi = as_a <gphi *> (stmt_info->stmt);
4062 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4063 {
4064 if (gimple_phi_num_args (phi) == 1)
4065 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4066 }
4067 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4068 {
4069 edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father);
4070 stmt_vec_info info
4071 = stmt_info->vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
4072 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4073 stmt_info = info;
4074 }
4075 return stmt_info;
4076 }
4077
4078 /* Function vect_create_epilog_for_reduction
4079
4080 Create code at the loop-epilog to finalize the result of a reduction
4081 computation.
4082
4083 STMT_INFO is the scalar reduction stmt that is being vectorized.
4084 SLP_NODE is an SLP node containing a group of reduction statements. The
4085 first one in this group is STMT_INFO.
4086 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
4087 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
4088 (counting from 0)
4089
4090 This function:
4091 1. Completes the reduction def-use cycles.
4092 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4093 by calling the function specified by REDUC_FN if available, or by
4094 other means (whole-vector shifts or a scalar loop).
4095 The function also creates a new phi node at the loop exit to preserve
4096 loop-closed form, as illustrated below.
4097
4098 The flow at the entry to this function:
4099
4100 loop:
4101 vec_def = phi <vec_init, null> # REDUCTION_PHI
4102 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4103 s_loop = scalar_stmt # (scalar) STMT_INFO
4104 loop_exit:
4105 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4106 use <s_out0>
4107 use <s_out0>
4108
4109 The above is transformed by this function into:
4110
4111 loop:
4112 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4113 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4114 s_loop = scalar_stmt # (scalar) STMT_INFO
4115 loop_exit:
4116 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4117 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4118 v_out2 = reduce <v_out1>
4119 s_out3 = extract_field <v_out2, 0>
4120 s_out4 = adjust_result <s_out3>
4121 use <s_out4>
4122 use <s_out4>
4123 */
4124
4125 static void
4126 vect_create_epilog_for_reduction (stmt_vec_info stmt_info,
4127 slp_tree slp_node,
4128 slp_instance slp_node_instance)
4129 {
4130 stmt_vec_info reduc_info = info_for_reduction (stmt_info);
4131 gcc_assert (reduc_info->is_reduc_info);
4132 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4133 /* For double reductions we need to get at the inner loop reduction
4134 stmt which has the meta info attached. Our stmt_info is that of the
4135 loop-closed PHI of the inner loop which we remember as
4136 def for the reduction PHI generation. */
4137 bool double_reduc = false;
4138 stmt_vec_info rdef_info = stmt_info;
4139 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4140 {
4141 gcc_assert (!slp_node);
4142 double_reduc = true;
4143 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
4144 (stmt_info->stmt, 0));
4145 stmt_info = vect_stmt_to_vectorize (stmt_info);
4146 }
4147 gphi *reduc_def_stmt
4148 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
4149 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
4150 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
4151 tree neutral_op = NULL_TREE;
4152 if (slp_node)
4153 neutral_op
4154 = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis, code,
4155 REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4156 stmt_vec_info prev_phi_info;
4157 tree vectype;
4158 machine_mode mode;
4159 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4160 basic_block exit_bb;
4161 tree scalar_dest;
4162 tree scalar_type;
4163 gimple *new_phi = NULL, *phi;
4164 stmt_vec_info phi_info;
4165 gimple_stmt_iterator exit_gsi;
4166 tree vec_dest;
4167 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4168 gimple *epilog_stmt = NULL;
4169 gimple *exit_phi;
4170 tree bitsize;
4171 tree expr, def;
4172 tree orig_name, scalar_result;
4173 imm_use_iterator imm_iter, phi_imm_iter;
4174 use_operand_p use_p, phi_use_p;
4175 gimple *use_stmt;
4176 bool nested_in_vect_loop = false;
4177 auto_vec<gimple *> new_phis;
4178 int j, i;
4179 auto_vec<tree> scalar_results;
4180 unsigned int group_size = 1, k;
4181 auto_vec<gimple *> phis;
4182 bool slp_reduc = false;
4183 bool direct_slp_reduc;
4184 tree new_phi_result;
4185 tree induction_index = NULL_TREE;
4186
4187 if (slp_node)
4188 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4189
4190 if (nested_in_vect_loop_p (loop, stmt_info))
4191 {
4192 outer_loop = loop;
4193 loop = loop->inner;
4194 nested_in_vect_loop = true;
4195 gcc_assert (!slp_node);
4196 }
4197 gcc_assert (!nested_in_vect_loop || double_reduc);
4198
4199 vectype = STMT_VINFO_VECTYPE (stmt_info);
4200 gcc_assert (vectype);
4201 mode = TYPE_MODE (vectype);
4202
4203 tree initial_def = NULL;
4204 tree induc_val = NULL_TREE;
4205 tree adjustment_def = NULL;
4206 if (slp_node)
4207 ;
4208 else
4209 {
4210 /* Get at the scalar def before the loop that defines the initial value
4211 of the reduction variable. */
4212 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4213 loop_preheader_edge (loop));
4214 /* Optimize: for induction condition reduction, if we can't use zero
4215 for induc_val, use initial_def. */
4216 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
4217 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
4218 else if (double_reduc)
4219 ;
4220 else if (nested_in_vect_loop)
4221 ;
4222 else
4223 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
4224 }
4225
4226 unsigned vec_num;
4227 int ncopies;
4228 if (slp_node)
4229 {
4230 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
4231 ncopies = 1;
4232 }
4233 else
4234 {
4235 vec_num = 1;
4236 ncopies = 0;
4237 phi_info = STMT_VINFO_VEC_STMT (loop_vinfo->lookup_stmt (reduc_def_stmt));
4238 do
4239 {
4240 ncopies++;
4241 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4242 }
4243 while (phi_info);
4244 }
4245
4246 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4247 which is updated with the current index of the loop for every match of
4248 the original loop's cond_expr (VEC_STMT). This results in a vector
4249 containing the last time the condition passed for that vector lane.
4250 The first match will be a 1 to allow 0 to be used for non-matching
4251 indexes. If there are no matches at all then the vector will be all
4252 zeroes. */
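/* Hypothetical illustration with a 4-lane vector over two vector
iterations: the IV produces {1,2,3,4} and then {5,6,7,8}.  If lanes 1
and 3 match in the first iteration and lane 2 matches in the second,
the index vector ends up as {0,2,7,4}; its maximum, 7, marks the lane
holding the last matching data value.  */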
4253 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
4254 {
4255 tree indx_before_incr, indx_after_incr;
4256 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4257
4258 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info)->stmt;
4259 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4260
4261 int scalar_precision
4262 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4263 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4264 tree cr_index_vector_type = build_vector_type
4265 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4266
4267 /* First we create a simple vector induction variable which starts
4268 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4269 vector size (STEP). */
4270
4271 /* Create a {1,2,3,...} vector. */
4272 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4273
4274 /* Create a vector of the step value. */
4275 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4276 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4277
4278 /* Create an induction variable. */
4279 gimple_stmt_iterator incr_gsi;
4280 bool insert_after;
4281 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4282 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4283 insert_after, &indx_before_incr, &indx_after_incr);
4284
4285 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4286 filled with zeros (VEC_ZERO). */
4287
4288 /* Create a vector of 0s. */
4289 tree zero = build_zero_cst (cr_index_scalar_type);
4290 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4291
4292 /* Create a vector phi node. */
4293 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4294 new_phi = create_phi_node (new_phi_tree, loop->header);
4295 loop_vinfo->add_stmt (new_phi);
4296 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4297 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4298
4299 /* Now take the condition from the loop's original cond_expr
4300 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4301 every match uses values from the induction variable
4302 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4303 (NEW_PHI_TREE).
4304 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4305 the new cond_expr (INDEX_COND_EXPR). */
4306
4307 /* Duplicate the condition from vec_stmt. */
4308 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4309
4310 /* Create a conditional, where the condition is taken from vec_stmt
4311 (CCOMPARE). The then and else values mirror the main VEC_COND_EXPR:
4312 the reduction phi corresponds to NEW_PHI_TREE and the new values
4313 correspond to INDEX_BEFORE_INCR. */
4314 gcc_assert (STMT_VINFO_REDUC_IDX (stmt_info) >= 1);
4315 tree index_cond_expr;
4316 if (STMT_VINFO_REDUC_IDX (stmt_info) == 2)
4317 index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4318 ccompare, indx_before_incr, new_phi_tree);
4319 else
4320 index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4321 ccompare, new_phi_tree, indx_before_incr);
4322 induction_index = make_ssa_name (cr_index_vector_type);
4323 gimple *index_condition = gimple_build_assign (induction_index,
4324 index_cond_expr);
4325 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4326 stmt_vec_info index_vec_info = loop_vinfo->add_stmt (index_condition);
4327 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4328
4329 /* Update the phi with the vec cond. */
4330 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4331 loop_latch_edge (loop), UNKNOWN_LOCATION);
4332 }
4333
4334 /* 2. Create epilog code.
4335 The reduction epilog code operates across the elements of the vector
4336 of partial results computed by the vectorized loop.
4337 The reduction epilog code consists of:
4338
4339 step 1: compute the scalar result in a vector (v_out2)
4340 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4341 step 3: adjust the scalar result (s_out3) if needed.
4342
4343 Step 1 can be accomplished using one the following three schemes:
4344 (scheme 1) using reduc_fn, if available.
4345 (scheme 2) using whole-vector shifts, if available.
4346 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4347 combined.
4348
4349 The overall epilog code looks like this:
4350
4351 s_out0 = phi <s_loop> # original EXIT_PHI
4352 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4353 v_out2 = reduce <v_out1> # step 1
4354 s_out3 = extract_field <v_out2, 0> # step 2
4355 s_out4 = adjust_result <s_out3> # step 3
4356
4357 (step 3 is optional, and steps 1 and 2 may be combined).
4358 Lastly, the uses of s_out0 are replaced by s_out4. */
4359
4360
4361 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4362 v_out1 = phi <VECT_DEF>
4363 Store them in NEW_PHIS. */
4364 if (double_reduc)
4365 loop = outer_loop;
4366 exit_bb = single_exit (loop)->dest;
4367 prev_phi_info = NULL;
4368 new_phis.create (slp_node ? vec_num : ncopies);
4369 for (unsigned i = 0; i < vec_num; i++)
4370 {
4371 if (slp_node)
4372 def = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]->stmt);
4373 else
4374 def = gimple_get_lhs (STMT_VINFO_VEC_STMT (rdef_info)->stmt);
4375 for (j = 0; j < ncopies; j++)
4376 {
4377 tree new_def = copy_ssa_name (def);
4378 phi = create_phi_node (new_def, exit_bb);
4379 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4380 if (j == 0)
4381 new_phis.quick_push (phi);
4382 else
4383 {
4384 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4385 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4386 }
4387
4388 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4389 prev_phi_info = phi_info;
4390 }
4391 }
4392
4393 exit_gsi = gsi_after_labels (exit_bb);
4394
4395 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4396 (i.e. when reduc_fn is not available) and in the final adjustment
4397 code (if needed). Also get the original scalar reduction variable as
4398 defined in the loop. In case STMT is a "pattern-stmt" (i.e. it
4399 represents a reduction pattern), the tree-code and scalar-def are
4400 taken from the original stmt that the pattern-stmt (STMT) replaces.
4401 Otherwise (it is a regular reduction), the tree-code and scalar-def
4402 are taken from STMT. */
4403
4404 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4405 if (orig_stmt_info != stmt_info)
4406 {
4407 /* Reduction pattern */
4408 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4409 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4410 }
4411
4412 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4413 scalar_type = TREE_TYPE (scalar_dest);
4414 scalar_results.create (group_size);
4415 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4416 bitsize = TYPE_SIZE (scalar_type);
4417
4418 /* SLP reduction without reduction chain, e.g.,
4419 # a1 = phi <a2, a0>
4420 # b1 = phi <b2, b0>
4421 a2 = operation (a1)
4422 b2 = operation (b1) */
4423 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4424
4425 /* True if we should implement SLP_REDUC using native reduction operations
4426 instead of scalar operations. */
4427 direct_slp_reduc = (reduc_fn != IFN_LAST
4428 && slp_reduc
4429 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4430
4431 /* In case of reduction chain, e.g.,
4432 # a1 = phi <a3, a0>
4433 a2 = operation (a1)
4434 a3 = operation (a2),
4435
4436 we may end up with more than one vector result. Here we reduce them to
4437 one vector. */
4438 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
4439 {
4440 tree first_vect = PHI_RESULT (new_phis[0]);
4441 gassign *new_vec_stmt = NULL;
4442 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4443 for (k = 1; k < new_phis.length (); k++)
4444 {
4445 gimple *next_phi = new_phis[k];
4446 tree second_vect = PHI_RESULT (next_phi);
4447 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4448 new_vec_stmt = gimple_build_assign (tem, code,
4449 first_vect, second_vect);
4450 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4451 first_vect = tem;
4452 }
4453
4454 new_phi_result = first_vect;
4455 if (new_vec_stmt)
4456 {
4457 new_phis.truncate (0);
4458 new_phis.safe_push (new_vec_stmt);
4459 }
4460 }
4461 /* Likewise if we couldn't use a single def-use cycle. */
4462 else if (ncopies > 1)
4463 {
4464 gcc_assert (new_phis.length () == 1);
4465 tree first_vect = PHI_RESULT (new_phis[0]);
4466 gassign *new_vec_stmt = NULL;
4467 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4468 stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
4469 for (int k = 1; k < ncopies; ++k)
4470 {
4471 next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
4472 tree second_vect = PHI_RESULT (next_phi_info->stmt);
4473 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4474 new_vec_stmt = gimple_build_assign (tem, code,
4475 first_vect, second_vect);
4476 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4477 first_vect = tem;
4478 }
4479 new_phi_result = first_vect;
4480 new_phis.truncate (0);
4481 new_phis.safe_push (new_vec_stmt);
4482 }
4483 else
4484 new_phi_result = PHI_RESULT (new_phis[0]);
4485
4486 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
4487 && reduc_fn != IFN_LAST)
4488 {
4489 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4490 various data values where the condition matched and another vector
4491 (INDUCTION_INDEX) containing all the indexes of those matches. We
4492 need to extract the last matching index (which will be the index with
4493 highest value) and use this to index into the data vector.
4494 For the case where there were no matches, the data vector will contain
4495 all default values and the index vector will be all zeros. */
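/* Continuing the hypothetical 4-lane example: with INDUCTION_INDEX =
{0,2,7,4} and data vector {d0,d1,d2,d3}, the max index is 7, the
comparison gives {0,0,1,0}, the VEC_COND selects {0,0,d2,0}, and the
unsigned MAX reduction below extracts d2 as the scalar result.  */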
4496
4497 /* Get various versions of the type of the vector of indexes. */
4498 tree index_vec_type = TREE_TYPE (induction_index);
4499 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4500 tree index_scalar_type = TREE_TYPE (index_vec_type);
4501 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4502 (index_vec_type);
4503
4504 /* Get an unsigned integer version of the type of the data vector. */
4505 int scalar_precision
4506 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4507 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4508 tree vectype_unsigned = build_vector_type
4509 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4510
4511 /* First we need to create a vector (ZERO_VEC) of zeros and another
4512 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4513 can create using a MAX reduction and then expanding.
4514 In the case where the loop never made any matches, the max index will
4515 be zero. */
4516
4517 /* Vector of {0, 0, 0,...}. */
4518 tree zero_vec = make_ssa_name (vectype);
4519 tree zero_vec_rhs = build_zero_cst (vectype);
4520 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4521 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4522
4523 /* Find maximum value from the vector of found indexes. */
4524 tree max_index = make_ssa_name (index_scalar_type);
4525 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4526 1, induction_index);
4527 gimple_call_set_lhs (max_index_stmt, max_index);
4528 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4529
4530 /* Vector of {max_index, max_index, max_index,...}. */
4531 tree max_index_vec = make_ssa_name (index_vec_type);
4532 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4533 max_index);
4534 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4535 max_index_vec_rhs);
4536 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4537
4538 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4539 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4540 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4541 otherwise. Only one value should match, resulting in a vector
4542 (VEC_COND) with one data value and the rest zeros.
4543 In the case where the loop never made any matches, every index will
4544 match, resulting in a vector with all data values (which will all be
4545 the default value). */
4546
4547 /* Compare the max index vector to the vector of found indexes to find
4548 the position of the max value. */
4549 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4550 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4551 induction_index,
4552 max_index_vec);
4553 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4554
4555 /* Use the compare to choose either values from the data vector or
4556 zero. */
4557 tree vec_cond = make_ssa_name (vectype);
4558 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4559 vec_compare, new_phi_result,
4560 zero_vec);
4561 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4562
4563 /* Finally we need to extract the data value from the vector (VEC_COND)
4564 into a scalar (MATCHED_DATA_REDUC). Logically we want to do a OR
4565 reduction, but because this doesn't exist, we can use a MAX reduction
4566 instead. The data value might be signed or a float so we need to cast
4567 it first.
4568 In the case where the loop never made any matches, the data values are
4569 all identical, and so will reduce down correctly. */
4570
4571 /* Make the matched data values unsigned. */
4572 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4573 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4574 vec_cond);
4575 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4576 VIEW_CONVERT_EXPR,
4577 vec_cond_cast_rhs);
4578 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4579
4580 /* Reduce down to a scalar value. */
4581 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4582 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4583 1, vec_cond_cast);
4584 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4585 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4586
4587 /* Convert the reduced value back to the result type and set as the
4588 result. */
4589 gimple_seq stmts = NULL;
4590 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4591 data_reduc);
4592 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4593 scalar_results.safe_push (new_temp);
4594 }
4595 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
4596 && reduc_fn == IFN_LAST)
4597 {
4598 /* Condition reduction without supported IFN_REDUC_MAX. Generate
4599 idx = 0;
4600 idx_val = induction_index[0];
4601 val = data_reduc[0];
4602 for (idx = 0, val = init, i = 0; i < nelts; ++i)
4603 if (induction_index[i] > idx_val)
4604 val = data_reduc[i], idx_val = induction_index[i];
4605 return val; */
4606
4607 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4608 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4609 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4610 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4611 /* Enforced by vectorizable_reduction, which ensures we have target
4612 support before allowing a conditional reduction on variable-length
4613 vectors. */
4614 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
4615 tree idx_val = NULL_TREE, val = NULL_TREE;
4616 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4617 {
4618 tree old_idx_val = idx_val;
4619 tree old_val = val;
4620 idx_val = make_ssa_name (idx_eltype);
4621 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4622 build3 (BIT_FIELD_REF, idx_eltype,
4623 induction_index,
4624 bitsize_int (el_size),
4625 bitsize_int (off)));
4626 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4627 val = make_ssa_name (data_eltype);
4628 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4629 build3 (BIT_FIELD_REF,
4630 data_eltype,
4631 new_phi_result,
4632 bitsize_int (el_size),
4633 bitsize_int (off)));
4634 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4635 if (off != 0)
4636 {
4637 tree new_idx_val = idx_val;
4638 if (off != v_size - el_size)
4639 {
4640 new_idx_val = make_ssa_name (idx_eltype);
4641 epilog_stmt = gimple_build_assign (new_idx_val,
4642 MAX_EXPR, idx_val,
4643 old_idx_val);
4644 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4645 }
4646 tree new_val = make_ssa_name (data_eltype);
4647 epilog_stmt = gimple_build_assign (new_val,
4648 COND_EXPR,
4649 build2 (GT_EXPR,
4650 boolean_type_node,
4651 idx_val,
4652 old_idx_val),
4653 val, old_val);
4654 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4655 idx_val = new_idx_val;
4656 val = new_val;
4657 }
4658 }
4659 /* Convert the reduced value back to the result type and set as the
4660 result. */
4661 gimple_seq stmts = NULL;
4662 val = gimple_convert (&stmts, scalar_type, val);
4663 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4664 scalar_results.safe_push (val);
4665 }
4666
4667 /* 2.3 Create the reduction code, using one of the three schemes described
4668 above. In SLP we simply need to extract all the elements from the
4669 vector (without reducing them), so we use scalar shifts. */
4670 else if (reduc_fn != IFN_LAST && !slp_reduc)
4671 {
4672 tree tmp;
4673 tree vec_elem_type;
4674
4675 /* Case 1: Create:
4676 v_out2 = reduc_expr <v_out1> */
4677
4678 if (dump_enabled_p ())
4679 dump_printf_loc (MSG_NOTE, vect_location,
4680 "Reduce using direct vector reduction.\n");
4681
4682 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4683 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4684 {
4685 tree tmp_dest
4686 = vect_create_destination_var (scalar_dest, vec_elem_type);
4687 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
4688 new_phi_result);
4689 gimple_set_lhs (epilog_stmt, tmp_dest);
4690 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4691 gimple_set_lhs (epilog_stmt, new_temp);
4692 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4693
4694 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
4695 new_temp);
4696 }
4697 else
4698 {
4699 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
4700 new_phi_result);
4701 gimple_set_lhs (epilog_stmt, new_scalar_dest);
4702 }
4703
4704 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4705 gimple_set_lhs (epilog_stmt, new_temp);
4706 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4707
4708 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
4709 && induc_val)
4710 {
4711 /* Earlier we set the initial value to be a vector of induc_val
4712 values. Check the result and if it is induc_val then replace
4713 it with the original initial value, unless induc_val is
4714 the same as initial_def already. */
4715 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
4716 induc_val);
4717
4718 tmp = make_ssa_name (new_scalar_dest);
4719 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
4720 initial_def, new_temp);
4721 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4722 new_temp = tmp;
4723 }
4724
4725 scalar_results.safe_push (new_temp);
4726 }
4727 else if (direct_slp_reduc)
4728 {
4729 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
4730 with the elements for other SLP statements replaced with the
4731 neutral value. We can then do a normal reduction on each vector. */
4732
4733 /* Enforced by vectorizable_reduction. */
4734 gcc_assert (new_phis.length () == 1);
4735 gcc_assert (pow2p_hwi (group_size));
4736
4737 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
4738 vec<stmt_vec_info> orig_phis
4739 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
4740 gimple_seq seq = NULL;
4741
4742 /* Build a vector {0, 1, 2, ...}, with the same number of elements
4743 and the same element size as VECTYPE. */
4744 tree index = build_index_vector (vectype, 0, 1);
4745 tree index_type = TREE_TYPE (index);
4746 tree index_elt_type = TREE_TYPE (index_type);
4747 tree mask_type = build_same_sized_truth_vector_type (index_type);
4748
4749 /* Create a vector that, for each element, identifies which of
4750 the REDUC_GROUP_SIZE results should use it. */
4751 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
4752 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
4753 build_vector_from_val (index_type, index_mask));
4754
4755 /* Get a neutral vector value. This is simply a splat of the neutral
4756 scalar value if we have one, otherwise the initial scalar value
4757 is itself a neutral value. */
4758 tree vector_identity = NULL_TREE;
4759 if (neutral_op)
4760 vector_identity = gimple_build_vector_from_val (&seq, vectype,
4761 neutral_op);
4762 for (unsigned int i = 0; i < group_size; ++i)
4763 {
4764 /* If there's no universal neutral value, we can use the
4765 initial scalar value from the original PHI. This is used
4766 for MIN and MAX reduction, for example. */
4767 if (!neutral_op)
4768 {
4769 tree scalar_value
4770 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
4771 loop_preheader_edge (loop));
4772 vector_identity = gimple_build_vector_from_val (&seq, vectype,
4773 scalar_value);
4774 }
4775
4776 /* Calculate the equivalent of:
4777
4778 sel[j] = (index[j] == i);
4779
4780 which selects the elements of NEW_PHI_RESULT that should
4781 be included in the result. */
4782 tree compare_val = build_int_cst (index_elt_type, i);
4783 compare_val = build_vector_from_val (index_type, compare_val);
4784 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
4785 index, compare_val);
4786
4787 /* Calculate the equivalent of:
4788
4789 vec = seq ? new_phi_result : vector_identity;
4790
4791 VEC is now suitable for a full vector reduction. */
4792 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
4793 sel, new_phi_result, vector_identity);
4794
4795 /* Do the reduction and convert it to the appropriate type. */
4796 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
4797 TREE_TYPE (vectype), vec);
4798 scalar = gimple_convert (&seq, scalar_type, scalar);
4799 scalar_results.safe_push (scalar);
4800 }
4801 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
4802 }
4803 else
4804 {
4805 bool reduce_with_shift;
4806 tree vec_temp;
4807
4808 /* See if the target wants to do the final (shift) reduction
4809 in a vector mode of smaller size and first reduce upper/lower
4810 halves against each other. */
4811 enum machine_mode mode1 = mode;
4812 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
4813 unsigned sz1 = sz;
4814 if (!slp_reduc
4815 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
4816 sz1 = GET_MODE_SIZE (mode1).to_constant ();
4817
4818 tree vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
4819 reduce_with_shift = have_whole_vector_shift (mode1);
4820 if (!VECTOR_MODE_P (mode1))
4821 reduce_with_shift = false;
4822 else
4823 {
4824 optab optab = optab_for_tree_code (code, vectype1, optab_default);
4825 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
4826 reduce_with_shift = false;
4827 }
4828
4829 /* First reduce the vector to the desired vector size on which we
4830 should do the shift reduction, by combining upper and lower halves. */
4831 new_temp = new_phi_result;
4832 while (sz > sz1)
4833 {
4834 gcc_assert (!slp_reduc);
4835 sz /= 2;
4836 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
4837
4838 /* The target has to make sure we support lowpart/highpart
4839 extraction, either via direct vector extract or through
4840 integer mode punning. */
4841 tree dst1, dst2;
4842 if (convert_optab_handler (vec_extract_optab,
4843 TYPE_MODE (TREE_TYPE (new_temp)),
4844 TYPE_MODE (vectype1))
4845 != CODE_FOR_nothing)
4846 {
4847 /* Extract sub-vectors directly once vec_extract becomes
4848 a conversion optab. */
4849 dst1 = make_ssa_name (vectype1);
4850 epilog_stmt
4851 = gimple_build_assign (dst1, BIT_FIELD_REF,
4852 build3 (BIT_FIELD_REF, vectype1,
4853 new_temp, TYPE_SIZE (vectype1),
4854 bitsize_int (0)));
4855 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4856 dst2 = make_ssa_name (vectype1);
4857 epilog_stmt
4858 = gimple_build_assign (dst2, BIT_FIELD_REF,
4859 build3 (BIT_FIELD_REF, vectype1,
4860 new_temp, TYPE_SIZE (vectype1),
4861 bitsize_int (sz * BITS_PER_UNIT)));
4862 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4863 }
4864 else
4865 {
4866 /* Extract via punning to appropriately sized integer mode
4867 vector. */
4868 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
4869 1);
4870 tree etype = build_vector_type (eltype, 2);
4871 gcc_assert (convert_optab_handler (vec_extract_optab,
4872 TYPE_MODE (etype),
4873 TYPE_MODE (eltype))
4874 != CODE_FOR_nothing);
4875 tree tem = make_ssa_name (etype);
4876 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
4877 build1 (VIEW_CONVERT_EXPR,
4878 etype, new_temp));
4879 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4880 new_temp = tem;
4881 tem = make_ssa_name (eltype);
4882 epilog_stmt
4883 = gimple_build_assign (tem, BIT_FIELD_REF,
4884 build3 (BIT_FIELD_REF, eltype,
4885 new_temp, TYPE_SIZE (eltype),
4886 bitsize_int (0)));
4887 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4888 dst1 = make_ssa_name (vectype1);
4889 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
4890 build1 (VIEW_CONVERT_EXPR,
4891 vectype1, tem));
4892 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4893 tem = make_ssa_name (eltype);
4894 epilog_stmt
4895 = gimple_build_assign (tem, BIT_FIELD_REF,
4896 build3 (BIT_FIELD_REF, eltype,
4897 new_temp, TYPE_SIZE (eltype),
4898 bitsize_int (sz * BITS_PER_UNIT)));
4899 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4900 dst2 = make_ssa_name (vectype1);
4901 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
4902 build1 (VIEW_CONVERT_EXPR,
4903 vectype1, tem));
4904 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4905 }
4906
4907 new_temp = make_ssa_name (vectype1);
4908 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
4909 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4910 }
4911
4912 if (reduce_with_shift && !slp_reduc)
4913 {
4914 int element_bitsize = tree_to_uhwi (bitsize);
4915 /* Enforced by vectorizable_reduction, which disallows SLP reductions
4916 for variable-length vectors and also requires direct target support
4917 for loop reductions. */
4918 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
4919 int nelements = vec_size_in_bits / element_bitsize;
4920 vec_perm_builder sel;
4921 vec_perm_indices indices;
4922
4923 int elt_offset;
4924
4925 tree zero_vec = build_zero_cst (vectype1);
4926 /* Case 2: Create:
4927 for (offset = nelements/2; offset >= 1; offset/=2)
4928 {
4929 Create: va' = vec_shift <va, offset>
4930 Create: va = vop <va, va'>
4931 } */
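/* As a worked illustration (not from the sources; assuming a four-element
   vector and a PLUS reduction), starting from va = { a0, a1, a2, a3 }:
     offset 2: va' = { a2, a3, 0, 0 }    va = { a0+a2, a1+a3, _, _ }
     offset 1: va' = { a1+a3, _, _, 0 }  va = { a0+a1+a2+a3, _, _, _ }
   after which the scalar result sits in element 0 and the remaining
   lanes are don't-care values.  */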
4932
4933 tree rhs;
4934
4935 if (dump_enabled_p ())
4936 dump_printf_loc (MSG_NOTE, vect_location,
4937 "Reduce using vector shifts\n");
4938
4939 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
4940 for (elt_offset = nelements / 2;
4941 elt_offset >= 1;
4942 elt_offset /= 2)
4943 {
4944 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
4945 indices.new_vector (sel, 2, nelements);
4946 tree mask = vect_gen_perm_mask_any (vectype1, indices);
4947 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
4948 new_temp, zero_vec, mask);
4949 new_name = make_ssa_name (vec_dest, epilog_stmt);
4950 gimple_assign_set_lhs (epilog_stmt, new_name);
4951 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4952
4953 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
4954 new_temp);
4955 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4956 gimple_assign_set_lhs (epilog_stmt, new_temp);
4957 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4958 }
4959
4960 /* 2.4 Extract the final scalar result. Create:
4961 s_out3 = extract_field <v_out2, bitpos> */
4962
4963 if (dump_enabled_p ())
4964 dump_printf_loc (MSG_NOTE, vect_location,
4965 "extract scalar result\n");
4966
4967 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
4968 bitsize, bitsize_zero_node);
4969 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4970 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4971 gimple_assign_set_lhs (epilog_stmt, new_temp);
4972 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4973 scalar_results.safe_push (new_temp);
4974 }
4975 else
4976 {
4977 /* Case 3: Create:
4978 s = extract_field <v_out2, 0>
4979 for (offset = element_size;
4980 offset < vector_size;
4981 offset += element_size;)
4982 {
4983 Create: s' = extract_field <v_out2, offset>
4984 Create: s = op <s, s'> // For non-SLP cases
4985 } */
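/* As a worked illustration (assuming a four-element vector of 32-bit
   elements and a PLUS reduction), the loop below emits roughly:
     s  = BIT_FIELD_REF <v_out2, 32, 0>;
     s' = BIT_FIELD_REF <v_out2, 32, 32>;  s = s + s';
     s' = BIT_FIELD_REF <v_out2, 32, 64>;  s = s + s';
     s' = BIT_FIELD_REF <v_out2, 32, 96>;  s = s + s';
   whereas for SLP only the extractions are kept and collected in
   SCALAR_RESULTS.  */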
4986
4987 if (dump_enabled_p ())
4988 dump_printf_loc (MSG_NOTE, vect_location,
4989 "Reduce using scalar code.\n");
4990
4991 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
4992 int element_bitsize = tree_to_uhwi (bitsize);
4993 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4994 {
4995 int bit_offset;
4996 if (gimple_code (new_phi) == GIMPLE_PHI)
4997 vec_temp = PHI_RESULT (new_phi);
4998 else
4999 vec_temp = gimple_assign_lhs (new_phi);
5000 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5001 bitsize_zero_node);
5002 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5003 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5004 gimple_assign_set_lhs (epilog_stmt, new_temp);
5005 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5006
5007 /* In SLP we don't need to apply the reduction operation, so we just
5008 collect the s' values in SCALAR_RESULTS. */
5009 if (slp_reduc)
5010 scalar_results.safe_push (new_temp);
5011
5012 for (bit_offset = element_bitsize;
5013 bit_offset < vec_size_in_bits;
5014 bit_offset += element_bitsize)
5015 {
5016 tree bitpos = bitsize_int (bit_offset);
5017 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5018 bitsize, bitpos);
5019
5020 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5021 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5022 gimple_assign_set_lhs (epilog_stmt, new_name);
5023 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5024
5025 if (slp_reduc)
5026 {
5027 /* In SLP we don't need to apply the reduction operation, so
5028 we just collect the s' values in SCALAR_RESULTS. */
5029 new_temp = new_name;
5030 scalar_results.safe_push (new_name);
5031 }
5032 else
5033 {
5034 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5035 new_name, new_temp);
5036 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5037 gimple_assign_set_lhs (epilog_stmt, new_temp);
5038 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5039 }
5040 }
5041 }
5042
5043 /* The only case where we need to reduce scalar results in SLP is
5044 unrolling. If the size of SCALAR_RESULTS is greater than
5045 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5046 REDUC_GROUP_SIZE. */
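/* Illustration only: with REDUC_GROUP_SIZE == 2 and four collected
   results r0, r1, r2, r3 the loop below produces
     scalar_results[0] = r0 op r2
     scalar_results[1] = r1 op r3
   leaving one result per SLP group member.  */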
5047 if (slp_reduc)
5048 {
5049 tree res, first_res, new_res;
5050 gimple *new_stmt;
5051
5052 /* Reduce multiple scalar results in case of SLP unrolling. */
5053 for (j = group_size; scalar_results.iterate (j, &res);
5054 j++)
5055 {
5056 first_res = scalar_results[j % group_size];
5057 new_stmt = gimple_build_assign (new_scalar_dest, code,
5058 first_res, res);
5059 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5060 gimple_assign_set_lhs (new_stmt, new_res);
5061 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5062 scalar_results[j % group_size] = new_res;
5063 }
5064 }
5065 else
5066 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5067 scalar_results.safe_push (new_temp);
5068 }
5069
5070 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5071 && induc_val)
5072 {
5073 /* Earlier we set the initial value to be a vector of induc_val
5074 values. Check the result and if it is induc_val then replace
5075 it with the original initial value, unless induc_val is
5076 the same as initial_def already. */
5077 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5078 induc_val);
5079
5080 tree tmp = make_ssa_name (new_scalar_dest);
5081 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5082 initial_def, new_temp);
5083 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5084 scalar_results[0] = tmp;
5085 }
5086 }
5087
5088 /* 2.5 Adjust the final result by the initial value of the reduction
5089 variable. (When such adjustment is not needed, then
5090 'adjustment_def' is zero). For example, if code is PLUS we create:
5091 new_temp = loop_exit_def + adjustment_def */
5092
5093 if (adjustment_def)
5094 {
5095 gcc_assert (!slp_reduc);
5096 if (nested_in_vect_loop)
5097 {
5098 new_phi = new_phis[0];
5099 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5100 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5101 new_dest = vect_create_destination_var (scalar_dest, vectype);
5102 }
5103 else
5104 {
5105 new_temp = scalar_results[0];
5106 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5107 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5108 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5109 }
5110
5111 epilog_stmt = gimple_build_assign (new_dest, expr);
5112 new_temp = make_ssa_name (new_dest, epilog_stmt);
5113 gimple_assign_set_lhs (epilog_stmt, new_temp);
5114 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5115 if (nested_in_vect_loop)
5116 {
5117 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5118 STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5119 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5120
5121 if (!double_reduc)
5122 scalar_results.quick_push (new_temp);
5123 else
5124 scalar_results[0] = new_temp;
5125 }
5126 else
5127 scalar_results[0] = new_temp;
5128
5129 new_phis[0] = epilog_stmt;
5130 }
5131
5132 if (double_reduc)
5133 loop = loop->inner;
5134
5135 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5136 phis with new adjusted scalar results, i.e., replace use <s_out0>
5137 with use <s_out4>.
5138
5139 Transform:
5140 loop_exit:
5141 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5142 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5143 v_out2 = reduce <v_out1>
5144 s_out3 = extract_field <v_out2, 0>
5145 s_out4 = adjust_result <s_out3>
5146 use <s_out0>
5147 use <s_out0>
5148
5149 into:
5150
5151 loop_exit:
5152 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5153 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5154 v_out2 = reduce <v_out1>
5155 s_out3 = extract_field <v_out2, 0>
5156 s_out4 = adjust_result <s_out3>
5157 use <s_out4>
5158 use <s_out4> */
5159
5160
5161 /* In an SLP reduction chain we reduce the vector results into one vector
5162 if necessary, hence we set REDUC_GROUP_SIZE to 1 here. SCALAR_DEST is
5163 the LHS of the last stmt in the reduction chain, since that is where we
5164 look for the loop exit phi node. */
5165 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5166 {
5167 stmt_vec_info dest_stmt_info
5168 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5169 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5170 group_size = 1;
5171 }
5172
5173 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5174 case REDUC_GROUP_SIZE is greater than the vectorization factor).
5175 Therefore, we need to match SCALAR_RESULTS with the corresponding statements.
5176 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5177 correspond to the first vector stmt, etc.
5178 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
5179 if (group_size > new_phis.length ())
5180 gcc_assert (!(group_size % new_phis.length ()));
5181
5182 for (k = 0; k < group_size; k++)
5183 {
5184 if (slp_reduc)
5185 {
5186 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5187
5188 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5189 /* SLP statements can't participate in patterns. */
5190 gcc_assert (!orig_stmt_info);
5191 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5192 }
5193
5194 if (nested_in_vect_loop)
5195 {
5196 if (double_reduc)
5197 loop = outer_loop;
5198 else
5199 gcc_unreachable ();
5200 }
5201
5202 phis.create (3);
5203 /* Find the loop-closed-use at the loop exit of the original scalar
5204 result. (The reduction result is expected to have two immediate uses,
5205 one at the latch block, and one at the loop exit). For double
5206 reductions we are looking for exit phis of the outer loop. */
5207 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5208 {
5209 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5210 {
5211 if (!is_gimple_debug (USE_STMT (use_p)))
5212 phis.safe_push (USE_STMT (use_p));
5213 }
5214 else
5215 {
5216 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5217 {
5218 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5219
5220 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5221 {
5222 if (!flow_bb_inside_loop_p (loop,
5223 gimple_bb (USE_STMT (phi_use_p)))
5224 && !is_gimple_debug (USE_STMT (phi_use_p)))
5225 phis.safe_push (USE_STMT (phi_use_p));
5226 }
5227 }
5228 }
5229 }
5230
5231 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5232 {
5233 /* Replace the uses: */
5234 orig_name = PHI_RESULT (exit_phi);
5235 scalar_result = scalar_results[k];
5236 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5237 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5238 SET_USE (use_p, scalar_result);
5239 }
5240
5241 phis.release ();
5242 }
5243 }
5244
5245 /* Return a vector of type VECTYPE that is equal to the vector select
5246 operation "MASK ? VEC : IDENTITY". Insert the select statements
5247 before GSI. */
5248
5249 static tree
5250 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5251 tree vec, tree identity)
5252 {
5253 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5254 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5255 mask, vec, identity);
5256 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5257 return cond;
5258 }
5259
5260 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5261 order, starting with LHS. Insert the extraction statements before GSI and
5262 associate the new scalar SSA names with variable SCALAR_DEST.
5263 Return the SSA name for the result. */
5264
5265 static tree
5266 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5267 tree_code code, tree lhs, tree vector_rhs)
5268 {
5269 tree vectype = TREE_TYPE (vector_rhs);
5270 tree scalar_type = TREE_TYPE (vectype);
5271 tree bitsize = TYPE_SIZE (scalar_type);
5272 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5273 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5274
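  /* Illustrative sketch (assuming a four-element vector of 32-bit elements
     and CODE == PLUS_EXPR): the loop below expands into
       s_0 = BIT_FIELD_REF <vector_rhs, 32, 0>;   lhs_1 = lhs + s_0;
       s_1 = BIT_FIELD_REF <vector_rhs, 32, 32>;  lhs_2 = lhs_1 + s_1;
       s_2 = BIT_FIELD_REF <vector_rhs, 32, 64>;  lhs_3 = lhs_2 + s_2;
       s_3 = BIT_FIELD_REF <vector_rhs, 32, 96>;  lhs_4 = lhs_3 + s_3;
     and returns lhs_4, preserving the strict left-to-right order.  */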
5275 for (unsigned HOST_WIDE_INT bit_offset = 0;
5276 bit_offset < vec_size_in_bits;
5277 bit_offset += element_bitsize)
5278 {
5279 tree bitpos = bitsize_int (bit_offset);
5280 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5281 bitsize, bitpos);
5282
5283 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5284 rhs = make_ssa_name (scalar_dest, stmt);
5285 gimple_assign_set_lhs (stmt, rhs);
5286 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5287
5288 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5289 tree new_name = make_ssa_name (scalar_dest, stmt);
5290 gimple_assign_set_lhs (stmt, new_name);
5291 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5292 lhs = new_name;
5293 }
5294 return lhs;
5295 }
5296
5297 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
5298 type of the vector input. */
5299
5300 static internal_fn
5301 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
5302 {
5303 internal_fn mask_reduc_fn;
5304
5305 switch (reduc_fn)
5306 {
5307 case IFN_FOLD_LEFT_PLUS:
5308 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
5309 break;
5310
5311 default:
5312 return IFN_LAST;
5313 }
5314
5315 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
5316 OPTIMIZE_FOR_SPEED))
5317 return mask_reduc_fn;
5318 return IFN_LAST;
5319 }
5320
5321 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5322 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5323 statement. CODE is the operation performed by STMT_INFO and OPS are
5324 its scalar operands. REDUC_INDEX is the index of the operand in
5325 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5326 implements in-order reduction, or IFN_LAST if we should open-code it.
5327 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5328 that should be used to control the operation in a fully-masked loop. */
5329
5330 static bool
5331 vectorize_fold_left_reduction (stmt_vec_info stmt_info,
5332 gimple_stmt_iterator *gsi,
5333 stmt_vec_info *vec_stmt, slp_tree slp_node,
5334 gimple *reduc_def_stmt,
5335 tree_code code, internal_fn reduc_fn,
5336 tree ops[3], tree vectype_in,
5337 int reduc_index, vec_loop_masks *masks)
5338 {
5339 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5340 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5341 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5342 stmt_vec_info new_stmt_info = NULL;
5343 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
5344
5345 int ncopies;
5346 if (slp_node)
5347 ncopies = 1;
5348 else
5349 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5350
5351 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5352 gcc_assert (ncopies == 1);
5353 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5354
5355 if (slp_node)
5356 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5357 TYPE_VECTOR_SUBPARTS (vectype_in)));
5358
5359 tree op0 = ops[1 - reduc_index];
5360
5361 int group_size = 1;
5362 stmt_vec_info scalar_dest_def_info;
5363 auto_vec<tree> vec_oprnds0;
5364 if (slp_node)
5365 {
5366 auto_vec<vec<tree> > vec_defs (2);
5367 auto_vec<tree> sops(2);
5368 sops.quick_push (ops[0]);
5369 sops.quick_push (ops[1]);
5370 vect_get_slp_defs (sops, slp_node, &vec_defs);
5371 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
5372 vec_defs[0].release ();
5373 vec_defs[1].release ();
5374 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5375 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5376 }
5377 else
5378 {
5379 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt_info);
5380 vec_oprnds0.create (1);
5381 vec_oprnds0.quick_push (loop_vec_def0);
5382 scalar_dest_def_info = stmt_info;
5383 }
5384
5385 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5386 tree scalar_type = TREE_TYPE (scalar_dest);
5387 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5388
5389 int vec_num = vec_oprnds0.length ();
5390 gcc_assert (vec_num == 1 || slp_node);
5391 tree vec_elem_type = TREE_TYPE (vectype_out);
5392 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5393
5394 tree vector_identity = NULL_TREE;
5395 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5396 vector_identity = build_zero_cst (vectype_out);
5397
5398 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5399 int i;
5400 tree def0;
5401 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5402 {
5403 gimple *new_stmt;
5404 tree mask = NULL_TREE;
5405 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5406 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5407
5408 /* Handle MINUS by adding the negative. */
5409 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5410 {
5411 tree negated = make_ssa_name (vectype_out);
5412 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5413 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5414 def0 = negated;
5415 }
5416
5417 if (mask && mask_reduc_fn == IFN_LAST)
5418 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5419 vector_identity);
5420
5421 /* On the first iteration the input is simply the scalar phi
5422 result, and for subsequent iterations it is the output of
5423 the preceding operation. */
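      /* For example (a sketch assuming IFN_FOLD_LEFT_PLUS and a chained SLP
         group of two vectors), the emitted sequence is roughly:
           reduc_1 = .FOLD_LEFT_PLUS (reduc_phi, vec_def_0);
           scalar_dest = .FOLD_LEFT_PLUS (reduc_1, vec_def_1);
         with .MASK_FOLD_LEFT_PLUS (reduc, vec_def, loop_mask) used instead
         when the loop is fully masked and the target supports it.  */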
5424 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
5425 {
5426 if (mask && mask_reduc_fn != IFN_LAST)
5427 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
5428 def0, mask);
5429 else
5430 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
5431 def0);
5432 /* For chained SLP reductions the output of the previous reduction
5433 operation serves as the input of the next. For the final statement
5434 the output cannot be a temporary - we reuse the original
5435 scalar destination of the last statement. */
5436 if (i != vec_num - 1)
5437 {
5438 gimple_set_lhs (new_stmt, scalar_dest_var);
5439 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5440 gimple_set_lhs (new_stmt, reduc_var);
5441 }
5442 }
5443 else
5444 {
5445 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5446 reduc_var, def0);
5447 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5448 /* Remove the statement, so that we can use the same code paths
5449 as for statements that we've just created. */
5450 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5451 gsi_remove (&tmp_gsi, true);
5452 }
5453
5454 if (i == vec_num - 1)
5455 {
5456 gimple_set_lhs (new_stmt, scalar_dest);
5457 new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
5458 new_stmt);
5459 }
5460 else
5461 new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
5462 new_stmt, gsi);
5463
5464 if (slp_node)
5465 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
5466 }
5467
5468 if (!slp_node)
5469 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
5470
5471 return true;
5472 }
5473
5474 /* Function is_nonwrapping_integer_induction.
5475
5476 Check if STMT_VINFO (which is part of loop LOOP) describes an integer
5477 induction that both increments and is guaranteed not to overflow. */
5478
5479 static bool
5480 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
5481 {
5482 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
5483 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5484 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5485 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
5486 widest_int ni, max_loop_value, lhs_max;
5487 wi::overflow_type overflow = wi::OVF_NONE;
5488
5489 /* Make sure the loop is integer based. */
5490 if (TREE_CODE (base) != INTEGER_CST
5491 || TREE_CODE (step) != INTEGER_CST)
5492 return false;
5493
5494 /* Check that the max size of the loop will not wrap. */
5495
5496 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5497 return true;
5498
5499 if (! max_stmt_executions (loop, &ni))
5500 return false;
5501
5502 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5503 &overflow);
5504 if (overflow)
5505 return false;
5506
5507 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5508 TYPE_SIGN (lhs_type), &overflow);
5509 if (overflow)
5510 return false;
5511
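  /* Illustration (not from the sources): with base 0, step 4 and a loop
     that runs at most 1000 times, the induction never exceeds roughly
     4000, which needs far fewer bits than a 32-bit induction variable
     provides, so it cannot wrap.  */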
5512 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5513 <= TYPE_PRECISION (lhs_type));
5514 }
5515
5516 /* Check if masking can be supported by inserting a conditional expression.
5517 CODE is the code for the operation. COND_FN is the conditional internal
5518 function, if it exists. VECTYPE_IN is the type of the vector input. */
5519 static bool
5520 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
5521 tree vectype_in)
5522 {
5523 if (cond_fn != IFN_LAST
5524 && direct_internal_fn_supported_p (cond_fn, vectype_in,
5525 OPTIMIZE_FOR_SPEED))
5526 return false;
5527
5528 switch (code)
5529 {
5530 case DOT_PROD_EXPR:
5531 case SAD_EXPR:
5532 return true;
5533
5534 default:
5535 return false;
5536 }
5537 }
5538
5539 /* Insert a conditional expression to enable masked vectorization. CODE is the
5540 code for the operation. VOP is the array of operands. MASK is the loop
5541 mask. GSI is a statement iterator used to place the new conditional
5542 expression. */
5543 static void
5544 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
5545 gimple_stmt_iterator *gsi)
5546 {
5547 switch (code)
5548 {
5549 case DOT_PROD_EXPR:
5550 {
5551 tree vectype = TREE_TYPE (vop[1]);
5552 tree zero = build_zero_cst (vectype);
5553 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
5554 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
5555 mask, vop[1], zero);
5556 gsi_insert_before (gsi, select, GSI_SAME_STMT);
5557 vop[1] = masked_op1;
5558 break;
5559 }
5560
5561 case SAD_EXPR:
5562 {
5563 tree vectype = TREE_TYPE (vop[1]);
5564 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
5565 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
5566 mask, vop[1], vop[0]);
5567 gsi_insert_before (gsi, select, GSI_SAME_STMT);
5568 vop[1] = masked_op1;
5569 break;
5570 }
5571
5572 default:
5573 gcc_unreachable ();
5574 }
5575 }
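/* Note (illustrative): for DOT_PROD_EXPR the select above yields
     masked_op1 = loop_mask ? op1 : { 0, ... }
   so inactive lanes contribute op0 * 0 == 0 to the accumulator, while for
   SAD_EXPR selecting op0 itself makes inactive lanes contribute
   |op0 - op0| == 0.  */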
5576
5577 /* Function vectorizable_reduction.
5578
5579 Check if STMT_INFO performs a reduction operation that can be vectorized.
5580 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
5581 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5582 Return true if STMT_INFO is vectorizable in this way.
5583
5584 This function also handles reduction idioms (patterns) that have been
5585 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
5586 may be of this form:
5587 X = pattern_expr (arg0, arg1, ..., X)
5588 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5589 sequence that had been detected and replaced by the pattern-stmt
5590 (STMT_INFO).
5591
5592 This function also handles reduction of condition expressions, for example:
5593 for (int i = 0; i < N; i++)
5594 if (a[i] < value)
5595 last = a[i];
5596 This is handled by vectorising the loop and creating an additional vector
5597 containing the loop indexes for which "a[i] < value" was true. In the
5598 function epilogue this is reduced to a single max value and then used to
5599 index into the vector of results.
5600
5601 In some cases of reduction patterns, the type of the reduction variable X is
5602 different than the type of the other arguments of STMT_INFO.
5603 In such cases, the vectype that is used when transforming STMT_INFO into
5604 a vector stmt is different than the vectype that is used to determine the
5605 vectorization factor, because it consists of a different number of elements
5606 than the actual number of elements that are being operated upon in parallel.
5607
5608 For example, consider an accumulation of shorts into an int accumulator.
5609 On some targets it's possible to vectorize this pattern operating on 8
5610 shorts at a time (hence, the vectype for purposes of determining the
5611 vectorization factor should be V8HI); on the other hand, the vectype that
5612 is used to create the vector form is actually V4SI (the type of the result).
5613
5614 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5615 indicates what is the actual level of parallelism (V8HI in the example), so
5616 that the right vectorization factor would be derived. This vectype
5617 corresponds to the type of arguments to the reduction stmt, and should *NOT*
5618 be used to create the vectorized stmt. The right vectype for the vectorized
5619 stmt is obtained from the type of the result X:
5620 get_vectype_for_scalar_type (TREE_TYPE (X))
5621
5622 This means that, contrary to "regular" reductions (or "regular" stmts in
5623 general), the following equation:
5624 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5625 does *NOT* necessarily hold for reduction patterns. */
5626
5627 bool
5628 vectorizable_reduction (stmt_vec_info stmt_info, slp_tree slp_node,
5629 slp_instance slp_node_instance,
5630 stmt_vector_for_cost *cost_vec)
5631 {
5632 tree scalar_dest;
5633 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5634 tree vectype_in = NULL_TREE;
5635 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5636 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5637 enum tree_code code;
5638 int op_type;
5639 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
5640 stmt_vec_info cond_stmt_vinfo = NULL;
5641 tree scalar_type;
5642 int i;
5643 int ncopies;
5644 bool single_defuse_cycle = false;
5645 tree ops[3];
5646 enum vect_def_type dts[3];
5647 bool nested_cycle = false, found_nested_cycle_def = false;
5648 bool double_reduc = false;
5649 int vec_num;
5650 tree tem;
5651 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5652 tree cond_reduc_val = NULL_TREE;
5653
5654 /* Make sure it was already recognized as a reduction computation. */
5655 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
5656 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
5657 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
5658 return false;
5659
5660 /* The stmt we store reduction analysis meta on. */
5661 stmt_vec_info reduc_info = info_for_reduction (stmt_info);
5662 reduc_info->is_reduc_info = true;
5663
5664 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5665 {
5666 if (is_a <gphi *> (stmt_info->stmt))
5667 /* Analysis for double-reduction is done on the outer
5668 loop PHI, nested cycles have no further restrictions. */
5669 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
5670 else
5671 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5672 return true;
5673 }
5674
5675 stmt_vec_info orig_stmt_of_analysis = stmt_info;
5676 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
5677 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5678 {
5679 if (!is_a <gphi *> (stmt_info->stmt))
5680 {
5681 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5682 return true;
5683 }
5684 if (slp_node)
5685 {
5686 slp_node_instance->reduc_phis = slp_node;
5687 /* ??? We're leaving slp_node to point to the PHIs; we only
5688 need it to get at the number of vector stmts, which wasn't
5689 yet initialized for the instance root. */
5690 }
5691 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5692 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
5693 else /* STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def */
5694 {
5695 use_operand_p use_p;
5696 gimple *use_stmt;
5697 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
5698 &use_p, &use_stmt);
5699 gcc_assert (res);
5700 stmt_info = loop_vinfo->lookup_stmt (use_stmt);
5701 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
5702 }
5703 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
5704 element. */
5705 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5706 {
5707 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
5708 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
5709 }
5710 }
5711
5712 if (nested_in_vect_loop_p (loop, stmt_info))
5713 {
5714 loop = loop->inner;
5715 nested_cycle = true;
5716 }
5717
5718 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5719 gcc_assert (slp_node
5720 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
5721
5722 /* 1. Is vectorizable reduction? */
5723 /* Not supportable if the reduction variable is used in the loop, unless
5724 it's a reduction chain. */
5725 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
5726 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5727 return false;
5728
5729 /* Reductions that are not used even in an enclosing outer-loop,
5730 are expected to be "live" (used out of the loop). */
5731 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
5732 && !STMT_VINFO_LIVE_P (stmt_info))
5733 return false;
5734
5735 /* 2. Has this been recognized as a reduction pattern?
5736
5737 Check if STMT represents a pattern that has been recognized
5738 in earlier analysis stages. For stmts that represent a pattern,
5739 the STMT_VINFO_RELATED_STMT field records the last stmt in
5740 the original sequence that constitutes the pattern. */
5741
5742 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
5743 if (orig_stmt_info)
5744 {
5745 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5746 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
5747 }
5748
5749 /* 3. Check the operands of the operation. The first operands are defined
5750 inside the loop body. The last operand is the reduction variable,
5751 which is defined by the loop-header-phi. */
5752
5753 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
5754
5755 /* Flatten RHS. */
5756 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
5757 {
5758 case GIMPLE_BINARY_RHS:
5759 code = gimple_assign_rhs_code (stmt);
5760 op_type = TREE_CODE_LENGTH (code);
5761 gcc_assert (op_type == binary_op);
5762 ops[0] = gimple_assign_rhs1 (stmt);
5763 ops[1] = gimple_assign_rhs2 (stmt);
5764 break;
5765
5766 case GIMPLE_TERNARY_RHS:
5767 code = gimple_assign_rhs_code (stmt);
5768 op_type = TREE_CODE_LENGTH (code);
5769 gcc_assert (op_type == ternary_op);
5770 ops[0] = gimple_assign_rhs1 (stmt);
5771 ops[1] = gimple_assign_rhs2 (stmt);
5772 ops[2] = gimple_assign_rhs3 (stmt);
5773 break;
5774
5775 case GIMPLE_UNARY_RHS:
5776 case GIMPLE_SINGLE_RHS:
5777 return false;
5778
5779 default:
5780 gcc_unreachable ();
5781 }
5782
5783 if (code == COND_EXPR && slp_node)
5784 return false;
5785
5786 scalar_dest = gimple_assign_lhs (stmt);
5787 scalar_type = TREE_TYPE (scalar_dest);
5788 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
5789 && !SCALAR_FLOAT_TYPE_P (scalar_type))
5790 return false;
5791
5792 /* Do not try to vectorize bit-precision reductions. */
5793 if (!type_has_mode_precision_p (scalar_type))
5794 return false;
5795
5796 /* All uses but the last are expected to be defined in the loop.
5797 The last use is the reduction variable. In case of a nested cycle this
5798 assumption is not true: we use reduc_index to record the index of the
5799 reduction variable. */
5800 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
5801 /* PHIs should not participate in patterns. */
5802 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
5803 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
5804 tree reduc_def = PHI_RESULT (reduc_def_phi);
5805 int reduc_index = -1;
5806 for (i = 0; i < op_type; i++)
5807 {
5808 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
5809 if (i == 0 && code == COND_EXPR)
5810 continue;
5811
5812 stmt_vec_info def_stmt_info;
5813 if (!vect_is_simple_use (ops[i], loop_vinfo, &dts[i], &tem,
5814 &def_stmt_info))
5815 {
5816 if (dump_enabled_p ())
5817 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5818 "use not simple.\n");
5819 return false;
5820 }
5821 dt = dts[i];
5822 if (dt == vect_reduction_def
5823 && ops[i] == reduc_def)
5824 {
5825 reduc_index = i;
5826 continue;
5827 }
5828 else if (tem)
5829 {
5830 /* To properly compute ncopies we are interested in the widest
5831 input type in case we're looking at a widening accumulation. */
5832 if (!vectype_in
5833 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
5834 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
5835 vectype_in = tem;
5836 }
5837
5838 if (dt != vect_internal_def
5839 && dt != vect_external_def
5840 && dt != vect_constant_def
5841 && dt != vect_induction_def
5842 && !(dt == vect_nested_cycle && nested_cycle))
5843 return false;
5844
5845 if (dt == vect_nested_cycle
5846 && ops[i] == reduc_def)
5847 {
5848 found_nested_cycle_def = true;
5849 reduc_index = i;
5850 }
5851
5852 if (code == COND_EXPR)
5853 {
5854 /* Record how the non-reduction-def value of COND_EXPR is defined. */
5855 if (dt == vect_constant_def)
5856 {
5857 cond_reduc_dt = dt;
5858 cond_reduc_val = ops[i];
5859 }
5860 if (dt == vect_induction_def
5861 && def_stmt_info
5862 && is_nonwrapping_integer_induction (def_stmt_info, loop))
5863 {
5864 cond_reduc_dt = dt;
5865 cond_stmt_vinfo = def_stmt_info;
5866 }
5867 }
5868 }
5869 if (!vectype_in)
5870 vectype_in = vectype_out;
5871 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
5872 /* For the SSA cycle we store on each participating stmt the operand index
5873 where the cycle continues. Store the one relevant for the actual
5874 operation in the reduction meta. */
5875 STMT_VINFO_REDUC_IDX (reduc_info) = reduc_index;
5876
5877 if (!(reduc_index == -1
5878 || dts[reduc_index] == vect_reduction_def
5879 || dts[reduc_index] == vect_nested_cycle
5880 || ((dts[reduc_index] == vect_internal_def
5881 || dts[reduc_index] == vect_external_def
5882 || dts[reduc_index] == vect_constant_def
5883 || dts[reduc_index] == vect_induction_def)
5884 && nested_cycle && found_nested_cycle_def)))
5885 {
5886 /* For pattern-recognized stmts, orig_stmt might be a reduction,
5887 but some helper statements for the pattern might not be, or
5888 might be COND_EXPRs with reduction uses in the condition. */
5889 gcc_assert (orig_stmt_info);
5890 return false;
5891 }
5892
5893 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
5894 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
5895 /* If we have a condition reduction, see if we can simplify it further. */
5896 if (v_reduc_type == COND_REDUCTION)
5897 {
5898 /* TODO: We can't yet handle reduction chains, since we need to treat
5899 each COND_EXPR in the chain specially, not just the last one.
5900 E.g. for:
5901
5902 x_1 = PHI <x_3, ...>
5903 x_2 = a_2 ? ... : x_1;
5904 x_3 = a_3 ? ... : x_2;
5905
5906 we're interested in the last element in x_3 for which a_2 || a_3
5907 is true, whereas the current reduction chain handling would
5908 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
5909 as a reduction operation. */
5910 if (reduc_index == -1)
5911 {
5912 if (dump_enabled_p ())
5913 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5914 "conditional reduction chains not supported\n");
5915 return false;
5916 }
5917
5918 /* When the condition uses the reduction value in the condition, fail. */
5919 if (reduc_index == 0)
5920 {
5921 if (dump_enabled_p ())
5922 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5923 "condition depends on previous iteration\n");
5924 return false;
5925 }
5926
5927 if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
5928 vectype_in, OPTIMIZE_FOR_SPEED))
5929 {
5930 if (dump_enabled_p ())
5931 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5932 "optimizing condition reduction with"
5933 " FOLD_EXTRACT_LAST.\n");
5934 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
5935 }
5936 else if (cond_reduc_dt == vect_induction_def)
5937 {
5938 tree base
5939 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
5940 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
5941
5942 gcc_assert (TREE_CODE (base) == INTEGER_CST
5943 && TREE_CODE (step) == INTEGER_CST);
5944 cond_reduc_val = NULL_TREE;
5945 enum tree_code cond_reduc_op_code = ERROR_MARK;
5946 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
5947 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
5948 ;
5949 /* Find a suitable value: below base for MAX_EXPR, above base for
5950 MIN_EXPR. For now punt if base is the minimum value of the type
5951 for MAX_EXPR or the maximum value of the type for MIN_EXPR. */
5952 else if (tree_int_cst_sgn (step) == -1)
5953 {
5954 cond_reduc_op_code = MIN_EXPR;
5955 if (tree_int_cst_sgn (base) == -1)
5956 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
5957 else if (tree_int_cst_lt (base,
5958 TYPE_MAX_VALUE (TREE_TYPE (base))))
5959 cond_reduc_val
5960 = int_const_binop (PLUS_EXPR, base, integer_one_node);
5961 }
5962 else
5963 {
5964 cond_reduc_op_code = MAX_EXPR;
5965 if (tree_int_cst_sgn (base) == 1)
5966 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
5967 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
5968 base))
5969 cond_reduc_val
5970 = int_const_binop (MINUS_EXPR, base, integer_one_node);
5971 }
5972 if (cond_reduc_val)
5973 {
5974 if (dump_enabled_p ())
5975 dump_printf_loc (MSG_NOTE, vect_location,
5976 "condition expression based on "
5977 "integer induction.\n");
5978 STMT_VINFO_VEC_COND_REDUC_CODE (reduc_info) = cond_reduc_op_code;
5979 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
5980 = cond_reduc_val;
5981 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
5982 }
5983 }
5984 else if (cond_reduc_dt == vect_constant_def)
5985 {
5986 enum vect_def_type cond_initial_dt;
5987 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
5988 tree cond_initial_val
5989 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
5990
5991 gcc_assert (cond_reduc_val != NULL_TREE);
5992 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
5993 if (cond_initial_dt == vect_constant_def
5994 && types_compatible_p (TREE_TYPE (cond_initial_val),
5995 TREE_TYPE (cond_reduc_val)))
5996 {
5997 tree e = fold_binary (LE_EXPR, boolean_type_node,
5998 cond_initial_val, cond_reduc_val);
5999 if (e && (integer_onep (e) || integer_zerop (e)))
6000 {
6001 if (dump_enabled_p ())
6002 dump_printf_loc (MSG_NOTE, vect_location,
6003 "condition expression based on "
6004 "compile time constant.\n");
6005 /* Record reduction code at analysis stage. */
6006 STMT_VINFO_VEC_COND_REDUC_CODE (reduc_info)
6007 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6008 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6009 }
6010 }
6011 }
6012 }
6013
6014 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6015 /* We changed STMT to be the first stmt in the reduction chain, hence
6016 we check that in this case the first element in the chain is STMT. */
6017 gcc_assert (REDUC_GROUP_FIRST_ELEMENT (STMT_VINFO_REDUC_DEF (phi_info))
6018 == vect_orig_stmt (stmt_info));
6019
6020 if (STMT_VINFO_LIVE_P (phi_info))
6021 return false;
6022
6023 if (slp_node)
6024 ncopies = 1;
6025 else
6026 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6027
6028 gcc_assert (ncopies >= 1);
6029
6030 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6031
6032 if (nested_cycle)
6033 {
6034 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6035 == vect_double_reduction_def);
6036 double_reduc = true;
6037 }
6038
6039 /* 4.2. Check support for the epilog operation.
6040
6041 If STMT represents a reduction pattern, then the type of the
6042 reduction variable may be different than the type of the rest
6043 of the arguments. For example, consider the case of accumulation
6044 of shorts into an int accumulator. The original code:
6045 S1: int_a = (int) short_a;
6046 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6047
6048 was replaced with:
6049 STMT: int_acc = widen_sum <short_a, int_acc>
6050
6051 This means that:
6052 1. The tree-code that is used to create the vector operation in the
6053 epilog code (that reduces the partial results) is not the
6054 tree-code of STMT, but is rather the tree-code of the original
6055 stmt from the pattern that STMT is replacing. I.e, in the example
6056 above we want to use 'widen_sum' in the loop, but 'plus' in the
6057 epilog.
6058 2. The type (mode) we use to check available target support
6059 for the vector operation to be created in the *epilog*, is
6060 determined by the type of the reduction variable (in the example
6061 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6062 However the type (mode) we use to check available target support
6063 for the vector operation to be created *inside the loop*, is
6064 determined by the type of the other arguments to STMT (in the
6065 example we'd check this: optab_handler (widen_sum_optab,
6066 vect_short_mode)).
6067
6068 This is contrary to "regular" reductions, in which the types of all
6069 the arguments are the same as the type of the reduction variable.
6070 For "regular" reductions we can therefore use the same vector type
6071 (and also the same tree-code) when generating the epilog code and
6072 when generating the code inside the loop. */
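  /* A sketch of the short-into-int example above (illustrative only):
       in the loop:   vec_acc = widen_sum <vec_short, vec_acc>
         -- support is checked on the V8HI input mode;
       in the epilog: sum = reduction of the int elements of vec_acc
         -- support is checked on the V4SI result mode.  */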
6073
6074 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6075 enum tree_code orig_code = ERROR_MARK;
6076 if (reduction_type == CONST_COND_REDUCTION
6077 || reduction_type == INTEGER_INDUC_COND_REDUCTION)
6078 {
6079 /* For simple condition reductions, replace with the actual expression
6080 we want to base our reduction around. */
6081 orig_code = STMT_VINFO_VEC_COND_REDUC_CODE (reduc_info);
6082 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6083 }
6084 else if (reduction_type == COND_REDUCTION)
6085 orig_code = COND_EXPR;
6086 else if (reduction_type == TREE_CODE_REDUCTION
6087 || reduction_type == FOLD_LEFT_REDUCTION)
6088 {
6089 if (orig_stmt_info)
6090 orig_code = gimple_assign_rhs_code (orig_stmt_info->stmt);
6091 else
6092 orig_code = code;
6093 gcc_assert (vectype_out);
6094 if (orig_code == MINUS_EXPR)
6095 orig_code = PLUS_EXPR;
6096 }
6097 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6098
6099 if (reduction_type == TREE_CODE_REDUCTION)
6100 {
6101 /* Check whether it's ok to change the order of the computation.
6102 Generally, when vectorizing a reduction we change the order of the
6103 computation. This may change the behavior of the program in some
6104 cases, so we need to check that this is ok. One exception is when
6105 vectorizing an outer-loop: the inner-loop is executed sequentially,
6106 and therefore vectorizing reductions in the inner-loop during
6107 outer-loop vectorization is safe. */
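      /* For example, a float accumulation such as
           for (i = 0; i < n; i++) s += a[i];
         compiled without -fassociative-math must preserve the left-to-right
         evaluation order and is therefore handled as FOLD_LEFT_REDUCTION
         below.  */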
6108 if (needs_fold_left_reduction_p (scalar_type, orig_code))
6109 {
6110 STMT_VINFO_REDUC_TYPE (reduc_info)
6111 = reduction_type = FOLD_LEFT_REDUCTION;
6112 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6113 directly used in the stmt. */
6114 if (reduc_index == -1)
6115 {
6116 if (dump_enabled_p ())
6117 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6118 "in-order reduction chain without SLP.\n");
6119 return false;
6120 }
6121 }
6122 else if (!commutative_tree_code (orig_code)
6123 || !associative_tree_code (orig_code))
6124 {
6125 if (dump_enabled_p ())
6126 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6127 "reduction: not commutative/associative");
6128 return false;
6129 }
6130 }
6131
6132 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6133 && ncopies > 1)
6134 {
6135 if (dump_enabled_p ())
6136 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6137 "multiple types in double reduction or condition "
6138 "reduction or fold-left reduction.\n");
6139 return false;
6140 }
6141
6142 internal_fn reduc_fn = IFN_LAST;
6143 if (reduction_type == TREE_CODE_REDUCTION
6144 || reduction_type == FOLD_LEFT_REDUCTION
6145 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6146 || reduction_type == CONST_COND_REDUCTION)
6147 {
6148 if (reduction_type == FOLD_LEFT_REDUCTION
6149 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6150 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6151 {
6152 if (reduc_fn != IFN_LAST
6153 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6154 OPTIMIZE_FOR_SPEED))
6155 {
6156 if (dump_enabled_p ())
6157 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6158 "reduc op not supported by target.\n");
6159
6160 reduc_fn = IFN_LAST;
6161 }
6162 }
6163 else
6164 {
6165 if (!nested_cycle || double_reduc)
6166 {
6167 if (dump_enabled_p ())
6168 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6169 "no reduc code for scalar code.\n");
6170
6171 return false;
6172 }
6173 }
6174 }
6175 else if (reduction_type == COND_REDUCTION)
6176 {
6177 int scalar_precision
6178 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6179 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6180 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6181 nunits_out);
6182
6183 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6184 OPTIMIZE_FOR_SPEED))
6185 reduc_fn = IFN_REDUC_MAX;
6186 }
6187 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
6188
6189 if (reduction_type != EXTRACT_LAST_REDUCTION
6190 && (!nested_cycle || double_reduc)
6191 && reduc_fn == IFN_LAST
6192 && !nunits_out.is_constant ())
6193 {
6194 if (dump_enabled_p ())
6195 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6196 "missing target support for reduction on"
6197 " variable-length vectors.\n");
6198 return false;
6199 }
6200
6201 /* For SLP reductions, see if there is a neutral value we can use. */
6202 tree neutral_op = NULL_TREE;
6203 if (slp_node)
6204 neutral_op = neutral_op_for_slp_reduction
6205 (slp_node_instance->reduc_phis, code,
6206 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6207
6208 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6209 {
6210 /* We can't support in-order reductions of code such as this:
6211
6212 for (int i = 0; i < n1; ++i)
6213 for (int j = 0; j < n2; ++j)
6214 l += a[j];
6215
6216 since GCC effectively transforms the loop when vectorizing:
6217
6218 for (int i = 0; i < n1 / VF; ++i)
6219 for (int j = 0; j < n2; ++j)
6220 for (int k = 0; k < VF; ++k)
6221 l += a[j];
6222
6223 which is a reassociation of the original operation. */
6224 if (dump_enabled_p ())
6225 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6226 "in-order double reduction not supported.\n");
6227
6228 return false;
6229 }
6230
6231 if (reduction_type == FOLD_LEFT_REDUCTION
6232 && slp_node
6233 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6234 {
6235 /* We cannot use in-order reductions in this case because there is
6236 an implicit reassociation of the operations involved. */
6237 if (dump_enabled_p ())
6238 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6239 "in-order unchained SLP reductions not supported.\n");
6240 return false;
6241 }
6242
6243 /* For double reductions, and for SLP reductions with a neutral value,
6244 we construct a variable-length initial vector by loading a vector
6245 full of the neutral value and then shift-and-inserting the start
6246 values into the low-numbered elements. */
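  /* Roughly (an illustrative sketch): for a neutral value N and a single
     start value s the initial vector is built as
       init_0 = { N, N, ..., N };
       init_1 = .VEC_SHL_INSERT (init_0, s);
     so that s ends up in a low-numbered element and the remaining lanes
     hold the neutral value.  */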
6247 if ((double_reduc || neutral_op)
6248 && !nunits_out.is_constant ()
6249 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6250 vectype_out, OPTIMIZE_FOR_SPEED))
6251 {
6252 if (dump_enabled_p ())
6253 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6254 "reduction on variable-length vectors requires"
6255 " target support for a vector-shift-and-insert"
6256 " operation.\n");
6257 return false;
6258 }
6259
6260 /* Check extra constraints for variable-length unchained SLP reductions. */
6261 if (STMT_SLP_TYPE (stmt_info)
6262 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6263 && !nunits_out.is_constant ())
6264 {
6265 /* We checked above that we could build the initial vector when
6266 there's a neutral element value. Check here for the case in
6267 which each SLP statement has its own initial value and in which
6268 that value needs to be repeated for every instance of the
6269 statement within the initial vector. */
6270 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
6271 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6272 if (!neutral_op
6273 && !can_duplicate_and_interleave_p (group_size, elt_mode))
6274 {
6275 if (dump_enabled_p ())
6276 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6277 "unsupported form of SLP reduction for"
6278 " variable-length vectors: cannot build"
6279 " initial vector.\n");
6280 return false;
6281 }
6282 /* The epilogue code relies on the number of elements being a multiple
6283 of the group size. The duplicate-and-interleave approach to setting
6284 up the initial vector does too. */
6285 if (!multiple_p (nunits_out, group_size))
6286 {
6287 if (dump_enabled_p ())
6288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6289 "unsupported form of SLP reduction for"
6290 " variable-length vectors: the vector size"
6291 " is not a multiple of the number of results.\n");
6292 return false;
6293 }
6294 }
6295
6296 /* In case of a widening multiplication by a constant, we update the type
6297 of the constant to be the type of the other operand. We check that the
6298 constant fits the type in the pattern recognition pass. */
6299 if (code == DOT_PROD_EXPR
6300 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6301 /* No testcase for this. PR49478. */
6302 gcc_unreachable ();
6303
6304 if (reduction_type == COND_REDUCTION)
6305 {
6306 widest_int ni;
6307
6308 if (! max_loop_iterations (loop, &ni))
6309 {
6310 if (dump_enabled_p ())
6311 dump_printf_loc (MSG_NOTE, vect_location,
6312 "loop count not known, cannot create cond "
6313 "reduction.\n");
6314 return false;
6315 }
6316 /* Convert backedges to iterations. */
6317 ni += 1;
6318
6319 /* The additional index will be the same type as the condition. Check
6320 that the loop iteration count fits into this type less one (because
6321 we use up the zero slot for when there are no matches). */
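      /* E.g. (illustrative): with a 16-bit index type MAX_INDEX is 65535,
         so only loops with fewer iterations than that can be handled,
         index 0 being reserved for "no match".  */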
6322 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6323 if (wi::geu_p (ni, wi::to_widest (max_index)))
6324 {
6325 if (dump_enabled_p ())
6326 dump_printf_loc (MSG_NOTE, vect_location,
6327 "loop size is greater than data size.\n");
6328 return false;
6329 }
6330 }
6331
6332 /* In case the vectorization factor (VF) is bigger than the number
6333 of elements that we can fit in a vectype (nunits), we have to generate
6334 more than one vector stmt - i.e. - we need to "unroll" the
6335 vector stmt by a factor VF/nunits. For more details see documentation
6336 in vectorizable_operation. */
6337
6338 /* If the reduction is used in an outer loop we need to generate
6339 VF intermediate results, like so (e.g. for ncopies=2):
6340 r0 = phi (init, r0)
6341 r1 = phi (init, r1)
6342 r0 = x0 + r0;
6343 r1 = x1 + r1;
6344 (i.e. we generate VF results in 2 registers).
6345 In this case we have a separate def-use cycle for each copy, and therefore
6346 for each copy we get the vector def for the reduction variable from the
6347 respective phi node created for this copy.
6348
6349 Otherwise (the reduction is unused in the loop nest), we can combine
6350 together intermediate results, like so (e.g. for ncopies=2):
6351 r = phi (init, r)
6352 r = x0 + r;
6353 r = x1 + r;
6354 (i.e. we generate VF/2 results in a single register).
6355 In this case for each copy we get the vector def for the reduction variable
6356 from the vectorized reduction operation generated in the previous iteration.
6357
6358 This only works when we see both the reduction PHI and its only consumer
6359 in vectorizable_reduction and there are no intermediate stmts
6360 participating. */
6361 stmt_vec_info use_stmt_info;
6362 tree reduc_phi_result = gimple_phi_result (reduc_def_phi);
6363 if (ncopies > 1
6364 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6365 && (use_stmt_info = loop_vinfo->lookup_single_use (reduc_phi_result))
6366 && (!STMT_VINFO_IN_PATTERN_P (use_stmt_info)
6367 || !STMT_VINFO_PATTERN_DEF_SEQ (use_stmt_info))
6368 && vect_stmt_to_vectorize (use_stmt_info) == stmt_info)
6369 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle = true;
6370
6371 if (single_defuse_cycle
6372 || code == DOT_PROD_EXPR
6373 || code == WIDEN_SUM_EXPR
6374 || code == SAD_EXPR)
6375 {
6376 gcc_assert (code != COND_EXPR);
6377
6378 /* 4. Supportable by target? */
6379
6380 /* 4.1. check support for the operation in the loop */
6381 optab optab = optab_for_tree_code (code, vectype_in, optab_default);
6382 if (!optab)
6383 {
6384 if (dump_enabled_p ())
6385 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6386 "no optab.\n");
6387
6388 return false;
6389 }
6390
6391 machine_mode vec_mode = TYPE_MODE (vectype_in);
6392 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6393 {
6394 if (dump_enabled_p ())
6395 dump_printf (MSG_NOTE, "op not supported by target.\n");
6396
6397 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6398 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6399 return false;
6400
6401 if (dump_enabled_p ())
6402 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6403 }
6404
6405 /* Worthwhile without SIMD support? */
6406 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6407 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6408 {
6409 if (dump_enabled_p ())
6410 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6411 "not worthwhile without SIMD support.\n");
6412
6413 return false;
6414 }
6415 }
6416
6417 /* If the reduction stmt is one of the patterns that have lane
6418 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
6419 if ((ncopies > 1
6420 && ! single_defuse_cycle)
6421 && (code == DOT_PROD_EXPR
6422 || code == WIDEN_SUM_EXPR
6423 || code == SAD_EXPR))
6424 {
6425 if (dump_enabled_p ())
6426 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6427 "multi def-use cycle not possible for lane-reducing "
6428 "reduction operation\n");
6429 return false;
6430 }
6431
6432 if (slp_node)
6433 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6434 else
6435 vec_num = 1;
6436
6437 internal_fn cond_fn = get_conditional_internal_fn (code);
6438 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6439 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
6440
6441 vect_model_reduction_cost (stmt_info, reduc_fn, reduction_type, ncopies,
6442 cost_vec);
6443 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
6444 {
6445 if (reduction_type != FOLD_LEFT_REDUCTION
6446 && !mask_by_cond_expr
6447 && (cond_fn == IFN_LAST
6448 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6449 OPTIMIZE_FOR_SPEED)))
6450 {
6451 if (dump_enabled_p ())
6452 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6453 "can't use a fully-masked loop because no"
6454 " conditional operation is available.\n");
6455 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6456 }
6457 else if (reduc_index == -1)
6458 {
6459 if (dump_enabled_p ())
6460 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6461 "can't use a fully-masked loop for chained"
6462 " reductions.\n");
6463 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6464 }
6465 else
6466 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
6467 vectype_in);
6468 }
6469 if (dump_enabled_p ()
6470 && reduction_type == FOLD_LEFT_REDUCTION)
6471 dump_printf_loc (MSG_NOTE, vect_location,
6472 "using an in-order (fold-left) reduction.\n");
6473 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
6474 /* All but single def-use-cycle optimized, lane-reducing and fold-left
6475 reductions go through their own vectorizable_* routines. */
6476 if (!single_defuse_cycle
6477 && code != DOT_PROD_EXPR
6478 && code != WIDEN_SUM_EXPR
6479 && code != SAD_EXPR
6480 && reduction_type != FOLD_LEFT_REDUCTION)
6481 {
6482 STMT_VINFO_DEF_TYPE (stmt_info) = vect_internal_def;
6483 STMT_VINFO_DEF_TYPE (vect_orig_stmt (stmt_info)) = vect_internal_def;
6484 }
6485 return true;
6486 }
6487
6488 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
6489 value. */
6490
6491 bool
6492 vect_transform_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
6493 stmt_vec_info *vec_stmt, slp_tree slp_node)
6494 {
6495 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6496 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6497 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6498 int i;
6499 int ncopies;
6500 int j;
6501 int vec_num;
6502
6503 stmt_vec_info reduc_info = info_for_reduction (stmt_info);
6504 gcc_assert (reduc_info->is_reduc_info);
6505
6506 if (nested_in_vect_loop_p (loop, stmt_info))
6507 {
6508 loop = loop->inner;
6509 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
6510 }
6511
6512 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6513 enum tree_code code = gimple_assign_rhs_code (stmt);
6514 int op_type = TREE_CODE_LENGTH (code);
6515
6516 /* Flatten RHS. */
6517 tree ops[3];
6518 switch (get_gimple_rhs_class (code))
6519 {
6520 case GIMPLE_TERNARY_RHS:
6521 ops[2] = gimple_assign_rhs3 (stmt);
6522 /* Fall thru. */
6523 case GIMPLE_BINARY_RHS:
6524 ops[0] = gimple_assign_rhs1 (stmt);
6525 ops[1] = gimple_assign_rhs2 (stmt);
6526 break;
6527 default:
6528 gcc_unreachable ();
6529 }
6530
6531 /* All uses but the last are expected to be defined in the loop.
6532 The last use is the reduction variable. In case of a nested cycle this
6533 assumption does not hold: we use reduc_index to record the index of the
6534 reduction variable. */
6535 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
6536 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6537 int reduc_index = STMT_VINFO_REDUC_IDX (reduc_info);
6538 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
6539
6540 if (slp_node)
6541 {
6542 ncopies = 1;
6543 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6544 }
6545 else
6546 {
6547 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6548 vec_num = 1;
6549 }
6550
6551 internal_fn cond_fn = get_conditional_internal_fn (code);
6552 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6553 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
6554
6555 /* Transform. */
6556 stmt_vec_info new_stmt_info = NULL;
6557 stmt_vec_info prev_stmt_info;
6558 tree new_temp = NULL_TREE;
6559 auto_vec<tree> vec_oprnds0;
6560 auto_vec<tree> vec_oprnds1;
6561 auto_vec<tree> vec_oprnds2;
6562 tree def0;
6563
6564 if (dump_enabled_p ())
6565 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6566
6567 /* FORNOW: Multiple types are not supported for condition. */
6568 if (code == COND_EXPR)
6569 gcc_assert (ncopies == 1);
6570
6571 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
6572
6573 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6574 if (reduction_type == FOLD_LEFT_REDUCTION)
6575 {
6576 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
6577 return vectorize_fold_left_reduction
6578 (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
6579 reduc_fn, ops, vectype_in, reduc_index, masks);
6580 }
6581
6582 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
6583 gcc_assert (single_defuse_cycle
6584 || code == DOT_PROD_EXPR
6585 || code == WIDEN_SUM_EXPR
6586 || code == SAD_EXPR);
6587
6588 /* Create the destination vector */
6589 tree scalar_dest = gimple_assign_lhs (stmt);
6590 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6591
6592 prev_stmt_info = NULL;
6593 if (!slp_node)
6594 {
6595 vec_oprnds0.create (1);
6596 vec_oprnds1.create (1);
6597 if (op_type == ternary_op)
6598 vec_oprnds2.create (1);
6599 }
6600
6601 for (j = 0; j < ncopies; j++)
6602 {
6603 /* Handle uses. */
6604 if (j == 0)
6605 {
6606 if (slp_node)
6607 {
6608 /* Get vec defs for all the operands except the reduction index,
6609 ensuring the ordering of the ops in the vector is kept. */
6610 auto_vec<tree, 3> slp_ops;
6611 auto_vec<vec<tree>, 3> vec_defs;
6612
6613 slp_ops.quick_push (ops[0]);
6614 slp_ops.quick_push (ops[1]);
6615 if (op_type == ternary_op)
6616 slp_ops.quick_push (ops[2]);
6617
6618 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
6619
6620 vec_oprnds0.safe_splice (vec_defs[0]);
6621 vec_defs[0].release ();
6622 vec_oprnds1.safe_splice (vec_defs[1]);
6623 vec_defs[1].release ();
6624 if (op_type == ternary_op)
6625 {
6626 vec_oprnds2.safe_splice (vec_defs[2]);
6627 vec_defs[2].release ();
6628 }
6629 }
6630 else
6631 {
6632 vec_oprnds0.quick_push
6633 (vect_get_vec_def_for_operand (ops[0], stmt_info));
6634 vec_oprnds1.quick_push
6635 (vect_get_vec_def_for_operand (ops[1], stmt_info));
6636 if (op_type == ternary_op)
6637 vec_oprnds2.quick_push
6638 (vect_get_vec_def_for_operand (ops[2], stmt_info));
6639 }
6640 }
6641 else
6642 {
6643 if (!slp_node)
6644 {
6645 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
6646
6647 if (single_defuse_cycle && reduc_index == 0)
6648 vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
6649 else
6650 vec_oprnds0[0]
6651 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
6652 vec_oprnds0[0]);
6653 if (single_defuse_cycle && reduc_index == 1)
6654 vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
6655 else
6656 vec_oprnds1[0]
6657 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
6658 vec_oprnds1[0]);
6659 if (op_type == ternary_op)
6660 {
6661 if (single_defuse_cycle && reduc_index == 2)
6662 vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
6663 else
6664 vec_oprnds2[0]
6665 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
6666 vec_oprnds2[0]);
6667 }
6668 }
6669 }
6670
6671 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6672 {
6673 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
6674 if (masked_loop_p && !mask_by_cond_expr)
6675 {
6676 /* Make sure that the reduction accumulator is vop[0]. */
6677 if (reduc_index == 1)
6678 {
6679 gcc_assert (commutative_tree_code (code));
6680 std::swap (vop[0], vop[1]);
6681 }
6682 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
6683 vectype_in, i * ncopies + j);
6684 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
6685 vop[0], vop[1],
6686 vop[0]);
6687 new_temp = make_ssa_name (vec_dest, call);
6688 gimple_call_set_lhs (call, new_temp);
6689 gimple_call_set_nothrow (call, true);
6690 new_stmt_info
6691 = vect_finish_stmt_generation (stmt_info, call, gsi);
6692 }
6693 else
6694 {
6695 if (op_type == ternary_op)
6696 vop[2] = vec_oprnds2[i];
6697
6698 if (masked_loop_p && mask_by_cond_expr)
6699 {
6700 tree mask = vect_get_loop_mask (gsi, masks,
6701 vec_num * ncopies,
6702 vectype_in, i * ncopies + j);
6703 build_vect_cond_expr (code, vop, mask, gsi);
6704 }
6705
6706 gassign *new_stmt = gimple_build_assign (vec_dest, code,
6707 vop[0], vop[1], vop[2]);
6708 new_temp = make_ssa_name (vec_dest, new_stmt);
6709 gimple_assign_set_lhs (new_stmt, new_temp);
6710 new_stmt_info
6711 = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
6712 }
6713
6714 if (slp_node)
6715 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
6716 }
6717
6718 if (slp_node || single_defuse_cycle)
6719 continue;
6720
6721 if (j == 0)
6722 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
6723 else
6724 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
6725
6726 prev_stmt_info = new_stmt_info;
6727 }
6728
6729 if (single_defuse_cycle && !slp_node)
6730 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
6731
6732 return true;
6733 }
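
/* Illustrative sketch of the masked code generated above (assumed names,
   shown per lane rather than as vectors): when the loop is fully masked
   and no COND_EXPR-based masking is used, the conditional internal
   function call

     new_acc = IFN_COND_<OP> (loop_mask, acc, x, acc);

   leaves inactive lanes unchanged, element-wise equivalent to

     int cond_op_lane (int active, int acc, int x)
     {
       return active ? acc + x : acc;  // '+' standing in for <OP>.
     }

   Because the "else" value is the accumulator, the accumulator must be
   operand vop[0], which is why the operands are swapped when
   reduc_index == 1 (only valid for commutative codes).  */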
6734
6735 /* Transform phase of a cycle PHI. */
6736
6737 bool
6738 vect_transform_cycle_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
6739 slp_tree slp_node, slp_instance slp_node_instance)
6740 {
6741 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6742 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6743 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6744 int i;
6745 int ncopies;
6746 stmt_vec_info prev_phi_info;
6747 int j;
6748 bool nested_cycle = false;
6749 int vec_num;
6750
6751 if (nested_in_vect_loop_p (loop, stmt_info))
6752 {
6753 loop = loop->inner;
6754 nested_cycle = true;
6755 }
6756
6757 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
6758 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
6759 stmt_vec_info reduc_info = info_for_reduction (stmt_info);
6760 gcc_assert (reduc_info->is_reduc_info);
6761
6762 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
6763 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
6764 /* Leave the scalar phi in place. */
6765 return true;
6766
6767 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
6768 /* For a nested cycle we do not fill the above. */
6769 if (!vectype_in)
6770 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
6771 gcc_assert (vectype_in);
6772
6773 if (slp_node)
6774 {
6775 /* The size vect_schedule_slp_instance computes is off for us. */
6776 vec_num = vect_get_num_vectors
6777 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6778 * SLP_TREE_SCALAR_STMTS (slp_node).length (), vectype_in);
6779 ncopies = 1;
6780 }
6781 else
6782 {
6783 vec_num = 1;
6784 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6785 }
6786
6787 /* Check whether we should use a single PHI node and accumulate
6788 vectors to one before the backedge. */
6789 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
6790 ncopies = 1;
6791
6792 /* Create the destination vector */
6793 gphi *phi = as_a <gphi *> (stmt_info->stmt);
6794 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
6795 vectype_out);
6796
6797 /* Get the loop-entry arguments. */
6798 tree vec_initial_def;
6799 auto_vec<tree> vec_initial_defs;
6800 if (slp_node)
6801 {
6802 vec_initial_defs.reserve (vec_num);
6803 gcc_assert (slp_node == slp_node_instance->reduc_phis);
6804 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
6805 tree neutral_op
6806 = neutral_op_for_slp_reduction (slp_node,
6807 STMT_VINFO_REDUC_CODE (reduc_info),
6808 first != NULL);
6809 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
6810 &vec_initial_defs, vec_num,
6811 first != NULL, neutral_op);
6812 }
6813 else
6814 {
6815 /* Get at the scalar def before the loop, that defines the initial
6816 value of the reduction variable. */
6817 tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi,
6818 loop_preheader_edge (loop));
6819 /* Optimize: if initial_def is a constant smaller than induc_val for a
6820 MAX reduction (and we can't use zero for induc_val), use initial_def
6821 instead; similarly for a MIN reduction and initial_def larger. */
6822 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6823 {
6824 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
6825 if (TREE_CODE (initial_def) == INTEGER_CST
6826 && !integer_zerop (induc_val)
6827 && (((STMT_VINFO_VEC_COND_REDUC_CODE (reduc_info) == MAX_EXPR)
6828 && tree_int_cst_lt (initial_def, induc_val))
6829 || ((STMT_VINFO_VEC_COND_REDUC_CODE (reduc_info) == MIN_EXPR)
6830 && tree_int_cst_lt (induc_val, initial_def))))
6831 {
6832 induc_val = initial_def;
6833 /* Communicate to the epilogue generation that we used
6834 the initial_def. */
6835 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
6836 }
6837 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
6838 }
6839 else if (nested_cycle)
6840 {
6841 /* Do not use an adjustment def as that case is not supported
6842 correctly if ncopies is not one. */
6843 vec_initial_def = vect_get_vec_def_for_operand (initial_def,
6844 reduc_stmt_info);
6845 }
6846 else
6847 {
6848 tree adjustment_def = NULL_TREE;
6849 tree *adjustment_defp = &adjustment_def;
6850 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
6851 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6852 adjustment_defp = NULL;
6853 vec_initial_def
6854 = get_initial_def_for_reduction (reduc_stmt_info, code,
6855 initial_def, adjustment_defp);
6856 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def;
6857 }
6858 vec_initial_defs.create (1);
6859 vec_initial_defs.quick_push (vec_initial_def);
6860 }
6861
6862 /* Generate the reduction PHIs upfront. */
6863 prev_phi_info = NULL;
6864 for (i = 0; i < vec_num; i++)
6865 {
6866 tree vec_init_def = vec_initial_defs[i];
6867 for (j = 0; j < ncopies; j++)
6868 {
6869 /* Create the reduction-phi that defines the reduction
6870 operand. */
6871 gphi *new_phi = create_phi_node (vec_dest, loop->header);
6872 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6873
6874 /* Set the loop-entry arg of the reduction-phi. */
6875 if (j != 0 && nested_cycle)
6876 vec_init_def = vect_get_vec_def_for_stmt_copy (loop_vinfo,
6877 vec_init_def);
6878 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
6879 UNKNOWN_LOCATION);
6880
6881 /* The loop-latch arg is set in epilogue processing. */
6882
6883 if (slp_node)
6884 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
6885 else
6886 {
6887 if (j == 0)
6888 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi_info;
6889 else
6890 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
6891 prev_phi_info = new_phi_info;
6892 }
6893 }
6894 }
6895
6896 return true;
6897 }
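
/* Illustrative example (assumed names) of the loop-entry value built
   above for a non-SLP sum reduction with VF = 4:

     scalar:  sum = init;  loop: sum += a[i];
     vector:  vec_sum = PHI <{ init, 0, 0, 0 } (preheader), ... (latch)>

   get_initial_def_for_reduction either folds the initial value into one
   lane and fills the rest with the neutral value of the reduction code
   (0 for PLUS_EXPR), or seeds all lanes with the neutral value and
   records the initial value as an epilogue adjustment to be applied
   after the lanes have been reduced.  */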
6898
6899 /* Vectorizes LC PHIs. */
6900
6901 bool
6902 vectorizable_lc_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
6903 slp_tree slp_node)
6904 {
6905 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6906 if (!loop_vinfo
6907 || !is_a <gphi *> (stmt_info->stmt)
6908 || gimple_phi_num_args (stmt_info->stmt) != 1)
6909 return false;
6910
6911 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
6912 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
6913 return false;
6914
6915 if (!vec_stmt) /* transformation not required. */
6916 {
6917 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
6918 return true;
6919 }
6920
6921 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6922 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
6923 basic_block bb = gimple_bb (stmt_info->stmt);
6924 edge e = single_pred_edge (bb);
6925 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
6926 vec<tree> vec_oprnds = vNULL;
6927 vect_get_vec_defs (gimple_phi_arg_def (stmt_info->stmt, 0), NULL_TREE,
6928 stmt_info, &vec_oprnds, NULL, slp_node);
6929 if (slp_node)
6930 {
6931 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6932 gcc_assert (vec_oprnds.length () == vec_num);
6933 for (unsigned i = 0; i < vec_num; i++)
6934 {
6935 /* Create the vectorized LC PHI node. */
6936 gphi *new_phi = create_phi_node (vec_dest, bb);
6937 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
6938 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6939 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
6940 }
6941 }
6942 else
6943 {
6944 unsigned ncopies = vect_get_num_copies (loop_vinfo, vectype);
6945 stmt_vec_info prev_phi_info = NULL;
6946 for (unsigned i = 0; i < ncopies; i++)
6947 {
6948 if (i != 0)
6949 vect_get_vec_defs_for_stmt_copy (loop_vinfo, &vec_oprnds, NULL);
6950 /* Create the vectorized LC PHI node. */
6951 gphi *new_phi = create_phi_node (vec_dest, bb);
6952 add_phi_arg (new_phi, vec_oprnds[0], e, UNKNOWN_LOCATION);
6953 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6954 if (i == 0)
6955 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi_info;
6956 else
6957 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
6958 prev_phi_info = new_phi_info;
6959 }
6960 }
6961 vec_oprnds.release ();
6962
6963 return true;
6964 }
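
/* Illustrative example: in loop-closed SSA an LC PHI has exactly one
   argument, e.g.

     bb (single predecessor edge E):  x_1 = PHI <x_2 (E)>

   and the code above mirrors it with one vector PHI per vector copy
   (or per SLP vector):

     bb (single predecessor edge E):  vx_1 = PHI <vx_2 (E)>  */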
6965
6966
6967 /* Function vect_min_worthwhile_factor.
6968
6969 For a loop where we could vectorize the operation indicated by CODE,
6970 return the minimum vectorization factor that makes it worthwhile
6971 to use generic vectors. */
6972 static unsigned int
6973 vect_min_worthwhile_factor (enum tree_code code)
6974 {
6975 switch (code)
6976 {
6977 case PLUS_EXPR:
6978 case MINUS_EXPR:
6979 case NEGATE_EXPR:
6980 return 4;
6981
6982 case BIT_AND_EXPR:
6983 case BIT_IOR_EXPR:
6984 case BIT_XOR_EXPR:
6985 case BIT_NOT_EXPR:
6986 return 2;
6987
6988 default:
6989 return INT_MAX;
6990 }
6991 }
6992
6993 /* Return true if VINFO indicates we are doing loop vectorization and if
6994 it is worth decomposing CODE operations into scalar operations for
6995 that loop's vectorization factor. */
6996
6997 bool
6998 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
6999 {
7000 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7001 unsigned HOST_WIDE_INT value;
7002 return (loop_vinfo
7003 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7004 && value >= vect_min_worthwhile_factor (code));
7005 }
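
/* Worked example (illustrative): vect_min_worthwhile_factor (PLUS_EXPR)
   is 4 and vect_min_worthwhile_factor (BIT_AND_EXPR) is 2, so decomposing
   the operation into generic word-mode vector operations is considered
   worthwhile for

     VF = 2: BIT_AND_EXPR only (2 >= 2, but 2 < 4)
     VF = 4: both PLUS_EXPR and BIT_AND_EXPR

   vect_worthwhile_without_simd_p additionally requires that we are doing
   loop vectorization and that the vectorization factor is a compile-time
   constant.  */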
7006
7007 /* Function vectorizable_induction
7008
7009 Check if STMT_INFO performs an induction computation that can be vectorized.
7010 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7011 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7012 Return true if STMT_INFO is vectorizable in this way. */
7013
7014 bool
7015 vectorizable_induction (stmt_vec_info stmt_info,
7016 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7017 stmt_vec_info *vec_stmt, slp_tree slp_node,
7018 stmt_vector_for_cost *cost_vec)
7019 {
7020 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7021 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7022 unsigned ncopies;
7023 bool nested_in_vect_loop = false;
7024 class loop *iv_loop;
7025 tree vec_def;
7026 edge pe = loop_preheader_edge (loop);
7027 basic_block new_bb;
7028 tree new_vec, vec_init, vec_step, t;
7029 tree new_name;
7030 gimple *new_stmt;
7031 gphi *induction_phi;
7032 tree induc_def, vec_dest;
7033 tree init_expr, step_expr;
7034 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7035 unsigned i;
7036 tree expr;
7037 gimple_seq stmts;
7038 imm_use_iterator imm_iter;
7039 use_operand_p use_p;
7040 gimple *exit_phi;
7041 edge latch_e;
7042 tree loop_arg;
7043 gimple_stmt_iterator si;
7044
7045 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7046 if (!phi)
7047 return false;
7048
7049 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7050 return false;
7051
7052 /* Make sure it was recognized as induction computation. */
7053 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7054 return false;
7055
7056 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7057 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7058
7059 if (slp_node)
7060 ncopies = 1;
7061 else
7062 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7063 gcc_assert (ncopies >= 1);
7064
7065 /* FORNOW. These restrictions should be relaxed. */
7066 if (nested_in_vect_loop_p (loop, stmt_info))
7067 {
7068 imm_use_iterator imm_iter;
7069 use_operand_p use_p;
7070 gimple *exit_phi;
7071 edge latch_e;
7072 tree loop_arg;
7073
7074 if (ncopies > 1)
7075 {
7076 if (dump_enabled_p ())
7077 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7078 "multiple types in nested loop.\n");
7079 return false;
7080 }
7081
7082 /* FORNOW: outer loop induction with SLP not supported. */
7083 if (STMT_SLP_TYPE (stmt_info))
7084 return false;
7085
7086 exit_phi = NULL;
7087 latch_e = loop_latch_edge (loop->inner);
7088 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7089 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7090 {
7091 gimple *use_stmt = USE_STMT (use_p);
7092 if (is_gimple_debug (use_stmt))
7093 continue;
7094
7095 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7096 {
7097 exit_phi = use_stmt;
7098 break;
7099 }
7100 }
7101 if (exit_phi)
7102 {
7103 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7104 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7105 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7106 {
7107 if (dump_enabled_p ())
7108 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7109 "inner-loop induction only used outside "
7110 "of the outer vectorized loop.\n");
7111 return false;
7112 }
7113 }
7114
7115 nested_in_vect_loop = true;
7116 iv_loop = loop->inner;
7117 }
7118 else
7119 iv_loop = loop;
7120 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7121
7122 if (slp_node && !nunits.is_constant ())
7123 {
7124 /* The current SLP code creates the initial value element-by-element. */
7125 if (dump_enabled_p ())
7126 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7127 "SLP induction not supported for variable-length"
7128 " vectors.\n");
7129 return false;
7130 }
7131
7132 if (!vec_stmt) /* transformation not required. */
7133 {
7134 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7135 DUMP_VECT_SCOPE ("vectorizable_induction");
7136 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7137 return true;
7138 }
7139
7140 /* Transform. */
7141
7142 /* Compute a vector variable, initialized with the first VF values of
7143 the induction variable. E.g., for an iv with IV_PHI='X' and
7144 evolution S, for a vector of 4 units, we want to compute:
7145 [X, X + S, X + 2*S, X + 3*S]. */
7146
7147 if (dump_enabled_p ())
7148 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7149
7150 latch_e = loop_latch_edge (iv_loop);
7151 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7152
7153 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7154 gcc_assert (step_expr != NULL_TREE);
7155 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
7156
7157 pe = loop_preheader_edge (iv_loop);
7158 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7159 loop_preheader_edge (iv_loop));
7160
7161 stmts = NULL;
7162 if (!nested_in_vect_loop)
7163 {
7164 /* Convert the initial value to the IV update type. */
7165 tree new_type = TREE_TYPE (step_expr);
7166 init_expr = gimple_convert (&stmts, new_type, init_expr);
7167
7168 /* If we are using the loop mask to "peel" for alignment then we need
7169 to adjust the start value here. */
7170 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7171 if (skip_niters != NULL_TREE)
7172 {
7173 if (FLOAT_TYPE_P (vectype))
7174 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7175 skip_niters);
7176 else
7177 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7178 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7179 skip_niters, step_expr);
7180 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7181 init_expr, skip_step);
7182 }
7183 }
7184
7185 if (stmts)
7186 {
7187 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7188 gcc_assert (!new_bb);
7189 }
7190
7191 /* Find the first insertion point in the BB. */
7192 basic_block bb = gimple_bb (phi);
7193 si = gsi_after_labels (bb);
7194
7195 /* For SLP induction we have to generate several IVs as for example
7196 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7197 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7198 [VF*S, VF*S, VF*S, VF*S] for all. */
7199 if (slp_node)
7200 {
7201 /* Enforced above. */
7202 unsigned int const_nunits = nunits.to_constant ();
7203
7204 /* Generate [VF*S, VF*S, ... ]. */
7205 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7206 {
7207 expr = build_int_cst (integer_type_node, vf);
7208 expr = fold_convert (TREE_TYPE (step_expr), expr);
7209 }
7210 else
7211 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7212 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7213 expr, step_expr);
7214 if (! CONSTANT_CLASS_P (new_name))
7215 new_name = vect_init_vector (stmt_info, new_name,
7216 TREE_TYPE (step_expr), NULL);
7217 new_vec = build_vector_from_val (step_vectype, new_name);
7218 vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7219
7220 /* Now generate the IVs. */
7221 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7222 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7223 unsigned elts = const_nunits * nvects;
7224 unsigned nivs = least_common_multiple (group_size,
7225 const_nunits) / const_nunits;
7226 gcc_assert (elts % group_size == 0);
7227 tree elt = init_expr;
7228 unsigned ivn;
7229 for (ivn = 0; ivn < nivs; ++ivn)
7230 {
7231 tree_vector_builder elts (step_vectype, const_nunits, 1);
7232 stmts = NULL;
7233 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7234 {
7235 if (ivn*const_nunits + eltn >= group_size
7236 && (ivn * const_nunits + eltn) % group_size == 0)
7237 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7238 elt, step_expr);
7239 elts.quick_push (elt);
7240 }
7241 vec_init = gimple_build_vector (&stmts, &elts);
7242 vec_init = gimple_convert (&stmts, vectype, vec_init);
7243 if (stmts)
7244 {
7245 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7246 gcc_assert (!new_bb);
7247 }
7248
7249 /* Create the induction-phi that defines the induction-operand. */
7250 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7251 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7252 stmt_vec_info induction_phi_info
7253 = loop_vinfo->add_stmt (induction_phi);
7254 induc_def = PHI_RESULT (induction_phi);
7255
7256 /* Create the iv update inside the loop */
7257 gimple_seq stmts = NULL;
7258 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7259 vec_def = gimple_build (&stmts,
7260 PLUS_EXPR, step_vectype, vec_def, vec_step);
7261 vec_def = gimple_convert (&stmts, vectype, vec_def);
7262 loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (vec_def));
7263 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7264
7265 /* Set the arguments of the phi node: */
7266 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7267 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7268 UNKNOWN_LOCATION);
7269
7270 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7271 }
7272
7273 /* Re-use IVs when we can. */
7274 if (ivn < nvects)
7275 {
7276 unsigned vfp
7277 = least_common_multiple (group_size, const_nunits) / group_size;
7278 /* Generate [VF'*S, VF'*S, ... ]. */
7279 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7280 {
7281 expr = build_int_cst (integer_type_node, vfp);
7282 expr = fold_convert (TREE_TYPE (step_expr), expr);
7283 }
7284 else
7285 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7286 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7287 expr, step_expr);
7288 if (! CONSTANT_CLASS_P (new_name))
7289 new_name = vect_init_vector (stmt_info, new_name,
7290 TREE_TYPE (step_expr), NULL);
7291 new_vec = build_vector_from_val (step_vectype, new_name);
7292 vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7293 for (; ivn < nvects; ++ivn)
7294 {
7295 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7296 tree def;
7297 if (gimple_code (iv) == GIMPLE_PHI)
7298 def = gimple_phi_result (iv);
7299 else
7300 def = gimple_assign_lhs (iv);
7301 gimple_seq stmts = NULL;
7302 def = gimple_convert (&stmts, step_vectype, def);
7303 def = gimple_build (&stmts,
7304 PLUS_EXPR, step_vectype, def, vec_step);
7305 def = gimple_convert (&stmts, vectype, def);
7306 if (gimple_code (iv) == GIMPLE_PHI)
7307 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7308 else
7309 {
7310 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7311 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
7312 }
7313 SLP_TREE_VEC_STMTS (slp_node).quick_push
7314 (loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (def)));
7315 }
7316 }
7317
7318 return true;
7319 }
7320
7321 /* Create the vector that holds the initial_value of the induction. */
7322 if (nested_in_vect_loop)
7323 {
7324 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7325 been created during vectorization of previous stmts. We obtain it
7326 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7327 vec_init = vect_get_vec_def_for_operand (init_expr, stmt_info);
7328 /* If the initial value is not of proper type, convert it. */
7329 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7330 {
7331 new_stmt
7332 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7333 vect_simple_var,
7334 "vec_iv_"),
7335 VIEW_CONVERT_EXPR,
7336 build1 (VIEW_CONVERT_EXPR, vectype,
7337 vec_init));
7338 vec_init = gimple_assign_lhs (new_stmt);
7339 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7340 new_stmt);
7341 gcc_assert (!new_bb);
7342 loop_vinfo->add_stmt (new_stmt);
7343 }
7344 }
7345 else
7346 {
7347 /* iv_loop is the loop to be vectorized. Create:
7348 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7349 stmts = NULL;
7350 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
7351
7352 unsigned HOST_WIDE_INT const_nunits;
7353 if (nunits.is_constant (&const_nunits))
7354 {
7355 tree_vector_builder elts (step_vectype, const_nunits, 1);
7356 elts.quick_push (new_name);
7357 for (i = 1; i < const_nunits; i++)
7358 {
7359 /* Create: new_name_i = new_name + step_expr */
7360 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7361 new_name, step_expr);
7362 elts.quick_push (new_name);
7363 }
7364 /* Create a vector from [new_name_0, new_name_1, ...,
7365 new_name_nunits-1] */
7366 vec_init = gimple_build_vector (&stmts, &elts);
7367 }
7368 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7369 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7370 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
7371 new_name, step_expr);
7372 else
7373 {
7374 /* Build:
7375 [base, base, base, ...]
7376 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7377 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7378 gcc_assert (flag_associative_math);
7379 tree index = build_index_vector (step_vectype, 0, 1);
7380 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7381 new_name);
7382 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7383 step_expr);
7384 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
7385 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
7386 vec_init, step_vec);
7387 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
7388 vec_init, base_vec);
7389 }
7390 vec_init = gimple_convert (&stmts, vectype, vec_init);
7391
7392 if (stmts)
7393 {
7394 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7395 gcc_assert (!new_bb);
7396 }
7397 }
7398
7399
7400 /* Create the vector that holds the step of the induction. */
7401 if (nested_in_vect_loop)
7402 /* iv_loop is nested in the loop to be vectorized. Generate:
7403 vec_step = [S, S, S, S] */
7404 new_name = step_expr;
7405 else
7406 {
7407 /* iv_loop is the loop to be vectorized. Generate:
7408 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7409 gimple_seq seq = NULL;
7410 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7411 {
7412 expr = build_int_cst (integer_type_node, vf);
7413 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7414 }
7415 else
7416 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7417 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7418 expr, step_expr);
7419 if (seq)
7420 {
7421 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7422 gcc_assert (!new_bb);
7423 }
7424 }
7425
7426 t = unshare_expr (new_name);
7427 gcc_assert (CONSTANT_CLASS_P (new_name)
7428 || TREE_CODE (new_name) == SSA_NAME);
7429 new_vec = build_vector_from_val (step_vectype, t);
7430 vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7431
7432
7433 /* Create the following def-use cycle:
7434 loop prolog:
7435 vec_init = ...
7436 vec_step = ...
7437 loop:
7438 vec_iv = PHI <vec_init, vec_loop>
7439 ...
7440 STMT
7441 ...
7442 vec_loop = vec_iv + vec_step; */
7443
7444 /* Create the induction-phi that defines the induction-operand. */
7445 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7446 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7447 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7448 induc_def = PHI_RESULT (induction_phi);
7449
7450 /* Create the iv update inside the loop */
7451 stmts = NULL;
7452 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7453 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
7454 vec_def = gimple_convert (&stmts, vectype, vec_def);
7455 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7456 new_stmt = SSA_NAME_DEF_STMT (vec_def);
7457 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7458
7459 /* Set the arguments of the phi node: */
7460 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7461 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7462 UNKNOWN_LOCATION);
7463
7464 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7465
7466 /* In case the vectorization factor (VF) is bigger than the number
7467 of elements that we can fit in a vectype (nunits), we have to generate
7468 more than one vector stmt - i.e., we need to "unroll" the
7469 vector stmt by a factor of VF/nunits. For more details see the
7470 documentation in vectorizable_operation. */
7471
7472 if (ncopies > 1)
7473 {
7474 gimple_seq seq = NULL;
7475 stmt_vec_info prev_stmt_vinfo;
7476 /* FORNOW. This restriction should be relaxed. */
7477 gcc_assert (!nested_in_vect_loop);
7478
7479 /* Create the vector that holds the step of the induction. */
7480 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7481 {
7482 expr = build_int_cst (integer_type_node, nunits);
7483 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7484 }
7485 else
7486 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7487 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7488 expr, step_expr);
7489 if (seq)
7490 {
7491 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7492 gcc_assert (!new_bb);
7493 }
7494
7495 t = unshare_expr (new_name);
7496 gcc_assert (CONSTANT_CLASS_P (new_name)
7497 || TREE_CODE (new_name) == SSA_NAME);
7498 new_vec = build_vector_from_val (step_vectype, t);
7499 vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7500
7501 vec_def = induc_def;
7502 prev_stmt_vinfo = induction_phi_info;
7503 for (i = 1; i < ncopies; i++)
7504 {
7505 /* vec_i = vec_prev + vec_step */
7506 gimple_seq stmts = NULL;
7507 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
7508 vec_def = gimple_build (&stmts,
7509 PLUS_EXPR, step_vectype, vec_def, vec_step);
7510 vec_def = gimple_convert (&stmts, vectype, vec_def);
7511
7512 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7513 new_stmt = SSA_NAME_DEF_STMT (vec_def);
7514 new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7515 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7516 prev_stmt_vinfo = new_stmt_info;
7517 }
7518 }
7519
7520 if (nested_in_vect_loop)
7521 {
7522 /* Find the loop-closed exit-phi of the induction, and record
7523 the final vector of induction results: */
7524 exit_phi = NULL;
7525 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7526 {
7527 gimple *use_stmt = USE_STMT (use_p);
7528 if (is_gimple_debug (use_stmt))
7529 continue;
7530
7531 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7532 {
7533 exit_phi = use_stmt;
7534 break;
7535 }
7536 }
7537 if (exit_phi)
7538 {
7539 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7540 /* FORNOW. Currently not supporting the case that an inner-loop induction
7541 is not used in the outer-loop (i.e. only outside the outer-loop). */
7542 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7543 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7544
7545 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7546 if (dump_enabled_p ())
7547 dump_printf_loc (MSG_NOTE, vect_location,
7548 "vector of inductions after inner-loop:%G",
7549 new_stmt);
7550 }
7551 }
7552
7553
7554 if (dump_enabled_p ())
7555 dump_printf_loc (MSG_NOTE, vect_location,
7556 "transform induction: created def-use cycle: %G%G",
7557 induction_phi, SSA_NAME_DEF_STMT (vec_def));
7558
7559 return true;
7560 }
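
/* Illustrative trace (assumed values) of the induction code generated
   above: for an IV with initial value 0, step 1 and VF = 4 the vector IV

     vec_iv   = PHI <{ 0, 1, 2, 3 } (preheader), vec_next (latch)>
     vec_next = vec_iv + { 4, 4, 4, 4 };

   takes the values { 0, 1, 2, 3 }, { 4, 5, 6, 7 }, { 8, 9, 10, 11 }, ...
   across the vector iterations, i.e. the first VF values of the scalar
   IV per iteration.  When ncopies > 1 the additional copies are each
   offset from the previous one by the step times nunits.  */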
7561
7562 /* Function vectorizable_live_operation.
7563
7564 STMT_INFO computes a value that is used outside the loop. Check if
7565 it can be supported. */
7566
7567 bool
7568 vectorizable_live_operation (stmt_vec_info stmt_info,
7569 gimple_stmt_iterator *gsi,
7570 slp_tree slp_node, slp_instance slp_node_instance,
7571 int slp_index, bool vec_stmt_p,
7572 stmt_vector_for_cost *)
7573 {
7574 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7575 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7576 imm_use_iterator imm_iter;
7577 tree lhs, lhs_type, bitsize, vec_bitsize;
7578 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7579 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7580 int ncopies;
7581 gimple *use_stmt;
7582 auto_vec<tree> vec_oprnds;
7583 int vec_entry = 0;
7584 poly_uint64 vec_index = 0;
7585
7586 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7587
7588 /* The last stmt of a reduction is live and vectorized via
7589 vect_create_epilog_for_reduction. vectorizable_reduction assessed
7590 validity so just trigger the transform here. */
7591 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
7592 {
7593 if (!vec_stmt_p)
7594 return true;
7595 if (slp_node)
7596 {
7597 /* For reduction chains the meta-info is attached to
7598 the group leader. */
7599 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7600 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7601 /* For SLP reductions we vectorize the epilogue for
7602 all involved stmts together. */
7603 else if (slp_index != 0)
7604 return true;
7605 }
7606 stmt_vec_info reduc_info = info_for_reduction (stmt_info);
7607 gcc_assert (reduc_info->is_reduc_info);
7608 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
7609 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
7610 return true;
7611 vect_create_epilog_for_reduction (stmt_info, slp_node,
7612 slp_node_instance);
7613 return true;
7614 }
7615
7616 /* FORNOW. CHECKME. */
7617 if (nested_in_vect_loop_p (loop, stmt_info))
7618 return false;
7619
7620 /* If STMT is not relevant and it is a simple assignment and its inputs are
7621 invariant then it can remain in place, unvectorized. The original last
7622 scalar value that it computes will be used. */
7623 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7624 {
7625 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
7626 if (dump_enabled_p ())
7627 dump_printf_loc (MSG_NOTE, vect_location,
7628 "statement is simple and uses invariant. Leaving in "
7629 "place.\n");
7630 return true;
7631 }
7632
7633 if (slp_node)
7634 ncopies = 1;
7635 else
7636 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7637
7638 if (slp_node)
7639 {
7640 gcc_assert (slp_index >= 0);
7641
7642 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7643 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7644
7645 /* Get the last occurrence of the scalar index from the concatenation of
7646 all the slp vectors. Calculate which slp vector it is and the index
7647 within. */
7648 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
7649
7650 /* Calculate which vector contains the result, and which lane of
7651 that vector we need. */
7652 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7653 {
7654 if (dump_enabled_p ())
7655 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7656 "Cannot determine which vector holds the"
7657 " final result.\n");
7658 return false;
7659 }
7660 }
7661
7662 if (!vec_stmt_p)
7663 {
7664 /* No transformation required. */
7665 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7666 {
7667 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7668 OPTIMIZE_FOR_SPEED))
7669 {
7670 if (dump_enabled_p ())
7671 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7672 "can't use a fully-masked loop because "
7673 "the target doesn't support extract last "
7674 "reduction.\n");
7675 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7676 }
7677 else if (slp_node)
7678 {
7679 if (dump_enabled_p ())
7680 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7681 "can't use a fully-masked loop because an "
7682 "SLP statement is live after the loop.\n");
7683 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7684 }
7685 else if (ncopies > 1)
7686 {
7687 if (dump_enabled_p ())
7688 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7689 "can't use a fully-masked loop because"
7690 " ncopies is greater than 1.\n");
7691 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7692 }
7693 else
7694 {
7695 gcc_assert (ncopies == 1 && !slp_node);
7696 vect_record_loop_mask (loop_vinfo,
7697 &LOOP_VINFO_MASKS (loop_vinfo),
7698 1, vectype);
7699 }
7700 }
7701 return true;
7702 }
7703
7704 /* Use the lhs of the original scalar statement. */
7705 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
7706
7707 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7708 : gimple_get_lhs (stmt);
7709 lhs_type = TREE_TYPE (lhs);
7710
7711 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7712 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7713 : TYPE_SIZE (TREE_TYPE (vectype)));
7714 vec_bitsize = TYPE_SIZE (vectype);
7715
7716 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
7717 tree vec_lhs, bitstart;
7718 if (slp_node)
7719 {
7720 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7721
7722 /* Get the correct slp vectorized stmt. */
7723 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
7724 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
7725 vec_lhs = gimple_phi_result (phi);
7726 else
7727 vec_lhs = gimple_get_lhs (vec_stmt);
7728
7729 /* Get entry to use. */
7730 bitstart = bitsize_int (vec_index);
7731 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7732 }
7733 else
7734 {
7735 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7736 vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
7737 gcc_checking_assert (ncopies == 1
7738 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7739
7740 /* For multiple copies, get the last copy. */
7741 for (int i = 1; i < ncopies; ++i)
7742 vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
7743
7744 /* Get the last lane in the vector. */
7745 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7746 }
7747
7748 gimple_seq stmts = NULL;
7749 tree new_tree;
7750 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7751 {
7752 /* Emit:
7753
7754 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
7755
7756 where VEC_LHS is the vectorized live-out result and MASK is
7757 the loop mask for the final iteration. */
7758 gcc_assert (ncopies == 1 && !slp_node);
7759 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
7760 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
7761 1, vectype, 0);
7762 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
7763 scalar_type, mask, vec_lhs);
7764
7765 /* Convert the extracted vector element to the required scalar type. */
7766 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
7767 }
7768 else
7769 {
7770 tree bftype = TREE_TYPE (vectype);
7771 if (VECTOR_BOOLEAN_TYPE_P (vectype))
7772 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7773 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
7774 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
7775 &stmts, true, NULL_TREE);
7776 }
7777
7778 if (stmts)
7779 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
7780
7781 /* Replace the use of lhs with the newly computed result. If the use stmt
7782 is a single-argument PHI, just replace all uses of the PHI result; this is
7783 necessary because the lcssa PHI defining lhs may be before the new stmt. */
7784 use_operand_p use_p;
7785 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
7786 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
7787 && !is_gimple_debug (use_stmt))
7788 {
7789 if (gimple_code (use_stmt) == GIMPLE_PHI
7790 && gimple_phi_num_args (use_stmt) == 1)
7791 {
7792 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
7793 }
7794 else
7795 {
7796 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
7797 SET_USE (use_p, new_tree);
7798 }
7799 update_stmt (use_stmt);
7800 }
7801
7802 return true;
7803 }
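
/* Illustrative sketch (assumed names) of the code generated above for a
   value live after the loop: the scalar result is either the last lane
   of the last vector copy,

     res = BIT_FIELD_REF <vec_lhs, bitsize, vec_bitsize - bitsize>;

   or, for a fully-masked loop, the last active lane,

     res = .EXTRACT_LAST (loop_mask, vec_lhs);

   after which every use of the original scalar lhs outside the loop is
   redirected to res.  */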
7804
7805 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
7806
7807 static void
7808 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
7809 {
7810 ssa_op_iter op_iter;
7811 imm_use_iterator imm_iter;
7812 def_operand_p def_p;
7813 gimple *ustmt;
7814
7815 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
7816 {
7817 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
7818 {
7819 basic_block bb;
7820
7821 if (!is_gimple_debug (ustmt))
7822 continue;
7823
7824 bb = gimple_bb (ustmt);
7825
7826 if (!flow_bb_inside_loop_p (loop, bb))
7827 {
7828 if (gimple_debug_bind_p (ustmt))
7829 {
7830 if (dump_enabled_p ())
7831 dump_printf_loc (MSG_NOTE, vect_location,
7832 "killing debug use\n");
7833
7834 gimple_debug_bind_reset_value (ustmt);
7835 update_stmt (ustmt);
7836 }
7837 else
7838 gcc_unreachable ();
7839 }
7840 }
7841 }
7842 }
7843
7844 /* Given loop represented by LOOP_VINFO, return true if computation of
7845 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
7846 otherwise. */
7847
7848 static bool
7849 loop_niters_no_overflow (loop_vec_info loop_vinfo)
7850 {
7851 /* Constant case. */
7852 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7853 {
7854 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
7855 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
7856
7857 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
7858 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
7859 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
7860 return true;
7861 }
7862
7863 widest_int max;
7864 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7865 /* Check the upper bound of loop niters. */
7866 if (get_max_loop_iterations (loop, &max))
7867 {
7868 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
7869 signop sgn = TYPE_SIGN (type);
7870 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
7871 if (max < type_max)
7872 return true;
7873 }
7874 return false;
7875 }
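
/* Worked example (illustrative): if the loop's niter expression has an
   8-bit unsigned type and NITERSM1 is 255, then NITERS = 255 + 1 wraps
   to 0, so neither the constant check nor the upper-bound check succeeds
   and the function returns false; if instead the maximum iteration count
   is provably below the type's maximum value, NITERSM1 + 1 cannot wrap
   and the function returns true.  */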
7876
7877 /* Return a mask type with half the number of elements as TYPE. */
7878
7879 tree
7880 vect_halve_mask_nunits (tree type)
7881 {
7882 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
7883 return build_truth_vector_type (nunits, current_vector_size);
7884 }
7885
7886 /* Return a mask type with twice as many elements as TYPE. */
7887
7888 tree
7889 vect_double_mask_nunits (tree type)
7890 {
7891 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
7892 return build_truth_vector_type (nunits, current_vector_size);
7893 }
7894
7895 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
7896 contain a sequence of NVECTORS masks that each control a vector of type
7897 VECTYPE. */
7898
7899 void
7900 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
7901 unsigned int nvectors, tree vectype)
7902 {
7903 gcc_assert (nvectors != 0);
7904 if (masks->length () < nvectors)
7905 masks->safe_grow_cleared (nvectors);
7906 rgroup_masks *rgm = &(*masks)[nvectors - 1];
7907 /* The number of scalars per iteration and the number of vectors are
7908 both compile-time constants. */
7909 unsigned int nscalars_per_iter
7910 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
7911 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
7912 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
7913 {
7914 rgm->max_nscalars_per_iter = nscalars_per_iter;
7915 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
7916 }
7917 }
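
/* Worked example (illustrative numbers): with VF = 8, a request for
   nvectors = 2 masks of a 4-element vector type gives
   nscalars_per_iter = 2 * 4 / 8 = 1 and records the masks in rgroup
   number nvectors - 1 = 1; a later request for the same rgroup with a
   larger per-iteration footprint only bumps max_nscalars_per_iter (and
   the mask type) upwards.  */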
7918
7919 /* Given a complete set of masks MASKS, extract mask number INDEX
7920 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
7921 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
7922
7923 See the comment above vec_loop_masks for more details about the mask
7924 arrangement. */
7925
7926 tree
7927 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
7928 unsigned int nvectors, tree vectype, unsigned int index)
7929 {
7930 rgroup_masks *rgm = &(*masks)[nvectors - 1];
7931 tree mask_type = rgm->mask_type;
7932
7933 /* Populate the rgroup's mask array, if this is the first time we've
7934 used it. */
7935 if (rgm->masks.is_empty ())
7936 {
7937 rgm->masks.safe_grow_cleared (nvectors);
7938 for (unsigned int i = 0; i < nvectors; ++i)
7939 {
7940 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
7941 /* Provide a dummy definition until the real one is available. */
7942 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
7943 rgm->masks[i] = mask;
7944 }
7945 }
7946
7947 tree mask = rgm->masks[index];
7948 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
7949 TYPE_VECTOR_SUBPARTS (vectype)))
7950 {
7951 /* A loop mask for data type X can be reused for data type Y
7952 if X has N times more elements than Y and if Y's elements
7953 are N times bigger than X's. In this case each sequence
7954 of N elements in the loop mask will be all-zero or all-one.
7955 We can then view-convert the mask so that each sequence of
7956 N elements is replaced by a single element. */
7957 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
7958 TYPE_VECTOR_SUBPARTS (vectype)));
7959 gimple_seq seq = NULL;
7960 mask_type = build_same_sized_truth_vector_type (vectype);
7961 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
7962 if (seq)
7963 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
7964 }
7965 return mask;
7966 }
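
/* Illustrative example of the mask reuse above (assumed element counts):
   a loop mask recorded for 8 narrow elements per vector can control a
   vector of 4 elements twice as wide, because each consecutive pair of
   mask lanes is known to be all-zero or all-one; conceptually

     mask8 = { 1, 1, 1, 1, 1, 1, 0, 0 }
     mask4 = VIEW_CONVERT_EXPR <4-lane mask type> (mask8)
           = { 1, 1, 1, 0 }

   which is exactly the mask needed for the elements that are twice as
   wide.  */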
7967
7968 /* Scale profiling counters by estimation for LOOP which is vectorized
7969 by factor VF. */
7970
7971 static void
7972 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
7973 {
7974 edge preheader = loop_preheader_edge (loop);
7975 /* Reduce loop iterations by the vectorization factor. */
7976 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
7977 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
7978
7979 if (freq_h.nonzero_p ())
7980 {
7981 profile_probability p;
7982
7983 /* Avoid dropping loop body profile counter to 0 because of zero count
7984 in loop's preheader. */
7985 if (!(freq_e == profile_count::zero ()))
7986 freq_e = freq_e.force_nonzero ();
7987 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
7988 scale_loop_frequencies (loop, p);
7989 }
7990
7991 edge exit_e = single_exit (loop);
7992 exit_e->probability = profile_probability::always ()
7993 .apply_scale (1, new_est_niter + 1);
7994
7995 edge exit_l = single_pred_edge (loop->latch);
7996 profile_probability prob = exit_l->probability;
7997 exit_l->probability = exit_e->probability.invert ();
7998 if (prob.initialized_p () && exit_l->probability.initialized_p ())
7999 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8000 }
8001
8002 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8003 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8004 stmt_vec_info. */
8005
8006 static void
8007 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8008 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8009 {
8010 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8011 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8012
8013 if (dump_enabled_p ())
8014 dump_printf_loc (MSG_NOTE, vect_location,
8015 "------>vectorizing statement: %G", stmt_info->stmt);
8016
8017 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8018 vect_loop_kill_debug_uses (loop, stmt_info);
8019
8020 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8021 && !STMT_VINFO_LIVE_P (stmt_info))
8022 return;
8023
8024 if (STMT_VINFO_VECTYPE (stmt_info))
8025 {
8026 poly_uint64 nunits
8027 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8028 if (!STMT_SLP_TYPE (stmt_info)
8029 && maybe_ne (nunits, vf)
8030 && dump_enabled_p ())
8031 /* For SLP the VF is set according to the unrolling factor, and not
8032 to the vector size, hence for SLP this print is not valid. */
8033 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8034 }
8035
8036 /* Pure SLP statements have already been vectorized. We still need
8037 to apply loop vectorization to hybrid SLP statements. */
8038 if (PURE_SLP_STMT (stmt_info))
8039 return;
8040
8041 if (dump_enabled_p ())
8042 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8043
8044 if (vect_transform_stmt (stmt_info, gsi, NULL, NULL))
8045 *seen_store = stmt_info;
8046 }
8047
8048 /* Function vect_transform_loop.
8049
8050 The analysis phase has determined that the loop is vectorizable.
8051 Vectorize the loop - create vectorized stmts to replace the scalar
8052 stmts in the loop, and update the loop exit condition.
8053 Returns the scalar epilogue loop, if any. */
8054
8055 class loop *
8056 vect_transform_loop (loop_vec_info loop_vinfo)
8057 {
8058 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8059 class loop *epilogue = NULL;
8060 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8061 int nbbs = loop->num_nodes;
8062 int i;
8063 tree niters_vector = NULL_TREE;
8064 tree step_vector = NULL_TREE;
8065 tree niters_vector_mult_vf = NULL_TREE;
8066 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8067 unsigned int lowest_vf = constant_lower_bound (vf);
8068 gimple *stmt;
8069 bool check_profitability = false;
8070 unsigned int th;
8071
8072 DUMP_VECT_SCOPE ("vec_transform_loop");
8073
8074 loop_vinfo->shared->check_datarefs ();
8075
8076 /* Use the more conservative vectorization threshold. If the number
8077 of iterations is constant assume the cost check has been performed
8078 by our caller. If the threshold makes all loops profitable that
8079 run at least the (estimated) vectorization factor number of times
8080 checking is pointless, too. */
8081 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8082 if (th >= vect_vf_for_cost (loop_vinfo)
8083 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8084 {
8085 if (dump_enabled_p ())
8086 dump_printf_loc (MSG_NOTE, vect_location,
8087 "Profitability threshold is %d loop iterations.\n",
8088 th);
8089 check_profitability = true;
8090 }
8091
8092 /* Make sure there exists a single-predecessor exit bb. Do this before
8093 versioning. */
8094 edge e = single_exit (loop);
8095 if (! single_pred_p (e->dest))
8096 {
8097 split_loop_exit_edge (e, true);
8098 if (dump_enabled_p ())
8099 dump_printf (MSG_NOTE, "split exit edge\n");
8100 }
8101
8102 /* Version the loop first, if required, so the profitability check
8103 comes first. */
8104
8105 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8106 {
8107 class loop *sloop
8108 = vect_loop_versioning (loop_vinfo);
8109 sloop->force_vectorize = false;
8110 check_profitability = false;
8111 }
8112
8113 /* Make sure there exists a single-predecessor exit bb also on the
8114 scalar loop copy. Do this after versioning but before peeling
8115 so CFG structure is fine for both scalar and if-converted loop
8116 to make slpeel_duplicate_current_defs_from_edges face matched
8117 loop closed PHI nodes on the exit. */
8118 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8119 {
8120 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8121 if (! single_pred_p (e->dest))
8122 {
8123 split_loop_exit_edge (e, true);
8124 if (dump_enabled_p ())
8125 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8126 }
8127 }
8128
8129 tree niters = vect_build_loop_niters (loop_vinfo);
8130 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8131 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8132 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8133 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8134 &step_vector, &niters_vector_mult_vf, th,
8135 check_profitability, niters_no_overflow);
8136 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
8137 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
8138 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
8139 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
8140
8141 if (niters_vector == NULL_TREE)
8142 {
8143 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8144 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8145 && known_eq (lowest_vf, vf))
8146 {
8147 niters_vector
8148 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8149 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8150 step_vector = build_one_cst (TREE_TYPE (niters));
8151 }
8152 else
8153 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8154 &step_vector, niters_no_overflow);
8155 }
8156
8157 /* 1) Make sure the loop header has exactly two entries
8158 2) Make sure we have a preheader basic block. */
8159
8160 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8161
8162 split_edge (loop_preheader_edge (loop));
8163
8164 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8165 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8166 /* This will deal with any possible peeling. */
8167 vect_prepare_for_masked_peels (loop_vinfo);
8168
8169 /* Schedule the SLP instances first, then handle loop vectorization
8170 below. */
8171 if (!loop_vinfo->slp_instances.is_empty ())
8172 {
8173 DUMP_VECT_SCOPE ("scheduling SLP instances");
8174 vect_schedule_slp (loop_vinfo);
8175 }
8176
8177 /* FORNOW: the vectorizer supports only loops whose body consists
8178 of one basic block (header + empty latch). When the vectorizer
8179 supports more involved loop forms, the order in which the BBs are
8180 traversed will need to be reconsidered. */
8181
8182 for (i = 0; i < nbbs; i++)
8183 {
8184 basic_block bb = bbs[i];
8185 stmt_vec_info stmt_info;
8186
8187 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8188 gsi_next (&si))
8189 {
8190 gphi *phi = si.phi ();
8191 if (dump_enabled_p ())
8192 dump_printf_loc (MSG_NOTE, vect_location,
8193 "------>vectorizing phi: %G", phi);
8194 stmt_info = loop_vinfo->lookup_stmt (phi);
8195 if (!stmt_info)
8196 continue;
8197
8198 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8199 vect_loop_kill_debug_uses (loop, stmt_info);
8200
8201 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8202 && !STMT_VINFO_LIVE_P (stmt_info))
8203 continue;
8204
8205 if (STMT_VINFO_VECTYPE (stmt_info)
8206 && (maybe_ne
8207 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8208 && dump_enabled_p ())
8209 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8210
8211 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8212 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8213 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
8214 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
8215 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
8216 && ! PURE_SLP_STMT (stmt_info))
8217 {
8218 if (dump_enabled_p ())
8219 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8220 vect_transform_stmt (stmt_info, NULL, NULL, NULL);
8221 }
8222 }
8223
8224 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8225 !gsi_end_p (si);)
8226 {
8227 stmt = gsi_stmt (si);
8228 /* During vectorization remove existing clobber stmts. */
8229 if (gimple_clobber_p (stmt))
8230 {
8231 unlink_stmt_vdef (stmt);
8232 gsi_remove (&si, true);
8233 release_defs (stmt);
8234 }
8235 else
8236 {
8237 stmt_info = loop_vinfo->lookup_stmt (stmt);
8238
8239 /* vector stmts created in the outer-loop during vectorization of
8240 stmts in an inner-loop may not have a stmt_info, and do not
8241 need to be vectorized. */
8242 stmt_vec_info seen_store = NULL;
8243 if (stmt_info)
8244 {
8245 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8246 {
8247 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8248 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8249 !gsi_end_p (subsi); gsi_next (&subsi))
8250 {
8251 stmt_vec_info pat_stmt_info
8252 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
8253 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
8254 &si, &seen_store);
8255 }
8256 stmt_vec_info pat_stmt_info
8257 = STMT_VINFO_RELATED_STMT (stmt_info);
8258 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
8259 &seen_store);
8260 }
8261 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
8262 &seen_store);
8263 }
8264 gsi_next (&si);
8265 if (seen_store)
8266 {
8267 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8268 /* Interleaving. The vectorization of the
8269 interleaving chain has been completed -
8270 free all the stores in the chain. */
8271 vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8272 else
8273 /* Free the attached stmt_vec_info and remove the stmt. */
8274 loop_vinfo->remove_stmt (stmt_info);
8275 }
8276 }
8277 }
8278
8279 /* Stub out scalar statements that must not survive vectorization.
8280 Doing this here helps with grouped statements, or statements that
8281 are involved in patterns. */
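/* E.g. (illustrative names) a scalar masked load left in place by
   pattern or group handling,

       _5 = MASK_LOAD (ptr_6, 0B, mask_7);

   is replaced below by

       _5 = 0;

   so that no scalar copy of the masked access survives the
   transformation.  */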
8282 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8283 !gsi_end_p (gsi); gsi_next (&gsi))
8284 {
8285 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8286 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8287 {
8288 tree lhs = gimple_get_lhs (call);
8289 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8290 {
8291 tree zero = build_zero_cst (TREE_TYPE (lhs));
8292 gimple *new_stmt = gimple_build_assign (lhs, zero);
8293 gsi_replace (&gsi, new_stmt, true);
8294 }
8295 }
8296 }
8297 } /* BBs in loop */
8298
8299 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8300 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8301 if (integer_onep (step_vector))
8302 niters_no_overflow = true;
8303 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8304 niters_vector_mult_vf, !niters_no_overflow);
8305
8306 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8307 scale_profile_for_vect_loop (loop, assumed_vf);
8308
8309 /* True if the final iteration might not handle a full vector's
8310 worth of scalar iterations. */
8311 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8312 /* The minimum number of iterations performed by the epilogue. This
8313 is 1 when peeling for gaps because we always need a final scalar
8314 iteration. */
8315 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8316 /* +1 to convert latch counts to loop iteration counts,
8317 -min_epilogue_iters to remove iterations that cannot be performed
8318 by the vector code. */
8319 int bias_for_lowest = 1 - min_epilogue_iters;
8320 int bias_for_assumed = bias_for_lowest;
8321 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8322 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8323 {
8324 /* When the amount of peeling is known at compile time, the first
8325 iteration will have exactly alignment_npeels active elements.
8326 In the worst case it will have at least one. */
8327 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8328 bias_for_lowest += lowest_vf - min_first_active;
8329 bias_for_assumed += assumed_vf - min_first_active;
8330 }
8331 /* In these calculations the "- 1" converts loop iteration counts
8332 back to latch counts. */
8333 if (loop->any_upper_bound)
8334 loop->nb_iterations_upper_bound
8335 = (final_iter_may_be_partial
8336 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8337 lowest_vf) - 1
8338 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8339 lowest_vf) - 1);
8340 if (loop->any_likely_upper_bound)
8341 loop->nb_iterations_likely_upper_bound
8342 = (final_iter_may_be_partial
8343 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8344 + bias_for_lowest, lowest_vf) - 1
8345 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8346 + bias_for_lowest, lowest_vf) - 1);
8347 if (loop->any_estimate)
8348 loop->nb_iterations_estimate
8349 = (final_iter_may_be_partial
8350 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8351 assumed_vf) - 1
8352 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8353 assumed_vf) - 1);
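/* Worked example (illustrative values): for a not-fully-masked loop
   with constant VF = 4, no peeling for gaps and no peeling for
   alignment, bias_for_lowest is 1.  A scalar latch bound of 9 (at most
   10 iterations) then becomes udiv_floor (9 + 1, 4) - 1 = 1, i.e. at
   most two vector iterations, leaving at most two scalar iterations
   for the epilogue.  */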
8354
8355 if (dump_enabled_p ())
8356 {
8357 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8358 {
8359 dump_printf_loc (MSG_NOTE, vect_location,
8360 "LOOP VECTORIZED\n");
8361 if (loop->inner)
8362 dump_printf_loc (MSG_NOTE, vect_location,
8363 "OUTER LOOP VECTORIZED\n");
8364 dump_printf (MSG_NOTE, "\n");
8365 }
8366 else
8367 {
8368 dump_printf_loc (MSG_NOTE, vect_location,
8369 "LOOP EPILOGUE VECTORIZED (VS=");
8370 dump_dec (MSG_NOTE, current_vector_size);
8371 dump_printf (MSG_NOTE, ")\n");
8372 }
8373 }
8374
8375 /* Loops vectorized with a variable factor won't benefit from
8376 unrolling/peeling. */
8377 if (!vf.is_constant ())
8378 {
8379 loop->unroll = 1;
8380 if (dump_enabled_p ())
8381 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
8382 " variable-length vectorization factor\n");
8383 }
8384 /* Free SLP instances here because otherwise stmt reference counting
8385 won't work. */
8386 slp_instance instance;
8387 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8388 vect_free_slp_instance (instance, true);
8389 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8390 /* Clear the safelen field since its value is invalid after vectorization:
8391 the vectorized loop can have loop-carried dependencies. */
8392 loop->safelen = 0;
8393
8394 /* Don't vectorize the epilogue of an epilogue loop. */
8395 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8396 epilogue = NULL;
8397
8398 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8399 epilogue = NULL;
8400
8401 if (epilogue)
8402 {
8403 auto_vector_sizes vector_sizes;
8404 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes, false);
8405 unsigned int next_size = 0;
8406
8407 /* Note LOOP_VINFO_NITERS_KNOWN_P and LOOP_VINFO_INT_NITERS work
8408 on niters already adjusted for the iterations of the prologue. */
8409 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8410 && known_eq (vf, lowest_vf))
8411 {
8412 unsigned HOST_WIDE_INT eiters
8413 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8414 - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
8415 eiters
8416 = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
8417 epilogue->nb_iterations_upper_bound = eiters - 1;
8418 epilogue->any_upper_bound = true;
8419
8420 unsigned int ratio;
8421 while (next_size < vector_sizes.length ()
8422 && !(constant_multiple_p (current_vector_size,
8423 vector_sizes[next_size], &ratio)
8424 && eiters >= lowest_vf / ratio))
8425 next_size += 1;
8426 }
8427 else
8428 while (next_size < vector_sizes.length ()
8429 && maybe_lt (current_vector_size, vector_sizes[next_size]))
8430 next_size += 1;
8431
8432 if (next_size == vector_sizes.length ())
8433 epilogue = NULL;
8434 }
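/* Worked example of the size selection above, with illustrative target
   numbers: suppose current_vector_size is 32 bytes, the VF is 8 and the
   scalar loop runs 20 iterations, so eiters = 20 % 8 = 4.  A 32-byte
   candidate (ratio 1) would need eiters >= 8 and is skipped; a 16-byte
   candidate has ratio 2 and only needs eiters >= 8 / 2 = 4, so the
   search stops there and the epilogue remains a candidate for
   vectorization at the smaller size.  */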
8435
8436 if (epilogue)
8437 {
8438 epilogue->force_vectorize = loop->force_vectorize;
8439 epilogue->safelen = loop->safelen;
8440 epilogue->dont_vectorize = false;
8441
8442 /* We may need to if-convert epilogue to vectorize it. */
8443 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8444 tree_if_conversion (epilogue);
8445 }
8446
8447 return epilogue;
8448 }
8449
8450 /* The code below performs a simple optimization - it reverts
8451 if-conversion for masked stores: if the mask of a store is zero,
8452 skip the store and, where possible, the producers of the stored values.
8453 For example,
8454 for (i=0; i<n; i++)
8455 if (c[i])
8456 {
8457 p1[i] += 1;
8458 p2[i] = p3[i] + 2;
8459 }
8460 this transformation will produce the following semi-hammock:
8461
8462 if (mask__ifc__42.18_165 != { 0, 0, 0, 0, 0, 0, 0, 0 })
8463 {
8464 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8465 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8466 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8467 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8468 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8469 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8470 }
8471 */
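/* In CFG terms the function below turns each such block into

       bb:        ... compute mask ...
                  if (mask == { 0, ..., 0 })  goto join_bb;
       store_bb:  the masked stores and, when possible, the statements
                  producing the stored values
       join_bb:   the code following the original masked stores

   (an illustrative sketch of the code below).  */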
8472
8473 void
8474 optimize_mask_stores (class loop *loop)
8475 {
8476 basic_block *bbs = get_loop_body (loop);
8477 unsigned nbbs = loop->num_nodes;
8478 unsigned i;
8479 basic_block bb;
8480 class loop *bb_loop;
8481 gimple_stmt_iterator gsi;
8482 gimple *stmt;
8483 auto_vec<gimple *> worklist;
8484 auto_purge_vect_location sentinel;
8485
8486 vect_location = find_loop_location (loop);
8487 /* Pick up all masked stores in loop if any. */
8488 for (i = 0; i < nbbs; i++)
8489 {
8490 bb = bbs[i];
8491 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8492 gsi_next (&gsi))
8493 {
8494 stmt = gsi_stmt (gsi);
8495 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8496 worklist.safe_push (stmt);
8497 }
8498 }
8499
8500 free (bbs);
8501 if (worklist.is_empty ())
8502 return;
8503
8504 /* Loop has masked stores. */
8505 while (!worklist.is_empty ())
8506 {
8507 gimple *last, *last_store;
8508 edge e, efalse;
8509 tree mask;
8510 basic_block store_bb, join_bb;
8511 gimple_stmt_iterator gsi_to;
8512 tree vdef, new_vdef;
8513 gphi *phi;
8514 tree vectype;
8515 tree zero;
8516
8517 last = worklist.pop ();
8518 mask = gimple_call_arg (last, 2);
8519 bb = gimple_bb (last);
8520 /* Create then_bb and an if-then structure in the CFG; then_bb belongs
8521 to the same loop as if_bb. That loop could differ from LOOP when a
8522 two-level loop nest is vectorized and the mask store belongs to the
8523 inner loop. */
8524 e = split_block (bb, last);
8525 bb_loop = bb->loop_father;
8526 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8527 join_bb = e->dest;
8528 store_bb = create_empty_bb (bb);
8529 add_bb_to_loop (store_bb, bb_loop);
8530 e->flags = EDGE_TRUE_VALUE;
8531 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8532 /* Put STORE_BB on the cold path: its incoming edge EFALSE is given an unlikely probability. */
8533 efalse->probability = profile_probability::unlikely ();
8534 store_bb->count = efalse->count ();
8535 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8536 if (dom_info_available_p (CDI_DOMINATORS))
8537 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8538 if (dump_enabled_p ())
8539 dump_printf_loc (MSG_NOTE, vect_location,
8540 "Create new block %d to sink mask stores.",
8541 store_bb->index);
8542 /* Create vector comparison with boolean result. */
8543 vectype = TREE_TYPE (mask);
8544 zero = build_zero_cst (vectype);
8545 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8546 gsi = gsi_last_bb (bb);
8547 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
8548 /* Create new PHI node for vdef of the last masked store:
8549 .MEM_2 = VDEF <.MEM_1>
8550 will be converted to
8551 .MEM.3 = VDEF <.MEM_1>
8552 and new PHI node will be created in join bb
8553 .MEM_2 = PHI <.MEM_1, .MEM_3>
8554 */
8555 vdef = gimple_vdef (last);
8556 new_vdef = make_ssa_name (gimple_vop (cfun), last);
8557 gimple_set_vdef (last, new_vdef);
8558 phi = create_phi_node (vdef, join_bb);
8559 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8560
8561 /* Put all masked stores with the same mask to STORE_BB if possible. */
8562 while (true)
8563 {
8564 gimple_stmt_iterator gsi_from;
8565 gimple *stmt1 = NULL;
8566
8567 /* Move masked store to STORE_BB. */
8568 last_store = last;
8569 gsi = gsi_for_stmt (last);
8570 gsi_from = gsi;
8571 /* Shift GSI to the previous stmt for further traversal. */
8572 gsi_prev (&gsi);
8573 gsi_to = gsi_start_bb (store_bb);
8574 gsi_move_before (&gsi_from, &gsi_to);
8575 /* Setup GSI_TO to the non-empty block start. */
8576 gsi_to = gsi_start_bb (store_bb);
8577 if (dump_enabled_p ())
8578 dump_printf_loc (MSG_NOTE, vect_location,
8579 "Move stmt to created bb\n%G", last);
8580 /* Move all stored value producers if possible. */
8581 while (!gsi_end_p (gsi))
8582 {
8583 tree lhs;
8584 imm_use_iterator imm_iter;
8585 use_operand_p use_p;
8586 bool res;
8587
8588 /* Skip debug statements. */
8589 if (is_gimple_debug (gsi_stmt (gsi)))
8590 {
8591 gsi_prev (&gsi);
8592 continue;
8593 }
8594 stmt1 = gsi_stmt (gsi);
8595 /* Do not consider statements writing to memory or having
8596 a volatile operand. */
8597 if (gimple_vdef (stmt1)
8598 || gimple_has_volatile_ops (stmt1))
8599 break;
8600 gsi_from = gsi;
8601 gsi_prev (&gsi);
8602 lhs = gimple_get_lhs (stmt1);
8603 if (!lhs)
8604 break;
8605
8606 /* LHS of vectorized stmt must be SSA_NAME. */
8607 if (TREE_CODE (lhs) != SSA_NAME)
8608 break;
8609
8610 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8611 {
8612 /* Remove dead scalar statement. */
8613 if (has_zero_uses (lhs))
8614 {
8615 gsi_remove (&gsi_from, true);
8616 continue;
8617 }
8618 }
8619
8620 /* Check that LHS does not have uses outside of STORE_BB. */
8621 res = true;
8622 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
8623 {
8624 gimple *use_stmt;
8625 use_stmt = USE_STMT (use_p);
8626 if (is_gimple_debug (use_stmt))
8627 continue;
8628 if (gimple_bb (use_stmt) != store_bb)
8629 {
8630 res = false;
8631 break;
8632 }
8633 }
8634 if (!res)
8635 break;
8636
8637 if (gimple_vuse (stmt1)
8638 && gimple_vuse (stmt1) != gimple_vuse (last_store))
8639 break;
8640
8641 /* Can move STMT1 to STORE_BB. */
8642 if (dump_enabled_p ())
8643 dump_printf_loc (MSG_NOTE, vect_location,
8644 "Move stmt to created bb\n%G", stmt1);
8645 gsi_move_before (&gsi_from, &gsi_to);
8646 /* Shift GSI_TO for further insertion. */
8647 gsi_prev (&gsi_to);
8648 }
8649 /* Put other masked stores with the same mask to STORE_BB. */
8650 if (worklist.is_empty ()
8651 || gimple_call_arg (worklist.last (), 2) != mask
8652 || worklist.last () != stmt1)
8653 break;
8654 last = worklist.pop ();
8655 }
8656 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
8657 }
8658 }
8659
8660 /* Decide whether it is possible to use a zero-based induction variable
8661 when vectorizing LOOP_VINFO with a fully-masked loop. If it is,
8662 return the value that the induction variable must be able to hold
8663 in order to ensure that the loop ends with an all-false mask.
8664 Return -1 otherwise. */
8665 widest_int
8666 vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo)
8667 {
8668 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
8669 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8670 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
8671
8672 /* Calculate the value that the induction variable must be able
8673 to hit in order to ensure that we end the loop with an all-false mask.
8674 This involves adding the maximum number of inactive trailing scalar
8675 iterations. */
8676 widest_int iv_limit = -1;
8677 if (max_loop_iterations (loop, &iv_limit))
8678 {
8679 if (niters_skip)
8680 {
8681 /* Add the maximum number of skipped iterations to the
8682 maximum iteration count. */
8683 if (TREE_CODE (niters_skip) == INTEGER_CST)
8684 iv_limit += wi::to_widest (niters_skip);
8685 else
8686 iv_limit += max_vf - 1;
8687 }
8688 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
8689 /* Make a conservatively-correct assumption. */
8690 iv_limit += max_vf - 1;
8691
8692 /* IV_LIMIT is the maximum number of latch iterations, which is also
8693 the maximum in-range IV value. Round this value down to the previous
8694 vector alignment boundary and then add an extra full iteration. */
8695 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8696 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
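/* Worked example (illustrative values): with a constant VF of 8,
   MAX_VF = 8, no peeling and no skipped iterations, a maximum latch
   count of 17 gives (17 & -8) + 8 = 16 + 8 = 24, so the IV must be
   able to reach 24 for the final mask to be all-false.  */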
8697 }
8698 return iv_limit;
8699 }
8700