1 /* Loop Vectorization
2 Copyright (C) 2003-2019 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57
58 /* Loop Vectorization Pass.
59
60 This pass tries to vectorize loops.
61
62 For example, the vectorizer transforms the following simple loop:
63
64 short a[N]; short b[N]; short c[N]; int i;
65
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
68 }
69
70 as if it had been manually vectorized by rewriting the source code into:
71
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
76
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
82 }
83
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
95
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
101
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
106
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
117
118 For example, say stmt S1 was vectorized into stmt VS1:
119
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
123
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
128
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
133
134 Operands that are not SSA_NAMEs, are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
136
137 Target modeling:
138 =================
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that can support different sizes of vectors will, for now, need
142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
144
145 Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
151
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
154 */
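
/* Illustrative sketch only (this is not code the pass emits verbatim):
   when N is not a multiple of the number of elements per vector, the
   vector loop above is conceptually followed by a scalar epilogue that
   handles the remaining iterations, e.g.

     for (i = 0; i + 8 <= N; i += 8)
       pa[i/8] = pb[i/8] + pc[i/8];
     for (; i < N; i++)
       a[i] = b[i] + c[i];

   On targets with predicated vector operations the tail can instead be
   handled by a fully-masked vector loop (see LOOP_VINFO_FULLY_MASKED_P
   below).  */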
155
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
157
158 /* Subroutine of vect_determine_vf_for_stmt that handles only one
159 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
160 may already be set for general statements (not just data refs). */
161
162 static opt_result
163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
164 bool vectype_maybe_set_p,
165 poly_uint64 *vf,
166 vec<stmt_vec_info > *mask_producers)
167 {
168 gimple *stmt = stmt_info->stmt;
169
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
173 {
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return opt_result::success ();
177 }
178
179 tree stmt_vectype, nunits_vectype;
180 opt_result res = vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
181 &nunits_vectype);
182 if (!res)
183 return res;
184
185 if (stmt_vectype)
186 {
187 if (STMT_VINFO_VECTYPE (stmt_info))
188 /* The only case when a vectype had been already set is for stmts
189 that contain a data ref, or for "pattern-stmts" (stmts generated
190 by the vectorizer to represent/replace a certain idiom). */
191 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
192 || vectype_maybe_set_p)
193 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
194 else if (stmt_vectype == boolean_type_node)
195 mask_producers->safe_push (stmt_info);
196 else
197 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
198 }
199
200 if (nunits_vectype)
201 vect_update_max_nunits (vf, nunits_vectype);
202
203 return opt_result::success ();
204 }
205
206 /* Subroutine of vect_determine_vectorization_factor. Set the vector
207 types of STMT_INFO and all attached pattern statements and update
208 the vectorization factor VF accordingly. If some of the statements
209 produce a mask result whose vector type can only be calculated later,
210 add them to MASK_PRODUCERS. Return true on success or false if
211 something prevented vectorization. */
212
213 static opt_result
214 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
215 vec<stmt_vec_info > *mask_producers)
216 {
217 vec_info *vinfo = stmt_info->vinfo;
218 if (dump_enabled_p ())
219 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
220 stmt_info->stmt);
221 opt_result res
222 = vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers);
223 if (!res)
224 return res;
225
226 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
227 && STMT_VINFO_RELATED_STMT (stmt_info))
228 {
229 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
230 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
231
232 /* If a pattern statement has def stmts, analyze them too. */
233 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
234 !gsi_end_p (si); gsi_next (&si))
235 {
236 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
237 if (dump_enabled_p ())
238 dump_printf_loc (MSG_NOTE, vect_location,
239 "==> examining pattern def stmt: %G",
240 def_stmt_info->stmt);
241 res = vect_determine_vf_for_stmt_1 (def_stmt_info, true,
242 vf, mask_producers);
245 if (!res)
246 return res;
247 }
248
249 if (dump_enabled_p ())
250 dump_printf_loc (MSG_NOTE, vect_location,
251 "==> examining pattern statement: %G",
252 stmt_info->stmt);
253 res = vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers);
254 if (!res)
255 return res;
256 }
257
258 return opt_result::success ();
259 }
260
261 /* Function vect_determine_vectorization_factor
262
263 Determine the vectorization factor (VF). VF is the number of data elements
264 that are operated upon in parallel in a single iteration of the vectorized
265 loop. For example, when vectorizing a loop that operates on 4byte elements,
266 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
267 elements can fit in a single vector register.
268
269 We currently support vectorization of loops in which all types operated upon
270 are of the same size. Therefore this function currently sets VF according to
271 the size of the types operated upon, and fails if there are multiple sizes
272 in the loop.
273
274 VF is also the factor by which the loop iterations are strip-mined, e.g.:
275 original loop:
276 for (i=0; i<N; i++){
277 a[i] = b[i] + c[i];
278 }
279
280 vectorized loop:
281 for (i=0; i<N; i+=VF){
282 a[i:VF] = b[i:VF] + c[i:VF];
283 }
284 */
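
/* Worked example (illustrative numbers): with a 16-byte vector size, a
   loop operating on "short" (2-byte) elements uses an 8-unit vector type
   such as V8HI, giving VF = 8, while a loop operating on "int" (4-byte)
   elements uses a 4-unit vector type such as V4SI, giving VF = 4.  The
   code below accumulates the maximum number of units required by any
   statement via vect_update_max_nunits.  */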
285
286 static opt_result
287 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
288 {
289 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
290 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
291 unsigned nbbs = loop->num_nodes;
292 poly_uint64 vectorization_factor = 1;
293 tree scalar_type = NULL_TREE;
294 gphi *phi;
295 tree vectype;
296 stmt_vec_info stmt_info;
297 unsigned i;
298 auto_vec<stmt_vec_info> mask_producers;
299
300 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
301
302 for (i = 0; i < nbbs; i++)
303 {
304 basic_block bb = bbs[i];
305
306 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
307 gsi_next (&si))
308 {
309 phi = si.phi ();
310 stmt_info = loop_vinfo->lookup_stmt (phi);
311 if (dump_enabled_p ())
312 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
313 phi);
314
315 gcc_assert (stmt_info);
316
317 if (STMT_VINFO_RELEVANT_P (stmt_info)
318 || STMT_VINFO_LIVE_P (stmt_info))
319 {
320 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
321 scalar_type = TREE_TYPE (PHI_RESULT (phi));
322
323 if (dump_enabled_p ())
324 dump_printf_loc (MSG_NOTE, vect_location,
325 "get vectype for scalar type: %T\n",
326 scalar_type);
327
328 vectype = get_vectype_for_scalar_type (scalar_type);
329 if (!vectype)
330 return opt_result::failure_at (phi,
331 "not vectorized: unsupported "
332 "data-type %T\n",
333 scalar_type);
334 STMT_VINFO_VECTYPE (stmt_info) = vectype;
335
336 if (dump_enabled_p ())
337 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
338 vectype);
339
340 if (dump_enabled_p ())
341 {
342 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
343 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
344 dump_printf (MSG_NOTE, "\n");
345 }
346
347 vect_update_max_nunits (&vectorization_factor, vectype);
348 }
349 }
350
351 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
352 gsi_next (&si))
353 {
354 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
355 opt_result res
356 = vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
357 &mask_producers);
358 if (!res)
359 return res;
360 }
361 }
362
363 /* TODO: Analyze cost. Decide if worth while to vectorize. */
364 if (dump_enabled_p ())
365 {
366 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
367 dump_dec (MSG_NOTE, vectorization_factor);
368 dump_printf (MSG_NOTE, "\n");
369 }
370
371 if (known_le (vectorization_factor, 1U))
372 return opt_result::failure_at (vect_location,
373 "not vectorized: unsupported data-type\n");
374 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
375
376 for (i = 0; i < mask_producers.length (); i++)
377 {
378 stmt_info = mask_producers[i];
379 opt_tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
380 if (!mask_type)
381 return opt_result::propagate_failure (mask_type);
382 STMT_VINFO_VECTYPE (stmt_info) = mask_type;
383 }
384
385 return opt_result::success ();
386 }
387
388
389 /* Function vect_is_simple_iv_evolution.
390
391 FORNOW: A simple evolution of an induction variable in the loop is
392 considered a polynomial evolution. */
393
394 static bool
395 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
396 tree * step)
397 {
398 tree init_expr;
399 tree step_expr;
400 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
401 basic_block bb;
402
403 /* When there is no evolution in this loop, the evolution function
404 is not "simple". */
405 if (evolution_part == NULL_TREE)
406 return false;
407
408 /* When the evolution is a polynomial of degree >= 2
409 the evolution function is not "simple". */
410 if (tree_is_chrec (evolution_part))
411 return false;
412
413 step_expr = evolution_part;
414 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
415
416 if (dump_enabled_p ())
417 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
418 step_expr, init_expr);
419
420 *init = init_expr;
421 *step = step_expr;
422
423 if (TREE_CODE (step_expr) != INTEGER_CST
424 && (TREE_CODE (step_expr) != SSA_NAME
425 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
426 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
427 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
428 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
429 || !flag_associative_math)))
430 && (TREE_CODE (step_expr) != REAL_CST
431 || !flag_associative_math))
432 {
433 if (dump_enabled_p ())
434 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
435 "step unknown.\n");
436 return false;
437 }
438
439 return true;
440 }
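
/* Illustrative example: for

     for (i = 0; i < n; i++)
       a[i] = ...;

   the scalar evolution of "i" in this loop is the chrec {0, +, 1}, so
   *INIT is 0 and *STEP is 1 and the evolution is considered simple.  An
   IV whose step itself varies across iterations (a chrec of degree two
   or more) is rejected above.  */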
441
442 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
443 what we are assuming is a double reduction. For example, given
444 a structure like this:
445
446 outer1:
447 x_1 = PHI <x_4(outer2), ...>;
448 ...
449
450 inner:
451 x_2 = PHI <x_1(outer1), ...>;
452 ...
453 x_3 = ...;
454 ...
455
456 outer2:
457 x_4 = PHI <x_3(inner)>;
458 ...
459
460 outer loop analysis would treat x_1 as a double reduction phi and
461 this function would then return true for x_2. */
462
463 static bool
464 vect_inner_phi_in_double_reduction_p (stmt_vec_info stmt_info, gphi *phi)
465 {
466 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
467 use_operand_p use_p;
468 ssa_op_iter op_iter;
469 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
470 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
471 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
472 return true;
473 return false;
474 }
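
/* A source-level shape that produces the PHI structure described above
   (illustrative only) is a sum accumulated across a loop nest when the
   outer loop is the one being vectorized:

     int sum = 0;
     for (i = 0; i < n; i++)
       for (j = 0; j < m; j++)
         sum += a[i][j];

   The outer-loop PHI of "sum" corresponds to x_1 (the double reduction
   PHI) and the inner-loop PHI corresponds to x_2, the one detected by
   the function above.  */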
475
476 /* Function vect_analyze_scalar_cycles_1.
477
478 Examine the cross iteration def-use cycles of scalar variables
479 in LOOP. LOOP_VINFO represents the loop that is now being
480 considered for vectorization (can be LOOP, or an outer-loop
481 enclosing LOOP). */
482
483 static void
484 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
485 {
486 basic_block bb = loop->header;
487 tree init, step;
488 auto_vec<stmt_vec_info, 64> worklist;
489 gphi_iterator gsi;
490 bool double_reduc;
491
492 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
493
494 /* First - identify all inductions. Reduction detection assumes that all the
495 inductions have been identified, therefore, this order must not be
496 changed. */
497 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
498 {
499 gphi *phi = gsi.phi ();
500 tree access_fn = NULL;
501 tree def = PHI_RESULT (phi);
502 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
503
504 if (dump_enabled_p ())
505 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
506
507 /* Skip virtual phi's. The data dependences that are associated with
508 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
509 if (virtual_operand_p (def))
510 continue;
511
512 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
513
514 /* Analyze the evolution function. */
515 access_fn = analyze_scalar_evolution (loop, def);
516 if (access_fn)
517 {
518 STRIP_NOPS (access_fn);
519 if (dump_enabled_p ())
520 dump_printf_loc (MSG_NOTE, vect_location,
521 "Access function of PHI: %T\n", access_fn);
522 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
523 = initial_condition_in_loop_num (access_fn, loop->num);
524 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
525 = evolution_part_in_loop_num (access_fn, loop->num);
526 }
527
528 if (!access_fn
529 || vect_inner_phi_in_double_reduction_p (stmt_vinfo, phi)
530 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
531 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
532 && TREE_CODE (step) != INTEGER_CST))
533 {
534 worklist.safe_push (stmt_vinfo);
535 continue;
536 }
537
538 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
539 != NULL_TREE);
540 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
541
542 if (dump_enabled_p ())
543 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
544 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
545 }
546
547
548 /* Second - identify all reductions and nested cycles. */
549 while (worklist.length () > 0)
550 {
551 stmt_vec_info stmt_vinfo = worklist.pop ();
552 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
553 tree def = PHI_RESULT (phi);
554
555 if (dump_enabled_p ())
556 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
557
558 gcc_assert (!virtual_operand_p (def)
559 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
560
561 stmt_vec_info reduc_stmt_info
562 = vect_force_simple_reduction (loop_vinfo, stmt_vinfo,
563 &double_reduc, false);
564 if (reduc_stmt_info)
565 {
566 if (double_reduc)
567 {
568 if (dump_enabled_p ())
569 dump_printf_loc (MSG_NOTE, vect_location,
570 "Detected double reduction.\n");
571
572 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
573 STMT_VINFO_DEF_TYPE (reduc_stmt_info)
574 = vect_double_reduction_def;
575 }
576 else
577 {
578 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
579 {
580 if (dump_enabled_p ())
581 dump_printf_loc (MSG_NOTE, vect_location,
582 "Detected vectorizable nested cycle.\n");
583
584 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
585 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_nested_cycle;
586 }
587 else
588 {
589 if (dump_enabled_p ())
590 dump_printf_loc (MSG_NOTE, vect_location,
591 "Detected reduction.\n");
592
593 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
594 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
595 /* Store the reduction cycles for possible vectorization in
596 loop-aware SLP if it was not detected as reduction
597 chain. */
598 if (! REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
599 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
600 (reduc_stmt_info);
601 }
602 }
603 }
604 else
605 if (dump_enabled_p ())
606 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
607 "Unknown def-use cycle pattern.\n");
608 }
609 }
610
611
612 /* Function vect_analyze_scalar_cycles.
613
614 Examine the cross iteration def-use cycles of scalar variables, by
615 analyzing the loop-header PHIs of scalar variables. Classify each
616 cycle as one of the following: invariant, induction, reduction, unknown.
617 We do that for the loop represented by LOOP_VINFO, and also for its
618 inner-loop, if one exists.
619 Examples for scalar cycles:
620
621 Example1: reduction:
622
623 loop1:
624 for (i=0; i<N; i++)
625 sum += a[i];
626
627 Example2: induction:
628
629 loop2:
630 for (i=0; i<N; i++)
631 a[i] = i; */
632
633 static void
634 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
635 {
636 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
637
638 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
639
640 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
641 Reductions in such inner-loop therefore have different properties than
642 the reductions in the nest that gets vectorized:
643 1. When vectorized, they are executed in the same order as in the original
644 scalar loop, so we can't change the order of computation when
645 vectorizing them.
646 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
647 current checks are too strict. */
648
649 if (loop->inner)
650 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
651 }
652
653 /* Transfer group and reduction information from STMT_INFO to its
654 pattern stmt. */
655
656 static void
657 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
658 {
659 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
660 stmt_vec_info stmtp;
661 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
662 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
663 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
664 do
665 {
666 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
667 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
668 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
669 if (stmt_info)
670 REDUC_GROUP_NEXT_ELEMENT (stmtp)
671 = STMT_VINFO_RELATED_STMT (stmt_info);
672 }
673 while (stmt_info);
674 STMT_VINFO_DEF_TYPE (stmtp) = vect_reduction_def;
675 }
676
677 /* Fixup scalar cycles that now have their stmts detected as patterns. */
678
679 static void
680 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
681 {
682 stmt_vec_info first;
683 unsigned i;
684
685 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
686 if (STMT_VINFO_IN_PATTERN_P (first))
687 {
688 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
689 while (next)
690 {
691 if (! STMT_VINFO_IN_PATTERN_P (next))
692 break;
693 next = REDUC_GROUP_NEXT_ELEMENT (next);
694 }
695 /* If not all stmts in the chain are patterns, try to handle
696 the chain without patterns. */
697 if (! next)
698 {
699 vect_fixup_reduc_chain (first);
700 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
701 = STMT_VINFO_RELATED_STMT (first);
702 }
703 }
704 }
705
706 /* Function vect_get_loop_niters.
707
708 Determine how many iterations the loop is executed and place it
709 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
710 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
711 niter information holds in ASSUMPTIONS.
712
713 Return the loop exit condition. */
714
715
716 static gcond *
717 vect_get_loop_niters (struct loop *loop, tree *assumptions,
718 tree *number_of_iterations, tree *number_of_iterationsm1)
719 {
720 edge exit = single_exit (loop);
721 struct tree_niter_desc niter_desc;
722 tree niter_assumptions, niter, may_be_zero;
723 gcond *cond = get_loop_exit_condition (loop);
724
725 *assumptions = boolean_true_node;
726 *number_of_iterationsm1 = chrec_dont_know;
727 *number_of_iterations = chrec_dont_know;
728 DUMP_VECT_SCOPE ("get_loop_niters");
729
730 if (!exit)
731 return cond;
732
733 may_be_zero = NULL_TREE;
734 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
735 || chrec_contains_undetermined (niter_desc.niter))
736 return cond;
737
738 niter_assumptions = niter_desc.assumptions;
739 may_be_zero = niter_desc.may_be_zero;
740 niter = niter_desc.niter;
741
742 if (may_be_zero && integer_zerop (may_be_zero))
743 may_be_zero = NULL_TREE;
744
745 if (may_be_zero)
746 {
747 if (COMPARISON_CLASS_P (may_be_zero))
748 {
749 /* Try to combine may_be_zero with assumptions, this can simplify
750 computation of niter expression. */
751 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
752 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
753 niter_assumptions,
754 fold_build1 (TRUTH_NOT_EXPR,
755 boolean_type_node,
756 may_be_zero));
757 else
758 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
759 build_int_cst (TREE_TYPE (niter), 0),
760 rewrite_to_non_trapping_overflow (niter));
761
762 may_be_zero = NULL_TREE;
763 }
764 else if (integer_nonzerop (may_be_zero))
765 {
766 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
767 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
768 return cond;
769 }
770 else
771 return cond;
772 }
773
774 *assumptions = niter_assumptions;
775 *number_of_iterationsm1 = niter;
776
777 /* We want the number of loop header executions which is the number
778 of latch executions plus one.
779 ??? For UINT_MAX latch executions this number overflows to zero
780 for loops like do { n++; } while (n != 0); */
781 if (niter && !chrec_contains_undetermined (niter))
782 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
783 build_int_cst (TREE_TYPE (niter), 1));
784 *number_of_iterations = niter;
785
786 return cond;
787 }
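
/* Worked example: for

     for (i = 0; i < n; i++)
       ...

   with n > 0 the latch executes n - 1 times, so NUMBER_OF_ITERATIONSM1
   is n - 1 and NUMBER_OF_ITERATIONS is n.  The two are tracked
   separately because of the overflow noted above: if the latch executes
   UINT_MAX times, adding 1 in a 32-bit unsigned type wraps the header
   count around to 0.  */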
788
789 /* Function bb_in_loop_p
790
791 Used as predicate for dfs order traversal of the loop bbs. */
792
793 static bool
794 bb_in_loop_p (const_basic_block bb, const void *data)
795 {
796 const struct loop *const loop = (const struct loop *)data;
797 if (flow_bb_inside_loop_p (loop, bb))
798 return true;
799 return false;
800 }
801
802
803 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
804 stmt_vec_info structs for all the stmts in LOOP_IN. */
805
806 _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared)
807 : vec_info (vec_info::loop, init_cost (loop_in), shared),
808 loop (loop_in),
809 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
810 num_itersm1 (NULL_TREE),
811 num_iters (NULL_TREE),
812 num_iters_unchanged (NULL_TREE),
813 num_iters_assumptions (NULL_TREE),
814 th (0),
815 versioning_threshold (0),
816 vectorization_factor (0),
817 max_vectorization_factor (0),
818 mask_skip_niters (NULL_TREE),
819 mask_compare_type (NULL_TREE),
820 simd_if_cond (NULL_TREE),
821 unaligned_dr (NULL),
822 peeling_for_alignment (0),
823 ptr_mask (0),
824 ivexpr_map (NULL),
825 scan_map (NULL),
826 slp_unrolling_factor (1),
827 single_scalar_iteration_cost (0),
828 vectorizable (false),
829 can_fully_mask_p (true),
830 fully_masked_p (false),
831 peeling_for_gaps (false),
832 peeling_for_niter (false),
833 operands_swapped (false),
834 no_data_dependencies (false),
835 has_mask_store (false),
836 scalar_loop (NULL),
837 orig_loop_info (NULL)
838 {
839 /* CHECKME: We want to visit all BBs before their successors (except for
840 latch blocks, for which this assertion wouldn't hold). In the simple
841 case of the loop forms we allow, a dfs order of the BBs would be the same
842 as reversed postorder traversal, so we are safe. */
843
844 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
845 bbs, loop->num_nodes, loop);
846 gcc_assert (nbbs == loop->num_nodes);
847
848 for (unsigned int i = 0; i < nbbs; i++)
849 {
850 basic_block bb = bbs[i];
851 gimple_stmt_iterator si;
852
853 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
854 {
855 gimple *phi = gsi_stmt (si);
856 gimple_set_uid (phi, 0);
857 add_stmt (phi);
858 }
859
860 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
861 {
862 gimple *stmt = gsi_stmt (si);
863 gimple_set_uid (stmt, 0);
864 add_stmt (stmt);
865 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
866 third argument is the #pragma omp simd if (x) condition.  When it is 0
867 the loop shouldn't be vectorized; when it is a non-zero constant it
868 should be vectorized normally; otherwise the loop is versioned, with
869 the vectorized copy used if the condition is non-zero at runtime. */
870 if (loop_in->simduid
871 && is_gimple_call (stmt)
872 && gimple_call_internal_p (stmt)
873 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
874 && gimple_call_num_args (stmt) >= 3
875 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
876 && (loop_in->simduid
877 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
878 {
879 tree arg = gimple_call_arg (stmt, 2);
880 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
881 simd_if_cond = arg;
882 else
883 gcc_assert (integer_nonzerop (arg));
884 }
885 }
886 }
887 }
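
/* Source-level illustration of the simd_if_cond handling above
   (illustrative only):

     #pragma omp simd if (c)
     for (i = 0; i < n; i++)
       a[i] = b[i];

   The IFN_GOMP_SIMD_LANE call created for this loop carries "c" as its
   third argument: if "c" is the constant 0 the loop is not vectorized,
   if it is a non-zero constant it is vectorized normally, and otherwise
   the vectorized loop is versioned on "c" being non-zero at runtime.  */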
888
889 /* Free all levels of MASKS. */
890
891 void
892 release_vec_loop_masks (vec_loop_masks *masks)
893 {
894 rgroup_masks *rgm;
895 unsigned int i;
896 FOR_EACH_VEC_ELT (*masks, i, rgm)
897 rgm->masks.release ();
898 masks->release ();
899 }
900
901 /* Free all memory used by the _loop_vec_info, as well as all the
902 stmt_vec_info structs of all the stmts in the loop. */
903
904 _loop_vec_info::~_loop_vec_info ()
905 {
906 int nbbs;
907 gimple_stmt_iterator si;
908 int j;
909
910 nbbs = loop->num_nodes;
911 for (j = 0; j < nbbs; j++)
912 {
913 basic_block bb = bbs[j];
914 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
915 {
916 gimple *stmt = gsi_stmt (si);
917
918 /* We may have broken canonical form by moving a constant
919 into RHS1 of a commutative op. Fix such occurrences. */
920 if (operands_swapped && is_gimple_assign (stmt))
921 {
922 enum tree_code code = gimple_assign_rhs_code (stmt);
923
924 if ((code == PLUS_EXPR
925 || code == POINTER_PLUS_EXPR
926 || code == MULT_EXPR)
927 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
928 swap_ssa_operands (stmt,
929 gimple_assign_rhs1_ptr (stmt),
930 gimple_assign_rhs2_ptr (stmt));
931 else if (code == COND_EXPR
932 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
933 {
934 tree cond_expr = gimple_assign_rhs1 (stmt);
935 enum tree_code cond_code = TREE_CODE (cond_expr);
936
937 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
938 {
939 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
940 0));
941 cond_code = invert_tree_comparison (cond_code,
942 honor_nans);
943 if (cond_code != ERROR_MARK)
944 {
945 TREE_SET_CODE (cond_expr, cond_code);
946 swap_ssa_operands (stmt,
947 gimple_assign_rhs2_ptr (stmt),
948 gimple_assign_rhs3_ptr (stmt));
949 }
950 }
951 }
952 }
953 gsi_next (&si);
954 }
955 }
956
957 free (bbs);
958
959 release_vec_loop_masks (&masks);
960 delete ivexpr_map;
961 delete scan_map;
962
963 loop->aux = NULL;
964 }
965
966 /* Return an invariant or register for EXPR and emit necessary
967 computations in the LOOP_VINFO loop preheader. */
968
969 tree
970 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
971 {
972 if (is_gimple_reg (expr)
973 || is_gimple_min_invariant (expr))
974 return expr;
975
976 if (! loop_vinfo->ivexpr_map)
977 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
978 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
979 if (! cached)
980 {
981 gimple_seq stmts = NULL;
982 cached = force_gimple_operand (unshare_expr (expr),
983 &stmts, true, NULL_TREE);
984 if (stmts)
985 {
986 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
987 gsi_insert_seq_on_edge_immediate (e, stmts);
988 }
989 }
990 return cached;
991 }
992
993 /* Return true if we can use CMP_TYPE as the comparison type to produce
994 all masks required to mask LOOP_VINFO. */
995
996 static bool
997 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
998 {
999 rgroup_masks *rgm;
1000 unsigned int i;
1001 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1002 if (rgm->mask_type != NULL_TREE
1003 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1004 cmp_type, rgm->mask_type,
1005 OPTIMIZE_FOR_SPEED))
1006 return false;
1007 return true;
1008 }
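
/* Illustrative example: with a comparison type of unsigned int, a mask
   type of 4 booleans and 10 scalar iterations, the IFN_WHILE_ULT mask
   for the third vector iteration compares { 8, 9, 10, 11 } against 10
   and is therefore { 1, 1, 0, 0 }, i.e. only the first two lanes are
   active in that iteration.  */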
1009
1010 /* Calculate the maximum number of scalars per iteration across all
1011 rgroups in LOOP_VINFO. */
1012
1013 static unsigned int
1014 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1015 {
1016 unsigned int res = 1;
1017 unsigned int i;
1018 rgroup_masks *rgm;
1019 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1020 res = MAX (res, rgm->max_nscalars_per_iter);
1021 return res;
1022 }
1023
1024 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1025 whether we can actually generate the masks required. Return true if so,
1026 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
1027
1028 static bool
1029 vect_verify_full_masking (loop_vec_info loop_vinfo)
1030 {
1031 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1032 unsigned int min_ni_width;
1033 unsigned int max_nscalars_per_iter
1034 = vect_get_max_nscalars_per_iter (loop_vinfo);
1035
1036 /* Use a normal loop if there are no statements that need masking.
1037 This only happens in rare degenerate cases: it means that the loop
1038 has no loads, no stores, and no live-out values. */
1039 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1040 return false;
1041
1042 /* Get the maximum number of iterations that is representable
1043 in the counter type. */
1044 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1045 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1046
1047 /* Get a more refined estimate for the number of iterations. */
1048 widest_int max_back_edges;
1049 if (max_loop_iterations (loop, &max_back_edges))
1050 max_ni = wi::smin (max_ni, max_back_edges + 1);
1051
1052 /* Account for rgroup masks, in which each bit is replicated N times. */
1053 max_ni *= max_nscalars_per_iter;
1054
1055 /* Work out how many bits we need to represent the limit. */
1056 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1057
1058 /* Find a scalar mode for which WHILE_ULT is supported. */
1059 opt_scalar_int_mode cmp_mode_iter;
1060 tree cmp_type = NULL_TREE;
1061 tree iv_type = NULL_TREE;
1062 widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
1063 unsigned int iv_precision = UINT_MAX;
1064
1065 if (iv_limit != -1)
1066 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1067 UNSIGNED);
1068
1069 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1070 {
1071 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1072 if (cmp_bits >= min_ni_width
1073 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1074 {
1075 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1076 if (this_type
1077 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1078 {
1079 /* Although we could stop as soon as we find a valid mode,
1080 there are at least two reasons why that's not always the
1081 best choice:
1082
1083 - An IV that's Pmode or wider is more likely to be reusable
1084 in address calculations than an IV that's narrower than
1085 Pmode.
1086
1087 - Doing the comparison in IV_PRECISION or wider allows
1088 a natural 0-based IV, whereas using a narrower comparison
1089 type requires mitigations against wrap-around.
1090
1091 Conversely, if the IV limit is variable, doing the comparison
1092 in a wider type than the original type can introduce
1093 unnecessary extensions, so picking the widest valid mode
1094 is not always a good choice either.
1095
1096 Here we prefer the first IV type that's Pmode or wider,
1097 and the first comparison type that's IV_PRECISION or wider.
1098 (The comparison type must be no wider than the IV type,
1099 to avoid extensions in the vector loop.)
1100
1101 ??? We might want to try continuing beyond Pmode for ILP32
1102 targets if CMP_BITS < IV_PRECISION. */
1103 iv_type = this_type;
1104 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1105 cmp_type = this_type;
1106 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1107 break;
1108 }
1109 }
1110 }
1111
1112 if (!cmp_type)
1113 return false;
1114
1115 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1116 LOOP_VINFO_MASK_IV_TYPE (loop_vinfo) = iv_type;
1117 return true;
1118 }
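
/* Worked example of the precision computation above (illustrative
   numbers): if the niter type is a 32-bit unsigned type, MAX_NI starts
   as 2^32; with MAX_NSCALARS_PER_ITER equal to 2 the scaled limit is
   2^33, which needs 34 bits, so a 32-bit comparison type is not wide
   enough and a wider mode such as DImode must support WHILE_ULT for
   full masking to be usable.  */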
1119
1120 /* Calculate the cost of one scalar iteration of the loop. */
1121 static void
1122 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1123 {
1124 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1125 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1126 int nbbs = loop->num_nodes, factor;
1127 int innerloop_iters, i;
1128
1129 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1130
1131 /* Gather costs for statements in the scalar loop. */
1132
1133 /* FORNOW. */
1134 innerloop_iters = 1;
1135 if (loop->inner)
1136 innerloop_iters = 50; /* FIXME */
1137
1138 for (i = 0; i < nbbs; i++)
1139 {
1140 gimple_stmt_iterator si;
1141 basic_block bb = bbs[i];
1142
1143 if (bb->loop_father == loop->inner)
1144 factor = innerloop_iters;
1145 else
1146 factor = 1;
1147
1148 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1149 {
1150 gimple *stmt = gsi_stmt (si);
1151 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1152
1153 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1154 continue;
1155
1156 /* Skip stmts that are not vectorized inside the loop. */
1157 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1158 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1159 && (!STMT_VINFO_LIVE_P (vstmt_info)
1160 || !VECTORIZABLE_CYCLE_DEF
1161 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1162 continue;
1163
1164 vect_cost_for_stmt kind;
1165 if (STMT_VINFO_DATA_REF (stmt_info))
1166 {
1167 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1168 kind = scalar_load;
1169 else
1170 kind = scalar_store;
1171 }
1172 else
1173 kind = scalar_stmt;
1174
1175 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1176 factor, kind, stmt_info, 0, vect_prologue);
1177 }
1178 }
1179
1180 /* Now accumulate cost. */
1181 void *target_cost_data = init_cost (loop);
1182 stmt_info_for_cost *si;
1183 int j;
1184 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1185 j, si)
1186 (void) add_stmt_cost (target_cost_data, si->count,
1187 si->kind, si->stmt_info, si->misalign,
1188 vect_body);
1189 unsigned dummy, body_cost = 0;
1190 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1191 destroy_cost_data (target_cost_data);
1192 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1193 }
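
/* Worked example (with made-up target costs): for a loop body containing
   two scalar loads, one scalar store and one scalar arithmetic statement,
   on a target whose add_stmt_cost returns 1 for each of those kinds, the
   single scalar iteration cost recorded here would be 4.  Statements in
   an inner loop are additionally weighted by the FIXME factor of 50
   above.  */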
1194
1195
1196 /* Function vect_analyze_loop_form_1.
1197
1198 Verify that certain CFG restrictions hold, including:
1199 - the loop has a pre-header
1200 - the loop has a single entry and exit
1201 - the loop exit condition is simple enough
1202 - the number of iterations can be analyzed, i.e., a countable loop. The
1203 niter could be analyzed under some assumptions. */
1204
1205 opt_result
1206 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1207 tree *assumptions, tree *number_of_iterationsm1,
1208 tree *number_of_iterations, gcond **inner_loop_cond)
1209 {
1210 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1211
1212 /* Different restrictions apply when we are considering an inner-most loop,
1213 vs. an outer (nested) loop.
1214 (FORNOW. May want to relax some of these restrictions in the future). */
1215
1216 if (!loop->inner)
1217 {
1218 /* Inner-most loop. We currently require that the number of BBs is
1219 exactly 2 (the header and latch). Vectorizable inner-most loops
1220 look like this:
1221
1222 (pre-header)
1223 |
1224 header <--------+
1225 | | |
1226 | +--> latch --+
1227 |
1228 (exit-bb) */
1229
1230 if (loop->num_nodes != 2)
1231 return opt_result::failure_at (vect_location,
1232 "not vectorized:"
1233 " control flow in loop.\n");
1234
1235 if (empty_block_p (loop->header))
1236 return opt_result::failure_at (vect_location,
1237 "not vectorized: empty loop.\n");
1238 }
1239 else
1240 {
1241 struct loop *innerloop = loop->inner;
1242 edge entryedge;
1243
1244 /* Nested loop. We currently require that the loop is doubly-nested,
1245 contains a single inner loop, and the number of BBs is exactly 5.
1246 Vectorizable outer-loops look like this:
1247
1248 (pre-header)
1249 |
1250 header <---+
1251 | |
1252 inner-loop |
1253 | |
1254 tail ------+
1255 |
1256 (exit-bb)
1257
1258 The inner-loop has the properties expected of inner-most loops
1259 as described above. */
1260
1261 if ((loop->inner)->inner || (loop->inner)->next)
1262 return opt_result::failure_at (vect_location,
1263 "not vectorized:"
1264 " multiple nested loops.\n");
1265
1266 if (loop->num_nodes != 5)
1267 return opt_result::failure_at (vect_location,
1268 "not vectorized:"
1269 " control flow in loop.\n");
1270
1271 entryedge = loop_preheader_edge (innerloop);
1272 if (entryedge->src != loop->header
1273 || !single_exit (innerloop)
1274 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1275 return opt_result::failure_at (vect_location,
1276 "not vectorized:"
1277 " unsupported outerloop form.\n");
1278
1279 /* Analyze the inner-loop. */
1280 tree inner_niterm1, inner_niter, inner_assumptions;
1281 opt_result res
1282 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1283 &inner_assumptions, &inner_niterm1,
1284 &inner_niter, NULL);
1285 if (!res)
1286 {
1287 if (dump_enabled_p ())
1288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1289 "not vectorized: Bad inner loop.\n");
1290 return res;
1291 }
1292
1293 /* Don't support analyzing niter under assumptions for inner
1294 loop. */
1295 if (!integer_onep (inner_assumptions))
1296 return opt_result::failure_at (vect_location,
1297 "not vectorized: Bad inner loop.\n");
1298
1299 if (!expr_invariant_in_loop_p (loop, inner_niter))
1300 return opt_result::failure_at (vect_location,
1301 "not vectorized: inner-loop count not"
1302 " invariant.\n");
1303
1304 if (dump_enabled_p ())
1305 dump_printf_loc (MSG_NOTE, vect_location,
1306 "Considering outer-loop vectorization.\n");
1307 }
1308
1309 if (!single_exit (loop))
1310 return opt_result::failure_at (vect_location,
1311 "not vectorized: multiple exits.\n");
1312 if (EDGE_COUNT (loop->header->preds) != 2)
1313 return opt_result::failure_at (vect_location,
1314 "not vectorized:"
1315 " too many incoming edges.\n");
1316
1317 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1318 that the loop is represented as a do-while (with a proper if-guard
1319 before the loop if needed), where the loop header contains all the
1320 executable statements, and the latch is empty. */
1321 if (!empty_block_p (loop->latch)
1322 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1323 return opt_result::failure_at (vect_location,
1324 "not vectorized: latch block not empty.\n");
1325
1326 /* Make sure the exit is not abnormal. */
1327 edge e = single_exit (loop);
1328 if (e->flags & EDGE_ABNORMAL)
1329 return opt_result::failure_at (vect_location,
1330 "not vectorized:"
1331 " abnormal loop exit edge.\n");
1332
1333 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1334 number_of_iterationsm1);
1335 if (!*loop_cond)
1336 return opt_result::failure_at
1337 (vect_location,
1338 "not vectorized: complicated exit condition.\n");
1339
1340 if (integer_zerop (*assumptions)
1341 || !*number_of_iterations
1342 || chrec_contains_undetermined (*number_of_iterations))
1343 return opt_result::failure_at
1344 (*loop_cond,
1345 "not vectorized: number of iterations cannot be computed.\n");
1346
1347 if (integer_zerop (*number_of_iterations))
1348 return opt_result::failure_at
1349 (*loop_cond,
1350 "not vectorized: number of iterations = 0.\n");
1351
1352 return opt_result::success ();
1353 }
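
/* Source-level illustration of the CFG forms checked above (illustrative
   only): a countable loop whose body is straight-line code, such as

     for (i = 0; i < n; i++)
       a[i] = b[i] + c[i];

   matches the two-block inner-most form.  A conditional in the body,
   such as

     for (i = 0; i < n; i++)
       if (b[i] > 0)
         a[i] = b[i];

   introduces extra basic blocks and is only accepted once if-conversion
   has flattened the body back into a single block.  */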
1354
1355 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1356
1357 opt_loop_vec_info
1358 vect_analyze_loop_form (struct loop *loop, vec_info_shared *shared)
1359 {
1360 tree assumptions, number_of_iterations, number_of_iterationsm1;
1361 gcond *loop_cond, *inner_loop_cond = NULL;
1362
1363 opt_result res
1364 = vect_analyze_loop_form_1 (loop, &loop_cond,
1365 &assumptions, &number_of_iterationsm1,
1366 &number_of_iterations, &inner_loop_cond);
1367 if (!res)
1368 return opt_loop_vec_info::propagate_failure (res);
1369
1370 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1371 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1372 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1373 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1374 if (!integer_onep (assumptions))
1375 {
1376 /* We consider to vectorize this loop by versioning it under
1377 some assumptions. In order to do this, we need to clear
1378 existing information computed by scev and niter analyzer. */
1379 scev_reset_htab ();
1380 free_numbers_of_iterations_estimates (loop);
1381 /* Also set flag for this loop so that following scev and niter
1382 analysis are done under the assumptions. */
1383 loop_constraint_set (loop, LOOP_C_FINITE);
1384 /* Also record the assumptions for versioning. */
1385 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1386 }
1387
1388 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1389 {
1390 if (dump_enabled_p ())
1391 {
1392 dump_printf_loc (MSG_NOTE, vect_location,
1393 "Symbolic number of iterations is ");
1394 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1395 dump_printf (MSG_NOTE, "\n");
1396 }
1397 }
1398
1399 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1400 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1401 if (inner_loop_cond)
1402 {
1403 stmt_vec_info inner_loop_cond_info
1404 = loop_vinfo->lookup_stmt (inner_loop_cond);
1405 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1406 }
1407
1408 gcc_assert (!loop->aux);
1409 loop->aux = loop_vinfo;
1410 return opt_loop_vec_info::success (loop_vinfo);
1411 }
1412
1413
1414
1415 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1416 statements, update the vectorization factor. */
1417
1418 static void
1419 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1420 {
1421 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1422 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1423 int nbbs = loop->num_nodes;
1424 poly_uint64 vectorization_factor;
1425 int i;
1426
1427 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1428
1429 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1430 gcc_assert (known_ne (vectorization_factor, 0U));
1431
1432 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1433 vectorization factor of the loop is the unrolling factor required by
1434 the SLP instances. If that unrolling factor is 1, we say that we
1435 perform pure SLP on the loop - cross-iteration parallelism is not
1436 exploited. */
1437 bool only_slp_in_loop = true;
1438 for (i = 0; i < nbbs; i++)
1439 {
1440 basic_block bb = bbs[i];
1441 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1442 gsi_next (&si))
1443 {
1444 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1445 stmt_info = vect_stmt_to_vectorize (stmt_info);
1446 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1447 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1448 && !PURE_SLP_STMT (stmt_info))
1449 /* STMT needs both SLP and loop-based vectorization. */
1450 only_slp_in_loop = false;
1451 }
1452 }
1453
1454 if (only_slp_in_loop)
1455 {
1456 if (dump_enabled_p ())
1457 dump_printf_loc (MSG_NOTE, vect_location,
1458 "Loop contains only SLP stmts\n");
1459 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1460 }
1461 else
1462 {
1463 if (dump_enabled_p ())
1464 dump_printf_loc (MSG_NOTE, vect_location,
1465 "Loop contains SLP and non-SLP stmts\n");
1466 /* Both the vectorization factor and unroll factor have the form
1467 current_vector_size * X for some rational X, so they must have
1468 a common multiple. */
1469 vectorization_factor
1470 = force_common_multiple (vectorization_factor,
1471 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1472 }
1473
1474 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1475 if (dump_enabled_p ())
1476 {
1477 dump_printf_loc (MSG_NOTE, vect_location,
1478 "Updating vectorization factor to ");
1479 dump_dec (MSG_NOTE, vectorization_factor);
1480 dump_printf (MSG_NOTE, ".\n");
1481 }
1482 }
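
/* Worked example: if the loop-based vectorization factor is 2 and the
   SLP instances require an unrolling factor of 4, the updated
   vectorization factor is their least common multiple, 4.  */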
1483
1484 /* Return true if STMT_INFO describes a double reduction phi and if
1485 the other phi in the reduction is also relevant for vectorization.
1486 This rejects cases such as:
1487
1488 outer1:
1489 x_1 = PHI <x_3(outer2), ...>;
1490 ...
1491
1492 inner:
1493 x_2 = ...;
1494 ...
1495
1496 outer2:
1497 x_3 = PHI <x_2(inner)>;
1498
1499 if nothing in x_2 or elsewhere makes x_1 relevant. */
1500
1501 static bool
1502 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1503 {
1504 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1505 return false;
1506
1507 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1508 }
1509
1510 /* Function vect_analyze_loop_operations.
1511
1512 Scan the loop stmts and make sure they are all vectorizable. */
1513
1514 static opt_result
1515 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1516 {
1517 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1518 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1519 int nbbs = loop->num_nodes;
1520 int i;
1521 stmt_vec_info stmt_info;
1522 bool need_to_vectorize = false;
1523 bool ok;
1524
1525 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1526
1527 auto_vec<stmt_info_for_cost> cost_vec;
1528
1529 for (i = 0; i < nbbs; i++)
1530 {
1531 basic_block bb = bbs[i];
1532
1533 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1534 gsi_next (&si))
1535 {
1536 gphi *phi = si.phi ();
1537 ok = true;
1538
1539 stmt_info = loop_vinfo->lookup_stmt (phi);
1540 if (dump_enabled_p ())
1541 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1542 if (virtual_operand_p (gimple_phi_result (phi)))
1543 continue;
1544
1545 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1546 (i.e., a phi in the tail of the outer-loop). */
1547 if (! is_loop_header_bb_p (bb))
1548 {
1549 /* FORNOW: we currently don't support the case that these phis
1550 are not used in the outerloop (unless it is double reduction,
1551 i.e., this phi is vect_reduction_def), because this case
1552 requires us to actually do something here. */
1553 if (STMT_VINFO_LIVE_P (stmt_info)
1554 && !vect_active_double_reduction_p (stmt_info))
1555 return opt_result::failure_at (phi,
1556 "Unsupported loop-closed phi"
1557 " in outer-loop.\n");
1558
1559 /* If PHI is used in the outer loop, we check that its operand
1560 is defined in the inner loop. */
1561 if (STMT_VINFO_RELEVANT_P (stmt_info))
1562 {
1563 tree phi_op;
1564
1565 if (gimple_phi_num_args (phi) != 1)
1566 return opt_result::failure_at (phi, "unsupported phi");
1567
1568 phi_op = PHI_ARG_DEF (phi, 0);
1569 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1570 if (!op_def_info)
1571 return opt_result::failure_at (phi, "unsupported phi");
1572
1573 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1574 && (STMT_VINFO_RELEVANT (op_def_info)
1575 != vect_used_in_outer_by_reduction))
1576 return opt_result::failure_at (phi, "unsupported phi");
1577 }
1578
1579 continue;
1580 }
1581
1582 gcc_assert (stmt_info);
1583
1584 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1585 || STMT_VINFO_LIVE_P (stmt_info))
1586 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1587 /* A scalar-dependence cycle that we don't support. */
1588 return opt_result::failure_at (phi,
1589 "not vectorized:"
1590 " scalar dependence cycle.\n");
1591
1592 if (STMT_VINFO_RELEVANT_P (stmt_info))
1593 {
1594 need_to_vectorize = true;
1595 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1596 && ! PURE_SLP_STMT (stmt_info))
1597 ok = vectorizable_induction (stmt_info, NULL, NULL, NULL,
1598 &cost_vec);
1599 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1600 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1601 && ! PURE_SLP_STMT (stmt_info))
1602 ok = vectorizable_reduction (stmt_info, NULL, NULL, NULL, NULL,
1603 &cost_vec);
1604 }
1605
1606 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1607 if (ok
1608 && STMT_VINFO_LIVE_P (stmt_info)
1609 && !PURE_SLP_STMT (stmt_info))
1610 ok = vectorizable_live_operation (stmt_info, NULL, NULL, -1, NULL,
1611 &cost_vec);
1612
1613 if (!ok)
1614 return opt_result::failure_at (phi,
1615 "not vectorized: relevant phi not "
1616 "supported: %G",
1617 static_cast <gimple *> (phi));
1618 }
1619
1620 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1621 gsi_next (&si))
1622 {
1623 gimple *stmt = gsi_stmt (si);
1624 if (!gimple_clobber_p (stmt))
1625 {
1626 opt_result res
1627 = vect_analyze_stmt (loop_vinfo->lookup_stmt (stmt),
1628 &need_to_vectorize,
1629 NULL, NULL, &cost_vec);
1630 if (!res)
1631 return res;
1632 }
1633 }
1634 } /* bbs */
1635
1636 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1637
1638 /* All operations in the loop are either irrelevant (deal with loop
1639 control, or dead), or only used outside the loop and can be moved
1640 out of the loop (e.g. invariants, inductions). The loop can be
1641 optimized away by scalar optimizations. We're better off not
1642 touching this loop. */
1643 if (!need_to_vectorize)
1644 {
1645 if (dump_enabled_p ())
1646 dump_printf_loc (MSG_NOTE, vect_location,
1647 "All the computation can be taken out of the loop.\n");
1648 return opt_result::failure_at
1649 (vect_location,
1650 "not vectorized: redundant loop. no profit to vectorize.\n");
1651 }
1652
1653 return opt_result::success ();
1654 }
1655
1656 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1657 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1658 definitely no, or -1 if it's worth retrying. */
1659
1660 static int
1661 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1662 {
1663 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1664 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1665
1666 /* Only fully-masked loops can have iteration counts less than the
1667 vectorization factor. */
1668 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1669 {
1670 HOST_WIDE_INT max_niter;
1671
1672 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1673 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1674 else
1675 max_niter = max_stmt_executions_int (loop);
1676
1677 if (max_niter != -1
1678 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1679 {
1680 if (dump_enabled_p ())
1681 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1682 "not vectorized: iteration count smaller than "
1683 "vectorization factor.\n");
1684 return 0;
1685 }
1686 }
1687
1688 int min_profitable_iters, min_profitable_estimate;
1689 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1690 &min_profitable_estimate);
1691
1692 if (min_profitable_iters < 0)
1693 {
1694 if (dump_enabled_p ())
1695 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1696 "not vectorized: vectorization not profitable.\n");
1697 if (dump_enabled_p ())
1698 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1699 "not vectorized: vector version will never be "
1700 "profitable.\n");
1701 return -1;
1702 }
1703
1704 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1705 * assumed_vf);
1706
1707 /* Use the cost model only if it is more conservative than user specified
1708 threshold. */
1709 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1710 min_profitable_iters);
1711
1712 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1713
1714 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1715 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1716 {
1717 if (dump_enabled_p ())
1718 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1719 "not vectorized: vectorization not profitable.\n");
1720 if (dump_enabled_p ())
1721 dump_printf_loc (MSG_NOTE, vect_location,
1722 "not vectorized: iteration count smaller than user "
1723 "specified loop bound parameter or minimum profitable "
1724 "iterations (whichever is more conservative).\n");
1725 return 0;
1726 }
1727
1728 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1729 if (estimated_niter == -1)
1730 estimated_niter = likely_max_stmt_executions_int (loop);
1731 if (estimated_niter != -1
1732 && ((unsigned HOST_WIDE_INT) estimated_niter
1733 < MAX (th, (unsigned) min_profitable_estimate)))
1734 {
1735 if (dump_enabled_p ())
1736 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1737 "not vectorized: estimated iteration count too "
1738 "small.\n");
1739 if (dump_enabled_p ())
1740 dump_printf_loc (MSG_NOTE, vect_location,
1741 "not vectorized: estimated iteration count smaller "
1742 "than specified loop bound parameter or minimum "
1743 "profitable iterations (whichever is more "
1744 "conservative).\n");
1745 return -1;
1746 }
1747
1748 return 1;
1749 }
1750
1751 static opt_result
1752 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1753 vec<data_reference_p> *datarefs,
1754 unsigned int *n_stmts)
1755 {
1756 *n_stmts = 0;
1757 for (unsigned i = 0; i < loop->num_nodes; i++)
1758 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1759 !gsi_end_p (gsi); gsi_next (&gsi))
1760 {
1761 gimple *stmt = gsi_stmt (gsi);
1762 if (is_gimple_debug (stmt))
1763 continue;
1764 ++(*n_stmts);
1765 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs);
1766 if (!res)
1767 {
1768 if (is_gimple_call (stmt) && loop->safelen)
1769 {
1770 tree fndecl = gimple_call_fndecl (stmt), op;
1771 if (fndecl != NULL_TREE)
1772 {
1773 cgraph_node *node = cgraph_node::get (fndecl);
1774 if (node != NULL && node->simd_clones != NULL)
1775 {
1776 unsigned int j, n = gimple_call_num_args (stmt);
1777 for (j = 0; j < n; j++)
1778 {
1779 op = gimple_call_arg (stmt, j);
1780 if (DECL_P (op)
1781 || (REFERENCE_CLASS_P (op)
1782 && get_base_address (op)))
1783 break;
1784 }
1785 op = gimple_call_lhs (stmt);
1786 /* Ignore #pragma omp declare simd functions
1787 if they don't have data references in the
1788 call stmt itself. */
1789 if (j == n
1790 && !(op
1791 && (DECL_P (op)
1792 || (REFERENCE_CLASS_P (op)
1793 && get_base_address (op)))))
1794 continue;
1795 }
1796 }
1797 }
1798 return res;
1799 }
1800 /* If dependence analysis will give up due to the limit on the
1801 number of datarefs stop here and fail fatally. */
1802 if (datarefs->length ()
1803 	  > (unsigned) PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS)
1804 return opt_result::failure_at (stmt, "exceeded param "
1805 "loop-max-datarefs-for-datadeps\n");
1806 }
1807 return opt_result::success ();
1808 }
1809
1810 /* Look for SLP-only access groups and turn each individual access into its own
1811 group. */
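/* For instance (purely illustrative), a group of four accesses
   a[4*i], a[4*i+1], a[4*i+2], a[4*i+3] with DR_GROUP_SIZE 4 that was
   marked SLP-only but did not end up being SLP-vectorized is split into
   four single-element groups, each with DR_GROUP_SIZE 1 and a
   DR_GROUP_GAP of 3 covering the other former group members.  */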
1812 static void
1813 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
1814 {
1815 unsigned int i;
1816 struct data_reference *dr;
1817
1818 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
1819
1820 vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs;
1821 FOR_EACH_VEC_ELT (datarefs, i, dr)
1822 {
1823 gcc_assert (DR_REF (dr));
1824 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
1825
1826 /* Check if the load is a part of an interleaving chain. */
1827 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1828 {
1829 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
1830 unsigned int group_size = DR_GROUP_SIZE (first_element);
1831
1832 /* Check if SLP-only groups. */
1833 if (!STMT_SLP_TYPE (stmt_info)
1834 && STMT_VINFO_SLP_VECT_ONLY (first_element))
1835 {
1836 /* Dissolve the group. */
1837 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
1838
1839 stmt_vec_info vinfo = first_element;
1840 while (vinfo)
1841 {
1842 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
1843 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
1844 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
1845 DR_GROUP_SIZE (vinfo) = 1;
1846 DR_GROUP_GAP (vinfo) = group_size - 1;
1847 vinfo = next;
1848 }
1849 }
1850 }
1851 }
1852 }
1853
1854 /* Function vect_analyze_loop_2.
1855
1856 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1857 for it. The different analyses will record information in the
1858 loop_vec_info struct. */
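/* A rough sketch of the order of the analyses performed below: gather and
   analyze the data references, classify scalar cycles and recognize
   patterns, analyze access patterns, mark the relevant stmts, analyze
   dependences and determine the vectorization factor, detect SLP, analyze
   alignment and prune runtime alias checks, verify the operations, decide
   on full masking, and finally run the cost model.  Some failures after
   the SLP decision jump to the `again' label and restart at `start_over'
   with SLP disabled.  */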
1859 static opt_result
1860 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1861 {
1862 opt_result ok = opt_result::success ();
1863 int res;
1864 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1865 poly_uint64 min_vf = 2;
1866
1867 /* The first group of checks is independent of the vector size. */
1868 fatal = true;
1869
1870 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
1871 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
1872 return opt_result::failure_at (vect_location,
1873 "not vectorized: simd if(0)\n");
1874
1875 /* Find all data references in the loop (which correspond to vdefs/vuses)
1876 and analyze their evolution in the loop. */
1877
1878 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1879
1880 /* Gather the data references and count stmts in the loop. */
1881 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1882 {
1883 opt_result res
1884 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1885 &LOOP_VINFO_DATAREFS (loop_vinfo),
1886 n_stmts);
1887 if (!res)
1888 {
1889 if (dump_enabled_p ())
1890 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1891 "not vectorized: loop contains function "
1892 "calls or data references that cannot "
1893 "be analyzed\n");
1894 return res;
1895 }
1896 loop_vinfo->shared->save_datarefs ();
1897 }
1898 else
1899 loop_vinfo->shared->check_datarefs ();
1900
1901 /* Analyze the data references and also adjust the minimal
1902 vectorization factor according to the loads and stores. */
1903
1904 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
1905 if (!ok)
1906 {
1907 if (dump_enabled_p ())
1908 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1909 "bad data references.\n");
1910 return ok;
1911 }
1912
1913 /* Classify all cross-iteration scalar data-flow cycles.
1914 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1915 vect_analyze_scalar_cycles (loop_vinfo);
1916
1917 vect_pattern_recog (loop_vinfo);
1918
1919 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1920
1921 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1922 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1923
1924 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1925 if (!ok)
1926 {
1927 if (dump_enabled_p ())
1928 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1929 "bad data access.\n");
1930 return ok;
1931 }
1932
1933 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1934
1935 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
1936 if (!ok)
1937 {
1938 if (dump_enabled_p ())
1939 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1940 "unexpected pattern.\n");
1941 return ok;
1942 }
1943
1944   /* The rest of the analysis below depends on the vector size in some way.  */
1945 fatal = false;
1946
1947 /* Analyze data dependences between the data-refs in the loop
1948 and adjust the maximum vectorization factor according to
1949 the dependences.
1950 FORNOW: fail at the first data dependence that we encounter. */
1951
1952 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1953 if (!ok)
1954 {
1955 if (dump_enabled_p ())
1956 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1957 "bad data dependence.\n");
1958 return ok;
1959 }
1960 if (max_vf != MAX_VECTORIZATION_FACTOR
1961 && maybe_lt (max_vf, min_vf))
1962 return opt_result::failure_at (vect_location, "bad data dependence.\n");
1963 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1964
1965 ok = vect_determine_vectorization_factor (loop_vinfo);
1966 if (!ok)
1967 {
1968 if (dump_enabled_p ())
1969 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1970 "can't determine vectorization factor.\n");
1971 return ok;
1972 }
1973 if (max_vf != MAX_VECTORIZATION_FACTOR
1974 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1975 return opt_result::failure_at (vect_location, "bad data dependence.\n");
1976
1977 /* Compute the scalar iteration cost. */
1978 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1979
1980 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1981 unsigned th;
1982
1983 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1984 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
1985 if (!ok)
1986 return ok;
1987
1988 /* If there are any SLP instances mark them as pure_slp. */
1989 bool slp = vect_make_slp_decision (loop_vinfo);
1990 if (slp)
1991 {
1992 /* Find stmts that need to be both vectorized and SLPed. */
1993 vect_detect_hybrid_slp (loop_vinfo);
1994
1995 /* Update the vectorization factor based on the SLP decision. */
1996 vect_update_vf_for_slp (loop_vinfo);
1997 }
1998
1999 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
2000
2001 /* We don't expect to have to roll back to anything other than an empty
2002 set of rgroups. */
2003 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2004
2005 /* This is the point where we can re-start analysis with SLP forced off. */
2006 start_over:
2007
2008 /* Now the vectorization factor is final. */
2009 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2010 gcc_assert (known_ne (vectorization_factor, 0U));
2011
2012 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2013 {
2014 dump_printf_loc (MSG_NOTE, vect_location,
2015 "vectorization_factor = ");
2016 dump_dec (MSG_NOTE, vectorization_factor);
2017 dump_printf (MSG_NOTE, ", niters = %wd\n",
2018 LOOP_VINFO_INT_NITERS (loop_vinfo));
2019 }
2020
2021 HOST_WIDE_INT max_niter
2022 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2023
2024 /* Analyze the alignment of the data-refs in the loop.
2025 Fail if a data reference is found that cannot be vectorized. */
2026
2027 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2028 if (!ok)
2029 {
2030 if (dump_enabled_p ())
2031 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2032 "bad data alignment.\n");
2033 return ok;
2034 }
2035
2036 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2037 It is important to call pruning after vect_analyze_data_ref_accesses,
2038 since we use grouping information gathered by interleaving analysis. */
2039 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2040 if (!ok)
2041 return ok;
2042
2043 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2044 vectorization, since we do not want to add extra peeling or
2045 add versioning for alignment. */
2046 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2047 /* This pass will decide on using loop versioning and/or loop peeling in
2048 order to enhance the alignment of data references in the loop. */
2049 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2050 else
2051 ok = vect_verify_datarefs_alignment (loop_vinfo);
2052 if (!ok)
2053 return ok;
2054
2055 if (slp)
2056 {
2057 /* Analyze operations in the SLP instances. Note this may
2058 remove unsupported SLP instances which makes the above
2059 SLP kind detection invalid. */
2060 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2061 vect_slp_analyze_operations (loop_vinfo);
2062 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2063 {
2064 ok = opt_result::failure_at (vect_location,
2065 "unsupported SLP instances\n");
2066 goto again;
2067 }
2068 }
2069
2070 /* Dissolve SLP-only groups. */
2071 vect_dissolve_slp_only_groups (loop_vinfo);
2072
2073 /* Scan all the remaining operations in the loop that are not subject
2074 to SLP and make sure they are vectorizable. */
2075 ok = vect_analyze_loop_operations (loop_vinfo);
2076 if (!ok)
2077 {
2078 if (dump_enabled_p ())
2079 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2080 "bad operation or unsupported loop bound.\n");
2081 return ok;
2082 }
2083
2084 /* Decide whether to use a fully-masked loop for this vectorization
2085 factor. */
2086 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2087 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2088 && vect_verify_full_masking (loop_vinfo));
2089 if (dump_enabled_p ())
2090 {
2091 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2092 dump_printf_loc (MSG_NOTE, vect_location,
2093 "using a fully-masked loop.\n");
2094 else
2095 dump_printf_loc (MSG_NOTE, vect_location,
2096 "not using a fully-masked loop.\n");
2097 }
2098
2099 /* If epilog loop is required because of data accesses with gaps,
2100      one additional iteration needs to be peeled.  Check if there are
2101      enough iterations for vectorization.  */
2102 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2103 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2104 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2105 {
2106 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2107 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2108
2109 if (known_lt (wi::to_widest (scalar_niters), vf))
2110 return opt_result::failure_at (vect_location,
2111 					   "loop does not have enough iterations"
2112 					   " to support peeling for gaps.\n");
2113 }
2114
2115 /* Check the costings of the loop make vectorizing worthwhile. */
2116 res = vect_analyze_loop_costing (loop_vinfo);
2117 if (res < 0)
2118 {
2119 ok = opt_result::failure_at (vect_location,
2120 "Loop costings may not be worthwhile.\n");
2121 goto again;
2122 }
2123 if (!res)
2124 return opt_result::failure_at (vect_location,
2125 "Loop costings not worthwhile.\n");
2126
2127 /* Decide whether we need to create an epilogue loop to handle
2128 remaining scalar iterations. */
2129 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2130
2131 unsigned HOST_WIDE_INT const_vf;
2132 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2133 /* The main loop handles all iterations. */
2134 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2135 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2136 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
2137 {
2138 /* Work out the (constant) number of iterations that need to be
2139 peeled for reasons other than niters. */
2140 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2141 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2142 peel_niter += 1;
2143 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
2144 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2145 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2146 }
2147 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2148 /* ??? When peeling for gaps but not alignment, we could
2149 try to check whether the (variable) niters is known to be
2150 VF * N + 1. That's something of a niche case though. */
2151 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2152 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2153 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2154 < (unsigned) exact_log2 (const_vf))
2155 /* In case of versioning, check if the maximum number of
2156 iterations is greater than th. If they are identical,
2157 the epilogue is unnecessary. */
2158 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2159 || ((unsigned HOST_WIDE_INT) max_niter
2160 > (th / const_vf) * const_vf))))
2161 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
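  /* Illustration with assumed numbers: with a known iteration count of 103,
     3 iterations peeled for alignment, no peeling for gaps and a
     vectorization factor of 8, 103 - 3 == 100 is not a multiple of 8, so an
     epilogue loop is needed; with a count of 99 instead, 96 is a multiple
     of 8 and no epilogue is needed.  */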
2162
2163 /* If an epilogue loop is required make sure we can create one. */
2164 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2165 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2166 {
2167 if (dump_enabled_p ())
2168 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2169 if (!vect_can_advance_ivs_p (loop_vinfo)
2170 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2171 single_exit (LOOP_VINFO_LOOP
2172 (loop_vinfo))))
2173 {
2174 ok = opt_result::failure_at (vect_location,
2175 "not vectorized: can't create required "
2176 "epilog loop\n");
2177 goto again;
2178 }
2179 }
2180
2181   /* During peeling, we need to check that the number of loop iterations
2182      is enough for both the peeled prolog loop and the vector loop.  This check
2183 can be merged along with threshold check of loop versioning, so
2184 increase threshold for this case if necessary. */
2185 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2186 {
2187 poly_uint64 niters_th = 0;
2188
2189 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2190 {
2191 /* Niters for peeled prolog loop. */
2192 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2193 {
2194 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2195 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2196 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2197 }
2198 else
2199 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2200 }
2201
2202 /* Niters for at least one iteration of vectorized loop. */
2203 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2204 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2205 /* One additional iteration because of peeling for gap. */
2206 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2207 niters_th += 1;
2208 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2209 }
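  /* Illustration with assumed numbers: for a not-fully-masked loop with an
     unknown (negative) prologue peel amount and a 4-lane vector type for
     the unaligned access, the prologue contributes 4 - 1 == 3, one full
     vector iteration adds VF == 4, and peeling for gaps adds 1, giving a
     versioning threshold of 8.  */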
2210
2211 gcc_assert (known_eq (vectorization_factor,
2212 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2213
2214 /* Ok to vectorize! */
2215 return opt_result::success ();
2216
2217 again:
2218 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2219 gcc_assert (!ok);
2220
2221 /* Try again with SLP forced off but if we didn't do any SLP there is
2222 no point in re-trying. */
2223 if (!slp)
2224 return ok;
2225
2226 /* If there are reduction chains re-trying will fail anyway. */
2227 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2228 return ok;
2229
2230 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2231 via interleaving or lane instructions. */
2232 slp_instance instance;
2233 slp_tree node;
2234 unsigned i, j;
2235 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2236 {
2237 stmt_vec_info vinfo;
2238 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2239 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2240 continue;
2241 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2242 unsigned int size = DR_GROUP_SIZE (vinfo);
2243 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2244 if (! vect_store_lanes_supported (vectype, size, false)
2245 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2246 && ! vect_grouped_store_supported (vectype, size))
2247 return opt_result::failure_at (vinfo->stmt,
2248 "unsupported grouped store\n");
2249 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2250 {
2251 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2252 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2253 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2254 size = DR_GROUP_SIZE (vinfo);
2255 vectype = STMT_VINFO_VECTYPE (vinfo);
2256 if (! vect_load_lanes_supported (vectype, size, false)
2257 && ! vect_grouped_load_supported (vectype, single_element_p,
2258 size))
2259 return opt_result::failure_at (vinfo->stmt,
2260 "unsupported grouped load\n");
2261 }
2262 }
2263
2264 if (dump_enabled_p ())
2265 dump_printf_loc (MSG_NOTE, vect_location,
2266 "re-trying with SLP disabled\n");
2267
2268 /* Roll back state appropriately. No SLP this time. */
2269 slp = false;
2270   /* Restore the vectorization factor to what it was without SLP.  */
2271 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2272 /* Free the SLP instances. */
2273 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2274 vect_free_slp_instance (instance, false);
2275 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2276 /* Reset SLP type to loop_vect on all stmts. */
2277 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2278 {
2279 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2280 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2281 !gsi_end_p (si); gsi_next (&si))
2282 {
2283 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2284 STMT_SLP_TYPE (stmt_info) = loop_vect;
2285 }
2286 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2287 !gsi_end_p (si); gsi_next (&si))
2288 {
2289 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2290 STMT_SLP_TYPE (stmt_info) = loop_vect;
2291 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2292 {
2293 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2294 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2295 STMT_SLP_TYPE (stmt_info) = loop_vect;
2296 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2297 !gsi_end_p (pi); gsi_next (&pi))
2298 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2299 = loop_vect;
2300 }
2301 }
2302 }
2303 /* Free optimized alias test DDRS. */
2304 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2305 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2306 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2307 /* Reset target cost data. */
2308 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2309 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2310 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2311 /* Reset accumulated rgroup information. */
2312 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2313 /* Reset assorted flags. */
2314 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2315 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2316 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2317 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2318 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2319
2320 goto start_over;
2321 }
2322
2323 /* Function vect_analyze_loop.
2324
2325 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2326 for it. The different analyses will record information in the
2327 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL epilogue must
2328 be vectorized. */
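/* Roughly, the analysis below is repeated for each vector size the target
   advertises via autovectorize_vector_sizes, starting from the autodetected
   one.  When the loop has a simdlen clause, a size whose vectorization
   factor matches simdlen is preferred and the first other successful
   analysis is only kept as a fallback; otherwise the first size that
   analyzes successfully is used.  */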
2329 opt_loop_vec_info
2330 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
2331 vec_info_shared *shared)
2332 {
2333 auto_vector_sizes vector_sizes;
2334
2335 /* Autodetect first vector size we try. */
2336 current_vector_size = 0;
2337 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes,
2338 loop->simdlen != 0);
2339 unsigned int next_size = 0;
2340
2341 DUMP_VECT_SCOPE ("analyze_loop_nest");
2342
2343 if (loop_outer (loop)
2344 && loop_vec_info_for_loop (loop_outer (loop))
2345 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2346 return opt_loop_vec_info::failure_at (vect_location,
2347 "outer-loop already vectorized.\n");
2348
2349 if (!find_loop_nest (loop, &shared->loop_nest))
2350 return opt_loop_vec_info::failure_at
2351 (vect_location,
2352 "not vectorized: loop nest containing two or more consecutive inner"
2353 " loops cannot be vectorized\n");
2354
2355 unsigned n_stmts = 0;
2356 poly_uint64 autodetected_vector_size = 0;
2357 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2358 poly_uint64 first_vector_size = 0;
2359 while (1)
2360 {
2361 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2362 opt_loop_vec_info loop_vinfo
2363 = vect_analyze_loop_form (loop, shared);
2364 if (!loop_vinfo)
2365 {
2366 if (dump_enabled_p ())
2367 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2368 "bad loop form.\n");
2369 gcc_checking_assert (first_loop_vinfo == NULL);
2370 return loop_vinfo;
2371 }
2372
2373 bool fatal = false;
2374
2375 if (orig_loop_vinfo)
2376 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2377
2378 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2379 if (res)
2380 {
2381 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2382
2383 if (loop->simdlen
2384 && maybe_ne (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2385 (unsigned HOST_WIDE_INT) loop->simdlen))
2386 {
2387 if (first_loop_vinfo == NULL)
2388 {
2389 first_loop_vinfo = loop_vinfo;
2390 first_vector_size = current_vector_size;
2391 loop->aux = NULL;
2392 }
2393 else
2394 delete loop_vinfo;
2395 }
2396 else
2397 {
2398 delete first_loop_vinfo;
2399 return loop_vinfo;
2400 }
2401 }
2402 else
2403 delete loop_vinfo;
2404
2405 if (next_size == 0)
2406 autodetected_vector_size = current_vector_size;
2407
2408 if (next_size < vector_sizes.length ()
2409 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2410 next_size += 1;
2411
2412 if (fatal)
2413 {
2414 gcc_checking_assert (first_loop_vinfo == NULL);
2415 return opt_loop_vec_info::propagate_failure (res);
2416 }
2417
2418 if (next_size == vector_sizes.length ()
2419 || known_eq (current_vector_size, 0U))
2420 {
2421 if (first_loop_vinfo)
2422 {
2423 current_vector_size = first_vector_size;
2424 loop->aux = (loop_vec_info) first_loop_vinfo;
2425 if (dump_enabled_p ())
2426 {
2427 dump_printf_loc (MSG_NOTE, vect_location,
2428 "***** Choosing vector size ");
2429 dump_dec (MSG_NOTE, current_vector_size);
2430 dump_printf (MSG_NOTE, "\n");
2431 }
2432 return first_loop_vinfo;
2433 }
2434 else
2435 return opt_loop_vec_info::propagate_failure (res);
2436 }
2437
2438 /* Try the next biggest vector size. */
2439 current_vector_size = vector_sizes[next_size++];
2440 if (dump_enabled_p ())
2441 {
2442 dump_printf_loc (MSG_NOTE, vect_location,
2443 "***** Re-trying analysis with "
2444 "vector size ");
2445 dump_dec (MSG_NOTE, current_vector_size);
2446 dump_printf (MSG_NOTE, "\n");
2447 }
2448 }
2449 }
2450
2451 /* Return true if there is an in-order reduction function for CODE, storing
2452 it in *REDUC_FN if so. */
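/* For example, a reduction such as

     double s = 0.0;
     for (int i = 0; i < n; i++)
       s += a[i];

   compiled without -fassociative-math must perform the additions in the
   original left-to-right order; IFN_FOLD_LEFT_PLUS expresses that by
   folding the vector elements into the scalar accumulator one by one.  */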
2453
2454 static bool
2455 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2456 {
2457 switch (code)
2458 {
2459 case PLUS_EXPR:
2460 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2461 return true;
2462
2463 default:
2464 return false;
2465 }
2466 }
2467
2468 /* Function reduction_fn_for_scalar_code
2469
2470 Input:
2471 CODE - tree_code of a reduction operations.
2472
2473 Output:
2474 REDUC_FN - the corresponding internal function to be used to reduce the
2475 vector of partial results into a single scalar result, or IFN_LAST
2476 if the operation is a supported reduction operation, but does not have
2477 such an internal function.
2478
2479 Return FALSE if CODE currently cannot be vectorized as reduction. */
2480
2481 static bool
2482 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2483 {
2484 switch (code)
2485 {
2486 case MAX_EXPR:
2487 *reduc_fn = IFN_REDUC_MAX;
2488 return true;
2489
2490 case MIN_EXPR:
2491 *reduc_fn = IFN_REDUC_MIN;
2492 return true;
2493
2494 case PLUS_EXPR:
2495 *reduc_fn = IFN_REDUC_PLUS;
2496 return true;
2497
2498 case BIT_AND_EXPR:
2499 *reduc_fn = IFN_REDUC_AND;
2500 return true;
2501
2502 case BIT_IOR_EXPR:
2503 *reduc_fn = IFN_REDUC_IOR;
2504 return true;
2505
2506 case BIT_XOR_EXPR:
2507 *reduc_fn = IFN_REDUC_XOR;
2508 return true;
2509
2510 case MULT_EXPR:
2511 case MINUS_EXPR:
2512 *reduc_fn = IFN_LAST;
2513 return true;
2514
2515 default:
2516 return false;
2517 }
2518 }
2519
2520 /* If there is a neutral value X such that SLP reduction NODE would not
2521 be affected by the introduction of additional X elements, return that X,
2522 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2523 is true if the SLP statements perform a single reduction, false if each
2524 statement performs an independent reduction. */
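/* For example (illustrative only), when the reduction statements do not
   fill a whole vector, the remaining lanes can be padded with the neutral
   value: 0 for PLUS/IOR/XOR, 1 for MULT, all-ones for AND; padding a
   partial vector of sums with zeros leaves the final result unchanged.  */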
2525
2526 static tree
2527 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2528 bool reduc_chain)
2529 {
2530 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2531 stmt_vec_info stmt_vinfo = stmts[0];
2532 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2533 tree scalar_type = TREE_TYPE (vector_type);
2534 struct loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2535 gcc_assert (loop);
2536
2537 switch (code)
2538 {
2539 case WIDEN_SUM_EXPR:
2540 case DOT_PROD_EXPR:
2541 case SAD_EXPR:
2542 case PLUS_EXPR:
2543 case MINUS_EXPR:
2544 case BIT_IOR_EXPR:
2545 case BIT_XOR_EXPR:
2546 return build_zero_cst (scalar_type);
2547
2548 case MULT_EXPR:
2549 return build_one_cst (scalar_type);
2550
2551 case BIT_AND_EXPR:
2552 return build_all_ones_cst (scalar_type);
2553
2554 case MAX_EXPR:
2555 case MIN_EXPR:
2556 /* For MIN/MAX the initial values are neutral. A reduction chain
2557 has only a single initial value, so that value is neutral for
2558 all statements. */
2559 if (reduc_chain)
2560 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2561 loop_preheader_edge (loop));
2562 return NULL_TREE;
2563
2564 default:
2565 return NULL_TREE;
2566 }
2567 }
2568
2569 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2570 STMT is printed with a message MSG. */
2571
2572 static void
2573 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2574 {
2575 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
2576 }
2577
2578 /* DEF_STMT_INFO occurs in a loop that contains a potential reduction
2579 operation. Return true if the results of DEF_STMT_INFO are something
2580 that can be accumulated by such a reduction. */
2581
2582 static bool
2583 vect_valid_reduction_input_p (stmt_vec_info def_stmt_info)
2584 {
2585 return (is_gimple_assign (def_stmt_info->stmt)
2586 || is_gimple_call (def_stmt_info->stmt)
2587 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_induction_def
2588 || (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI
2589 && STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_internal_def
2590 && !is_loop_header_bb_p (gimple_bb (def_stmt_info->stmt))));
2591 }
2592
2593 /* Detect SLP reduction of the form:
2594
2595 #a1 = phi <a5, a0>
2596 a2 = operation (a1)
2597 a3 = operation (a2)
2598 a4 = operation (a3)
2599 a5 = operation (a4)
2600
2601 #a = phi <a5>
2602
2603 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2604 FIRST_STMT is the first reduction stmt in the chain
2605 (a2 = operation (a1)).
2606
2607 Return TRUE if a reduction chain was detected. */
2608
2609 static bool
2610 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2611 gimple *first_stmt)
2612 {
2613 struct loop *loop = (gimple_bb (phi))->loop_father;
2614 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2615 enum tree_code code;
2616 gimple *loop_use_stmt = NULL;
2617 stmt_vec_info use_stmt_info;
2618 tree lhs;
2619 imm_use_iterator imm_iter;
2620 use_operand_p use_p;
2621 int nloop_uses, size = 0, n_out_of_loop_uses;
2622 bool found = false;
2623
2624 if (loop != vect_loop)
2625 return false;
2626
2627 auto_vec<stmt_vec_info, 8> reduc_chain;
2628 lhs = PHI_RESULT (phi);
2629 code = gimple_assign_rhs_code (first_stmt);
2630 while (1)
2631 {
2632 nloop_uses = 0;
2633 n_out_of_loop_uses = 0;
2634 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2635 {
2636 gimple *use_stmt = USE_STMT (use_p);
2637 if (is_gimple_debug (use_stmt))
2638 continue;
2639
2640 /* Check if we got back to the reduction phi. */
2641 if (use_stmt == phi)
2642 {
2643 loop_use_stmt = use_stmt;
2644 found = true;
2645 break;
2646 }
2647
2648 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2649 {
2650 loop_use_stmt = use_stmt;
2651 nloop_uses++;
2652 }
2653 else
2654 n_out_of_loop_uses++;
2655
2656 	  /* There can be either a single use in the loop or two uses in
2657 phi nodes. */
2658 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2659 return false;
2660 }
2661
2662 if (found)
2663 break;
2664
2665 /* We reached a statement with no loop uses. */
2666 if (nloop_uses == 0)
2667 return false;
2668
2669 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2670 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2671 return false;
2672
2673 if (!is_gimple_assign (loop_use_stmt)
2674 || code != gimple_assign_rhs_code (loop_use_stmt)
2675 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2676 return false;
2677
2678 /* Insert USE_STMT into reduction chain. */
2679 use_stmt_info = loop_info->lookup_stmt (loop_use_stmt);
2680 reduc_chain.safe_push (use_stmt_info);
2681
2682 lhs = gimple_assign_lhs (loop_use_stmt);
2683 size++;
2684 }
2685
2686 if (!found || loop_use_stmt != phi || size < 2)
2687 return false;
2688
2689 /* Swap the operands, if needed, to make the reduction operand be the second
2690 operand. */
2691 lhs = PHI_RESULT (phi);
2692 for (unsigned i = 0; i < reduc_chain.length (); ++i)
2693 {
2694 gassign *next_stmt = as_a <gassign *> (reduc_chain[i]->stmt);
2695 if (gimple_assign_rhs2 (next_stmt) == lhs)
2696 {
2697 tree op = gimple_assign_rhs1 (next_stmt);
2698 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2699
2700 /* Check that the other def is either defined in the loop
2701 ("vect_internal_def"), or it's an induction (defined by a
2702 loop-header phi-node). */
2703 if (def_stmt_info
2704 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2705 && vect_valid_reduction_input_p (def_stmt_info))
2706 {
2707 lhs = gimple_assign_lhs (next_stmt);
2708 continue;
2709 }
2710
2711 return false;
2712 }
2713 else
2714 {
2715 tree op = gimple_assign_rhs2 (next_stmt);
2716 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2717
2718 /* Check that the other def is either defined in the loop
2719 ("vect_internal_def"), or it's an induction (defined by a
2720 loop-header phi-node). */
2721 if (def_stmt_info
2722 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2723 && vect_valid_reduction_input_p (def_stmt_info))
2724 {
2725 if (dump_enabled_p ())
2726 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: %G",
2727 next_stmt);
2728
2729 swap_ssa_operands (next_stmt,
2730 gimple_assign_rhs1_ptr (next_stmt),
2731 gimple_assign_rhs2_ptr (next_stmt));
2732 update_stmt (next_stmt);
2733
2734 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2735 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2736 }
2737 else
2738 return false;
2739 }
2740
2741 lhs = gimple_assign_lhs (next_stmt);
2742 }
2743
2744 /* Build up the actual chain. */
2745 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
2746 {
2747 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
2748 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
2749 }
2750 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
2751 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
2752
2753 /* Save the chain for further analysis in SLP detection. */
2754 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
2755 REDUC_GROUP_SIZE (reduc_chain[0]) = size;
2756
2757 return true;
2758 }
2759
2760 /* Return true if we need an in-order reduction for operation CODE
2761 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2762 overflow must wrap. */
2763
2764 static bool
2765 needs_fold_left_reduction_p (tree type, tree_code code,
2766 bool need_wrapping_integral_overflow)
2767 {
2768 /* CHECKME: check for !flag_finite_math_only too? */
2769 if (SCALAR_FLOAT_TYPE_P (type))
2770 switch (code)
2771 {
2772 case MIN_EXPR:
2773 case MAX_EXPR:
2774 return false;
2775
2776 default:
2777 return !flag_associative_math;
2778 }
2779
2780 if (INTEGRAL_TYPE_P (type))
2781 {
2782 if (!operation_no_trapping_overflow (type, code))
2783 return true;
2784 if (need_wrapping_integral_overflow
2785 && !TYPE_OVERFLOW_WRAPS (type)
2786 && operation_can_overflow (code))
2787 return true;
2788 return false;
2789 }
2790
2791 if (SAT_FIXED_POINT_TYPE_P (type))
2792 return true;
2793
2794 return false;
2795 }
2796
2797 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2798 reduction operation CODE has a handled computation expression. */
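/* As an illustration, for CODE == PLUS_EXPR a path such as

     x_1 = PHI <x_4, ...>
     x_2 = x_1 + a[i];
     x_3 = x_2 - b[i];
     x_4 = x_3 + c[i];

   is accepted (the MINUS keeps the running value as its first operand),
   whereas a statement like x_3 = b[i] - x_2 negates the running value;
   the walk below tracks such negations and rejects the path if the value
   would end up negated.  */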
2799
2800 bool
2801 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2802 tree loop_arg, enum tree_code code)
2803 {
2804 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2805 auto_bitmap visited;
2806 tree lookfor = PHI_RESULT (phi);
2807 ssa_op_iter curri;
2808 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2809 while (USE_FROM_PTR (curr) != loop_arg)
2810 curr = op_iter_next_use (&curri);
2811 curri.i = curri.numops;
2812 do
2813 {
2814 path.safe_push (std::make_pair (curri, curr));
2815 tree use = USE_FROM_PTR (curr);
2816 if (use == lookfor)
2817 break;
2818 gimple *def = SSA_NAME_DEF_STMT (use);
2819 if (gimple_nop_p (def)
2820 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2821 {
2822 pop:
2823 do
2824 {
2825 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2826 curri = x.first;
2827 curr = x.second;
2828 do
2829 curr = op_iter_next_use (&curri);
2830 /* Skip already visited or non-SSA operands (from iterating
2831 over PHI args). */
2832 while (curr != NULL_USE_OPERAND_P
2833 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2834 || ! bitmap_set_bit (visited,
2835 SSA_NAME_VERSION
2836 (USE_FROM_PTR (curr)))));
2837 }
2838 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2839 if (curr == NULL_USE_OPERAND_P)
2840 break;
2841 }
2842 else
2843 {
2844 if (gimple_code (def) == GIMPLE_PHI)
2845 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2846 else
2847 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2848 while (curr != NULL_USE_OPERAND_P
2849 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2850 || ! bitmap_set_bit (visited,
2851 SSA_NAME_VERSION
2852 (USE_FROM_PTR (curr)))))
2853 curr = op_iter_next_use (&curri);
2854 if (curr == NULL_USE_OPERAND_P)
2855 goto pop;
2856 }
2857 }
2858 while (1);
2859 if (dump_file && (dump_flags & TDF_DETAILS))
2860 {
2861 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2862 unsigned i;
2863 std::pair<ssa_op_iter, use_operand_p> *x;
2864 FOR_EACH_VEC_ELT (path, i, x)
2865 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
2866 dump_printf (MSG_NOTE, "\n");
2867 }
2868
2869 /* Check whether the reduction path detected is valid. */
2870 bool fail = path.length () == 0;
2871 bool neg = false;
2872 for (unsigned i = 1; i < path.length (); ++i)
2873 {
2874 gimple *use_stmt = USE_STMT (path[i].second);
2875 tree op = USE_FROM_PTR (path[i].second);
2876 if (! has_single_use (op)
2877 || ! is_gimple_assign (use_stmt))
2878 {
2879 fail = true;
2880 break;
2881 }
2882 if (gimple_assign_rhs_code (use_stmt) != code)
2883 {
2884 if (code == PLUS_EXPR
2885 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2886 {
2887 /* Track whether we negate the reduction value each iteration. */
2888 if (gimple_assign_rhs2 (use_stmt) == op)
2889 neg = ! neg;
2890 }
2891 else
2892 {
2893 fail = true;
2894 break;
2895 }
2896 }
2897 }
2898 return ! fail && ! neg;
2899 }
2900
2901
2902 /* Function vect_is_simple_reduction
2903
2904 (1) Detect a cross-iteration def-use cycle that represents a simple
2905 reduction computation. We look for the following pattern:
2906
2907 loop_header:
2908 a1 = phi < a0, a2 >
2909 a3 = ...
2910 a2 = operation (a3, a1)
2911
2912 or
2913
2914 a3 = ...
2915 loop_header:
2916 a1 = phi < a0, a2 >
2917 a2 = operation (a3, a1)
2918
2919 such that:
2920 1. operation is commutative and associative and it is safe to
2921 change the order of the computation
2922 2. no uses for a2 in the loop (a2 is used out of the loop)
2923 3. no uses of a1 in the loop besides the reduction operation
2924 4. no uses of a1 outside the loop.
2925
2926 Conditions 1,4 are tested here.
2927 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2928
2929 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2930 nested cycles.
2931
2932 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2933 reductions:
2934
2935 a1 = phi < a0, a2 >
2936 inner loop (def of a3)
2937 a2 = phi < a3 >
2938
2939 (4) Detect condition expressions, ie:
2940 for (int i = 0; i < N; i++)
2941 if (a[i] < val)
2942 ret_val = a[i];
2943
2944 */
2945
2946 static stmt_vec_info
2947 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
2948 bool *double_reduc,
2949 bool need_wrapping_integral_overflow,
2950 enum vect_reduction_type *v_reduc_type)
2951 {
2952 gphi *phi = as_a <gphi *> (phi_info->stmt);
2953 struct loop *loop = (gimple_bb (phi))->loop_father;
2954 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2955 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
2956 gimple *phi_use_stmt = NULL;
2957 enum tree_code orig_code, code;
2958 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2959 tree type;
2960 tree name;
2961 imm_use_iterator imm_iter;
2962 use_operand_p use_p;
2963 bool phi_def;
2964
2965 *double_reduc = false;
2966 *v_reduc_type = TREE_CODE_REDUCTION;
2967
2968 tree phi_name = PHI_RESULT (phi);
2969 /* ??? If there are no uses of the PHI result the inner loop reduction
2970 won't be detected as possibly double-reduction by vectorizable_reduction
2971 because that tries to walk the PHI arg from the preheader edge which
2972 can be constant. See PR60382. */
2973 if (has_zero_uses (phi_name))
2974 return NULL;
2975 unsigned nphi_def_loop_uses = 0;
2976 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2977 {
2978 gimple *use_stmt = USE_STMT (use_p);
2979 if (is_gimple_debug (use_stmt))
2980 continue;
2981
2982 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2983 {
2984 if (dump_enabled_p ())
2985 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2986 "intermediate value used outside loop.\n");
2987
2988 return NULL;
2989 }
2990
2991 nphi_def_loop_uses++;
2992 phi_use_stmt = use_stmt;
2993 }
2994
2995 edge latch_e = loop_latch_edge (loop);
2996 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2997 if (TREE_CODE (loop_arg) != SSA_NAME)
2998 {
2999 if (dump_enabled_p ())
3000 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3001 "reduction: not ssa_name: %T\n", loop_arg);
3002 return NULL;
3003 }
3004
3005 stmt_vec_info def_stmt_info = loop_info->lookup_def (loop_arg);
3006 if (!def_stmt_info
3007 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3008 return NULL;
3009
3010 if (gassign *def_stmt = dyn_cast <gassign *> (def_stmt_info->stmt))
3011 {
3012 name = gimple_assign_lhs (def_stmt);
3013 phi_def = false;
3014 }
3015 else if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3016 {
3017 name = PHI_RESULT (def_stmt);
3018 phi_def = true;
3019 }
3020 else
3021 {
3022 if (dump_enabled_p ())
3023 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3024 "reduction: unhandled reduction operation: %G",
3025 def_stmt_info->stmt);
3026 return NULL;
3027 }
3028
3029 unsigned nlatch_def_loop_uses = 0;
3030 auto_vec<gphi *, 3> lcphis;
3031 bool inner_loop_of_double_reduc = false;
3032 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
3033 {
3034 gimple *use_stmt = USE_STMT (use_p);
3035 if (is_gimple_debug (use_stmt))
3036 continue;
3037 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3038 nlatch_def_loop_uses++;
3039 else
3040 {
3041 /* We can have more than one loop-closed PHI. */
3042 lcphis.safe_push (as_a <gphi *> (use_stmt));
3043 if (nested_in_vect_loop
3044 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3045 == vect_double_reduction_def))
3046 inner_loop_of_double_reduc = true;
3047 }
3048 }
3049
3050 /* If this isn't a nested cycle or if the nested cycle reduction value
3051      is used outside of the inner loop we cannot handle uses of the reduction
3052 value. */
3053 if ((!nested_in_vect_loop || inner_loop_of_double_reduc)
3054 && (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1))
3055 {
3056 if (dump_enabled_p ())
3057 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3058 "reduction used in loop.\n");
3059 return NULL;
3060 }
3061
3062 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3063 defined in the inner loop. */
3064 if (phi_def)
3065 {
3066 gphi *def_stmt = as_a <gphi *> (def_stmt_info->stmt);
3067 op1 = PHI_ARG_DEF (def_stmt, 0);
3068
3069 if (gimple_phi_num_args (def_stmt) != 1
3070 || TREE_CODE (op1) != SSA_NAME)
3071 {
3072 if (dump_enabled_p ())
3073 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3074 "unsupported phi node definition.\n");
3075
3076 return NULL;
3077 }
3078
3079 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3080 if (gimple_bb (def1)
3081 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3082 && loop->inner
3083 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3084 && is_gimple_assign (def1)
3085 && is_a <gphi *> (phi_use_stmt)
3086 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3087 {
3088 if (dump_enabled_p ())
3089 report_vect_op (MSG_NOTE, def_stmt,
3090 "detected double reduction: ");
3091
3092 *double_reduc = true;
3093 return def_stmt_info;
3094 }
3095
3096 return NULL;
3097 }
3098
3099   /* If we are vectorizing an inner reduction, it is executed in the
3100      original order only when we are not dealing with a double
3101      reduction.  */
3102 bool check_reduction = true;
3103 if (flow_loop_nested_p (vect_loop, loop))
3104 {
3105 gphi *lcphi;
3106 unsigned i;
3107 check_reduction = false;
3108 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3109 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3110 {
3111 gimple *use_stmt = USE_STMT (use_p);
3112 if (is_gimple_debug (use_stmt))
3113 continue;
3114 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3115 check_reduction = true;
3116 }
3117 }
3118
3119 gassign *def_stmt = as_a <gassign *> (def_stmt_info->stmt);
3120 code = orig_code = gimple_assign_rhs_code (def_stmt);
3121
3122 if (nested_in_vect_loop && !check_reduction)
3123 {
3124 /* FIXME: Even for non-reductions code generation is funneled
3125 through vectorizable_reduction for the stmt defining the
3126 PHI latch value. So we have to artificially restrict ourselves
3127 for the supported operations. */
3128 switch (get_gimple_rhs_class (code))
3129 {
3130 case GIMPLE_BINARY_RHS:
3131 case GIMPLE_TERNARY_RHS:
3132 break;
3133 default:
3134 /* Not supported by vectorizable_reduction. */
3135 if (dump_enabled_p ())
3136 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3137 "nested cycle: not handled operation: ");
3138 return NULL;
3139 }
3140 if (dump_enabled_p ())
3141 report_vect_op (MSG_NOTE, def_stmt, "detected nested cycle: ");
3142 return def_stmt_info;
3143 }
3144
3145 /* We can handle "res -= x[i]", which is non-associative by
3146 simply rewriting this into "res += -x[i]". Avoid changing
3147 gimple instruction for the first simple tests and only do this
3148 if we're allowed to change code at all. */
3149 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3150 code = PLUS_EXPR;
3151
3152 if (code == COND_EXPR)
3153 {
3154 if (! nested_in_vect_loop)
3155 *v_reduc_type = COND_REDUCTION;
3156
3157 op3 = gimple_assign_rhs1 (def_stmt);
3158 if (COMPARISON_CLASS_P (op3))
3159 {
3160 op4 = TREE_OPERAND (op3, 1);
3161 op3 = TREE_OPERAND (op3, 0);
3162 }
3163 if (op3 == phi_name || op4 == phi_name)
3164 {
3165 if (dump_enabled_p ())
3166 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3167 "reduction: condition depends on previous"
3168 " iteration: ");
3169 return NULL;
3170 }
3171
3172 op1 = gimple_assign_rhs2 (def_stmt);
3173 op2 = gimple_assign_rhs3 (def_stmt);
3174 }
3175 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3176 {
3177 if (dump_enabled_p ())
3178 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3179 "reduction: not commutative/associative: ");
3180 return NULL;
3181 }
3182 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3183 {
3184 op1 = gimple_assign_rhs1 (def_stmt);
3185 op2 = gimple_assign_rhs2 (def_stmt);
3186 }
3187 else
3188 {
3189 if (dump_enabled_p ())
3190 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3191 "reduction: not handled operation: ");
3192 return NULL;
3193 }
3194
3195 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3196 {
3197 if (dump_enabled_p ())
3198 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3199 "reduction: both uses not ssa_names: ");
3200
3201 return NULL;
3202 }
3203
3204 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3205 if ((TREE_CODE (op1) == SSA_NAME
3206       && !types_compatible_p (type, TREE_TYPE (op1)))
3207 || (TREE_CODE (op2) == SSA_NAME
3208 && !types_compatible_p (type, TREE_TYPE (op2)))
3209 || (op3 && TREE_CODE (op3) == SSA_NAME
3210 && !types_compatible_p (type, TREE_TYPE (op3)))
3211 || (op4 && TREE_CODE (op4) == SSA_NAME
3212 && !types_compatible_p (type, TREE_TYPE (op4))))
3213 {
3214 if (dump_enabled_p ())
3215 {
3216 dump_printf_loc (MSG_NOTE, vect_location,
3217 "reduction: multiple types: operation type: "
3218 "%T, operands types: %T,%T",
3219 type, TREE_TYPE (op1), TREE_TYPE (op2));
3220 if (op3)
3221 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op3));
3222
3223 if (op4)
3224 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op4));
3225 dump_printf (MSG_NOTE, "\n");
3226 }
3227
3228 return NULL;
3229 }
3230
3231 /* Check whether it's ok to change the order of the computation.
3232 Generally, when vectorizing a reduction we change the order of the
3233 computation. This may change the behavior of the program in some
3234 cases, so we need to check that this is ok. One exception is when
3235 vectorizing an outer-loop: the inner-loop is executed sequentially,
3236 and therefore vectorizing reductions in the inner-loop during
3237 outer-loop vectorization is safe. */
3238 if (check_reduction
3239 && *v_reduc_type == TREE_CODE_REDUCTION
3240 && needs_fold_left_reduction_p (type, code,
3241 need_wrapping_integral_overflow))
3242 *v_reduc_type = FOLD_LEFT_REDUCTION;
3243
3244 /* Reduction is safe. We're dealing with one of the following:
3245 1) integer arithmetic and no trapv
3246 2) floating point arithmetic, and special flags permit this optimization
3247 3) nested cycle (i.e., outer loop vectorization). */
3248 stmt_vec_info def1_info = loop_info->lookup_def (op1);
3249 stmt_vec_info def2_info = loop_info->lookup_def (op2);
3250 if (code != COND_EXPR && !def1_info && !def2_info)
3251 {
3252 if (dump_enabled_p ())
3253 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3254 return NULL;
3255 }
3256
3257 /* Check that one def is the reduction def, defined by PHI,
3258 the other def is either defined in the loop ("vect_internal_def"),
3259 or it's an induction (defined by a loop-header phi-node). */
3260
3261 if (def2_info
3262 && def2_info->stmt == phi
3263 && (code == COND_EXPR
3264 || !def1_info
3265 || !flow_bb_inside_loop_p (loop, gimple_bb (def1_info->stmt))
3266 || vect_valid_reduction_input_p (def1_info)))
3267 {
3268 if (dump_enabled_p ())
3269 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3270 return def_stmt_info;
3271 }
3272
3273 if (def1_info
3274 && def1_info->stmt == phi
3275 && (code == COND_EXPR
3276 || !def2_info
3277 || !flow_bb_inside_loop_p (loop, gimple_bb (def2_info->stmt))
3278 || vect_valid_reduction_input_p (def2_info)))
3279 {
3280 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3281 {
3282 /* Check if we can swap operands (just for simplicity - so that
3283 the rest of the code can assume that the reduction variable
3284 is always the last (second) argument). */
3285 if (code == COND_EXPR)
3286 {
3287 /* Swap cond_expr by inverting the condition. */
3288 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3289 enum tree_code invert_code = ERROR_MARK;
3290 enum tree_code cond_code = TREE_CODE (cond_expr);
3291
3292 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3293 {
3294 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3295 invert_code = invert_tree_comparison (cond_code, honor_nans);
3296 }
3297 if (invert_code != ERROR_MARK)
3298 {
3299 TREE_SET_CODE (cond_expr, invert_code);
3300 swap_ssa_operands (def_stmt,
3301 gimple_assign_rhs2_ptr (def_stmt),
3302 gimple_assign_rhs3_ptr (def_stmt));
3303 }
3304 else
3305 {
3306 if (dump_enabled_p ())
3307 report_vect_op (MSG_NOTE, def_stmt,
3308 "detected reduction: cannot swap operands "
3309 "for cond_expr");
3310 return NULL;
3311 }
3312 }
3313 else
3314 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3315 gimple_assign_rhs2_ptr (def_stmt));
3316
3317 if (dump_enabled_p ())
3318 report_vect_op (MSG_NOTE, def_stmt,
3319 "detected reduction: need to swap operands: ");
3320
3321 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3322 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3323 }
3324 else
3325 {
3326 if (dump_enabled_p ())
3327 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3328 }
3329
3330 return def_stmt_info;
3331 }
3332
3333 /* Try to find SLP reduction chain. */
3334 if (! nested_in_vect_loop
3335 && code != COND_EXPR
3336 && orig_code != MINUS_EXPR
3337 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3338 {
3339 if (dump_enabled_p ())
3340 report_vect_op (MSG_NOTE, def_stmt,
3341 "reduction: detected reduction chain: ");
3342
3343 return def_stmt_info;
3344 }
3345
3346 /* Look for the expression computing loop_arg from loop PHI result. */
3347 if (check_reduction_path (vect_location, loop, phi, loop_arg, code))
3348 return def_stmt_info;
3349
3350 if (dump_enabled_p ())
3351 {
3352 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3353 "reduction: unknown pattern: ");
3354 }
3355
3356 return NULL;
3357 }
3358
3359 /* Wrapper around vect_is_simple_reduction, which will modify code
3360 in-place if it enables detection of more reductions. Arguments
3361 as there. */
3362
3363 stmt_vec_info
3364 vect_force_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3365 bool *double_reduc,
3366 bool need_wrapping_integral_overflow)
3367 {
3368 enum vect_reduction_type v_reduc_type;
3369 stmt_vec_info def_info
3370 = vect_is_simple_reduction (loop_info, phi_info, double_reduc,
3371 need_wrapping_integral_overflow,
3372 &v_reduc_type);
3373 if (def_info)
3374 {
3375 STMT_VINFO_REDUC_TYPE (phi_info) = v_reduc_type;
3376 STMT_VINFO_REDUC_DEF (phi_info) = def_info;
3377 STMT_VINFO_REDUC_TYPE (def_info) = v_reduc_type;
3378 STMT_VINFO_REDUC_DEF (def_info) = phi_info;
3379 }
3380 return def_info;
3381 }
3382
3383 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
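/* Illustration with assumed numbers: with a known iteration count of 100,
   assumed_vf == 4 and peel_iters_prologue == 2, the epilogue gets
   (100 - 2) % 4 == 2 iterations; both peel counts are then costed by
   replaying the scalar cost vector that many times.  When the count is
   unknown, vf/2 is assumed for the epilogue and a taken branch is charged
   for each peel loop.  */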
3384 int
3385 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3386 int *peel_iters_epilogue,
3387 stmt_vector_for_cost *scalar_cost_vec,
3388 stmt_vector_for_cost *prologue_cost_vec,
3389 stmt_vector_for_cost *epilogue_cost_vec)
3390 {
3391 int retval = 0;
3392 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3393
3394 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3395 {
3396 *peel_iters_epilogue = assumed_vf / 2;
3397 if (dump_enabled_p ())
3398 dump_printf_loc (MSG_NOTE, vect_location,
3399 "cost model: epilogue peel iters set to vf/2 "
3400 			 "because loop iterations are unknown.\n");
3401
3402     /* If peeled iterations are known but the number of scalar loop
3403        iterations is unknown, count a taken branch per peeled loop.  */
3404 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3405 NULL, 0, vect_prologue);
3406 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3407 NULL, 0, vect_epilogue);
3408 }
3409 else
3410 {
3411 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3412 peel_iters_prologue = niters < peel_iters_prologue ?
3413 niters : peel_iters_prologue;
3414 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3415       /* If we need to peel for gaps but the computed epilogue peel count
3416          is zero, we have to peel VF iterations instead.  */
3417 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3418 *peel_iters_epilogue = assumed_vf;
3419 }
3420
3421 stmt_info_for_cost *si;
3422 int j;
3423 if (peel_iters_prologue)
3424 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3425 retval += record_stmt_cost (prologue_cost_vec,
3426 si->count * peel_iters_prologue,
3427 si->kind, si->stmt_info, si->misalign,
3428 vect_prologue);
3429 if (*peel_iters_epilogue)
3430 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3431 retval += record_stmt_cost (epilogue_cost_vec,
3432 si->count * *peel_iters_epilogue,
3433 si->kind, si->stmt_info, si->misalign,
3434 vect_epilogue);
3435
3436 return retval;
3437 }
3438
3439 /* Function vect_estimate_min_profitable_iters
3440
3441 Return the number of iterations required for the vector version of the
3442 loop to be profitable relative to the cost of the scalar version of the
3443 loop.
3444
3445 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3446 of iterations for vectorization. -1 value means loop vectorization
3447 is not profitable. This returned value may be used for dynamic
3448 profitability check.
3449
3450 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3451 for static check against estimated number of iterations. */
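/* As a rough model (ignoring peeling, versioning and masking overhead),
   the vector loop becomes profitable once

     scalar_single_iter_cost * niters
       > vec_outside_cost + vec_inside_cost * (niters / vf)

   so the thresholds computed below are approximately the smallest NITERS
   for which that inequality holds, refined by the prologue/epilogue and
   versioning costs accounted for in this function.  */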
3452
3453 static void
3454 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3455 int *ret_min_profitable_niters,
3456 int *ret_min_profitable_estimate)
3457 {
3458 int min_profitable_iters;
3459 int min_profitable_estimate;
3460 int peel_iters_prologue;
3461 int peel_iters_epilogue;
3462 unsigned vec_inside_cost = 0;
3463 int vec_outside_cost = 0;
3464 unsigned vec_prologue_cost = 0;
3465 unsigned vec_epilogue_cost = 0;
3466 int scalar_single_iter_cost = 0;
3467 int scalar_outside_cost = 0;
3468 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3469 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3470 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3471
3472 /* Cost model disabled. */
3473 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3474 {
3475 if (dump_enabled_p ())
3476 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3477 *ret_min_profitable_niters = 0;
3478 *ret_min_profitable_estimate = 0;
3479 return;
3480 }
3481
3482 /* Requires loop versioning tests to handle misalignment. */
3483 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3484 {
3485 /* FIXME: Make cost depend on complexity of individual check. */
3486 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3487 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3488 vect_prologue);
3489 if (dump_enabled_p ())
3490 dump_printf (MSG_NOTE,
3491 "cost model: Adding cost of checks for loop "
3492 "versioning to treat misalignment.\n");
3493 }
3494
3495 /* Requires loop versioning with alias checks. */
3496 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3497 {
3498 /* FIXME: Make cost depend on complexity of individual check. */
3499 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3500 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3501 vect_prologue);
3502 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3503 if (len)
3504 /* Count LEN - 1 ANDs and LEN comparisons. */
3505 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3506 NULL, 0, vect_prologue);
3507 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3508 if (len)
3509 {
3510 /* Count LEN - 1 ANDs and LEN comparisons. */
3511 unsigned int nstmts = len * 2 - 1;
3512 /* +1 for each bias that needs adding. */
3513 for (unsigned int i = 0; i < len; ++i)
3514 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3515 nstmts += 1;
3516 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3517 NULL, 0, vect_prologue);
3518 }
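/* For example (hypothetical counts): with three address pairs in
   LOOP_VINFO_CHECK_UNEQUAL_ADDRS the code above records 3 * 2 - 1 = 5
   scalar stmts, and with three lower bounds of which two are signed it
   records 3 * 2 - 1 + 2 = 7 scalar stmts.  */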
3519 if (dump_enabled_p ())
3520 dump_printf (MSG_NOTE,
3521 "cost model: Adding cost of checks for loop "
3522 "versioning aliasing.\n");
3523 }
3524
3525 /* Requires loop versioning with niter checks. */
3526 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3527 {
3528 /* FIXME: Make cost depend on complexity of individual check. */
3529 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3530 vect_prologue);
3531 if (dump_enabled_p ())
3532 dump_printf (MSG_NOTE,
3533 "cost model: Adding cost of checks for loop "
3534 "versioning niters.\n");
3535 }
3536
3537 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3538 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3539 vect_prologue);
3540
3541 /* Count statements in the scalar loop. Use this as the scalar cost of a
3542 single iteration for now.
3543
3544 TODO: Add outer loop support.
3545
3546 TODO: Consider assigning different costs to different scalar
3547 statements. */
3548
3549 scalar_single_iter_cost
3550 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3551
3552 /* Add additional cost for the peeled instructions in prologue and epilogue
3553 loop. (For fully-masked loops there will be no peeling.)
3554
3555 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3556 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3557
3558 TODO: Build an expression that represents peel_iters for prologue and
3559 epilogue to be used in a run-time test. */
3560
3561 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3562 {
3563 peel_iters_prologue = 0;
3564 peel_iters_epilogue = 0;
3565
3566 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3567 {
3568 /* We need to peel exactly one iteration. */
3569 peel_iters_epilogue += 1;
3570 stmt_info_for_cost *si;
3571 int j;
3572 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3573 j, si)
3574 (void) add_stmt_cost (target_cost_data, si->count,
3575 si->kind, si->stmt_info, si->misalign,
3576 vect_epilogue);
3577 }
3578 }
3579 else if (npeel < 0)
3580 {
3581 peel_iters_prologue = assumed_vf / 2;
3582 if (dump_enabled_p ())
3583 dump_printf (MSG_NOTE, "cost model: "
3584 "prologue peel iters set to vf/2.\n");
3585
3586 /* If peeling for alignment is unknown, the loop bound of the main loop
3587 becomes unknown. */
3588 peel_iters_epilogue = assumed_vf / 2;
3589 if (dump_enabled_p ())
3590 dump_printf (MSG_NOTE, "cost model: "
3591 "epilogue peel iters set to vf/2 because "
3592 "peeling for alignment is unknown.\n");
3593
3594 /* If peeled iterations are unknown, count a taken branch and a not taken
3595 branch per peeled loop. Even if scalar loop iterations are known,
3596 vector iterations are not known since peeled prologue iterations are
3597 not known. Hence guards remain the same. */
3598 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3599 NULL, 0, vect_prologue);
3600 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3601 NULL, 0, vect_prologue);
3602 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3603 NULL, 0, vect_epilogue);
3604 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3605 NULL, 0, vect_epilogue);
3606 stmt_info_for_cost *si;
3607 int j;
3608 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3609 {
3610 (void) add_stmt_cost (target_cost_data,
3611 si->count * peel_iters_prologue,
3612 si->kind, si->stmt_info, si->misalign,
3613 vect_prologue);
3614 (void) add_stmt_cost (target_cost_data,
3615 si->count * peel_iters_epilogue,
3616 si->kind, si->stmt_info, si->misalign,
3617 vect_epilogue);
3618 }
3619 }
3620 else
3621 {
3622 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3623 stmt_info_for_cost *si;
3624 int j;
3625 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3626
3627 prologue_cost_vec.create (2);
3628 epilogue_cost_vec.create (2);
3629 peel_iters_prologue = npeel;
3630
3631 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3632 &peel_iters_epilogue,
3633 &LOOP_VINFO_SCALAR_ITERATION_COST
3634 (loop_vinfo),
3635 &prologue_cost_vec,
3636 &epilogue_cost_vec);
3637
3638 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3639 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3640 si->misalign, vect_prologue);
3641
3642 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3643 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3644 si->misalign, vect_epilogue);
3645
3646 prologue_cost_vec.release ();
3647 epilogue_cost_vec.release ();
3648 }
3649
3650 /* FORNOW: The scalar outside cost is incremented in one of the
3651 following ways:
3652
3653 1. The vectorizer checks for alignment and aliasing and generates
3654 a condition that allows dynamic vectorization. A cost model
3655 check is ANDed with the versioning condition. Hence the scalar code
3656 path now has the added cost of the versioning check.
3657
3658 if (cost > th & versioning_check)
3659 jmp to vector code
3660
3661 Hence run-time scalar is incremented by not-taken branch cost.
3662
3663 2. The vectorizer then checks if a prologue is required. If the
3664 cost model check was not done before during versioning, it has to
3665 be done before the prologue check.
3666
3667 if (cost <= th)
3668 prologue = scalar_iters
3669 if (prologue == 0)
3670 jmp to vector code
3671 else
3672 execute prologue
3673 if (prologue == num_iters)
3674 go to exit
3675
3676 Hence the run-time scalar cost is incremented by a taken branch,
3677 plus a not-taken branch, plus a taken branch cost.
3678
3679 3. The vectorizer then checks if an epilogue is required. If the
3680 cost model check was not done before during prologue check, it
3681 has to be done with the epilogue check.
3682
3683 if (prologue == 0)
3684 jmp to vector code
3685 else
3686 execute prologue
3687 if (prologue == num_iters)
3688 go to exit
3689 vector code:
3690 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3691 jmp to epilogue
3692
3693 Hence the run-time scalar cost should be incremented by 2 taken
3694 branches.
3695
3696 TODO: The back end may reorder the BBS's differently and reverse
3697 conditions/branch directions. Change the estimates below to
3698 something more reasonable. */
3699
3700 /* If the number of iterations is known and we do not do versioning, we can
3701 decide whether to vectorize at compile time. Hence the scalar version
3702 does not carry cost model guard costs. */
3703 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3704 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3705 {
3706 /* Cost model check occurs at versioning. */
3707 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3708 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3709 else
3710 {
3711 /* Cost model check occurs at prologue generation. */
3712 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3713 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3714 + vect_get_stmt_cost (cond_branch_not_taken);
3715 /* Cost model check occurs at epilogue generation. */
3716 else
3717 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3718 }
3719 }
3720
3721 /* Complete the target-specific cost calculations. */
3722 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3723 &vec_inside_cost, &vec_epilogue_cost);
3724
3725 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3726
3727 if (dump_enabled_p ())
3728 {
3729 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3730 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3731 vec_inside_cost);
3732 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3733 vec_prologue_cost);
3734 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3735 vec_epilogue_cost);
3736 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3737 scalar_single_iter_cost);
3738 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3739 scalar_outside_cost);
3740 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3741 vec_outside_cost);
3742 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3743 peel_iters_prologue);
3744 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3745 peel_iters_epilogue);
3746 }
3747
3748 /* Calculate number of iterations required to make the vector version
3749 profitable, relative to the loop bodies only. The following condition
3750 must hold true:
3751 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
3752 where
3753 SIC = scalar iteration cost, VIC = vector iteration cost,
3754 VOC = vector outside cost, VF = vectorization factor,
3755 NPEEL = prologue iterations + epilogue iterations,
3756 SOC = scalar outside cost for run time cost model check. */
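
/* A worked example with hypothetical costs: assume SIC = 4, VIC = 12,
   VF = 4, VOC = 20, SOC = 0 and NPEEL = 0. The per-vector-iteration
   saving below is 4 * 4 - 12 = 4 and the not-fully-masked branch computes
   (20 - 0) * 4 / 4 = 80 / 4 = 20, bumped to 21 because
   4 * 4 * 20 <= 12 * 20 + 20 * 4. Checking directly:
   4 * 21 = 84 > 12 * (21 / 4) + 20 = 80, while 20 scalar iterations
   would only break even.  */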
3757
3758 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
3759 - vec_inside_cost);
3760 if (saving_per_viter <= 0)
3761 {
3762 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3763 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3764 "vectorization did not happen for a simd loop");
3765
3766 if (dump_enabled_p ())
3767 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3768 "cost model: the vector iteration cost = %d "
3769 "divided by the scalar iteration cost = %d "
3770 "is greater or equal to the vectorization factor = %d"
3771 ".\n",
3772 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3773 *ret_min_profitable_niters = -1;
3774 *ret_min_profitable_estimate = -1;
3775 return;
3776 }
3777
3778 /* ??? The "if" arm is written to handle all cases; see below for what
3779 we would do for !LOOP_VINFO_FULLY_MASKED_P. */
3780 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3781 {
3782 /* Rewriting the condition above in terms of the number of
3783 vector iterations (vniters) rather than the number of
3784 scalar iterations (niters) gives:
3785
3786 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
3787
3788 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
3789
3790 For integer N, X and Y when X > 0:
3791
3792 N * X > Y <==> N >= (Y /[floor] X) + 1. */
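/* E.g. (hypothetical) X = 3, Y = 7: N * 3 > 7 holds exactly for N >= 3,
   and (7 / 3) + 1 = 2 + 1 = 3.  */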
3793 int outside_overhead = (vec_outside_cost
3794 - scalar_single_iter_cost * peel_iters_prologue
3795 - scalar_single_iter_cost * peel_iters_epilogue
3796 - scalar_outside_cost);
3797 /* We're only interested in cases that require at least one
3798 vector iteration. */
3799 int min_vec_niters = 1;
3800 if (outside_overhead > 0)
3801 min_vec_niters = outside_overhead / saving_per_viter + 1;
3802
3803 if (dump_enabled_p ())
3804 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
3805 min_vec_niters);
3806
3807 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3808 {
3809 /* Now that we know the minimum number of vector iterations,
3810 find the minimum niters for which the scalar cost is larger:
3811
3812 SIC * niters > VIC * vniters + VOC - SOC
3813
3814 We know that the minimum niters is no more than
3815 vniters * VF + NPEEL, but it might be (and often is) less
3816 than that if a partial vector iteration is cheaper than the
3817 equivalent scalar code. */
3818 int threshold = (vec_inside_cost * min_vec_niters
3819 + vec_outside_cost
3820 - scalar_outside_cost);
3821 if (threshold <= 0)
3822 min_profitable_iters = 1;
3823 else
3824 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
3825 }
3826 else
3827 /* Convert the number of vector iterations into a number of
3828 scalar iterations. */
3829 min_profitable_iters = (min_vec_niters * assumed_vf
3830 + peel_iters_prologue
3831 + peel_iters_epilogue);
3832 }
3833 else
3834 {
3835 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3836 * assumed_vf
3837 - vec_inside_cost * peel_iters_prologue
3838 - vec_inside_cost * peel_iters_epilogue);
3839 if (min_profitable_iters <= 0)
3840 min_profitable_iters = 0;
3841 else
3842 {
3843 min_profitable_iters /= saving_per_viter;
3844
3845 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3846 <= (((int) vec_inside_cost * min_profitable_iters)
3847 + (((int) vec_outside_cost - scalar_outside_cost)
3848 * assumed_vf)))
3849 min_profitable_iters++;
3850 }
3851 }
3852
3853 if (dump_enabled_p ())
3854 dump_printf (MSG_NOTE,
3855 " Calculated minimum iters for profitability: %d\n",
3856 min_profitable_iters);
3857
3858 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3859 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3860 /* We want the vectorized loop to execute at least once. */
3861 min_profitable_iters = assumed_vf + peel_iters_prologue;
3862
3863 if (dump_enabled_p ())
3864 dump_printf_loc (MSG_NOTE, vect_location,
3865 " Runtime profitability threshold = %d\n",
3866 min_profitable_iters);
3867
3868 *ret_min_profitable_niters = min_profitable_iters;
3869
3870 /* Calculate number of iterations required to make the vector version
3871 profitable, relative to the loop bodies only.
3872
3873 The non-vectorized variant costs SIC * niters and must win over the vector
3874 variant for the expected loop trip count. The following condition must hold:
3875 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
3876
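/* Taking the same hypothetical costs as in the example above but with
   SOC = 2, the not-fully-masked branch below evaluates to
   ((20 + 2) * 4 - 0 - 0) / (4 * 4 - 12) = 88 / 4 = 22, which is then
   raised to at least the runtime threshold computed above.  */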
3877 if (vec_outside_cost <= 0)
3878 min_profitable_estimate = 0;
3879 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3880 {
3881 /* This is a repeat of the code above, but with + SOC rather
3882 than - SOC. */
3883 int outside_overhead = (vec_outside_cost
3884 - scalar_single_iter_cost * peel_iters_prologue
3885 - scalar_single_iter_cost * peel_iters_epilogue
3886 + scalar_outside_cost);
3887 int min_vec_niters = 1;
3888 if (outside_overhead > 0)
3889 min_vec_niters = outside_overhead / saving_per_viter + 1;
3890
3891 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3892 {
3893 int threshold = (vec_inside_cost * min_vec_niters
3894 + vec_outside_cost
3895 + scalar_outside_cost);
3896 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
3897 }
3898 else
3899 min_profitable_estimate = (min_vec_niters * assumed_vf
3900 + peel_iters_prologue
3901 + peel_iters_epilogue);
3902 }
3903 else
3904 {
3905 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3906 * assumed_vf
3907 - vec_inside_cost * peel_iters_prologue
3908 - vec_inside_cost * peel_iters_epilogue)
3909 / ((scalar_single_iter_cost * assumed_vf)
3910 - vec_inside_cost);
3911 }
3912 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3913 if (dump_enabled_p ())
3914 dump_printf_loc (MSG_NOTE, vect_location,
3915 " Static estimate profitability threshold = %d\n",
3916 min_profitable_estimate);
3917
3918 *ret_min_profitable_estimate = min_profitable_estimate;
3919 }
3920
3921 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3922 vector elements (not bits) for a vector with NELT elements. */
3923 static void
3924 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3925 vec_perm_builder *sel)
3926 {
3927 /* The encoding is a single stepped pattern. Any wrap-around is handled
3928 by vec_perm_indices. */
3929 sel->new_vector (nelt, 1, 3);
3930 for (unsigned int i = 0; i < 3; i++)
3931 sel->quick_push (i + offset);
3932 }
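
/* For example (hypothetical OFFSET = 2, NELT = 8) the stepped selector is
   {2, 3, 4, ...}, which vec_perm_indices expands to {2, 3, 4, 5, 6, 7, 8, 9};
   indices >= NELT pick elements of the second permute operand, so the permute
   behaves as a whole-vector shift by two elements.  */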
3933
3934 /* Checks whether the target supports whole-vector shifts for vectors of mode
3935 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3936 it supports vec_perm_const with masks for all necessary shift amounts. */
3937 static bool
3938 have_whole_vector_shift (machine_mode mode)
3939 {
3940 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3941 return true;
3942
3943 /* Variable-length vectors should be handled via the optab. */
3944 unsigned int nelt;
3945 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3946 return false;
3947
3948 vec_perm_builder sel;
3949 vec_perm_indices indices;
3950 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3951 {
3952 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3953 indices.new_vector (sel, 2, nelt);
3954 if (!can_vec_perm_const_p (mode, indices, false))
3955 return false;
3956 }
3957 return true;
3958 }
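
/* For a constant NELT of, say, 8 the loop above checks the shift amounts
   4, 2 and 1, which are the element counts used by the shift-based
   reduction epilogue generated later.  */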
3959
3960 /* TODO: vect_model_*_cost and the corresponding vectorizable_* functions
3961 are tightly coupled. Design this better to avoid maintenance issues. */
3962
3963 /* Function vect_model_reduction_cost.
3964
3965 Models cost for a reduction operation, including the vector ops
3966 generated within the strip-mine loop, the initial definition before
3967 the loop, and the epilogue code that must be generated. */
3968
3969 static void
3970 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3971 int ncopies, stmt_vector_for_cost *cost_vec)
3972 {
3973 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3974 enum tree_code code;
3975 optab optab;
3976 tree vectype;
3977 machine_mode mode;
3978 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3979 struct loop *loop = NULL;
3980
3981 if (loop_vinfo)
3982 loop = LOOP_VINFO_LOOP (loop_vinfo);
3983
3984 /* Condition reductions generate two reductions in the loop. */
3985 vect_reduction_type reduction_type
3986 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3987 if (reduction_type == COND_REDUCTION)
3988 ncopies *= 2;
3989
3990 vectype = STMT_VINFO_VECTYPE (stmt_info);
3991 mode = TYPE_MODE (vectype);
3992 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3993
3994 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3995
3996 if (reduction_type == EXTRACT_LAST_REDUCTION
3997 || reduction_type == FOLD_LEFT_REDUCTION)
3998 {
3999 /* No extra instructions needed in the prologue. */
4000 prologue_cost = 0;
4001
4002 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
4003 /* Count one reduction-like operation per vector. */
4004 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4005 stmt_info, 0, vect_body);
4006 else
4007 {
4008 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4009 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4010 inside_cost = record_stmt_cost (cost_vec, nelements,
4011 vec_to_scalar, stmt_info, 0,
4012 vect_body);
4013 inside_cost += record_stmt_cost (cost_vec, nelements,
4014 scalar_stmt, stmt_info, 0,
4015 vect_body);
4016 }
4017 }
4018 else
4019 {
4020 /* Add in cost for initial definition.
4021 For cond reduction we have four vectors: initial index, step,
4022 initial result of the data reduction, initial value of the index
4023 reduction. */
4024 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4025 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4026 scalar_to_vec, stmt_info, 0,
4027 vect_prologue);
4028
4029 /* Cost of reduction op inside loop. */
4030 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4031 stmt_info, 0, vect_body);
4032 }
4033
4034 /* Determine cost of epilogue code.
4035
4036 We have a reduction operator that will reduce the vector in one statement.
4037 Also requires scalar extract. */
4038
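/* As a worked example (hypothetical vector of 8 elements, REDUC_FN not
   available and not a COND_REDUCTION): with a whole-vector shift the
   path below records exact_log2 (8) * 2 = 6 vector stmts plus one
   vec_to_scalar; the fallback records 8 + 8 - 1 = 15 stmts.  */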
4039 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4040 {
4041 if (reduc_fn != IFN_LAST)
4042 {
4043 if (reduction_type == COND_REDUCTION)
4044 {
4045 /* An EQ stmt and a COND_EXPR stmt. */
4046 epilogue_cost += record_stmt_cost (cost_vec, 2,
4047 vector_stmt, stmt_info, 0,
4048 vect_epilogue);
4049 /* Reduction of the max index and a reduction of the found
4050 values. */
4051 epilogue_cost += record_stmt_cost (cost_vec, 2,
4052 vec_to_scalar, stmt_info, 0,
4053 vect_epilogue);
4054 /* A broadcast of the max value. */
4055 epilogue_cost += record_stmt_cost (cost_vec, 1,
4056 scalar_to_vec, stmt_info, 0,
4057 vect_epilogue);
4058 }
4059 else
4060 {
4061 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4062 stmt_info, 0, vect_epilogue);
4063 epilogue_cost += record_stmt_cost (cost_vec, 1,
4064 vec_to_scalar, stmt_info, 0,
4065 vect_epilogue);
4066 }
4067 }
4068 else if (reduction_type == COND_REDUCTION)
4069 {
4070 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4071 /* Extraction of scalar elements. */
4072 epilogue_cost += record_stmt_cost (cost_vec,
4073 2 * estimated_nunits,
4074 vec_to_scalar, stmt_info, 0,
4075 vect_epilogue);
4076 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4077 epilogue_cost += record_stmt_cost (cost_vec,
4078 2 * estimated_nunits - 3,
4079 scalar_stmt, stmt_info, 0,
4080 vect_epilogue);
4081 }
4082 else if (reduction_type == EXTRACT_LAST_REDUCTION
4083 || reduction_type == FOLD_LEFT_REDUCTION)
4084 /* No extra instructions are needed in the epilogue. */
4085 ;
4086 else
4087 {
4088 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4089 tree bitsize =
4090 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4091 int element_bitsize = tree_to_uhwi (bitsize);
4092 int nelements = vec_size_in_bits / element_bitsize;
4093
4094 if (code == COND_EXPR)
4095 code = MAX_EXPR;
4096
4097 optab = optab_for_tree_code (code, vectype, optab_default);
4098
4099 /* We have a whole vector shift available. */
4100 if (optab != unknown_optab
4101 && VECTOR_MODE_P (mode)
4102 && optab_handler (optab, mode) != CODE_FOR_nothing
4103 && have_whole_vector_shift (mode))
4104 {
4105 /* Final reduction via vector shifts and the reduction operator.
4106 Also requires scalar extract. */
4107 epilogue_cost += record_stmt_cost (cost_vec,
4108 exact_log2 (nelements) * 2,
4109 vector_stmt, stmt_info, 0,
4110 vect_epilogue);
4111 epilogue_cost += record_stmt_cost (cost_vec, 1,
4112 vec_to_scalar, stmt_info, 0,
4113 vect_epilogue);
4114 }
4115 else
4116 /* Use extracts and reduction op for final reduction. For N
4117 elements, we have N extracts and N-1 reduction ops. */
4118 epilogue_cost += record_stmt_cost (cost_vec,
4119 nelements + nelements - 1,
4120 vector_stmt, stmt_info, 0,
4121 vect_epilogue);
4122 }
4123 }
4124
4125 if (dump_enabled_p ())
4126 dump_printf (MSG_NOTE,
4127 "vect_model_reduction_cost: inside_cost = %d, "
4128 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4129 prologue_cost, epilogue_cost);
4130 }
4131
4132
4133 /* Function vect_model_induction_cost.
4134
4135 Models cost for induction operations. */
4136
4137 static void
4138 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4139 stmt_vector_for_cost *cost_vec)
4140 {
4141 unsigned inside_cost, prologue_cost;
4142
4143 if (PURE_SLP_STMT (stmt_info))
4144 return;
4145
4146 /* loop cost for vec_loop. */
4147 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4148 stmt_info, 0, vect_body);
4149
4150 /* prologue cost for vec_init and vec_step. */
4151 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4152 stmt_info, 0, vect_prologue);
4153
4154 if (dump_enabled_p ())
4155 dump_printf_loc (MSG_NOTE, vect_location,
4156 "vect_model_induction_cost: inside_cost = %d, "
4157 "prologue_cost = %d .\n", inside_cost, prologue_cost);
4158 }
4159
4160
4161
4162 /* Function get_initial_def_for_reduction
4163
4164 Input:
4165 STMT_VINFO - a stmt that performs a reduction operation in the loop.
4166 INIT_VAL - the initial value of the reduction variable
4167
4168 Output:
4169 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4170 of the reduction (used for adjusting the epilog - see below).
4171 Return a vector variable, initialized according to the operation that
4172 STMT_VINFO performs. This vector will be used as the initial value
4173 of the vector of partial results.
4174
4175 Option1 (adjust in epilog): Initialize the vector as follows:
4176 add/bit or/xor: [0,0,...,0,0]
4177 mult/bit and: [1,1,...,1,1]
4178 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4179 and when necessary (e.g. add/mult case) let the caller know
4180 that it needs to adjust the result by init_val.
4181
4182 Option2: Initialize the vector as follows:
4183 add/bit or/xor: [init_val,0,0,...,0]
4184 mult/bit and: [init_val,1,1,...,1]
4185 min/max/cond_expr: [init_val,init_val,...,init_val]
4186 and no adjustments are needed.
4187
4188 For example, for the following code:
4189
4190 s = init_val;
4191 for (i=0;i<n;i++)
4192 s = s + a[i];
4193
4194 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4195 For a vector of 4 units, we want to return either [0,0,0,init_val],
4196 or [0,0,0,0] and let the caller know that it needs to adjust
4197 the result at the end by 'init_val'.
4198
4199 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
4200 is not NULL, because the initialization vector is then simpler (the same
4201 element in all entries), and Option2 otherwise.
4202
4203 A cost model should help decide between these two schemes. */
4204
4205 tree
4206 get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, tree init_val,
4207 tree *adjustment_def)
4208 {
4209 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4210 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4211 tree scalar_type = TREE_TYPE (init_val);
4212 tree vectype = get_vectype_for_scalar_type (scalar_type);
4213 enum tree_code code = gimple_assign_rhs_code (stmt_vinfo->stmt);
4214 tree def_for_init;
4215 tree init_def;
4216 REAL_VALUE_TYPE real_init_val = dconst0;
4217 int int_init_val = 0;
4218 gimple_seq stmts = NULL;
4219
4220 gcc_assert (vectype);
4221
4222 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4223 || SCALAR_FLOAT_TYPE_P (scalar_type));
4224
4225 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4226 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4227
4228 vect_reduction_type reduction_type
4229 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4230
4231 switch (code)
4232 {
4233 case WIDEN_SUM_EXPR:
4234 case DOT_PROD_EXPR:
4235 case SAD_EXPR:
4236 case PLUS_EXPR:
4237 case MINUS_EXPR:
4238 case BIT_IOR_EXPR:
4239 case BIT_XOR_EXPR:
4240 case MULT_EXPR:
4241 case BIT_AND_EXPR:
4242 {
4243 /* ADJUSTMENT_DEF is NULL when called from
4244 vect_create_epilog_for_reduction to vectorize double reduction. */
4245 if (adjustment_def)
4246 *adjustment_def = init_val;
4247
4248 if (code == MULT_EXPR)
4249 {
4250 real_init_val = dconst1;
4251 int_init_val = 1;
4252 }
4253
4254 if (code == BIT_AND_EXPR)
4255 int_init_val = -1;
4256
4257 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4258 def_for_init = build_real (scalar_type, real_init_val);
4259 else
4260 def_for_init = build_int_cst (scalar_type, int_init_val);
4261
4262 if (adjustment_def)
4263 /* Option1: the first element is '0' or '1' as well. */
4264 init_def = gimple_build_vector_from_val (&stmts, vectype,
4265 def_for_init);
4266 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4267 {
4268 /* Option2 (variable length): the first element is INIT_VAL. */
4269 init_def = gimple_build_vector_from_val (&stmts, vectype,
4270 def_for_init);
4271 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4272 vectype, init_def, init_val);
4273 }
4274 else
4275 {
4276 /* Option2: the first element is INIT_VAL. */
4277 tree_vector_builder elts (vectype, 1, 2);
4278 elts.quick_push (init_val);
4279 elts.quick_push (def_for_init);
4280 init_def = gimple_build_vector (&stmts, &elts);
4281 }
4282 }
4283 break;
4284
4285 case MIN_EXPR:
4286 case MAX_EXPR:
4287 case COND_EXPR:
4288 {
4289 if (adjustment_def)
4290 {
4291 *adjustment_def = NULL_TREE;
4292 if (reduction_type != COND_REDUCTION
4293 && reduction_type != EXTRACT_LAST_REDUCTION)
4294 {
4295 init_def = vect_get_vec_def_for_operand (init_val, stmt_vinfo);
4296 break;
4297 }
4298 }
4299 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4300 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4301 }
4302 break;
4303
4304 default:
4305 gcc_unreachable ();
4306 }
4307
4308 if (stmts)
4309 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4310 return init_def;
4311 }
4312
4313 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4314 NUMBER_OF_VECTORS is the number of vector defs to create.
4315 If NEUTRAL_OP is nonnull, introducing extra elements of that
4316 value will not change the result. */
4317
4318 static void
4319 get_initial_defs_for_reduction (slp_tree slp_node,
4320 vec<tree> *vec_oprnds,
4321 unsigned int number_of_vectors,
4322 bool reduc_chain, tree neutral_op)
4323 {
4324 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4325 stmt_vec_info stmt_vinfo = stmts[0];
4326 unsigned HOST_WIDE_INT nunits;
4327 unsigned j, number_of_places_left_in_vector;
4328 tree vector_type;
4329 unsigned int group_size = stmts.length ();
4330 unsigned int i;
4331 struct loop *loop;
4332
4333 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4334
4335 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4336
4337 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4338 gcc_assert (loop);
4339 edge pe = loop_preheader_edge (loop);
4340
4341 gcc_assert (!reduc_chain || neutral_op);
4342
4343 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4344 created vectors. It is greater than 1 if unrolling is performed.
4345
4346 For example, we have two scalar operands, s1 and s2 (e.g., group of
4347 strided accesses of size two), while NUNITS is four (i.e., four scalars
4348 of this type can be packed in a vector). The output vector will contain
4349 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4350 will be 2).
4351
4352 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4353 vectors containing the operands.
4354
4355 For example, NUNITS is four as before, and the group size is 8
4356 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4357 {s5, s6, s7, s8}. */
4358
4359 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4360 nunits = group_size;
4361
4362 number_of_places_left_in_vector = nunits;
4363 bool constant_p = true;
4364 tree_vector_builder elts (vector_type, nunits, 1);
4365 elts.quick_grow (nunits);
4366 gimple_seq ctor_seq = NULL;
4367 for (j = 0; j < nunits * number_of_vectors; ++j)
4368 {
4369 tree op;
4370 i = j % group_size;
4371 stmt_vinfo = stmts[i];
4372
4373 /* Get the def before the loop. In a reduction chain we have only one
4374 initial value; otherwise we have as many initial values as PHIs in the group. */
4375 if (reduc_chain)
4376 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4377 else if (((vec_oprnds->length () + 1) * nunits
4378 - number_of_places_left_in_vector >= group_size)
4379 && neutral_op)
4380 op = neutral_op;
4381 else
4382 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4383
4384 /* Create 'vect_ = {op0,op1,...,opn}'. */
4385 number_of_places_left_in_vector--;
4386 elts[nunits - number_of_places_left_in_vector - 1] = op;
4387 if (!CONSTANT_CLASS_P (op))
4388 constant_p = false;
4389
4390 if (number_of_places_left_in_vector == 0)
4391 {
4392 tree init;
4393 if (constant_p && !neutral_op
4394 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4395 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4396 /* Build the vector directly from ELTS. */
4397 init = gimple_build_vector (&ctor_seq, &elts);
4398 else if (neutral_op)
4399 {
4400 /* Build a vector of the neutral value and shift the
4401 other elements into place. */
4402 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4403 neutral_op);
4404 int k = nunits;
4405 while (k > 0 && elts[k - 1] == neutral_op)
4406 k -= 1;
4407 while (k > 0)
4408 {
4409 k -= 1;
4410 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4411 vector_type, init, elts[k]);
4412 }
4413 }
4414 else
4415 {
4416 /* First time round, duplicate ELTS to fill the
4417 required number of vectors. */
4418 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4419 number_of_vectors, *vec_oprnds);
4420 break;
4421 }
4422 vec_oprnds->quick_push (init);
4423
4424 number_of_places_left_in_vector = nunits;
4425 elts.new_vector (vector_type, nunits, 1);
4426 elts.quick_grow (nunits);
4427 constant_p = true;
4428 }
4429 }
4430 if (ctor_seq != NULL)
4431 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4432 }
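
/* A worked example of the above (hypothetical): an SLP reduction of two
   plus-accumulators with initial values s1 and s2, NEUTRAL_OP = 0,
   REDUC_CHAIN false, a four-element vector type and one vector to create.
   The loop takes s1 and s2 from the PHIs and switches to the neutral value
   once every scalar accumulator is represented, giving the single initial
   def {s1, s2, 0, 0}.  */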
4433
4434
4435 /* Function vect_create_epilog_for_reduction
4436
4437 Create code at the loop-epilog to finalize the result of a reduction
4438 computation.
4439
4440 VECT_DEFS is the list of vectors of partial results, i.e., the lhs's of the
4441 vector reduction statements.
4442 STMT_INFO is the scalar reduction stmt that is being vectorized.
4443 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4444 number of elements that we can fit in a vectype (nunits). In this case
4445 we have to generate more than one vector stmt - i.e - we need to "unroll"
4446 the vector stmt by a factor VF/nunits. For more details see documentation
4447 in vectorizable_operation.
4448 REDUC_FN is the internal function for the epilog reduction.
4449 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4450 computation.
4451 REDUC_INDEX is the index of the operand in the right hand side of the
4452 statement that is defined by REDUCTION_PHI.
4453 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4454 SLP_NODE is an SLP node containing a group of reduction statements. The
4455 first one in this group is STMT_INFO.
4456 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4457 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4458 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4459 any value of the IV in the loop.
4460 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4461 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4462 null if this is not an SLP reduction.
4463
4464 This function:
4465 1. Creates the reduction def-use cycles: sets the arguments for
4466 REDUCTION_PHIS:
4467 The loop-entry argument is the vectorized initial-value of the reduction.
4468 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4469 sums.
4470 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4471 by calling the function specified by REDUC_FN if available, or by
4472 other means (whole-vector shifts or a scalar loop).
4473 The function also creates a new phi node at the loop exit to preserve
4474 loop-closed form, as illustrated below.
4475
4476 The flow at the entry to this function:
4477
4478 loop:
4479 vec_def = phi <null, null> # REDUCTION_PHI
4480 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4481 s_loop = scalar_stmt # (scalar) STMT_INFO
4482 loop_exit:
4483 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4484 use <s_out0>
4485 use <s_out0>
4486
4487 The above is transformed by this function into:
4488
4489 loop:
4490 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4491 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4492 s_loop = scalar_stmt # (scalar) STMT_INFO
4493 loop_exit:
4494 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4495 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4496 v_out2 = reduce <v_out1>
4497 s_out3 = extract_field <v_out2, 0>
4498 s_out4 = adjust_result <s_out3>
4499 use <s_out4>
4500 use <s_out4>
4501 */
4502
4503 static void
4504 vect_create_epilog_for_reduction (vec<tree> vect_defs,
4505 stmt_vec_info stmt_info,
4506 gimple *reduc_def_stmt,
4507 int ncopies, internal_fn reduc_fn,
4508 vec<stmt_vec_info> reduction_phis,
4509 bool double_reduc,
4510 slp_tree slp_node,
4511 slp_instance slp_node_instance,
4512 tree induc_val, enum tree_code induc_code,
4513 tree neutral_op)
4514 {
4515 stmt_vec_info prev_phi_info;
4516 tree vectype;
4517 machine_mode mode;
4518 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4519 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4520 basic_block exit_bb;
4521 tree scalar_dest;
4522 tree scalar_type;
4523 gimple *new_phi = NULL, *phi;
4524 stmt_vec_info phi_info;
4525 gimple_stmt_iterator exit_gsi;
4526 tree vec_dest;
4527 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4528 gimple *epilog_stmt = NULL;
4529 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
4530 gimple *exit_phi;
4531 tree bitsize;
4532 tree adjustment_def = NULL;
4533 tree vec_initial_def = NULL;
4534 tree expr, def, initial_def = NULL;
4535 tree orig_name, scalar_result;
4536 imm_use_iterator imm_iter, phi_imm_iter;
4537 use_operand_p use_p, phi_use_p;
4538 gimple *use_stmt;
4539 stmt_vec_info reduction_phi_info = NULL;
4540 bool nested_in_vect_loop = false;
4541 auto_vec<gimple *> new_phis;
4542 auto_vec<stmt_vec_info> inner_phis;
4543 int j, i;
4544 auto_vec<tree> scalar_results;
4545 unsigned int group_size = 1, k, ratio;
4546 auto_vec<tree> vec_initial_defs;
4547 auto_vec<gimple *> phis;
4548 bool slp_reduc = false;
4549 bool direct_slp_reduc;
4550 tree new_phi_result;
4551 stmt_vec_info inner_phi = NULL;
4552 tree induction_index = NULL_TREE;
4553
4554 if (slp_node)
4555 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4556
4557 if (nested_in_vect_loop_p (loop, stmt_info))
4558 {
4559 outer_loop = loop;
4560 loop = loop->inner;
4561 nested_in_vect_loop = true;
4562 gcc_assert (!slp_node);
4563 }
4564
4565 vectype = STMT_VINFO_VECTYPE (stmt_info);
4566 gcc_assert (vectype);
4567 mode = TYPE_MODE (vectype);
4568
4569 /* 1. Create the reduction def-use cycle:
4570 Set the arguments of REDUCTION_PHIS, i.e., transform
4571
4572 loop:
4573 vec_def = phi <null, null> # REDUCTION_PHI
4574 VECT_DEF = vector_stmt # vectorized form of STMT
4575 ...
4576
4577 into:
4578
4579 loop:
4580 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4581 VECT_DEF = vector_stmt # vectorized form of STMT
4582 ...
4583
4584 (in case of SLP, do it for all the phis). */
4585
4586 /* Get the loop-entry arguments. */
4587 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4588 if (slp_node)
4589 {
4590 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4591 vec_initial_defs.reserve (vec_num);
4592 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4593 &vec_initial_defs, vec_num,
4594 REDUC_GROUP_FIRST_ELEMENT (stmt_info),
4595 neutral_op);
4596 }
4597 else
4598 {
4599 /* Get the scalar def before the loop that defines the initial value
4600 of the reduction variable. */
4601 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4602 loop_preheader_edge (loop));
4603 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4604 and we can't use zero for induc_val, use initial_def. Similarly
4605 for REDUC_MIN and initial_def larger than the base. */
4606 if (TREE_CODE (initial_def) == INTEGER_CST
4607 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4608 == INTEGER_INDUC_COND_REDUCTION)
4609 && !integer_zerop (induc_val)
4610 && ((induc_code == MAX_EXPR
4611 && tree_int_cst_lt (initial_def, induc_val))
4612 || (induc_code == MIN_EXPR
4613 && tree_int_cst_lt (induc_val, initial_def))))
4614 induc_val = initial_def;
4615
4616 if (double_reduc)
4617 /* In case of double reduction we only create a vector variable
4618 to be put in the reduction phi node. The actual statement
4619 creation is done later in this function. */
4620 vec_initial_def = vect_create_destination_var (initial_def, vectype);
4621 else if (nested_in_vect_loop)
4622 {
4623 /* Do not use an adjustment def as that case is not supported
4624 correctly if ncopies is not one. */
4625 vect_is_simple_use (initial_def, loop_vinfo, &initial_def_dt);
4626 vec_initial_def = vect_get_vec_def_for_operand (initial_def,
4627 stmt_info);
4628 }
4629 else
4630 vec_initial_def
4631 = get_initial_def_for_reduction (stmt_info, initial_def,
4632 &adjustment_def);
4633 vec_initial_defs.create (1);
4634 vec_initial_defs.quick_push (vec_initial_def);
4635 }
4636
4637 /* Set phi nodes arguments. */
4638 FOR_EACH_VEC_ELT (reduction_phis, i, phi_info)
4639 {
4640 tree vec_init_def = vec_initial_defs[i];
4641 tree def = vect_defs[i];
4642 for (j = 0; j < ncopies; j++)
4643 {
4644 if (j != 0)
4645 {
4646 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4647 if (nested_in_vect_loop)
4648 vec_init_def
4649 = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_init_def);
4650 }
4651
4652 /* Set the loop-entry arg of the reduction-phi. */
4653
4654 gphi *phi = as_a <gphi *> (phi_info->stmt);
4655 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4656 == INTEGER_INDUC_COND_REDUCTION)
4657 {
4658 /* Initialise the reduction phi to zero. This prevents non-zero
4659 initial values from interfering with the reduction op. */
4660 gcc_assert (ncopies == 1);
4661 gcc_assert (i == 0);
4662
4663 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4664 tree induc_val_vec
4665 = build_vector_from_val (vec_init_def_type, induc_val);
4666
4667 add_phi_arg (phi, induc_val_vec, loop_preheader_edge (loop),
4668 UNKNOWN_LOCATION);
4669 }
4670 else
4671 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
4672 UNKNOWN_LOCATION);
4673
4674 /* Set the loop-latch arg for the reduction-phi. */
4675 if (j > 0)
4676 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4677
4678 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
4679
4680 if (dump_enabled_p ())
4681 dump_printf_loc (MSG_NOTE, vect_location,
4682 "transform reduction: created def-use cycle: %G%G",
4683 phi, SSA_NAME_DEF_STMT (def));
4684 }
4685 }
4686
4687 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4688 which is updated with the current index of the loop for every match of
4689 the original loop's cond_expr (VEC_STMT). This results in a vector
4690 containing the last time the condition passed for that vector lane.
4691 The first match will be a 1 to allow 0 to be used for non-matching
4692 indexes. If there are no matches at all then the vector will be all
4693 zeroes. */
4694 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4695 {
4696 tree indx_before_incr, indx_after_incr;
4697 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4698
4699 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info)->stmt;
4700 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4701
4702 int scalar_precision
4703 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4704 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4705 tree cr_index_vector_type = build_vector_type
4706 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4707
4708 /* First we create a simple vector induction variable which starts
4709 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4710 vector size (STEP). */
4711
4712 /* Create a {1,2,3,...} vector. */
4713 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4714
4715 /* Create a vector of the step value. */
4716 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4717 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4718
4719 /* Create an induction variable. */
4720 gimple_stmt_iterator incr_gsi;
4721 bool insert_after;
4722 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4723 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4724 insert_after, &indx_before_incr, &indx_after_incr);
4725
4726 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4727 filled with zeros (VEC_ZERO). */
4728
4729 /* Create a vector of 0s. */
4730 tree zero = build_zero_cst (cr_index_scalar_type);
4731 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4732
4733 /* Create a vector phi node. */
4734 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4735 new_phi = create_phi_node (new_phi_tree, loop->header);
4736 loop_vinfo->add_stmt (new_phi);
4737 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4738 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4739
4740 /* Now take the condition from the loop's original cond_expr
4741 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4742 every match uses values from the induction variable
4743 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4744 (NEW_PHI_TREE).
4745 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4746 the new cond_expr (INDEX_COND_EXPR). */
4747
4748 /* Duplicate the condition from vec_stmt. */
4749 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4750
4751 /* Create a conditional, where the condition is taken from vec_stmt
4752 (CCOMPARE), the "then" value is the induction index (INDEX_BEFORE_INCR)
4753 and the "else" value is the phi (NEW_PHI_TREE). */
4754 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4755 ccompare, indx_before_incr,
4756 new_phi_tree);
4757 induction_index = make_ssa_name (cr_index_vector_type);
4758 gimple *index_condition = gimple_build_assign (induction_index,
4759 index_cond_expr);
4760 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4761 stmt_vec_info index_vec_info = loop_vinfo->add_stmt (index_condition);
4762 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4763
4764 /* Update the phi with the vec cond. */
4765 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4766 loop_latch_edge (loop), UNKNOWN_LOCATION);
4767 }
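
/* For a hypothetical four-lane vector the induction index above starts as
   {1, 2, 3, 4} and is incremented by 4 every vector iteration, so a lane
   whose condition last matched in (zero-based) vector iteration K holds
   4 * K + lane + 1 afterwards, and a lane that never matched holds 0.  */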
4768
4769 /* 2. Create epilog code.
4770 The reduction epilog code operates across the elements of the vector
4771 of partial results computed by the vectorized loop.
4772 The reduction epilog code consists of:
4773
4774 step 1: compute the scalar result in a vector (v_out2)
4775 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4776 step 3: adjust the scalar result (s_out3) if needed.
4777
4778 Step 1 can be accomplished using one of the following three schemes:
4779 (scheme 1) using reduc_fn, if available.
4780 (scheme 2) using whole-vector shifts, if available.
4781 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4782 combined.
4783
4784 The overall epilog code looks like this:
4785
4786 s_out0 = phi <s_loop> # original EXIT_PHI
4787 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4788 v_out2 = reduce <v_out1> # step 1
4789 s_out3 = extract_field <v_out2, 0> # step 2
4790 s_out4 = adjust_result <s_out3> # step 3
4791
4792 (step 3 is optional, and steps 1 and 2 may be combined).
4793 Lastly, the uses of s_out0 are replaced by s_out4. */
4794
4795
4796 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4797 v_out1 = phi <VECT_DEF>
4798 Store them in NEW_PHIS. */
4799
4800 exit_bb = single_exit (loop)->dest;
4801 prev_phi_info = NULL;
4802 new_phis.create (vect_defs.length ());
4803 FOR_EACH_VEC_ELT (vect_defs, i, def)
4804 {
4805 for (j = 0; j < ncopies; j++)
4806 {
4807 tree new_def = copy_ssa_name (def);
4808 phi = create_phi_node (new_def, exit_bb);
4809 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4810 if (j == 0)
4811 new_phis.quick_push (phi);
4812 else
4813 {
4814 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4815 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4816 }
4817
4818 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4819 prev_phi_info = phi_info;
4820 }
4821 }
4822
4823 /* The epilogue is created for the outer-loop, i.e., for the loop being
4824 vectorized. Create exit phis for the outer loop. */
4825 if (double_reduc)
4826 {
4827 loop = outer_loop;
4828 exit_bb = single_exit (loop)->dest;
4829 inner_phis.create (vect_defs.length ());
4830 FOR_EACH_VEC_ELT (new_phis, i, phi)
4831 {
4832 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
4833 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4834 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4835 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4836 PHI_RESULT (phi));
4837 prev_phi_info = loop_vinfo->add_stmt (outer_phi);
4838 inner_phis.quick_push (phi_info);
4839 new_phis[i] = outer_phi;
4840 while (STMT_VINFO_RELATED_STMT (phi_info))
4841 {
4842 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4843 new_result = copy_ssa_name (PHI_RESULT (phi_info->stmt));
4844 outer_phi = create_phi_node (new_result, exit_bb);
4845 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4846 PHI_RESULT (phi_info->stmt));
4847 stmt_vec_info outer_phi_info = loop_vinfo->add_stmt (outer_phi);
4848 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi_info;
4849 prev_phi_info = outer_phi_info;
4850 }
4851 }
4852 }
4853
4854 exit_gsi = gsi_after_labels (exit_bb);
4855
4856 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4857 (i.e. when reduc_fn is not available) and in the final adjustment
4858 code (if needed). Also get the original scalar reduction variable as
4859 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4860 represents a reduction pattern), the tree-code and scalar-def are
4861 taken from the original stmt that the pattern-stmt (STMT) replaces.
4862 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4863 are taken from STMT. */
4864
4865 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4866 if (orig_stmt_info != stmt_info)
4867 {
4868 /* Reduction pattern */
4869 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4870 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4871 }
4872
4873 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4874 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4875 partial results are added and not subtracted. */
4876 if (code == MINUS_EXPR)
4877 code = PLUS_EXPR;
4878
4879 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4880 scalar_type = TREE_TYPE (scalar_dest);
4881 scalar_results.create (group_size);
4882 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4883 bitsize = TYPE_SIZE (scalar_type);
4884
4885 /* In case this is a reduction in an inner-loop while vectorizing an outer
4886 loop - we don't need to extract a single scalar result at the end of the
4887 inner-loop (unless it is double reduction, i.e., the use of reduction is
4888 outside the outer-loop). The final vector of partial results will be used
4889 in the vectorized outer-loop, or reduced to a scalar result at the end of
4890 the outer-loop. */
4891 if (nested_in_vect_loop && !double_reduc)
4892 goto vect_finalize_reduction;
4893
4894 /* SLP reduction without reduction chain, e.g.,
4895 # a1 = phi <a2, a0>
4896 # b1 = phi <b2, b0>
4897 a2 = operation (a1)
4898 b2 = operation (b1) */
4899 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4900
4901 /* True if we should implement SLP_REDUC using native reduction operations
4902 instead of scalar operations. */
4903 direct_slp_reduc = (reduc_fn != IFN_LAST
4904 && slp_reduc
4905 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4906
4907 /* In case of reduction chain, e.g.,
4908 # a1 = phi <a3, a0>
4909 a2 = operation (a1)
4910 a3 = operation (a2),
4911
4912 we may end up with more than one vector result. Here we reduce them to
4913 one vector. */
4914 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
4915 {
4916 tree first_vect = PHI_RESULT (new_phis[0]);
4917 gassign *new_vec_stmt = NULL;
4918 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4919 for (k = 1; k < new_phis.length (); k++)
4920 {
4921 gimple *next_phi = new_phis[k];
4922 tree second_vect = PHI_RESULT (next_phi);
4923 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4924 new_vec_stmt = gimple_build_assign (tem, code,
4925 first_vect, second_vect);
4926 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4927 first_vect = tem;
4928 }
4929
4930 new_phi_result = first_vect;
4931 if (new_vec_stmt)
4932 {
4933 new_phis.truncate (0);
4934 new_phis.safe_push (new_vec_stmt);
4935 }
4936 }
4937 /* Likewise if we couldn't use a single def-use cycle. */
4938 else if (ncopies > 1)
4939 {
4940 gcc_assert (new_phis.length () == 1);
4941 tree first_vect = PHI_RESULT (new_phis[0]);
4942 gassign *new_vec_stmt = NULL;
4943 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4944 stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
4945 for (int k = 1; k < ncopies; ++k)
4946 {
4947 next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
4948 tree second_vect = PHI_RESULT (next_phi_info->stmt);
4949 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4950 new_vec_stmt = gimple_build_assign (tem, code,
4951 first_vect, second_vect);
4952 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4953 first_vect = tem;
4954 }
4955 new_phi_result = first_vect;
4956 new_phis.truncate (0);
4957 new_phis.safe_push (new_vec_stmt);
4958 }
4959 else
4960 new_phi_result = PHI_RESULT (new_phis[0]);
4961
4962 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4963 && reduc_fn != IFN_LAST)
4964 {
4965 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4966 various data values where the condition matched and another vector
4967 (INDUCTION_INDEX) containing all the indexes of those matches. We
4968 need to extract the last matching index (which will be the index with
4969 highest value) and use this to index into the data vector.
4970 For the case where there were no matches, the data vector will contain
4971 all default values and the index vector will be all zeros. */
4972
4973 /* Get various versions of the type of the vector of indexes. */
4974 tree index_vec_type = TREE_TYPE (induction_index);
4975 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4976 tree index_scalar_type = TREE_TYPE (index_vec_type);
4977 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4978 (index_vec_type);
4979
4980 /* Get an unsigned integer version of the type of the data vector. */
4981 int scalar_precision
4982 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4983 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4984 tree vectype_unsigned = build_vector_type
4985 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4986
4987 /* First we need to create a vector (ZERO_VEC) of zeros and another
4988 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4989 can create using a MAX reduction and then expanding.
4990 In the case where the loop never made any matches, the max index will
4991 be zero. */
4992
4993 /* Vector of {0, 0, 0,...}. */
4994 tree zero_vec = make_ssa_name (vectype);
4995 tree zero_vec_rhs = build_zero_cst (vectype);
4996 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4997 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4998
4999 /* Find maximum value from the vector of found indexes. */
5000 tree max_index = make_ssa_name (index_scalar_type);
5001 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5002 1, induction_index);
5003 gimple_call_set_lhs (max_index_stmt, max_index);
5004 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5005
5006 /* Vector of {max_index, max_index, max_index,...}. */
5007 tree max_index_vec = make_ssa_name (index_vec_type);
5008 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5009 max_index);
5010 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5011 max_index_vec_rhs);
5012 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5013
5014 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5015 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5016 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
5017 otherwise. Only one value should match, resulting in a vector
5018 (VEC_COND) with one data value and the rest zeros.
5019 In the case where the loop never made any matches, every index will
5020 match, resulting in a vector with all data values (which will all be
5021 the default value). */
5022
5023 /* Compare the max index vector to the vector of found indexes to find
5024 the position of the max value. */
5025 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5026 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5027 induction_index,
5028 max_index_vec);
5029 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5030
5031 /* Use the compare to choose either values from the data vector or
5032 zero. */
5033 tree vec_cond = make_ssa_name (vectype);
5034 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5035 vec_compare, new_phi_result,
5036 zero_vec);
5037 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5038
5039 /* Finally we need to extract the data value from the vector (VEC_COND)
5040 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5041 reduction, but because this doesn't exist, we can use a MAX reduction
5042 instead. The data value might be signed or a float so we need to cast
5043 it first.
5044 In the case where the loop never made any matches, the data values are
5045 all identical, and so will reduce down correctly. */
5046
5047 /* Make the matched data values unsigned. */
5048 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5049 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5050 vec_cond);
5051 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5052 VIEW_CONVERT_EXPR,
5053 vec_cond_cast_rhs);
5054 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5055
5056 /* Reduce down to a scalar value. */
5057 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5058 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5059 1, vec_cond_cast);
5060 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5061 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5062
5063 /* Convert the reduced value back to the result type and set as the
5064 result. */
5065 gimple_seq stmts = NULL;
5066 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5067 data_reduc);
5068 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5069 scalar_results.safe_push (new_temp);
5070 }
5071 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5072 && reduc_fn == IFN_LAST)
5073 {
5074 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5075 idx = 0;
5076 idx_val = induction_index[0];
5077 val = data_reduc[0];
5078 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5079 if (induction_index[i] > idx_val)
5080 val = data_reduc[i], idx_val = induction_index[i];
5081 return val; */
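/* A minimal sketch of the statements built below, assuming a hypothetical
   two-element vector with element size SZ:
     idx_0 = BIT_FIELD_REF <induction_index, SZ, 0>;
     val_0 = BIT_FIELD_REF <new_phi_result, SZ, 0>;
     idx_1 = BIT_FIELD_REF <induction_index, SZ, SZ>;
     val_1 = BIT_FIELD_REF <new_phi_result, SZ, SZ>;
     val   = idx_1 > idx_0 ? val_1 : val_0;
   For the last element no MAX of the indexes is needed, which is why the
   MAX_EXPR below is skipped when OFF == V_SIZE - EL_SIZE.  */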
5082
5083 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5084 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5085 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5086 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5087 /* Enforced by vectorizable_reduction, which ensures we have target
5088 support before allowing a conditional reduction on variable-length
5089 vectors. */
5090 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5091 tree idx_val = NULL_TREE, val = NULL_TREE;
5092 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5093 {
5094 tree old_idx_val = idx_val;
5095 tree old_val = val;
5096 idx_val = make_ssa_name (idx_eltype);
5097 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5098 build3 (BIT_FIELD_REF, idx_eltype,
5099 induction_index,
5100 bitsize_int (el_size),
5101 bitsize_int (off)));
5102 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5103 val = make_ssa_name (data_eltype);
5104 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5105 build3 (BIT_FIELD_REF,
5106 data_eltype,
5107 new_phi_result,
5108 bitsize_int (el_size),
5109 bitsize_int (off)));
5110 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5111 if (off != 0)
5112 {
5113 tree new_idx_val = idx_val;
5114 if (off != v_size - el_size)
5115 {
5116 new_idx_val = make_ssa_name (idx_eltype);
5117 epilog_stmt = gimple_build_assign (new_idx_val,
5118 MAX_EXPR, idx_val,
5119 old_idx_val);
5120 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5121 }
5122 tree new_val = make_ssa_name (data_eltype);
5123 epilog_stmt = gimple_build_assign (new_val,
5124 COND_EXPR,
5125 build2 (GT_EXPR,
5126 boolean_type_node,
5127 idx_val,
5128 old_idx_val),
5129 val, old_val);
5130 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5131 idx_val = new_idx_val;
5132 val = new_val;
5133 }
5134 }
5135 /* Convert the reduced value back to the result type and set as the
5136 result. */
5137 gimple_seq stmts = NULL;
5138 val = gimple_convert (&stmts, scalar_type, val);
5139 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5140 scalar_results.safe_push (val);
5141 }
5142
5143 /* 2.3 Create the reduction code, using one of the three schemes described
5144 above. In SLP we simply need to extract all the elements from the
5145 vector (without reducing them), so we use scalar shifts. */
5146 else if (reduc_fn != IFN_LAST && !slp_reduc)
5147 {
5148 tree tmp;
5149 tree vec_elem_type;
5150
5151 /* Case 1: Create:
5152 v_out2 = reduc_expr <v_out1> */
5153
5154 if (dump_enabled_p ())
5155 dump_printf_loc (MSG_NOTE, vect_location,
5156 "Reduce using direct vector reduction.\n");
5157
5158 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5159 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5160 {
5161 tree tmp_dest
5162 = vect_create_destination_var (scalar_dest, vec_elem_type);
5163 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5164 new_phi_result);
5165 gimple_set_lhs (epilog_stmt, tmp_dest);
5166 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5167 gimple_set_lhs (epilog_stmt, new_temp);
5168 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5169
5170 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5171 new_temp);
5172 }
5173 else
5174 {
5175 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5176 new_phi_result);
5177 gimple_set_lhs (epilog_stmt, new_scalar_dest);
5178 }
5179
5180 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5181 gimple_set_lhs (epilog_stmt, new_temp);
5182 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5183
5184 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5185 == INTEGER_INDUC_COND_REDUCTION)
5186 && !operand_equal_p (initial_def, induc_val, 0))
5187 {
5188 /* Earlier we set the initial value to be a vector of induc_val
5189 values. Check the result and, if it is induc_val, replace it
5190 with the original initial value, unless induc_val is
5191 the same as initial_def already. */
5192 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5193 induc_val);
5194
5195 tmp = make_ssa_name (new_scalar_dest);
5196 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5197 initial_def, new_temp);
5198 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5199 new_temp = tmp;
5200 }
5201
5202 scalar_results.safe_push (new_temp);
5203 }
5204 else if (direct_slp_reduc)
5205 {
5206 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5207 with the elements for other SLP statements replaced with the
5208 neutral value. We can then do a normal reduction on each vector. */
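/* Illustration (hypothetical values, assuming REDUC_GROUP_SIZE == 2, a
   four-element vector NEW_PHI_RESULT = { a0, b0, a1, b1 } and a PLUS
   reduction with neutral value 0).  The masked index vector built below is
   { 0, 1, 0, 1 }, so the loop generates:
     i == 0:  vec = { a0, 0, a1, 0 }  -->  scalar result a0 + a1
     i == 1:  vec = { 0, b0, 0, b1 }  -->  scalar result b0 + b1
   i.e. one scalar result per SLP statement.  */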
5209
5210 /* Enforced by vectorizable_reduction. */
5211 gcc_assert (new_phis.length () == 1);
5212 gcc_assert (pow2p_hwi (group_size));
5213
5214 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5215 vec<stmt_vec_info> orig_phis
5216 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5217 gimple_seq seq = NULL;
5218
5219 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5220 and the same element size as VECTYPE. */
5221 tree index = build_index_vector (vectype, 0, 1);
5222 tree index_type = TREE_TYPE (index);
5223 tree index_elt_type = TREE_TYPE (index_type);
5224 tree mask_type = build_same_sized_truth_vector_type (index_type);
5225
5226 /* Create a vector that, for each element, identifies which of
5227 the REDUC_GROUP_SIZE results should use it. */
5228 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5229 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5230 build_vector_from_val (index_type, index_mask));
5231
5232 /* Get a neutral vector value. This is simply a splat of the neutral
5233 scalar value if we have one, otherwise the initial scalar value
5234 is itself a neutral value. */
5235 tree vector_identity = NULL_TREE;
5236 if (neutral_op)
5237 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5238 neutral_op);
5239 for (unsigned int i = 0; i < group_size; ++i)
5240 {
5241 /* If there's no universal neutral value, we can use the
5242 initial scalar value from the original PHI. This is used
5243 for MIN and MAX reduction, for example. */
5244 if (!neutral_op)
5245 {
5246 tree scalar_value
5247 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5248 loop_preheader_edge (loop));
5249 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5250 scalar_value);
5251 }
5252
5253 /* Calculate the equivalent of:
5254
5255 sel[j] = (index[j] == i);
5256
5257 which selects the elements of NEW_PHI_RESULT that should
5258 be included in the result. */
5259 tree compare_val = build_int_cst (index_elt_type, i);
5260 compare_val = build_vector_from_val (index_type, compare_val);
5261 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5262 index, compare_val);
5263
5264 /* Calculate the equivalent of:
5265
5266 vec = sel ? new_phi_result : vector_identity;
5267
5268 VEC is now suitable for a full vector reduction. */
5269 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5270 sel, new_phi_result, vector_identity);
5271
5272 /* Do the reduction and convert it to the appropriate type. */
5273 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5274 TREE_TYPE (vectype), vec);
5275 scalar = gimple_convert (&seq, scalar_type, scalar);
5276 scalar_results.safe_push (scalar);
5277 }
5278 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5279 }
5280 else
5281 {
5282 bool reduce_with_shift;
5283 tree vec_temp;
5284
5285 /* COND reductions all do the final reduction with MAX_EXPR
5286 or MIN_EXPR. */
5287 if (code == COND_EXPR)
5288 {
5289 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5290 == INTEGER_INDUC_COND_REDUCTION)
5291 code = induc_code;
5292 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5293 == CONST_COND_REDUCTION)
5294 code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
5295 else
5296 code = MAX_EXPR;
5297 }
5298
5299 /* See if the target wants to do the final (shift) reduction
5300 in a vector mode of smaller size and first reduce upper/lower
5301 halves against each other. */
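/* Sketch of the effect (target-dependent; shown for a hypothetical 256-bit
   vector on a target whose split_reduction hook prefers 128-bit vectors):
   the loop below first emits
     dst1     = lowpart  <new_temp>   (128 bits)
     dst2     = highpart <new_temp>   (128 bits)
     new_temp = dst1 CODE dst2
   and only the resulting narrower vector is fed to the shift-based or
   scalar reduction that follows.  */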
5302 enum machine_mode mode1 = mode;
5303 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5304 unsigned sz1 = sz;
5305 if (!slp_reduc
5306 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5307 sz1 = GET_MODE_SIZE (mode1).to_constant ();
5308
5309 tree vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5310 reduce_with_shift = have_whole_vector_shift (mode1);
5311 if (!VECTOR_MODE_P (mode1))
5312 reduce_with_shift = false;
5313 else
5314 {
5315 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5316 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5317 reduce_with_shift = false;
5318 }
5319
5320 /* First reduce the vector to the vector size we want to do the shift
5321 reduction on, by repeatedly combining the upper and lower halves. */
5322 new_temp = new_phi_result;
5323 while (sz > sz1)
5324 {
5325 gcc_assert (!slp_reduc);
5326 sz /= 2;
5327 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5328
5329 /* The target has to make sure we support lowpart/highpart
5330 extraction, either via direct vector extract or through
5331 integer mode punning. */
5332 tree dst1, dst2;
5333 if (convert_optab_handler (vec_extract_optab,
5334 TYPE_MODE (TREE_TYPE (new_temp)),
5335 TYPE_MODE (vectype1))
5336 != CODE_FOR_nothing)
5337 {
5338 /* Extract sub-vectors directly once vec_extract becomes
5339 a conversion optab. */
5340 dst1 = make_ssa_name (vectype1);
5341 epilog_stmt
5342 = gimple_build_assign (dst1, BIT_FIELD_REF,
5343 build3 (BIT_FIELD_REF, vectype1,
5344 new_temp, TYPE_SIZE (vectype1),
5345 bitsize_int (0)));
5346 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5347 dst2 = make_ssa_name (vectype1);
5348 epilog_stmt
5349 = gimple_build_assign (dst2, BIT_FIELD_REF,
5350 build3 (BIT_FIELD_REF, vectype1,
5351 new_temp, TYPE_SIZE (vectype1),
5352 bitsize_int (sz * BITS_PER_UNIT)));
5353 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5354 }
5355 else
5356 {
5357 /* Extract via punning to appropriately sized integer mode
5358 vector. */
5359 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5360 1);
5361 tree etype = build_vector_type (eltype, 2);
5362 gcc_assert (convert_optab_handler (vec_extract_optab,
5363 TYPE_MODE (etype),
5364 TYPE_MODE (eltype))
5365 != CODE_FOR_nothing);
5366 tree tem = make_ssa_name (etype);
5367 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5368 build1 (VIEW_CONVERT_EXPR,
5369 etype, new_temp));
5370 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5371 new_temp = tem;
5372 tem = make_ssa_name (eltype);
5373 epilog_stmt
5374 = gimple_build_assign (tem, BIT_FIELD_REF,
5375 build3 (BIT_FIELD_REF, eltype,
5376 new_temp, TYPE_SIZE (eltype),
5377 bitsize_int (0)));
5378 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5379 dst1 = make_ssa_name (vectype1);
5380 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5381 build1 (VIEW_CONVERT_EXPR,
5382 vectype1, tem));
5383 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5384 tem = make_ssa_name (eltype);
5385 epilog_stmt
5386 = gimple_build_assign (tem, BIT_FIELD_REF,
5387 build3 (BIT_FIELD_REF, eltype,
5388 new_temp, TYPE_SIZE (eltype),
5389 bitsize_int (sz * BITS_PER_UNIT)));
5390 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5391 dst2 = make_ssa_name (vectype1);
5392 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5393 build1 (VIEW_CONVERT_EXPR,
5394 vectype1, tem));
5395 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5396 }
5397
5398 new_temp = make_ssa_name (vectype1);
5399 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5400 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5401 }
5402
5403 if (reduce_with_shift && !slp_reduc)
5404 {
5405 int element_bitsize = tree_to_uhwi (bitsize);
5406 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5407 for variable-length vectors and also requires direct target support
5408 for loop reductions. */
5409 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5410 int nelements = vec_size_in_bits / element_bitsize;
5411 vec_perm_builder sel;
5412 vec_perm_indices indices;
5413
5414 int elt_offset;
5415
5416 tree zero_vec = build_zero_cst (vectype1);
5417 /* Case 2: Create:
5418 for (offset = nelements/2; offset >= 1; offset/=2)
5419 {
5420 Create: va' = vec_shift <va, offset>
5421 Create: va = vop <va, va'>
5422 } */
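/* Worked example (hypothetical four-element vector { a, b, c, d } with
   CODE == PLUS_EXPR, "_" marking don't-care lanes):
     offset 2:  va' = { c, d, 0, 0 }    va = { a+c, b+d, _, _ }
     offset 1:  va' = { b+d, _, 0, 0 }  va = { a+b+c+d, _, _, _ }
   The final scalar result is then element 0 of VA (see 2.4 below).  */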
5423
5424 tree rhs;
5425
5426 if (dump_enabled_p ())
5427 dump_printf_loc (MSG_NOTE, vect_location,
5428 "Reduce using vector shifts\n");
5429
5430 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5431 for (elt_offset = nelements / 2;
5432 elt_offset >= 1;
5433 elt_offset /= 2)
5434 {
5435 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5436 indices.new_vector (sel, 2, nelements);
5437 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5438 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5439 new_temp, zero_vec, mask);
5440 new_name = make_ssa_name (vec_dest, epilog_stmt);
5441 gimple_assign_set_lhs (epilog_stmt, new_name);
5442 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5443
5444 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5445 new_temp);
5446 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5447 gimple_assign_set_lhs (epilog_stmt, new_temp);
5448 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5449 }
5450
5451 /* 2.4 Extract the final scalar result. Create:
5452 s_out3 = extract_field <v_out2, bitpos> */
5453
5454 if (dump_enabled_p ())
5455 dump_printf_loc (MSG_NOTE, vect_location,
5456 "extract scalar result\n");
5457
5458 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5459 bitsize, bitsize_zero_node);
5460 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5461 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5462 gimple_assign_set_lhs (epilog_stmt, new_temp);
5463 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5464 scalar_results.safe_push (new_temp);
5465 }
5466 else
5467 {
5468 /* Case 3: Create:
5469 s = extract_field <v_out2, 0>
5470 for (offset = element_size;
5471 offset < vector_size;
5472 offset += element_size;)
5473 {
5474 Create: s' = extract_field <v_out2, offset>
5475 Create: s = op <s, s'> // For non SLP cases
5476 } */
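/* Illustration for the non-SLP case, assuming a hypothetical four-element
   vector accumulator { a, b, c, d }:
     s = a;  s = op <s, b>;  s = op <s, c>;  s = op <s, d>;
   For SLP the four extracted values are instead pushed to SCALAR_RESULTS
   without being combined.  */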
5477
5478 if (dump_enabled_p ())
5479 dump_printf_loc (MSG_NOTE, vect_location,
5480 "Reduce using scalar code.\n");
5481
5482 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5483 int element_bitsize = tree_to_uhwi (bitsize);
5484 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5485 {
5486 int bit_offset;
5487 if (gimple_code (new_phi) == GIMPLE_PHI)
5488 vec_temp = PHI_RESULT (new_phi);
5489 else
5490 vec_temp = gimple_assign_lhs (new_phi);
5491 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5492 bitsize_zero_node);
5493 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5494 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5495 gimple_assign_set_lhs (epilog_stmt, new_temp);
5496 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5497
5498 /* In SLP we don't need to apply the reduction operation, so we just
5499 collect s' values in SCALAR_RESULTS. */
5500 if (slp_reduc)
5501 scalar_results.safe_push (new_temp);
5502
5503 for (bit_offset = element_bitsize;
5504 bit_offset < vec_size_in_bits;
5505 bit_offset += element_bitsize)
5506 {
5507 tree bitpos = bitsize_int (bit_offset);
5508 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5509 bitsize, bitpos);
5510
5511 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5512 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5513 gimple_assign_set_lhs (epilog_stmt, new_name);
5514 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5515
5516 if (slp_reduc)
5517 {
5518 /* In SLP we don't need to apply the reduction operation, so
5519 we just collect s' values in SCALAR_RESULTS. */
5520 new_temp = new_name;
5521 scalar_results.safe_push (new_name);
5522 }
5523 else
5524 {
5525 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5526 new_name, new_temp);
5527 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5528 gimple_assign_set_lhs (epilog_stmt, new_temp);
5529 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5530 }
5531 }
5532 }
5533
5534 /* The only case where we need to reduce scalar results in SLP is
5535 unrolling. If the size of SCALAR_RESULTS is greater than
5536 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5537 REDUC_GROUP_SIZE. */
5538 if (slp_reduc)
5539 {
5540 tree res, first_res, new_res;
5541 gimple *new_stmt;
5542
5543 /* Reduce multiple scalar results in case of SLP unrolling. */
5544 for (j = group_size; scalar_results.iterate (j, &res);
5545 j++)
5546 {
5547 first_res = scalar_results[j % group_size];
5548 new_stmt = gimple_build_assign (new_scalar_dest, code,
5549 first_res, res);
5550 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5551 gimple_assign_set_lhs (new_stmt, new_res);
5552 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5553 scalar_results[j % group_size] = new_res;
5554 }
5555 }
5556 else
5557 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5558 scalar_results.safe_push (new_temp);
5559 }
5560
5561 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5562 == INTEGER_INDUC_COND_REDUCTION)
5563 && !operand_equal_p (initial_def, induc_val, 0))
5564 {
5565 /* Earlier we set the initial value to be a vector of induc_val
5566 values. Check the result and, if it is induc_val, replace it
5567 with the original initial value, unless induc_val is
5568 the same as initial_def already. */
5569 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5570 induc_val);
5571
5572 tree tmp = make_ssa_name (new_scalar_dest);
5573 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5574 initial_def, new_temp);
5575 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5576 scalar_results[0] = tmp;
5577 }
5578 }
5579
5580 vect_finalize_reduction:
5581
5582 if (double_reduc)
5583 loop = loop->inner;
5584
5585 /* 2.5 Adjust the final result by the initial value of the reduction
5586 variable. (When such adjustment is not needed, then
5587 'adjustment_def' is zero). For example, if code is PLUS we create:
5588 new_temp = loop_exit_def + adjustment_def */
5589
5590 if (adjustment_def)
5591 {
5592 gcc_assert (!slp_reduc);
5593 if (nested_in_vect_loop)
5594 {
5595 new_phi = new_phis[0];
5596 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5597 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5598 new_dest = vect_create_destination_var (scalar_dest, vectype);
5599 }
5600 else
5601 {
5602 new_temp = scalar_results[0];
5603 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5604 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5605 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5606 }
5607
5608 epilog_stmt = gimple_build_assign (new_dest, expr);
5609 new_temp = make_ssa_name (new_dest, epilog_stmt);
5610 gimple_assign_set_lhs (epilog_stmt, new_temp);
5611 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5612 if (nested_in_vect_loop)
5613 {
5614 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5615 STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5616 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5617
5618 if (!double_reduc)
5619 scalar_results.quick_push (new_temp);
5620 else
5621 scalar_results[0] = new_temp;
5622 }
5623 else
5624 scalar_results[0] = new_temp;
5625
5626 new_phis[0] = epilog_stmt;
5627 }
5628
5629 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5630 phis with new adjusted scalar results, i.e., replace use <s_out0>
5631 with use <s_out4>.
5632
5633 Transform:
5634 loop_exit:
5635 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5636 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5637 v_out2 = reduce <v_out1>
5638 s_out3 = extract_field <v_out2, 0>
5639 s_out4 = adjust_result <s_out3>
5640 use <s_out0>
5641 use <s_out0>
5642
5643 into:
5644
5645 loop_exit:
5646 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5647 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5648 v_out2 = reduce <v_out1>
5649 s_out3 = extract_field <v_out2, 0>
5650 s_out4 = adjust_result <s_out3>
5651 use <s_out4>
5652 use <s_out4> */
5653
5654
5655 /* In an SLP reduction chain we reduce the vector results into one vector if
5656 necessary, hence we set REDUC_GROUP_SIZE to 1 here. SCALAR_DEST is the
5657 LHS of the last stmt in the reduction chain, since we are looking for
5658 the loop exit phi node. */
5659 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5660 {
5661 stmt_vec_info dest_stmt_info
5662 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5663 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5664 group_size = 1;
5665 }
5666
5667 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5668 case REDUC_GROUP_SIZE is greater than the vectorization factor).
5669 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5670 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5671 correspond to the first vector stmt, etc.
5672 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
5673 if (group_size > new_phis.length ())
5674 {
5675 ratio = group_size / new_phis.length ();
5676 gcc_assert (!(group_size % new_phis.length ()));
5677 }
5678 else
5679 ratio = 1;
5680
5681 stmt_vec_info epilog_stmt_info = NULL;
5682 for (k = 0; k < group_size; k++)
5683 {
5684 if (k % ratio == 0)
5685 {
5686 epilog_stmt_info = loop_vinfo->lookup_stmt (new_phis[k / ratio]);
5687 reduction_phi_info = reduction_phis[k / ratio];
5688 if (double_reduc)
5689 inner_phi = inner_phis[k / ratio];
5690 }
5691
5692 if (slp_reduc)
5693 {
5694 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5695
5696 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5697 /* SLP statements can't participate in patterns. */
5698 gcc_assert (!orig_stmt_info);
5699 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5700 }
5701
5702 phis.create (3);
5703 /* Find the loop-closed-use at the loop exit of the original scalar
5704 result. (The reduction result is expected to have two immediate uses -
5705 one at the latch block, and one at the loop exit). */
5706 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5707 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5708 && !is_gimple_debug (USE_STMT (use_p)))
5709 phis.safe_push (USE_STMT (use_p));
5710
5711 /* While we expect to have found an exit_phi because of loop-closed-ssa
5712 form we can end up without one if the scalar cycle is dead. */
5713
5714 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5715 {
5716 if (outer_loop)
5717 {
5718 stmt_vec_info exit_phi_vinfo
5719 = loop_vinfo->lookup_stmt (exit_phi);
5720 gphi *vect_phi;
5721
5722 if (double_reduc)
5723 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5724 else
5725 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt_info;
5726 if (!double_reduc
5727 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5728 != vect_double_reduction_def)
5729 continue;
5730
5731 /* Handle double reduction:
5732
5733 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5734 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5735 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5736 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5737
5738 At that point the regular reduction (stmt2 and stmt3) is
5739 already vectorized, as well as the exit phi node, stmt4.
5740 Here we vectorize the phi node of double reduction, stmt1, and
5741 update all relevant statements. */
5742
5743 /* Go through all the uses of s2 to find double reduction phi
5744 node, i.e., stmt1 above. */
5745 orig_name = PHI_RESULT (exit_phi);
5746 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5747 {
5748 stmt_vec_info use_stmt_vinfo;
5749 tree vect_phi_init, preheader_arg, vect_phi_res;
5750 basic_block bb = gimple_bb (use_stmt);
5751
5752 /* Check that USE_STMT is really a double reduction phi
5753 node. */
5754 if (gimple_code (use_stmt) != GIMPLE_PHI
5755 || gimple_phi_num_args (use_stmt) != 2
5756 || bb->loop_father != outer_loop)
5757 continue;
5758 use_stmt_vinfo = loop_vinfo->lookup_stmt (use_stmt);
5759 if (!use_stmt_vinfo
5760 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5761 != vect_double_reduction_def)
5762 continue;
5763
5764 /* Create vector phi node for double reduction:
5765 vs1 = phi <vs0, vs2>
5766 vs1 was created previously in this function by a call to
5767 vect_get_vec_def_for_operand and is stored in
5768 vec_initial_def;
5769 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5770 vs0 is created here. */
5771
5772 /* Create vector phi node. */
5773 vect_phi = create_phi_node (vec_initial_def, bb);
5774 loop_vec_info_for_loop (outer_loop)->add_stmt (vect_phi);
5775
5776 /* Create vs0 - initial def of the double reduction phi. */
5777 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5778 loop_preheader_edge (outer_loop));
5779 vect_phi_init = get_initial_def_for_reduction
5780 (stmt_info, preheader_arg, NULL);
5781
5782 /* Update phi node arguments with vs0 and vs2. */
5783 add_phi_arg (vect_phi, vect_phi_init,
5784 loop_preheader_edge (outer_loop),
5785 UNKNOWN_LOCATION);
5786 add_phi_arg (vect_phi, PHI_RESULT (inner_phi->stmt),
5787 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5788 if (dump_enabled_p ())
5789 dump_printf_loc (MSG_NOTE, vect_location,
5790 "created double reduction phi node: %G",
5791 vect_phi);
5792
5793 vect_phi_res = PHI_RESULT (vect_phi);
5794
5795 /* Replace the use, i.e., set the correct vs1 in the regular
5796 reduction phi node. FORNOW, NCOPIES is always 1, so the
5797 loop is redundant. */
5798 stmt_vec_info use_info = reduction_phi_info;
5799 for (j = 0; j < ncopies; j++)
5800 {
5801 edge pr_edge = loop_preheader_edge (loop);
5802 SET_PHI_ARG_DEF (as_a <gphi *> (use_info->stmt),
5803 pr_edge->dest_idx, vect_phi_res);
5804 use_info = STMT_VINFO_RELATED_STMT (use_info);
5805 }
5806 }
5807 }
5808 }
5809
5810 phis.release ();
5811 if (nested_in_vect_loop)
5812 {
5813 if (double_reduc)
5814 loop = outer_loop;
5815 else
5816 continue;
5817 }
5818
5819 phis.create (3);
5820 /* Find the loop-closed-use at the loop exit of the original scalar
5821 result. (The reduction result is expected to have two immediate uses,
5822 one at the latch block, and one at the loop exit). For double
5823 reductions we are looking for exit phis of the outer loop. */
5824 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5825 {
5826 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5827 {
5828 if (!is_gimple_debug (USE_STMT (use_p)))
5829 phis.safe_push (USE_STMT (use_p));
5830 }
5831 else
5832 {
5833 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5834 {
5835 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5836
5837 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5838 {
5839 if (!flow_bb_inside_loop_p (loop,
5840 gimple_bb (USE_STMT (phi_use_p)))
5841 && !is_gimple_debug (USE_STMT (phi_use_p)))
5842 phis.safe_push (USE_STMT (phi_use_p));
5843 }
5844 }
5845 }
5846 }
5847
5848 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5849 {
5850 /* Replace the uses: */
5851 orig_name = PHI_RESULT (exit_phi);
5852 scalar_result = scalar_results[k];
5853 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5854 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5855 SET_USE (use_p, scalar_result);
5856 }
5857
5858 phis.release ();
5859 }
5860 }
5861
5862 /* Return a vector of type VECTYPE that is equal to the vector select
5863 operation "MASK ? VEC : IDENTITY". Insert the select statements
5864 before GSI. */
5865
5866 static tree
5867 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5868 tree vec, tree identity)
5869 {
5870 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5871 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5872 mask, vec, identity);
5873 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5874 return cond;
5875 }
5876
5877 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5878 order, starting with LHS. Insert the extraction statements before GSI and
5879 associate the new scalar SSA names with variable SCALAR_DEST.
5880 Return the SSA name for the result. */
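/* For instance (a sketch only), with CODE == PLUS_EXPR and a four-element
   VECTOR_RHS this expands to:
     s_0 = BIT_FIELD_REF <vector_rhs, bitsize, 0>;
     l_0 = lhs + s_0;
     s_1 = BIT_FIELD_REF <vector_rhs, bitsize, bitsize>;
     l_1 = l_0 + s_1;
     ...
   and the SSA name of the final accumulator is returned.  */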
5881
5882 static tree
5883 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5884 tree_code code, tree lhs, tree vector_rhs)
5885 {
5886 tree vectype = TREE_TYPE (vector_rhs);
5887 tree scalar_type = TREE_TYPE (vectype);
5888 tree bitsize = TYPE_SIZE (scalar_type);
5889 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5890 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5891
5892 for (unsigned HOST_WIDE_INT bit_offset = 0;
5893 bit_offset < vec_size_in_bits;
5894 bit_offset += element_bitsize)
5895 {
5896 tree bitpos = bitsize_int (bit_offset);
5897 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5898 bitsize, bitpos);
5899
5900 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5901 rhs = make_ssa_name (scalar_dest, stmt);
5902 gimple_assign_set_lhs (stmt, rhs);
5903 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5904
5905 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5906 tree new_name = make_ssa_name (scalar_dest, stmt);
5907 gimple_assign_set_lhs (stmt, new_name);
5908 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5909 lhs = new_name;
5910 }
5911 return lhs;
5912 }
5913
5914 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
5915 type of the vector input. */
5916
5917 static internal_fn
5918 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
5919 {
5920 internal_fn mask_reduc_fn;
5921
5922 switch (reduc_fn)
5923 {
5924 case IFN_FOLD_LEFT_PLUS:
5925 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
5926 break;
5927
5928 default:
5929 return IFN_LAST;
5930 }
5931
5932 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
5933 OPTIMIZE_FOR_SPEED))
5934 return mask_reduc_fn;
5935 return IFN_LAST;
5936 }
5937
5938 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5939 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5940 statement. CODE is the operation performed by STMT_INFO and OPS are
5941 its scalar operands. REDUC_INDEX is the index of the operand in
5942 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5943 implements in-order reduction, or IFN_LAST if we should open-code it.
5944 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5945 that should be used to control the operation in a fully-masked loop. */
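/* For example (sketch of the common non-SLP, unmasked case with
   REDUC_FN == IFN_FOLD_LEFT_PLUS), the transformation below emits a single
     res = FOLD_LEFT_PLUS <phi_result, vec_def>;
   which the target expands as the strictly in-order sum
     ((...((phi_result + v[0]) + v[1])...) + v[n-1]).  */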
5946
5947 static bool
5948 vectorize_fold_left_reduction (stmt_vec_info stmt_info,
5949 gimple_stmt_iterator *gsi,
5950 stmt_vec_info *vec_stmt, slp_tree slp_node,
5951 gimple *reduc_def_stmt,
5952 tree_code code, internal_fn reduc_fn,
5953 tree ops[3], tree vectype_in,
5954 int reduc_index, vec_loop_masks *masks)
5955 {
5956 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5957 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5958 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5959 stmt_vec_info new_stmt_info = NULL;
5960 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
5961
5962 int ncopies;
5963 if (slp_node)
5964 ncopies = 1;
5965 else
5966 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5967
5968 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5969 gcc_assert (ncopies == 1);
5970 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5971 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5972 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5973 == FOLD_LEFT_REDUCTION);
5974
5975 if (slp_node)
5976 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5977 TYPE_VECTOR_SUBPARTS (vectype_in)));
5978
5979 tree op0 = ops[1 - reduc_index];
5980
5981 int group_size = 1;
5982 stmt_vec_info scalar_dest_def_info;
5983 auto_vec<tree> vec_oprnds0;
5984 if (slp_node)
5985 {
5986 auto_vec<vec<tree> > vec_defs (2);
5987 auto_vec<tree> sops (2);
5988 sops.quick_push (ops[0]);
5989 sops.quick_push (ops[1]);
5990 vect_get_slp_defs (sops, slp_node, &vec_defs);
5991 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
5992 vec_defs[0].release ();
5993 vec_defs[1].release ();
5994 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5995 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5996 }
5997 else
5998 {
5999 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt_info);
6000 vec_oprnds0.create (1);
6001 vec_oprnds0.quick_push (loop_vec_def0);
6002 scalar_dest_def_info = stmt_info;
6003 }
6004
6005 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6006 tree scalar_type = TREE_TYPE (scalar_dest);
6007 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6008
6009 int vec_num = vec_oprnds0.length ();
6010 gcc_assert (vec_num == 1 || slp_node);
6011 tree vec_elem_type = TREE_TYPE (vectype_out);
6012 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6013
6014 tree vector_identity = NULL_TREE;
6015 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6016 vector_identity = build_zero_cst (vectype_out);
6017
6018 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6019 int i;
6020 tree def0;
6021 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6022 {
6023 gimple *new_stmt;
6024 tree mask = NULL_TREE;
6025 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6026 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6027
6028 /* Handle MINUS by adding the negative. */
6029 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6030 {
6031 tree negated = make_ssa_name (vectype_out);
6032 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6033 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6034 def0 = negated;
6035 }
6036
6037 if (mask && mask_reduc_fn == IFN_LAST)
6038 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6039 vector_identity);
6040
6041 /* On the first iteration the input is simply the scalar phi
6042 result, and for subsequent iterations it is the output of
6043 the preceding operation. */
6044 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6045 {
6046 if (mask && mask_reduc_fn != IFN_LAST)
6047 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6048 def0, mask);
6049 else
6050 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6051 def0);
6052 /* For chained SLP reductions the output of the previous reduction
6053 operation serves as the input of the next. For the final statement
6054 the output cannot be a temporary - we reuse the original
6055 scalar destination of the last statement. */
6056 if (i != vec_num - 1)
6057 {
6058 gimple_set_lhs (new_stmt, scalar_dest_var);
6059 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6060 gimple_set_lhs (new_stmt, reduc_var);
6061 }
6062 }
6063 else
6064 {
6065 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6066 reduc_var, def0);
6067 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6068 /* Remove the statement, so that we can use the same code paths
6069 as for statements that we've just created. */
6070 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6071 gsi_remove (&tmp_gsi, true);
6072 }
6073
6074 if (i == vec_num - 1)
6075 {
6076 gimple_set_lhs (new_stmt, scalar_dest);
6077 new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
6078 new_stmt);
6079 }
6080 else
6081 new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
6082 new_stmt, gsi);
6083
6084 if (slp_node)
6085 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
6086 }
6087
6088 if (!slp_node)
6089 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
6090
6091 return true;
6092 }
6093
6094 /* Function is_nonwrapping_integer_induction.
6095
6096 Check if STMT_VINFO (which is part of loop LOOP) both increments and
6097 does not cause overflow. */
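/* Illustrative numbers (not taken from any particular testcase): for an
   unsigned char IV with BASE == 10, STEP == 3 and at most NI == 50
   iterations, the largest value is 10 + 3 * 50 == 160, which needs 8 bits
   and therefore fits; with NI == 100 the maximum would be 310, which needs
   9 bits, so the function would return false.  */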
6098
6099 static bool
6100 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, struct loop *loop)
6101 {
6102 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6103 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6104 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6105 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6106 widest_int ni, max_loop_value, lhs_max;
6107 wi::overflow_type overflow = wi::OVF_NONE;
6108
6109 /* Make sure the loop is integer based. */
6110 if (TREE_CODE (base) != INTEGER_CST
6111 || TREE_CODE (step) != INTEGER_CST)
6112 return false;
6113
6114 /* Check that the max size of the loop will not wrap. */
6115
6116 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6117 return true;
6118
6119 if (! max_stmt_executions (loop, &ni))
6120 return false;
6121
6122 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6123 &overflow);
6124 if (overflow)
6125 return false;
6126
6127 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6128 TYPE_SIGN (lhs_type), &overflow);
6129 if (overflow)
6130 return false;
6131
6132 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6133 <= TYPE_PRECISION (lhs_type));
6134 }
6135
6136 /* Check if masking can be supported by inserting a conditional expression.
6137 CODE is the code for the operation. COND_FN is the conditional internal
6138 function, if it exists. VECTYPE_IN is the type of the vector input. */
6139 static bool
6140 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
6141 tree vectype_in)
6142 {
6143 if (cond_fn != IFN_LAST
6144 && direct_internal_fn_supported_p (cond_fn, vectype_in,
6145 OPTIMIZE_FOR_SPEED))
6146 return false;
6147
6148 switch (code)
6149 {
6150 case DOT_PROD_EXPR:
6151 case SAD_EXPR:
6152 return true;
6153
6154 default:
6155 return false;
6156 }
6157 }
6158
6159 /* Insert a conditional expression to enable masked vectorization. CODE is the
6160 code for the operation. VOP is the array of operands. MASK is the loop
6161 mask. GSI is a statement iterator used to place the new conditional
6162 expression. */
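/* For instance, for DOT_PROD_EXPR the statement built below replaces VOP[1]
   by (MASK ? VOP[1] : { 0, ... }), so inactive lanes add zero to the
   accumulator; for SAD_EXPR VOP[1] is replaced by (MASK ? VOP[1] : VOP[0]),
   making the absolute difference of inactive lanes zero.  */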
6163 static void
6164 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
6165 gimple_stmt_iterator *gsi)
6166 {
6167 switch (code)
6168 {
6169 case DOT_PROD_EXPR:
6170 {
6171 tree vectype = TREE_TYPE (vop[1]);
6172 tree zero = build_zero_cst (vectype);
6173 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6174 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6175 mask, vop[1], zero);
6176 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6177 vop[1] = masked_op1;
6178 break;
6179 }
6180
6181 case SAD_EXPR:
6182 {
6183 tree vectype = TREE_TYPE (vop[1]);
6184 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6185 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6186 mask, vop[1], vop[0]);
6187 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6188 vop[1] = masked_op1;
6189 break;
6190 }
6191
6192 default:
6193 gcc_unreachable ();
6194 }
6195 }
6196
6197 /* Function vectorizable_reduction.
6198
6199 Check if STMT_INFO performs a reduction operation that can be vectorized.
6200 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6201 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6202 Return true if STMT_INFO is vectorizable in this way.
6203
6204 This function also handles reduction idioms (patterns) that have been
6205 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6206 may be of this form:
6207 X = pattern_expr (arg0, arg1, ..., X)
6208 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6209 sequence that had been detected and replaced by the pattern-stmt
6210 (STMT_INFO).
6211
6212 This function also handles reduction of condition expressions, for example:
6213 for (int i = 0; i < N; i++)
6214 if (a[i] < value)
6215 last = a[i];
6216 This is handled by vectorising the loop and creating an additional vector
6217 containing the loop indexes for which "a[i] < value" was true. In the
6218 function epilogue this is reduced to a single max value and then used to
6219 index into the vector of results.
6220
6221 In some cases of reduction patterns, the type of the reduction variable X is
6222 different than the type of the other arguments of STMT_INFO.
6223 In such cases, the vectype that is used when transforming STMT_INFO into
6224 a vector stmt is different than the vectype that is used to determine the
6225 vectorization factor, because it consists of a different number of elements
6226 than the actual number of elements that are being operated upon in parallel.
6227
6228 For example, consider an accumulation of shorts into an int accumulator.
6229 On some targets it's possible to vectorize this pattern operating on 8
6230 shorts at a time (hence, the vectype for purposes of determining the
6231 vectorization factor should be V8HI); on the other hand, the vectype that
6232 is used to create the vector form is actually V4SI (the type of the result).
6233
6234 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6235 indicates what is the actual level of parallelism (V8HI in the example), so
6236 that the right vectorization factor would be derived. This vectype
6237 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6238 be used to create the vectorized stmt. The right vectype for the vectorized
6239 stmt is obtained from the type of the result X:
6240 get_vectype_for_scalar_type (TREE_TYPE (X))
6241
6242 This means that, contrary to "regular" reductions (or "regular" stmts in
6243 general), the following equation:
6244 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6245 does *NOT* necessarily hold for reduction patterns. */
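/* A concrete (purely illustrative) instance of the short-into-int
   accumulation discussed above:

     short s[N]; int sum = 0;
     for (i = 0; i < N; i++)
       sum += s[i];

   Here STMT_VINFO_VECTYPE would be V8HI (eight shorts per vector iteration,
   determining the vectorization factor), while the vectorized statement
   itself produces V4SI accumulators, derived from the type of SUM.  */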
6246
6247 bool
6248 vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
6249 stmt_vec_info *vec_stmt, slp_tree slp_node,
6250 slp_instance slp_node_instance,
6251 stmt_vector_for_cost *cost_vec)
6252 {
6253 tree vec_dest;
6254 tree scalar_dest;
6255 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6256 tree vectype_in = NULL_TREE;
6257 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6258 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6259 enum tree_code code, orig_code;
6260 internal_fn reduc_fn;
6261 machine_mode vec_mode;
6262 int op_type;
6263 optab optab;
6264 tree new_temp = NULL_TREE;
6265 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6266 stmt_vec_info cond_stmt_vinfo = NULL;
6267 enum tree_code cond_reduc_op_code = ERROR_MARK;
6268 tree scalar_type;
6269 bool is_simple_use;
6270 int i;
6271 int ncopies;
6272 int epilog_copies;
6273 stmt_vec_info prev_stmt_info, prev_phi_info;
6274 bool single_defuse_cycle = false;
6275 stmt_vec_info new_stmt_info = NULL;
6276 int j;
6277 tree ops[3];
6278 enum vect_def_type dts[3];
6279 bool nested_cycle = false, found_nested_cycle_def = false;
6280 bool double_reduc = false;
6281 basic_block def_bb;
6282 struct loop * def_stmt_loop;
6283 tree def_arg;
6284 auto_vec<tree> vec_oprnds0;
6285 auto_vec<tree> vec_oprnds1;
6286 auto_vec<tree> vec_oprnds2;
6287 auto_vec<tree> vect_defs;
6288 auto_vec<stmt_vec_info> phis;
6289 int vec_num;
6290 tree def0, tem;
6291 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6292 tree cond_reduc_val = NULL_TREE;
6293
6294 /* Make sure it was already recognized as a reduction computation. */
6295 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6296 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6297 return false;
6298
6299 if (nested_in_vect_loop_p (loop, stmt_info))
6300 {
6301 loop = loop->inner;
6302 nested_cycle = true;
6303 }
6304
6305 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6306 gcc_assert (slp_node
6307 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6308
6309 if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
6310 {
6311 tree phi_result = gimple_phi_result (phi);
6312 /* Analysis is fully done on the reduction stmt invocation. */
6313 if (! vec_stmt)
6314 {
6315 if (slp_node)
6316 slp_node_instance->reduc_phis = slp_node;
6317
6318 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6319 return true;
6320 }
6321
6322 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6323 /* Leave the scalar phi in place. Note that checking
6324 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6325 for reductions involving a single statement. */
6326 return true;
6327
6328 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
6329 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
6330
6331 if (STMT_VINFO_VEC_REDUCTION_TYPE (reduc_stmt_info)
6332 == EXTRACT_LAST_REDUCTION)
6333 /* Leave the scalar phi in place. */
6334 return true;
6335
6336 gassign *reduc_stmt = as_a <gassign *> (reduc_stmt_info->stmt);
6337 code = gimple_assign_rhs_code (reduc_stmt);
6338 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6339 {
6340 tree op = gimple_op (reduc_stmt, k);
6341 if (op == phi_result)
6342 continue;
6343 if (k == 1 && code == COND_EXPR)
6344 continue;
6345 bool is_simple_use = vect_is_simple_use (op, loop_vinfo, &dt);
6346 gcc_assert (is_simple_use);
6347 if (dt == vect_constant_def || dt == vect_external_def)
6348 continue;
6349 if (!vectype_in
6350 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6351 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6352 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6353 break;
6354 }
6355 /* For a nested cycle we might end up with an operation like
6356 phi_result * phi_result. */
6357 if (!vectype_in)
6358 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
6359 gcc_assert (vectype_in);
6360
6361 if (slp_node)
6362 ncopies = 1;
6363 else
6364 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6365
6366 stmt_vec_info use_stmt_info;
6367 if (ncopies > 1
6368 && STMT_VINFO_RELEVANT (reduc_stmt_info) <= vect_used_only_live
6369 && (use_stmt_info = loop_vinfo->lookup_single_use (phi_result))
6370 && vect_stmt_to_vectorize (use_stmt_info) == reduc_stmt_info)
6371 single_defuse_cycle = true;
6372
6373 /* Create the destination vector */
6374 scalar_dest = gimple_assign_lhs (reduc_stmt);
6375 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6376
6377 if (slp_node)
6378 /* The size vect_schedule_slp_instance computes is off for us. */
6379 vec_num = vect_get_num_vectors
6380 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6381 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6382 vectype_in);
6383 else
6384 vec_num = 1;
6385
6386 /* Generate the reduction PHIs upfront. */
6387 prev_phi_info = NULL;
6388 for (j = 0; j < ncopies; j++)
6389 {
6390 if (j == 0 || !single_defuse_cycle)
6391 {
6392 for (i = 0; i < vec_num; i++)
6393 {
6394 /* Create the reduction-phi that defines the reduction
6395 operand. */
6396 gimple *new_phi = create_phi_node (vec_dest, loop->header);
6397 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6398
6399 if (slp_node)
6400 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
6401 else
6402 {
6403 if (j == 0)
6404 STMT_VINFO_VEC_STMT (stmt_info)
6405 = *vec_stmt = new_phi_info;
6406 else
6407 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
6408 prev_phi_info = new_phi_info;
6409 }
6410 }
6411 }
6412 }
6413
6414 return true;
6415 }
6416
6417 /* 1. Is vectorizable reduction? */
6418 /* Not supportable if the reduction variable is used in the loop, unless
6419 it's a reduction chain. */
6420 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6421 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6422 return false;
6423
6424 /* Reductions that are not used even in an enclosing outer-loop
6425 are expected to be "live" (used out of the loop). */
6426 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6427 && !STMT_VINFO_LIVE_P (stmt_info))
6428 return false;
6429
6430 /* 2. Has this been recognized as a reduction pattern?
6431
6432 Check if STMT represents a pattern that has been recognized
6433 in earlier analysis stages. For stmts that represent a pattern,
6434 the STMT_VINFO_RELATED_STMT field records the last stmt in
6435 the original sequence that constitutes the pattern. */
6436
6437 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6438 if (orig_stmt_info)
6439 {
6440 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6441 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6442 }
6443
6444 /* 3. Check the operands of the operation. The first operands are defined
6445 inside the loop body. The last operand is the reduction variable,
6446 which is defined by the loop-header-phi. */
6447
6448 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6449
6450 /* Flatten RHS. */
6451 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6452 {
6453 case GIMPLE_BINARY_RHS:
6454 code = gimple_assign_rhs_code (stmt);
6455 op_type = TREE_CODE_LENGTH (code);
6456 gcc_assert (op_type == binary_op);
6457 ops[0] = gimple_assign_rhs1 (stmt);
6458 ops[1] = gimple_assign_rhs2 (stmt);
6459 break;
6460
6461 case GIMPLE_TERNARY_RHS:
6462 code = gimple_assign_rhs_code (stmt);
6463 op_type = TREE_CODE_LENGTH (code);
6464 gcc_assert (op_type == ternary_op);
6465 ops[0] = gimple_assign_rhs1 (stmt);
6466 ops[1] = gimple_assign_rhs2 (stmt);
6467 ops[2] = gimple_assign_rhs3 (stmt);
6468 break;
6469
6470 case GIMPLE_UNARY_RHS:
6471 return false;
6472
6473 default:
6474 gcc_unreachable ();
6475 }
6476
6477 if (code == COND_EXPR && slp_node)
6478 return false;
6479
6480 scalar_dest = gimple_assign_lhs (stmt);
6481 scalar_type = TREE_TYPE (scalar_dest);
6482 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6483 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6484 return false;
6485
6486 /* Do not try to vectorize bit-precision reductions. */
6487 if (!type_has_mode_precision_p (scalar_type))
6488 return false;
6489
6490 /* All uses but the last are expected to be defined in the loop.
6491 The last use is the reduction variable. In case of a nested cycle this
6492 assumption is not true: we use reduc_index to record the index of the
6493 reduction variable. */
6494 stmt_vec_info reduc_def_info;
6495 if (orig_stmt_info)
6496 reduc_def_info = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6497 else
6498 reduc_def_info = STMT_VINFO_REDUC_DEF (stmt_info);
6499 gcc_assert (reduc_def_info);
6500 gphi *reduc_def_phi = as_a <gphi *> (reduc_def_info->stmt);
6501 tree reduc_def = PHI_RESULT (reduc_def_phi);
6502 int reduc_index = -1;
6503 for (i = 0; i < op_type; i++)
6504 {
6505 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6506 if (i == 0 && code == COND_EXPR)
6507 continue;
6508
6509 stmt_vec_info def_stmt_info;
6510 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &dts[i], &tem,
6511 &def_stmt_info);
6512 dt = dts[i];
6513 gcc_assert (is_simple_use);
6514 if (dt == vect_reduction_def
6515 && ops[i] == reduc_def)
6516 {
6517 reduc_index = i;
6518 continue;
6519 }
6520 else if (tem)
6521 {
6522 /* To properly compute ncopies we are interested in the widest
6523 input type in case we're looking at a widening accumulation. */
6524 if (!vectype_in
6525 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6526 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6527 vectype_in = tem;
6528 }
6529
6530 if (dt != vect_internal_def
6531 && dt != vect_external_def
6532 && dt != vect_constant_def
6533 && dt != vect_induction_def
6534 && !(dt == vect_nested_cycle && nested_cycle))
6535 return false;
6536
6537 if (dt == vect_nested_cycle
6538 && ops[i] == reduc_def)
6539 {
6540 found_nested_cycle_def = true;
6541 reduc_index = i;
6542 }
6543
6544 if (i == 1 && code == COND_EXPR)
6545 {
6546 /* Record how value of COND_EXPR is defined. */
6547 if (dt == vect_constant_def)
6548 {
6549 cond_reduc_dt = dt;
6550 cond_reduc_val = ops[i];
6551 }
6552 if (dt == vect_induction_def
6553 && def_stmt_info
6554 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6555 {
6556 cond_reduc_dt = dt;
6557 cond_stmt_vinfo = def_stmt_info;
6558 }
6559 }
6560 }
6561
6562 if (!vectype_in)
6563 vectype_in = vectype_out;
6564
6565 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6566 directly used in stmt. */
6567 if (reduc_index == -1)
6568 {
6569 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6570 {
6571 if (dump_enabled_p ())
6572 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6573 "in-order reduction chain without SLP.\n");
6574 return false;
6575 }
6576 }
6577
6578 if (!(reduc_index == -1
6579 || dts[reduc_index] == vect_reduction_def
6580 || dts[reduc_index] == vect_nested_cycle
6581 || ((dts[reduc_index] == vect_internal_def
6582 || dts[reduc_index] == vect_external_def
6583 || dts[reduc_index] == vect_constant_def
6584 || dts[reduc_index] == vect_induction_def)
6585 && nested_cycle && found_nested_cycle_def)))
6586 {
6587 /* For pattern recognized stmts, orig_stmt might be a reduction,
6588 but some helper statements for the pattern might not, or
6589 might be COND_EXPRs with reduction uses in the condition. */
6590 gcc_assert (orig_stmt_info);
6591 return false;
6592 }
6593
6594 /* PHIs should not participate in patterns. */
6595 gcc_assert (!STMT_VINFO_RELATED_STMT (reduc_def_info));
6596 enum vect_reduction_type v_reduc_type
6597 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6598 stmt_vec_info tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6599
6600 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6601 /* If we have a condition reduction, see if we can simplify it further. */
6602 if (v_reduc_type == COND_REDUCTION)
6603 {
6604 /* TODO: We can't yet handle reduction chains, since we need to treat
6605 each COND_EXPR in the chain specially, not just the last one.
6606 E.g. for:
6607
6608 x_1 = PHI <x_3, ...>
6609 x_2 = a_2 ? ... : x_1;
6610 x_3 = a_3 ? ... : x_2;
6611
6612 we're interested in the last element in x_3 for which a_2 || a_3
6613 is true, whereas the current reduction chain handling would
6614 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6615 as a reduction operation. */
6616 if (reduc_index == -1)
6617 {
6618 if (dump_enabled_p ())
6619 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6620 "conditional reduction chains not supported\n");
6621 return false;
6622 }
6623
6624 /* vect_is_simple_reduction ensured that operand 2 is the
6625 loop-carried operand. */
6626 gcc_assert (reduc_index == 2);
6627
6628 	      /* Loop peeling modifies the initial value of the reduction PHI, which
6629 	         makes the reduction stmt that is transformed differ from the
6630 	         original stmt analyzed.  We therefore need to record the reduction
6631 	         code for a CONST_COND_REDUCTION at analysis time, so that it can
6632 	         be used directly at transform time.  */
6633 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6634 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6635 {
6636 /* Also set the reduction type to CONST_COND_REDUCTION. */
6637 gcc_assert (cond_reduc_dt == vect_constant_def);
6638 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6639 }
6640 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6641 vectype_in, OPTIMIZE_FOR_SPEED))
6642 {
6643 if (dump_enabled_p ())
6644 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6645 "optimizing condition reduction with"
6646 " FOLD_EXTRACT_LAST.\n");
6647 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6648 }
6649 else if (cond_reduc_dt == vect_induction_def)
6650 {
6651 tree base
6652 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6653 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6654
6655 gcc_assert (TREE_CODE (base) == INTEGER_CST
6656 && TREE_CODE (step) == INTEGER_CST);
6657 cond_reduc_val = NULL_TREE;
6658 	          /* Find a suitable value: below BASE for MAX_EXPR, above BASE for
6659 	             MIN_EXPR; for now punt if BASE is the minimum value of the type
6660 	             for MAX_EXPR or the maximum value of the type for MIN_EXPR.  */
6661 if (tree_int_cst_sgn (step) == -1)
6662 {
6663 cond_reduc_op_code = MIN_EXPR;
6664 if (tree_int_cst_sgn (base) == -1)
6665 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6666 else if (tree_int_cst_lt (base,
6667 TYPE_MAX_VALUE (TREE_TYPE (base))))
6668 cond_reduc_val
6669 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6670 }
6671 else
6672 {
6673 cond_reduc_op_code = MAX_EXPR;
6674 if (tree_int_cst_sgn (base) == 1)
6675 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6676 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6677 base))
6678 cond_reduc_val
6679 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6680 }
6681 if (cond_reduc_val)
6682 {
6683 if (dump_enabled_p ())
6684 dump_printf_loc (MSG_NOTE, vect_location,
6685 "condition expression based on "
6686 "integer induction.\n");
6687 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6688 = INTEGER_INDUC_COND_REDUCTION;
6689 }
6690 }
6691 else if (cond_reduc_dt == vect_constant_def)
6692 {
6693 enum vect_def_type cond_initial_dt;
6694 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6695 tree cond_initial_val
6696 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6697
6698 gcc_assert (cond_reduc_val != NULL_TREE);
6699 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6700 if (cond_initial_dt == vect_constant_def
6701 && types_compatible_p (TREE_TYPE (cond_initial_val),
6702 TREE_TYPE (cond_reduc_val)))
6703 {
6704 tree e = fold_binary (LE_EXPR, boolean_type_node,
6705 cond_initial_val, cond_reduc_val);
6706 if (e && (integer_onep (e) || integer_zerop (e)))
6707 {
6708 if (dump_enabled_p ())
6709 dump_printf_loc (MSG_NOTE, vect_location,
6710 "condition expression based on "
6711 "compile time constant.\n");
6712 /* Record reduction code at analysis stage. */
6713 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6714 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6715 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6716 = CONST_COND_REDUCTION;
6717 }
6718 }
6719 }
6720 }
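  /* For illustration (an assumed source form, not taken from any particular
     testcase): a COND_REDUCTION typically comes from a loop that keeps the
     last value satisfying a condition, e.g.

	int last = -1;
	for (int i = 0; i < n; i++)
	  if (a[i] < val)
	    last = i;

     Here LAST is the reduction and I is a nonwrapping integer induction,
     which corresponds to the INTEGER_INDUC_COND_REDUCTION case handled
     above; if LAST were assigned a compile-time constant instead, the
     CONST_COND_REDUCTION handling would apply.  */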
6721
6722 if (orig_stmt_info)
6723 gcc_assert (tmp == orig_stmt_info
6724 || REDUC_GROUP_FIRST_ELEMENT (tmp) == orig_stmt_info);
6725 else
6726 /* We changed STMT to be the first stmt in reduction chain, hence we
6727 check that in this case the first element in the chain is STMT. */
6728 gcc_assert (tmp == stmt_info
6729 || REDUC_GROUP_FIRST_ELEMENT (tmp) == stmt_info);
6730
6731 if (STMT_VINFO_LIVE_P (reduc_def_info))
6732 return false;
6733
6734 if (slp_node)
6735 ncopies = 1;
6736 else
6737 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6738
6739 gcc_assert (ncopies >= 1);
6740
6741 vec_mode = TYPE_MODE (vectype_in);
6742 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6743
6744 if (nested_cycle)
6745 {
6746 def_bb = gimple_bb (reduc_def_phi);
6747 def_stmt_loop = def_bb->loop_father;
6748 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6749 loop_preheader_edge (def_stmt_loop));
6750 stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg);
6751 if (def_arg_stmt_info
6752 && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info)
6753 == vect_double_reduction_def))
6754 double_reduc = true;
6755 }
6756
6757 vect_reduction_type reduction_type
6758 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6759 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6760 && ncopies > 1)
6761 {
6762 if (dump_enabled_p ())
6763 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6764 "multiple types in double reduction or condition "
6765 "reduction.\n");
6766 return false;
6767 }
6768
6769 if (code == COND_EXPR)
6770 {
6771 /* Only call during the analysis stage, otherwise we'll lose
6772 STMT_VINFO_TYPE. */
6773 if (!vec_stmt && !vectorizable_condition (stmt_info, gsi, NULL,
6774 true, NULL, cost_vec))
6775 {
6776 if (dump_enabled_p ())
6777 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6778 "unsupported condition in reduction\n");
6779 return false;
6780 }
6781 }
6782 else if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6783 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6784 {
6785 /* Only call during the analysis stage, otherwise we'll lose
6786 STMT_VINFO_TYPE. We only support this for nested cycles
6787 without double reductions at the moment. */
6788 if (!nested_cycle
6789 || double_reduc
6790 || (!vec_stmt && !vectorizable_shift (stmt_info, gsi, NULL,
6791 NULL, cost_vec)))
6792 {
6793 if (dump_enabled_p ())
6794 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6795 "unsupported shift or rotation in reduction\n");
6796 return false;
6797 }
6798 }
6799 else
6800 {
6801 /* 4. Supportable by target? */
6802
6803 /* 4.1. check support for the operation in the loop */
6804 optab = optab_for_tree_code (code, vectype_in, optab_default);
6805 if (!optab)
6806 {
6807 if (dump_enabled_p ())
6808 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6809 "no optab.\n");
6810
6811 return false;
6812 }
6813
6814 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6815 {
6816 if (dump_enabled_p ())
6817 dump_printf (MSG_NOTE, "op not supported by target.\n");
6818
6819 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6820 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6821 return false;
6822
6823 if (dump_enabled_p ())
6824 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6825 }
6826
6827 /* Worthwhile without SIMD support? */
6828 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6829 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6830 {
6831 if (dump_enabled_p ())
6832 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6833 "not worthwhile without SIMD support.\n");
6834
6835 return false;
6836 }
6837 }
6838
6839 /* 4.2. Check support for the epilog operation.
6840
6841 If STMT represents a reduction pattern, then the type of the
6842 reduction variable may be different than the type of the rest
6843 of the arguments. For example, consider the case of accumulation
6844 	     of shorts into an int accumulator.  The original code:
6845 S1: int_a = (int) short_a;
6846 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6847
6848 was replaced with:
6849 STMT: int_acc = widen_sum <short_a, int_acc>
6850
6851 This means that:
6852 1. The tree-code that is used to create the vector operation in the
6853 epilog code (that reduces the partial results) is not the
6854 tree-code of STMT, but is rather the tree-code of the original
6855 	        stmt from the pattern that STMT is replacing.  I.e., in the example
6856 above we want to use 'widen_sum' in the loop, but 'plus' in the
6857 epilog.
6858 2. The type (mode) we use to check available target support
6859 for the vector operation to be created in the *epilog*, is
6860 determined by the type of the reduction variable (in the example
6861 	        above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6862 However the type (mode) we use to check available target support
6863 for the vector operation to be created *inside the loop*, is
6864 determined by the type of the other arguments to STMT (in the
6865 example we'd check this: optab_handler (widen_sum_optab,
6866 vect_short_mode)).
6867
6868 This is contrary to "regular" reductions, in which the types of all
6869 the arguments are the same as the type of the reduction variable.
6870 For "regular" reductions we can therefore use the same vector type
6871 (and also the same tree-code) when generating the epilog code and
6872 when generating the code inside the loop. */
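  /* As a concrete (illustrative) source-level example of such a pattern,
     assume the scalar loop

	short b[N];  int sum = 0;
	for (i = 0; i < N; i++)
	  sum += b[i];

     which the pattern recognizer may turn into a widen_sum: the loop body
     then operates on the narrow (short) elements while the epilog reduces
     the int partial sums with a plain plus.  */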
6873
6874 if (orig_stmt_info
6875 && (reduction_type == TREE_CODE_REDUCTION
6876 || reduction_type == FOLD_LEFT_REDUCTION))
6877 {
6878 /* This is a reduction pattern: get the vectype from the type of the
6879 reduction variable, and get the tree-code from orig_stmt. */
6880 orig_code = gimple_assign_rhs_code (orig_stmt_info->stmt);
6881 gcc_assert (vectype_out);
6882 vec_mode = TYPE_MODE (vectype_out);
6883 }
6884 else
6885 {
6886 	      /* Regular reduction: the same vectype and tree-code used for the
6887 	         vector code inside the loop can also be used for the epilog code.  */
6888 orig_code = code;
6889
6890 if (code == MINUS_EXPR)
6891 orig_code = PLUS_EXPR;
6892
6893 /* For simple condition reductions, replace with the actual expression
6894 we want to base our reduction around. */
6895 if (reduction_type == CONST_COND_REDUCTION)
6896 {
6897 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6898 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6899 }
6900 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6901 orig_code = cond_reduc_op_code;
6902 }
6903
6904 reduc_fn = IFN_LAST;
6905
6906 if (reduction_type == TREE_CODE_REDUCTION
6907 || reduction_type == FOLD_LEFT_REDUCTION
6908 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6909 || reduction_type == CONST_COND_REDUCTION)
6910 {
6911 if (reduction_type == FOLD_LEFT_REDUCTION
6912 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6913 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6914 {
6915 if (reduc_fn != IFN_LAST
6916 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6917 OPTIMIZE_FOR_SPEED))
6918 {
6919 if (dump_enabled_p ())
6920 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6921 "reduc op not supported by target.\n");
6922
6923 reduc_fn = IFN_LAST;
6924 }
6925 }
6926 else
6927 {
6928 if (!nested_cycle || double_reduc)
6929 {
6930 if (dump_enabled_p ())
6931 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6932 "no reduc code for scalar code.\n");
6933
6934 return false;
6935 }
6936 }
6937 }
6938 else if (reduction_type == COND_REDUCTION)
6939 {
6940 int scalar_precision
6941 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6942 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6943 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6944 nunits_out);
6945
6946 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6947 OPTIMIZE_FOR_SPEED))
6948 reduc_fn = IFN_REDUC_MAX;
6949 }
6950
6951 if (reduction_type != EXTRACT_LAST_REDUCTION
6952 && (!nested_cycle || double_reduc)
6953 && reduc_fn == IFN_LAST
6954 && !nunits_out.is_constant ())
6955 {
6956 if (dump_enabled_p ())
6957 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6958 "missing target support for reduction on"
6959 " variable-length vectors.\n");
6960 return false;
6961 }
6962
6963 /* For SLP reductions, see if there is a neutral value we can use. */
6964 tree neutral_op = NULL_TREE;
6965 if (slp_node)
6966 neutral_op = neutral_op_for_slp_reduction
6967 (slp_node_instance->reduc_phis, code,
6968 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6969
6970 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6971 {
6972 /* We can't support in-order reductions of code such as this:
6973
6974 for (int i = 0; i < n1; ++i)
6975 for (int j = 0; j < n2; ++j)
6976 l += a[j];
6977
6978 since GCC effectively transforms the loop when vectorizing:
6979
6980 for (int i = 0; i < n1 / VF; ++i)
6981 for (int j = 0; j < n2; ++j)
6982 for (int k = 0; k < VF; ++k)
6983 l += a[j];
6984
6985 which is a reassociation of the original operation. */
6986 if (dump_enabled_p ())
6987 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6988 "in-order double reduction not supported.\n");
6989
6990 return false;
6991 }
6992
6993 if (reduction_type == FOLD_LEFT_REDUCTION
6994 && slp_node
6995 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6996 {
6997 /* We cannot use in-order reductions in this case because there is
6998 an implicit reassociation of the operations involved. */
6999 if (dump_enabled_p ())
7000 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7001 "in-order unchained SLP reductions not supported.\n");
7002 return false;
7003 }
7004
7005 /* For double reductions, and for SLP reductions with a neutral value,
7006 we construct a variable-length initial vector by loading a vector
7007 full of the neutral value and then shift-and-inserting the start
7008 values into the low-numbered elements. */
7009 if ((double_reduc || neutral_op)
7010 && !nunits_out.is_constant ()
7011 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7012 vectype_out, OPTIMIZE_FOR_SPEED))
7013 {
7014 if (dump_enabled_p ())
7015 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7016 "reduction on variable-length vectors requires"
7017 " target support for a vector-shift-and-insert"
7018 " operation.\n");
7019 return false;
7020 }
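  /* For illustration: with a variable-length vector and a sum reduction with
     start value INIT, the initial vector is built (roughly) as

	tmp      = { 0, 0, ..., 0 }		    neutral-value splat
	init_vec = VEC_SHL_INSERT (tmp, INIT)	    => { INIT, 0, ..., 0 }

     so only the low-numbered lane carries the start value and the rest hold
     the neutral element; this is why IFN_VEC_SHL_INSERT support is required
     above.  */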
7021
7022 /* Check extra constraints for variable-length unchained SLP reductions. */
7023 if (STMT_SLP_TYPE (stmt_info)
7024 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7025 && !nunits_out.is_constant ())
7026 {
7027 /* We checked above that we could build the initial vector when
7028 there's a neutral element value. Check here for the case in
7029 which each SLP statement has its own initial value and in which
7030 that value needs to be repeated for every instance of the
7031 statement within the initial vector. */
7032 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7033 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
7034 if (!neutral_op
7035 && !can_duplicate_and_interleave_p (group_size, elt_mode))
7036 {
7037 if (dump_enabled_p ())
7038 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7039 "unsupported form of SLP reduction for"
7040 " variable-length vectors: cannot build"
7041 " initial vector.\n");
7042 return false;
7043 }
7044 /* The epilogue code relies on the number of elements being a multiple
7045 of the group size. The duplicate-and-interleave approach to setting
7046 	         up the initial vector does too.  */
7047 if (!multiple_p (nunits_out, group_size))
7048 {
7049 if (dump_enabled_p ())
7050 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7051 "unsupported form of SLP reduction for"
7052 " variable-length vectors: the vector size"
7053 " is not a multiple of the number of results.\n");
7054 return false;
7055 }
7056 }
7057
7058 	  /* In case of a widening multiplication by a constant, we update the type
7059 of the constant to be the type of the other operand. We check that the
7060 constant fits the type in the pattern recognition pass. */
7061 if (code == DOT_PROD_EXPR
7062 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
7063 {
7064 if (TREE_CODE (ops[0]) == INTEGER_CST)
7065 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
7066 else if (TREE_CODE (ops[1]) == INTEGER_CST)
7067 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
7068 else
7069 {
7070 if (dump_enabled_p ())
7071 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7072 "invalid types in dot-prod\n");
7073
7074 return false;
7075 }
7076 }
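  /* A hypothetical (assumed) source form for the constant case: with short
     elements accumulated into an int, e.g.

	for (i = 0; i < N; i++)
	  sum += a[i] * 4;

     the constant 4 does not share a type with a[i], so it is converted here
     to the type of the other multiplication operand.  */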
7077
7078 if (reduction_type == COND_REDUCTION)
7079 {
7080 widest_int ni;
7081
7082 if (! max_loop_iterations (loop, &ni))
7083 {
7084 if (dump_enabled_p ())
7085 dump_printf_loc (MSG_NOTE, vect_location,
7086 "loop count not known, cannot create cond "
7087 "reduction.\n");
7088 return false;
7089 }
7090 	      /* Convert the back-edge count to an iteration count.  */
7091 ni += 1;
7092
7093 	      /* The additional index will be the same type as the condition.  Check
7094 	         that the loop iteration count fits into this type less one (index
7095 	         zero is reserved for the case in which there are no matches).  */
7096 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7097 if (wi::geu_p (ni, wi::to_widest (max_index)))
7098 {
7099 if (dump_enabled_p ())
7100 dump_printf_loc (MSG_NOTE, vect_location,
7101 "loop size is greater than data size.\n");
7102 return false;
7103 }
7104 }
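  /* For illustration: if CR_INDEX_SCALAR_TYPE ends up as a 16-bit unsigned
     type, the loop may run at most 65534 iterations (the type's maximum
     value less one), since index zero is reserved for the "no match" case;
     loops with more, or an unknown number of, iterations are rejected
     above.  */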
7105
7106 /* In case the vectorization factor (VF) is bigger than the number
7107 of elements that we can fit in a vectype (nunits), we have to generate
7108 	     more than one vector stmt, i.e. we need to "unroll" the
7109 vector stmt by a factor VF/nunits. For more details see documentation
7110 in vectorizable_operation. */
7111
7112 /* If the reduction is used in an outer loop we need to generate
7113 VF intermediate results, like so (e.g. for ncopies=2):
7114 r0 = phi (init, r0)
7115 r1 = phi (init, r1)
7116 r0 = x0 + r0;
7117 r1 = x1 + r1;
7118 (i.e. we generate VF results in 2 registers).
7119 In this case we have a separate def-use cycle for each copy, and therefore
7120 for each copy we get the vector def for the reduction variable from the
7121 respective phi node created for this copy.
7122
7123 Otherwise (the reduction is unused in the loop nest), we can combine
7124 together intermediate results, like so (e.g. for ncopies=2):
7125 r = phi (init, r)
7126 r = x0 + r;
7127 r = x1 + r;
7128 (i.e. we generate VF/2 results in a single register).
7129 In this case for each copy we get the vector def for the reduction variable
7130 from the vectorized reduction operation generated in the previous iteration.
7131
7132 This only works when we see both the reduction PHI and its only consumer
7133 in vectorizable_reduction and there are no intermediate stmts
7134 participating. */
7135 stmt_vec_info use_stmt_info;
7136 tree reduc_phi_result = gimple_phi_result (reduc_def_phi);
7137 if (ncopies > 1
7138 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7139 && (use_stmt_info = loop_vinfo->lookup_single_use (reduc_phi_result))
7140 && vect_stmt_to_vectorize (use_stmt_info) == stmt_info)
7141 {
7142 single_defuse_cycle = true;
7143 epilog_copies = 1;
7144 }
7145 else
7146 epilog_copies = ncopies;
7147
7148 	  /* If the reduction stmt is one of the patterns that have an embedded
7149 	     lane reduction, we cannot handle the !single_defuse_cycle case.  */
7150 if ((ncopies > 1
7151 && ! single_defuse_cycle)
7152 && (code == DOT_PROD_EXPR
7153 || code == WIDEN_SUM_EXPR
7154 || code == SAD_EXPR))
7155 {
7156 if (dump_enabled_p ())
7157 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7158 "multi def-use cycle not possible for lane-reducing "
7159 "reduction operation\n");
7160 return false;
7161 }
7162
7163 if (slp_node)
7164 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7165 else
7166 vec_num = 1;
7167
7168 internal_fn cond_fn = get_conditional_internal_fn (code);
7169 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7170 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7171
7172 if (!vec_stmt) /* transformation not required. */
7173 {
7174 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
7175 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7176 {
7177 if (reduction_type != FOLD_LEFT_REDUCTION
7178 && !mask_by_cond_expr
7179 && (cond_fn == IFN_LAST
7180 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7181 OPTIMIZE_FOR_SPEED)))
7182 {
7183 if (dump_enabled_p ())
7184 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7185 "can't use a fully-masked loop because no"
7186 " conditional operation is available.\n");
7187 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7188 }
7189 else if (reduc_index == -1)
7190 {
7191 if (dump_enabled_p ())
7192 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7193 "can't use a fully-masked loop for chained"
7194 " reductions.\n");
7195 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7196 }
7197 else
7198 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7199 vectype_in);
7200 }
7201 if (dump_enabled_p ()
7202 && reduction_type == FOLD_LEFT_REDUCTION)
7203 dump_printf_loc (MSG_NOTE, vect_location,
7204 "using an in-order (fold-left) reduction.\n");
7205 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7206 return true;
7207 }
7208
7209 /* Transform. */
7210
7211 if (dump_enabled_p ())
7212 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7213
7214 /* FORNOW: Multiple types are not supported for condition. */
7215 if (code == COND_EXPR)
7216 gcc_assert (ncopies == 1);
7217
7218 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7219
7220 if (reduction_type == FOLD_LEFT_REDUCTION)
7221 return vectorize_fold_left_reduction
7222 (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7223 reduc_fn, ops, vectype_in, reduc_index, masks);
7224
7225 if (reduction_type == EXTRACT_LAST_REDUCTION)
7226 {
7227 gcc_assert (!slp_node);
7228 return vectorizable_condition (stmt_info, gsi, vec_stmt,
7229 true, NULL, NULL);
7230 }
7231
7232 /* Create the destination vector */
7233 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7234
7235 prev_stmt_info = NULL;
7236 prev_phi_info = NULL;
7237 if (!slp_node)
7238 {
7239 vec_oprnds0.create (1);
7240 vec_oprnds1.create (1);
7241 if (op_type == ternary_op)
7242 vec_oprnds2.create (1);
7243 }
7244
7245 phis.create (vec_num);
7246 vect_defs.create (vec_num);
7247 if (!slp_node)
7248 vect_defs.quick_push (NULL_TREE);
7249
7250 if (slp_node)
7251 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7252 else
7253 phis.quick_push (STMT_VINFO_VEC_STMT (reduc_def_info));
7254
7255 for (j = 0; j < ncopies; j++)
7256 {
7257 if (code == COND_EXPR)
7258 {
7259 gcc_assert (!slp_node);
7260 vectorizable_condition (stmt_info, gsi, vec_stmt,
7261 true, NULL, NULL);
7262 break;
7263 }
7264 if (code == LSHIFT_EXPR
7265 || code == RSHIFT_EXPR)
7266 {
7267 vectorizable_shift (stmt_info, gsi, vec_stmt, slp_node, NULL);
7268 break;
7269 }
7270
7271 /* Handle uses. */
7272 if (j == 0)
7273 {
7274 if (slp_node)
7275 {
7276 /* Get vec defs for all the operands except the reduction index,
7277 ensuring the ordering of the ops in the vector is kept. */
7278 auto_vec<tree, 3> slp_ops;
7279 auto_vec<vec<tree>, 3> vec_defs;
7280
7281 slp_ops.quick_push (ops[0]);
7282 slp_ops.quick_push (ops[1]);
7283 if (op_type == ternary_op)
7284 slp_ops.quick_push (ops[2]);
7285
7286 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7287
7288 vec_oprnds0.safe_splice (vec_defs[0]);
7289 vec_defs[0].release ();
7290 vec_oprnds1.safe_splice (vec_defs[1]);
7291 vec_defs[1].release ();
7292 if (op_type == ternary_op)
7293 {
7294 vec_oprnds2.safe_splice (vec_defs[2]);
7295 vec_defs[2].release ();
7296 }
7297 }
7298 else
7299 {
7300 vec_oprnds0.quick_push
7301 (vect_get_vec_def_for_operand (ops[0], stmt_info));
7302 vec_oprnds1.quick_push
7303 (vect_get_vec_def_for_operand (ops[1], stmt_info));
7304 if (op_type == ternary_op)
7305 vec_oprnds2.quick_push
7306 (vect_get_vec_def_for_operand (ops[2], stmt_info));
7307 }
7308 }
7309 else
7310 {
7311 if (!slp_node)
7312 {
7313 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7314
7315 if (single_defuse_cycle && reduc_index == 0)
7316 vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
7317 else
7318 vec_oprnds0[0]
7319 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7320 vec_oprnds0[0]);
7321 if (single_defuse_cycle && reduc_index == 1)
7322 vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
7323 else
7324 vec_oprnds1[0]
7325 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7326 vec_oprnds1[0]);
7327 if (op_type == ternary_op)
7328 {
7329 if (single_defuse_cycle && reduc_index == 2)
7330 vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
7331 else
7332 vec_oprnds2[0]
7333 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7334 vec_oprnds2[0]);
7335 }
7336 }
7337 }
7338
7339 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7340 {
7341 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7342 if (masked_loop_p && !mask_by_cond_expr)
7343 {
7344 /* Make sure that the reduction accumulator is vop[0]. */
7345 if (reduc_index == 1)
7346 {
7347 gcc_assert (commutative_tree_code (code));
7348 std::swap (vop[0], vop[1]);
7349 }
7350 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7351 vectype_in, i * ncopies + j);
7352 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7353 vop[0], vop[1],
7354 vop[0]);
7355 new_temp = make_ssa_name (vec_dest, call);
7356 gimple_call_set_lhs (call, new_temp);
7357 gimple_call_set_nothrow (call, true);
7358 new_stmt_info
7359 = vect_finish_stmt_generation (stmt_info, call, gsi);
7360 }
7361 else
7362 {
7363 if (op_type == ternary_op)
7364 vop[2] = vec_oprnds2[i];
7365
7366 if (masked_loop_p && mask_by_cond_expr)
7367 {
7368 tree mask = vect_get_loop_mask (gsi, masks,
7369 vec_num * ncopies,
7370 vectype_in, i * ncopies + j);
7371 build_vect_cond_expr (code, vop, mask, gsi);
7372 }
7373
7374 gassign *new_stmt = gimple_build_assign (vec_dest, code,
7375 vop[0], vop[1], vop[2]);
7376 new_temp = make_ssa_name (vec_dest, new_stmt);
7377 gimple_assign_set_lhs (new_stmt, new_temp);
7378 new_stmt_info
7379 = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
7380 }
7381
7382 if (slp_node)
7383 {
7384 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
7385 vect_defs.quick_push (new_temp);
7386 }
7387 else
7388 vect_defs[0] = new_temp;
7389 }
7390
7391 if (slp_node)
7392 continue;
7393
7394 if (j == 0)
7395 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7396 else
7397 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
7398
7399 prev_stmt_info = new_stmt_info;
7400 }
7401
7402 /* Finalize the reduction-phi (set its arguments) and create the
7403 epilog reduction code. */
7404 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7405 vect_defs[0] = gimple_get_lhs ((*vec_stmt)->stmt);
7406
7407 vect_create_epilog_for_reduction (vect_defs, stmt_info, reduc_def_phi,
7408 epilog_copies, reduc_fn, phis,
7409 double_reduc, slp_node, slp_node_instance,
7410 cond_reduc_val, cond_reduc_op_code,
7411 neutral_op);
7412
7413 return true;
7414 }
7415
7416 /* Function vect_min_worthwhile_factor.
7417
7418 For a loop where we could vectorize the operation indicated by CODE,
7419 return the minimum vectorization factor that makes it worthwhile
7420 to use generic vectors. */
7421 static unsigned int
7422 vect_min_worthwhile_factor (enum tree_code code)
7423 {
7424 switch (code)
7425 {
7426 case PLUS_EXPR:
7427 case MINUS_EXPR:
7428 case NEGATE_EXPR:
7429 return 4;
7430
7431 case BIT_AND_EXPR:
7432 case BIT_IOR_EXPR:
7433 case BIT_XOR_EXPR:
7434 case BIT_NOT_EXPR:
7435 return 2;
7436
7437 default:
7438 return INT_MAX;
7439 }
7440 }
7441
7442 /* Return true if VINFO indicates we are doing loop vectorization and if
7443 it is worth decomposing CODE operations into scalar operations for
7444 that loop's vectorization factor. */
7445
7446 bool
7447 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7448 {
7449 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7450 unsigned HOST_WIDE_INT value;
7451 return (loop_vinfo
7452 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7453 && value >= vect_min_worthwhile_factor (code));
7454 }
7455
7456 /* Function vectorizable_induction
7457
7458 Check if STMT_INFO performs an induction computation that can be vectorized.
7459 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7460 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7461 Return true if STMT_INFO is vectorizable in this way. */
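/* As an illustrative example (an assumed source form), the variable J in

	for (i = 0; i < n; i++)
	  {
	    a[i] = j;
	    j += s;
	  }

   is an induction with step S; for VF = 4 it is vectorized using a vector
   IV [J, J+S, J+2*S, J+3*S] that is bumped by [4*S, 4*S, 4*S, 4*S] on each
   vector iteration, as described in the transform code below.  */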
7462
7463 bool
7464 vectorizable_induction (stmt_vec_info stmt_info,
7465 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7466 stmt_vec_info *vec_stmt, slp_tree slp_node,
7467 stmt_vector_for_cost *cost_vec)
7468 {
7469 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7470 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7471 unsigned ncopies;
7472 bool nested_in_vect_loop = false;
7473 struct loop *iv_loop;
7474 tree vec_def;
7475 edge pe = loop_preheader_edge (loop);
7476 basic_block new_bb;
7477 tree new_vec, vec_init, vec_step, t;
7478 tree new_name;
7479 gimple *new_stmt;
7480 gphi *induction_phi;
7481 tree induc_def, vec_dest;
7482 tree init_expr, step_expr;
7483 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7484 unsigned i;
7485 tree expr;
7486 gimple_seq stmts;
7487 imm_use_iterator imm_iter;
7488 use_operand_p use_p;
7489 gimple *exit_phi;
7490 edge latch_e;
7491 tree loop_arg;
7492 gimple_stmt_iterator si;
7493
7494 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7495 if (!phi)
7496 return false;
7497
7498 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7499 return false;
7500
7501 /* Make sure it was recognized as induction computation. */
7502 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7503 return false;
7504
7505 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7506 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7507
7508 if (slp_node)
7509 ncopies = 1;
7510 else
7511 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7512 gcc_assert (ncopies >= 1);
7513
7514 /* FORNOW. These restrictions should be relaxed. */
7515 if (nested_in_vect_loop_p (loop, stmt_info))
7516 {
7517 imm_use_iterator imm_iter;
7518 use_operand_p use_p;
7519 gimple *exit_phi;
7520 edge latch_e;
7521 tree loop_arg;
7522
7523 if (ncopies > 1)
7524 {
7525 if (dump_enabled_p ())
7526 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7527 "multiple types in nested loop.\n");
7528 return false;
7529 }
7530
7531 /* FORNOW: outer loop induction with SLP not supported. */
7532 if (STMT_SLP_TYPE (stmt_info))
7533 return false;
7534
7535 exit_phi = NULL;
7536 latch_e = loop_latch_edge (loop->inner);
7537 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7538 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7539 {
7540 gimple *use_stmt = USE_STMT (use_p);
7541 if (is_gimple_debug (use_stmt))
7542 continue;
7543
7544 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7545 {
7546 exit_phi = use_stmt;
7547 break;
7548 }
7549 }
7550 if (exit_phi)
7551 {
7552 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7553 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7554 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7555 {
7556 if (dump_enabled_p ())
7557 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7558 "inner-loop induction only used outside "
7559 "of the outer vectorized loop.\n");
7560 return false;
7561 }
7562 }
7563
7564 nested_in_vect_loop = true;
7565 iv_loop = loop->inner;
7566 }
7567 else
7568 iv_loop = loop;
7569 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7570
7571 if (slp_node && !nunits.is_constant ())
7572 {
7573 /* The current SLP code creates the initial value element-by-element. */
7574 if (dump_enabled_p ())
7575 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7576 "SLP induction not supported for variable-length"
7577 " vectors.\n");
7578 return false;
7579 }
7580
7581 if (!vec_stmt) /* transformation not required. */
7582 {
7583 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7584 DUMP_VECT_SCOPE ("vectorizable_induction");
7585 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7586 return true;
7587 }
7588
7589 /* Transform. */
7590
7591 /* Compute a vector variable, initialized with the first VF values of
7592 the induction variable. E.g., for an iv with IV_PHI='X' and
7593 evolution S, for a vector of 4 units, we want to compute:
7594 [X, X + S, X + 2*S, X + 3*S]. */
7595
7596 if (dump_enabled_p ())
7597 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7598
7599 latch_e = loop_latch_edge (iv_loop);
7600 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7601
7602 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7603 gcc_assert (step_expr != NULL_TREE);
7604
7605 pe = loop_preheader_edge (iv_loop);
7606 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7607 loop_preheader_edge (iv_loop));
7608
7609 stmts = NULL;
7610 if (!nested_in_vect_loop)
7611 {
7612 /* Convert the initial value to the desired type. */
7613 tree new_type = TREE_TYPE (vectype);
7614 init_expr = gimple_convert (&stmts, new_type, init_expr);
7615
7616 /* If we are using the loop mask to "peel" for alignment then we need
7617 to adjust the start value here. */
7618 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7619 if (skip_niters != NULL_TREE)
7620 {
7621 if (FLOAT_TYPE_P (vectype))
7622 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7623 skip_niters);
7624 else
7625 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7626 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7627 skip_niters, step_expr);
7628 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7629 init_expr, skip_step);
7630 }
7631 }
7632
7633 /* Convert the step to the desired type. */
7634 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7635
7636 if (stmts)
7637 {
7638 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7639 gcc_assert (!new_bb);
7640 }
7641
7642 /* Find the first insertion point in the BB. */
7643 basic_block bb = gimple_bb (phi);
7644 si = gsi_after_labels (bb);
7645
7646 /* For SLP induction we have to generate several IVs as for example
7647 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7648 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7649 [VF*S, VF*S, VF*S, VF*S] for all. */
7650 if (slp_node)
7651 {
7652 /* Enforced above. */
7653 unsigned int const_nunits = nunits.to_constant ();
7654
7655 /* Generate [VF*S, VF*S, ... ]. */
7656 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7657 {
7658 expr = build_int_cst (integer_type_node, vf);
7659 expr = fold_convert (TREE_TYPE (step_expr), expr);
7660 }
7661 else
7662 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7663 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7664 expr, step_expr);
7665 if (! CONSTANT_CLASS_P (new_name))
7666 new_name = vect_init_vector (stmt_info, new_name,
7667 TREE_TYPE (step_expr), NULL);
7668 new_vec = build_vector_from_val (vectype, new_name);
7669 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7670
7671 /* Now generate the IVs. */
7672 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7673 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7674 unsigned elts = const_nunits * nvects;
7675 unsigned nivs = least_common_multiple (group_size,
7676 const_nunits) / const_nunits;
7677 gcc_assert (elts % group_size == 0);
7678 tree elt = init_expr;
7679 unsigned ivn;
7680 for (ivn = 0; ivn < nivs; ++ivn)
7681 {
7682 tree_vector_builder elts (vectype, const_nunits, 1);
7683 stmts = NULL;
7684 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7685 {
7686 if (ivn*const_nunits + eltn >= group_size
7687 && (ivn * const_nunits + eltn) % group_size == 0)
7688 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7689 elt, step_expr);
7690 elts.quick_push (elt);
7691 }
7692 vec_init = gimple_build_vector (&stmts, &elts);
7693 if (stmts)
7694 {
7695 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7696 gcc_assert (!new_bb);
7697 }
7698
7699 /* Create the induction-phi that defines the induction-operand. */
7700 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7701 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7702 stmt_vec_info induction_phi_info
7703 = loop_vinfo->add_stmt (induction_phi);
7704 induc_def = PHI_RESULT (induction_phi);
7705
7706 /* Create the iv update inside the loop */
7707 vec_def = make_ssa_name (vec_dest);
7708 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7709 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7710 loop_vinfo->add_stmt (new_stmt);
7711
7712 /* Set the arguments of the phi node: */
7713 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7714 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7715 UNKNOWN_LOCATION);
7716
7717 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7718 }
7719
7720 /* Re-use IVs when we can. */
7721 if (ivn < nvects)
7722 {
7723 unsigned vfp
7724 = least_common_multiple (group_size, const_nunits) / group_size;
7725 /* Generate [VF'*S, VF'*S, ... ]. */
7726 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7727 {
7728 expr = build_int_cst (integer_type_node, vfp);
7729 expr = fold_convert (TREE_TYPE (step_expr), expr);
7730 }
7731 else
7732 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7733 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7734 expr, step_expr);
7735 if (! CONSTANT_CLASS_P (new_name))
7736 new_name = vect_init_vector (stmt_info, new_name,
7737 TREE_TYPE (step_expr), NULL);
7738 new_vec = build_vector_from_val (vectype, new_name);
7739 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7740 for (; ivn < nvects; ++ivn)
7741 {
7742 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7743 tree def;
7744 if (gimple_code (iv) == GIMPLE_PHI)
7745 def = gimple_phi_result (iv);
7746 else
7747 def = gimple_assign_lhs (iv);
7748 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7749 PLUS_EXPR,
7750 def, vec_step);
7751 if (gimple_code (iv) == GIMPLE_PHI)
7752 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7753 else
7754 {
7755 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7756 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7757 }
7758 SLP_TREE_VEC_STMTS (slp_node).quick_push
7759 (loop_vinfo->add_stmt (new_stmt));
7760 }
7761 }
7762
7763 return true;
7764 }
7765
7766 /* Create the vector that holds the initial_value of the induction. */
7767 if (nested_in_vect_loop)
7768 {
7769 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7770 been created during vectorization of previous stmts. We obtain it
7771 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7772 vec_init = vect_get_vec_def_for_operand (init_expr, stmt_info);
7773 /* If the initial value is not of proper type, convert it. */
7774 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7775 {
7776 new_stmt
7777 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7778 vect_simple_var,
7779 "vec_iv_"),
7780 VIEW_CONVERT_EXPR,
7781 build1 (VIEW_CONVERT_EXPR, vectype,
7782 vec_init));
7783 vec_init = gimple_assign_lhs (new_stmt);
7784 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7785 new_stmt);
7786 gcc_assert (!new_bb);
7787 loop_vinfo->add_stmt (new_stmt);
7788 }
7789 }
7790 else
7791 {
7792 /* iv_loop is the loop to be vectorized. Create:
7793 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7794 stmts = NULL;
7795 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7796
7797 unsigned HOST_WIDE_INT const_nunits;
7798 if (nunits.is_constant (&const_nunits))
7799 {
7800 tree_vector_builder elts (vectype, const_nunits, 1);
7801 elts.quick_push (new_name);
7802 for (i = 1; i < const_nunits; i++)
7803 {
7804 /* Create: new_name_i = new_name + step_expr */
7805 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7806 new_name, step_expr);
7807 elts.quick_push (new_name);
7808 }
7809 /* Create a vector from [new_name_0, new_name_1, ...,
7810 new_name_nunits-1] */
7811 vec_init = gimple_build_vector (&stmts, &elts);
7812 }
7813 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7814 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7815 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7816 new_name, step_expr);
7817 else
7818 {
7819 /* Build:
7820 [base, base, base, ...]
7821 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7822 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7823 gcc_assert (flag_associative_math);
7824 tree index = build_index_vector (vectype, 0, 1);
7825 tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7826 new_name);
7827 tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7828 step_expr);
7829 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7830 vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7831 vec_init, step_vec);
7832 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7833 vec_init, base_vec);
7834 }
7835
7836 if (stmts)
7837 {
7838 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7839 gcc_assert (!new_bb);
7840 }
7841 }
7842
7843
7844 /* Create the vector that holds the step of the induction. */
7845 if (nested_in_vect_loop)
7846 /* iv_loop is nested in the loop to be vectorized. Generate:
7847 vec_step = [S, S, S, S] */
7848 new_name = step_expr;
7849 else
7850 {
7851 /* iv_loop is the loop to be vectorized. Generate:
7852 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7853 gimple_seq seq = NULL;
7854 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7855 {
7856 expr = build_int_cst (integer_type_node, vf);
7857 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7858 }
7859 else
7860 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7861 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7862 expr, step_expr);
7863 if (seq)
7864 {
7865 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7866 gcc_assert (!new_bb);
7867 }
7868 }
7869
7870 t = unshare_expr (new_name);
7871 gcc_assert (CONSTANT_CLASS_P (new_name)
7872 || TREE_CODE (new_name) == SSA_NAME);
7873 new_vec = build_vector_from_val (vectype, t);
7874 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7875
7876
7877 /* Create the following def-use cycle:
7878 loop prolog:
7879 vec_init = ...
7880 vec_step = ...
7881 loop:
7882 vec_iv = PHI <vec_init, vec_loop>
7883 ...
7884 STMT
7885 ...
7886 vec_loop = vec_iv + vec_step; */
7887
7888 /* Create the induction-phi that defines the induction-operand. */
7889 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7890 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7891 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7892 induc_def = PHI_RESULT (induction_phi);
7893
7894 /* Create the iv update inside the loop */
7895 vec_def = make_ssa_name (vec_dest);
7896 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7897 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7898 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7899
7900 /* Set the arguments of the phi node: */
7901 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7902 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7903 UNKNOWN_LOCATION);
7904
7905 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7906
7907 	  /* In case the vectorization factor (VF) is bigger than the number
7908 	     of elements that we can fit in a vectype (nunits), we have to generate
7909 	     more than one vector stmt, i.e. we need to "unroll" the
7910 vector stmt by a factor VF/nunits. For more details see documentation
7911 in vectorizable_operation. */
7912
7913 if (ncopies > 1)
7914 {
7915 gimple_seq seq = NULL;
7916 stmt_vec_info prev_stmt_vinfo;
7917 /* FORNOW. This restriction should be relaxed. */
7918 gcc_assert (!nested_in_vect_loop);
7919
7920 /* Create the vector that holds the step of the induction. */
7921 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7922 {
7923 expr = build_int_cst (integer_type_node, nunits);
7924 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7925 }
7926 else
7927 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7928 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7929 expr, step_expr);
7930 if (seq)
7931 {
7932 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7933 gcc_assert (!new_bb);
7934 }
7935
7936 t = unshare_expr (new_name);
7937 gcc_assert (CONSTANT_CLASS_P (new_name)
7938 || TREE_CODE (new_name) == SSA_NAME);
7939 new_vec = build_vector_from_val (vectype, t);
7940 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7941
7942 vec_def = induc_def;
7943 prev_stmt_vinfo = induction_phi_info;
7944 for (i = 1; i < ncopies; i++)
7945 {
7946 /* vec_i = vec_prev + vec_step */
7947 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7948 vec_def, vec_step);
7949 vec_def = make_ssa_name (vec_dest, new_stmt);
7950 gimple_assign_set_lhs (new_stmt, vec_def);
7951
7952 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7953 new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7954 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7955 prev_stmt_vinfo = new_stmt_info;
7956 }
7957 }
7958
7959 if (nested_in_vect_loop)
7960 {
7961 /* Find the loop-closed exit-phi of the induction, and record
7962 the final vector of induction results: */
7963 exit_phi = NULL;
7964 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7965 {
7966 gimple *use_stmt = USE_STMT (use_p);
7967 if (is_gimple_debug (use_stmt))
7968 continue;
7969
7970 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7971 {
7972 exit_phi = use_stmt;
7973 break;
7974 }
7975 }
7976 if (exit_phi)
7977 {
7978 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7979 	      /* FORNOW.  We do not yet support an inner-loop induction that is used
7980 	         only outside the outer loop (i.e. not inside the outer loop itself).  */
7981 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7982 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7983
7984 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7985 if (dump_enabled_p ())
7986 dump_printf_loc (MSG_NOTE, vect_location,
7987 "vector of inductions after inner-loop:%G",
7988 new_stmt);
7989 }
7990 }
7991
7992
7993 if (dump_enabled_p ())
7994 dump_printf_loc (MSG_NOTE, vect_location,
7995 "transform induction: created def-use cycle: %G%G",
7996 induction_phi, SSA_NAME_DEF_STMT (vec_def));
7997
7998 return true;
7999 }
8000
8001 /* Function vectorizable_live_operation.
8002
8003 STMT_INFO computes a value that is used outside the loop. Check if
8004 it can be supported. */
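/* For illustration (an assumed source form): in

	for (i = 0; i < n; i++)
	  last = a[i];
	use (last);

   LAST is computed inside the loop but used after it, so it is a "live"
   operation; the vectorized loop must extract the final scalar value from
   the last vector of results.  */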
8005
8006 bool
8007 vectorizable_live_operation (stmt_vec_info stmt_info,
8008 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
8009 slp_tree slp_node, int slp_index,
8010 stmt_vec_info *vec_stmt,
8011 stmt_vector_for_cost *)
8012 {
8013 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
8014 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8015 imm_use_iterator imm_iter;
8016 tree lhs, lhs_type, bitsize, vec_bitsize;
8017 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8018 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8019 int ncopies;
8020 gimple *use_stmt;
8021 auto_vec<tree> vec_oprnds;
8022 int vec_entry = 0;
8023 poly_uint64 vec_index = 0;
8024
8025 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8026
8027 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
8028 return false;
8029
8030 /* FORNOW. CHECKME. */
8031 if (nested_in_vect_loop_p (loop, stmt_info))
8032 return false;
8033
8034 /* If STMT is not relevant and it is a simple assignment and its inputs are
8035 invariant then it can remain in place, unvectorized. The original last
8036 scalar value that it computes will be used. */
8037 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8038 {
8039 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
8040 if (dump_enabled_p ())
8041 dump_printf_loc (MSG_NOTE, vect_location,
8042 "statement is simple and uses invariant. Leaving in "
8043 "place.\n");
8044 return true;
8045 }
8046
8047 if (slp_node)
8048 ncopies = 1;
8049 else
8050 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8051
8052 if (slp_node)
8053 {
8054 gcc_assert (slp_index >= 0);
8055
8056 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
8057 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8058
8059 	      /* Get the last occurrence of the scalar index from the concatenation of
8060 	         all the SLP vectors.  Calculate which SLP vector it is and the index
8061 	         within it.  */
8062 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
8063
8064 /* Calculate which vector contains the result, and which lane of
8065 that vector we need. */
8066 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8067 {
8068 if (dump_enabled_p ())
8069 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8070 "Cannot determine which vector holds the"
8071 " final result.\n");
8072 return false;
8073 }
8074 }
8075
8076 if (!vec_stmt)
8077 {
8078 /* No transformation required. */
8079 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
8080 {
8081 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8082 OPTIMIZE_FOR_SPEED))
8083 {
8084 if (dump_enabled_p ())
8085 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8086 "can't use a fully-masked loop because "
8087 "the target doesn't support extract last "
8088 "reduction.\n");
8089 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8090 }
8091 else if (slp_node)
8092 {
8093 if (dump_enabled_p ())
8094 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8095 "can't use a fully-masked loop because an "
8096 "SLP statement is live after the loop.\n");
8097 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8098 }
8099 else if (ncopies > 1)
8100 {
8101 if (dump_enabled_p ())
8102 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8103 "can't use a fully-masked loop because"
8104 " ncopies is greater than 1.\n");
8105 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8106 }
8107 else
8108 {
8109 gcc_assert (ncopies == 1 && !slp_node);
8110 vect_record_loop_mask (loop_vinfo,
8111 &LOOP_VINFO_MASKS (loop_vinfo),
8112 1, vectype);
8113 }
8114 }
8115 return true;
8116 }
8117
8118 /* Use the lhs of the original scalar statement. */
8119 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8120
8121 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
8122 : gimple_get_lhs (stmt);
8123 lhs_type = TREE_TYPE (lhs);
8124
8125 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
8126 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
8127 : TYPE_SIZE (TREE_TYPE (vectype)));
8128 vec_bitsize = TYPE_SIZE (vectype);
8129
8130 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8131 tree vec_lhs, bitstart;
8132 if (slp_node)
8133 {
8134 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8135
8136 /* Get the correct slp vectorized stmt. */
8137 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
8138 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
8139 vec_lhs = gimple_phi_result (phi);
8140 else
8141 vec_lhs = gimple_get_lhs (vec_stmt);
8142
8143 /* Get entry to use. */
8144 bitstart = bitsize_int (vec_index);
8145 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8146 }
8147 else
8148 {
8149 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
8150 vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
8151 gcc_checking_assert (ncopies == 1
8152 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8153
8154 /* For multiple copies, get the last copy. */
8155 for (int i = 1; i < ncopies; ++i)
8156 vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
8157
8158 /* Get the last lane in the vector. */
8159 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
8160 }
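  /* For illustration: with a V4SI result vector, VEC_BITSIZE is 128 and
     BITSIZE is 32, so the non-SLP case above selects BITSTART = 96, i.e.
     the last (highest-numbered) lane of the final copy.  */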
8161
8162 gimple_seq stmts = NULL;
8163 tree new_tree;
8164 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8165 {
8166 /* Emit:
8167
8168 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8169
8170 where VEC_LHS is the vectorized live-out result and MASK is
8171 the loop mask for the final iteration. */
8172 gcc_assert (ncopies == 1 && !slp_node);
8173 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8174 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8175 1, vectype, 0);
8176 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
8177 scalar_type, mask, vec_lhs);
8178
8179 /* Convert the extracted vector element to the required scalar type. */
8180 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8181 }
8182 else
8183 {
8184 tree bftype = TREE_TYPE (vectype);
8185 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8186 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8187 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
8188 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8189 &stmts, true, NULL_TREE);
8190 }
8191
8192 if (stmts)
8193 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
8194
8195 	  /* Replace the use of LHS with the newly computed result.  If the use stmt is a
8196 	     single-argument PHI, just replace all uses of the PHI result.  This is necessary
8197 	     because the LC SSA PHI defining LHS may come before the newly inserted stmt.  */
8198 use_operand_p use_p;
8199 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8200 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8201 && !is_gimple_debug (use_stmt))
8202 {
8203 if (gimple_code (use_stmt) == GIMPLE_PHI
8204 && gimple_phi_num_args (use_stmt) == 1)
8205 {
8206 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8207 }
8208 else
8209 {
8210 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8211 SET_USE (use_p, new_tree);
8212 }
8213 update_stmt (use_stmt);
8214 }
8215
8216 return true;
8217 }
8218
8219 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8220
8221 static void
8222 vect_loop_kill_debug_uses (struct loop *loop, stmt_vec_info stmt_info)
8223 {
8224 ssa_op_iter op_iter;
8225 imm_use_iterator imm_iter;
8226 def_operand_p def_p;
8227 gimple *ustmt;
8228
8229 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8230 {
8231 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8232 {
8233 basic_block bb;
8234
8235 if (!is_gimple_debug (ustmt))
8236 continue;
8237
8238 bb = gimple_bb (ustmt);
8239
8240 if (!flow_bb_inside_loop_p (loop, bb))
8241 {
8242 if (gimple_debug_bind_p (ustmt))
8243 {
8244 if (dump_enabled_p ())
8245 dump_printf_loc (MSG_NOTE, vect_location,
8246 "killing debug use\n");
8247
8248 gimple_debug_bind_reset_value (ustmt);
8249 update_stmt (ustmt);
8250 }
8251 else
8252 gcc_unreachable ();
8253 }
8254 }
8255 }
8256 }
8257
8258 /* Given loop represented by LOOP_VINFO, return true if computation of
8259 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8260 otherwise. */
8261
8262 static bool
8263 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8264 {
8265 /* Constant case. */
8266 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8267 {
8268 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8269 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8270
8271 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8272 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8273 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8274 return true;
8275 }
8276
8277 widest_int max;
8278 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8279 /* Check the upper bound of loop niters. */
8280 if (get_max_loop_iterations (loop, &max))
8281 {
8282 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8283 signop sgn = TYPE_SIGN (type);
8284 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8285 if (max < type_max)
8286 return true;
8287 }
8288 return false;
8289 }
8290
8291 /* Return a mask type with half the number of elements as TYPE. */
8292
8293 tree
8294 vect_halve_mask_nunits (tree type)
8295 {
8296 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8297 return build_truth_vector_type (nunits, current_vector_size);
8298 }
8299
8300 /* Return a mask type with twice as many elements as TYPE. */
8301
8302 tree
8303 vect_double_mask_nunits (tree type)
8304 {
8305 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8306 return build_truth_vector_type (nunits, current_vector_size);
8307 }
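/* Purely as an illustration of the two helpers above: with a
   current_vector_size of 16 bytes, halving an 8-element mask type (as used
   for V8HI data) gives a 4-element mask type (for V4SI data), while
   doubling it gives a 16-element mask type (for V16QI data). */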
8308
8309 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8310 contain a sequence of NVECTORS masks that each control a vector of type
8311 VECTYPE. */
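/* A hypothetical example: with a vectorization factor of 8 and an rgroup
   needing NVECTORS = 2 masks for a 4-element VECTYPE, the number of scalars
   controlled per iteration is 2 * 4 / 8 = 1, so (*MASKS)[1] records
   max_nscalars_per_iter = 1 and a 4-element mask type, unless another
   statement has already recorded a larger value for that rgroup. */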
8312
8313 void
8314 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8315 unsigned int nvectors, tree vectype)
8316 {
8317 gcc_assert (nvectors != 0);
8318 if (masks->length () < nvectors)
8319 masks->safe_grow_cleared (nvectors);
8320 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8321 /* The number of scalars per iteration and the number of vectors are
8322 both compile-time constants. */
8323 unsigned int nscalars_per_iter
8324 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8325 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8326 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8327 {
8328 rgm->max_nscalars_per_iter = nscalars_per_iter;
8329 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8330 }
8331 }
8332
8333 /* Given a complete set of masks MASKS, extract mask number INDEX
8334 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8335 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8336
8337 See the comment above vec_loop_masks for more details about the mask
8338 arrangement. */
8339
8340 tree
8341 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8342 unsigned int nvectors, tree vectype, unsigned int index)
8343 {
8344 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8345 tree mask_type = rgm->mask_type;
8346
8347 /* Populate the rgroup's mask array, if this is the first time we've
8348 used it. */
8349 if (rgm->masks.is_empty ())
8350 {
8351 rgm->masks.safe_grow_cleared (nvectors);
8352 for (unsigned int i = 0; i < nvectors; ++i)
8353 {
8354 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8355 /* Provide a dummy definition until the real one is available. */
8356 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8357 rgm->masks[i] = mask;
8358 }
8359 }
8360
8361 tree mask = rgm->masks[index];
8362 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8363 TYPE_VECTOR_SUBPARTS (vectype)))
8364 {
8365 /* A loop mask for data type X can be reused for data type Y
8366 if X has N times more elements than Y and if Y's elements
8367 are N times bigger than X's. In this case each sequence
8368 of N elements in the loop mask will be all-zero or all-one.
8369 We can then view-convert the mask so that each sequence of
8370 N elements is replaced by a single element. */
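/* For instance (illustrative only): a mask recorded for vectors of eight
   16-bit elements can serve a request for vectors of four 32-bit elements;
   each aligned pair of mask elements is known to be identical, so the
   VIEW_CONVERT_EXPR below reinterprets the 8-element mask as an equivalent
   4-element mask. */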
8371 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8372 TYPE_VECTOR_SUBPARTS (vectype)));
8373 gimple_seq seq = NULL;
8374 mask_type = build_same_sized_truth_vector_type (vectype);
8375 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8376 if (seq)
8377 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8378 }
8379 return mask;
8380 }
8381
8382 /* Scale the profiling counters of LOOP, which is vectorized by factor VF,
8383 according to the new estimated number of iterations. */
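/* Purely as an illustration, with made-up numbers: if the preheader count
   is 10, the header count is 1000 and VF is 4, niter_for_unrolled_loop
   gives an estimated latch count of about 24, so the body counts are scaled
   by roughly 10 * (24 + 1) / 1000 = 1/4 and the exit edge is given a
   probability of about 1 / (24 + 1). */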
8384
8385 static void
8386 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8387 {
8388 edge preheader = loop_preheader_edge (loop);
8389 /* Reduce loop iterations by the vectorization factor. */
8390 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8391 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8392
8393 if (freq_h.nonzero_p ())
8394 {
8395 profile_probability p;
8396
8397 /* Avoid dropping the loop body profile counter to 0 because of a zero
8398 count in the loop's preheader. */
8399 if (!(freq_e == profile_count::zero ()))
8400 freq_e = freq_e.force_nonzero ();
8401 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8402 scale_loop_frequencies (loop, p);
8403 }
8404
8405 edge exit_e = single_exit (loop);
8406 exit_e->probability = profile_probability::always ()
8407 .apply_scale (1, new_est_niter + 1);
8408
8409 edge exit_l = single_pred_edge (loop->latch);
8410 profile_probability prob = exit_l->probability;
8411 exit_l->probability = exit_e->probability.invert ();
8412 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8413 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8414 }
8415
8416 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8417 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8418 stmt_vec_info. */
8419
8420 static void
8421 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8422 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8423 {
8424 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8425 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8426
8427 if (dump_enabled_p ())
8428 dump_printf_loc (MSG_NOTE, vect_location,
8429 "------>vectorizing statement: %G", stmt_info->stmt);
8430
8431 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8432 vect_loop_kill_debug_uses (loop, stmt_info);
8433
8434 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8435 && !STMT_VINFO_LIVE_P (stmt_info))
8436 return;
8437
8438 if (STMT_VINFO_VECTYPE (stmt_info))
8439 {
8440 poly_uint64 nunits
8441 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8442 if (!STMT_SLP_TYPE (stmt_info)
8443 && maybe_ne (nunits, vf)
8444 && dump_enabled_p ())
8445 /* For SLP, VF is set according to the unrolling factor, not to
8446 the vector size, hence this diagnostic does not apply to SLP. */
8447 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8448 }
8449
8450 /* Pure SLP statements have already been vectorized. We still need
8451 to apply loop vectorization to hybrid SLP statements. */
8452 if (PURE_SLP_STMT (stmt_info))
8453 return;
8454
8455 if (dump_enabled_p ())
8456 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8457
8458 if (vect_transform_stmt (stmt_info, gsi, NULL, NULL))
8459 *seen_store = stmt_info;
8460 }
8461
8462 /* Function vect_transform_loop.
8463
8464 The analysis phase has determined that the loop is vectorizable.
8465 Vectorize the loop - create vectorized stmts to replace the scalar
8466 stmts in the loop, and update the loop exit condition.
8467 Return the scalar epilogue loop, if any. */
8468
8469 struct loop *
8470 vect_transform_loop (loop_vec_info loop_vinfo)
8471 {
8472 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8473 struct loop *epilogue = NULL;
8474 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8475 int nbbs = loop->num_nodes;
8476 int i;
8477 tree niters_vector = NULL_TREE;
8478 tree step_vector = NULL_TREE;
8479 tree niters_vector_mult_vf = NULL_TREE;
8480 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8481 unsigned int lowest_vf = constant_lower_bound (vf);
8482 gimple *stmt;
8483 bool check_profitability = false;
8484 unsigned int th;
8485
8486 DUMP_VECT_SCOPE ("vec_transform_loop");
8487
8488 loop_vinfo->shared->check_datarefs ();
8489
8490 /* Use the more conservative vectorization threshold. If the number
8491 of iterations is constant, assume the cost check has been performed
8492 by our caller. If the threshold makes all loops profitable that
8493 run at least the (estimated) vectorization factor number of times,
8494 checking is pointless, too. */
8495 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8496 if (th >= vect_vf_for_cost (loop_vinfo)
8497 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8498 {
8499 if (dump_enabled_p ())
8500 dump_printf_loc (MSG_NOTE, vect_location,
8501 "Profitability threshold is %d loop iterations.\n",
8502 th);
8503 check_profitability = true;
8504 }
8505
8506 /* Make sure there exists a single-predecessor exit bb. Do this before
8507 versioning. */
8508 edge e = single_exit (loop);
8509 if (! single_pred_p (e->dest))
8510 {
8511 split_loop_exit_edge (e, true);
8512 if (dump_enabled_p ())
8513 dump_printf (MSG_NOTE, "split exit edge\n");
8514 }
8515
8516 /* Version the loop first, if required, so the profitability check
8517 comes first. */
8518
8519 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8520 {
8521 poly_uint64 versioning_threshold
8522 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8523 if (check_profitability
8524 && ordered_p (poly_uint64 (th), versioning_threshold))
8525 {
8526 versioning_threshold = ordered_max (poly_uint64 (th),
8527 versioning_threshold);
8528 check_profitability = false;
8529 }
8530 struct loop *sloop
8531 = vect_loop_versioning (loop_vinfo, th, check_profitability,
8532 versioning_threshold);
8533 sloop->force_vectorize = false;
8534 check_profitability = false;
8535 }
8536
8537 /* Make sure there exists a single-predecessor exit bb also on the
8538 scalar loop copy. Do this after versioning but before peeling
8539 so the CFG structure is fine for both the scalar and the if-converted
8540 loop, letting slpeel_duplicate_current_defs_from_edges face matched
8541 loop-closed PHI nodes on the exit. */
8542 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8543 {
8544 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8545 if (! single_pred_p (e->dest))
8546 {
8547 split_loop_exit_edge (e, true);
8548 if (dump_enabled_p ())
8549 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8550 }
8551 }
8552
8553 tree niters = vect_build_loop_niters (loop_vinfo);
8554 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8555 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8556 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8557 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8558 &step_vector, &niters_vector_mult_vf, th,
8559 check_profitability, niters_no_overflow);
8560
8561 if (niters_vector == NULL_TREE)
8562 {
8563 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8564 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8565 && known_eq (lowest_vf, vf))
8566 {
8567 niters_vector
8568 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8569 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8570 step_vector = build_one_cst (TREE_TYPE (niters));
8571 }
8572 else
8573 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8574 &step_vector, niters_no_overflow);
8575 }
8576
8577 /* 1) Make sure the loop header has exactly two entries
8578 2) Make sure we have a preheader basic block. */
8579
8580 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8581
8582 split_edge (loop_preheader_edge (loop));
8583
8584 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8585 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8586 /* This will deal with any possible peeling. */
8587 vect_prepare_for_masked_peels (loop_vinfo);
8588
8589 /* Schedule the SLP instances first, then handle loop vectorization
8590 below. */
8591 if (!loop_vinfo->slp_instances.is_empty ())
8592 {
8593 DUMP_VECT_SCOPE ("scheduling SLP instances");
8594 vect_schedule_slp (loop_vinfo);
8595 }
8596
8597 /* FORNOW: the vectorizer supports only loops whose body consists
8598 of one basic block (header + empty latch). When the vectorizer
8599 supports more involved loop forms, the order in which the BBs are
8600 traversed will need to be reconsidered. */
8601
8602 for (i = 0; i < nbbs; i++)
8603 {
8604 basic_block bb = bbs[i];
8605 stmt_vec_info stmt_info;
8606
8607 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8608 gsi_next (&si))
8609 {
8610 gphi *phi = si.phi ();
8611 if (dump_enabled_p ())
8612 dump_printf_loc (MSG_NOTE, vect_location,
8613 "------>vectorizing phi: %G", phi);
8614 stmt_info = loop_vinfo->lookup_stmt (phi);
8615 if (!stmt_info)
8616 continue;
8617
8618 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8619 vect_loop_kill_debug_uses (loop, stmt_info);
8620
8621 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8622 && !STMT_VINFO_LIVE_P (stmt_info))
8623 continue;
8624
8625 if (STMT_VINFO_VECTYPE (stmt_info)
8626 && (maybe_ne
8627 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8628 && dump_enabled_p ())
8629 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8630
8631 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8632 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8633 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8634 && ! PURE_SLP_STMT (stmt_info))
8635 {
8636 if (dump_enabled_p ())
8637 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8638 vect_transform_stmt (stmt_info, NULL, NULL, NULL);
8639 }
8640 }
8641
8642 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8643 !gsi_end_p (si);)
8644 {
8645 stmt = gsi_stmt (si);
8646 /* During vectorization remove existing clobber stmts. */
8647 if (gimple_clobber_p (stmt))
8648 {
8649 unlink_stmt_vdef (stmt);
8650 gsi_remove (&si, true);
8651 release_defs (stmt);
8652 }
8653 else
8654 {
8655 stmt_info = loop_vinfo->lookup_stmt (stmt);
8656
8657 /* Vector stmts created in the outer-loop during vectorization of
8658 stmts in an inner-loop may not have a stmt_info, and do not
8659 need to be vectorized. */
8660 stmt_vec_info seen_store = NULL;
8661 if (stmt_info)
8662 {
8663 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8664 {
8665 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8666 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8667 !gsi_end_p (subsi); gsi_next (&subsi))
8668 {
8669 stmt_vec_info pat_stmt_info
8670 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
8671 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
8672 &si, &seen_store);
8673 }
8674 stmt_vec_info pat_stmt_info
8675 = STMT_VINFO_RELATED_STMT (stmt_info);
8676 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
8677 &seen_store);
8678 }
8679 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
8680 &seen_store);
8681 }
8682 gsi_next (&si);
8683 if (seen_store)
8684 {
8685 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8686 /* Interleaving: the vectorization of the
8687 interleaving chain has been completed -
8688 free all the stores in the chain. */
8689 vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8690 else
8691 /* Free the attached stmt_vec_info and remove the stmt. */
8692 loop_vinfo->remove_stmt (stmt_info);
8693 }
8694 }
8695 }
8696
8697 /* Stub out scalar statements that must not survive vectorization.
8698 Doing this here helps with grouped statements, or statements that
8699 are involved in patterns. */
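/* A hypothetical example: an if-converted scalar load such as

     _5 = MASK_LOAD (p_2, 0B, cond_3);

   whose result is no longer needed once its consumers have been vectorized
   still has a non-vector LHS, so the loop below replaces it with _5 = 0 and
   leaves the dead assignment for later cleanup. */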
8700 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8701 !gsi_end_p (gsi); gsi_next (&gsi))
8702 {
8703 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8704 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8705 {
8706 tree lhs = gimple_get_lhs (call);
8707 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8708 {
8709 tree zero = build_zero_cst (TREE_TYPE (lhs));
8710 gimple *new_stmt = gimple_build_assign (lhs, zero);
8711 gsi_replace (&gsi, new_stmt, true);
8712 }
8713 }
8714 }
8715 } /* BBs in loop */
8716
8717 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8718 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8719 if (integer_onep (step_vector))
8720 niters_no_overflow = true;
8721 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8722 niters_vector_mult_vf, !niters_no_overflow);
8723
8724 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8725 scale_profile_for_vect_loop (loop, assumed_vf);
8726
8727 /* True if the final iteration might not handle a full vector's
8728 worth of scalar iterations. */
8729 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8730 /* The minimum number of iterations performed by the epilogue. This
8731 is 1 when peeling for gaps because we always need a final scalar
8732 iteration. */
8733 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8734 /* +1 to convert latch counts to loop iteration counts,
8735 -min_epilogue_iters to remove iterations that cannot be performed
8736 by the vector code. */
8737 int bias_for_lowest = 1 - min_epilogue_iters;
8738 int bias_for_assumed = bias_for_lowest;
8739 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8740 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8741 {
8742 /* When the amount of peeling is known at compile time, the first
8743 iteration will have exactly alignment_npeels active elements.
8744 In the worst case it will have at least one. */
8745 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8746 bias_for_lowest += lowest_vf - min_first_active;
8747 bias_for_assumed += assumed_vf - min_first_active;
8748 }
8749 /* In these calculations the "- 1" converts loop iteration counts
8750 back to latch counts. */
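/* For example, with made-up numbers: if the scalar loop's
   nb_iterations_upper_bound is 99 (at most 100 iterations), LOWEST_VF is 4,
   there is no peeling for gaps and the loop is not fully masked, then
   bias_for_lowest is 1 and the new bound is floor ((99 + 1) / 4) - 1 = 24,
   i.e. at most 25 vector iterations. */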
8751 if (loop->any_upper_bound)
8752 loop->nb_iterations_upper_bound
8753 = (final_iter_may_be_partial
8754 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8755 lowest_vf) - 1
8756 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8757 lowest_vf) - 1);
8758 if (loop->any_likely_upper_bound)
8759 loop->nb_iterations_likely_upper_bound
8760 = (final_iter_may_be_partial
8761 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8762 + bias_for_lowest, lowest_vf) - 1
8763 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8764 + bias_for_lowest, lowest_vf) - 1);
8765 if (loop->any_estimate)
8766 loop->nb_iterations_estimate
8767 = (final_iter_may_be_partial
8768 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8769 assumed_vf) - 1
8770 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8771 assumed_vf) - 1);
8772
8773 if (dump_enabled_p ())
8774 {
8775 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8776 {
8777 dump_printf_loc (MSG_NOTE, vect_location,
8778 "LOOP VECTORIZED\n");
8779 if (loop->inner)
8780 dump_printf_loc (MSG_NOTE, vect_location,
8781 "OUTER LOOP VECTORIZED\n");
8782 dump_printf (MSG_NOTE, "\n");
8783 }
8784 else
8785 {
8786 dump_printf_loc (MSG_NOTE, vect_location,
8787 "LOOP EPILOGUE VECTORIZED (VS=");
8788 dump_dec (MSG_NOTE, current_vector_size);
8789 dump_printf (MSG_NOTE, ")\n");
8790 }
8791 }
8792
8793 /* Loops vectorized with a variable factor won't benefit from
8794 unrolling/peeling. */
8795 if (!vf.is_constant ())
8796 {
8797 loop->unroll = 1;
8798 if (dump_enabled_p ())
8799 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
8800 " variable-length vectorization factor\n");
8801 }
8802 /* Free SLP instances here because otherwise stmt reference counting
8803 won't work. */
8804 slp_instance instance;
8805 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8806 vect_free_slp_instance (instance, true);
8807 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8808 /* Clear the safelen field since its value is invalid after vectorization:
8809 the vectorized loop can have loop-carried dependencies. */
8810 loop->safelen = 0;
8811
8812 /* Don't vectorize the epilogue of an epilogue loop. */
8813 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8814 epilogue = NULL;
8815
8816 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8817 epilogue = NULL;
8818
8819 if (epilogue)
8820 {
8821 auto_vector_sizes vector_sizes;
8822 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes, false);
8823 unsigned int next_size = 0;
8824
8825 /* Note LOOP_VINFO_NITERS_KNOWN_P and LOOP_VINFO_INT_NITERS work
8826 on niters already adjusted for the iterations of the prologue. */
8827 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8828 && known_eq (vf, lowest_vf))
8829 {
8830 unsigned HOST_WIDE_INT eiters
8831 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8832 - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
8833 eiters
8834 = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
8835 epilogue->nb_iterations_upper_bound = eiters - 1;
8836 epilogue->any_upper_bound = true;
8837
8838 unsigned int ratio;
8839 while (next_size < vector_sizes.length ()
8840 && !(constant_multiple_p (current_vector_size,
8841 vector_sizes[next_size], &ratio)
8842 && eiters >= lowest_vf / ratio))
8843 next_size += 1;
8844 }
8845 else
8846 while (next_size < vector_sizes.length ()
8847 && maybe_lt (current_vector_size, vector_sizes[next_size]))
8848 next_size += 1;
8849
8850 if (next_size == vector_sizes.length ())
8851 epilogue = NULL;
8852 }
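/* A hypothetical illustration of the vector-size selection above: assume
   the target offers vector sizes of 32, 16 and 8 bytes, current_vector_size
   is 32 bytes, lowest_vf is 8 and only 5 scalar iterations remain for the
   epilogue. The 32-byte entry is rejected because 5 < 8, but the 16-byte
   entry gives ratio = 2 and 5 >= 8 / 2, so the epilogue is retried with
   16-byte vectors. */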
8853
8854 if (epilogue)
8855 {
8856 epilogue->force_vectorize = loop->force_vectorize;
8857 epilogue->safelen = loop->safelen;
8858 epilogue->dont_vectorize = false;
8859
8860 /* We may need to if-convert epilogue to vectorize it. */
8861 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8862 tree_if_conversion (epilogue);
8863 }
8864
8865 return epilogue;
8866 }
8867
8868 /* The code below performs a simple optimization - it reverts
8869 if-conversion for masked stores: if the mask of a store is all-zero,
8870 skip the store and, if possible, the producers of the stored values too.
8871 For example,
8872 for (i=0; i<n; i++)
8873 if (c[i])
8874 {
8875 p1[i] += 1;
8876 p2[i] = p3[i] + 2;
8877 }
8878 this transformation will produce the following semi-hammock:
8879
8880 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
8881 {
8882 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8883 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8884 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8885 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8886 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8887 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8888 }
8889 */
8890
8891 void
8892 optimize_mask_stores (struct loop *loop)
8893 {
8894 basic_block *bbs = get_loop_body (loop);
8895 unsigned nbbs = loop->num_nodes;
8896 unsigned i;
8897 basic_block bb;
8898 struct loop *bb_loop;
8899 gimple_stmt_iterator gsi;
8900 gimple *stmt;
8901 auto_vec<gimple *> worklist;
8902 auto_purge_vect_location sentinel;
8903
8904 vect_location = find_loop_location (loop);
8905 /* Pick up all masked stores in loop if any. */
8906 for (i = 0; i < nbbs; i++)
8907 {
8908 bb = bbs[i];
8909 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8910 gsi_next (&gsi))
8911 {
8912 stmt = gsi_stmt (gsi);
8913 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8914 worklist.safe_push (stmt);
8915 }
8916 }
8917
8918 free (bbs);
8919 if (worklist.is_empty ())
8920 return;
8921
8922 /* Loop has masked stores. */
8923 while (!worklist.is_empty ())
8924 {
8925 gimple *last, *last_store;
8926 edge e, efalse;
8927 tree mask;
8928 basic_block store_bb, join_bb;
8929 gimple_stmt_iterator gsi_to;
8930 tree vdef, new_vdef;
8931 gphi *phi;
8932 tree vectype;
8933 tree zero;
8934
8935 last = worklist.pop ();
8936 mask = gimple_call_arg (last, 2);
8937 bb = gimple_bb (last);
8938 /* Create then_bb and the if-then structure in the CFG; then_bb belongs
8939 to the same loop as if_bb. That loop can be different from LOOP when
8940 a two-level loop nest is vectorized and the mask_store belongs to the
8941 inner loop. */
8942 e = split_block (bb, last);
8943 bb_loop = bb->loop_father;
8944 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8945 join_bb = e->dest;
8946 store_bb = create_empty_bb (bb);
8947 add_bb_to_loop (store_bb, bb_loop);
8948 e->flags = EDGE_TRUE_VALUE;
8949 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8950 /* Mark the edge into STORE_BB as unlikely. */
8951 efalse->probability = profile_probability::unlikely ();
8952 store_bb->count = efalse->count ();
8953 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8954 if (dom_info_available_p (CDI_DOMINATORS))
8955 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8956 if (dump_enabled_p ())
8957 dump_printf_loc (MSG_NOTE, vect_location,
8958 "Create new block %d to sink mask stores.",
8959 store_bb->index);
8960 /* Create vector comparison with boolean result. */
8961 vectype = TREE_TYPE (mask);
8962 zero = build_zero_cst (vectype);
8963 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8964 gsi = gsi_last_bb (bb);
8965 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
8966 /* Create new PHI node for vdef of the last masked store:
8967 .MEM_2 = VDEF <.MEM_1>
8968 will be converted to
8969 .MEM.3 = VDEF <.MEM_1>
8970 and new PHI node will be created in join bb
8971 .MEM_2 = PHI <.MEM_1, .MEM_3>
8972 */
8973 vdef = gimple_vdef (last);
8974 new_vdef = make_ssa_name (gimple_vop (cfun), last);
8975 gimple_set_vdef (last, new_vdef);
8976 phi = create_phi_node (vdef, join_bb);
8977 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8978
8979 /* Put all masked stores with the same mask to STORE_BB if possible. */
8980 while (true)
8981 {
8982 gimple_stmt_iterator gsi_from;
8983 gimple *stmt1 = NULL;
8984
8985 /* Move masked store to STORE_BB. */
8986 last_store = last;
8987 gsi = gsi_for_stmt (last);
8988 gsi_from = gsi;
8989 /* Shift GSI to the previous stmt for further traversal. */
8990 gsi_prev (&gsi);
8991 gsi_to = gsi_start_bb (store_bb);
8992 gsi_move_before (&gsi_from, &gsi_to);
8993 /* Reset GSI_TO to the start of the now non-empty block. */
8994 gsi_to = gsi_start_bb (store_bb);
8995 if (dump_enabled_p ())
8996 dump_printf_loc (MSG_NOTE, vect_location,
8997 "Move stmt to created bb\n%G", last);
8998 /* Move all stored value producers if possible. */
8999 while (!gsi_end_p (gsi))
9000 {
9001 tree lhs;
9002 imm_use_iterator imm_iter;
9003 use_operand_p use_p;
9004 bool res;
9005
9006 /* Skip debug statements. */
9007 if (is_gimple_debug (gsi_stmt (gsi)))
9008 {
9009 gsi_prev (&gsi);
9010 continue;
9011 }
9012 stmt1 = gsi_stmt (gsi);
9013 /* Do not consider statements that write to memory or have
9014 a volatile operand. */
9015 if (gimple_vdef (stmt1)
9016 || gimple_has_volatile_ops (stmt1))
9017 break;
9018 gsi_from = gsi;
9019 gsi_prev (&gsi);
9020 lhs = gimple_get_lhs (stmt1);
9021 if (!lhs)
9022 break;
9023
9024 /* LHS of vectorized stmt must be SSA_NAME. */
9025 if (TREE_CODE (lhs) != SSA_NAME)
9026 break;
9027
9028 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9029 {
9030 /* Remove dead scalar statement. */
9031 if (has_zero_uses (lhs))
9032 {
9033 gsi_remove (&gsi_from, true);
9034 continue;
9035 }
9036 }
9037
9038 /* Check that LHS does not have uses outside of STORE_BB. */
9039 res = true;
9040 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9041 {
9042 gimple *use_stmt;
9043 use_stmt = USE_STMT (use_p);
9044 if (is_gimple_debug (use_stmt))
9045 continue;
9046 if (gimple_bb (use_stmt) != store_bb)
9047 {
9048 res = false;
9049 break;
9050 }
9051 }
9052 if (!res)
9053 break;
9054
9055 if (gimple_vuse (stmt1)
9056 && gimple_vuse (stmt1) != gimple_vuse (last_store))
9057 break;
9058
9059 /* Can move STMT1 to STORE_BB. */
9060 if (dump_enabled_p ())
9061 dump_printf_loc (MSG_NOTE, vect_location,
9062 "Move stmt to created bb\n%G", stmt1);
9063 gsi_move_before (&gsi_from, &gsi_to);
9064 /* Shift GSI_TO for further insertion. */
9065 gsi_prev (&gsi_to);
9066 }
9067 /* Put other masked stores with the same mask to STORE_BB. */
9068 if (worklist.is_empty ()
9069 || gimple_call_arg (worklist.last (), 2) != mask
9070 || worklist.last () != stmt1)
9071 break;
9072 last = worklist.pop ();
9073 }
9074 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
9075 }
9076 }
9077
9078 /* Decide whether it is possible to use a zero-based induction variable
9079 when vectorizing LOOP_VINFO with a fully-masked loop. If it is,
9080 return the value that the induction variable must be able to hold
9081 in order to ensure that the loop ends with an all-false mask.
9082 Return -1 otherwise. */
9083 widest_int
9084 vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo)
9085 {
9086 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9087 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9088 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
9089
9090 /* Calculate the value that the induction variable must be able
9091 to hit in order to ensure that we end the loop with an all-false mask.
9092 This involves adding the maximum number of inactive trailing scalar
9093 iterations. */
9094 widest_int iv_limit = -1;
9095 if (max_loop_iterations (loop, &iv_limit))
9096 {
9097 if (niters_skip)
9098 {
9099 /* Add the maximum number of skipped iterations to the
9100 maximum iteration count. */
9101 if (TREE_CODE (niters_skip) == INTEGER_CST)
9102 iv_limit += wi::to_widest (niters_skip);
9103 else
9104 iv_limit += max_vf - 1;
9105 }
9106 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
9107 /* Make a conservatively-correct assumption. */
9108 iv_limit += max_vf - 1;
9109
9110 /* IV_LIMIT is the maximum number of latch iterations, which is also
9111 the maximum in-range IV value. Round this value down to the previous
9112 vector alignment boundary and then add an extra full iteration. */
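/* Purely as a numeric illustration: with a constant VF of 4 (so MAX_VF is
   also 4), no skipped or peeled iterations and a maximum latch count of 21,
   the limit is (21 & -4) + 4 = 20 + 4 = 24. */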
9113 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9114 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
9115 }
9116 return iv_limit;
9117 }
9118