1 /* Loop Vectorization
2 Copyright (C) 2003-2015 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "hash-set.h"
28 #include "machmode.h"
29 #include "vec.h"
30 #include "double-int.h"
31 #include "input.h"
32 #include "alias.h"
33 #include "symtab.h"
34 #include "wide-int.h"
35 #include "inchash.h"
36 #include "tree.h"
37 #include "fold-const.h"
38 #include "stor-layout.h"
39 #include "predict.h"
40 #include "hard-reg-set.h"
41 #include "function.h"
42 #include "dominance.h"
43 #include "cfg.h"
44 #include "cfganal.h"
45 #include "basic-block.h"
46 #include "gimple-pretty-print.h"
47 #include "tree-ssa-alias.h"
48 #include "internal-fn.h"
49 #include "gimple-expr.h"
50 #include "is-a.h"
51 #include "gimple.h"
52 #include "gimplify.h"
53 #include "gimple-iterator.h"
54 #include "gimplify-me.h"
55 #include "gimple-ssa.h"
56 #include "tree-phinodes.h"
57 #include "ssa-iterators.h"
58 #include "stringpool.h"
59 #include "tree-ssanames.h"
60 #include "tree-ssa-loop-ivopts.h"
61 #include "tree-ssa-loop-manip.h"
62 #include "tree-ssa-loop-niter.h"
63 #include "tree-pass.h"
64 #include "cfgloop.h"
65 #include "hashtab.h"
66 #include "rtl.h"
67 #include "flags.h"
68 #include "statistics.h"
69 #include "real.h"
70 #include "fixed-value.h"
71 #include "insn-config.h"
72 #include "expmed.h"
73 #include "dojump.h"
74 #include "explow.h"
75 #include "calls.h"
76 #include "emit-rtl.h"
77 #include "varasm.h"
78 #include "stmt.h"
79 #include "expr.h"
80 #include "recog.h"
81 #include "insn-codes.h"
82 #include "optabs.h"
83 #include "params.h"
84 #include "diagnostic-core.h"
85 #include "tree-chrec.h"
86 #include "tree-scalar-evolution.h"
87 #include "tree-vectorizer.h"
88 #include "target.h"
89
90 /* Loop Vectorization Pass.
91
92 This pass tries to vectorize loops.
93
94 For example, the vectorizer transforms the following simple loop:
95
96 short a[N]; short b[N]; short c[N]; int i;
97
98 for (i=0; i<N; i++){
99 a[i] = b[i] + c[i];
100 }
101
102 as if it had been manually vectorized by rewriting the source code into:
103
104 typedef int __attribute__((mode(V8HI))) v8hi;
105 short a[N]; short b[N]; short c[N]; int i;
106 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
107 v8hi va, vb, vc;
108
109 for (i=0; i<N/8; i++){
110 vb = pb[i];
111 vc = pc[i];
112 va = vb + vc;
113 pa[i] = va;
114 }
115
116 The main entry to this pass is vectorize_loops(), in which
117 the vectorizer applies a set of analyses on a given set of loops,
118 followed by the actual vectorization transformation for the loops that
119 had successfully passed the analysis phase.
120 Throughout this pass we make a distinction between two types of
121 data: scalars (which are represented by SSA_NAMES), and memory references
122 ("data-refs"). These two types of data require different handling both
123 during analysis and transformation. The types of data-refs that the
124 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
125 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
126 accesses are required to have a simple (consecutive) access pattern.
127
128 Analysis phase:
129 ===============
130 The driver for the analysis phase is vect_analyze_loop().
131 It applies a set of analyses, some of which rely on the scalar evolution
132 analyzer (scev) developed by Sebastian Pop.
133
134 During the analysis phase the vectorizer records some information
135 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
136 loop, as well as general information about the loop as a whole, which is
137 recorded in a "loop_vec_info" struct attached to each loop.
138
139 Transformation phase:
140 =====================
141 The loop transformation phase scans all the stmts in the loop, and
142 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
143 the loop that needs to be vectorized. It inserts the vector code sequence
144 just before the scalar stmt S, and records a pointer to the vector code
145 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
146 attached to S). This pointer will be used for the vectorization of following
147 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
148 otherwise, we rely on dead code elimination for removing it.
149
150 For example, say stmt S1 was vectorized into stmt VS1:
151
152 VS1: vb = px[i];
153 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
154 S2: a = b;
155
156 To vectorize stmt S2, the vectorizer first finds the stmt that defines
157 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
158 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
159 resulting sequence would be:
160
161 VS1: vb = px[i];
162 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
163 VS2: va = vb;
164 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
165
166 Operands that are not SSA_NAMEs are data-refs that appear in
167 load/store operations (like 'x[i]' in S1), and are handled differently.
168
169 Target modeling:
170 =================
171 Currently the only target specific information that is used is the
172 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
173 Targets that can support different vector sizes will, for now, need
174 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
175 flexibility will be added in the future.
176
177 Since we only vectorize operations whose vector form can be
178 expressed using existing tree codes, to verify that an operation is
179 supported, the vectorizer checks the relevant optab at the relevant
180 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
181 the value found is CODE_FOR_nothing, then there's no target support, and
182 we can't vectorize the stmt.
183
184 For additional information on this project see:
185 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
186 */
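/* As a purely illustrative sketch (not part of the pass itself), the
   target-support query described above boils down to a check like

     if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
       ... no target support - give up on this stmt ...

   performed for each operation and each candidate vector mode.  */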
187
188 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
189
190 /* Function vect_determine_vectorization_factor
191
192 Determine the vectorization factor (VF). VF is the number of data elements
193 that are operated upon in parallel in a single iteration of the vectorized
194 loop. For example, when vectorizing a loop that operates on 4-byte elements,
195 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
196 elements can fit in a single vector register.
197
198 We currently support vectorization of loops in which all types operated upon
199 are of the same size. Therefore this function currently sets VF according to
200 the size of the types operated upon, and fails if there are multiple sizes
201 in the loop.
202
203 VF is also the factor by which the loop iterations are strip-mined, e.g.:
204 original loop:
205 for (i=0; i<N; i++){
206 a[i] = b[i] + c[i];
207 }
208
209 vectorized loop:
210 for (i=0; i<N; i+=VF){
211 a[i:VF] = b[i:VF] + c[i:VF];
212 }
213 */
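/* Illustrative numbers only: for 4-byte ints on a target whose vectors
   are 16 bytes wide, get_vectype_for_scalar_type returns a 4-element
   vector type, TYPE_VECTOR_SUBPARTS of that type is 4, and the VF
   computed below is therefore 4 (16 bytes / 4 bytes per element).  */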
214
215 static bool
216 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
217 {
218 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
219 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
220 int nbbs = loop->num_nodes;
221 unsigned int vectorization_factor = 0;
222 tree scalar_type;
223 gphi *phi;
224 tree vectype;
225 unsigned int nunits;
226 stmt_vec_info stmt_info;
227 int i;
228 HOST_WIDE_INT dummy;
229 gimple stmt, pattern_stmt = NULL;
230 gimple_seq pattern_def_seq = NULL;
231 gimple_stmt_iterator pattern_def_si = gsi_none ();
232 bool analyze_pattern_stmt = false;
233
234 if (dump_enabled_p ())
235 dump_printf_loc (MSG_NOTE, vect_location,
236 "=== vect_determine_vectorization_factor ===\n");
237
238 for (i = 0; i < nbbs; i++)
239 {
240 basic_block bb = bbs[i];
241
242 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
243 gsi_next (&si))
244 {
245 phi = si.phi ();
246 stmt_info = vinfo_for_stmt (phi);
247 if (dump_enabled_p ())
248 {
249 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
250 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
251 dump_printf (MSG_NOTE, "\n");
252 }
253
254 gcc_assert (stmt_info);
255
256 if (STMT_VINFO_RELEVANT_P (stmt_info))
257 {
258 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
259 scalar_type = TREE_TYPE (PHI_RESULT (phi));
260
261 if (dump_enabled_p ())
262 {
263 dump_printf_loc (MSG_NOTE, vect_location,
264 "get vectype for scalar type: ");
265 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
266 dump_printf (MSG_NOTE, "\n");
267 }
268
269 vectype = get_vectype_for_scalar_type (scalar_type);
270 if (!vectype)
271 {
272 if (dump_enabled_p ())
273 {
274 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
275 "not vectorized: unsupported "
276 "data-type ");
277 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
278 scalar_type);
279 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
280 }
281 return false;
282 }
283 STMT_VINFO_VECTYPE (stmt_info) = vectype;
284
285 if (dump_enabled_p ())
286 {
287 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
288 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
289 dump_printf (MSG_NOTE, "\n");
290 }
291
292 nunits = TYPE_VECTOR_SUBPARTS (vectype);
293 if (dump_enabled_p ())
294 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
295 nunits);
296
297 if (!vectorization_factor
298 || (nunits > vectorization_factor))
299 vectorization_factor = nunits;
300 }
301 }
302
303 for (gimple_stmt_iterator si = gsi_start_bb (bb);
304 !gsi_end_p (si) || analyze_pattern_stmt;)
305 {
306 tree vf_vectype;
307
308 if (analyze_pattern_stmt)
309 stmt = pattern_stmt;
310 else
311 stmt = gsi_stmt (si);
312
313 stmt_info = vinfo_for_stmt (stmt);
314
315 if (dump_enabled_p ())
316 {
317 dump_printf_loc (MSG_NOTE, vect_location,
318 "==> examining statement: ");
319 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
320 dump_printf (MSG_NOTE, "\n");
321 }
322
323 gcc_assert (stmt_info);
324
325 /* Skip stmts which do not need to be vectorized. */
326 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
327 && !STMT_VINFO_LIVE_P (stmt_info))
328 || gimple_clobber_p (stmt))
329 {
330 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
331 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
332 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
333 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
334 {
335 stmt = pattern_stmt;
336 stmt_info = vinfo_for_stmt (pattern_stmt);
337 if (dump_enabled_p ())
338 {
339 dump_printf_loc (MSG_NOTE, vect_location,
340 "==> examining pattern statement: ");
341 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
342 dump_printf (MSG_NOTE, "\n");
343 }
344 }
345 else
346 {
347 if (dump_enabled_p ())
348 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
349 gsi_next (&si);
350 continue;
351 }
352 }
353 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
354 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
355 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
356 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
357 analyze_pattern_stmt = true;
358
359 /* If a pattern statement has def stmts, analyze them too. */
360 if (is_pattern_stmt_p (stmt_info))
361 {
362 if (pattern_def_seq == NULL)
363 {
364 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
365 pattern_def_si = gsi_start (pattern_def_seq);
366 }
367 else if (!gsi_end_p (pattern_def_si))
368 gsi_next (&pattern_def_si);
369 if (pattern_def_seq != NULL)
370 {
371 gimple pattern_def_stmt = NULL;
372 stmt_vec_info pattern_def_stmt_info = NULL;
373
374 while (!gsi_end_p (pattern_def_si))
375 {
376 pattern_def_stmt = gsi_stmt (pattern_def_si);
377 pattern_def_stmt_info
378 = vinfo_for_stmt (pattern_def_stmt);
379 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
380 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
381 break;
382 gsi_next (&pattern_def_si);
383 }
384
385 if (!gsi_end_p (pattern_def_si))
386 {
387 if (dump_enabled_p ())
388 {
389 dump_printf_loc (MSG_NOTE, vect_location,
390 "==> examining pattern def stmt: ");
391 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
392 pattern_def_stmt, 0);
393 dump_printf (MSG_NOTE, "\n");
394 }
395
396 stmt = pattern_def_stmt;
397 stmt_info = pattern_def_stmt_info;
398 }
399 else
400 {
401 pattern_def_si = gsi_none ();
402 analyze_pattern_stmt = false;
403 }
404 }
405 else
406 analyze_pattern_stmt = false;
407 }
408
409 if (gimple_get_lhs (stmt) == NULL_TREE
410 /* MASK_STORE has no lhs, but is ok. */
411 && (!is_gimple_call (stmt)
412 || !gimple_call_internal_p (stmt)
413 || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
414 {
415 if (is_gimple_call (stmt))
416 {
417 /* Ignore calls with no lhs. These must be calls to
418 #pragma omp simd functions, and what vectorization factor
419 it really needs can't be determined until
420 vectorizable_simd_clone_call. */
421 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
422 {
423 pattern_def_seq = NULL;
424 gsi_next (&si);
425 }
426 continue;
427 }
428 if (dump_enabled_p ())
429 {
430 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
431 "not vectorized: irregular stmt.");
432 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
433 0);
434 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
435 }
436 return false;
437 }
438
439 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
440 {
441 if (dump_enabled_p ())
442 {
443 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
444 "not vectorized: vector stmt in loop:");
445 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
446 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
447 }
448 return false;
449 }
450
451 if (STMT_VINFO_VECTYPE (stmt_info))
452 {
453 /* The only case in which a vectype has already been set is for stmts
454 that contain a dataref, or for "pattern-stmts" (stmts
455 generated by the vectorizer to represent/replace a certain
456 idiom). */
457 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
458 || is_pattern_stmt_p (stmt_info)
459 || !gsi_end_p (pattern_def_si));
460 vectype = STMT_VINFO_VECTYPE (stmt_info);
461 }
462 else
463 {
464 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
465 if (is_gimple_call (stmt)
466 && gimple_call_internal_p (stmt)
467 && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
468 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
469 else
470 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
471 if (dump_enabled_p ())
472 {
473 dump_printf_loc (MSG_NOTE, vect_location,
474 "get vectype for scalar type: ");
475 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
476 dump_printf (MSG_NOTE, "\n");
477 }
478 vectype = get_vectype_for_scalar_type (scalar_type);
479 if (!vectype)
480 {
481 if (dump_enabled_p ())
482 {
483 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
484 "not vectorized: unsupported "
485 "data-type ");
486 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
487 scalar_type);
488 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
489 }
490 return false;
491 }
492
493 STMT_VINFO_VECTYPE (stmt_info) = vectype;
494
495 if (dump_enabled_p ())
496 {
497 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
498 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
499 dump_printf (MSG_NOTE, "\n");
500 }
501 }
502
503 /* The vectorization factor is according to the smallest
504 scalar type (or the largest vector size, but we only
505 support one vector size per loop). */
506 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
507 &dummy);
508 if (dump_enabled_p ())
509 {
510 dump_printf_loc (MSG_NOTE, vect_location,
511 "get vectype for scalar type: ");
512 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
513 dump_printf (MSG_NOTE, "\n");
514 }
515 vf_vectype = get_vectype_for_scalar_type (scalar_type);
516 if (!vf_vectype)
517 {
518 if (dump_enabled_p ())
519 {
520 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
521 "not vectorized: unsupported data-type ");
522 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
523 scalar_type);
524 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
525 }
526 return false;
527 }
528
529 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
530 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
531 {
532 if (dump_enabled_p ())
533 {
534 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
535 "not vectorized: different sized vector "
536 "types in statement, ");
537 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
538 vectype);
539 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
540 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
541 vf_vectype);
542 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
543 }
544 return false;
545 }
546
547 if (dump_enabled_p ())
548 {
549 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
550 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
551 dump_printf (MSG_NOTE, "\n");
552 }
553
554 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
555 if (dump_enabled_p ())
556 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
557 if (!vectorization_factor
558 || (nunits > vectorization_factor))
559 vectorization_factor = nunits;
560
561 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
562 {
563 pattern_def_seq = NULL;
564 gsi_next (&si);
565 }
566 }
567 }
568
569 /* TODO: Analyze cost. Decide if worthwhile to vectorize. */
570 if (dump_enabled_p ())
571 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
572 vectorization_factor);
573 if (vectorization_factor <= 1)
574 {
575 if (dump_enabled_p ())
576 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
577 "not vectorized: unsupported data-type\n");
578 return false;
579 }
580 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
581
582 return true;
583 }
584
585
586 /* Function vect_is_simple_iv_evolution.
587
588 FORNOW: A simple evolution of an induction variable in the loop is
589 considered a polynomial evolution. */
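/* For instance (illustrative only), for the variable i in
   "for (i = 0; i < n; i++)" the access function computed by scev is a
   linear chrec of the form {0, +, 1} in that loop, i.e. initial
   condition 0 and step 1, so *INIT and *STEP are set accordingly.  */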
590
591 static bool
592 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
593 tree * step)
594 {
595 tree init_expr;
596 tree step_expr;
597 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
598 basic_block bb;
599
600 /* When there is no evolution in this loop, the evolution function
601 is not "simple". */
602 if (evolution_part == NULL_TREE)
603 return false;
604
605 /* When the evolution is a polynomial of degree >= 2
606 the evolution function is not "simple". */
607 if (tree_is_chrec (evolution_part))
608 return false;
609
610 step_expr = evolution_part;
611 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
612
613 if (dump_enabled_p ())
614 {
615 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
616 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
617 dump_printf (MSG_NOTE, ", init: ");
618 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
619 dump_printf (MSG_NOTE, "\n");
620 }
621
622 *init = init_expr;
623 *step = step_expr;
624
625 if (TREE_CODE (step_expr) != INTEGER_CST
626 && (TREE_CODE (step_expr) != SSA_NAME
627 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
628 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
629 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
630 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
631 || !flag_associative_math)))
632 && (TREE_CODE (step_expr) != REAL_CST
633 || !flag_associative_math))
634 {
635 if (dump_enabled_p ())
636 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
637 "step unknown.\n");
638 return false;
639 }
640
641 return true;
642 }
643
644 /* Function vect_analyze_scalar_cycles_1.
645
646 Examine the cross iteration def-use cycles of scalar variables
647 in LOOP. LOOP_VINFO represents the loop that is now being
648 considered for vectorization (can be LOOP, or an outer-loop
649 enclosing LOOP). */
650
651 static void
652 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
653 {
654 basic_block bb = loop->header;
655 tree init, step;
656 auto_vec<gimple, 64> worklist;
657 gphi_iterator gsi;
658 bool double_reduc;
659
660 if (dump_enabled_p ())
661 dump_printf_loc (MSG_NOTE, vect_location,
662 "=== vect_analyze_scalar_cycles ===\n");
663
664 /* First - identify all inductions. Reduction detection assumes that all the
665 inductions have been identified; therefore, this order must not be
666 changed. */
667 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
668 {
669 gphi *phi = gsi.phi ();
670 tree access_fn = NULL;
671 tree def = PHI_RESULT (phi);
672 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
673
674 if (dump_enabled_p ())
675 {
676 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
677 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
678 dump_printf (MSG_NOTE, "\n");
679 }
680
681 /* Skip virtual phi's. The data dependences that are associated with
682 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
683 if (virtual_operand_p (def))
684 continue;
685
686 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
687
688 /* Analyze the evolution function. */
689 access_fn = analyze_scalar_evolution (loop, def);
690 if (access_fn)
691 {
692 STRIP_NOPS (access_fn);
693 if (dump_enabled_p ())
694 {
695 dump_printf_loc (MSG_NOTE, vect_location,
696 "Access function of PHI: ");
697 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
698 dump_printf (MSG_NOTE, "\n");
699 }
700 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
701 = evolution_part_in_loop_num (access_fn, loop->num);
702 }
703
704 if (!access_fn
705 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
706 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
707 && TREE_CODE (step) != INTEGER_CST))
708 {
709 worklist.safe_push (phi);
710 continue;
711 }
712
713 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
714
715 if (dump_enabled_p ())
716 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
717 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
718 }
719
720
721 /* Second - identify all reductions and nested cycles. */
722 while (worklist.length () > 0)
723 {
724 gimple phi = worklist.pop ();
725 tree def = PHI_RESULT (phi);
726 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
727 gimple reduc_stmt;
728 bool nested_cycle;
729
730 if (dump_enabled_p ())
731 {
732 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
733 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
734 dump_printf (MSG_NOTE, "\n");
735 }
736
737 gcc_assert (!virtual_operand_p (def)
738 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
739
740 nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
741 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
742 &double_reduc);
743 if (reduc_stmt)
744 {
745 if (double_reduc)
746 {
747 if (dump_enabled_p ())
748 dump_printf_loc (MSG_NOTE, vect_location,
749 "Detected double reduction.\n");
750
751 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
752 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
753 vect_double_reduction_def;
754 }
755 else
756 {
757 if (nested_cycle)
758 {
759 if (dump_enabled_p ())
760 dump_printf_loc (MSG_NOTE, vect_location,
761 "Detected vectorizable nested cycle.\n");
762
763 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
764 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
765 vect_nested_cycle;
766 }
767 else
768 {
769 if (dump_enabled_p ())
770 dump_printf_loc (MSG_NOTE, vect_location,
771 "Detected reduction.\n");
772
773 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
774 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
775 vect_reduction_def;
776 /* Store the reduction cycles for possible vectorization in
777 loop-aware SLP. */
778 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
779 }
780 }
781 }
782 else
783 if (dump_enabled_p ())
784 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
785 "Unknown def-use cycle pattern.\n");
786 }
787 }
788
789
790 /* Function vect_analyze_scalar_cycles.
791
792 Examine the cross iteration def-use cycles of scalar variables, by
793 analyzing the loop-header PHIs of scalar variables. Classify each
794 cycle as one of the following: invariant, induction, reduction, unknown.
795 We do that for the loop represented by LOOP_VINFO, and also for its
796 inner-loop, if it exists.
797 Examples for scalar cycles:
798
799 Example1: reduction:
800
801 loop1:
802 for (i=0; i<N; i++)
803 sum += a[i];
804
805 Example2: induction:
806
807 loop2:
808 for (i=0; i<N; i++)
809 a[i] = i; */
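/* Example3 (an illustrative addition): a cross-iteration cycle that is
   neither an induction nor a recognized reduction, e.g.

   loop3:
   for (i=0; i<N; i++)
     s = s * a[i] + b[i];

   is typically classified as "unknown" by the analysis below.  */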
810
811 static void
812 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
813 {
814 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
815
816 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
817
818 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
819 Reductions in such inner-loop therefore have different properties than
820 the reductions in the nest that gets vectorized:
821 1. When vectorized, they are executed in the same order as in the original
822 scalar loop, so we can't change the order of computation when
823 vectorizing them.
824 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
825 current checks are too strict. */
826
827 if (loop->inner)
828 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
829 }
830
831
832 /* Function vect_get_loop_niters.
833
834 Determine how many iterations the loop is executed and place it
835 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
836 in NUMBER_OF_ITERATIONSM1.
837
838 Return the loop exit condition. */
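/* Illustrative numbers only: in a loop whose header executes 100 times
   the latch executes 99 times, so NUMBER_OF_ITERATIONSM1 is set to 99
   and NUMBER_OF_ITERATIONS to 99 + 1 = 100.  */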
839
840
841 static gcond *
842 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations,
843 tree *number_of_iterationsm1)
844 {
845 tree niters;
846
847 if (dump_enabled_p ())
848 dump_printf_loc (MSG_NOTE, vect_location,
849 "=== get_loop_niters ===\n");
850
851 niters = number_of_latch_executions (loop);
852 *number_of_iterationsm1 = niters;
853
854 /* We want the number of loop header executions which is the number
855 of latch executions plus one.
856 ??? For UINT_MAX latch executions this number overflows to zero
857 for loops like do { n++; } while (n != 0); */
858 if (niters && !chrec_contains_undetermined (niters))
859 niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters), unshare_expr (niters),
860 build_int_cst (TREE_TYPE (niters), 1));
861 *number_of_iterations = niters;
862
863 return get_loop_exit_condition (loop);
864 }
865
866
867 /* Function bb_in_loop_p
868
869 Used as predicate for dfs order traversal of the loop bbs. */
870
871 static bool
872 bb_in_loop_p (const_basic_block bb, const void *data)
873 {
874 const struct loop *const loop = (const struct loop *)data;
875 if (flow_bb_inside_loop_p (loop, bb))
876 return true;
877 return false;
878 }
879
880
881 /* Function new_loop_vec_info.
882
883 Create and initialize a new loop_vec_info struct for LOOP, as well as
884 stmt_vec_info structs for all the stmts in LOOP. */
885
886 static loop_vec_info
887 new_loop_vec_info (struct loop *loop)
888 {
889 loop_vec_info res;
890 basic_block *bbs;
891 gimple_stmt_iterator si;
892 unsigned int i, nbbs;
893
894 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
895 LOOP_VINFO_LOOP (res) = loop;
896
897 bbs = get_loop_body (loop);
898
899 /* Create/Update stmt_info for all stmts in the loop. */
900 for (i = 0; i < loop->num_nodes; i++)
901 {
902 basic_block bb = bbs[i];
903
904 /* BBs in a nested inner-loop will have been already processed (because
905 we will have called vect_analyze_loop_form for any nested inner-loop).
906 Therefore, for stmts in an inner-loop we just want to update the
907 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
908 loop_info of the outer-loop we are currently considering to vectorize
909 (instead of the loop_info of the inner-loop).
910 For stmts in other BBs we need to create a stmt_info from scratch. */
911 if (bb->loop_father != loop)
912 {
913 /* Inner-loop bb. */
914 gcc_assert (loop->inner && bb->loop_father == loop->inner);
915 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
916 {
917 gimple phi = gsi_stmt (si);
918 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
919 loop_vec_info inner_loop_vinfo =
920 STMT_VINFO_LOOP_VINFO (stmt_info);
921 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
922 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
923 }
924 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
925 {
926 gimple stmt = gsi_stmt (si);
927 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
928 loop_vec_info inner_loop_vinfo =
929 STMT_VINFO_LOOP_VINFO (stmt_info);
930 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
931 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
932 }
933 }
934 else
935 {
936 /* bb in current nest. */
937 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
938 {
939 gimple phi = gsi_stmt (si);
940 gimple_set_uid (phi, 0);
941 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
942 }
943
944 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
945 {
946 gimple stmt = gsi_stmt (si);
947 gimple_set_uid (stmt, 0);
948 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
949 }
950 }
951 }
952
953 /* CHECKME: We want to visit all BBs before their successors (except for
954 latch blocks, for which this assertion wouldn't hold). In the simple
955 case of the loop forms we allow, a dfs order of the BBs would be the same
956 as reversed postorder traversal, so we are safe. */
957
958 free (bbs);
959 bbs = XCNEWVEC (basic_block, loop->num_nodes);
960 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
961 bbs, loop->num_nodes, loop);
962 gcc_assert (nbbs == loop->num_nodes);
963
964 LOOP_VINFO_BBS (res) = bbs;
965 LOOP_VINFO_NITERSM1 (res) = NULL;
966 LOOP_VINFO_NITERS (res) = NULL;
967 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
968 LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
969 LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
970 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
971 LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
972 LOOP_VINFO_VECT_FACTOR (res) = 0;
973 LOOP_VINFO_LOOP_NEST (res).create (3);
974 LOOP_VINFO_DATAREFS (res).create (10);
975 LOOP_VINFO_DDRS (res).create (10 * 10);
976 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
977 LOOP_VINFO_MAY_MISALIGN_STMTS (res).create (
978 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
979 LOOP_VINFO_MAY_ALIAS_DDRS (res).create (
980 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
981 LOOP_VINFO_GROUPED_STORES (res).create (10);
982 LOOP_VINFO_REDUCTIONS (res).create (10);
983 LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
984 LOOP_VINFO_SLP_INSTANCES (res).create (10);
985 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
986 LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
987 LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
988 LOOP_VINFO_PEELING_FOR_NITER (res) = false;
989 LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
990
991 return res;
992 }
993
994
995 /* Function destroy_loop_vec_info.
996
997 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
998 stmts in the loop. */
999
1000 void
1001 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
1002 {
1003 struct loop *loop;
1004 basic_block *bbs;
1005 int nbbs;
1006 gimple_stmt_iterator si;
1007 int j;
1008 vec<slp_instance> slp_instances;
1009 slp_instance instance;
1010 bool swapped;
1011
1012 if (!loop_vinfo)
1013 return;
1014
1015 loop = LOOP_VINFO_LOOP (loop_vinfo);
1016
1017 bbs = LOOP_VINFO_BBS (loop_vinfo);
1018 nbbs = clean_stmts ? loop->num_nodes : 0;
1019 swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
1020
1021 for (j = 0; j < nbbs; j++)
1022 {
1023 basic_block bb = bbs[j];
1024 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1025 free_stmt_vec_info (gsi_stmt (si));
1026
1027 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1028 {
1029 gimple stmt = gsi_stmt (si);
1030
1031 /* We may have broken canonical form by moving a constant
1032 into RHS1 of a commutative op. Fix such occurrences. */
1033 if (swapped && is_gimple_assign (stmt))
1034 {
1035 enum tree_code code = gimple_assign_rhs_code (stmt);
1036
1037 if ((code == PLUS_EXPR
1038 || code == POINTER_PLUS_EXPR
1039 || code == MULT_EXPR)
1040 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1041 swap_ssa_operands (stmt,
1042 gimple_assign_rhs1_ptr (stmt),
1043 gimple_assign_rhs2_ptr (stmt));
1044 }
1045
1046 /* Free stmt_vec_info. */
1047 free_stmt_vec_info (stmt);
1048 gsi_next (&si);
1049 }
1050 }
1051
1052 free (LOOP_VINFO_BBS (loop_vinfo));
1053 vect_destroy_datarefs (loop_vinfo, NULL);
1054 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1055 LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
1056 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
1057 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
1058 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1059 FOR_EACH_VEC_ELT (slp_instances, j, instance)
1060 vect_free_slp_instance (instance);
1061
1062 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1063 LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1064 LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1065 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1066
1067 delete LOOP_VINFO_PEELING_HTAB (loop_vinfo);
1068 LOOP_VINFO_PEELING_HTAB (loop_vinfo) = NULL;
1069
1070 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1071
1072 free (loop_vinfo);
1073 loop->aux = NULL;
1074 }
1075
1076
1077 /* Function vect_analyze_loop_1.
1078
1079 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1080 for it. The different analyses will record information in the
1081 loop_vec_info struct. This is a subset of the analyses applied in
1082 vect_analyze_loop, to be applied on an inner-loop nested in the loop
1083 that is now considered for (outer-loop) vectorization. */
1084
1085 static loop_vec_info
1086 vect_analyze_loop_1 (struct loop *loop)
1087 {
1088 loop_vec_info loop_vinfo;
1089
1090 if (dump_enabled_p ())
1091 dump_printf_loc (MSG_NOTE, vect_location,
1092 "===== analyze_loop_nest_1 =====\n");
1093
1094 /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.). */
1095
1096 loop_vinfo = vect_analyze_loop_form (loop);
1097 if (!loop_vinfo)
1098 {
1099 if (dump_enabled_p ())
1100 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1101 "bad inner-loop form.\n");
1102 return NULL;
1103 }
1104
1105 return loop_vinfo;
1106 }
1107
1108
1109 /* Function vect_analyze_loop_form.
1110
1111 Verify that certain CFG restrictions hold, including:
1112 - the loop has a pre-header
1113 - the loop has a single entry and exit
1114 - the loop exit condition is simple enough, and the number of iterations
1115 can be analyzed (a countable loop). */
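/* Two illustrative loops that fail these checks:

     for (i = 0; i < n; i++)
       if (a[i] == key)
         break;

   has more than one exit, and "while (*p++) ;" has a number of
   iterations that scev cannot compute, so it is not countable.  */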
1116
1117 loop_vec_info
1118 vect_analyze_loop_form (struct loop *loop)
1119 {
1120 loop_vec_info loop_vinfo;
1121 gcond *loop_cond;
1122 tree number_of_iterations = NULL, number_of_iterationsm1 = NULL;
1123 loop_vec_info inner_loop_vinfo = NULL;
1124
1125 if (dump_enabled_p ())
1126 dump_printf_loc (MSG_NOTE, vect_location,
1127 "=== vect_analyze_loop_form ===\n");
1128
1129 /* Different restrictions apply when we are considering an inner-most loop,
1130 vs. an outer (nested) loop.
1131 (FORNOW. May want to relax some of these restrictions in the future). */
1132
1133 if (!loop->inner)
1134 {
1135 /* Inner-most loop. We currently require that the number of BBs is
1136 exactly 2 (the header and latch). Vectorizable inner-most loops
1137 look like this:
1138
1139 (pre-header)
1140 |
1141 header <--------+
1142 | | |
1143 | +--> latch --+
1144 |
1145 (exit-bb) */
1146
1147 if (loop->num_nodes != 2)
1148 {
1149 if (dump_enabled_p ())
1150 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1151 "not vectorized: control flow in loop.\n");
1152 return NULL;
1153 }
1154
1155 if (empty_block_p (loop->header))
1156 {
1157 if (dump_enabled_p ())
1158 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1159 "not vectorized: empty loop.\n");
1160 return NULL;
1161 }
1162 }
1163 else
1164 {
1165 struct loop *innerloop = loop->inner;
1166 edge entryedge;
1167
1168 /* Nested loop. We currently require that the loop is doubly-nested,
1169 contains a single inner loop, and the number of BBs is exactly 5.
1170 Vectorizable outer-loops look like this:
1171
1172 (pre-header)
1173 |
1174 header <---+
1175 | |
1176 inner-loop |
1177 | |
1178 tail ------+
1179 |
1180 (exit-bb)
1181
1182 The inner-loop has the properties expected of inner-most loops
1183 as described above. */
1184
1185 if ((loop->inner)->inner || (loop->inner)->next)
1186 {
1187 if (dump_enabled_p ())
1188 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1189 "not vectorized: multiple nested loops.\n");
1190 return NULL;
1191 }
1192
1193 /* Analyze the inner-loop. */
1194 inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1195 if (!inner_loop_vinfo)
1196 {
1197 if (dump_enabled_p ())
1198 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1199 "not vectorized: Bad inner loop.\n");
1200 return NULL;
1201 }
1202
1203 if (!expr_invariant_in_loop_p (loop,
1204 LOOP_VINFO_NITERS (inner_loop_vinfo)))
1205 {
1206 if (dump_enabled_p ())
1207 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1208 "not vectorized: inner-loop count not"
1209 " invariant.\n");
1210 destroy_loop_vec_info (inner_loop_vinfo, true);
1211 return NULL;
1212 }
1213
1214 if (loop->num_nodes != 5)
1215 {
1216 if (dump_enabled_p ())
1217 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1218 "not vectorized: control flow in loop.\n");
1219 destroy_loop_vec_info (inner_loop_vinfo, true);
1220 return NULL;
1221 }
1222
1223 gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1224 entryedge = EDGE_PRED (innerloop->header, 0);
1225 if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1226 entryedge = EDGE_PRED (innerloop->header, 1);
1227
1228 if (entryedge->src != loop->header
1229 || !single_exit (innerloop)
1230 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1231 {
1232 if (dump_enabled_p ())
1233 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1234 "not vectorized: unsupported outerloop form.\n");
1235 destroy_loop_vec_info (inner_loop_vinfo, true);
1236 return NULL;
1237 }
1238
1239 if (dump_enabled_p ())
1240 dump_printf_loc (MSG_NOTE, vect_location,
1241 "Considering outer-loop vectorization.\n");
1242 }
1243
1244 if (!single_exit (loop)
1245 || EDGE_COUNT (loop->header->preds) != 2)
1246 {
1247 if (dump_enabled_p ())
1248 {
1249 if (!single_exit (loop))
1250 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1251 "not vectorized: multiple exits.\n");
1252 else if (EDGE_COUNT (loop->header->preds) != 2)
1253 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1254 "not vectorized: too many incoming edges.\n");
1255 }
1256 if (inner_loop_vinfo)
1257 destroy_loop_vec_info (inner_loop_vinfo, true);
1258 return NULL;
1259 }
1260
1261 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1262 that the loop is represented as a do-while (with a proper if-guard
1263 before the loop if needed), where the loop header contains all the
1264 executable statements, and the latch is empty. */
1265 if (!empty_block_p (loop->latch)
1266 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1267 {
1268 if (dump_enabled_p ())
1269 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1270 "not vectorized: latch block not empty.\n");
1271 if (inner_loop_vinfo)
1272 destroy_loop_vec_info (inner_loop_vinfo, true);
1273 return NULL;
1274 }
1275
1276 /* Make sure there exists a single-predecessor exit bb: */
1277 if (!single_pred_p (single_exit (loop)->dest))
1278 {
1279 edge e = single_exit (loop);
1280 if (!(e->flags & EDGE_ABNORMAL))
1281 {
1282 split_loop_exit_edge (e);
1283 if (dump_enabled_p ())
1284 dump_printf (MSG_NOTE, "split exit edge.\n");
1285 }
1286 else
1287 {
1288 if (dump_enabled_p ())
1289 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1290 "not vectorized: abnormal loop exit edge.\n");
1291 if (inner_loop_vinfo)
1292 destroy_loop_vec_info (inner_loop_vinfo, true);
1293 return NULL;
1294 }
1295 }
1296
1297 loop_cond = vect_get_loop_niters (loop, &number_of_iterations,
1298 &number_of_iterationsm1);
1299 if (!loop_cond)
1300 {
1301 if (dump_enabled_p ())
1302 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1303 "not vectorized: complicated exit condition.\n");
1304 if (inner_loop_vinfo)
1305 destroy_loop_vec_info (inner_loop_vinfo, true);
1306 return NULL;
1307 }
1308
1309 if (!number_of_iterations
1310 || chrec_contains_undetermined (number_of_iterations))
1311 {
1312 if (dump_enabled_p ())
1313 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1314 "not vectorized: number of iterations cannot be "
1315 "computed.\n");
1316 if (inner_loop_vinfo)
1317 destroy_loop_vec_info (inner_loop_vinfo, true);
1318 return NULL;
1319 }
1320
1321 if (integer_zerop (number_of_iterations))
1322 {
1323 if (dump_enabled_p ())
1324 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1325 "not vectorized: number of iterations = 0.\n");
1326 if (inner_loop_vinfo)
1327 destroy_loop_vec_info (inner_loop_vinfo, true);
1328 return NULL;
1329 }
1330
1331 loop_vinfo = new_loop_vec_info (loop);
1332 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1333 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1334 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1335
1336 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1337 {
1338 if (dump_enabled_p ())
1339 {
1340 dump_printf_loc (MSG_NOTE, vect_location,
1341 "Symbolic number of iterations is ");
1342 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1343 dump_printf (MSG_NOTE, "\n");
1344 }
1345 }
1346
1347 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1348
1349 /* CHECKME: May want to keep it around in the future. */
1350 if (inner_loop_vinfo)
1351 destroy_loop_vec_info (inner_loop_vinfo, false);
1352
1353 gcc_assert (!loop->aux);
1354 loop->aux = loop_vinfo;
1355 return loop_vinfo;
1356 }
1357
1358
1359 /* Function vect_analyze_loop_operations.
1360
1361 Scan the loop stmts and make sure they are all vectorizable. */
1362
1363 static bool
1364 vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
1365 {
1366 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1367 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1368 int nbbs = loop->num_nodes;
1369 unsigned int vectorization_factor = 0;
1370 int i;
1371 stmt_vec_info stmt_info;
1372 bool need_to_vectorize = false;
1373 int min_profitable_iters;
1374 int min_scalar_loop_bound;
1375 unsigned int th;
1376 bool only_slp_in_loop = true, ok;
1377 HOST_WIDE_INT max_niter;
1378 HOST_WIDE_INT estimated_niter;
1379 int min_profitable_estimate;
1380
1381 if (dump_enabled_p ())
1382 dump_printf_loc (MSG_NOTE, vect_location,
1383 "=== vect_analyze_loop_operations ===\n");
1384
1385 gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1386 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1387 if (slp)
1388 {
1389 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1390 the vectorization factor of the loop is the unrolling factor required by
1391 the SLP instances. If that unrolling factor is 1, we say that we
1392 perform pure SLP on the loop - cross-iteration parallelism is not
1393 exploited. */
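/* Illustrative numbers only: if the vectorization factor computed from
   the scalar types is 4 and the SLP instances require an unrolling
   factor of 2, the hybrid case below sets the factor to
   least_common_multiple (4, 2) == 4.  */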
1394 for (i = 0; i < nbbs; i++)
1395 {
1396 basic_block bb = bbs[i];
1397 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1398 gsi_next (&si))
1399 {
1400 gimple stmt = gsi_stmt (si);
1401 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1402 gcc_assert (stmt_info);
1403 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1404 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1405 && !PURE_SLP_STMT (stmt_info))
1406 /* STMT needs both SLP and loop-based vectorization. */
1407 only_slp_in_loop = false;
1408 }
1409 }
1410
1411 if (only_slp_in_loop)
1412 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1413 else
1414 vectorization_factor = least_common_multiple (vectorization_factor,
1415 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1416
1417 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1418 if (dump_enabled_p ())
1419 dump_printf_loc (MSG_NOTE, vect_location,
1420 "Updating vectorization factor to %d\n",
1421 vectorization_factor);
1422 }
1423
1424 for (i = 0; i < nbbs; i++)
1425 {
1426 basic_block bb = bbs[i];
1427
1428 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1429 gsi_next (&si))
1430 {
1431 gphi *phi = si.phi ();
1432 ok = true;
1433
1434 stmt_info = vinfo_for_stmt (phi);
1435 if (dump_enabled_p ())
1436 {
1437 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1438 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1439 dump_printf (MSG_NOTE, "\n");
1440 }
1441
1442 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1443 (i.e., a phi in the tail of the outer-loop). */
1444 if (! is_loop_header_bb_p (bb))
1445 {
1446 /* FORNOW: we currently don't support the case that these phis
1447 are not used in the outer-loop (unless it is a double reduction,
1448 i.e., this phi is vect_reduction_def), because this case
1449 requires us to actually do something here. */
1450 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1451 || STMT_VINFO_LIVE_P (stmt_info))
1452 && STMT_VINFO_DEF_TYPE (stmt_info)
1453 != vect_double_reduction_def)
1454 {
1455 if (dump_enabled_p ())
1456 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1457 "Unsupported loop-closed phi in "
1458 "outer-loop.\n");
1459 return false;
1460 }
1461
1462 /* If PHI is used in the outer loop, we check that its operand
1463 is defined in the inner loop. */
1464 if (STMT_VINFO_RELEVANT_P (stmt_info))
1465 {
1466 tree phi_op;
1467 gimple op_def_stmt;
1468
1469 if (gimple_phi_num_args (phi) != 1)
1470 return false;
1471
1472 phi_op = PHI_ARG_DEF (phi, 0);
1473 if (TREE_CODE (phi_op) != SSA_NAME)
1474 return false;
1475
1476 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1477 if (gimple_nop_p (op_def_stmt)
1478 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1479 || !vinfo_for_stmt (op_def_stmt))
1480 return false;
1481
1482 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1483 != vect_used_in_outer
1484 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1485 != vect_used_in_outer_by_reduction)
1486 return false;
1487 }
1488
1489 continue;
1490 }
1491
1492 gcc_assert (stmt_info);
1493
1494 if (STMT_VINFO_LIVE_P (stmt_info))
1495 {
1496 /* FORNOW: not yet supported. */
1497 if (dump_enabled_p ())
1498 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1499 "not vectorized: value used after loop.\n");
1500 return false;
1501 }
1502
1503 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1504 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1505 {
1506 /* A scalar-dependence cycle that we don't support. */
1507 if (dump_enabled_p ())
1508 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1509 "not vectorized: scalar dependence cycle.\n");
1510 return false;
1511 }
1512
1513 if (STMT_VINFO_RELEVANT_P (stmt_info))
1514 {
1515 need_to_vectorize = true;
1516 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1517 ok = vectorizable_induction (phi, NULL, NULL);
1518 }
1519
1520 if (!ok)
1521 {
1522 if (dump_enabled_p ())
1523 {
1524 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1525 "not vectorized: relevant phi not "
1526 "supported: ");
1527 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1528 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1529 }
1530 return false;
1531 }
1532 }
1533
1534 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1535 gsi_next (&si))
1536 {
1537 gimple stmt = gsi_stmt (si);
1538 if (!gimple_clobber_p (stmt)
1539 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1540 return false;
1541 }
1542 } /* bbs */
1543
1544 /* All operations in the loop are either irrelevant (they deal with loop
1545 control, or are dead), or are only used outside the loop and can be moved
1546 out of the loop (e.g. invariants, inductions). The loop can be
1547 optimized away by scalar optimizations. We're better off not
1548 touching this loop. */
1549 if (!need_to_vectorize)
1550 {
1551 if (dump_enabled_p ())
1552 dump_printf_loc (MSG_NOTE, vect_location,
1553 "All the computation can be taken out of the loop.\n");
1554 if (dump_enabled_p ())
1555 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1556 "not vectorized: redundant loop. no profit to "
1557 "vectorize.\n");
1558 return false;
1559 }
1560
1561 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1562 dump_printf_loc (MSG_NOTE, vect_location,
1563 "vectorization_factor = %d, niters = "
1564 HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1565 LOOP_VINFO_INT_NITERS (loop_vinfo));
1566
1567 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1568 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1569 || ((max_niter = max_stmt_executions_int (loop)) != -1
1570 && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1571 {
1572 if (dump_enabled_p ())
1573 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1574 "not vectorized: iteration count too small.\n");
1575 if (dump_enabled_p ())
1576 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1577 "not vectorized: iteration count smaller than "
1578 "vectorization factor.\n");
1579 return false;
1580 }
1581
1582 /* Analyze cost. Decide if worthwhile to vectorize. */
1583
1584 /* Once VF is set, SLP costs should be updated since the number of created
1585 vector stmts depends on VF. */
1586 vect_update_slp_costs_according_to_vf (loop_vinfo);
1587
1588 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1589 &min_profitable_estimate);
1590 LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1591
1592 if (min_profitable_iters < 0)
1593 {
1594 if (dump_enabled_p ())
1595 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1596 "not vectorized: vectorization not profitable.\n");
1597 if (dump_enabled_p ())
1598 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1599 "not vectorized: vector version will never be "
1600 "profitable.\n");
1601 return false;
1602 }
1603
1604 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1605 * vectorization_factor) - 1);
1606
1607
1608 /* Use the cost model only if it is more conservative than user specified
1609 threshold. */
1610
1611 th = (unsigned) min_scalar_loop_bound;
1612 if (min_profitable_iters
1613 && (!min_scalar_loop_bound
1614 || min_profitable_iters > min_scalar_loop_bound))
1615 th = (unsigned) min_profitable_iters;
1616
1617 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1618
1619 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1620 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1621 {
1622 if (dump_enabled_p ())
1623 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1624 "not vectorized: vectorization not profitable.\n");
1625 if (dump_enabled_p ())
1626 dump_printf_loc (MSG_NOTE, vect_location,
1627 "not vectorized: iteration count smaller than user "
1628 "specified loop bound parameter or minimum profitable "
1629 "iterations (whichever is more conservative).\n");
1630 return false;
1631 }
1632
1633 if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
1634 && ((unsigned HOST_WIDE_INT) estimated_niter
1635 <= MAX (th, (unsigned)min_profitable_estimate)))
1636 {
1637 if (dump_enabled_p ())
1638 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1639 "not vectorized: estimated iteration count too "
1640 "small.\n");
1641 if (dump_enabled_p ())
1642 dump_printf_loc (MSG_NOTE, vect_location,
1643 "not vectorized: estimated iteration count smaller "
1644 "than specified loop bound parameter or minimum "
1645 "profitable iterations (whichever is more "
1646 "conservative).\n");
1647 return false;
1648 }
1649
1650 return true;
1651 }
1652
1653
1654 /* Function vect_analyze_loop_2.
1655
1656 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1657 for it. The different analyses will record information in the
1658 loop_vec_info struct. */
1659 static bool
1660 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1661 {
1662 bool ok, slp = false;
1663 int max_vf = MAX_VECTORIZATION_FACTOR;
1664 int min_vf = 2;
1665 unsigned int th;
1666 unsigned int n_stmts = 0;
1667
1668 /* Find all data references in the loop (which correspond to vdefs/vuses)
1669 and analyze their evolution in the loop. Also adjust the minimal
1670 vectorization factor according to the loads and stores.
1671
1672 FORNOW: Handle only simple array references whose
1673 alignment can be forced, and aligned pointer-references. */
1674
1675 ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf, &n_stmts);
1676 if (!ok)
1677 {
1678 if (dump_enabled_p ())
1679 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1680 "bad data references.\n");
1681 return false;
1682 }
1683
1684 /* Classify all cross-iteration scalar data-flow cycles.
1685 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1686
1687 vect_analyze_scalar_cycles (loop_vinfo);
1688
1689 vect_pattern_recog (loop_vinfo, NULL);
1690
1691 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1692 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1693
1694 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1695 if (!ok)
1696 {
1697 if (dump_enabled_p ())
1698 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1699 "bad data access.\n");
1700 return false;
1701 }
1702
1703 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1704
1705 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1706 if (!ok)
1707 {
1708 if (dump_enabled_p ())
1709 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1710 "unexpected pattern.\n");
1711 return false;
1712 }
1713
1714 /* Analyze data dependences between the data-refs in the loop
1715 and adjust the maximum vectorization factor according to
1716 the dependences.
1717 FORNOW: fail at the first data dependence that we encounter. */
1718
1719 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1720 if (!ok
1721 || max_vf < min_vf)
1722 {
1723 if (dump_enabled_p ())
1724 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1725 "bad data dependence.\n");
1726 return false;
1727 }
1728
1729 ok = vect_determine_vectorization_factor (loop_vinfo);
1730 if (!ok)
1731 {
1732 if (dump_enabled_p ())
1733 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1734 "can't determine vectorization factor.\n");
1735 return false;
1736 }
1737 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1738 {
1739 if (dump_enabled_p ())
1740 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1741 "bad data dependence.\n");
1742 return false;
1743 }
1744
1745 /* Analyze the alignment of the data-refs in the loop.
1746 Fail if a data reference is found that cannot be vectorized. */
1747
1748 ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1749 if (!ok)
1750 {
1751 if (dump_enabled_p ())
1752 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1753 "bad data alignment.\n");
1754 return false;
1755 }
1756
1757 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1758 It is important to call pruning after vect_analyze_data_ref_accesses,
1759 since we use grouping information gathered by interleaving analysis. */
1760 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1761 if (!ok)
1762 {
1763 if (dump_enabled_p ())
1764 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1765 "number of versioning for alias "
1766 "run-time tests exceeds %d "
1767 "(--param vect-max-version-for-alias-checks)\n",
1768 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
1769 return false;
1770 }
1771
1772 /* This pass will decide on using loop versioning and/or loop peeling in
1773 order to enhance the alignment of data references in the loop. */
1774
1775 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1776 if (!ok)
1777 {
1778 if (dump_enabled_p ())
1779 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1780 "bad data alignment.\n");
1781 return false;
1782 }
1783
1784 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1785 ok = vect_analyze_slp (loop_vinfo, NULL, n_stmts);
1786 if (ok)
1787 {
1788 /* Decide which possible SLP instances to SLP. */
1789 slp = vect_make_slp_decision (loop_vinfo);
1790
1791 /* Find stmts that need to be both vectorized and SLPed. */
1792 vect_detect_hybrid_slp (loop_vinfo);
1793 }
1794 else
1795 return false;
1796
1797 /* Scan all the operations in the loop and make sure they are
1798 vectorizable. */
1799
1800 ok = vect_analyze_loop_operations (loop_vinfo, slp);
1801 if (!ok)
1802 {
1803 if (dump_enabled_p ())
1804 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1805 "bad operation or unsupported loop bound.\n");
1806 return false;
1807 }
1808
1809 /* Decide whether we need to create an epilogue loop to handle
1810 remaining scalar iterations. */
1811 th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + 1)
1812 / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1813 * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
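/* TH now holds LOOP_VINFO_COST_MODEL_THRESHOLD + 1 rounded down to a
   multiple of the vectorization factor; in the versioning case below it
   is compared against the maximum number of loop iterations to decide
   whether an epilogue loop is needed.  */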
1814
1815 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1816 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1817 {
1818 if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
1819 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
1820 < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1821 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1822 }
1823 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1824 || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1825 < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1826 /* In case of versioning, check if the maximum number of
1827 iterations is greater than th. If they are identical,
1828 the epilogue is unnecessary. */
1829 && ((!LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)
1830 && !LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1831 || (unsigned HOST_WIDE_INT)max_stmt_executions_int
1832 (LOOP_VINFO_LOOP (loop_vinfo)) > th)))
1833 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1834
1835 /* If an epilogue loop is required make sure we can create one. */
1836 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1837 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
1838 {
1839 if (dump_enabled_p ())
1840 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
1841 if (!vect_can_advance_ivs_p (loop_vinfo)
1842 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
1843 single_exit (LOOP_VINFO_LOOP
1844 (loop_vinfo))))
1845 {
1846 if (dump_enabled_p ())
1847 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1848 "not vectorized: can't create required "
1849 "epilog loop\n");
1850 return false;
1851 }
1852 }
1853
1854 return true;
1855 }
1856
1857 /* Function vect_analyze_loop.
1858
1859 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1860 for it. The different analyses will record information in the
1861 loop_vec_info struct. */
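/* The first attempt autodetects the vector size; if the analysis fails,
   it is retried with the largest remaining vector size advertised by the
   target until one succeeds or the sizes are exhausted (see the retry
   loop below).  */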
1862 loop_vec_info
1863 vect_analyze_loop (struct loop *loop)
1864 {
1865 loop_vec_info loop_vinfo;
1866 unsigned int vector_sizes;
1867
1868 /* Autodetect first vector size we try. */
1869 current_vector_size = 0;
1870 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1871
1872 if (dump_enabled_p ())
1873 dump_printf_loc (MSG_NOTE, vect_location,
1874 "===== analyze_loop_nest =====\n");
1875
1876 if (loop_outer (loop)
1877 && loop_vec_info_for_loop (loop_outer (loop))
1878 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1879 {
1880 if (dump_enabled_p ())
1881 dump_printf_loc (MSG_NOTE, vect_location,
1882 "outer-loop already vectorized.\n");
1883 return NULL;
1884 }
1885
1886 while (1)
1887 {
1888 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
1889 loop_vinfo = vect_analyze_loop_form (loop);
1890 if (!loop_vinfo)
1891 {
1892 if (dump_enabled_p ())
1893 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1894 "bad loop form.\n");
1895 return NULL;
1896 }
1897
1898 if (vect_analyze_loop_2 (loop_vinfo))
1899 {
1900 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1901
1902 return loop_vinfo;
1903 }
1904
1905 destroy_loop_vec_info (loop_vinfo, true);
1906
1907 vector_sizes &= ~current_vector_size;
1908 if (vector_sizes == 0
1909 || current_vector_size == 0)
1910 return NULL;
1911
1912 /* Try the largest remaining vector size. */
1913 current_vector_size = 1 << floor_log2 (vector_sizes);
1914 if (dump_enabled_p ())
1915 dump_printf_loc (MSG_NOTE, vect_location,
1916 "***** Re-trying analysis with "
1917 "vector size %d\n", current_vector_size);
1918 }
1919 }
1920
1921
1922 /* Function reduction_code_for_scalar_code
1923
1924 Input:
1925 CODE - tree_code of a reduction operations.
1926
1927 Output:
1928 REDUC_CODE - the corresponding tree-code to be used to reduce the
1929 vector of partial results into a single scalar result, or ERROR_MARK
1930 if the operation is a supported reduction operation, but does not have
1931 such a tree-code.
1932
1933 Return FALSE if CODE currently cannot be vectorized as a reduction. */
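/* For example, for a scalar accumulation with PLUS_EXPR (s += a[i]) the
   vector of partial sums is reduced in the epilogue with REDUC_PLUS_EXPR;
   for MULT_EXPR no such tree-code exists, so *REDUC_CODE is set to
   ERROR_MARK and the epilogue falls back to shifts or extracts.  */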
1934
1935 static bool
1936 reduction_code_for_scalar_code (enum tree_code code,
1937 enum tree_code *reduc_code)
1938 {
1939 switch (code)
1940 {
1941 case MAX_EXPR:
1942 *reduc_code = REDUC_MAX_EXPR;
1943 return true;
1944
1945 case MIN_EXPR:
1946 *reduc_code = REDUC_MIN_EXPR;
1947 return true;
1948
1949 case PLUS_EXPR:
1950 *reduc_code = REDUC_PLUS_EXPR;
1951 return true;
1952
1953 case MULT_EXPR:
1954 case MINUS_EXPR:
1955 case BIT_IOR_EXPR:
1956 case BIT_XOR_EXPR:
1957 case BIT_AND_EXPR:
1958 *reduc_code = ERROR_MARK;
1959 return true;
1960
1961 default:
1962 return false;
1963 }
1964 }
1965
1966
1967 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
1968 STMT is printed with a message MSG. */
1969
1970 static void
1971 report_vect_op (int msg_type, gimple stmt, const char *msg)
1972 {
1973 dump_printf_loc (msg_type, vect_location, "%s", msg);
1974 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
1975 dump_printf (msg_type, "\n");
1976 }
1977
1978
1979 /* Detect SLP reduction of the form:
1980
1981 #a1 = phi <a5, a0>
1982 a2 = operation (a1)
1983 a3 = operation (a2)
1984 a4 = operation (a3)
1985 a5 = operation (a4)
1986
1987 #a = phi <a5>
1988
1989 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
1990 FIRST_STMT is the first reduction stmt in the chain
1991 (a2 = operation (a1)).
1992
1993 Return TRUE if a reduction chain was detected. */
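/* For illustration (a sketch, names are hypothetical): a scalar source
   loop such as

     for (i = 0; i < n; i++)
       s = s + a[2*i] + a[2*i+1];

   produces such a chain of two PLUS_EXPR statements between the reduction
   phi and the loop-exit phi.  */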
1994
1995 static bool
1996 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
1997 {
1998 struct loop *loop = (gimple_bb (phi))->loop_father;
1999 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2000 enum tree_code code;
2001 gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
2002 stmt_vec_info use_stmt_info, current_stmt_info;
2003 tree lhs;
2004 imm_use_iterator imm_iter;
2005 use_operand_p use_p;
2006 int nloop_uses, size = 0, n_out_of_loop_uses;
2007 bool found = false;
2008
2009 if (loop != vect_loop)
2010 return false;
2011
2012 lhs = PHI_RESULT (phi);
2013 code = gimple_assign_rhs_code (first_stmt);
2014 while (1)
2015 {
2016 nloop_uses = 0;
2017 n_out_of_loop_uses = 0;
2018 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2019 {
2020 gimple use_stmt = USE_STMT (use_p);
2021 if (is_gimple_debug (use_stmt))
2022 continue;
2023
2024 /* Check if we got back to the reduction phi. */
2025 if (use_stmt == phi)
2026 {
2027 loop_use_stmt = use_stmt;
2028 found = true;
2029 break;
2030 }
2031
2032 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2033 {
2034 if (vinfo_for_stmt (use_stmt)
2035 && !STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (use_stmt)))
2036 {
2037 loop_use_stmt = use_stmt;
2038 nloop_uses++;
2039 }
2040 }
2041 else
2042 n_out_of_loop_uses++;
2043
2044 /* There can be either a single use in the loop or two uses in
2045 phi nodes. */
2046 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2047 return false;
2048 }
2049
2050 if (found)
2051 break;
2052
2053 /* We reached a statement with no loop uses. */
2054 if (nloop_uses == 0)
2055 return false;
2056
2057 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2058 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2059 return false;
2060
2061 if (!is_gimple_assign (loop_use_stmt)
2062 || code != gimple_assign_rhs_code (loop_use_stmt)
2063 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2064 return false;
2065
2066 /* Insert USE_STMT into reduction chain. */
2067 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2068 if (current_stmt)
2069 {
2070 current_stmt_info = vinfo_for_stmt (current_stmt);
2071 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2072 GROUP_FIRST_ELEMENT (use_stmt_info)
2073 = GROUP_FIRST_ELEMENT (current_stmt_info);
2074 }
2075 else
2076 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2077
2078 lhs = gimple_assign_lhs (loop_use_stmt);
2079 current_stmt = loop_use_stmt;
2080 size++;
2081 }
2082
2083 if (!found || loop_use_stmt != phi || size < 2)
2084 return false;
2085
2086 /* Swap the operands, if needed, to make the reduction operand be the second
2087 operand. */
2088 lhs = PHI_RESULT (phi);
2089 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2090 while (next_stmt)
2091 {
2092 if (gimple_assign_rhs2 (next_stmt) == lhs)
2093 {
2094 tree op = gimple_assign_rhs1 (next_stmt);
2095 gimple def_stmt = NULL;
2096
2097 if (TREE_CODE (op) == SSA_NAME)
2098 def_stmt = SSA_NAME_DEF_STMT (op);
2099
2100 /* Check that the other def is either defined in the loop
2101 ("vect_internal_def"), or it's an induction (defined by a
2102 loop-header phi-node). */
2103 if (def_stmt
2104 && gimple_bb (def_stmt)
2105 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2106 && (is_gimple_assign (def_stmt)
2107 || is_gimple_call (def_stmt)
2108 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2109 == vect_induction_def
2110 || (gimple_code (def_stmt) == GIMPLE_PHI
2111 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2112 == vect_internal_def
2113 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2114 {
2115 lhs = gimple_assign_lhs (next_stmt);
2116 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2117 continue;
2118 }
2119
2120 return false;
2121 }
2122 else
2123 {
2124 tree op = gimple_assign_rhs2 (next_stmt);
2125 gimple def_stmt = NULL;
2126
2127 if (TREE_CODE (op) == SSA_NAME)
2128 def_stmt = SSA_NAME_DEF_STMT (op);
2129
2130 /* Check that the other def is either defined in the loop
2131 ("vect_internal_def"), or it's an induction (defined by a
2132 loop-header phi-node). */
2133 if (def_stmt
2134 && gimple_bb (def_stmt)
2135 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2136 && (is_gimple_assign (def_stmt)
2137 || is_gimple_call (def_stmt)
2138 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2139 == vect_induction_def
2140 || (gimple_code (def_stmt) == GIMPLE_PHI
2141 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2142 == vect_internal_def
2143 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2144 {
2145 if (dump_enabled_p ())
2146 {
2147 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2148 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2149 dump_printf (MSG_NOTE, "\n");
2150 }
2151
2152 swap_ssa_operands (next_stmt,
2153 gimple_assign_rhs1_ptr (next_stmt),
2154 gimple_assign_rhs2_ptr (next_stmt));
2155 update_stmt (next_stmt);
2156
2157 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2158 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2159 }
2160 else
2161 return false;
2162 }
2163
2164 lhs = gimple_assign_lhs (next_stmt);
2165 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2166 }
2167
2168 /* Save the chain for further analysis in SLP detection. */
2169 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2170 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2171 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2172
2173 return true;
2174 }
2175
2176
2177 /* Function vect_is_simple_reduction_1
2178
2179 (1) Detect a cross-iteration def-use cycle that represents a simple
2180 reduction computation. We look for the following pattern:
2181
2182 loop_header:
2183 a1 = phi < a0, a2 >
2184 a3 = ...
2185 a2 = operation (a3, a1)
2186
2187 or
2188
2189 a3 = ...
2190 loop_header:
2191 a1 = phi < a0, a2 >
2192 a2 = operation (a3, a1)
2193
2194 such that:
2195 1. operation is commutative and associative and it is safe to
2196 change the order of the computation (if CHECK_REDUCTION is true)
2197 2. no uses for a2 in the loop (a2 is used out of the loop)
2198 3. no uses of a1 in the loop besides the reduction operation
2199 4. no uses of a1 outside the loop.
2200
2201 Conditions 1,4 are tested here.
2202 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2203
2204 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2205 nested cycles, if CHECK_REDUCTION is false.
2206
2207 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2208 reductions:
2209
2210 a1 = phi < a0, a2 >
2211 inner loop (def of a3)
2212 a2 = phi < a3 >
2213
2214 If MODIFY is true it also tries to rework the code in-place to enable
2215 detection of more reduction patterns. For the time being we rewrite
2216 "res -= RHS" into "res += -RHS" when it seems worthwhile.
2217 */
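/* For illustration (a sketch, names are hypothetical), the simplest source
   form of case (1) is a sum reduction such as

     for (i = 0; i < n; i++)
       sum += a[i];

   where "sum" carries the a1/a2 cross-iteration cycle described above.  */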
2218
2219 static gimple
2220 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2221 bool check_reduction, bool *double_reduc,
2222 bool modify)
2223 {
2224 struct loop *loop = (gimple_bb (phi))->loop_father;
2225 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2226 edge latch_e = loop_latch_edge (loop);
2227 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2228 gimple def_stmt, def1 = NULL, def2 = NULL;
2229 enum tree_code orig_code, code;
2230 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2231 tree type;
2232 int nloop_uses;
2233 tree name;
2234 imm_use_iterator imm_iter;
2235 use_operand_p use_p;
2236 bool phi_def;
2237
2238 *double_reduc = false;
2239
2240 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2241 otherwise, we assume outer loop vectorization. */
2242 gcc_assert ((check_reduction && loop == vect_loop)
2243 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2244
2245 name = PHI_RESULT (phi);
2246 /* ??? If there are no uses of the PHI result the inner loop reduction
2247 won't be detected as possibly double-reduction by vectorizable_reduction
2248 because that tries to walk the PHI arg from the preheader edge which
2249 can be constant. See PR60382. */
2250 if (has_zero_uses (name))
2251 return NULL;
2252 nloop_uses = 0;
2253 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2254 {
2255 gimple use_stmt = USE_STMT (use_p);
2256 if (is_gimple_debug (use_stmt))
2257 continue;
2258
2259 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2260 {
2261 if (dump_enabled_p ())
2262 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2263 "intermediate value used outside loop.\n");
2264
2265 return NULL;
2266 }
2267
2268 if (vinfo_for_stmt (use_stmt)
2269 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2270 nloop_uses++;
2271 if (nloop_uses > 1)
2272 {
2273 if (dump_enabled_p ())
2274 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2275 "reduction used in loop.\n");
2276 return NULL;
2277 }
2278 }
2279
2280 if (TREE_CODE (loop_arg) != SSA_NAME)
2281 {
2282 if (dump_enabled_p ())
2283 {
2284 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2285 "reduction: not ssa_name: ");
2286 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2287 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2288 }
2289 return NULL;
2290 }
2291
2292 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2293 if (!def_stmt)
2294 {
2295 if (dump_enabled_p ())
2296 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2297 "reduction: no def_stmt.\n");
2298 return NULL;
2299 }
2300
2301 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2302 {
2303 if (dump_enabled_p ())
2304 {
2305 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2306 dump_printf (MSG_NOTE, "\n");
2307 }
2308 return NULL;
2309 }
2310
2311 if (is_gimple_assign (def_stmt))
2312 {
2313 name = gimple_assign_lhs (def_stmt);
2314 phi_def = false;
2315 }
2316 else
2317 {
2318 name = PHI_RESULT (def_stmt);
2319 phi_def = true;
2320 }
2321
2322 nloop_uses = 0;
2323 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2324 {
2325 gimple use_stmt = USE_STMT (use_p);
2326 if (is_gimple_debug (use_stmt))
2327 continue;
2328 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2329 && vinfo_for_stmt (use_stmt)
2330 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2331 nloop_uses++;
2332 if (nloop_uses > 1)
2333 {
2334 if (dump_enabled_p ())
2335 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2336 "reduction used in loop.\n");
2337 return NULL;
2338 }
2339 }
2340
2341 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2342 defined in the inner loop. */
2343 if (phi_def)
2344 {
2345 op1 = PHI_ARG_DEF (def_stmt, 0);
2346
2347 if (gimple_phi_num_args (def_stmt) != 1
2348 || TREE_CODE (op1) != SSA_NAME)
2349 {
2350 if (dump_enabled_p ())
2351 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2352 "unsupported phi node definition.\n");
2353
2354 return NULL;
2355 }
2356
2357 def1 = SSA_NAME_DEF_STMT (op1);
2358 if (gimple_bb (def1)
2359 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2360 && loop->inner
2361 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2362 && is_gimple_assign (def1))
2363 {
2364 if (dump_enabled_p ())
2365 report_vect_op (MSG_NOTE, def_stmt,
2366 "detected double reduction: ");
2367
2368 *double_reduc = true;
2369 return def_stmt;
2370 }
2371
2372 return NULL;
2373 }
2374
2375 code = orig_code = gimple_assign_rhs_code (def_stmt);
2376
2377 /* We can handle "res -= x[i]", which is non-associative, by
2378 simply rewriting this into "res += -x[i]". Avoid changing the
2379 gimple instruction for the first simple tests and only do this
2380 if we're allowed to change the code at all. */
2381 if (code == MINUS_EXPR
2382 && modify
2383 && (op1 = gimple_assign_rhs1 (def_stmt))
2384 && TREE_CODE (op1) == SSA_NAME
2385 && SSA_NAME_DEF_STMT (op1) == phi)
2386 code = PLUS_EXPR;
2387
2388 if (check_reduction
2389 && (!commutative_tree_code (code) || !associative_tree_code (code)))
2390 {
2391 if (dump_enabled_p ())
2392 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2393 "reduction: not commutative/associative: ");
2394 return NULL;
2395 }
2396
2397 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2398 {
2399 if (code != COND_EXPR)
2400 {
2401 if (dump_enabled_p ())
2402 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2403 "reduction: not binary operation: ");
2404
2405 return NULL;
2406 }
2407
2408 op3 = gimple_assign_rhs1 (def_stmt);
2409 if (COMPARISON_CLASS_P (op3))
2410 {
2411 op4 = TREE_OPERAND (op3, 1);
2412 op3 = TREE_OPERAND (op3, 0);
2413 }
2414
2415 op1 = gimple_assign_rhs2 (def_stmt);
2416 op2 = gimple_assign_rhs3 (def_stmt);
2417
2418 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2419 {
2420 if (dump_enabled_p ())
2421 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2422 "reduction: uses not ssa_names: ");
2423
2424 return NULL;
2425 }
2426 }
2427 else
2428 {
2429 op1 = gimple_assign_rhs1 (def_stmt);
2430 op2 = gimple_assign_rhs2 (def_stmt);
2431
2432 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2433 {
2434 if (dump_enabled_p ())
2435 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2436 "reduction: uses not ssa_names: ");
2437
2438 return NULL;
2439 }
2440 }
2441
2442 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2443 if ((TREE_CODE (op1) == SSA_NAME
2444 && !types_compatible_p (type, TREE_TYPE (op1)))
2445 || (TREE_CODE (op2) == SSA_NAME
2446 && !types_compatible_p (type, TREE_TYPE (op2)))
2447 || (op3 && TREE_CODE (op3) == SSA_NAME
2448 && !types_compatible_p (type, TREE_TYPE (op3)))
2449 || (op4 && TREE_CODE (op4) == SSA_NAME
2450 && !types_compatible_p (type, TREE_TYPE (op4))))
2451 {
2452 if (dump_enabled_p ())
2453 {
2454 dump_printf_loc (MSG_NOTE, vect_location,
2455 "reduction: multiple types: operation type: ");
2456 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2457 dump_printf (MSG_NOTE, ", operands types: ");
2458 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2459 TREE_TYPE (op1));
2460 dump_printf (MSG_NOTE, ",");
2461 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2462 TREE_TYPE (op2));
2463 if (op3)
2464 {
2465 dump_printf (MSG_NOTE, ",");
2466 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2467 TREE_TYPE (op3));
2468 }
2469
2470 if (op4)
2471 {
2472 dump_printf (MSG_NOTE, ",");
2473 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2474 TREE_TYPE (op4));
2475 }
2476 dump_printf (MSG_NOTE, "\n");
2477 }
2478
2479 return NULL;
2480 }
2481
2482 /* Check that it's ok to change the order of the computation.
2483 Generally, when vectorizing a reduction we change the order of the
2484 computation. This may change the behavior of the program in some
2485 cases, so we need to check that this is ok. One exception is when
2486 vectorizing an outer-loop: the inner-loop is executed sequentially,
2487 and therefore vectorizing reductions in the inner-loop during
2488 outer-loop vectorization is safe. */
2489
2490 /* CHECKME: check for !flag_finite_math_only too? */
2491 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2492 && check_reduction)
2493 {
2494 /* Changing the order of operations changes the semantics. */
2495 if (dump_enabled_p ())
2496 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2497 "reduction: unsafe fp math optimization: ");
2498 return NULL;
2499 }
2500 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2501 && check_reduction)
2502 {
2503 /* Changing the order of operations changes the semantics. */
2504 if (dump_enabled_p ())
2505 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2506 "reduction: unsafe int math optimization: ");
2507 return NULL;
2508 }
2509 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2510 {
2511 /* Changing the order of operations changes the semantics. */
2512 if (dump_enabled_p ())
2513 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2514 "reduction: unsafe fixed-point math optimization: ");
2515 return NULL;
2516 }
2517
2518 /* If we detected "res -= x[i]" earlier, rewrite it into
2519 "res += -x[i]" now. If this turns out to be useless reassoc
2520 will clean it up again. */
2521 if (orig_code == MINUS_EXPR)
2522 {
2523 tree rhs = gimple_assign_rhs2 (def_stmt);
2524 tree negrhs = make_ssa_name (TREE_TYPE (rhs));
2525 gimple negate_stmt = gimple_build_assign (negrhs, NEGATE_EXPR, rhs);
2526 gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2527 set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2528 loop_info, NULL));
2529 gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2530 gimple_assign_set_rhs2 (def_stmt, negrhs);
2531 gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2532 update_stmt (def_stmt);
2533 }
2534
2535 /* Reduction is safe. We're dealing with one of the following:
2536 1) integer arithmetic and no trapv
2537 2) floating point arithmetic, and special flags permit this optimization
2538 3) nested cycle (i.e., outer loop vectorization). */
2539 if (TREE_CODE (op1) == SSA_NAME)
2540 def1 = SSA_NAME_DEF_STMT (op1);
2541
2542 if (TREE_CODE (op2) == SSA_NAME)
2543 def2 = SSA_NAME_DEF_STMT (op2);
2544
2545 if (code != COND_EXPR
2546 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2547 {
2548 if (dump_enabled_p ())
2549 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2550 return NULL;
2551 }
2552
2553 /* Check that one def is the reduction def, defined by PHI,
2554 the other def is either defined in the loop ("vect_internal_def"),
2555 or it's an induction (defined by a loop-header phi-node). */
2556
2557 if (def2 && def2 == phi
2558 && (code == COND_EXPR
2559 || !def1 || gimple_nop_p (def1)
2560 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
2561 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2562 && (is_gimple_assign (def1)
2563 || is_gimple_call (def1)
2564 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2565 == vect_induction_def
2566 || (gimple_code (def1) == GIMPLE_PHI
2567 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2568 == vect_internal_def
2569 && !is_loop_header_bb_p (gimple_bb (def1)))))))
2570 {
2571 if (dump_enabled_p ())
2572 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2573 return def_stmt;
2574 }
2575
2576 if (def1 && def1 == phi
2577 && (code == COND_EXPR
2578 || !def2 || gimple_nop_p (def2)
2579 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
2580 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2581 && (is_gimple_assign (def2)
2582 || is_gimple_call (def2)
2583 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2584 == vect_induction_def
2585 || (gimple_code (def2) == GIMPLE_PHI
2586 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2587 == vect_internal_def
2588 && !is_loop_header_bb_p (gimple_bb (def2)))))))
2589 {
2590 if (check_reduction)
2591 {
2592 /* Swap operands (just for simplicity - so that the rest of the code
2593 can assume that the reduction variable is always the last (second)
2594 argument). */
2595 if (dump_enabled_p ())
2596 report_vect_op (MSG_NOTE, def_stmt,
2597 "detected reduction: need to swap operands: ");
2598
2599 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2600 gimple_assign_rhs2_ptr (def_stmt));
2601
2602 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2603 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2604 }
2605 else
2606 {
2607 if (dump_enabled_p ())
2608 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2609 }
2610
2611 return def_stmt;
2612 }
2613
2614 /* Try to find SLP reduction chain. */
2615 if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2616 {
2617 if (dump_enabled_p ())
2618 report_vect_op (MSG_NOTE, def_stmt,
2619 "reduction: detected reduction chain: ");
2620
2621 return def_stmt;
2622 }
2623
2624 if (dump_enabled_p ())
2625 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2626 "reduction: unknown pattern: ");
2627
2628 return NULL;
2629 }
2630
2631 /* Wrapper around vect_is_simple_reduction_1 that won't modify the code
2632 in-place. Arguments are the same as for that function. */
2633
2634 static gimple
2635 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2636 bool check_reduction, bool *double_reduc)
2637 {
2638 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2639 double_reduc, false);
2640 }
2641
2642 /* Wrapper around vect_is_simple_reduction_1 that will modify the code
2643 in-place if doing so enables detection of more reductions. Arguments
2644 are the same as for that function. */
2645
2646 gimple
2647 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2648 bool check_reduction, bool *double_reduc)
2649 {
2650 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2651 double_reduc, true);
2652 }
2653
2654 /* Calculate the cost of one scalar iteration of the loop. */
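/* The result is essentially a weighted statement count: loads, stores and
   other statements get their target-specific scalar costs, and statements
   in an inner loop are additionally multiplied by an assumed inner trip
   count (see the FIXME below).  */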
2655 int
2656 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
2657 {
2658 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2659 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2660 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2661 int innerloop_iters, i, stmt_cost;
2662
2663 /* Count statements in scalar loop. Using this as scalar cost for a single
2664 iteration for now.
2665
2666 TODO: Add outer loop support.
2667
2668 TODO: Consider assigning different costs to different scalar
2669 statements. */
2670
2671 /* FORNOW. */
2672 innerloop_iters = 1;
2673 if (loop->inner)
2674 innerloop_iters = 50; /* FIXME */
2675
2676 for (i = 0; i < nbbs; i++)
2677 {
2678 gimple_stmt_iterator si;
2679 basic_block bb = bbs[i];
2680
2681 if (bb->loop_father == loop->inner)
2682 factor = innerloop_iters;
2683 else
2684 factor = 1;
2685
2686 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2687 {
2688 gimple stmt = gsi_stmt (si);
2689 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2690
2691 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2692 continue;
2693
2694 /* Skip stmts that are not vectorized inside the loop. */
2695 if (stmt_info
2696 && !STMT_VINFO_RELEVANT_P (stmt_info)
2697 && (!STMT_VINFO_LIVE_P (stmt_info)
2698 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2699 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
2700 continue;
2701
2702 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2703 {
2704 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2705 stmt_cost = vect_get_stmt_cost (scalar_load);
2706 else
2707 stmt_cost = vect_get_stmt_cost (scalar_store);
2708 }
2709 else
2710 stmt_cost = vect_get_stmt_cost (scalar_stmt);
2711
2712 scalar_single_iter_cost += stmt_cost * factor;
2713 }
2714 }
2715 return scalar_single_iter_cost;
2716 }
2717
2718 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
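/* For example (illustrative numbers only): with VF = 4, a known NITERS = 10
   and PEEL_ITERS_PROLOGUE = 2, the epilogue peels (10 - 2) % 4 = 0
   iterations, bumped to VF if peeling for gaps is required; the peeled
   iterations are costed as scalar statements in the prologue and epilogue
   cost vectors.  */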
2719 int
2720 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2721 int *peel_iters_epilogue,
2722 int scalar_single_iter_cost,
2723 stmt_vector_for_cost *prologue_cost_vec,
2724 stmt_vector_for_cost *epilogue_cost_vec)
2725 {
2726 int retval = 0;
2727 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2728
2729 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2730 {
2731 *peel_iters_epilogue = vf/2;
2732 if (dump_enabled_p ())
2733 dump_printf_loc (MSG_NOTE, vect_location,
2734 "cost model: epilogue peel iters set to vf/2 "
2735 "because loop iterations are unknown.\n");
2736
2737 /* If peeled iterations are known but the number of scalar loop
2738 iterations is unknown, count a taken branch per peeled loop. */
2739 retval = record_stmt_cost (prologue_cost_vec, 2, cond_branch_taken,
2740 NULL, 0, vect_prologue);
2741 }
2742 else
2743 {
2744 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2745 peel_iters_prologue = niters < peel_iters_prologue ?
2746 niters : peel_iters_prologue;
2747 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2748 /* If we need to peel for gaps but the epilogue would otherwise need no
2749 peeling, we still have to peel VF iterations. */
2750 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2751 *peel_iters_epilogue = vf;
2752 }
2753
2754 if (peel_iters_prologue)
2755 retval += record_stmt_cost (prologue_cost_vec,
2756 peel_iters_prologue * scalar_single_iter_cost,
2757 scalar_stmt, NULL, 0, vect_prologue);
2758 if (*peel_iters_epilogue)
2759 retval += record_stmt_cost (epilogue_cost_vec,
2760 *peel_iters_epilogue * scalar_single_iter_cost,
2761 scalar_stmt, NULL, 0, vect_epilogue);
2762 return retval;
2763 }
2764
2765 /* Function vect_estimate_min_profitable_iters
2766
2767 Return the number of iterations required for the vector version of the
2768 loop to be profitable relative to the cost of the scalar version of the
2769 loop. */
2770
2771 static void
2772 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2773 int *ret_min_profitable_niters,
2774 int *ret_min_profitable_estimate)
2775 {
2776 int min_profitable_iters;
2777 int min_profitable_estimate;
2778 int peel_iters_prologue;
2779 int peel_iters_epilogue;
2780 unsigned vec_inside_cost = 0;
2781 int vec_outside_cost = 0;
2782 unsigned vec_prologue_cost = 0;
2783 unsigned vec_epilogue_cost = 0;
2784 int scalar_single_iter_cost = 0;
2785 int scalar_outside_cost = 0;
2786 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2787 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2788 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2789
2790 /* Cost model disabled. */
2791 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2792 {
2793 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
2794 *ret_min_profitable_niters = 0;
2795 *ret_min_profitable_estimate = 0;
2796 return;
2797 }
2798
2799 /* Requires loop versioning tests to handle misalignment. */
2800 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2801 {
2802 /* FIXME: Make cost depend on complexity of individual check. */
2803 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2804 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2805 vect_prologue);
2806 dump_printf (MSG_NOTE,
2807 "cost model: Adding cost of checks for loop "
2808 "versioning to treat misalignment.\n");
2809 }
2810
2811 /* Requires loop versioning with alias checks. */
2812 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2813 {
2814 /* FIXME: Make cost depend on complexity of individual check. */
2815 unsigned len = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).length ();
2816 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2817 vect_prologue);
2818 dump_printf (MSG_NOTE,
2819 "cost model: Adding cost of checks for loop "
2820 "versioning aliasing.\n");
2821 }
2822
2823 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2824 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2825 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2826 vect_prologue);
2827
2828 /* Count statements in scalar loop. Using this as scalar cost for a single
2829 iteration for now.
2830
2831 TODO: Add outer loop support.
2832
2833 TODO: Consider assigning different costs to different scalar
2834 statements. */
2835
2836 scalar_single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
2837
2838 /* Add additional cost for the peeled instructions in prologue and epilogue
2839 loop.
2840
2841 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2842 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2843
2844 TODO: Build an expression that represents peel_iters for prologue and
2845 epilogue to be used in a run-time test. */
2846
2847 if (npeel < 0)
2848 {
2849 peel_iters_prologue = vf/2;
2850 dump_printf (MSG_NOTE, "cost model: "
2851 "prologue peel iters set to vf/2.\n");
2852
2853 /* If peeling for alignment is unknown, loop bound of main loop becomes
2854 unknown. */
2855 peel_iters_epilogue = vf/2;
2856 dump_printf (MSG_NOTE, "cost model: "
2857 "epilogue peel iters set to vf/2 because "
2858 "peeling for alignment is unknown.\n");
2859
2860 /* If peeled iterations are unknown, count a taken branch and a not taken
2861 branch per peeled loop. Even if scalar loop iterations are known,
2862 vector iterations are not known since peeled prologue iterations are
2863 not known. Hence guards remain the same. */
2864 (void) add_stmt_cost (target_cost_data, 2, cond_branch_taken,
2865 NULL, 0, vect_prologue);
2866 (void) add_stmt_cost (target_cost_data, 2, cond_branch_not_taken,
2867 NULL, 0, vect_prologue);
2868 /* FORNOW: Don't attempt to pass individual scalar instructions to
2869 the model; just assume linear cost for scalar iterations. */
2870 (void) add_stmt_cost (target_cost_data,
2871 peel_iters_prologue * scalar_single_iter_cost,
2872 scalar_stmt, NULL, 0, vect_prologue);
2873 (void) add_stmt_cost (target_cost_data,
2874 peel_iters_epilogue * scalar_single_iter_cost,
2875 scalar_stmt, NULL, 0, vect_epilogue);
2876 }
2877 else
2878 {
2879 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2880 stmt_info_for_cost *si;
2881 int j;
2882 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2883
2884 prologue_cost_vec.create (2);
2885 epilogue_cost_vec.create (2);
2886 peel_iters_prologue = npeel;
2887
2888 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2889 &peel_iters_epilogue,
2890 scalar_single_iter_cost,
2891 &prologue_cost_vec,
2892 &epilogue_cost_vec);
2893
2894 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2895 {
2896 struct _stmt_vec_info *stmt_info
2897 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2898 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2899 si->misalign, vect_prologue);
2900 }
2901
2902 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2903 {
2904 struct _stmt_vec_info *stmt_info
2905 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2906 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2907 si->misalign, vect_epilogue);
2908 }
2909
2910 prologue_cost_vec.release ();
2911 epilogue_cost_vec.release ();
2912 }
2913
2914 /* FORNOW: The scalar outside cost is incremented in one of the
2915 following ways:
2916
2917 1. The vectorizer checks for alignment and aliasing and generates
2918 a condition that allows dynamic vectorization. A cost model
2919 check is ANDed with the versioning condition. Hence the scalar code
2920 path now has the added cost of the versioning check.
2921
2922 if (cost > th & versioning_check)
2923 jmp to vector code
2924
2925 Hence run-time scalar is incremented by not-taken branch cost.
2926
2927 2. The vectorizer then checks if a prologue is required. If the
2928 cost model check was not done before during versioning, it has to
2929 be done before the prologue check.
2930
2931 if (cost <= th)
2932 prologue = scalar_iters
2933 if (prologue == 0)
2934 jmp to vector code
2935 else
2936 execute prologue
2937 if (prologue == num_iters)
2938 go to exit
2939
2940 Hence the run-time scalar cost is incremented by a taken branch,
2941 plus a not-taken branch, plus a taken branch cost.
2942
2943 3. The vectorizer then checks if an epilogue is required. If the
2944 cost model check was not done before during prologue check, it
2945 has to be done with the epilogue check.
2946
2947 if (prologue == 0)
2948 jmp to vector code
2949 else
2950 execute prologue
2951 if (prologue == num_iters)
2952 go to exit
2953 vector code:
2954 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2955 jmp to epilogue
2956
2957 Hence the run-time scalar cost should be incremented by 2 taken
2958 branches.
2959
2960 TODO: The back end may reorder the BBs differently and reverse
2961 conditions/branch directions. Change the estimates below to
2962 something more reasonable. */
2963
2964 /* If the number of iterations is known and we do not do versioning, we can
2965 decide whether to vectorize at compile time. Hence the scalar version
2966 does not carry cost model guard costs. */
2967 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2968 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2969 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2970 {
2971 /* Cost model check occurs at versioning. */
2972 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2973 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2974 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
2975 else
2976 {
2977 /* Cost model check occurs at prologue generation. */
2978 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2979 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
2980 + vect_get_stmt_cost (cond_branch_not_taken);
2981 /* Cost model check occurs at epilogue generation. */
2982 else
2983 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
2984 }
2985 }
2986
2987 /* Complete the target-specific cost calculations. */
2988 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
2989 &vec_inside_cost, &vec_epilogue_cost);
2990
2991 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
2992
2993 /* Calculate number of iterations required to make the vector version
2994 profitable, relative to the loop bodies only. The following condition
2995 must hold true:
2996 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2997 where
2998 SIC = scalar iteration cost, VIC = vector iteration cost,
2999 VOC = vector outside cost, VF = vectorization factor,
3000 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3001 SOC = scalar outside cost for run time cost model check. */
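/* Worked example (illustrative numbers only): with SIC = 4, VIC = 12,
   VF = 4, VOC = 96, SOC = 0 and no peeling, the condition becomes
   4 * niters > 12 * (niters / 4) + 96, i.e. niters > 96, so the computed
   minimum is 97 iterations; after the later clamp to at least VF and the
   final decrement, the runtime guard skips the vector loop when
   niters <= 96.  */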
3002
3003 if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
3004 {
3005 if (vec_outside_cost <= 0)
3006 min_profitable_iters = 1;
3007 else
3008 {
3009 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
3010 - vec_inside_cost * peel_iters_prologue
3011 - vec_inside_cost * peel_iters_epilogue)
3012 / ((scalar_single_iter_cost * vf)
3013 - vec_inside_cost);
3014
3015 if ((scalar_single_iter_cost * vf * min_profitable_iters)
3016 <= (((int) vec_inside_cost * min_profitable_iters)
3017 + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3018 min_profitable_iters++;
3019 }
3020 }
3021 /* The vector version will never be profitable. */
3022 else
3023 {
3024 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3025 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3026 "did not happen for a simd loop");
3027
3028 if (dump_enabled_p ())
3029 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3030 "cost model: the vector iteration cost = %d "
3031 "divided by the scalar iteration cost = %d "
3032 "is greater than or equal to the vectorization factor = %d"
3033 ".\n",
3034 vec_inside_cost, scalar_single_iter_cost, vf);
3035 *ret_min_profitable_niters = -1;
3036 *ret_min_profitable_estimate = -1;
3037 return;
3038 }
3039
3040 if (dump_enabled_p ())
3041 {
3042 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3043 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3044 vec_inside_cost);
3045 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3046 vec_prologue_cost);
3047 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3048 vec_epilogue_cost);
3049 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3050 scalar_single_iter_cost);
3051 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3052 scalar_outside_cost);
3053 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3054 vec_outside_cost);
3055 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3056 peel_iters_prologue);
3057 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3058 peel_iters_epilogue);
3059 dump_printf (MSG_NOTE,
3060 " Calculated minimum iters for profitability: %d\n",
3061 min_profitable_iters);
3062 dump_printf (MSG_NOTE, "\n");
3063 }
3064
3065 min_profitable_iters =
3066 min_profitable_iters < vf ? vf : min_profitable_iters;
3067
3068 /* Because the condition we create is:
3069 if (niters <= min_profitable_iters)
3070 then skip the vectorized loop. */
3071 min_profitable_iters--;
3072
3073 if (dump_enabled_p ())
3074 dump_printf_loc (MSG_NOTE, vect_location,
3075 " Runtime profitability threshold = %d\n",
3076 min_profitable_iters);
3077
3078 *ret_min_profitable_niters = min_profitable_iters;
3079
3080 /* Calculate number of iterations required to make the vector version
3081 profitable, relative to the loop bodies only.
3082
3083 Non-vectorized variant is SIC * niters and it must win over vector
3084 variant on the expected loop trip count. The following condition must hold true:
3085 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
3086
3087 if (vec_outside_cost <= 0)
3088 min_profitable_estimate = 1;
3089 else
3090 {
3091 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3092 - vec_inside_cost * peel_iters_prologue
3093 - vec_inside_cost * peel_iters_epilogue)
3094 / ((scalar_single_iter_cost * vf)
3095 - vec_inside_cost);
3096 }
3097 min_profitable_estimate--;
3098 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3099 if (dump_enabled_p ())
3100 dump_printf_loc (MSG_NOTE, vect_location,
3101 " Static estimate profitability threshold = %d\n",
3102 min_profitable_estimate);
3103
3104 *ret_min_profitable_estimate = min_profitable_estimate;
3105 }
3106
3107 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3108 vector elements (not bits) for a vector of mode MODE. */
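/* For example, for a 4-element vector and OFFSET = 1 the mask produced is
   {1, 2, 3, 4}: element 0 of the result is taken from element 1 of the
   first input and element 3 from element 0 of the second input.  */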
3109 static void
3110 calc_vec_perm_mask_for_shift (enum machine_mode mode, unsigned int offset,
3111 unsigned char *sel)
3112 {
3113 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3114
3115 for (i = 0; i < nelt; i++)
3116 sel[i] = (i + offset) & (2*nelt - 1);
3117 }
3118
3119 /* Checks whether the target supports whole-vector shifts for vectors of mode
3120 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3121 it supports vec_perm_const with masks for all necessary shift amounts. */
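/* For an 8-element vector, for instance, the shift amounts that must be
   supported as permutes are 4, 2 and 1 elements (the loop below halves the
   shift amount until it reaches a single element).  */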
3122 static bool
3123 have_whole_vector_shift (enum machine_mode mode)
3124 {
3125 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3126 return true;
3127
3128 if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3129 return false;
3130
3131 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3132 unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3133
3134 for (i = nelt/2; i >= 1; i/=2)
3135 {
3136 calc_vec_perm_mask_for_shift (mode, i, sel);
3137 if (!can_vec_perm_p (mode, false, sel))
3138 return false;
3139 }
3140 return true;
3141 }
3142
3143 /* TODO: There is a close dependency between the vect_model_*_cost and
3144 vectorizable_* functions. Design them better to avoid maintenance issues. */
3145
3146 /* Function vect_model_reduction_cost.
3147
3148 Models cost for a reduction operation, including the vector ops
3149 generated within the strip-mine loop, the initial definition before
3150 the loop, and the epilogue code that must be generated. */
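/* For example, a non-nested sum reduction with a target REDUC_PLUS_EXPR is
   costed as NCOPIES vector statements in the body, one scalar_to_vec in the
   prologue, and one vector statement plus one vec_to_scalar in the epilogue;
   without a reduction instruction the epilogue instead costs a sequence of
   whole-vector shifts or element extracts.  */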
3151
3152 static bool
3153 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3154 int ncopies)
3155 {
3156 int prologue_cost = 0, epilogue_cost = 0;
3157 enum tree_code code;
3158 optab optab;
3159 tree vectype;
3160 gimple stmt, orig_stmt;
3161 tree reduction_op;
3162 machine_mode mode;
3163 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3164 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3165 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3166
3167 /* Cost of reduction op inside loop. */
3168 unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3169 stmt_info, 0, vect_body);
3170 stmt = STMT_VINFO_STMT (stmt_info);
3171
3172 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3173 {
3174 case GIMPLE_SINGLE_RHS:
3175 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
3176 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
3177 break;
3178 case GIMPLE_UNARY_RHS:
3179 reduction_op = gimple_assign_rhs1 (stmt);
3180 break;
3181 case GIMPLE_BINARY_RHS:
3182 reduction_op = gimple_assign_rhs2 (stmt);
3183 break;
3184 case GIMPLE_TERNARY_RHS:
3185 reduction_op = gimple_assign_rhs3 (stmt);
3186 break;
3187 default:
3188 gcc_unreachable ();
3189 }
3190
3191 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3192 if (!vectype)
3193 {
3194 if (dump_enabled_p ())
3195 {
3196 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3197 "unsupported data-type ");
3198 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3199 TREE_TYPE (reduction_op));
3200 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3201 }
3202 return false;
3203 }
3204
3205 mode = TYPE_MODE (vectype);
3206 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3207
3208 if (!orig_stmt)
3209 orig_stmt = STMT_VINFO_STMT (stmt_info);
3210
3211 code = gimple_assign_rhs_code (orig_stmt);
3212
3213 /* Add in cost for initial definition. */
3214 prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3215 stmt_info, 0, vect_prologue);
3216
3217 /* Determine cost of epilogue code.
3218
3219 We have a reduction operator that will reduce the vector in one statement.
3220 It also requires a scalar extract. */
3221
3222 if (!nested_in_vect_loop_p (loop, orig_stmt))
3223 {
3224 if (reduc_code != ERROR_MARK)
3225 {
3226 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3227 stmt_info, 0, vect_epilogue);
3228 epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3229 stmt_info, 0, vect_epilogue);
3230 }
3231 else
3232 {
3233 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3234 tree bitsize =
3235 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3236 int element_bitsize = tree_to_uhwi (bitsize);
3237 int nelements = vec_size_in_bits / element_bitsize;
3238
3239 optab = optab_for_tree_code (code, vectype, optab_default);
3240
3241 /* We have a whole vector shift available. */
3242 if (VECTOR_MODE_P (mode)
3243 && optab_handler (optab, mode) != CODE_FOR_nothing
3244 && have_whole_vector_shift (mode))
3245 {
3246 /* Final reduction via vector shifts and the reduction operator.
3247 Also requires scalar extract. */
3248 epilogue_cost += add_stmt_cost (target_cost_data,
3249 exact_log2 (nelements) * 2,
3250 vector_stmt, stmt_info, 0,
3251 vect_epilogue);
3252 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3253 vec_to_scalar, stmt_info, 0,
3254 vect_epilogue);
3255 }
3256 else
3257 /* Use extracts and reduction op for final reduction. For N
3258 elements, we have N extracts and N-1 reduction ops. */
3259 epilogue_cost += add_stmt_cost (target_cost_data,
3260 nelements + nelements - 1,
3261 vector_stmt, stmt_info, 0,
3262 vect_epilogue);
3263 }
3264 }
3265
3266 if (dump_enabled_p ())
3267 dump_printf (MSG_NOTE,
3268 "vect_model_reduction_cost: inside_cost = %d, "
3269 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3270 prologue_cost, epilogue_cost);
3271
3272 return true;
3273 }
3274
3275
3276 /* Function vect_model_induction_cost.
3277
3278 Models cost for induction operations. */
3279
3280 static void
3281 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3282 {
3283 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3284 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3285 unsigned inside_cost, prologue_cost;
3286
3287 /* loop cost for vec_loop. */
3288 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3289 stmt_info, 0, vect_body);
3290
3291 /* prologue cost for vec_init and vec_step. */
3292 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3293 stmt_info, 0, vect_prologue);
3294
3295 if (dump_enabled_p ())
3296 dump_printf_loc (MSG_NOTE, vect_location,
3297 "vect_model_induction_cost: inside_cost = %d, "
3298 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3299 }
3300
3301
3302 /* Function get_initial_def_for_induction
3303
3304 Input:
3305 STMT - a stmt that performs an induction operation in the loop.
3306 IV_PHI - the initial value of the induction variable
3307
3308 Output:
3309 Return a vector variable, initialized with the first VF values of
3310 the induction variable. E.g., for an iv with IV_PHI='X' and
3311 evolution S, for a vector of 4 units, we want to return:
3312 [X, X + S, X + 2*S, X + 3*S]. */
3313
3314 static tree
3315 get_initial_def_for_induction (gimple iv_phi)
3316 {
3317 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3318 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3319 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3320 tree vectype;
3321 int nunits;
3322 edge pe = loop_preheader_edge (loop);
3323 struct loop *iv_loop;
3324 basic_block new_bb;
3325 tree new_vec, vec_init, vec_step, t;
3326 tree new_var;
3327 tree new_name;
3328 gimple init_stmt, new_stmt;
3329 gphi *induction_phi;
3330 tree induc_def, vec_def, vec_dest;
3331 tree init_expr, step_expr;
3332 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3333 int i;
3334 int ncopies;
3335 tree expr;
3336 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3337 bool nested_in_vect_loop = false;
3338 gimple_seq stmts = NULL;
3339 imm_use_iterator imm_iter;
3340 use_operand_p use_p;
3341 gimple exit_phi;
3342 edge latch_e;
3343 tree loop_arg;
3344 gimple_stmt_iterator si;
3345 basic_block bb = gimple_bb (iv_phi);
3346 tree stepvectype;
3347 tree resvectype;
3348
3349 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
3350 if (nested_in_vect_loop_p (loop, iv_phi))
3351 {
3352 nested_in_vect_loop = true;
3353 iv_loop = loop->inner;
3354 }
3355 else
3356 iv_loop = loop;
3357 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3358
3359 latch_e = loop_latch_edge (iv_loop);
3360 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3361
3362 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
3363 gcc_assert (step_expr != NULL_TREE);
3364
3365 pe = loop_preheader_edge (iv_loop);
3366 init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3367 loop_preheader_edge (iv_loop));
3368
3369 vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3370 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3371 gcc_assert (vectype);
3372 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3373 ncopies = vf / nunits;
3374
3375 gcc_assert (phi_info);
3376 gcc_assert (ncopies >= 1);
3377
3378 /* Convert the step to the desired type. */
3379 step_expr = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3380 step_expr),
3381 &stmts, true, NULL_TREE);
3382 if (stmts)
3383 {
3384 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3385 gcc_assert (!new_bb);
3386 }
3387
3388 /* Find the first insertion point in the BB. */
3389 si = gsi_after_labels (bb);
3390
3391 /* Create the vector that holds the initial_value of the induction. */
3392 if (nested_in_vect_loop)
3393 {
3394 /* iv_loop is nested in the loop to be vectorized. init_expr had already
3395 been created during vectorization of previous stmts. We obtain it
3396 from the STMT_VINFO_VEC_STMT of the defining stmt. */
3397 vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi, NULL);
3398 /* If the initial value is not of proper type, convert it. */
3399 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3400 {
3401 new_stmt
3402 = gimple_build_assign (vect_get_new_vect_var (vectype,
3403 vect_simple_var,
3404 "vec_iv_"),
3405 VIEW_CONVERT_EXPR,
3406 build1 (VIEW_CONVERT_EXPR, vectype,
3407 vec_init));
3408 vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3409 gimple_assign_set_lhs (new_stmt, vec_init);
3410 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3411 new_stmt);
3412 gcc_assert (!new_bb);
3413 set_vinfo_for_stmt (new_stmt,
3414 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3415 }
3416 }
3417 else
3418 {
3419 vec<constructor_elt, va_gc> *v;
3420
3421 /* iv_loop is the loop to be vectorized. Create:
3422 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
3423 new_var = vect_get_new_vect_var (TREE_TYPE (vectype),
3424 vect_scalar_var, "var_");
3425 new_name = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3426 init_expr),
3427 &stmts, false, new_var);
3428 if (stmts)
3429 {
3430 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3431 gcc_assert (!new_bb);
3432 }
3433
3434 vec_alloc (v, nunits);
3435 bool constant_p = is_gimple_min_invariant (new_name);
3436 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3437 for (i = 1; i < nunits; i++)
3438 {
3439 /* Create: new_name_i = new_name + step_expr */
3440 new_name = fold_build2 (PLUS_EXPR, TREE_TYPE (new_name),
3441 new_name, step_expr);
3442 if (!is_gimple_min_invariant (new_name))
3443 {
3444 init_stmt = gimple_build_assign (new_var, new_name);
3445 new_name = make_ssa_name (new_var, init_stmt);
3446 gimple_assign_set_lhs (init_stmt, new_name);
3447 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3448 gcc_assert (!new_bb);
3449 if (dump_enabled_p ())
3450 {
3451 dump_printf_loc (MSG_NOTE, vect_location,
3452 "created new init_stmt: ");
3453 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
3454 dump_printf (MSG_NOTE, "\n");
3455 }
3456 constant_p = false;
3457 }
3458 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3459 }
3460 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
3461 if (constant_p)
3462 new_vec = build_vector_from_ctor (vectype, v);
3463 else
3464 new_vec = build_constructor (vectype, v);
3465 vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3466 }
3467
3468
3469 /* Create the vector that holds the step of the induction. */
3470 if (nested_in_vect_loop)
3471 /* iv_loop is nested in the loop to be vectorized. Generate:
3472 vec_step = [S, S, S, S] */
3473 new_name = step_expr;
3474 else
3475 {
3476 /* iv_loop is the loop to be vectorized. Generate:
3477 vec_step = [VF*S, VF*S, VF*S, VF*S] */
3478 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3479 {
3480 expr = build_int_cst (integer_type_node, vf);
3481 expr = fold_convert (TREE_TYPE (step_expr), expr);
3482 }
3483 else
3484 expr = build_int_cst (TREE_TYPE (step_expr), vf);
3485 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3486 expr, step_expr);
3487 if (TREE_CODE (step_expr) == SSA_NAME)
3488 new_name = vect_init_vector (iv_phi, new_name,
3489 TREE_TYPE (step_expr), NULL);
3490 }
3491
3492 t = unshare_expr (new_name);
3493 gcc_assert (CONSTANT_CLASS_P (new_name)
3494 || TREE_CODE (new_name) == SSA_NAME);
3495 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3496 gcc_assert (stepvectype);
3497 new_vec = build_vector_from_val (stepvectype, t);
3498 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3499
3500
3501 /* Create the following def-use cycle:
3502 loop prolog:
3503 vec_init = ...
3504 vec_step = ...
3505 loop:
3506 vec_iv = PHI <vec_init, vec_loop>
3507 ...
3508 STMT
3509 ...
3510 vec_loop = vec_iv + vec_step; */
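 /* For illustration only (hypothetical values, not taken from the code
 below): for an IV with init X = 0 and step S = 1, with nunits = 4 and
 VF = 4, the prolog and loop would roughly contain:

 vec_init = { 0, 1, 2, 3 };
 vec_step = { 4, 4, 4, 4 };
 loop:
 vec_iv = PHI <vec_init (preheader), vec_loop (latch)>
 ...
 vec_loop = vec_iv + vec_step; -- { 4, 5, 6, 7 } after the first iteration

 so every vector lane advances by VF*S per loop iteration. */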
3511
3512 /* Create the induction-phi that defines the induction-operand. */
3513 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3514 induction_phi = create_phi_node (vec_dest, iv_loop->header);
3515 set_vinfo_for_stmt (induction_phi,
3516 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3517 induc_def = PHI_RESULT (induction_phi);
3518
3519 /* Create the iv update inside the loop */
3520 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR, induc_def, vec_step);
3521 vec_def = make_ssa_name (vec_dest, new_stmt);
3522 gimple_assign_set_lhs (new_stmt, vec_def);
3523 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3524 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3525 NULL));
3526
3527 /* Set the arguments of the phi node: */
3528 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3529 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3530 UNKNOWN_LOCATION);
3531
3532
3533 /* In case the vectorization factor (VF) is bigger than the number
3534 of elements that we can fit in a vectype (nunits), we have to generate
3535 more than one vector stmt - i.e., we need to "unroll" the
3536 vector stmt by a factor of VF/nunits. For more details see documentation
3537 in vectorizable_operation. */
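 /* A hypothetical example (numbers not tied to any particular target):
 with VF = 8 and nunits = 4 we get ncopies = VF/nunits = 2, so the code
 below emits one extra copy of the IV, offset from the first by
 nunits * S, e.g. for S = 1:

 vec_iv_0 = { i, i+1, i+2, i+3 }
 vec_iv_1 = vec_iv_0 + { 4, 4, 4, 4 } */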
3538
3539 if (ncopies > 1)
3540 {
3541 stmt_vec_info prev_stmt_vinfo;
3542 /* FORNOW. This restriction should be relaxed. */
3543 gcc_assert (!nested_in_vect_loop);
3544
3545 /* Create the vector that holds the step of the induction. */
3546 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3547 {
3548 expr = build_int_cst (integer_type_node, nunits);
3549 expr = fold_convert (TREE_TYPE (step_expr), expr);
3550 }
3551 else
3552 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3553 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3554 expr, step_expr);
3555 if (TREE_CODE (step_expr) == SSA_NAME)
3556 new_name = vect_init_vector (iv_phi, new_name,
3557 TREE_TYPE (step_expr), NULL);
3558 t = unshare_expr (new_name);
3559 gcc_assert (CONSTANT_CLASS_P (new_name)
3560 || TREE_CODE (new_name) == SSA_NAME);
3561 new_vec = build_vector_from_val (stepvectype, t);
3562 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3563
3564 vec_def = induc_def;
3565 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3566 for (i = 1; i < ncopies; i++)
3567 {
3568 /* vec_i = vec_prev + vec_step */
3569 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
3570 vec_def, vec_step);
3571 vec_def = make_ssa_name (vec_dest, new_stmt);
3572 gimple_assign_set_lhs (new_stmt, vec_def);
3573
3574 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3575 if (!useless_type_conversion_p (resvectype, vectype))
3576 {
3577 new_stmt
3578 = gimple_build_assign
3579 (vect_get_new_vect_var (resvectype, vect_simple_var,
3580 "vec_iv_"),
3581 VIEW_CONVERT_EXPR,
3582 build1 (VIEW_CONVERT_EXPR, resvectype,
3583 gimple_assign_lhs (new_stmt)));
3584 gimple_assign_set_lhs (new_stmt,
3585 make_ssa_name
3586 (gimple_assign_lhs (new_stmt), new_stmt));
3587 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3588 }
3589 set_vinfo_for_stmt (new_stmt,
3590 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3591 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3592 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3593 }
3594 }
3595
3596 if (nested_in_vect_loop)
3597 {
3598 /* Find the loop-closed exit-phi of the induction, and record
3599 the final vector of induction results: */
3600 exit_phi = NULL;
3601 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3602 {
3603 gimple use_stmt = USE_STMT (use_p);
3604 if (is_gimple_debug (use_stmt))
3605 continue;
3606
3607 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
3608 {
3609 exit_phi = use_stmt;
3610 break;
3611 }
3612 }
3613 if (exit_phi)
3614 {
3615 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3616 /* FORNOW. Currently not supporting the case that an inner-loop induction
3617 is not used in the outer-loop (i.e. only outside the outer-loop). */
3618 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3619 && !STMT_VINFO_LIVE_P (stmt_vinfo));
3620
3621 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3622 if (dump_enabled_p ())
3623 {
3624 dump_printf_loc (MSG_NOTE, vect_location,
3625 "vector of inductions after inner-loop:");
3626 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3627 dump_printf (MSG_NOTE, "\n");
3628 }
3629 }
3630 }
3631
3632
3633 if (dump_enabled_p ())
3634 {
3635 dump_printf_loc (MSG_NOTE, vect_location,
3636 "transform induction: created def-use cycle: ");
3637 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3638 dump_printf (MSG_NOTE, "\n");
3639 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3640 SSA_NAME_DEF_STMT (vec_def), 0);
3641 dump_printf (MSG_NOTE, "\n");
3642 }
3643
3644 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3645 if (!useless_type_conversion_p (resvectype, vectype))
3646 {
3647 new_stmt = gimple_build_assign (vect_get_new_vect_var (resvectype,
3648 vect_simple_var,
3649 "vec_iv_"),
3650 VIEW_CONVERT_EXPR,
3651 build1 (VIEW_CONVERT_EXPR, resvectype,
3652 induc_def));
3653 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3654 gimple_assign_set_lhs (new_stmt, induc_def);
3655 si = gsi_after_labels (bb);
3656 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3657 set_vinfo_for_stmt (new_stmt,
3658 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3659 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3660 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3661 }
3662
3663 return induc_def;
3664 }
3665
3666
3667 /* Function get_initial_def_for_reduction
3668
3669 Input:
3670 STMT - a stmt that performs a reduction operation in the loop.
3671 INIT_VAL - the initial value of the reduction variable
3672
3673 Output:
3674 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3675 of the reduction (used for adjusting the epilog - see below).
3676 Return a vector variable, initialized according to the operation that STMT
3677 performs. This vector will be used as the initial value of the
3678 vector of partial results.
3679
3680 Option1 (adjust in epilog): Initialize the vector as follows:
3681 add/bit or/xor: [0,0,...,0,0]
3682 mult/bit and: [1,1,...,1,1]
3683 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3684 and when necessary (e.g. add/mult case) let the caller know
3685 that it needs to adjust the result by init_val.
3686
3687 Option2: Initialize the vector as follows:
3688 add/bit or/xor: [init_val,0,0,...,0]
3689 mult/bit and: [init_val,1,1,...,1]
3690 min/max/cond_expr: [init_val,init_val,...,init_val]
3691 and no adjustments are needed.
3692
3693 For example, for the following code:
3694
3695 s = init_val;
3696 for (i=0;i<n;i++)
3697 s = s + a[i];
3698
3699 STMT is 's = s + a[i]', and the reduction variable is 's'.
3700 For a vector of 4 units, we want to return either [0,0,0,init_val],
3701 or [0,0,0,0] and let the caller know that it needs to adjust
3702 the result at the end by 'init_val'.
3703
3704 FORNOW, we use Option1 (the 'adjust in epilog' scheme) when ADJUSTMENT_DEF
3705 is not NULL, because this way the initialization vector is simpler (same
3706 element in all entries), and Option2 otherwise.
3707
3708 A cost model should help decide between these two schemes. */
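 /* A worked example (hypothetical values): for the sum above with
 init_val = 3 and a vector of 4 units,
 Option1 returns [0,0,0,0] and sets ADJUSTMENT_DEF to 3, so the
 epilog adds 3 to the reduced result;
 Option2 returns [3,0,0,0] and needs no adjustment. */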
3709
3710 tree
3711 get_initial_def_for_reduction (gimple stmt, tree init_val,
3712 tree *adjustment_def)
3713 {
3714 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3715 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3716 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3717 tree scalar_type = TREE_TYPE (init_val);
3718 tree vectype = get_vectype_for_scalar_type (scalar_type);
3719 int nunits;
3720 enum tree_code code = gimple_assign_rhs_code (stmt);
3721 tree def_for_init;
3722 tree init_def;
3723 tree *elts;
3724 int i;
3725 bool nested_in_vect_loop = false;
3726 tree init_value;
3727 REAL_VALUE_TYPE real_init_val = dconst0;
3728 int int_init_val = 0;
3729 gimple def_stmt = NULL;
3730
3731 gcc_assert (vectype);
3732 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3733
3734 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3735 || SCALAR_FLOAT_TYPE_P (scalar_type));
3736
3737 if (nested_in_vect_loop_p (loop, stmt))
3738 nested_in_vect_loop = true;
3739 else
3740 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3741
3742 /* In case of double reduction we only create a vector variable to be put
3743 in the reduction phi node. The actual statement creation is done in
3744 vect_create_epilog_for_reduction. */
3745 if (adjustment_def && nested_in_vect_loop
3746 && TREE_CODE (init_val) == SSA_NAME
3747 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3748 && gimple_code (def_stmt) == GIMPLE_PHI
3749 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3750 && vinfo_for_stmt (def_stmt)
3751 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3752 == vect_double_reduction_def)
3753 {
3754 *adjustment_def = NULL;
3755 return vect_create_destination_var (init_val, vectype);
3756 }
3757
3758 if (TREE_CONSTANT (init_val))
3759 {
3760 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3761 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3762 else
3763 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3764 }
3765 else
3766 init_value = init_val;
3767
3768 switch (code)
3769 {
3770 case WIDEN_SUM_EXPR:
3771 case DOT_PROD_EXPR:
3772 case SAD_EXPR:
3773 case PLUS_EXPR:
3774 case MINUS_EXPR:
3775 case BIT_IOR_EXPR:
3776 case BIT_XOR_EXPR:
3777 case MULT_EXPR:
3778 case BIT_AND_EXPR:
3779 /* ADJUSTMENT_DEF is NULL when called from
3780 vect_create_epilog_for_reduction to vectorize double reduction. */
3781 if (adjustment_def)
3782 {
3783 if (nested_in_vect_loop)
3784 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3785 NULL);
3786 else
3787 *adjustment_def = init_val;
3788 }
3789
3790 if (code == MULT_EXPR)
3791 {
3792 real_init_val = dconst1;
3793 int_init_val = 1;
3794 }
3795
3796 if (code == BIT_AND_EXPR)
3797 int_init_val = -1;
3798
3799 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3800 def_for_init = build_real (scalar_type, real_init_val);
3801 else
3802 def_for_init = build_int_cst (scalar_type, int_init_val);
3803
3804 /* Set all elements of the vector but the first to '0' or '1'. */
3805 elts = XALLOCAVEC (tree, nunits);
3806 for (i = nunits - 2; i >= 0; --i)
3807 elts[i + 1] = def_for_init;
3808
3809 /* Option1: the first element is '0' or '1' as well. */
3810 if (adjustment_def)
3811 {
3812 elts[0] = def_for_init;
3813 init_def = build_vector (vectype, elts);
3814 break;
3815 }
3816
3817 /* Option2: the first element is INIT_VAL. */
3818 elts[0] = init_val;
3819 if (TREE_CONSTANT (init_val))
3820 init_def = build_vector (vectype, elts);
3821 else
3822 {
3823 vec<constructor_elt, va_gc> *v;
3824 vec_alloc (v, nunits);
3825 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3826 for (i = 1; i < nunits; ++i)
3827 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3828 init_def = build_constructor (vectype, v);
3829 }
3830
3831 break;
3832
3833 case MIN_EXPR:
3834 case MAX_EXPR:
3835 case COND_EXPR:
3836 if (adjustment_def)
3837 {
3838 *adjustment_def = NULL_TREE;
3839 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3840 break;
3841 }
3842
3843 init_def = build_vector_from_val (vectype, init_value);
3844 break;
3845
3846 default:
3847 gcc_unreachable ();
3848 }
3849
3850 return init_def;
3851 }
3852
3853 /* Function vect_create_epilog_for_reduction
3854
3855 Create code at the loop-epilog to finalize the result of a reduction
3856 computation.
3857
3858 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3859 reduction statements.
3860 STMT is the scalar reduction stmt that is being vectorized.
3861 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3862 number of elements that we can fit in a vectype (nunits). In this case
3863 we have to generate more than one vector stmt - i.e - we need to "unroll"
3864 the vector stmt by a factor VF/nunits. For more details see documentation
3865 in vectorizable_operation.
3866 REDUC_CODE is the tree-code for the epilog reduction.
3867 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3868 computation.
3869 REDUC_INDEX is the index of the operand in the right hand side of the
3870 statement that is defined by REDUCTION_PHI.
3871 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3872 SLP_NODE is an SLP node containing a group of reduction statements. The
3873 first one in this group is STMT.
3874
3875 This function:
3876 1. Creates the reduction def-use cycles: sets the arguments for
3877 REDUCTION_PHIS:
3878 The loop-entry argument is the vectorized initial-value of the reduction.
3879 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3880 sums.
3881 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3882 by applying the operation specified by REDUC_CODE if available, or by
3883 other means (whole-vector shifts or a scalar loop).
3884 The function also creates a new phi node at the loop exit to preserve
3885 loop-closed form, as illustrated below.
3886
3887 The flow at the entry to this function:
3888
3889 loop:
3890 vec_def = phi <null, null> # REDUCTION_PHI
3891 VECT_DEF = vector_stmt # vectorized form of STMT
3892 s_loop = scalar_stmt # (scalar) STMT
3893 loop_exit:
3894 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3895 use <s_out0>
3896 use <s_out0>
3897
3898 The above is transformed by this function into:
3899
3900 loop:
3901 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3902 VECT_DEF = vector_stmt # vectorized form of STMT
3903 s_loop = scalar_stmt # (scalar) STMT
3904 loop_exit:
3905 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3906 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3907 v_out2 = reduce <v_out1>
3908 s_out3 = extract_field <v_out2, 0>
3909 s_out4 = adjust_result <s_out3>
3910 use <s_out4>
3911 use <s_out4>
3912 */
3913
3914 static void
3915 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
3916 int ncopies, enum tree_code reduc_code,
3917 vec<gimple> reduction_phis,
3918 int reduc_index, bool double_reduc,
3919 slp_tree slp_node)
3920 {
3921 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3922 stmt_vec_info prev_phi_info;
3923 tree vectype;
3924 machine_mode mode;
3925 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3926 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3927 basic_block exit_bb;
3928 tree scalar_dest;
3929 tree scalar_type;
3930 gimple new_phi = NULL, phi;
3931 gimple_stmt_iterator exit_gsi;
3932 tree vec_dest;
3933 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3934 gimple epilog_stmt = NULL;
3935 enum tree_code code = gimple_assign_rhs_code (stmt);
3936 gimple exit_phi;
3937 tree bitsize;
3938 tree adjustment_def = NULL;
3939 tree vec_initial_def = NULL;
3940 tree reduction_op, expr, def;
3941 tree orig_name, scalar_result;
3942 imm_use_iterator imm_iter, phi_imm_iter;
3943 use_operand_p use_p, phi_use_p;
3944 gimple use_stmt, orig_stmt, reduction_phi = NULL;
3945 bool nested_in_vect_loop = false;
3946 auto_vec<gimple> new_phis;
3947 auto_vec<gimple> inner_phis;
3948 enum vect_def_type dt = vect_unknown_def_type;
3949 int j, i;
3950 auto_vec<tree> scalar_results;
3951 unsigned int group_size = 1, k, ratio;
3952 auto_vec<tree> vec_initial_defs;
3953 auto_vec<gimple> phis;
3954 bool slp_reduc = false;
3955 tree new_phi_result;
3956 gimple inner_phi = NULL;
3957
3958 if (slp_node)
3959 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
3960
3961 if (nested_in_vect_loop_p (loop, stmt))
3962 {
3963 outer_loop = loop;
3964 loop = loop->inner;
3965 nested_in_vect_loop = true;
3966 gcc_assert (!slp_node);
3967 }
3968
3969 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3970 {
3971 case GIMPLE_SINGLE_RHS:
3972 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3973 == ternary_op);
3974 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3975 break;
3976 case GIMPLE_UNARY_RHS:
3977 reduction_op = gimple_assign_rhs1 (stmt);
3978 break;
3979 case GIMPLE_BINARY_RHS:
3980 reduction_op = reduc_index ?
3981 gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3982 break;
3983 case GIMPLE_TERNARY_RHS:
3984 reduction_op = gimple_op (stmt, reduc_index + 1);
3985 break;
3986 default:
3987 gcc_unreachable ();
3988 }
3989
3990 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3991 gcc_assert (vectype);
3992 mode = TYPE_MODE (vectype);
3993
3994 /* 1. Create the reduction def-use cycle:
3995 Set the arguments of REDUCTION_PHIS, i.e., transform
3996
3997 loop:
3998 vec_def = phi <null, null> # REDUCTION_PHI
3999 VECT_DEF = vector_stmt # vectorized form of STMT
4000 ...
4001
4002 into:
4003
4004 loop:
4005 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4006 VECT_DEF = vector_stmt # vectorized form of STMT
4007 ...
4008
4009 (in case of SLP, do it for all the phis). */
4010
4011 /* Get the loop-entry arguments. */
4012 if (slp_node)
4013 vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
4014 NULL, slp_node, reduc_index);
4015 else
4016 {
4017 vec_initial_defs.create (1);
4018 /* For the case of reduction, vect_get_vec_def_for_operand returns
4019 the scalar def before the loop, which defines the initial value
4020 of the reduction variable. */
4021 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
4022 &adjustment_def);
4023 vec_initial_defs.quick_push (vec_initial_def);
4024 }
4025
4026 /* Set phi nodes arguments. */
4027 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4028 {
4029 tree vec_init_def, def;
4030 gimple_seq stmts;
4031 vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
4032 true, NULL_TREE);
4033 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4034 def = vect_defs[i];
4035 for (j = 0; j < ncopies; j++)
4036 {
4037 /* Set the loop-entry arg of the reduction-phi. */
4038 add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4039 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4040
4041 /* Set the loop-latch arg for the reduction-phi. */
4042 if (j > 0)
4043 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4044
4045 add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4046 UNKNOWN_LOCATION);
4047
4048 if (dump_enabled_p ())
4049 {
4050 dump_printf_loc (MSG_NOTE, vect_location,
4051 "transform reduction: created def-use cycle: ");
4052 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4053 dump_printf (MSG_NOTE, "\n");
4054 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4055 dump_printf (MSG_NOTE, "\n");
4056 }
4057
4058 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4059 }
4060 }
4061
4062 /* 2. Create epilog code.
4063 The reduction epilog code operates across the elements of the vector
4064 of partial results computed by the vectorized loop.
4065 The reduction epilog code consists of:
4066
4067 step 1: compute the scalar result in a vector (v_out2)
4068 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4069 step 3: adjust the scalar result (s_out3) if needed.
4070
4071 Step 1 can be accomplished using one of the following three schemes:
4072 (scheme 1) using reduc_code, if available.
4073 (scheme 2) using whole-vector shifts, if available.
4074 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4075 combined.
4076
4077 The overall epilog code looks like this:
4078
4079 s_out0 = phi <s_loop> # original EXIT_PHI
4080 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4081 v_out2 = reduce <v_out1> # step 1
4082 s_out3 = extract_field <v_out2, 0> # step 2
4083 s_out4 = adjust_result <s_out3> # step 3
4084
4085 (step 3 is optional, and steps 1 and 2 may be combined).
4086 Lastly, the uses of s_out0 are replaced by s_out4. */
4087
4088
4089 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4090 v_out1 = phi <VECT_DEF>
4091 Store them in NEW_PHIS. */
4092
4093 exit_bb = single_exit (loop)->dest;
4094 prev_phi_info = NULL;
4095 new_phis.create (vect_defs.length ());
4096 FOR_EACH_VEC_ELT (vect_defs, i, def)
4097 {
4098 for (j = 0; j < ncopies; j++)
4099 {
4100 tree new_def = copy_ssa_name (def);
4101 phi = create_phi_node (new_def, exit_bb);
4102 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
4103 if (j == 0)
4104 new_phis.quick_push (phi);
4105 else
4106 {
4107 def = vect_get_vec_def_for_stmt_copy (dt, def);
4108 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4109 }
4110
4111 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4112 prev_phi_info = vinfo_for_stmt (phi);
4113 }
4114 }
4115
4116 /* The epilogue is created for the outer-loop, i.e., for the loop being
4117 vectorized. Create exit phis for the outer loop. */
4118 if (double_reduc)
4119 {
4120 loop = outer_loop;
4121 exit_bb = single_exit (loop)->dest;
4122 inner_phis.create (vect_defs.length ());
4123 FOR_EACH_VEC_ELT (new_phis, i, phi)
4124 {
4125 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4126 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4127 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4128 PHI_RESULT (phi));
4129 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4130 loop_vinfo, NULL));
4131 inner_phis.quick_push (phi);
4132 new_phis[i] = outer_phi;
4133 prev_phi_info = vinfo_for_stmt (outer_phi);
4134 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4135 {
4136 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4137 new_result = copy_ssa_name (PHI_RESULT (phi));
4138 outer_phi = create_phi_node (new_result, exit_bb);
4139 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4140 PHI_RESULT (phi));
4141 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4142 loop_vinfo, NULL));
4143 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4144 prev_phi_info = vinfo_for_stmt (outer_phi);
4145 }
4146 }
4147 }
4148
4149 exit_gsi = gsi_after_labels (exit_bb);
4150
4151 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4152 (i.e. when reduc_code is not available) and in the final adjustment
4153 code (if needed). Also get the original scalar reduction variable as
4154 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4155 represents a reduction pattern), the tree-code and scalar-def are
4156 taken from the original stmt that the pattern-stmt (STMT) replaces.
4157 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4158 are taken from STMT. */
4159
4160 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4161 if (!orig_stmt)
4162 {
4163 /* Regular reduction */
4164 orig_stmt = stmt;
4165 }
4166 else
4167 {
4168 /* Reduction pattern */
4169 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4170 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4171 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4172 }
4173
4174 code = gimple_assign_rhs_code (orig_stmt);
4175 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4176 partial results are added and not subtracted. */
4177 if (code == MINUS_EXPR)
4178 code = PLUS_EXPR;
4179
4180 scalar_dest = gimple_assign_lhs (orig_stmt);
4181 scalar_type = TREE_TYPE (scalar_dest);
4182 scalar_results.create (group_size);
4183 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4184 bitsize = TYPE_SIZE (scalar_type);
4185
4186 /* In case this is a reduction in an inner-loop while vectorizing an outer
4187 loop - we don't need to extract a single scalar result at the end of the
4188 inner-loop (unless it is a double reduction, i.e., the use of the reduction is
4189 outside the outer-loop). The final vector of partial results will be used
4190 in the vectorized outer-loop, or reduced to a scalar result at the end of
4191 the outer-loop. */
4192 if (nested_in_vect_loop && !double_reduc)
4193 goto vect_finalize_reduction;
4194
4195 /* SLP reduction without reduction chain, e.g.,
4196 # a1 = phi <a2, a0>
4197 # b1 = phi <b2, b0>
4198 a2 = operation (a1)
4199 b2 = operation (b1) */
4200 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4201
4202 /* In case of reduction chain, e.g.,
4203 # a1 = phi <a3, a0>
4204 a2 = operation (a1)
4205 a3 = operation (a2),
4206
4207 we may end up with more than one vector result. Here we reduce them to
4208 one vector. */
4209 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4210 {
4211 tree first_vect = PHI_RESULT (new_phis[0]);
4212 tree tmp;
4213 gassign *new_vec_stmt = NULL;
4214
4215 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4216 for (k = 1; k < new_phis.length (); k++)
4217 {
4218 gimple next_phi = new_phis[k];
4219 tree second_vect = PHI_RESULT (next_phi);
4220
4221 tmp = build2 (code, vectype, first_vect, second_vect);
4222 new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4223 first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4224 gimple_assign_set_lhs (new_vec_stmt, first_vect);
4225 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4226 }
4227
4228 new_phi_result = first_vect;
4229 if (new_vec_stmt)
4230 {
4231 new_phis.truncate (0);
4232 new_phis.safe_push (new_vec_stmt);
4233 }
4234 }
4235 else
4236 new_phi_result = PHI_RESULT (new_phis[0]);
4237
4238 /* 2.3 Create the reduction code, using one of the three schemes described
4239 above. In SLP we simply need to extract all the elements from the
4240 vector (without reducing them), so we use scalar shifts. */
4241 if (reduc_code != ERROR_MARK && !slp_reduc)
4242 {
4243 tree tmp;
4244 tree vec_elem_type;
4245
4246 /*** Case 1: Create:
4247 v_out2 = reduc_expr <v_out1> */
4248
4249 if (dump_enabled_p ())
4250 dump_printf_loc (MSG_NOTE, vect_location,
4251 "Reduce using direct vector reduction.\n");
4252
4253 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4254 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4255 {
4256 tree tmp_dest =
4257 vect_create_destination_var (scalar_dest, vec_elem_type);
4258 tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
4259 epilog_stmt = gimple_build_assign (tmp_dest, tmp);
4260 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4261 gimple_assign_set_lhs (epilog_stmt, new_temp);
4262 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4263
4264 tmp = build1 (NOP_EXPR, scalar_type, new_temp);
4265 }
4266 else
4267 tmp = build1 (reduc_code, scalar_type, new_phi_result);
4268 epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
4269 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4270 gimple_assign_set_lhs (epilog_stmt, new_temp);
4271 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4272 scalar_results.safe_push (new_temp);
4273 }
4274 else
4275 {
4276 bool reduce_with_shift = have_whole_vector_shift (mode);
4277 int element_bitsize = tree_to_uhwi (bitsize);
4278 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4279 tree vec_temp;
4280
4281 /* Regardless of whether we have a whole vector shift, if we're
4282 emulating the operation via tree-vect-generic, we don't want
4283 to use it. Only the first round of the reduction is likely
4284 to still be profitable via emulation. */
4285 /* ??? It might be better to emit a reduction tree code here, so that
4286 tree-vect-generic can expand the first round via bit tricks. */
4287 if (!VECTOR_MODE_P (mode))
4288 reduce_with_shift = false;
4289 else
4290 {
4291 optab optab = optab_for_tree_code (code, vectype, optab_default);
4292 if (optab_handler (optab, mode) == CODE_FOR_nothing)
4293 reduce_with_shift = false;
4294 }
4295
4296 if (reduce_with_shift && !slp_reduc)
4297 {
4298 int nelements = vec_size_in_bits / element_bitsize;
4299 unsigned char *sel = XALLOCAVEC (unsigned char, nelements);
4300
4301 int elt_offset;
4302
4303 tree zero_vec = build_zero_cst (vectype);
4304 /*** Case 2: Create:
4305 for (offset = nelements/2; offset >= 1; offset/=2)
4306 {
4307 Create: va' = vec_shift <va, offset>
4308 Create: va = vop <va, va'>
4309 } */
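 /* A worked illustration (hypothetical 4-element vector, PLUS reduction):
 starting from va = { a, b, c, d },
 offset 2: va' = { c, d, 0, 0 }, va = { a+c, b+d, c, d }
 offset 1: va' = { b+d, c, d, 0 }, va = { a+b+c+d, ... }
 after which element 0 of va holds the scalar sum and is extracted
 below with BIT_FIELD_REF. */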
4310
4311 tree rhs;
4312
4313 if (dump_enabled_p ())
4314 dump_printf_loc (MSG_NOTE, vect_location,
4315 "Reduce using vector shifts\n");
4316
4317 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4318 new_temp = new_phi_result;
4319 for (elt_offset = nelements / 2;
4320 elt_offset >= 1;
4321 elt_offset /= 2)
4322 {
4323 calc_vec_perm_mask_for_shift (mode, elt_offset, sel);
4324 tree mask = vect_gen_perm_mask_any (vectype, sel);
4325 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
4326 new_temp, zero_vec, mask);
4327 new_name = make_ssa_name (vec_dest, epilog_stmt);
4328 gimple_assign_set_lhs (epilog_stmt, new_name);
4329 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4330
4331 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
4332 new_temp);
4333 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4334 gimple_assign_set_lhs (epilog_stmt, new_temp);
4335 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4336 }
4337
4338 /* 2.4 Extract the final scalar result. Create:
4339 s_out3 = extract_field <v_out2, bitpos> */
4340
4341 if (dump_enabled_p ())
4342 dump_printf_loc (MSG_NOTE, vect_location,
4343 "extract scalar result\n");
4344
4345 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
4346 bitsize, bitsize_zero_node);
4347 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4348 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4349 gimple_assign_set_lhs (epilog_stmt, new_temp);
4350 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4351 scalar_results.safe_push (new_temp);
4352 }
4353 else
4354 {
4355 /*** Case 3: Create:
4356 s = extract_field <v_out2, 0>
4357 for (offset = element_size;
4358 offset < vector_size;
4359 offset += element_size;)
4360 {
4361 Create: s' = extract_field <v_out2, offset>
4362 Create: s = op <s, s'> // For non SLP cases
4363 } */
4364
4365 if (dump_enabled_p ())
4366 dump_printf_loc (MSG_NOTE, vect_location,
4367 "Reduce using scalar code.\n");
4368
4369 vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4370 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4371 {
4372 int bit_offset;
4373 if (gimple_code (new_phi) == GIMPLE_PHI)
4374 vec_temp = PHI_RESULT (new_phi);
4375 else
4376 vec_temp = gimple_assign_lhs (new_phi);
4377 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4378 bitsize_zero_node);
4379 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4380 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4381 gimple_assign_set_lhs (epilog_stmt, new_temp);
4382 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4383
4384 /* In SLP we don't need to apply reduction operation, so we just
4385 collect s' values in SCALAR_RESULTS. */
4386 if (slp_reduc)
4387 scalar_results.safe_push (new_temp);
4388
4389 for (bit_offset = element_bitsize;
4390 bit_offset < vec_size_in_bits;
4391 bit_offset += element_bitsize)
4392 {
4393 tree bitpos = bitsize_int (bit_offset);
4394 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4395 bitsize, bitpos);
4396
4397 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4398 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4399 gimple_assign_set_lhs (epilog_stmt, new_name);
4400 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4401
4402 if (slp_reduc)
4403 {
4404 /* In SLP we don't need to apply reduction operation, so
4405 we just collect s' values in SCALAR_RESULTS. */
4406 new_temp = new_name;
4407 scalar_results.safe_push (new_name);
4408 }
4409 else
4410 {
4411 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
4412 new_name, new_temp);
4413 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4414 gimple_assign_set_lhs (epilog_stmt, new_temp);
4415 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4416 }
4417 }
4418 }
4419
4420 /* The only case where we need to reduce scalar results in SLP is
4421 unrolling. If the size of SCALAR_RESULTS is greater than
4422 GROUP_SIZE, we reduce them by combining elements modulo
4423 GROUP_SIZE. */
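 /* E.g. (hypothetical), with GROUP_SIZE = 2 and four scalar results
 s0, s1, s2, s3 (two per original scalar stmt due to unrolling),
 the loop below computes s0 = s0 op s2 and s1 = s1 op s3, leaving
 one result per group member. */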
4424 if (slp_reduc)
4425 {
4426 tree res, first_res, new_res;
4427 gimple new_stmt;
4428
4429 /* Reduce multiple scalar results in case of SLP unrolling. */
4430 for (j = group_size; scalar_results.iterate (j, &res);
4431 j++)
4432 {
4433 first_res = scalar_results[j % group_size];
4434 new_stmt = gimple_build_assign (new_scalar_dest, code,
4435 first_res, res);
4436 new_res = make_ssa_name (new_scalar_dest, new_stmt);
4437 gimple_assign_set_lhs (new_stmt, new_res);
4438 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4439 scalar_results[j % group_size] = new_res;
4440 }
4441 }
4442 else
4443 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
4444 scalar_results.safe_push (new_temp);
4445 }
4446 }
4447
4448 vect_finalize_reduction:
4449
4450 if (double_reduc)
4451 loop = loop->inner;
4452
4453 /* 2.5 Adjust the final result by the initial value of the reduction
4454 variable. (When such adjustment is not needed, then
4455 'adjustment_def' is zero). For example, if code is PLUS we create:
4456 new_temp = loop_exit_def + adjustment_def */
4457
4458 if (adjustment_def)
4459 {
4460 gcc_assert (!slp_reduc);
4461 if (nested_in_vect_loop)
4462 {
4463 new_phi = new_phis[0];
4464 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4465 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4466 new_dest = vect_create_destination_var (scalar_dest, vectype);
4467 }
4468 else
4469 {
4470 new_temp = scalar_results[0];
4471 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4472 expr = build2 (code, scalar_type, new_temp, adjustment_def);
4473 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4474 }
4475
4476 epilog_stmt = gimple_build_assign (new_dest, expr);
4477 new_temp = make_ssa_name (new_dest, epilog_stmt);
4478 gimple_assign_set_lhs (epilog_stmt, new_temp);
4479 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4480 if (nested_in_vect_loop)
4481 {
4482 set_vinfo_for_stmt (epilog_stmt,
4483 new_stmt_vec_info (epilog_stmt, loop_vinfo,
4484 NULL));
4485 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4486 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4487
4488 if (!double_reduc)
4489 scalar_results.quick_push (new_temp);
4490 else
4491 scalar_results[0] = new_temp;
4492 }
4493 else
4494 scalar_results[0] = new_temp;
4495
4496 new_phis[0] = epilog_stmt;
4497 }
4498
4499 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
4500 phis with new adjusted scalar results, i.e., replace use <s_out0>
4501 with use <s_out4>.
4502
4503 Transform:
4504 loop_exit:
4505 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4506 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4507 v_out2 = reduce <v_out1>
4508 s_out3 = extract_field <v_out2, 0>
4509 s_out4 = adjust_result <s_out3>
4510 use <s_out0>
4511 use <s_out0>
4512
4513 into:
4514
4515 loop_exit:
4516 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4517 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4518 v_out2 = reduce <v_out1>
4519 s_out3 = extract_field <v_out2, 0>
4520 s_out4 = adjust_result <s_out3>
4521 use <s_out4>
4522 use <s_out4> */
4523
4524
4525 /* In an SLP reduction chain we reduce the vector results into one vector if
4526 necessary, hence we set GROUP_SIZE to 1 here. SCALAR_DEST is the LHS of
4527 the last stmt in the reduction chain, since we are looking for the loop
4528 exit phi node. */
4529 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4530 {
4531 scalar_dest = gimple_assign_lhs (
4532 SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
4533 group_size = 1;
4534 }
4535
4536 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4537 case GROUP_SIZE is greater than the vectorization factor). Therefore, we
4538 need to match SCALAR_RESULTS with corresponding statements. The first
4539 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4540 the first vector stmt, etc.
4541 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
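 /* E.g. (hypothetical), GROUP_SIZE = 4 with 2 new vector stmts gives
 RATIO = 2: scalar results 0-1 are taken from the first vector stmt
 (and its reduction phi), scalar results 2-3 from the second. */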
4542 if (group_size > new_phis.length ())
4543 {
4544 ratio = group_size / new_phis.length ();
4545 gcc_assert (!(group_size % new_phis.length ()));
4546 }
4547 else
4548 ratio = 1;
4549
4550 for (k = 0; k < group_size; k++)
4551 {
4552 if (k % ratio == 0)
4553 {
4554 epilog_stmt = new_phis[k / ratio];
4555 reduction_phi = reduction_phis[k / ratio];
4556 if (double_reduc)
4557 inner_phi = inner_phis[k / ratio];
4558 }
4559
4560 if (slp_reduc)
4561 {
4562 gimple current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4563
4564 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4565 /* SLP statements can't participate in patterns. */
4566 gcc_assert (!orig_stmt);
4567 scalar_dest = gimple_assign_lhs (current_stmt);
4568 }
4569
4570 phis.create (3);
4571 /* Find the loop-closed-use at the loop exit of the original scalar
4572 result. (The reduction result is expected to have two immediate uses -
4573 one at the latch block, and one at the loop exit). */
4574 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4575 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
4576 && !is_gimple_debug (USE_STMT (use_p)))
4577 phis.safe_push (USE_STMT (use_p));
4578
4579 /* While we expect to have found an exit_phi because of loop-closed-ssa
4580 form we can end up without one if the scalar cycle is dead. */
4581
4582 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4583 {
4584 if (outer_loop)
4585 {
4586 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4587 gphi *vect_phi;
4588
4589 /* FORNOW. Currently not supporting the case that an inner-loop
4590 reduction is not used in the outer-loop (but only outside the
4591 outer-loop), unless it is double reduction. */
4592 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4593 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4594 || double_reduc);
4595
4596 if (double_reduc)
4597 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
4598 else
4599 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4600 if (!double_reduc
4601 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4602 != vect_double_reduction_def)
4603 continue;
4604
4605 /* Handle double reduction:
4606
4607 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
4608 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4609 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
4610 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
4611
4612 At that point the regular reduction (stmt2 and stmt3) is
4613 already vectorized, as well as the exit phi node, stmt4.
4614 Here we vectorize the phi node of double reduction, stmt1, and
4615 update all relevant statements. */
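 /* A source-level sketch of such a double reduction (hypothetical code,
 vectorized along the outer loop):

 s = s0;
 for (j = 0; j < M; j++) <-- outer loop, carries stmt1/stmt4
 for (i = 0; i < N; i++) <-- inner loop, carries stmt2/stmt3
 s = s + a[j][i]; */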
4616
4617 /* Go through all the uses of s2 to find double reduction phi
4618 node, i.e., stmt1 above. */
4619 orig_name = PHI_RESULT (exit_phi);
4620 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4621 {
4622 stmt_vec_info use_stmt_vinfo;
4623 stmt_vec_info new_phi_vinfo;
4624 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4625 basic_block bb = gimple_bb (use_stmt);
4626 gimple use;
4627
4628 /* Check that USE_STMT is really double reduction phi
4629 node. */
4630 if (gimple_code (use_stmt) != GIMPLE_PHI
4631 || gimple_phi_num_args (use_stmt) != 2
4632 || bb->loop_father != outer_loop)
4633 continue;
4634 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4635 if (!use_stmt_vinfo
4636 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4637 != vect_double_reduction_def)
4638 continue;
4639
4640 /* Create vector phi node for double reduction:
4641 vs1 = phi <vs0, vs2>
4642 vs1 was created previously in this function by a call to
4643 vect_get_vec_def_for_operand and is stored in
4644 vec_initial_def;
4645 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4646 vs0 is created here. */
4647
4648 /* Create vector phi node. */
4649 vect_phi = create_phi_node (vec_initial_def, bb);
4650 new_phi_vinfo = new_stmt_vec_info (vect_phi,
4651 loop_vec_info_for_loop (outer_loop), NULL);
4652 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4653
4654 /* Create vs0 - initial def of the double reduction phi. */
4655 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4656 loop_preheader_edge (outer_loop));
4657 init_def = get_initial_def_for_reduction (stmt,
4658 preheader_arg, NULL);
4659 vect_phi_init = vect_init_vector (use_stmt, init_def,
4660 vectype, NULL);
4661
4662 /* Update phi node arguments with vs0 and vs2. */
4663 add_phi_arg (vect_phi, vect_phi_init,
4664 loop_preheader_edge (outer_loop),
4665 UNKNOWN_LOCATION);
4666 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4667 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4668 if (dump_enabled_p ())
4669 {
4670 dump_printf_loc (MSG_NOTE, vect_location,
4671 "created double reduction phi node: ");
4672 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4673 dump_printf (MSG_NOTE, "\n");
4674 }
4675
4676 vect_phi_res = PHI_RESULT (vect_phi);
4677
4678 /* Replace the use, i.e., set the correct vs1 in the regular
4679 reduction phi node. FORNOW, NCOPIES is always 1, so the
4680 loop is redundant. */
4681 use = reduction_phi;
4682 for (j = 0; j < ncopies; j++)
4683 {
4684 edge pr_edge = loop_preheader_edge (loop);
4685 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4686 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4687 }
4688 }
4689 }
4690 }
4691
4692 phis.release ();
4693 if (nested_in_vect_loop)
4694 {
4695 if (double_reduc)
4696 loop = outer_loop;
4697 else
4698 continue;
4699 }
4700
4701 phis.create (3);
4702 /* Find the loop-closed-use at the loop exit of the original scalar
4703 result. (The reduction result is expected to have two immediate uses,
4704 one at the latch block, and one at the loop exit). For double
4705 reductions we are looking for exit phis of the outer loop. */
4706 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4707 {
4708 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4709 {
4710 if (!is_gimple_debug (USE_STMT (use_p)))
4711 phis.safe_push (USE_STMT (use_p));
4712 }
4713 else
4714 {
4715 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4716 {
4717 tree phi_res = PHI_RESULT (USE_STMT (use_p));
4718
4719 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4720 {
4721 if (!flow_bb_inside_loop_p (loop,
4722 gimple_bb (USE_STMT (phi_use_p)))
4723 && !is_gimple_debug (USE_STMT (phi_use_p)))
4724 phis.safe_push (USE_STMT (phi_use_p));
4725 }
4726 }
4727 }
4728 }
4729
4730 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4731 {
4732 /* Replace the uses: */
4733 orig_name = PHI_RESULT (exit_phi);
4734 scalar_result = scalar_results[k];
4735 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4736 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4737 SET_USE (use_p, scalar_result);
4738 }
4739
4740 phis.release ();
4741 }
4742 }
4743
4744
4745 /* Function vectorizable_reduction.
4746
4747 Check if STMT performs a reduction operation that can be vectorized.
4748 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4749 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4750 Return FALSE if not a vectorizable STMT, TRUE otherwise.
4751
4752 This function also handles reduction idioms (patterns) that have been
4753 recognized in advance during vect_pattern_recog. In this case, STMT may be
4754 of this form:
4755 X = pattern_expr (arg0, arg1, ..., X)
4756 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4757 sequence that had been detected and replaced by the pattern-stmt (STMT).
4758
4759 In some cases of reduction patterns, the type of the reduction variable X is
4760 different than the type of the other arguments of STMT.
4761 In such cases, the vectype that is used when transforming STMT into a vector
4762 stmt is different than the vectype that is used to determine the
4763 vectorization factor, because it consists of a different number of elements
4764 than the actual number of elements that are being operated upon in parallel.
4765
4766 For example, consider an accumulation of shorts into an int accumulator.
4767 On some targets it's possible to vectorize this pattern operating on 8
4768 shorts at a time (hence, the vectype for purposes of determining the
4769 vectorization factor should be V8HI); on the other hand, the vectype that
4770 is used to create the vector form is actually V4SI (the type of the result).
4771
4772 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4773 indicates what is the actual level of parallelism (V8HI in the example), so
4774 that the right vectorization factor would be derived. This vectype
4775 corresponds to the type of arguments to the reduction stmt, and should *NOT*
4776 be used to create the vectorized stmt. The right vectype for the vectorized
4777 stmt is obtained from the type of the result X:
4778 get_vectype_for_scalar_type (TREE_TYPE (X))
4779
4780 This means that, contrary to "regular" reductions (or "regular" stmts in
4781 general), the following equation:
4782 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4783 does *NOT* necessarily hold for reduction patterns. */
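 /* For illustration, a hypothetical accumulation of shorts into an int
 (the kind of pattern described above):

 short a[N]; int sum = 0;
 for (i = 0; i < N; i++)
 sum += a[i];

 On a target as described above, STMT_VINFO_VECTYPE would be V8HI
 (8 shorts determine the VF), while the vectorized stmt itself is
 created with get_vectype_for_scalar_type (int) == V4SI. */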
4784
4785 bool
4786 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4787 gimple *vec_stmt, slp_tree slp_node)
4788 {
4789 tree vec_dest;
4790 tree scalar_dest;
4791 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4792 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4793 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4794 tree vectype_in = NULL_TREE;
4795 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4796 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4797 enum tree_code code, orig_code, epilog_reduc_code;
4798 machine_mode vec_mode;
4799 int op_type;
4800 optab optab, reduc_optab;
4801 tree new_temp = NULL_TREE;
4802 tree def;
4803 gimple def_stmt;
4804 enum vect_def_type dt;
4805 gphi *new_phi = NULL;
4806 tree scalar_type;
4807 bool is_simple_use;
4808 gimple orig_stmt;
4809 stmt_vec_info orig_stmt_info;
4810 tree expr = NULL_TREE;
4811 int i;
4812 int ncopies;
4813 int epilog_copies;
4814 stmt_vec_info prev_stmt_info, prev_phi_info;
4815 bool single_defuse_cycle = false;
4816 tree reduc_def = NULL_TREE;
4817 gimple new_stmt = NULL;
4818 int j;
4819 tree ops[3];
4820 bool nested_cycle = false, found_nested_cycle_def = false;
4821 gimple reduc_def_stmt = NULL;
4822 /* The default is that the reduction variable is the last in statement. */
4823 int reduc_index = 2;
4824 bool double_reduc = false, dummy;
4825 basic_block def_bb;
4826 struct loop * def_stmt_loop, *outer_loop = NULL;
4827 tree def_arg;
4828 gimple def_arg_stmt;
4829 auto_vec<tree> vec_oprnds0;
4830 auto_vec<tree> vec_oprnds1;
4831 auto_vec<tree> vect_defs;
4832 auto_vec<gimple> phis;
4833 int vec_num;
4834 tree def0, def1, tem, op0, op1 = NULL_TREE;
4835
4836 /* In case of reduction chain we switch to the first stmt in the chain, but
4837 we don't update STMT_INFO, since only the last stmt is marked as reduction
4838 and has reduction properties. */
4839 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4840 stmt = GROUP_FIRST_ELEMENT (stmt_info);
4841
4842 if (nested_in_vect_loop_p (loop, stmt))
4843 {
4844 outer_loop = loop;
4845 loop = loop->inner;
4846 nested_cycle = true;
4847 }
4848
4849 /* 1. Is vectorizable reduction? */
4850 /* Not supportable if the reduction variable is used in the loop, unless
4851 it's a reduction chain. */
4852 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4853 && !GROUP_FIRST_ELEMENT (stmt_info))
4854 return false;
4855
4856 /* Reductions that are not used even in an enclosing outer-loop,
4857 are expected to be "live" (used out of the loop). */
4858 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4859 && !STMT_VINFO_LIVE_P (stmt_info))
4860 return false;
4861
4862 /* Make sure it was already recognized as a reduction computation. */
4863 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4864 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4865 return false;
4866
4867 /* 2. Has this been recognized as a reduction pattern?
4868
4869 Check if STMT represents a pattern that has been recognized
4870 in earlier analysis stages. For stmts that represent a pattern,
4871 the STMT_VINFO_RELATED_STMT field records the last stmt in
4872 the original sequence that constitutes the pattern. */
4873
4874 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4875 if (orig_stmt)
4876 {
4877 orig_stmt_info = vinfo_for_stmt (orig_stmt);
4878 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4879 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4880 }
4881
4882 /* 3. Check the operands of the operation. The first operands are defined
4883 inside the loop body. The last operand is the reduction variable,
4884 which is defined by the loop-header-phi. */
4885
4886 gcc_assert (is_gimple_assign (stmt));
4887
4888 /* Flatten RHS. */
4889 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4890 {
4891 case GIMPLE_SINGLE_RHS:
4892 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4893 if (op_type == ternary_op)
4894 {
4895 tree rhs = gimple_assign_rhs1 (stmt);
4896 ops[0] = TREE_OPERAND (rhs, 0);
4897 ops[1] = TREE_OPERAND (rhs, 1);
4898 ops[2] = TREE_OPERAND (rhs, 2);
4899 code = TREE_CODE (rhs);
4900 }
4901 else
4902 return false;
4903 break;
4904
4905 case GIMPLE_BINARY_RHS:
4906 code = gimple_assign_rhs_code (stmt);
4907 op_type = TREE_CODE_LENGTH (code);
4908 gcc_assert (op_type == binary_op);
4909 ops[0] = gimple_assign_rhs1 (stmt);
4910 ops[1] = gimple_assign_rhs2 (stmt);
4911 break;
4912
4913 case GIMPLE_TERNARY_RHS:
4914 code = gimple_assign_rhs_code (stmt);
4915 op_type = TREE_CODE_LENGTH (code);
4916 gcc_assert (op_type == ternary_op);
4917 ops[0] = gimple_assign_rhs1 (stmt);
4918 ops[1] = gimple_assign_rhs2 (stmt);
4919 ops[2] = gimple_assign_rhs3 (stmt);
4920 break;
4921
4922 case GIMPLE_UNARY_RHS:
4923 return false;
4924
4925 default:
4926 gcc_unreachable ();
4927 }
4928
4929 if (code == COND_EXPR && slp_node)
4930 return false;
4931
4932 scalar_dest = gimple_assign_lhs (stmt);
4933 scalar_type = TREE_TYPE (scalar_dest);
4934 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4935 && !SCALAR_FLOAT_TYPE_P (scalar_type))
4936 return false;
4937
4938 /* Do not try to vectorize bit-precision reductions. */
4939 if ((TYPE_PRECISION (scalar_type)
4940 != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4941 return false;
4942
4943 /* All uses but the last are expected to be defined in the loop.
4944 The last use is the reduction variable. In case of nested cycle this
4945 assumption is not true: we use reduc_index to record the index of the
4946 reduction variable. */
4947 for (i = 0; i < op_type - 1; i++)
4948 {
4949 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
4950 if (i == 0 && code == COND_EXPR)
4951 continue;
4952
4953 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4954 &def_stmt, &def, &dt, &tem);
4955 if (!vectype_in)
4956 vectype_in = tem;
4957 gcc_assert (is_simple_use);
4958
4959 if (dt != vect_internal_def
4960 && dt != vect_external_def
4961 && dt != vect_constant_def
4962 && dt != vect_induction_def
4963 && !(dt == vect_nested_cycle && nested_cycle))
4964 return false;
4965
4966 if (dt == vect_nested_cycle)
4967 {
4968 found_nested_cycle_def = true;
4969 reduc_def_stmt = def_stmt;
4970 reduc_index = i;
4971 }
4972 }
4973
4974 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4975 &def_stmt, &def, &dt, &tem);
4976 if (!vectype_in)
4977 vectype_in = tem;
4978 gcc_assert (is_simple_use);
4979 if (!(dt == vect_reduction_def
4980 || dt == vect_nested_cycle
4981 || ((dt == vect_internal_def || dt == vect_external_def
4982 || dt == vect_constant_def || dt == vect_induction_def)
4983 && nested_cycle && found_nested_cycle_def)))
4984 {
4985 /* For pattern recognized stmts, orig_stmt might be a reduction,
4986 but some helper statements for the pattern might not, or
4987 might be COND_EXPRs with reduction uses in the condition. */
4988 gcc_assert (orig_stmt);
4989 return false;
4990 }
4991 if (!found_nested_cycle_def)
4992 reduc_def_stmt = def_stmt;
4993
4994 gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4995 if (orig_stmt)
4996 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4997 reduc_def_stmt,
4998 !nested_cycle,
4999 &dummy));
5000 else
5001 {
5002 gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
5003 !nested_cycle, &dummy);
5004 /* We changed STMT to be the first stmt in reduction chain, hence we
5005 check that in this case the first element in the chain is STMT. */
5006 gcc_assert (stmt == tmp
5007 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
5008 }
5009
5010 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
5011 return false;
5012
5013 if (slp_node || PURE_SLP_STMT (stmt_info))
5014 ncopies = 1;
5015 else
5016 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5017 / TYPE_VECTOR_SUBPARTS (vectype_in));
5018
5019 gcc_assert (ncopies >= 1);
5020
5021 vec_mode = TYPE_MODE (vectype_in);
5022
5023 if (code == COND_EXPR)
5024 {
5025 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
5026 {
5027 if (dump_enabled_p ())
5028 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5029 "unsupported condition in reduction\n");
5030
5031 return false;
5032 }
5033 }
5034 else
5035 {
5036 /* 4. Supportable by target? */
5037
5038 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
5039 || code == LROTATE_EXPR || code == RROTATE_EXPR)
5040 {
5041 /* Shifts and rotates are only supported by vectorizable_shifts,
5042 not vectorizable_reduction. */
5043 if (dump_enabled_p ())
5044 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5045 "unsupported shift or rotation.\n");
5046 return false;
5047 }
5048
5049 /* 4.1. check support for the operation in the loop */
5050 optab = optab_for_tree_code (code, vectype_in, optab_default);
5051 if (!optab)
5052 {
5053 if (dump_enabled_p ())
5054 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5055 "no optab.\n");
5056
5057 return false;
5058 }
5059
5060 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5061 {
5062 if (dump_enabled_p ())
5063 dump_printf (MSG_NOTE, "op not supported by target.\n");
5064
5065 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
5066 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5067 < vect_min_worthwhile_factor (code))
5068 return false;
5069
5070 if (dump_enabled_p ())
5071 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
5072 }
5073
5074 /* Worthwhile without SIMD support? */
5075 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
5076 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5077 < vect_min_worthwhile_factor (code))
5078 {
5079 if (dump_enabled_p ())
5080 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5081 "not worthwhile without SIMD support.\n");
5082
5083 return false;
5084 }
5085 }
5086
5087 /* 4.2. Check support for the epilog operation.
5088
5089 If STMT represents a reduction pattern, then the type of the
5090 reduction variable may be different than the type of the rest
5091 of the arguments. For example, consider the case of accumulation
5092 of shorts into an int accumulator; The original code:
5093 S1: int_a = (int) short_a;
5094 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
5095
5096 was replaced with:
5097 STMT: int_acc = widen_sum <short_a, int_acc>
5098
5099 This means that:
5100 1. The tree-code that is used to create the vector operation in the
5101 epilog code (that reduces the partial results) is not the
5102 tree-code of STMT, but is rather the tree-code of the original
5103 stmt from the pattern that STMT is replacing. I.e, in the example
5104 above we want to use 'widen_sum' in the loop, but 'plus' in the
5105 epilog.
5106 2. The type (mode) we use to check available target support
5107 for the vector operation to be created in the *epilog*, is
5108 determined by the type of the reduction variable (in the example
5109 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
5110 However the type (mode) we use to check available target support
5111 for the vector operation to be created *inside the loop*, is
5112 determined by the type of the other arguments to STMT (in the
5113 example we'd check this: optab_handler (widen_sum_optab,
5114 vect_short_mode)).
5115
5116 This is contrary to "regular" reductions, in which the types of all
5117 the arguments are the same as the type of the reduction variable.
5118 For "regular" reductions we can therefore use the same vector type
5119 (and also the same tree-code) when generating the epilog code and
5120 when generating the code inside the loop. */
5121
5122 if (orig_stmt)
5123 {
5124 /* This is a reduction pattern: get the vectype from the type of the
5125 reduction variable, and get the tree-code from orig_stmt. */
5126 orig_code = gimple_assign_rhs_code (orig_stmt);
5127 gcc_assert (vectype_out);
5128 vec_mode = TYPE_MODE (vectype_out);
5129 }
5130 else
5131 {
5132 /* Regular reduction: the same vectype and tree-code as used for
5133 the vector code inside the loop can be used for the epilog code. */
5134 orig_code = code;
5135 }
5136
5137 if (nested_cycle)
5138 {
5139 def_bb = gimple_bb (reduc_def_stmt);
5140 def_stmt_loop = def_bb->loop_father;
5141 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5142 loop_preheader_edge (def_stmt_loop));
5143 if (TREE_CODE (def_arg) == SSA_NAME
5144 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5145 && gimple_code (def_arg_stmt) == GIMPLE_PHI
5146 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5147 && vinfo_for_stmt (def_arg_stmt)
5148 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5149 == vect_double_reduction_def)
5150 double_reduc = true;
5151 }
5152
5153 epilog_reduc_code = ERROR_MARK;
5154 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5155 {
5156 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5157 optab_default);
5158 if (!reduc_optab)
5159 {
5160 if (dump_enabled_p ())
5161 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5162 "no optab for reduction.\n");
5163
5164 epilog_reduc_code = ERROR_MARK;
5165 }
5166 else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5167 {
5168 optab = scalar_reduc_to_vector (reduc_optab, vectype_out);
5169 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5170 {
5171 if (dump_enabled_p ())
5172 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5173 "reduc op not supported by target.\n");
5174
5175 epilog_reduc_code = ERROR_MARK;
5176 }
5177 }
5178 }
5179 else
5180 {
5181 if (!nested_cycle || double_reduc)
5182 {
5183 if (dump_enabled_p ())
5184 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5185 "no reduc code for scalar code.\n");
5186
5187 return false;
5188 }
5189 }
5190
5191 if (double_reduc && ncopies > 1)
5192 {
5193 if (dump_enabled_p ())
5194 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5195 "multiple types in double reduction\n");
5196
5197 return false;
5198 }
5199
5200 /* In case of widening multiplication by a constant, we update the type
5201 of the constant to be the type of the other operand. We check that the
5202 constant fits the type in the pattern recognition pass. */
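/* An illustrative sketch (the loop and values are assumptions): for
   acc += (int) s[i] * 3 with short s[], recognized as
   DOT_PROD_EXPR <s[i], 3, acc>, the INTEGER_CST 3 is folded below to the
   narrow type of s[i] so that both multiplication operands share one
   vectype.  */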
5203 if (code == DOT_PROD_EXPR
5204 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5205 {
5206 if (TREE_CODE (ops[0]) == INTEGER_CST)
5207 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5208 else if (TREE_CODE (ops[1]) == INTEGER_CST)
5209 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5210 else
5211 {
5212 if (dump_enabled_p ())
5213 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5214 "invalid types in dot-prod\n");
5215
5216 return false;
5217 }
5218 }
5219
5220 if (!vec_stmt) /* transformation not required. */
5221 {
5222 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
5223 return false;
5224 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5225 return true;
5226 }
5227
5228 /** Transform. **/
5229
5230 if (dump_enabled_p ())
5231 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5232
5233 /* FORNOW: Multiple types are not supported for condition. */
5234 if (code == COND_EXPR)
5235 gcc_assert (ncopies == 1);
5236
5237 /* Create the destination vector. */
5238 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5239
5240 /* In case the vectorization factor (VF) is bigger than the number
5241 of elements that we can fit in a vectype (nunits), we have to generate
5242 more than one vector stmt - i.e., we need to "unroll" the
5243 vector stmt by a factor VF/nunits. For more details see documentation
5244 in vectorizable_operation. */
5245
5246 /* If the reduction is used in an outer loop we need to generate
5247 VF intermediate results, like so (e.g. for ncopies=2):
5248 r0 = phi (init, r0)
5249 r1 = phi (init, r1)
5250 r0 = x0 + r0;
5251 r1 = x1 + r1;
5252 (i.e. we generate VF results in 2 registers).
5253 In this case we have a separate def-use cycle for each copy, and therefore
5254 for each copy we get the vector def for the reduction variable from the
5255 respective phi node created for this copy.
5256
5257 Otherwise (the reduction is unused in the loop nest), we can combine
5258 together intermediate results, like so (e.g. for ncopies=2):
5259 r = phi (init, r)
5260 r = x0 + r;
5261 r = x1 + r;
5262 (i.e. we generate VF/2 results in a single register).
5263 In this case for each copy we get the vector def for the reduction variable
5264 from the vectorized reduction operation generated in the previous iteration.
5265 */
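/* For instance (an illustrative sketch; the loop and numbers are
   assumptions): for
     for (i = 0; i < n; i++)
       sum += a[i];
   with VF = 8 and 4 elements per vector (ncopies = 2), the two vector
   adds generated per iteration either feed two separate phis as in the
   first layout above, or are chained through a single phi as in the
   second.  */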
5266
5267 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5268 {
5269 single_defuse_cycle = true;
5270 epilog_copies = 1;
5271 }
5272 else
5273 epilog_copies = ncopies;
5274
5275 prev_stmt_info = NULL;
5276 prev_phi_info = NULL;
5277 if (slp_node)
5278 {
5279 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5280 gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
5281 == TYPE_VECTOR_SUBPARTS (vectype_in));
5282 }
5283 else
5284 {
5285 vec_num = 1;
5286 vec_oprnds0.create (1);
5287 if (op_type == ternary_op)
5288 vec_oprnds1.create (1);
5289 }
5290
5291 phis.create (vec_num);
5292 vect_defs.create (vec_num);
5293 if (!slp_node)
5294 vect_defs.quick_push (NULL_TREE);
5295
5296 for (j = 0; j < ncopies; j++)
5297 {
5298 if (j == 0 || !single_defuse_cycle)
5299 {
5300 for (i = 0; i < vec_num; i++)
5301 {
5302 /* Create the reduction-phi that defines the reduction
5303 operand. */
5304 new_phi = create_phi_node (vec_dest, loop->header);
5305 set_vinfo_for_stmt (new_phi,
5306 new_stmt_vec_info (new_phi, loop_vinfo,
5307 NULL));
5308 if (j == 0 || slp_node)
5309 phis.quick_push (new_phi);
5310 }
5311 }
5312
5313 if (code == COND_EXPR)
5314 {
5315 gcc_assert (!slp_node);
5316 vectorizable_condition (stmt, gsi, vec_stmt,
5317 PHI_RESULT (phis[0]),
5318 reduc_index, NULL);
5319 /* Multiple types are not supported for condition. */
5320 break;
5321 }
5322
5323 /* Handle uses. */
5324 if (j == 0)
5325 {
5326 op0 = ops[!reduc_index];
5327 if (op_type == ternary_op)
5328 {
5329 if (reduc_index == 0)
5330 op1 = ops[2];
5331 else
5332 op1 = ops[1];
5333 }
5334
5335 if (slp_node)
5336 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5337 slp_node, -1);
5338 else
5339 {
5340 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5341 stmt, NULL);
5342 vec_oprnds0.quick_push (loop_vec_def0);
5343 if (op_type == ternary_op)
5344 {
5345 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5346 NULL);
5347 vec_oprnds1.quick_push (loop_vec_def1);
5348 }
5349 }
5350 }
5351 else
5352 {
5353 if (!slp_node)
5354 {
5355 enum vect_def_type dt;
5356 gimple dummy_stmt;
5357 tree dummy;
5358
5359 vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5360 &dummy_stmt, &dummy, &dt);
5361 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5362 loop_vec_def0);
5363 vec_oprnds0[0] = loop_vec_def0;
5364 if (op_type == ternary_op)
5365 {
5366 vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5367 &dummy, &dt);
5368 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5369 loop_vec_def1);
5370 vec_oprnds1[0] = loop_vec_def1;
5371 }
5372 }
5373
5374 if (single_defuse_cycle)
5375 reduc_def = gimple_assign_lhs (new_stmt);
5376
5377 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5378 }
5379
5380 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5381 {
5382 if (slp_node)
5383 reduc_def = PHI_RESULT (phis[i]);
5384 else
5385 {
5386 if (!single_defuse_cycle || j == 0)
5387 reduc_def = PHI_RESULT (new_phi);
5388 }
5389
5390 def1 = ((op_type == ternary_op)
5391 ? vec_oprnds1[i] : NULL);
5392 if (op_type == binary_op)
5393 {
5394 if (reduc_index == 0)
5395 expr = build2 (code, vectype_out, reduc_def, def0);
5396 else
5397 expr = build2 (code, vectype_out, def0, reduc_def);
5398 }
5399 else
5400 {
5401 if (reduc_index == 0)
5402 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5403 else
5404 {
5405 if (reduc_index == 1)
5406 expr = build3 (code, vectype_out, def0, reduc_def, def1);
5407 else
5408 expr = build3 (code, vectype_out, def0, def1, reduc_def);
5409 }
5410 }
5411
5412 new_stmt = gimple_build_assign (vec_dest, expr);
5413 new_temp = make_ssa_name (vec_dest, new_stmt);
5414 gimple_assign_set_lhs (new_stmt, new_temp);
5415 vect_finish_stmt_generation (stmt, new_stmt, gsi);
5416
5417 if (slp_node)
5418 {
5419 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5420 vect_defs.quick_push (new_temp);
5421 }
5422 else
5423 vect_defs[0] = new_temp;
5424 }
5425
5426 if (slp_node)
5427 continue;
5428
5429 if (j == 0)
5430 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5431 else
5432 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5433
5434 prev_stmt_info = vinfo_for_stmt (new_stmt);
5435 prev_phi_info = vinfo_for_stmt (new_phi);
5436 }
5437
5438 /* Finalize the reduction-phi (set its arguments) and create the
5439 epilog reduction code. */
5440 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5441 {
5442 new_temp = gimple_assign_lhs (*vec_stmt);
5443 vect_defs[0] = new_temp;
5444 }
5445
5446 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5447 epilog_reduc_code, phis, reduc_index,
5448 double_reduc, slp_node);
5449
5450 return true;
5451 }
5452
5453 /* Function vect_min_worthwhile_factor.
5454
5455 For a loop where we could vectorize the operation indicated by CODE,
5456 return the minimum vectorization factor that makes it worthwhile
5457 to use generic vectors. */
5458 int
5459 vect_min_worthwhile_factor (enum tree_code code)
5460 {
5461 switch (code)
5462 {
5463 case PLUS_EXPR:
5464 case MINUS_EXPR:
5465 case NEGATE_EXPR:
5466 return 4;
5467
5468 case BIT_AND_EXPR:
5469 case BIT_IOR_EXPR:
5470 case BIT_XOR_EXPR:
5471 case BIT_NOT_EXPR:
5472 return 2;
5473
5474 default:
5475 return INT_MAX;
5476 }
5477 }
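
/* For example (illustrative): with generic word-mode vectors a PLUS_EXPR
   reduction is only considered profitable when at least four scalar
   elements are combined per operation, i.e. callers require
     LOOP_VINFO_VECT_FACTOR (loop_vinfo) >= vect_min_worthwhile_factor (code)
   before falling back to word-mode arithmetic.  */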
5478
5479
5480 /* Function vectorizable_induction
5481
5482 Check if PHI performs an induction computation that can be vectorized.
5483 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5484 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5485 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
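
/* For example (illustrative): for the induction
     for (i = 0; i < n; i++)
       a[i] = i;
   with VF = 4 the scalar phi for i is replaced by a vector phi whose
   initial value is {0, 1, 2, 3} and which is stepped by {4, 4, 4, 4} in
   each vector iteration.  */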
5486
5487 bool
5488 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5489 gimple *vec_stmt)
5490 {
5491 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5492 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5493 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5494 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5495 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5496 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5497 tree vec_def;
5498
5499 gcc_assert (ncopies >= 1);
5500 /* FORNOW. These restrictions should be relaxed. */
5501 if (nested_in_vect_loop_p (loop, phi))
5502 {
5503 imm_use_iterator imm_iter;
5504 use_operand_p use_p;
5505 gimple exit_phi;
5506 edge latch_e;
5507 tree loop_arg;
5508
5509 if (ncopies > 1)
5510 {
5511 if (dump_enabled_p ())
5512 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5513 "multiple types in nested loop.\n");
5514 return false;
5515 }
5516
5517 exit_phi = NULL;
5518 latch_e = loop_latch_edge (loop->inner);
5519 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5520 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5521 {
5522 gimple use_stmt = USE_STMT (use_p);
5523 if (is_gimple_debug (use_stmt))
5524 continue;
5525
5526 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
5527 {
5528 exit_phi = use_stmt;
5529 break;
5530 }
5531 }
5532 if (exit_phi)
5533 {
5534 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5535 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5536 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5537 {
5538 if (dump_enabled_p ())
5539 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5540 "inner-loop induction only used outside "
5541 "of the outer vectorized loop.\n");
5542 return false;
5543 }
5544 }
5545 }
5546
5547 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5548 return false;
5549
5550 /* FORNOW: SLP not supported. */
5551 if (STMT_SLP_TYPE (stmt_info))
5552 return false;
5553
5554 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5555
5556 if (gimple_code (phi) != GIMPLE_PHI)
5557 return false;
5558
5559 if (!vec_stmt) /* transformation not required. */
5560 {
5561 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5562 if (dump_enabled_p ())
5563 dump_printf_loc (MSG_NOTE, vect_location,
5564 "=== vectorizable_induction ===\n");
5565 vect_model_induction_cost (stmt_info, ncopies);
5566 return true;
5567 }
5568
5569 /** Transform. **/
5570
5571 if (dump_enabled_p ())
5572 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
5573
5574 vec_def = get_initial_def_for_induction (phi);
5575 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5576 return true;
5577 }
5578
5579 /* Function vectorizable_live_operation.
5580
5581 STMT computes a value that is used outside the loop. Check if
5582 it can be supported. */
5583
5584 bool
5585 vectorizable_live_operation (gimple stmt,
5586 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5587 gimple *vec_stmt)
5588 {
5589 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5590 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5591 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5592 int i;
5593 int op_type;
5594 tree op;
5595 tree def;
5596 gimple def_stmt;
5597 enum vect_def_type dt;
5598 enum tree_code code;
5599 enum gimple_rhs_class rhs_class;
5600
5601 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5602
5603 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5604 return false;
5605
5606 if (!is_gimple_assign (stmt))
5607 {
5608 if (gimple_call_internal_p (stmt)
5609 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
5610 && gimple_call_lhs (stmt)
5611 && loop->simduid
5612 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
5613 && loop->simduid
5614 == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
5615 {
5616 edge e = single_exit (loop);
5617 basic_block merge_bb = e->dest;
5618 imm_use_iterator imm_iter;
5619 use_operand_p use_p;
5620 tree lhs = gimple_call_lhs (stmt);
5621
5622 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
5623 {
5624 gimple use_stmt = USE_STMT (use_p);
5625 if (gimple_code (use_stmt) == GIMPLE_PHI
5626 && gimple_bb (use_stmt) == merge_bb)
5627 {
5628 if (vec_stmt)
5629 {
5630 tree vfm1
5631 = build_int_cst (unsigned_type_node,
5632 loop_vinfo->vectorization_factor - 1);
5633 SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
5634 }
5635 return true;
5636 }
5637 }
5638 }
5639
5640 return false;
5641 }
5642
5643 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5644 return false;
5645
5646 /* FORNOW. CHECKME. */
5647 if (nested_in_vect_loop_p (loop, stmt))
5648 return false;
5649
5650 code = gimple_assign_rhs_code (stmt);
5651 op_type = TREE_CODE_LENGTH (code);
5652 rhs_class = get_gimple_rhs_class (code);
5653 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5654 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5655
5656 /* FORNOW: support only if all uses are invariant. This means
5657 that the scalar operations can remain in place, unvectorized.
5658 The original last scalar value that they compute will be used. */
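/* E.g. (an illustrative sketch):
     for (i = 0; i < n; i++)
       { ...; last = x + 1; }
     use (last);
   where x is loop-invariant: the scalar statement can stay in place and
   the use outside the loop keeps seeing its original scalar value.  */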
5659
5660 for (i = 0; i < op_type; i++)
5661 {
5662 if (rhs_class == GIMPLE_SINGLE_RHS)
5663 op = TREE_OPERAND (gimple_op (stmt, 1), i);
5664 else
5665 op = gimple_op (stmt, i + 1);
5666 if (op
5667 && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5668 &dt))
5669 {
5670 if (dump_enabled_p ())
5671 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5672 "use not simple.\n");
5673 return false;
5674 }
5675
5676 if (dt != vect_external_def && dt != vect_constant_def)
5677 return false;
5678 }
5679
5680 /* No transformation is required for the cases we currently support. */
5681 return true;
5682 }
5683
5684 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
5685
5686 static void
5687 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5688 {
5689 ssa_op_iter op_iter;
5690 imm_use_iterator imm_iter;
5691 def_operand_p def_p;
5692 gimple ustmt;
5693
5694 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5695 {
5696 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5697 {
5698 basic_block bb;
5699
5700 if (!is_gimple_debug (ustmt))
5701 continue;
5702
5703 bb = gimple_bb (ustmt);
5704
5705 if (!flow_bb_inside_loop_p (loop, bb))
5706 {
5707 if (gimple_debug_bind_p (ustmt))
5708 {
5709 if (dump_enabled_p ())
5710 dump_printf_loc (MSG_NOTE, vect_location,
5711 "killing debug use\n");
5712
5713 gimple_debug_bind_reset_value (ustmt);
5714 update_stmt (ustmt);
5715 }
5716 else
5717 gcc_unreachable ();
5718 }
5719 }
5720 }
5721 }
5722
5723
5724 /* This function builds ni_name = number of iterations. Statements
5725 are emitted on the loop preheader edge. */
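
/* For example (illustrative): for a loop with a symbolic bound n, a
   statement along the lines of niters_N = (unsigned int) n_M is emitted
   on the preheader edge and its SSA name is returned; a constant bound
   is returned directly as an INTEGER_CST.  */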
5726
5727 static tree
5728 vect_build_loop_niters (loop_vec_info loop_vinfo)
5729 {
5730 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
5731 if (TREE_CODE (ni) == INTEGER_CST)
5732 return ni;
5733 else
5734 {
5735 tree ni_name, var;
5736 gimple_seq stmts = NULL;
5737 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5738
5739 var = create_tmp_var (TREE_TYPE (ni), "niters");
5740 ni_name = force_gimple_operand (ni, &stmts, false, var);
5741 if (stmts)
5742 gsi_insert_seq_on_edge_immediate (pe, stmts);
5743
5744 return ni_name;
5745 }
5746 }
5747
5748
5749 /* This function generates the following statements:
5750
5751 ni_name = number of iterations the loop executes
5752 ratio = ni_name / vf
5753 ratio_mult_vf_name = ratio * vf
5754
5755 and places them on the loop preheader edge. */
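
/* For example (illustrative numbers): with ni_name = 103 and vf = 8 this
   produces ratio = 12 and ratio_mult_vf_name = 96; the remaining 7
   iterations are left to the scalar epilog loop.  */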
5756
5757 static void
5758 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
5759 tree ni_name,
5760 tree *ratio_mult_vf_name_ptr,
5761 tree *ratio_name_ptr)
5762 {
5763 tree ni_minus_gap_name;
5764 tree var;
5765 tree ratio_name;
5766 tree ratio_mult_vf_name;
5767 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5768 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5769 tree log_vf;
5770
5771 log_vf = build_int_cst (TREE_TYPE (ni_name), exact_log2 (vf));
5772
5773 /* If an epilogue loop is required because of data accesses with gaps, we
5774 subtract one iteration from the total number of iterations here for
5775 correct calculation of RATIO. */
5776 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5777 {
5778 ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5779 ni_name,
5780 build_one_cst (TREE_TYPE (ni_name)));
5781 if (!is_gimple_val (ni_minus_gap_name))
5782 {
5783 var = create_tmp_var (TREE_TYPE (ni_name), "ni_gap");
5784 gimple stmts = NULL;
5785 ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
5786 true, var);
5787 gsi_insert_seq_on_edge_immediate (pe, stmts);
5788 }
5789 }
5790 else
5791 ni_minus_gap_name = ni_name;
5792
5793 /* Create: ratio = ni >> log2(vf) */
5794 /* ??? As we have ni == number of latch executions + 1, ni could
5795 have overflowed to zero. So avoid computing the ratio based on ni
5796 but compute it using the fact that we know ratio will be at least
5797 one, thus via (ni - vf) >> log2(vf) + 1. */
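/* Illustrative example (an 8-bit niter type is assumed purely for
   brevity): if the latch executes 255 times, ni = 256 wraps to 0; with
   vf = 8, ((ni - vf) >> log2 (vf)) + 1 = (248 >> 3) + 1 = 32, the
   correct ratio, whereas ni >> log2 (vf) would yield 0.  */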
5798 ratio_name
5799 = fold_build2 (PLUS_EXPR, TREE_TYPE (ni_name),
5800 fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name),
5801 fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5802 ni_minus_gap_name,
5803 build_int_cst
5804 (TREE_TYPE (ni_name), vf)),
5805 log_vf),
5806 build_int_cst (TREE_TYPE (ni_name), 1));
5807 if (!is_gimple_val (ratio_name))
5808 {
5809 var = create_tmp_var (TREE_TYPE (ni_name), "bnd");
5810 gimple stmts = NULL;
5811 ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
5812 gsi_insert_seq_on_edge_immediate (pe, stmts);
5813 }
5814 *ratio_name_ptr = ratio_name;
5815
5816 /* Create: ratio_mult_vf = ratio << log2 (vf). */
5817
5818 if (ratio_mult_vf_name_ptr)
5819 {
5820 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
5821 ratio_name, log_vf);
5822 if (!is_gimple_val (ratio_mult_vf_name))
5823 {
5824 var = create_tmp_var (TREE_TYPE (ni_name), "ratio_mult_vf");
5825 gimple stmts = NULL;
5826 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
5827 true, var);
5828 gsi_insert_seq_on_edge_immediate (pe, stmts);
5829 }
5830 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
5831 }
5832
5833 return;
5834 }
5835
5836
5837 /* Function vect_transform_loop.
5838
5839 The analysis phase has determined that the loop is vectorizable.
5840 Vectorize the loop - create vectorized stmts to replace the scalar
5841 stmts in the loop, and update the loop exit condition. */
5842
5843 void
5844 vect_transform_loop (loop_vec_info loop_vinfo)
5845 {
5846 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5847 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5848 int nbbs = loop->num_nodes;
5849 int i;
5850 tree ratio = NULL;
5851 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5852 bool grouped_store;
5853 bool slp_scheduled = false;
5854 gimple stmt, pattern_stmt;
5855 gimple_seq pattern_def_seq = NULL;
5856 gimple_stmt_iterator pattern_def_si = gsi_none ();
5857 bool transform_pattern_stmt = false;
5858 bool check_profitability = false;
5859 int th;
5860 /* Record the number of iterations before we start tampering with the profile. */
5861 gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5862
5863 if (dump_enabled_p ())
5864 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
5865
5866 /* If the profile is imprecise, we have a chance to fix it up. */
5867 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5868 expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5869
5870 /* Use the more conservative vectorization threshold. If the number
5871 of iterations is constant assume the cost check has been performed
5872 by our caller. If the threshold makes all loops profitable that
5873 run at least the vectorization factor number of times checking
5874 is pointless, too. */
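/* For instance (illustrative numbers): with VF = 8 and th = 20 a runtime
   guard is emitted, because a loop that reaches the vector code with
   only, say, 10 iterations would be vectorized unprofitably; with
   th = 6 < VF - 1 any loop that reaches the vector code already runs
   enough iterations, so no guard is needed.  */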
5875 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
5876 if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5877 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5878 {
5879 if (dump_enabled_p ())
5880 dump_printf_loc (MSG_NOTE, vect_location,
5881 "Profitability threshold is %d loop iterations.\n",
5882 th);
5883 check_profitability = true;
5884 }
5885
5886 /* Version the loop first, if required, so the profitability check
5887 comes first. */
5888
5889 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5890 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5891 {
5892 vect_loop_versioning (loop_vinfo, th, check_profitability);
5893 check_profitability = false;
5894 }
5895
5896 tree ni_name = vect_build_loop_niters (loop_vinfo);
5897 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
5898
5899 /* Peel the loop if there are data refs with unknown alignment.
5900 Only one data ref with unknown alignment is allowed. */
5901
5902 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
5903 {
5904 vect_do_peeling_for_alignment (loop_vinfo, ni_name,
5905 th, check_profitability);
5906 check_profitability = false;
5907 /* The above adjusts LOOP_VINFO_NITERS, so ni_name has to
5908 be re-computed. */
5909 ni_name = NULL_TREE;
5910 }
5911
5912 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5913 compile-time constant), or it is a constant that is not divisible by the
5914 vectorization factor, then an epilog loop needs to be created.
5915 We therefore duplicate the loop: the original loop will be vectorized,
5916 and will compute the first (n/VF) iterations. The second copy of the loop
5917 will remain scalar and will compute the remaining (n%VF) iterations.
5918 (VF is the vectorization factor). */
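/* E.g. (illustrative): with VF = 8 and n = 1003 at run time, the
   vectorized copy executes 125 iterations and the scalar epilog copy
   executes the remaining 3.  */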
5919
5920 if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
5921 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5922 {
5923 tree ratio_mult_vf;
5924 if (!ni_name)
5925 ni_name = vect_build_loop_niters (loop_vinfo);
5926 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
5927 &ratio);
5928 vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
5929 th, check_profitability);
5930 }
5931 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5932 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5933 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5934 else
5935 {
5936 if (!ni_name)
5937 ni_name = vect_build_loop_niters (loop_vinfo);
5938 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
5939 }
5940
5941 /* 1) Make sure the loop header has exactly two entries
5942 2) Make sure we have a preheader basic block. */
5943
5944 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5945
5946 split_edge (loop_preheader_edge (loop));
5947
5948 /* FORNOW: the vectorizer supports only loops whose body consists
5949 of one basic block (header + empty latch). When the vectorizer
5950 supports more involved loop forms, the order in which the BBs are
5951 traversed needs to be reconsidered. */
5952
5953 for (i = 0; i < nbbs; i++)
5954 {
5955 basic_block bb = bbs[i];
5956 stmt_vec_info stmt_info;
5957
5958 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
5959 gsi_next (&si))
5960 {
5961 gphi *phi = si.phi ();
5962 if (dump_enabled_p ())
5963 {
5964 dump_printf_loc (MSG_NOTE, vect_location,
5965 "------>vectorizing phi: ");
5966 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
5967 dump_printf (MSG_NOTE, "\n");
5968 }
5969 stmt_info = vinfo_for_stmt (phi);
5970 if (!stmt_info)
5971 continue;
5972
5973 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5974 vect_loop_kill_debug_uses (loop, phi);
5975
5976 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5977 && !STMT_VINFO_LIVE_P (stmt_info))
5978 continue;
5979
5980 if (STMT_VINFO_VECTYPE (stmt_info)
5981 && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5982 != (unsigned HOST_WIDE_INT) vectorization_factor)
5983 && dump_enabled_p ())
5984 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
5985
5986 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
5987 {
5988 if (dump_enabled_p ())
5989 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
5990 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
5991 }
5992 }
5993
5994 pattern_stmt = NULL;
5995 for (gimple_stmt_iterator si = gsi_start_bb (bb);
5996 !gsi_end_p (si) || transform_pattern_stmt;)
5997 {
5998 bool is_store;
5999
6000 if (transform_pattern_stmt)
6001 stmt = pattern_stmt;
6002 else
6003 {
6004 stmt = gsi_stmt (si);
6005 /* During vectorization remove existing clobber stmts. */
6006 if (gimple_clobber_p (stmt))
6007 {
6008 unlink_stmt_vdef (stmt);
6009 gsi_remove (&si, true);
6010 release_defs (stmt);
6011 continue;
6012 }
6013 }
6014
6015 if (dump_enabled_p ())
6016 {
6017 dump_printf_loc (MSG_NOTE, vect_location,
6018 "------>vectorizing statement: ");
6019 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
6020 dump_printf (MSG_NOTE, "\n");
6021 }
6022
6023 stmt_info = vinfo_for_stmt (stmt);
6024
6025 /* vector stmts created in the outer-loop during vectorization of
6026 stmts in an inner-loop may not have a stmt_info, and do not
6027 need to be vectorized. */
6028 if (!stmt_info)
6029 {
6030 gsi_next (&si);
6031 continue;
6032 }
6033
6034 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6035 vect_loop_kill_debug_uses (loop, stmt);
6036
6037 if (!STMT_VINFO_RELEVANT_P (stmt_info)
6038 && !STMT_VINFO_LIVE_P (stmt_info))
6039 {
6040 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6041 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6042 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6043 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6044 {
6045 stmt = pattern_stmt;
6046 stmt_info = vinfo_for_stmt (stmt);
6047 }
6048 else
6049 {
6050 gsi_next (&si);
6051 continue;
6052 }
6053 }
6054 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6055 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6056 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6057 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6058 transform_pattern_stmt = true;
6059
6060 /* If pattern statement has def stmts, vectorize them too. */
6061 if (is_pattern_stmt_p (stmt_info))
6062 {
6063 if (pattern_def_seq == NULL)
6064 {
6065 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
6066 pattern_def_si = gsi_start (pattern_def_seq);
6067 }
6068 else if (!gsi_end_p (pattern_def_si))
6069 gsi_next (&pattern_def_si);
6070 if (pattern_def_seq != NULL)
6071 {
6072 gimple pattern_def_stmt = NULL;
6073 stmt_vec_info pattern_def_stmt_info = NULL;
6074
6075 while (!gsi_end_p (pattern_def_si))
6076 {
6077 pattern_def_stmt = gsi_stmt (pattern_def_si);
6078 pattern_def_stmt_info
6079 = vinfo_for_stmt (pattern_def_stmt);
6080 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
6081 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
6082 break;
6083 gsi_next (&pattern_def_si);
6084 }
6085
6086 if (!gsi_end_p (pattern_def_si))
6087 {
6088 if (dump_enabled_p ())
6089 {
6090 dump_printf_loc (MSG_NOTE, vect_location,
6091 "==> vectorizing pattern def "
6092 "stmt: ");
6093 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
6094 pattern_def_stmt, 0);
6095 dump_printf (MSG_NOTE, "\n");
6096 }
6097
6098 stmt = pattern_def_stmt;
6099 stmt_info = pattern_def_stmt_info;
6100 }
6101 else
6102 {
6103 pattern_def_si = gsi_none ();
6104 transform_pattern_stmt = false;
6105 }
6106 }
6107 else
6108 transform_pattern_stmt = false;
6109 }
6110
6111 if (STMT_VINFO_VECTYPE (stmt_info))
6112 {
6113 unsigned int nunits
6114 = (unsigned int)
6115 TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
6116 if (!STMT_SLP_TYPE (stmt_info)
6117 && nunits != (unsigned int) vectorization_factor
6118 && dump_enabled_p ())
6119 /* For SLP, VF is set according to the unrolling factor, and not
6120 to the vector size, hence this note is not valid for SLP. */
6121 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6122 }
6123
6124 /* SLP. Schedule all the SLP instances when the first SLP stmt is
6125 reached. */
6126 if (STMT_SLP_TYPE (stmt_info))
6127 {
6128 if (!slp_scheduled)
6129 {
6130 slp_scheduled = true;
6131
6132 if (dump_enabled_p ())
6133 dump_printf_loc (MSG_NOTE, vect_location,
6134 "=== scheduling SLP instances ===\n");
6135
6136 vect_schedule_slp (loop_vinfo, NULL);
6137 }
6138
6139 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
6140 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
6141 {
6142 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6143 {
6144 pattern_def_seq = NULL;
6145 gsi_next (&si);
6146 }
6147 continue;
6148 }
6149 }
6150
6151 /* -------- vectorize statement ------------ */
6152 if (dump_enabled_p ())
6153 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
6154
6155 grouped_store = false;
6156 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
6157 if (is_store)
6158 {
6159 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
6160 {
6161 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6162 interleaving chain was completed - free all the stores in
6163 the chain. */
6164 gsi_next (&si);
6165 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
6166 }
6167 else
6168 {
6169 /* Free the attached stmt_vec_info and remove the stmt. */
6170 gimple store = gsi_stmt (si);
6171 free_stmt_vec_info (store);
6172 unlink_stmt_vdef (store);
6173 gsi_remove (&si, true);
6174 release_defs (store);
6175 }
6176
6177 /* Stores can only appear at the end of pattern statements. */
6178 gcc_assert (!transform_pattern_stmt);
6179 pattern_def_seq = NULL;
6180 }
6181 else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6182 {
6183 pattern_def_seq = NULL;
6184 gsi_next (&si);
6185 }
6186 } /* stmts in BB */
6187 } /* BBs in loop */
6188
6189 slpeel_make_loop_iterate_ntimes (loop, ratio);
6190
6191 /* Reduce loop iterations by the vectorization factor. */
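/* E.g. (illustrative): an upper bound of 1000 iterations becomes 125
   with VF = 8; if peeling for gaps is required, one more iteration is
   guaranteed to execute in the scalar epilog, so the bound drops to
   124.  */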
6192 scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
6193 expected_iterations / vectorization_factor);
6194 loop->nb_iterations_upper_bound
6195 = wi::udiv_floor (loop->nb_iterations_upper_bound, vectorization_factor);
6196 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6197 && loop->nb_iterations_upper_bound != 0)
6198 loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - 1;
6199 if (loop->any_estimate)
6200 {
6201 loop->nb_iterations_estimate
6202 = wi::udiv_floor (loop->nb_iterations_estimate, vectorization_factor);
6203 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6204 && loop->nb_iterations_estimate != 0)
6205 loop->nb_iterations_estimate = loop->nb_iterations_estimate - 1;
6206 }
6207
6208 if (dump_enabled_p ())
6209 {
6210 dump_printf_loc (MSG_NOTE, vect_location,
6211 "LOOP VECTORIZED\n");
6212 if (loop->inner)
6213 dump_printf_loc (MSG_NOTE, vect_location,
6214 "OUTER LOOP VECTORIZED\n");
6215 dump_printf (MSG_NOTE, "\n");
6216 }
6217 }