/* Loop Vectorization
   Copyright (C) 2003-2014 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   Ira Rosen <irar@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "dumpfile.h"
#include "tm.h"
#include "tree.h"
#include "stor-layout.h"
#include "predict.h"
#include "vec.h"
#include "hashtab.h"
#include "hash-set.h"
#include "machmode.h"
#include "hard-reg-set.h"
#include "input.h"
#include "function.h"
#include "dominance.h"
#include "cfg.h"
#include "cfganal.h"
#include "basic-block.h"
#include "gimple-pretty-print.h"
#include "tree-ssa-alias.h"
#include "internal-fn.h"
#include "gimple-expr.h"
#include "is-a.h"
#include "gimple.h"
#include "gimplify.h"
#include "gimple-iterator.h"
#include "gimplify-me.h"
#include "gimple-ssa.h"
#include "tree-phinodes.h"
#include "ssa-iterators.h"
#include "stringpool.h"
#include "tree-ssanames.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-ssa-loop-manip.h"
#include "tree-ssa-loop-niter.h"
#include "tree-pass.h"
#include "cfgloop.h"
#include "expr.h"
#include "recog.h"
#include "optabs.h"
#include "params.h"
#include "diagnostic-core.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "target.h"

/* Loop Vectorization Pass.

   This pass tries to vectorize loops.

   For example, the vectorizer transforms the following simple loop:

        short a[N]; short b[N]; short c[N]; int i;

        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   as if it were manually vectorized by rewriting the source code into:

        typedef int __attribute__((mode(V8HI))) v8hi;
        short a[N]; short b[N]; short c[N]; int i;
        v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
        v8hi va, vb, vc;

        for (i=0; i<N/8; i++){
          vb = pb[i];
          vc = pc[i];
          va = vb + vc;
          pa[i] = va;
        }

   The main entry to this pass is vectorize_loops(), in which
   the vectorizer applies a set of analyses on a given set of loops,
   followed by the actual vectorization transformation for the loops that
   had successfully passed the analysis phase.
   Throughout this pass we make a distinction between two types of
   data: scalars (which are represented by SSA_NAMES), and memory references
   ("data-refs").  These two types of data require different handling both
   during analysis and transformation.  The types of data-refs that the
   vectorizer currently supports are ARRAY_REFS whose base is an array DECL
   (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
   accesses are required to have a simple (consecutive) access pattern.

   Analysis phase:
   ===============
   The driver for the analysis phase is vect_analyze_loop().
   It applies a set of analyses, some of which rely on the scalar evolution
   analyzer (scev) developed by Sebastian Pop.

   During the analysis phase the vectorizer records some information
   per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
   loop, as well as general information about the loop as a whole, which is
   recorded in a "loop_vec_info" struct attached to each loop.

   Transformation phase:
   =====================
   The loop transformation phase scans all the stmts in the loop, and
   creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
   the loop that needs to be vectorized.  It inserts the vector code sequence
   just before the scalar stmt S, and records a pointer to the vector code
   in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
   attached to S).  This pointer will be used for the vectorization of
   following stmts which use the def of stmt S.  Stmt S is removed if it
   writes to memory; otherwise, we rely on dead code elimination for
   removing it.

   For example, say stmt S1 was vectorized into stmt VS1:

   VS1: vb = px[i];
   S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   S2:  a = b;

   To vectorize stmt S2, the vectorizer first finds the stmt that defines
   the operand 'b' (S1), and gets the relevant vector def 'vb' from the
   vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
   resulting sequence would be:

   VS1: vb = px[i];
   S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
   VS2: va = vb;
   S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2

   Operands that are not SSA_NAMEs are data-refs that appear in
   load/store operations (like 'x[i]' in S1), and are handled differently.

   Target modeling:
   =================
   Currently the only target specific information that is used is the
   size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
   Targets that can support different sizes of vectors will, for now, need
   to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
   flexibility will be added in the future.

   Since we only vectorize operations whose vector form can be
   expressed using existing tree codes, to verify that an operation is
   supported, the vectorizer checks the relevant optab at the relevant
   machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
   the value found is CODE_FOR_nothing, then there's no target support, and
   we can't vectorize the stmt.

   For additional information on this project see:
   http://gcc.gnu.org/projects/tree-ssa/vectorization.html
*/
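
/* As a hedged illustration of the optab check described above (this
   fragment is illustrative only and is not part of this file; the real
   checks live in tree-vect-stmts.c, e.g. in vectorizable_operation).
   Here VECTYPE is assumed to be an already computed vector type such as
   V8HI, and a CODE_FOR_nothing result means the stmt can't be vectorized:

     optab optab = optab_for_tree_code (PLUS_EXPR, vectype, optab_default);
     if (!optab
         || optab_handler (optab, TYPE_MODE (vectype)) == CODE_FOR_nothing)
       return false;  */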

static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);

/* Function vect_determine_vectorization_factor

   Determine the vectorization factor (VF).  VF is the number of data elements
   that are operated upon in parallel in a single iteration of the vectorized
   loop.  For example, when vectorizing a loop that operates on 4byte elements,
   on a target with vector size (VS) 16byte, the VF is set to 4, since 4
   elements can fit in a single vector register.

   We currently support vectorization of loops in which all types operated upon
   are of the same size.  Therefore this function currently sets VF according to
   the size of the types operated upon, and fails if there are multiple sizes
   in the loop.

   VF is also the factor by which the loop iterations are strip-mined, e.g.:
   original loop:
        for (i=0; i<N; i++){
          a[i] = b[i] + c[i];
        }

   vectorized loop:
        for (i=0; i<N; i+=VF){
          a[i:VF] = b[i:VF] + c[i:VF];
        }
*/

static bool
vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  gimple_stmt_iterator si;
  unsigned int vectorization_factor = 0;
  tree scalar_type;
  gimple phi;
  tree vectype;
  unsigned int nunits;
  stmt_vec_info stmt_info;
  int i;
  HOST_WIDE_INT dummy;
  gimple stmt, pattern_stmt = NULL;
  gimple_seq pattern_def_seq = NULL;
  gimple_stmt_iterator pattern_def_si = gsi_none ();
  bool analyze_pattern_stmt = false;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_determine_vectorization_factor ===\n");

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        {
          phi = gsi_stmt (si);
          stmt_info = vinfo_for_stmt (phi);
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
              dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
              dump_printf (MSG_NOTE, "\n");
            }

          gcc_assert (stmt_info);

          if (STMT_VINFO_RELEVANT_P (stmt_info))
            {
              gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
              scalar_type = TREE_TYPE (PHI_RESULT (phi));

              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_NOTE, vect_location,
                                   "get vectype for scalar type: ");
                  dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
                  dump_printf (MSG_NOTE, "\n");
                }

              vectype = get_vectype_for_scalar_type (scalar_type);
              if (!vectype)
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "not vectorized: unsupported "
                                       "data-type ");
                      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                         scalar_type);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  return false;
                }
              STMT_VINFO_VECTYPE (stmt_info) = vectype;

              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
                  dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
                  dump_printf (MSG_NOTE, "\n");
                }

              nunits = TYPE_VECTOR_SUBPARTS (vectype);
              if (dump_enabled_p ())
                dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
                                 nunits);

              if (!vectorization_factor
                  || (nunits > vectorization_factor))
                vectorization_factor = nunits;
            }
        }

      for (si = gsi_start_bb (bb); !gsi_end_p (si) || analyze_pattern_stmt;)
        {
          tree vf_vectype;

          if (analyze_pattern_stmt)
            stmt = pattern_stmt;
          else
            stmt = gsi_stmt (si);

          stmt_info = vinfo_for_stmt (stmt);

          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "==> examining statement: ");
              dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
              dump_printf (MSG_NOTE, "\n");
            }

          gcc_assert (stmt_info);

          /* Skip stmts which do not need to be vectorized.  */
          if ((!STMT_VINFO_RELEVANT_P (stmt_info)
               && !STMT_VINFO_LIVE_P (stmt_info))
              || gimple_clobber_p (stmt))
            {
              if (STMT_VINFO_IN_PATTERN_P (stmt_info)
                  && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
                  && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
                      || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
                {
                  stmt = pattern_stmt;
                  stmt_info = vinfo_for_stmt (pattern_stmt);
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_NOTE, vect_location,
                                       "==> examining pattern statement: ");
                      dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
                      dump_printf (MSG_NOTE, "\n");
                    }
                }
              else
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
                  gsi_next (&si);
                  continue;
                }
            }
          else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
            analyze_pattern_stmt = true;

          /* If a pattern statement has def stmts, analyze them too.  */
          if (is_pattern_stmt_p (stmt_info))
            {
              if (pattern_def_seq == NULL)
                {
                  pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
                  pattern_def_si = gsi_start (pattern_def_seq);
                }
              else if (!gsi_end_p (pattern_def_si))
                gsi_next (&pattern_def_si);
              if (pattern_def_seq != NULL)
                {
                  gimple pattern_def_stmt = NULL;
                  stmt_vec_info pattern_def_stmt_info = NULL;

                  while (!gsi_end_p (pattern_def_si))
                    {
                      pattern_def_stmt = gsi_stmt (pattern_def_si);
                      pattern_def_stmt_info
                        = vinfo_for_stmt (pattern_def_stmt);
                      if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
                          || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
                        break;
                      gsi_next (&pattern_def_si);
                    }

                  if (!gsi_end_p (pattern_def_si))
                    {
                      if (dump_enabled_p ())
                        {
                          dump_printf_loc (MSG_NOTE, vect_location,
                                           "==> examining pattern def stmt: ");
                          dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
                                            pattern_def_stmt, 0);
                          dump_printf (MSG_NOTE, "\n");
                        }

                      stmt = pattern_def_stmt;
                      stmt_info = pattern_def_stmt_info;
                    }
                  else
                    {
                      pattern_def_si = gsi_none ();
                      analyze_pattern_stmt = false;
                    }
                }
              else
                analyze_pattern_stmt = false;
            }

          if (gimple_get_lhs (stmt) == NULL_TREE
              /* MASK_STORE has no lhs, but is ok.  */
              && (!is_gimple_call (stmt)
                  || !gimple_call_internal_p (stmt)
                  || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
            {
              if (is_gimple_call (stmt))
                {
                  /* Ignore calls with no lhs.  These must be calls to
                     #pragma omp simd functions, and what vectorization factor
                     it really needs can't be determined until
                     vectorizable_simd_clone_call.  */
                  if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
                    {
                      pattern_def_seq = NULL;
                      gsi_next (&si);
                    }
                  continue;
                }
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "not vectorized: irregular stmt.");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
                                    0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              return false;
            }

          if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "not vectorized: vector stmt in loop:");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              return false;
            }

          if (STMT_VINFO_VECTYPE (stmt_info))
            {
              /* The only case when a vectype had already been set is for
                 stmts that contain a dataref, or for "pattern-stmts" (stmts
                 generated by the vectorizer to represent/replace a certain
                 idiom).  */
              gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
                          || is_pattern_stmt_p (stmt_info)
                          || !gsi_end_p (pattern_def_si));
              vectype = STMT_VINFO_VECTYPE (stmt_info);
            }
          else
            {
              gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
              if (is_gimple_call (stmt)
                  && gimple_call_internal_p (stmt)
                  && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
                scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
              else
                scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_NOTE, vect_location,
                                   "get vectype for scalar type: ");
                  dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
                  dump_printf (MSG_NOTE, "\n");
                }
              vectype = get_vectype_for_scalar_type (scalar_type);
              if (!vectype)
                {
                  if (dump_enabled_p ())
                    {
                      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                       "not vectorized: unsupported "
                                       "data-type ");
                      dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                         scalar_type);
                      dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                    }
                  return false;
                }

              STMT_VINFO_VECTYPE (stmt_info) = vectype;

              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
                  dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
                  dump_printf (MSG_NOTE, "\n");
                }
            }

          /* The vectorization factor is according to the smallest
             scalar type (or the largest vector size, but we only
             support one vector size per loop).  */
          scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
                                                       &dummy);
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "get vectype for scalar type: ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
              dump_printf (MSG_NOTE, "\n");
            }
          vf_vectype = get_vectype_for_scalar_type (scalar_type);
          if (!vf_vectype)
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "not vectorized: unsupported data-type ");
                  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                     scalar_type);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              return false;
            }

          if ((GET_MODE_SIZE (TYPE_MODE (vectype))
               != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "not vectorized: different sized vector "
                                   "types in statement, ");
                  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                     vectype);
                  dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
                  dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
                                     vf_vectype);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              return false;
            }

          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
              dump_printf (MSG_NOTE, "\n");
            }

          nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
          if (!vectorization_factor
              || (nunits > vectorization_factor))
            vectorization_factor = nunits;

          if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
            {
              pattern_def_seq = NULL;
              gsi_next (&si);
            }
        }
    }

  /* TODO: Analyze cost.  Decide if worthwhile to vectorize.  */
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
                     vectorization_factor);
  if (vectorization_factor <= 1)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: unsupported data-type\n");
      return false;
    }
  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;

  return true;
}


/* Function vect_is_simple_iv_evolution.

   FORNOW: A simple evolution of an induction variable in the loop is
   considered a polynomial evolution.  */
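
/* For example (an illustrative note): for the loop

     for (i = 0; i < n; i++)
       p += 4;

   the scalar evolution of 'p' is the affine chrec {p_0, +, 4}_1, whose
   initial condition is p_0 and whose step is 4; this function accepts it.
   A chrec whose step is itself a chrec, e.g. {0, +, {0, +, 1}_1}_1, is
   rejected as a polynomial of degree >= 2.  */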

static bool
vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
                             tree * step)
{
  tree init_expr;
  tree step_expr;
  tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
  basic_block bb;

  /* When there is no evolution in this loop, the evolution function
     is not "simple".  */
  if (evolution_part == NULL_TREE)
    return false;

  /* When the evolution is a polynomial of degree >= 2
     the evolution function is not "simple".  */
  if (tree_is_chrec (evolution_part))
    return false;

  step_expr = evolution_part;
  init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "step: ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
      dump_printf (MSG_NOTE, ", init: ");
      dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
      dump_printf (MSG_NOTE, "\n");
    }

  *init = init_expr;
  *step = step_expr;

  if (TREE_CODE (step_expr) != INTEGER_CST
      && (TREE_CODE (step_expr) != SSA_NAME
          || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
              && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
          || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
              && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
                  || !flag_associative_math)))
      && (TREE_CODE (step_expr) != REAL_CST
          || !flag_associative_math))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "step unknown.\n");
      return false;
    }

  return true;
}

/* Function vect_analyze_scalar_cycles_1.

   Examine the cross iteration def-use cycles of scalar variables
   in LOOP.  LOOP_VINFO represents the loop that is now being
   considered for vectorization (can be LOOP, or an outer-loop
   enclosing LOOP).  */

static void
vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
{
  basic_block bb = loop->header;
  tree init, step;
  auto_vec<gimple, 64> worklist;
  gimple_stmt_iterator gsi;
  bool double_reduc;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_analyze_scalar_cycles ===\n");

  /* First - identify all inductions.  Reduction detection assumes that all the
     inductions have been identified, therefore, this order must not be
     changed.  */
  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple phi = gsi_stmt (gsi);
      tree access_fn = NULL;
      tree def = PHI_RESULT (phi);
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
          dump_printf (MSG_NOTE, "\n");
        }

      /* Skip virtual phis.  The data dependences that are associated with
         virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
      if (virtual_operand_p (def))
        continue;

      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;

      /* Analyze the evolution function.  */
      access_fn = analyze_scalar_evolution (loop, def);
      if (access_fn)
        {
          STRIP_NOPS (access_fn);
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location,
                               "Access function of PHI: ");
              dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
              dump_printf (MSG_NOTE, "\n");
            }
          STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
            = evolution_part_in_loop_num (access_fn, loop->num);
        }

      if (!access_fn
          || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
          || (LOOP_VINFO_LOOP (loop_vinfo) != loop
              && TREE_CODE (step) != INTEGER_CST))
        {
          worklist.safe_push (phi);
          continue;
        }

      gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
    }


  /* Second - identify all reductions and nested cycles.  */
  while (worklist.length () > 0)
    {
      gimple phi = worklist.pop ();
      tree def = PHI_RESULT (phi);
      stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
      gimple reduc_stmt;
      bool nested_cycle;

      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
          dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
          dump_printf (MSG_NOTE, "\n");
        }

      gcc_assert (!virtual_operand_p (def)
                  && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);

      nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
      reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
                                                &double_reduc);
      if (reduc_stmt)
        {
          if (double_reduc)
            {
              if (dump_enabled_p ())
                dump_printf_loc (MSG_NOTE, vect_location,
                                 "Detected double reduction.\n");

              STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
              STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
                vect_double_reduction_def;
            }
          else
            {
              if (nested_cycle)
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_NOTE, vect_location,
                                     "Detected vectorizable nested cycle.\n");

                  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
                  STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
                    vect_nested_cycle;
                }
              else
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_NOTE, vect_location,
                                     "Detected reduction.\n");

                  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
                  STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
                    vect_reduction_def;
                  /* Store the reduction cycles for possible vectorization in
                     loop-aware SLP.  */
                  LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
                }
            }
        }
      else
        if (dump_enabled_p ())
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "Unknown def-use cycle pattern.\n");
    }
}


/* Function vect_analyze_scalar_cycles.

   Examine the cross iteration def-use cycles of scalar variables, by
   analyzing the loop-header PHIs of scalar variables.  Classify each
   cycle as one of the following: invariant, induction, reduction, unknown.
   We do that for the loop represented by LOOP_VINFO, and also for its
   inner-loop, if it exists.
   Examples for scalar cycles:

   Example1: reduction:

   loop1:
   for (i=0; i<N; i++)
      sum += a[i];

   Example2: induction:

   loop2:
   for (i=0; i<N; i++)
      a[i] = i;  */
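
/* A further illustrative example (not part of the original comment):
   a double reduction, where the reduction is carried across both loops
   of a nest, as in

   loop3:
   for (i=0; i<N; i++)
     for (j=0; j<M; j++)
       sum += a[i][j];

   is classified as vect_double_reduction_def when the outer loop is the
   one considered for vectorization.  */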

static void
vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  vect_analyze_scalar_cycles_1 (loop_vinfo, loop);

  /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
     Reductions in such an inner-loop therefore have different properties than
     the reductions in the nest that gets vectorized:
     1. When vectorized, they are executed in the same order as in the original
        scalar loop, so we can't change the order of computation when
        vectorizing them.
     2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
        current checks are too strict.  */

  if (loop->inner)
    vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
}


/* Function vect_get_loop_niters.

   Determine how many iterations the loop executes and place the count
   in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
   in NUMBER_OF_ITERATIONSM1.

   Return the loop exit condition.  */
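
/* For example (an illustrative note): if the loop body executes N times,
   the latch edge is taken N-1 times, so NUMBER_OF_ITERATIONSM1 is N-1 and
   NUMBER_OF_ITERATIONS, the header execution count, is (N-1) + 1 = N.  */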

static gimple
vect_get_loop_niters (struct loop *loop, tree *number_of_iterations,
                      tree *number_of_iterationsm1)
{
  tree niters;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== get_loop_niters ===\n");

  niters = number_of_latch_executions (loop);
  *number_of_iterationsm1 = niters;

  /* We want the number of loop header executions which is the number
     of latch executions plus one.
     ??? For UINT_MAX latch executions this number overflows to zero
     for loops like do { n++; } while (n != 0);  */
  if (niters && !chrec_contains_undetermined (niters))
    niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters), unshare_expr (niters),
                          build_int_cst (TREE_TYPE (niters), 1));
  *number_of_iterations = niters;

  return get_loop_exit_condition (loop);
}


/* Function bb_in_loop_p

   Used as predicate for dfs order traversal of the loop bbs.  */

static bool
bb_in_loop_p (const_basic_block bb, const void *data)
{
  const struct loop *const loop = (const struct loop *)data;
  if (flow_bb_inside_loop_p (loop, bb))
    return true;
  return false;
}


/* Function new_loop_vec_info.

   Create and initialize a new loop_vec_info struct for LOOP, as well as
   stmt_vec_info structs for all the stmts in LOOP.  */

static loop_vec_info
new_loop_vec_info (struct loop *loop)
{
  loop_vec_info res;
  basic_block *bbs;
  gimple_stmt_iterator si;
  unsigned int i, nbbs;

  res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
  LOOP_VINFO_LOOP (res) = loop;

  bbs = get_loop_body (loop);

  /* Create/Update stmt_info for all stmts in the loop.  */
  for (i = 0; i < loop->num_nodes; i++)
    {
      basic_block bb = bbs[i];

      /* BBs in a nested inner-loop will have been already processed (because
         we will have called vect_analyze_loop_form for any nested inner-loop).
         Therefore, for stmts in an inner-loop we just want to update the
         STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
         loop_info of the outer-loop we are currently considering to vectorize
         (instead of the loop_info of the inner-loop).
         For stmts in other BBs we need to create a stmt_info from scratch.  */
      if (bb->loop_father != loop)
        {
          /* Inner-loop bb.  */
          gcc_assert (loop->inner && bb->loop_father == loop->inner);
          for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple phi = gsi_stmt (si);
              stmt_vec_info stmt_info = vinfo_for_stmt (phi);
              loop_vec_info inner_loop_vinfo =
                STMT_VINFO_LOOP_VINFO (stmt_info);
              gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
              STMT_VINFO_LOOP_VINFO (stmt_info) = res;
            }
          for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple stmt = gsi_stmt (si);
              stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
              loop_vec_info inner_loop_vinfo =
                STMT_VINFO_LOOP_VINFO (stmt_info);
              gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
              STMT_VINFO_LOOP_VINFO (stmt_info) = res;
            }
        }
      else
        {
          /* bb in current nest.  */
          for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple phi = gsi_stmt (si);
              gimple_set_uid (phi, 0);
              set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
            }

          for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple stmt = gsi_stmt (si);
              gimple_set_uid (stmt, 0);
              set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
            }
        }
    }

  /* CHECKME: We want to visit all BBs before their successors (except for
     latch blocks, for which this assertion wouldn't hold).  In the simple
     case of the loop forms we allow, a dfs order of the BBs would be the
     same as reversed postorder traversal, so we are safe.  */

  free (bbs);
  bbs = XCNEWVEC (basic_block, loop->num_nodes);
  nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
                             bbs, loop->num_nodes, loop);
  gcc_assert (nbbs == loop->num_nodes);

  LOOP_VINFO_BBS (res) = bbs;
  LOOP_VINFO_NITERSM1 (res) = NULL;
  LOOP_VINFO_NITERS (res) = NULL;
  LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
  LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
  LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
  LOOP_VINFO_VECTORIZABLE_P (res) = 0;
  LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
  LOOP_VINFO_VECT_FACTOR (res) = 0;
  LOOP_VINFO_LOOP_NEST (res).create (3);
  LOOP_VINFO_DATAREFS (res).create (10);
  LOOP_VINFO_DDRS (res).create (10 * 10);
  LOOP_VINFO_UNALIGNED_DR (res) = NULL;
  LOOP_VINFO_MAY_MISALIGN_STMTS (res).create (
      PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
  LOOP_VINFO_MAY_ALIAS_DDRS (res).create (
      PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
  LOOP_VINFO_GROUPED_STORES (res).create (10);
  LOOP_VINFO_REDUCTIONS (res).create (10);
  LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
  LOOP_VINFO_SLP_INSTANCES (res).create (10);
  LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
  LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
  LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
  LOOP_VINFO_PEELING_FOR_NITER (res) = false;
  LOOP_VINFO_OPERANDS_SWAPPED (res) = false;

  return res;
}


/* Function destroy_loop_vec_info.

   Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
   stmts in the loop.  */

void
destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
{
  struct loop *loop;
  basic_block *bbs;
  int nbbs;
  gimple_stmt_iterator si;
  int j;
  vec<slp_instance> slp_instances;
  slp_instance instance;
  bool swapped;

  if (!loop_vinfo)
    return;

  loop = LOOP_VINFO_LOOP (loop_vinfo);

  bbs = LOOP_VINFO_BBS (loop_vinfo);
  nbbs = clean_stmts ? loop->num_nodes : 0;
  swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);

  for (j = 0; j < nbbs; j++)
    {
      basic_block bb = bbs[j];
      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        free_stmt_vec_info (gsi_stmt (si));

      for (si = gsi_start_bb (bb); !gsi_end_p (si); )
        {
          gimple stmt = gsi_stmt (si);

          /* We may have broken canonical form by moving a constant
             into RHS1 of a commutative op.  Fix such occurrences.  */
          if (swapped && is_gimple_assign (stmt))
            {
              enum tree_code code = gimple_assign_rhs_code (stmt);

              if ((code == PLUS_EXPR
                   || code == POINTER_PLUS_EXPR
                   || code == MULT_EXPR)
                  && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
                swap_ssa_operands (stmt,
                                   gimple_assign_rhs1_ptr (stmt),
                                   gimple_assign_rhs2_ptr (stmt));
            }

          /* Free stmt_vec_info.  */
          free_stmt_vec_info (stmt);
          gsi_next (&si);
        }
    }

  free (LOOP_VINFO_BBS (loop_vinfo));
  vect_destroy_datarefs (loop_vinfo, NULL);
  free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
  LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
  LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
  LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
  slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  FOR_EACH_VEC_ELT (slp_instances, j, instance)
    vect_free_slp_instance (instance);

  LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
  LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
  LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
  LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();

  delete LOOP_VINFO_PEELING_HTAB (loop_vinfo);
  LOOP_VINFO_PEELING_HTAB (loop_vinfo) = NULL;

  destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));

  free (loop_vinfo);
  loop->aux = NULL;
}


/* Function vect_analyze_loop_1.

   Apply a set of analyses on LOOP, and create a loop_vec_info struct
   for it.  The different analyses will record information in the
   loop_vec_info struct.  This is a subset of the analyses applied in
   vect_analyze_loop, to be applied on an inner-loop nested in the loop
   that is now considered for (outer-loop) vectorization.  */

static loop_vec_info
vect_analyze_loop_1 (struct loop *loop)
{
  loop_vec_info loop_vinfo;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "===== analyze_loop_nest_1 =====\n");

  /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.).  */

  loop_vinfo = vect_analyze_loop_form (loop);
  if (!loop_vinfo)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "bad inner-loop form.\n");
      return NULL;
    }

  return loop_vinfo;
}


/* Function vect_analyze_loop_form.

   Verify that certain CFG restrictions hold, including:
   - the loop has a pre-header
   - the loop has a single entry and exit
   - the loop exit condition is simple enough, and the number of iterations
     can be analyzed (a countable loop).  */
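
/* For instance (an illustrative note): a loop such as

     for (i = 0; i < n; i++)
       a[i] = b[i];

   satisfies these restrictions, whereas a loop with an early "break"
   (more than one exit edge) or a "while (*p++)" loop (no computable
   iteration count) is rejected below.  */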

loop_vec_info
vect_analyze_loop_form (struct loop *loop)
{
  loop_vec_info loop_vinfo;
  gimple loop_cond;
  tree number_of_iterations = NULL, number_of_iterationsm1 = NULL;
  loop_vec_info inner_loop_vinfo = NULL;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_analyze_loop_form ===\n");

  /* Different restrictions apply when we are considering an inner-most loop,
     vs. an outer (nested) loop.
     (FORNOW. May want to relax some of these restrictions in the future).  */

  if (!loop->inner)
    {
      /* Inner-most loop.  We currently require that the number of BBs is
         exactly 2 (the header and latch).  Vectorizable inner-most loops
         look like this:

                        (pre-header)
                           |
                          header <--------+
                           | |            |
                           | +--> latch --+
                           |
                        (exit-bb)  */

      if (loop->num_nodes != 2)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: control flow in loop.\n");
          return NULL;
        }

      if (empty_block_p (loop->header))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: empty loop.\n");
          return NULL;
        }
    }
  else
    {
      struct loop *innerloop = loop->inner;
      edge entryedge;

      /* Nested loop.  We currently require that the loop is doubly-nested,
         contains a single inner loop, and the number of BBs is exactly 5.
         Vectorizable outer-loops look like this:

                        (pre-header)
                           |
                          header <---+
                           |         |
                          inner-loop |
                           |         |
                          tail ------+
                           |
                        (exit-bb)

         The inner-loop has the properties expected of inner-most loops
         as described above.  */

      if ((loop->inner)->inner || (loop->inner)->next)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: multiple nested loops.\n");
          return NULL;
        }

      /* Analyze the inner-loop.  */
      inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
      if (!inner_loop_vinfo)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: Bad inner loop.\n");
          return NULL;
        }

      if (!expr_invariant_in_loop_p (loop,
                                     LOOP_VINFO_NITERS (inner_loop_vinfo)))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: inner-loop count not"
                             " invariant.\n");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }

      if (loop->num_nodes != 5)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: control flow in loop.\n");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }

      gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
      entryedge = EDGE_PRED (innerloop->header, 0);
      if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
        entryedge = EDGE_PRED (innerloop->header, 1);

      if (entryedge->src != loop->header
          || !single_exit (innerloop)
          || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: unsupported outerloop form.\n");
          destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }

      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Considering outer-loop vectorization.\n");
    }

  if (!single_exit (loop)
      || EDGE_COUNT (loop->header->preds) != 2)
    {
      if (dump_enabled_p ())
        {
          if (!single_exit (loop))
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: multiple exits.\n");
          else if (EDGE_COUNT (loop->header->preds) != 2)
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: too many incoming edges.\n");
        }
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  /* We assume that the loop exit condition is at the end of the loop, i.e.,
     that the loop is represented as a do-while (with a proper if-guard
     before the loop if needed), where the loop header contains all the
     executable statements, and the latch is empty.  */
  if (!empty_block_p (loop->latch)
      || !gimple_seq_empty_p (phi_nodes (loop->latch)))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: latch block not empty.\n");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  /* Make sure there exists a single-predecessor exit bb:  */
  if (!single_pred_p (single_exit (loop)->dest))
    {
      edge e = single_exit (loop);
      if (!(e->flags & EDGE_ABNORMAL))
        {
          split_loop_exit_edge (e);
          if (dump_enabled_p ())
            dump_printf (MSG_NOTE, "split exit edge.\n");
        }
      else
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: abnormal loop exit edge.\n");
          if (inner_loop_vinfo)
            destroy_loop_vec_info (inner_loop_vinfo, true);
          return NULL;
        }
    }

  loop_cond = vect_get_loop_niters (loop, &number_of_iterations,
                                    &number_of_iterationsm1);
  if (!loop_cond)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: complicated exit condition.\n");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  if (!number_of_iterations
      || chrec_contains_undetermined (number_of_iterations))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: number of iterations cannot be "
                         "computed.\n");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  if (integer_zerop (number_of_iterations))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: number of iterations = 0.\n");
      if (inner_loop_vinfo)
        destroy_loop_vec_info (inner_loop_vinfo, true);
      return NULL;
    }

  loop_vinfo = new_loop_vec_info (loop);
  LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
  LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;

  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
    {
      if (dump_enabled_p ())
        {
          dump_printf_loc (MSG_NOTE, vect_location,
                           "Symbolic number of iterations is ");
          dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
          dump_printf (MSG_NOTE, "\n");
        }
    }

  STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;

  /* CHECKME: May want to keep it around in the future.  */
  if (inner_loop_vinfo)
    destroy_loop_vec_info (inner_loop_vinfo, false);

  gcc_assert (!loop->aux);
  loop->aux = loop_vinfo;
  return loop_vinfo;
}


/* Function vect_analyze_loop_operations.

   Scan the loop stmts and make sure they are all vectorizable.  */

static bool
vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
{
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  gimple_stmt_iterator si;
  unsigned int vectorization_factor = 0;
  int i;
  gimple phi;
  stmt_vec_info stmt_info;
  bool need_to_vectorize = false;
  int min_profitable_iters;
  int min_scalar_loop_bound;
  unsigned int th;
  bool only_slp_in_loop = true, ok;
  HOST_WIDE_INT max_niter;
  HOST_WIDE_INT estimated_niter;
  int min_profitable_estimate;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "=== vect_analyze_loop_operations ===\n");

  gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
  vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  if (slp)
    {
      /* If all the stmts in the loop can be SLPed, we perform only SLP, and
         the vectorization factor of the loop is the unrolling factor required
         by the SLP instances.  If that unrolling factor is 1, we say that we
         perform pure SLP on the loop; cross-iteration parallelism is not
         exploited.  */
      for (i = 0; i < nbbs; i++)
        {
          basic_block bb = bbs[i];
          for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
            {
              gimple stmt = gsi_stmt (si);
              stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
              gcc_assert (stmt_info);
              if ((STMT_VINFO_RELEVANT_P (stmt_info)
                   || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
                  && !PURE_SLP_STMT (stmt_info))
                /* STMT needs both SLP and loop-based vectorization.  */
                only_slp_in_loop = false;
            }
        }

      if (only_slp_in_loop)
        vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
      else
        vectorization_factor = least_common_multiple (vectorization_factor,
                                LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));

      LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "Updating vectorization factor to %d\n",
                         vectorization_factor);
    }

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
        {
          phi = gsi_stmt (si);
          ok = true;

          stmt_info = vinfo_for_stmt (phi);
          if (dump_enabled_p ())
            {
              dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
              dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
              dump_printf (MSG_NOTE, "\n");
            }

          /* Inner-loop loop-closed exit phi in outer-loop vectorization
             (i.e., a phi in the tail of the outer-loop).  */
          if (! is_loop_header_bb_p (bb))
            {
              /* FORNOW: we currently don't support the case that these phis
                 are not used in the outerloop (unless it is double reduction,
                 i.e., this phi is vect_reduction_def), because this case
                 requires to actually do something here.  */
              if ((!STMT_VINFO_RELEVANT_P (stmt_info)
                   || STMT_VINFO_LIVE_P (stmt_info))
                  && STMT_VINFO_DEF_TYPE (stmt_info)
                     != vect_double_reduction_def)
                {
                  if (dump_enabled_p ())
                    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                     "Unsupported loop-closed phi in "
                                     "outer-loop.\n");
                  return false;
                }

              /* If PHI is used in the outer loop, we check that its operand
                 is defined in the inner loop.  */
              if (STMT_VINFO_RELEVANT_P (stmt_info))
                {
                  tree phi_op;
                  gimple op_def_stmt;

                  if (gimple_phi_num_args (phi) != 1)
                    return false;

                  phi_op = PHI_ARG_DEF (phi, 0);
                  if (TREE_CODE (phi_op) != SSA_NAME)
                    return false;

                  op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
                  if (gimple_nop_p (op_def_stmt)
                      || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
                      || !vinfo_for_stmt (op_def_stmt))
                    return false;

                  if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
                        != vect_used_in_outer
                      && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
                           != vect_used_in_outer_by_reduction)
                    return false;
                }

              continue;
            }

          gcc_assert (stmt_info);

          if (STMT_VINFO_LIVE_P (stmt_info))
            {
              /* FORNOW: not yet supported.  */
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "not vectorized: value used after loop.\n");
              return false;
            }

          if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
              && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
            {
              /* A scalar-dependence cycle that we don't support.  */
              if (dump_enabled_p ())
                dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                 "not vectorized: scalar dependence cycle.\n");
              return false;
            }

          if (STMT_VINFO_RELEVANT_P (stmt_info))
            {
              need_to_vectorize = true;
              if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
                ok = vectorizable_induction (phi, NULL, NULL);
            }

          if (!ok)
            {
              if (dump_enabled_p ())
                {
                  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                                   "not vectorized: relevant phi not "
                                   "supported: ");
                  dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
                  dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
                }
              return false;
            }
        }

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
        {
          gimple stmt = gsi_stmt (si);
          if (!gimple_clobber_p (stmt)
              && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
            return false;
        }
    } /* bbs */

  /* All operations in the loop are either irrelevant (deal with loop
     control, or dead), or only used outside the loop and can be moved
     out of the loop (e.g. invariants, inductions).  The loop can be
     optimized away by scalar optimizations.  We're better off not
     touching this loop.  */
  if (!need_to_vectorize)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "All the computation can be taken out of the loop.\n");
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: redundant loop. no profit to "
                         "vectorize.\n");
      return false;
    }

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "vectorization_factor = %d, niters = "
                     HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
                     LOOP_VINFO_INT_NITERS (loop_vinfo));

  if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
       && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
      || ((max_niter = max_stmt_executions_int (loop)) != -1
          && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: iteration count too small.\n");
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: iteration count smaller than "
                         "vectorization factor.\n");
      return false;
    }

  /* Analyze cost.  Decide if worthwhile to vectorize.  */

  /* Once VF is set, SLP costs should be updated since the number of created
     vector stmts depends on VF.  */
  vect_update_slp_costs_according_to_vf (loop_vinfo);

  vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
                                      &min_profitable_estimate);
  LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;

  if (min_profitable_iters < 0)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: vectorization not profitable.\n");
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: vector version will never be "
                         "profitable.\n");
      return false;
    }

  min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
                            * vectorization_factor) - 1);


  /* Use the cost model only if it is more conservative than the user
     specified threshold.  */

  th = (unsigned) min_scalar_loop_bound;
  if (min_profitable_iters
      && (!min_scalar_loop_bound
          || min_profitable_iters > min_scalar_loop_bound))
    th = (unsigned) min_profitable_iters;
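
  /* For instance (an illustrative note, not from the original sources):
     with PARAM_MIN_VECT_LOOP_BOUND == 2 and a vectorization factor of 4,
     MIN_SCALAR_LOOP_BOUND is 2 * 4 - 1 = 7; if the cost model computes
     MIN_PROFITABLE_ITERS == 10, TH becomes 10, the more conservative of
     the two.  */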

  LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: vectorization not profitable.\n");
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "not vectorized: iteration count smaller than user "
                         "specified loop bound parameter or minimum profitable "
                         "iterations (whichever is more conservative).\n");
      return false;
    }

  if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
      && ((unsigned HOST_WIDE_INT) estimated_niter
          <= MAX (th, (unsigned) min_profitable_estimate)))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: estimated iteration count too "
                         "small.\n");
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "not vectorized: estimated iteration count smaller "
                         "than specified loop bound parameter or minimum "
                         "profitable iterations (whichever is more "
                         "conservative).\n");
      return false;
    }

  return true;
}


c4740c5d 1631/* Function vect_analyze_loop_2.
fb85abff 1632
1633 Apply a set of analyses on LOOP, and create a loop_vec_info struct
282bf14c 1634 for it. The different analyses will record information in the
fb85abff 1635 loop_vec_info struct. */
c4740c5d 1636static bool
1637vect_analyze_loop_2 (loop_vec_info loop_vinfo)
fb85abff 1638{
80508571 1639 bool ok, slp = false;
91a74fc6 1640 int max_vf = MAX_VECTORIZATION_FACTOR;
1641 int min_vf = 2;
004a94a5 1642 unsigned int th;
ca91d3f8 1643 unsigned int n_stmts = 0;
fb85abff 1644
fb85abff 1645 /* Find all data references in the loop (which correspond to vdefs/vuses)
91a74fc6 1646 and analyze their evolution in the loop. Also adjust the minimal
1647 vectorization factor according to the loads and stores.
fb85abff 1648
 1649 FORNOW: Handle only simple array references whose
 1650 alignment can be forced, and aligned pointer-references. */
1651
ca91d3f8 1652 ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf, &n_stmts);
fb85abff 1653 if (!ok)
1654 {
6d8fb6cf 1655 if (dump_enabled_p ())
7bd765d4 1656 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 1657 "bad data references.\n");
c4740c5d 1658 return false;
fb85abff 1659 }
1660
68f15e9d 1661 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1662 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1663
1664 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1665 if (!ok)
1666 {
1667 if (dump_enabled_p ())
1668 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 1669 "bad data access.\n");
68f15e9d 1670 return false;
1671 }
1672
fb85abff 1673 /* Classify all cross-iteration scalar data-flow cycles.
1674 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1675
1676 vect_analyze_scalar_cycles (loop_vinfo);
1677
4c0c783a 1678 vect_pattern_recog (loop_vinfo, NULL);
fb85abff 1679
1680 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1681
1682 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1683 if (!ok)
1684 {
6d8fb6cf 1685 if (dump_enabled_p ())
7bd765d4 1686 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 1687 "unexpected pattern.\n");
c4740c5d 1688 return false;
fb85abff 1689 }
1690
91a74fc6 1691 /* Analyze data dependences between the data-refs in the loop
1692 and adjust the maximum vectorization factor according to
1693 the dependences.
1694 FORNOW: fail at the first data dependence that we encounter. */
fb85abff 1695
68f15e9d 1696 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
91a74fc6 1697 if (!ok
1698 || max_vf < min_vf)
fb85abff 1699 {
6d8fb6cf 1700 if (dump_enabled_p ())
7bd765d4 1701 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 1702 "bad data dependence.\n");
c4740c5d 1703 return false;
fb85abff 1704 }
1705
1706 ok = vect_determine_vectorization_factor (loop_vinfo);
1707 if (!ok)
1708 {
6d8fb6cf 1709 if (dump_enabled_p ())
7bd765d4 1710 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 1711 "can't determine vectorization factor.\n");
c4740c5d 1712 return false;
fb85abff 1713 }
91a74fc6 1714 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1715 {
6d8fb6cf 1716 if (dump_enabled_p ())
7bd765d4 1717 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 1718 "bad data dependence.\n");
c4740c5d 1719 return false;
91a74fc6 1720 }
fb85abff 1721
91a74fc6 1722 /* Analyze the alignment of the data-refs in the loop.
1723 Fail if a data reference is found that cannot be vectorized. */
fb85abff 1724
91a74fc6 1725 ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
fb85abff 1726 if (!ok)
1727 {
6d8fb6cf 1728 if (dump_enabled_p ())
7bd765d4 1729 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 1730 "bad data alignment.\n");
c4740c5d 1731 return false;
fb85abff 1732 }
1733
fb85abff 1734 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1735 It is important to call pruning after vect_analyze_data_ref_accesses,
1736 since we use grouping information gathered by interleaving analysis. */
1737 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1738 if (!ok)
1739 {
6d8fb6cf 1740 if (dump_enabled_p ())
7bd765d4 1741 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1c7f4889 1742 "number of versioning for alias "
1743 "run-time tests exceeds %d "
1744 "(--param vect-max-version-for-alias-checks)\n",
1745 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
c4740c5d 1746 return false;
fb85abff 1747 }
1748
fb85abff 1749 /* This pass will decide on using loop versioning and/or loop peeling in
1750 order to enhance the alignment of data references in the loop. */
1751
1752 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1753 if (!ok)
1754 {
6d8fb6cf 1755 if (dump_enabled_p ())
7bd765d4 1756 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 1757 "bad data alignment.\n");
c4740c5d 1758 return false;
fb85abff 1759 }
1760
0822b158 1761 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
ca91d3f8 1762 ok = vect_analyze_slp (loop_vinfo, NULL, n_stmts);
0822b158 1763 if (ok)
1764 {
1765 /* Decide which possible SLP instances to SLP. */
bc937a44 1766 slp = vect_make_slp_decision (loop_vinfo);
0822b158 1767
1768 /* Find stmts that need to be both vectorized and SLPed. */
1769 vect_detect_hybrid_slp (loop_vinfo);
1770 }
39a5d6b1 1771 else
1772 return false;
0822b158 1773
fb85abff 1774 /* Scan all the operations in the loop and make sure they are
1775 vectorizable. */
1776
bc937a44 1777 ok = vect_analyze_loop_operations (loop_vinfo, slp);
fb85abff 1778 if (!ok)
1779 {
6d8fb6cf 1780 if (dump_enabled_p ())
7bd765d4 1781 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 1782 "bad operation or unsupported loop bound.\n");
c4740c5d 1783 return false;
1784 }
1785
313a5120 1786 /* Decide whether we need to create an epilogue loop to handle
1787 remaining scalar iterations. */
004a94a5 1788 th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + 1)
1789 / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1790 * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
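  /* E.g. (illustrative): a stored threshold of 9 with a vectorization
     factor of 4 gives th == ((9 + 1) / 4) * 4 == 8, i.e. th + 1 rounded
     down to a multiple of the vectorization factor.  */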
1791
313a5120 1792 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1793 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1794 {
1795 if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
1796 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
1797 < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1798 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1799 }
1800 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1801 || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
004a94a5 1802 < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1803 /* In case of versioning, check if the maximum number of
1804 iterations is greater than th. If they are identical,
1805 the epilogue is unnecessary. */
1806 && ((!LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)
1807 && !LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1808 || (unsigned HOST_WIDE_INT)max_stmt_executions_int
1809 (LOOP_VINFO_LOOP (loop_vinfo)) > th)))
313a5120 1810 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1811
1812 /* If an epilogue loop is required make sure we can create one. */
1813 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1814 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
1815 {
1816 if (dump_enabled_p ())
1817 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
1818 if (!vect_can_advance_ivs_p (loop_vinfo)
1819 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
1820 single_exit (LOOP_VINFO_LOOP
1821 (loop_vinfo))))
1822 {
1823 if (dump_enabled_p ())
1824 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1825 "not vectorized: can't create required "
1826 "epilog loop\n");
1827 return false;
1828 }
1829 }
1830
c4740c5d 1831 return true;
1832}
1833
1834/* Function vect_analyze_loop.
1835
1836 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1837 for it. The different analyses will record information in the
1838 loop_vec_info struct. */
1839loop_vec_info
1840vect_analyze_loop (struct loop *loop)
1841{
1842 loop_vec_info loop_vinfo;
1843 unsigned int vector_sizes;
1844
1845 /* Autodetect first vector size we try. */
1846 current_vector_size = 0;
1847 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1848
6d8fb6cf 1849 if (dump_enabled_p ())
7bd765d4 1850 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 1851 "===== analyze_loop_nest =====\n");
c4740c5d 1852
1853 if (loop_outer (loop)
1854 && loop_vec_info_for_loop (loop_outer (loop))
1855 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1856 {
6d8fb6cf 1857 if (dump_enabled_p ())
7bd765d4 1858 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 1859 "outer-loop already vectorized.\n");
fb85abff 1860 return NULL;
1861 }
1862
c4740c5d 1863 while (1)
1864 {
1865 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
1866 loop_vinfo = vect_analyze_loop_form (loop);
1867 if (!loop_vinfo)
1868 {
6d8fb6cf 1869 if (dump_enabled_p ())
7bd765d4 1870 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 1871 "bad loop form.\n");
c4740c5d 1872 return NULL;
1873 }
fb85abff 1874
c4740c5d 1875 if (vect_analyze_loop_2 (loop_vinfo))
1876 {
1877 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1878
1879 return loop_vinfo;
1880 }
1881
1882 destroy_loop_vec_info (loop_vinfo, true);
1883
1884 vector_sizes &= ~current_vector_size;
1885 if (vector_sizes == 0
1886 || current_vector_size == 0)
1887 return NULL;
1888
 1889 /* Try the largest remaining vector size. */
1890 current_vector_size = 1 << floor_log2 (vector_sizes);
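      /* E.g. (illustrative): if the target advertises 16- and 8-byte
	 vectors (vector_sizes == 16 | 8) and the 16-byte attempt failed,
	 the mask now holds only 8, and 1 << floor_log2 (8) retries the
	 analysis with vector size 8.  */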
6d8fb6cf 1891 if (dump_enabled_p ())
7bd765d4 1892 dump_printf_loc (MSG_NOTE, vect_location,
1893 "***** Re-trying analysis with "
1894 "vector size %d\n", current_vector_size);
c4740c5d 1895 }
fb85abff 1896}
1897
1898
1899/* Function reduction_code_for_scalar_code
1900
1901 Input:
 1902 CODE - tree_code of a reduction operation.
1903
1904 Output:
1905 REDUC_CODE - the corresponding tree-code to be used to reduce the
7ba68b18 1906 vector of partial results into a single scalar result, or ERROR_MARK
1907 if the operation is a supported reduction operation, but does not have
1908 such a tree-code.
fb85abff 1909
7aa0d350 1910 Return FALSE if CODE currently cannot be vectorized as a reduction. */
fb85abff 1911
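/* For instance (illustrative): a scalar accumulation "s += a[i]" has
   CODE == PLUS_EXPR, which the switch below maps to REDUC_PLUS_EXPR,
   the code used to fold the vector of partial sums into the final
   scalar sum.  */
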
1912static bool
1913reduction_code_for_scalar_code (enum tree_code code,
1914 enum tree_code *reduc_code)
1915{
1916 switch (code)
7aa0d350 1917 {
1918 case MAX_EXPR:
1919 *reduc_code = REDUC_MAX_EXPR;
1920 return true;
fb85abff 1921
7aa0d350 1922 case MIN_EXPR:
1923 *reduc_code = REDUC_MIN_EXPR;
1924 return true;
fb85abff 1925
7aa0d350 1926 case PLUS_EXPR:
1927 *reduc_code = REDUC_PLUS_EXPR;
1928 return true;
fb85abff 1929
7aa0d350 1930 case MULT_EXPR:
1931 case MINUS_EXPR:
1932 case BIT_IOR_EXPR:
1933 case BIT_XOR_EXPR:
1934 case BIT_AND_EXPR:
1935 *reduc_code = ERROR_MARK;
1936 return true;
1937
1938 default:
1939 return false;
1940 }
fb85abff 1941}
1942
1943
282bf14c 1944/* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
fb85abff 1945 STMT is printed with a message MSG. */
1946
1947static void
7bd765d4 1948report_vect_op (int msg_type, gimple stmt, const char *msg)
fb85abff 1949{
7bd765d4 1950 dump_printf_loc (msg_type, vect_location, "%s", msg);
1951 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
78bb46f5 1952 dump_printf (msg_type, "\n");
fb85abff 1953}
1954
1955
39a5d6b1 1956/* Detect SLP reduction of the form:
1957
1958 #a1 = phi <a5, a0>
1959 a2 = operation (a1)
1960 a3 = operation (a2)
1961 a4 = operation (a3)
1962 a5 = operation (a4)
1963
1964 #a = phi <a5>
1965
1966 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
1967 FIRST_STMT is the first reduction stmt in the chain
1968 (a2 = operation (a1)).
1969
1970 Return TRUE if a reduction chain was detected. */
1971
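/* A concrete source form of such a chain (illustrative):

     for (i = 0; i < N; i++)
       s = s + a[4*i] + a[4*i+1] + a[4*i+2] + a[4*i+3];

   which gimplifies into four dependent additions (a2..a5 above) feeding
   the reduction PHI.  */
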
1972static bool
1973vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
1974{
1975 struct loop *loop = (gimple_bb (phi))->loop_father;
1976 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1977 enum tree_code code;
85078181 1978 gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
39a5d6b1 1979 stmt_vec_info use_stmt_info, current_stmt_info;
1980 tree lhs;
1981 imm_use_iterator imm_iter;
1982 use_operand_p use_p;
6b809b99 1983 int nloop_uses, size = 0, n_out_of_loop_uses;
39a5d6b1 1984 bool found = false;
1985
1986 if (loop != vect_loop)
1987 return false;
1988
1989 lhs = PHI_RESULT (phi);
1990 code = gimple_assign_rhs_code (first_stmt);
1991 while (1)
1992 {
1993 nloop_uses = 0;
6b809b99 1994 n_out_of_loop_uses = 0;
39a5d6b1 1995 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
1996 {
85078181 1997 gimple use_stmt = USE_STMT (use_p);
0b308eee 1998 if (is_gimple_debug (use_stmt))
1999 continue;
85078181 2000
39a5d6b1 2001 /* Check if we got back to the reduction phi. */
85078181 2002 if (use_stmt == phi)
39a5d6b1 2003 {
85078181 2004 loop_use_stmt = use_stmt;
39a5d6b1 2005 found = true;
2006 break;
2007 }
2008
6b809b99 2009 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2010 {
2011 if (vinfo_for_stmt (use_stmt)
2012 && !STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (use_stmt)))
2013 {
2014 loop_use_stmt = use_stmt;
2015 nloop_uses++;
2016 }
2017 }
2018 else
2019 n_out_of_loop_uses++;
39a5d6b1 2020
6b809b99 2021 /* There can be either a single use in the loop or two uses in
 2022 phi nodes. */
2023 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2024 return false;
39a5d6b1 2025 }
2026
2027 if (found)
2028 break;
2029
85078181 2030 /* We reached a statement with no loop uses. */
2031 if (nloop_uses == 0)
2032 return false;
2033
39a5d6b1 2034 /* This is a loop exit phi, and we haven't reached the reduction phi. */
85078181 2035 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
39a5d6b1 2036 return false;
2037
85078181 2038 if (!is_gimple_assign (loop_use_stmt)
2039 || code != gimple_assign_rhs_code (loop_use_stmt)
2040 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
39a5d6b1 2041 return false;
2042
2043 /* Insert USE_STMT into reduction chain. */
85078181 2044 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
39a5d6b1 2045 if (current_stmt)
2046 {
2047 current_stmt_info = vinfo_for_stmt (current_stmt);
85078181 2048 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
39a5d6b1 2049 GROUP_FIRST_ELEMENT (use_stmt_info)
2050 = GROUP_FIRST_ELEMENT (current_stmt_info);
2051 }
2052 else
85078181 2053 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
39a5d6b1 2054
85078181 2055 lhs = gimple_assign_lhs (loop_use_stmt);
2056 current_stmt = loop_use_stmt;
39a5d6b1 2057 size++;
2058 }
2059
85078181 2060 if (!found || loop_use_stmt != phi || size < 2)
39a5d6b1 2061 return false;
2062
39a5d6b1 2063 /* Swap the operands, if needed, to make the reduction operand be the second
2064 operand. */
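  /* E.g. (illustrative): "a3 = a2 + b" is rewritten as "a3 = b + a2" so
     that the value carried around the reduction cycle is always the
     second operand.  */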
2065 lhs = PHI_RESULT (phi);
eb3a666e 2066 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2067 while (next_stmt)
39a5d6b1 2068 {
85078181 2069 if (gimple_assign_rhs2 (next_stmt) == lhs)
eb3a666e 2070 {
85078181 2071 tree op = gimple_assign_rhs1 (next_stmt);
2072 gimple def_stmt = NULL;
2073
2074 if (TREE_CODE (op) == SSA_NAME)
2075 def_stmt = SSA_NAME_DEF_STMT (op);
2076
2077 /* Check that the other def is either defined in the loop
2078 ("vect_internal_def"), or it's an induction (defined by a
2079 loop-header phi-node). */
2080 if (def_stmt
aada78b6 2081 && gimple_bb (def_stmt)
85078181 2082 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2083 && (is_gimple_assign (def_stmt)
2084 || is_gimple_call (def_stmt)
2085 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2086 == vect_induction_def
2087 || (gimple_code (def_stmt) == GIMPLE_PHI
2088 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2089 == vect_internal_def
2090 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
eb3a666e 2091 {
85078181 2092 lhs = gimple_assign_lhs (next_stmt);
2093 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2094 continue;
2095 }
2096
2097 return false;
2098 }
2099 else
2100 {
2101 tree op = gimple_assign_rhs2 (next_stmt);
2102 gimple def_stmt = NULL;
2103
2104 if (TREE_CODE (op) == SSA_NAME)
2105 def_stmt = SSA_NAME_DEF_STMT (op);
2106
2107 /* Check that the other def is either defined in the loop
2108 ("vect_internal_def"), or it's an induction (defined by a
2109 loop-header phi-node). */
2110 if (def_stmt
aada78b6 2111 && gimple_bb (def_stmt)
85078181 2112 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2113 && (is_gimple_assign (def_stmt)
2114 || is_gimple_call (def_stmt)
2115 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
eb3a666e 2116 == vect_induction_def
85078181 2117 || (gimple_code (def_stmt) == GIMPLE_PHI
2118 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
eb3a666e 2119 == vect_internal_def
85078181 2120 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2121 {
6d8fb6cf 2122 if (dump_enabled_p ())
eb3a666e 2123 {
7bd765d4 2124 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2125 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
78bb46f5 2126 dump_printf (MSG_NOTE, "\n");
eb3a666e 2127 }
2128
8f6fa493 2129 swap_ssa_operands (next_stmt,
2130 gimple_assign_rhs1_ptr (next_stmt),
2131 gimple_assign_rhs2_ptr (next_stmt));
a9696ee9 2132 update_stmt (next_stmt);
ba69439f 2133
2134 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2135 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
eb3a666e 2136 }
2137 else
85078181 2138 return false;
39a5d6b1 2139 }
2140
eb3a666e 2141 lhs = gimple_assign_lhs (next_stmt);
2142 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
39a5d6b1 2143 }
2144
eb3a666e 2145 /* Save the chain for further analysis in SLP detection. */
2146 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
f1f41a6c 2147 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
eb3a666e 2148 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2149
39a5d6b1 2150 return true;
2151}
2152
2153
f4a50267 2154/* Function vect_is_simple_reduction_1
fb85abff 2155
7aa0d350 2156 (1) Detect a cross-iteration def-use cycle that represents a simple
282bf14c 2157 reduction computation. We look for the following pattern:
fb85abff 2158
2159 loop_header:
2160 a1 = phi < a0, a2 >
2161 a3 = ...
2162 a2 = operation (a3, a1)
48e1416a 2163
63048bd8 2164 or
2165
2166 a3 = ...
2167 loop_header:
2168 a1 = phi < a0, a2 >
2169 a2 = operation (a3, a1)
2170
fb85abff 2171 such that:
48e1416a 2172 1. operation is commutative and associative and it is safe to
ade2ac53 2173 change the order of the computation (if CHECK_REDUCTION is true)
fb85abff 2174 2. no uses for a2 in the loop (a2 is used out of the loop)
caf6df13 2175 3. no uses of a1 in the loop besides the reduction operation
2176 4. no uses of a1 outside the loop.
fb85abff 2177
caf6df13 2178 Conditions 1,4 are tested here.
48e1416a 2179 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
ade2ac53 2180
48e1416a 2181 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2182 nested cycles, if CHECK_REDUCTION is false.
7aa0d350 2183
2184 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2185 reductions:
2186
2187 a1 = phi < a0, a2 >
2188 inner loop (def of a3)
48e1416a 2189 a2 = phi < a3 >
f4a50267 2190
 2191 If MODIFY is true it also tries to rework the code in-place to enable
 2192 detection of more reduction patterns. For the time being we rewrite
 2193 "res -= RHS" into "res += -RHS" when it seems worthwhile.
7aa0d350 2194*/
fb85abff 2195
f4a50267 2196static gimple
2197vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2198 bool check_reduction, bool *double_reduc,
2199 bool modify)
fb85abff 2200{
2201 struct loop *loop = (gimple_bb (phi))->loop_father;
2202 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2203 edge latch_e = loop_latch_edge (loop);
2204 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
0df23b96 2205 gimple def_stmt, def1 = NULL, def2 = NULL;
f4a50267 2206 enum tree_code orig_code, code;
0df23b96 2207 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
fb85abff 2208 tree type;
2209 int nloop_uses;
2210 tree name;
2211 imm_use_iterator imm_iter;
2212 use_operand_p use_p;
7aa0d350 2213 bool phi_def;
2214
2215 *double_reduc = false;
fb85abff 2216
ade2ac53 2217 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2218 otherwise, we assume outer loop vectorization. */
48e1416a 2219 gcc_assert ((check_reduction && loop == vect_loop)
ade2ac53 2220 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
fb85abff 2221
2222 name = PHI_RESULT (phi);
75f8b7c8 2223 /* ??? If there are no uses of the PHI result the inner loop reduction
2224 won't be detected as possibly double-reduction by vectorizable_reduction
2225 because that tries to walk the PHI arg from the preheader edge which
2226 can be constant. See PR60382. */
2227 if (has_zero_uses (name))
2228 return NULL;
fb85abff 2229 nloop_uses = 0;
2230 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2231 {
2232 gimple use_stmt = USE_STMT (use_p);
9845d120 2233 if (is_gimple_debug (use_stmt))
2234 continue;
caf6df13 2235
2236 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2237 {
6d8fb6cf 2238 if (dump_enabled_p ())
7bd765d4 2239 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 2240 "intermediate value used outside loop.\n");
caf6df13 2241
2242 return NULL;
2243 }
2244
2245 if (vinfo_for_stmt (use_stmt)
fb85abff 2246 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2247 nloop_uses++;
2248 if (nloop_uses > 1)
2249 {
6d8fb6cf 2250 if (dump_enabled_p ())
7bd765d4 2251 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 2252 "reduction used in loop.\n");
fb85abff 2253 return NULL;
2254 }
2255 }
2256
2257 if (TREE_CODE (loop_arg) != SSA_NAME)
2258 {
6d8fb6cf 2259 if (dump_enabled_p ())
fb85abff 2260 {
7bd765d4 2261 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2262 "reduction: not ssa_name: ");
2263 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
78bb46f5 2264 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
fb85abff 2265 }
2266 return NULL;
2267 }
2268
2269 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2270 if (!def_stmt)
2271 {
6d8fb6cf 2272 if (dump_enabled_p ())
7bd765d4 2273 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 2274 "reduction: no def_stmt.\n");
fb85abff 2275 return NULL;
2276 }
2277
7aa0d350 2278 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
fb85abff 2279 {
6d8fb6cf 2280 if (dump_enabled_p ())
78bb46f5 2281 {
2282 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2283 dump_printf (MSG_NOTE, "\n");
2284 }
fb85abff 2285 return NULL;
2286 }
2287
7aa0d350 2288 if (is_gimple_assign (def_stmt))
2289 {
2290 name = gimple_assign_lhs (def_stmt);
2291 phi_def = false;
2292 }
2293 else
2294 {
2295 name = PHI_RESULT (def_stmt);
2296 phi_def = true;
2297 }
2298
fb85abff 2299 nloop_uses = 0;
2300 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2301 {
2302 gimple use_stmt = USE_STMT (use_p);
9845d120 2303 if (is_gimple_debug (use_stmt))
2304 continue;
fb85abff 2305 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2306 && vinfo_for_stmt (use_stmt)
2307 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2308 nloop_uses++;
2309 if (nloop_uses > 1)
2310 {
6d8fb6cf 2311 if (dump_enabled_p ())
7bd765d4 2312 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 2313 "reduction used in loop.\n");
fb85abff 2314 return NULL;
2315 }
2316 }
2317
7aa0d350 2318 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2319 defined in the inner loop. */
2320 if (phi_def)
2321 {
2322 op1 = PHI_ARG_DEF (def_stmt, 0);
2323
2324 if (gimple_phi_num_args (def_stmt) != 1
2325 || TREE_CODE (op1) != SSA_NAME)
2326 {
6d8fb6cf 2327 if (dump_enabled_p ())
7bd765d4 2328 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 2329 "unsupported phi node definition.\n");
7aa0d350 2330
2331 return NULL;
2332 }
2333
48e1416a 2334 def1 = SSA_NAME_DEF_STMT (op1);
149f7c8d 2335 if (gimple_bb (def1)
2336 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
7aa0d350 2337 && loop->inner
2338 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2339 && is_gimple_assign (def1))
2340 {
6d8fb6cf 2341 if (dump_enabled_p ())
7bd765d4 2342 report_vect_op (MSG_NOTE, def_stmt,
2343 "detected double reduction: ");
48e1416a 2344
7aa0d350 2345 *double_reduc = true;
2346 return def_stmt;
2347 }
2348
2349 return NULL;
2350 }
2351
f4a50267 2352 code = orig_code = gimple_assign_rhs_code (def_stmt);
2353
2354 /* We can handle "res -= x[i]", which is non-associative by
2355 simply rewriting this into "res += -x[i]". Avoid changing
2356 gimple instruction for the first simple tests and only do this
2357 if we're allowed to change code at all. */
9ef16690 2358 if (code == MINUS_EXPR
2359 && modify
2360 && (op1 = gimple_assign_rhs1 (def_stmt))
2361 && TREE_CODE (op1) == SSA_NAME
2362 && SSA_NAME_DEF_STMT (op1) == phi)
f4a50267 2363 code = PLUS_EXPR;
fb85abff 2364
48e1416a 2365 if (check_reduction
ade2ac53 2366 && (!commutative_tree_code (code) || !associative_tree_code (code)))
fb85abff 2367 {
6d8fb6cf 2368 if (dump_enabled_p ())
7bd765d4 2369 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2370 "reduction: not commutative/associative: ");
fb85abff 2371 return NULL;
2372 }
2373
48e1416a 2374 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
fb85abff 2375 {
0df23b96 2376 if (code != COND_EXPR)
2377 {
6d8fb6cf 2378 if (dump_enabled_p ())
7bd765d4 2379 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2380 "reduction: not binary operation: ");
fb85abff 2381
0df23b96 2382 return NULL;
2383 }
2384
8a2caf10 2385 op3 = gimple_assign_rhs1 (def_stmt);
a18d4327 2386 if (COMPARISON_CLASS_P (op3))
2387 {
2388 op4 = TREE_OPERAND (op3, 1);
2389 op3 = TREE_OPERAND (op3, 0);
48e1416a 2390 }
2391
8a2caf10 2392 op1 = gimple_assign_rhs2 (def_stmt);
2393 op2 = gimple_assign_rhs3 (def_stmt);
0df23b96 2394
2395 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2396 {
6d8fb6cf 2397 if (dump_enabled_p ())
7bd765d4 2398 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2399 "reduction: uses not ssa_names: ");
0df23b96 2400
2401 return NULL;
2402 }
fb85abff 2403 }
0df23b96 2404 else
2405 {
2406 op1 = gimple_assign_rhs1 (def_stmt);
2407 op2 = gimple_assign_rhs2 (def_stmt);
2408
a29b42f8 2409 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
0df23b96 2410 {
6d8fb6cf 2411 if (dump_enabled_p ())
7bd765d4 2412 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2413 "reduction: uses not ssa_names: ");
0df23b96 2414
2415 return NULL;
2416 }
2417 }
fb85abff 2418
fb85abff 2419 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
0df23b96 2420 if ((TREE_CODE (op1) == SSA_NAME
1ea6a73c 2421 && !types_compatible_p (type, TREE_TYPE (op1)))
0df23b96 2422 || (TREE_CODE (op2) == SSA_NAME
1ea6a73c 2423 && !types_compatible_p (type, TREE_TYPE (op2)))
0df23b96 2424 || (op3 && TREE_CODE (op3) == SSA_NAME
1ea6a73c 2425 && !types_compatible_p (type, TREE_TYPE (op3)))
0df23b96 2426 || (op4 && TREE_CODE (op4) == SSA_NAME
1ea6a73c 2427 && !types_compatible_p (type, TREE_TYPE (op4))))
fb85abff 2428 {
6d8fb6cf 2429 if (dump_enabled_p ())
fb85abff 2430 {
7bd765d4 2431 dump_printf_loc (MSG_NOTE, vect_location,
2432 "reduction: multiple types: operation type: ");
2433 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2434 dump_printf (MSG_NOTE, ", operands types: ");
2435 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2436 TREE_TYPE (op1));
2437 dump_printf (MSG_NOTE, ",");
2438 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2439 TREE_TYPE (op2));
a18d4327 2440 if (op3)
0df23b96 2441 {
7bd765d4 2442 dump_printf (MSG_NOTE, ",");
2443 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2444 TREE_TYPE (op3));
a18d4327 2445 }
2446
2447 if (op4)
2448 {
7bd765d4 2449 dump_printf (MSG_NOTE, ",");
2450 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2451 TREE_TYPE (op4));
0df23b96 2452 }
78bb46f5 2453 dump_printf (MSG_NOTE, "\n");
fb85abff 2454 }
0df23b96 2455
fb85abff 2456 return NULL;
2457 }
2458
48e1416a 2459 /* Check that it's ok to change the order of the computation.
ade2ac53 2460 Generally, when vectorizing a reduction we change the order of the
fb85abff 2461 computation. This may change the behavior of the program in some
48e1416a 2462 cases, so we need to check that this is ok. One exception is when
fb85abff 2463 vectorizing an outer-loop: the inner-loop is executed sequentially,
2464 and therefore vectorizing reductions in the inner-loop during
2465 outer-loop vectorization is safe. */
2466
2467 /* CHECKME: check for !flag_finite_math_only too? */
2468 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
48e1416a 2469 && check_reduction)
fb85abff 2470 {
2471 /* Changing the order of operations changes the semantics. */
6d8fb6cf 2472 if (dump_enabled_p ())
7bd765d4 2473 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2474 "reduction: unsafe fp math optimization: ");
fb85abff 2475 return NULL;
2476 }
2477 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
ade2ac53 2478 && check_reduction)
fb85abff 2479 {
2480 /* Changing the order of operations changes the semantics. */
6d8fb6cf 2481 if (dump_enabled_p ())
7bd765d4 2482 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2483 "reduction: unsafe int math optimization: ");
fb85abff 2484 return NULL;
2485 }
ade2ac53 2486 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
fb85abff 2487 {
2488 /* Changing the order of operations changes the semantics. */
6d8fb6cf 2489 if (dump_enabled_p ())
7bd765d4 2490 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
fb85abff 2491 "reduction: unsafe fixed-point math optimization: ");
2492 return NULL;
2493 }
2494
f4a50267 2495 /* If we detected "res -= x[i]" earlier, rewrite it into
2496 "res += -x[i]" now. If this turns out to be useless reassoc
2497 will clean it up again. */
2498 if (orig_code == MINUS_EXPR)
2499 {
2500 tree rhs = gimple_assign_rhs2 (def_stmt);
bb38acc8 2501 tree negrhs = make_ssa_name (TREE_TYPE (rhs), NULL);
f4a50267 2502 gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
2503 rhs, NULL);
2504 gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2505 set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2506 loop_info, NULL));
2507 gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2508 gimple_assign_set_rhs2 (def_stmt, negrhs);
2509 gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2510 update_stmt (def_stmt);
2511 }
2512
ade2ac53 2513 /* Reduction is safe. We're dealing with one of the following:
fb85abff 2514 1) integer arithmetic and no trapv
ade2ac53 2515 2) floating point arithmetic, and special flags permit this optimization
2516 3) nested cycle (i.e., outer loop vectorization). */
0df23b96 2517 if (TREE_CODE (op1) == SSA_NAME)
2518 def1 = SSA_NAME_DEF_STMT (op1);
2519
2520 if (TREE_CODE (op2) == SSA_NAME)
2521 def2 = SSA_NAME_DEF_STMT (op2);
2522
48e1416a 2523 if (code != COND_EXPR
a29b42f8 2524 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
fb85abff 2525 {
6d8fb6cf 2526 if (dump_enabled_p ())
7bd765d4 2527 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
fb85abff 2528 return NULL;
2529 }
2530
fb85abff 2531 /* Check that one def is the reduction def, defined by PHI,
f083cd24 2532 the other def is either defined in the loop ("vect_internal_def"),
fb85abff 2533 or it's an induction (defined by a loop-header phi-node). */
2534
0df23b96 2535 if (def2 && def2 == phi
2536 && (code == COND_EXPR
a29b42f8 2537 || !def1 || gimple_nop_p (def1)
63048bd8 2538 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
0df23b96 2539 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2540 && (is_gimple_assign (def1)
1e845e91 2541 || is_gimple_call (def1)
48e1416a 2542 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
0df23b96 2543 == vect_induction_def
2544 || (gimple_code (def1) == GIMPLE_PHI
48e1416a 2545 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
0df23b96 2546 == vect_internal_def
2547 && !is_loop_header_bb_p (gimple_bb (def1)))))))
fb85abff 2548 {
6d8fb6cf 2549 if (dump_enabled_p ())
7bd765d4 2550 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
fb85abff 2551 return def_stmt;
2552 }
39a5d6b1 2553
2554 if (def1 && def1 == phi
2555 && (code == COND_EXPR
a29b42f8 2556 || !def2 || gimple_nop_p (def2)
63048bd8 2557 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
39a5d6b1 2558 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2559 && (is_gimple_assign (def2)
2560 || is_gimple_call (def2)
2561 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2562 == vect_induction_def
2563 || (gimple_code (def2) == GIMPLE_PHI
2564 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2565 == vect_internal_def
2566 && !is_loop_header_bb_p (gimple_bb (def2)))))))
fb85abff 2567 {
ade2ac53 2568 if (check_reduction)
2569 {
2570 /* Swap operands (just for simplicity - so that the rest of the code
2571 can assume that the reduction variable is always the last (second)
2572 argument). */
6d8fb6cf 2573 if (dump_enabled_p ())
7bd765d4 2574 report_vect_op (MSG_NOTE, def_stmt,
ade2ac53 2575 "detected reduction: need to swap operands: ");
2576
8f6fa493 2577 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2578 gimple_assign_rhs2_ptr (def_stmt));
ba69439f 2579
2580 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2581 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
ade2ac53 2582 }
2583 else
2584 {
6d8fb6cf 2585 if (dump_enabled_p ())
7bd765d4 2586 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
ade2ac53 2587 }
2588
fb85abff 2589 return def_stmt;
2590 }
39a5d6b1 2591
2592 /* Try to find SLP reduction chain. */
85078181 2593 if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
fb85abff 2594 {
6d8fb6cf 2595 if (dump_enabled_p ())
7bd765d4 2596 report_vect_op (MSG_NOTE, def_stmt,
2597 "reduction: detected reduction chain: ");
ade2ac53 2598
39a5d6b1 2599 return def_stmt;
fb85abff 2600 }
39a5d6b1 2601
6d8fb6cf 2602 if (dump_enabled_p ())
7bd765d4 2603 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2604 "reduction: unknown pattern: ");
39a5d6b1 2605
2606 return NULL;
fb85abff 2607}
2608
f4a50267 2609/* Wrapper around vect_is_simple_reduction_1, that won't modify code
2610 in-place. Arguments as there. */
2611
2612static gimple
2613vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2614 bool check_reduction, bool *double_reduc)
2615{
2616 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2617 double_reduc, false);
2618}
2619
2620/* Wrapper around vect_is_simple_reduction_1, which will modify code
2621 in-place if it enables detection of more reductions. Arguments
2622 as there. */
2623
2624gimple
2625vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2626 bool check_reduction, bool *double_reduc)
2627{
2628 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2629 double_reduc, true);
2630}
fb85abff 2631
0822b158 2632/* Calculate the cost of one scalar iteration of the loop. */
2633int
f4ac3f3e 2634vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
0822b158 2635{
2636 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2637 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2638 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2639 int innerloop_iters, i, stmt_cost;
2640
282bf14c 2641 /* Count statements in scalar loop. Using this as scalar cost for a single
0822b158 2642 iteration for now.
2643
2644 TODO: Add outer loop support.
2645
2646 TODO: Consider assigning different costs to different scalar
2647 statements. */
2648
2649 /* FORNOW. */
3aee830b 2650 innerloop_iters = 1;
0822b158 2651 if (loop->inner)
2652 innerloop_iters = 50; /* FIXME */
2653
2654 for (i = 0; i < nbbs; i++)
2655 {
2656 gimple_stmt_iterator si;
2657 basic_block bb = bbs[i];
2658
2659 if (bb->loop_father == loop->inner)
2660 factor = innerloop_iters;
2661 else
2662 factor = 1;
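      /* E.g. (illustrative): with the FORNOW guess above, a statement in
	 an inner-loop body is counted 50 times, one in the outer body
	 only once.  */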
2663
2664 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2665 {
2666 gimple stmt = gsi_stmt (si);
45f1556e 2667 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
0822b158 2668
2669 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2670 continue;
2671
45f1556e 2672 /* Skip stmts that are not vectorized inside the loop. */
2673 if (stmt_info
2674 && !STMT_VINFO_RELEVANT_P (stmt_info)
2675 && (!STMT_VINFO_LIVE_P (stmt_info)
5df2530b 2676 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2677 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
45f1556e 2678 continue;
2679
0822b158 2680 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2681 {
2682 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
f4ac3f3e 2683 stmt_cost = vect_get_stmt_cost (scalar_load);
0822b158 2684 else
f4ac3f3e 2685 stmt_cost = vect_get_stmt_cost (scalar_store);
0822b158 2686 }
2687 else
f4ac3f3e 2688 stmt_cost = vect_get_stmt_cost (scalar_stmt);
0822b158 2689
2690 scalar_single_iter_cost += stmt_cost * factor;
2691 }
2692 }
2693 return scalar_single_iter_cost;
2694}
2695
2696/* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
2697int
2698vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2699 int *peel_iters_epilogue,
f97dec81 2700 int scalar_single_iter_cost,
2701 stmt_vector_for_cost *prologue_cost_vec,
2702 stmt_vector_for_cost *epilogue_cost_vec)
0822b158 2703{
f97dec81 2704 int retval = 0;
0822b158 2705 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2706
2707 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2708 {
2709 *peel_iters_epilogue = vf/2;
6d8fb6cf 2710 if (dump_enabled_p ())
7bd765d4 2711 dump_printf_loc (MSG_NOTE, vect_location,
2712 "cost model: epilogue peel iters set to vf/2 "
78bb46f5 2713 "because loop iterations are unknown.\n");
0822b158 2714
2715 /* If peeled iterations are known but number of scalar loop
2716 iterations are unknown, count a taken branch per peeled loop. */
f97dec81 2717 retval = record_stmt_cost (prologue_cost_vec, 2, cond_branch_taken,
2718 NULL, 0, vect_prologue);
0822b158 2719 }
2720 else
2721 {
2722 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2723 peel_iters_prologue = niters < peel_iters_prologue ?
2724 niters : peel_iters_prologue;
2725 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
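      /* E.g. (illustrative): niters == 100, peel_iters_prologue == 3 and
	 vf == 8 leave (100 - 3) % 8 == 1 scalar iteration for the
	 epilogue.  */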
a4ee7fac 2726 /* If we need to peel for gaps, but no epilogue peeling was otherwise
 2727 required, we have to peel VF iterations. */
2728 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2729 *peel_iters_epilogue = vf;
0822b158 2730 }
2731
f97dec81 2732 if (peel_iters_prologue)
2733 retval += record_stmt_cost (prologue_cost_vec,
2734 peel_iters_prologue * scalar_single_iter_cost,
2735 scalar_stmt, NULL, 0, vect_prologue);
2736 if (*peel_iters_epilogue)
2737 retval += record_stmt_cost (epilogue_cost_vec,
2738 *peel_iters_epilogue * scalar_single_iter_cost,
2739 scalar_stmt, NULL, 0, vect_epilogue);
2740 return retval;
0822b158 2741}
2742
fb85abff 2743/* Function vect_estimate_min_profitable_iters
2744
2745 Return the number of iterations required for the vector version of the
2746 loop to be profitable relative to the cost of the scalar version of the
5938768b 2747 loop. */
fb85abff 2748
5938768b 2749static void
2750vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2751 int *ret_min_profitable_niters,
2752 int *ret_min_profitable_estimate)
fb85abff 2753{
fb85abff 2754 int min_profitable_iters;
5938768b 2755 int min_profitable_estimate;
fb85abff 2756 int peel_iters_prologue;
2757 int peel_iters_epilogue;
f97dec81 2758 unsigned vec_inside_cost = 0;
fb85abff 2759 int vec_outside_cost = 0;
f97dec81 2760 unsigned vec_prologue_cost = 0;
2761 unsigned vec_epilogue_cost = 0;
fb85abff 2762 int scalar_single_iter_cost = 0;
2763 int scalar_outside_cost = 0;
2764 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
313a5120 2765 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
f97dec81 2766 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
fb85abff 2767
2768 /* Cost model disabled. */
3e398f5b 2769 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
fb85abff 2770 {
78bb46f5 2771 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
5938768b 2772 *ret_min_profitable_niters = 0;
2773 *ret_min_profitable_estimate = 0;
2774 return;
fb85abff 2775 }
2776
2777 /* Requires loop versioning tests to handle misalignment. */
10095225 2778 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
fb85abff 2779 {
2780 /* FIXME: Make cost depend on complexity of individual check. */
f1f41a6c 2781 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
f97dec81 2782 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2783 vect_prologue);
7bd765d4 2784 dump_printf (MSG_NOTE,
2785 "cost model: Adding cost of checks for loop "
2786 "versioning to treat misalignment.\n");
fb85abff 2787 }
2788
10095225 2789 /* Requires loop versioning with alias checks. */
2790 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
fb85abff 2791 {
2792 /* FIXME: Make cost depend on complexity of individual check. */
f1f41a6c 2793 unsigned len = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).length ();
f97dec81 2794 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2795 vect_prologue);
7bd765d4 2796 dump_printf (MSG_NOTE,
2797 "cost model: Adding cost of checks for loop "
2798 "versioning aliasing.\n");
fb85abff 2799 }
2800
10095225 2801 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2802 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
f97dec81 2803 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2804 vect_prologue);
fb85abff 2805
2806 /* Count statements in scalar loop. Using this as scalar cost for a single
2807 iteration for now.
2808
2809 TODO: Add outer loop support.
2810
2811 TODO: Consider assigning different costs to different scalar
2812 statements. */
2813
f4ac3f3e 2814 scalar_single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
0822b158 2815
fb85abff 2816 /* Add additional cost for the peeled instructions in prologue and epilogue
2817 loop.
2818
2819 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2820 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2821
2822 TODO: Build an expression that represents peel_iters for prologue and
2823 epilogue to be used in a run-time test. */
2824
0822b158 2825 if (npeel < 0)
fb85abff 2826 {
2827 peel_iters_prologue = vf/2;
7bd765d4 2828 dump_printf (MSG_NOTE, "cost model: "
78bb46f5 2829 "prologue peel iters set to vf/2.\n");
fb85abff 2830
2831 /* If peeling for alignment is unknown, loop bound of main loop becomes
2832 unknown. */
2833 peel_iters_epilogue = vf/2;
7bd765d4 2834 dump_printf (MSG_NOTE, "cost model: "
2835 "epilogue peel iters set to vf/2 because "
78bb46f5 2836 "peeling for alignment is unknown.\n");
fb85abff 2837
2838 /* If peeled iterations are unknown, count a taken branch and a not taken
2839 branch per peeled loop. Even if scalar loop iterations are known,
2840 vector iterations are not known since peeled prologue iterations are
2841 not known. Hence guards remain the same. */
f97dec81 2842 (void) add_stmt_cost (target_cost_data, 2, cond_branch_taken,
2843 NULL, 0, vect_prologue);
2844 (void) add_stmt_cost (target_cost_data, 2, cond_branch_not_taken,
2845 NULL, 0, vect_prologue);
2846 /* FORNOW: Don't attempt to pass individual scalar instructions to
2847 the model; just assume linear cost for scalar iterations. */
2848 (void) add_stmt_cost (target_cost_data,
2849 peel_iters_prologue * scalar_single_iter_cost,
2850 scalar_stmt, NULL, 0, vect_prologue);
2851 (void) add_stmt_cost (target_cost_data,
2852 peel_iters_epilogue * scalar_single_iter_cost,
2853 scalar_stmt, NULL, 0, vect_epilogue);
fb85abff 2854 }
48e1416a 2855 else
fb85abff 2856 {
f97dec81 2857 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2858 stmt_info_for_cost *si;
2859 int j;
2860 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2861
f1f41a6c 2862 prologue_cost_vec.create (2);
2863 epilogue_cost_vec.create (2);
0822b158 2864 peel_iters_prologue = npeel;
f97dec81 2865
2866 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2867 &peel_iters_epilogue,
2868 scalar_single_iter_cost,
2869 &prologue_cost_vec,
2870 &epilogue_cost_vec);
2871
f1f41a6c 2872 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
f97dec81 2873 {
2874 struct _stmt_vec_info *stmt_info
2875 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2876 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2877 si->misalign, vect_prologue);
2878 }
2879
f1f41a6c 2880 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
f97dec81 2881 {
2882 struct _stmt_vec_info *stmt_info
2883 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2884 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2885 si->misalign, vect_epilogue);
2886 }
2887
f1f41a6c 2888 prologue_cost_vec.release ();
2889 epilogue_cost_vec.release ();
fb85abff 2890 }
2891
fb85abff 2892 /* FORNOW: The scalar outside cost is incremented in one of the
2893 following ways:
2894
2895 1. The vectorizer checks for alignment and aliasing and generates
2896 a condition that allows dynamic vectorization. A cost model
2897 check is ANDED with the versioning condition. Hence scalar code
2898 path now has the added cost of the versioning check.
2899
2900 if (cost > th & versioning_check)
2901 jmp to vector code
2902
2903 Hence run-time scalar is incremented by not-taken branch cost.
2904
2905 2. The vectorizer then checks if a prologue is required. If the
2906 cost model check was not done before during versioning, it has to
2907 be done before the prologue check.
2908
2909 if (cost <= th)
2910 prologue = scalar_iters
2911 if (prologue == 0)
2912 jmp to vector code
2913 else
2914 execute prologue
2915 if (prologue == num_iters)
2916 go to exit
2917
2918 Hence the run-time scalar cost is incremented by a taken branch,
2919 plus a not-taken branch, plus a taken branch cost.
2920
2921 3. The vectorizer then checks if an epilogue is required. If the
2922 cost model check was not done before during prologue check, it
2923 has to be done with the epilogue check.
2924
2925 if (prologue == 0)
2926 jmp to vector code
2927 else
2928 execute prologue
2929 if (prologue == num_iters)
2930 go to exit
2931 vector code:
2932 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2933 jmp to epilogue
2934
2935 Hence the run-time scalar cost should be incremented by 2 taken
2936 branches.
2937
2938 TODO: The back end may reorder the BBS's differently and reverse
2939 conditions/branch directions. Change the estimates below to
2940 something more reasonable. */
2941
2942 /* If the number of iterations is known and we do not do versioning, we can
282bf14c 2943 decide whether to vectorize at compile time. Hence the scalar version
fb85abff 2944 does not carry cost model guard costs. */
2945 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
10095225 2946 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2947 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
fb85abff 2948 {
2949 /* Cost model check occurs at versioning. */
10095225 2950 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2951 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
f4ac3f3e 2952 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
fb85abff 2953 else
2954 {
2955 /* Cost model check occurs at prologue generation. */
313a5120 2956 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
f4ac3f3e 2957 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
2958 + vect_get_stmt_cost (cond_branch_not_taken);
fb85abff 2959 /* Cost model check occurs at epilogue generation. */
2960 else
f4ac3f3e 2961 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
fb85abff 2962 }
2963 }
2964
f97dec81 2965 /* Complete the target-specific cost calculations. */
2966 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
2967 &vec_inside_cost, &vec_epilogue_cost);
fb85abff 2968
f97dec81 2969 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4db2b577 2970
48e1416a 2971 /* Calculate number of iterations required to make the vector version
282bf14c 2972 profitable, relative to the loop bodies only. The following condition
48e1416a 2973 must hold true:
fb85abff 2974 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2975 where
2976 SIC = scalar iteration cost, VIC = vector iteration cost,
2977 VOC = vector outside cost, VF = vectorization factor,
2978 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2979 SOC = scalar outside cost for run time cost model check. */
2980
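  /* Worked example, illustrative numbers only: SIC == 4, VIC == 6,
     VF == 4, VOC == 14, SOC == 0 and no peeling give
     4 * niters > 6 * (niters / 4) + 14, which first holds at
     niters == 6; the integer computation below yields 56 / 10 == 5 and
     the correction step then bumps it to 6.  */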
f97dec81 2981 if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
fb85abff 2982 {
2983 if (vec_outside_cost <= 0)
2984 min_profitable_iters = 1;
2985 else
2986 {
2987 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2988 - vec_inside_cost * peel_iters_prologue
2989 - vec_inside_cost * peel_iters_epilogue)
2990 / ((scalar_single_iter_cost * vf)
2991 - vec_inside_cost);
2992
2993 if ((scalar_single_iter_cost * vf * min_profitable_iters)
f97dec81 2994 <= (((int) vec_inside_cost * min_profitable_iters)
2995 + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
fb85abff 2996 min_profitable_iters++;
2997 }
2998 }
2999 /* vector version will never be profitable. */
3000 else
3001 {
4c73695b 3002 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3e398f5b 3003 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3004 "did not happen for a simd loop");
3005
6d8fb6cf 3006 if (dump_enabled_p ())
7bd765d4 3007 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3008 "cost model: the vector iteration cost = %d "
3009 "divided by the scalar iteration cost = %d "
78bb46f5 3010 "is greater or equal to the vectorization factor = %d"
3011 ".\n",
7bd765d4 3012 vec_inside_cost, scalar_single_iter_cost, vf);
5938768b 3013 *ret_min_profitable_niters = -1;
3014 *ret_min_profitable_estimate = -1;
3015 return;
fb85abff 3016 }
3017
6d8fb6cf 3018 if (dump_enabled_p ())
7bd765d4 3019 {
3020 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3021 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3022 vec_inside_cost);
3023 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3024 vec_prologue_cost);
3025 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3026 vec_epilogue_cost);
3027 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3028 scalar_single_iter_cost);
3029 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3030 scalar_outside_cost);
5938768b 3031 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3032 vec_outside_cost);
7bd765d4 3033 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3034 peel_iters_prologue);
3035 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3036 peel_iters_epilogue);
78bb46f5 3037 dump_printf (MSG_NOTE,
7bd765d4 3038 " Calculated minimum iters for profitability: %d\n",
3039 min_profitable_iters);
78bb46f5 3040 dump_printf (MSG_NOTE, "\n");
fb85abff 3041 }
3042
48e1416a 3043 min_profitable_iters =
fb85abff 3044 min_profitable_iters < vf ? vf : min_profitable_iters;
3045
3046 /* Because the condition we create is:
3047 if (niters <= min_profitable_iters)
3048 then skip the vectorized loop. */
3049 min_profitable_iters--;
3050
6d8fb6cf 3051 if (dump_enabled_p ())
7bd765d4 3052 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 3053 " Runtime profitability threshold = %d\n",
3054 min_profitable_iters);
5938768b 3055
3056 *ret_min_profitable_niters = min_profitable_iters;
3057
3058 /* Calculate number of iterations required to make the vector version
3059 profitable, relative to the loop bodies only.
3060
3061 Non-vectorized variant is SIC * niters and it must win over vector
3062 variant on the expected loop trip count. The following condition must hold true:
3063 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
3064
3065 if (vec_outside_cost <= 0)
3066 min_profitable_estimate = 1;
3067 else
3068 {
3069 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3070 - vec_inside_cost * peel_iters_prologue
3071 - vec_inside_cost * peel_iters_epilogue)
3072 / ((scalar_single_iter_cost * vf)
3073 - vec_inside_cost);
3074 }
 3075 min_profitable_estimate--;
3076 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
6d8fb6cf 3077 if (dump_enabled_p ())
5938768b 3078 dump_printf_loc (MSG_NOTE, vect_location,
3079 " Static estimate profitability threshold = %d\n",
 3080 min_profitable_estimate);
48e1416a 3081
5938768b 3082 *ret_min_profitable_estimate = min_profitable_estimate;
fb85abff 3083}
3084
3085
48e1416a 3086/* TODO: Close dependency between vect_model_*_cost and vectorizable_*
fb85abff 3087 functions. Design better to avoid maintenance issues. */
fb85abff 3088
48e1416a 3089/* Function vect_model_reduction_cost.
3090
3091 Models cost for a reduction operation, including the vector ops
fb85abff 3092 generated within the strip-mine loop, the initial definition before
3093 the loop, and the epilogue code that must be generated. */
3094
48e1416a 3095static bool
fb85abff 3096vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3097 int ncopies)
3098{
f97dec81 3099 int prologue_cost = 0, epilogue_cost = 0;
fb85abff 3100 enum tree_code code;
3101 optab optab;
3102 tree vectype;
3103 gimple stmt, orig_stmt;
3104 tree reduction_op;
3754d046 3105 machine_mode mode;
fb85abff 3106 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3107 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
f97dec81 3108 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
fb85abff 3109
fb85abff 3110 /* Cost of reduction op inside loop. */
f97dec81 3111 unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3112 stmt_info, 0, vect_body);
fb85abff 3113 stmt = STMT_VINFO_STMT (stmt_info);
3114
3115 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3116 {
3117 case GIMPLE_SINGLE_RHS:
3118 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
3119 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
3120 break;
3121 case GIMPLE_UNARY_RHS:
3122 reduction_op = gimple_assign_rhs1 (stmt);
3123 break;
3124 case GIMPLE_BINARY_RHS:
3125 reduction_op = gimple_assign_rhs2 (stmt);
3126 break;
c86930b0 3127 case GIMPLE_TERNARY_RHS:
3128 reduction_op = gimple_assign_rhs3 (stmt);
3129 break;
fb85abff 3130 default:
3131 gcc_unreachable ();
3132 }
3133
3134 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3135 if (!vectype)
3136 {
6d8fb6cf 3137 if (dump_enabled_p ())
fb85abff 3138 {
7bd765d4 3139 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3140 "unsupported data-type ");
3141 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3142 TREE_TYPE (reduction_op));
78bb46f5 3143 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
fb85abff 3144 }
3145 return false;
3146 }
48e1416a 3147
fb85abff 3148 mode = TYPE_MODE (vectype);
3149 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3150
48e1416a 3151 if (!orig_stmt)
fb85abff 3152 orig_stmt = STMT_VINFO_STMT (stmt_info);
3153
3154 code = gimple_assign_rhs_code (orig_stmt);
3155
3156 /* Add in cost for initial definition. */
f97dec81 3157 prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3158 stmt_info, 0, vect_prologue);
fb85abff 3159
3160 /* Determine cost of epilogue code.
3161
3162 We have a reduction operator that will reduce the vector in one statement.
3163 Also requires scalar extract. */
3164
3165 if (!nested_in_vect_loop_p (loop, orig_stmt))
3166 {
8458f4ca 3167 if (reduc_code != ERROR_MARK)
f97dec81 3168 {
3169 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3170 stmt_info, 0, vect_epilogue);
3171 epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3172 stmt_info, 0, vect_epilogue);
3173 }
48e1416a 3174 else
fb85abff 3175 {
e913b5cd 3176 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
fb85abff 3177 tree bitsize =
3178 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
e913b5cd 3179 int element_bitsize = tree_to_uhwi (bitsize);
fb85abff 3180 int nelements = vec_size_in_bits / element_bitsize;
3181
3182 optab = optab_for_tree_code (code, vectype, optab_default);
3183
3184 /* We have a whole vector shift available. */
3185 if (VECTOR_MODE_P (mode)
d6bf3b14 3186 && optab_handler (optab, mode) != CODE_FOR_nothing
3187 && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
f97dec81 3188 {
3189 /* Final reduction via vector shifts and the reduction operator.
3190 Also requires scalar extract. */
3191 epilogue_cost += add_stmt_cost (target_cost_data,
3192 exact_log2 (nelements) * 2,
3193 vector_stmt, stmt_info, 0,
3194 vect_epilogue);
3195 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3196 vec_to_scalar, stmt_info, 0,
3197 vect_epilogue);
3198 }
fb85abff 3199 else
f97dec81 3200 /* Use extracts and reduction op for final reduction. For N
3201 elements, we have N extracts and N-1 reduction ops. */
3202 epilogue_cost += add_stmt_cost (target_cost_data,
3203 nelements + nelements - 1,
3204 vector_stmt, stmt_info, 0,
3205 vect_epilogue);
fb85abff 3206 }
3207 }
3208
6d8fb6cf 3209 if (dump_enabled_p ())
7bd765d4 3210 dump_printf (MSG_NOTE,
3211 "vect_model_reduction_cost: inside_cost = %d, "
78bb46f5 3212 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
7bd765d4 3213 prologue_cost, epilogue_cost);
fb85abff 3214
3215 return true;
3216}
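
/* Illustrative aside (not GCC internals; guarded out of compilation): a
   standalone sketch of the epilogue statement counts costed above, under
   the assumption of a power-of-two number of vector lanes.  The
   whole-vector-shift scheme charges exact_log2 (nelements) * 2 vector
   statements plus one vec_to_scalar extract; the fallback charges
   nelements extracts plus nelements - 1 scalar reduction ops.  */
#if 0
#include <stdio.h>

static int
ilog2 (int x)
{
  int r = 0;
  while (x >>= 1)
    r++;
  return r;
}

int
main (void)
{
  /* Compare the two epilogue schemes' statement counts per lane count.  */
  for (int nelements = 2; nelements <= 16; nelements *= 2)
    printf ("lanes=%2d: shift scheme=%d stmts, extract scheme=%d stmts\n",
	    nelements, ilog2 (nelements) * 2 + 1,
	    nelements + nelements - 1);
  return 0;
}
#endif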
3217
3218
3219/* Function vect_model_induction_cost.
3220
3221 Models cost for induction operations. */
3222
3223static void
3224vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3225{
4db2b577 3226 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
f97dec81 3227 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3228 unsigned inside_cost, prologue_cost;
4db2b577 3229
fb85abff 3230 /* Cost of the vector induction statements inside the loop.  */
f97dec81 3231 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3232 stmt_info, 0, vect_body);
4db2b577 3233
fb85abff 3234 /* Prologue cost for creating vec_init and vec_step.  */
f97dec81 3235 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3236 stmt_info, 0, vect_prologue);
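  /* For instance (illustrative numbers): with VF == 8 and a 4-lane
     vectype, ncopies == 2, so the model above charges two vector_stmt's
     in the loop body plus two scalar_to_vec's (for vec_init and
     vec_step) in the prologue.  */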
48e1416a 3237
6d8fb6cf 3238 if (dump_enabled_p ())
7bd765d4 3239 dump_printf_loc (MSG_NOTE, vect_location,
3240 "vect_model_induction_cost: inside_cost = %d, "
78bb46f5 3241 "prologue_cost = %d .\n", inside_cost, prologue_cost);
fb85abff 3242}
3243
3244
3245/* Function get_initial_def_for_induction
3246
3247 Input:
 3248 IV_PHI - the phi node of the induction variable in the loop that is
 3249 being vectorized; its initial value and step define the IV.
3250
3251 Output:
3252 Return a vector variable, initialized with the first VF values of
282bf14c 3253 the induction variable.  E.g., for an iv with initial value X and
48e1416a 3254 evolution S, for a vector of 4 units, we want to return:
fb85abff 3255 [X, X + S, X + 2*S, X + 3*S]. */
3256
3257static tree
3258get_initial_def_for_induction (gimple iv_phi)
3259{
3260 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3261 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3262 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
48e1416a 3263 tree vectype;
fb85abff 3264 int nunits;
3265 edge pe = loop_preheader_edge (loop);
3266 struct loop *iv_loop;
3267 basic_block new_bb;
f1f41a6c 3268 tree new_vec, vec_init, vec_step, t;
fb85abff 3269 tree new_var;
3270 tree new_name;
3271 gimple init_stmt, induction_phi, new_stmt;
3272 tree induc_def, vec_def, vec_dest;
3273 tree init_expr, step_expr;
3274 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3275 int i;
fb85abff 3276 int ncopies;
3277 tree expr;
3278 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3279 bool nested_in_vect_loop = false;
3280 gimple_seq stmts = NULL;
3281 imm_use_iterator imm_iter;
3282 use_operand_p use_p;
3283 gimple exit_phi;
3284 edge latch_e;
3285 tree loop_arg;
3286 gimple_stmt_iterator si;
3287 basic_block bb = gimple_bb (iv_phi);
f1a47479 3288 tree stepvectype;
0185abae 3289 tree resvectype;
fb85abff 3290
 3291 /* Is the phi in an inner-loop, while vectorizing an enclosing outer-loop? */
3292 if (nested_in_vect_loop_p (loop, iv_phi))
3293 {
3294 nested_in_vect_loop = true;
3295 iv_loop = loop->inner;
3296 }
3297 else
3298 iv_loop = loop;
3299 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3300
3301 latch_e = loop_latch_edge (iv_loop);
3302 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3303
e431757f 3304 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
3305 gcc_assert (step_expr != NULL_TREE);
3306
fb85abff 3307 pe = loop_preheader_edge (iv_loop);
e431757f 3308 init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3309 loop_preheader_edge (iv_loop));
fb85abff 3310
99f81ffb 3311 vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
0185abae 3312 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3313 gcc_assert (vectype);
3314 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3315 ncopies = vf / nunits;
3316
3317 gcc_assert (phi_info);
3318 gcc_assert (ncopies >= 1);
3319
e431757f 3320 /* Convert the step to the desired type. */
3321 step_expr = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3322 step_expr),
3323 &stmts, true, NULL_TREE);
3324 if (stmts)
3325 {
3326 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3327 gcc_assert (!new_bb);
3328 }
3329
0185abae 3330 /* Find the first insertion point in the BB. */
3331 si = gsi_after_labels (bb);
3332
fb85abff 3333 /* Create the vector that holds the initial_value of the induction. */
3334 if (nested_in_vect_loop)
3335 {
 3336 /* iv_loop is nested in the loop to be vectorized. init_expr has already
282bf14c 3337 been created during vectorization of previous stmts. We obtain it
3338 from the STMT_VINFO_VEC_STMT of the defining stmt. */
e431757f 3339 vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi, NULL);
abad9af1 3340 /* If the initial value is not of proper type, convert it. */
3341 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3342 {
3343 new_stmt = gimple_build_assign_with_ops
3344 (VIEW_CONVERT_EXPR,
3345 vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"),
3346 build1 (VIEW_CONVERT_EXPR, vectype, vec_init), NULL_TREE);
3347 vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3348 gimple_assign_set_lhs (new_stmt, vec_init);
3349 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3350 new_stmt);
3351 gcc_assert (!new_bb);
3352 set_vinfo_for_stmt (new_stmt,
3353 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3354 }
fb85abff 3355 }
3356 else
3357 {
f1f41a6c 3358 vec<constructor_elt, va_gc> *v;
3e299f5d 3359
fb85abff 3360 /* iv_loop is the loop to be vectorized. Create:
3361 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
99f81ffb 3362 new_var = vect_get_new_vect_var (TREE_TYPE (vectype),
3363 vect_scalar_var, "var_");
3364 new_name = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3365 init_expr),
3366 &stmts, false, new_var);
fb85abff 3367 if (stmts)
3368 {
3369 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3370 gcc_assert (!new_bb);
3371 }
3372
f1f41a6c 3373 vec_alloc (v, nunits);
6c2c88c7 3374 bool constant_p = is_gimple_min_invariant (new_name);
3e299f5d 3375 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
fb85abff 3376 for (i = 1; i < nunits; i++)
3377 {
3378 /* Create: new_name_i = new_name + step_expr */
99f81ffb 3379 new_name = fold_build2 (PLUS_EXPR, TREE_TYPE (new_name),
3380 new_name, step_expr);
6c2c88c7 3381 if (!is_gimple_min_invariant (new_name))
fb85abff 3382 {
6c2c88c7 3383 init_stmt = gimple_build_assign (new_var, new_name);
3384 new_name = make_ssa_name (new_var, init_stmt);
3385 gimple_assign_set_lhs (init_stmt, new_name);
3386 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3387 gcc_assert (!new_bb);
3388 if (dump_enabled_p ())
3389 {
3390 dump_printf_loc (MSG_NOTE, vect_location,
3391 "created new init_stmt: ");
3392 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
78bb46f5 3393 dump_printf (MSG_NOTE, "\n");
6c2c88c7 3394 }
3395 constant_p = false;
fb85abff 3396 }
3e299f5d 3397 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
fb85abff 3398 }
3399 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
6c2c88c7 3400 if (constant_p)
3401 new_vec = build_vector_from_ctor (vectype, v);
3402 else
3403 new_vec = build_constructor (vectype, v);
f1f41a6c 3404 vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
fb85abff 3405 }
3406
3407
3408 /* Create the vector that holds the step of the induction. */
3409 if (nested_in_vect_loop)
3410 /* iv_loop is nested in the loop to be vectorized. Generate:
3411 vec_step = [S, S, S, S] */
3412 new_name = step_expr;
3413 else
3414 {
3415 /* iv_loop is the loop to be vectorized. Generate:
3416 vec_step = [VF*S, VF*S, VF*S, VF*S] */
1d62df1c 3417 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3418 {
3419 expr = build_int_cst (integer_type_node, vf);
3420 expr = fold_convert (TREE_TYPE (step_expr), expr);
3421 }
3422 else
3423 expr = build_int_cst (TREE_TYPE (step_expr), vf);
f1a47479 3424 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3425 expr, step_expr);
bb0d2509 3426 if (TREE_CODE (step_expr) == SSA_NAME)
3427 new_name = vect_init_vector (iv_phi, new_name,
3428 TREE_TYPE (step_expr), NULL);
fb85abff 3429 }
3430
b797154e 3431 t = unshare_expr (new_name);
bb0d2509 3432 gcc_assert (CONSTANT_CLASS_P (new_name)
3433 || TREE_CODE (new_name) == SSA_NAME);
f1a47479 3434 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3435 gcc_assert (stepvectype);
f1f41a6c 3436 new_vec = build_vector_from_val (stepvectype, t);
3437 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
fb85abff 3438
3439
3440 /* Create the following def-use cycle:
3441 loop prolog:
3442 vec_init = ...
3443 vec_step = ...
3444 loop:
3445 vec_iv = PHI <vec_init, vec_loop>
3446 ...
3447 STMT
3448 ...
3449 vec_loop = vec_iv + vec_step; */
3450
3451 /* Create the induction-phi that defines the induction-operand. */
3452 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
fb85abff 3453 induction_phi = create_phi_node (vec_dest, iv_loop->header);
3454 set_vinfo_for_stmt (induction_phi,
37545e54 3455 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
fb85abff 3456 induc_def = PHI_RESULT (induction_phi);
3457
 3458 /* Create the iv update inside the loop.  */
3459 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3460 induc_def, vec_step);
3461 vec_def = make_ssa_name (vec_dest, new_stmt);
3462 gimple_assign_set_lhs (new_stmt, vec_def);
3463 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
48e1416a 3464 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
37545e54 3465 NULL));
fb85abff 3466
3467 /* Set the arguments of the phi node: */
60d535d2 3468 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
48e1416a 3469 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
60d535d2 3470 UNKNOWN_LOCATION);
fb85abff 3471
3472
 3473 /* In case the vectorization factor (VF) is bigger than the number
 3474 of elements that we can fit in a vectype (nunits), we have to generate
 3475 more than one vector stmt - i.e., we need to "unroll" the
3476 vector stmt by a factor VF/nunits. For more details see documentation
3477 in vectorizable_operation. */
48e1416a 3478
fb85abff 3479 if (ncopies > 1)
3480 {
3481 stmt_vec_info prev_stmt_vinfo;
3482 /* FORNOW. This restriction should be relaxed. */
3483 gcc_assert (!nested_in_vect_loop);
3484
3485 /* Create the vector that holds the step of the induction. */
1d62df1c 3486 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3487 {
3488 expr = build_int_cst (integer_type_node, nunits);
3489 expr = fold_convert (TREE_TYPE (step_expr), expr);
3490 }
3491 else
3492 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
f1a47479 3493 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3494 expr, step_expr);
bb0d2509 3495 if (TREE_CODE (step_expr) == SSA_NAME)
3496 new_name = vect_init_vector (iv_phi, new_name,
3497 TREE_TYPE (step_expr), NULL);
b797154e 3498 t = unshare_expr (new_name);
bb0d2509 3499 gcc_assert (CONSTANT_CLASS_P (new_name)
3500 || TREE_CODE (new_name) == SSA_NAME);
f1f41a6c 3501 new_vec = build_vector_from_val (stepvectype, t);
3502 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
fb85abff 3503
3504 vec_def = induc_def;
3505 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3506 for (i = 1; i < ncopies; i++)
3507 {
3508 /* vec_i = vec_prev + vec_step */
3509 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3510 vec_def, vec_step);
3511 vec_def = make_ssa_name (vec_dest, new_stmt);
3512 gimple_assign_set_lhs (new_stmt, vec_def);
39a5d6b1 3513
fb85abff 3514 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
0185abae 3515 if (!useless_type_conversion_p (resvectype, vectype))
3516 {
3517 new_stmt = gimple_build_assign_with_ops
3518 (VIEW_CONVERT_EXPR,
3519 vect_get_new_vect_var (resvectype, vect_simple_var,
3520 "vec_iv_"),
3521 build1 (VIEW_CONVERT_EXPR, resvectype,
3522 gimple_assign_lhs (new_stmt)), NULL_TREE);
3523 gimple_assign_set_lhs (new_stmt,
3524 make_ssa_name
3525 (gimple_assign_lhs (new_stmt), new_stmt));
3526 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3527 }
fb85abff 3528 set_vinfo_for_stmt (new_stmt,
37545e54 3529 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
fb85abff 3530 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
48e1416a 3531 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
fb85abff 3532 }
3533 }
3534
3535 if (nested_in_vect_loop)
3536 {
3537 /* Find the loop-closed exit-phi of the induction, and record
3538 the final vector of induction results: */
3539 exit_phi = NULL;
3540 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3541 {
0b308eee 3542 gimple use_stmt = USE_STMT (use_p);
3543 if (is_gimple_debug (use_stmt))
3544 continue;
3545
3546 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
fb85abff 3547 {
0b308eee 3548 exit_phi = use_stmt;
fb85abff 3549 break;
3550 }
3551 }
48e1416a 3552 if (exit_phi)
fb85abff 3553 {
3554 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
 3555 /* FORNOW. We do not yet support the case that an inner-loop induction
 3556 is not used in the outer-loop (i.e. is used only outside the outer-loop). */
3557 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3558 && !STMT_VINFO_LIVE_P (stmt_vinfo));
3559
3560 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
6d8fb6cf 3561 if (dump_enabled_p ())
fb85abff 3562 {
7bd765d4 3563 dump_printf_loc (MSG_NOTE, vect_location,
3564 "vector of inductions after inner-loop:");
3565 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
78bb46f5 3566 dump_printf (MSG_NOTE, "\n");
fb85abff 3567 }
3568 }
3569 }
3570
3571
6d8fb6cf 3572 if (dump_enabled_p ())
fb85abff 3573 {
7bd765d4 3574 dump_printf_loc (MSG_NOTE, vect_location,
3575 "transform induction: created def-use cycle: ");
3576 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3577 dump_printf (MSG_NOTE, "\n");
3578 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3579 SSA_NAME_DEF_STMT (vec_def), 0);
78bb46f5 3580 dump_printf (MSG_NOTE, "\n");
fb85abff 3581 }
3582
3583 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
0185abae 3584 if (!useless_type_conversion_p (resvectype, vectype))
3585 {
3586 new_stmt = gimple_build_assign_with_ops
3587 (VIEW_CONVERT_EXPR,
3588 vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
3589 build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
3590 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3591 gimple_assign_set_lhs (new_stmt, induc_def);
c3c33891 3592 si = gsi_after_labels (bb);
0185abae 3593 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
ffb35eed 3594 set_vinfo_for_stmt (new_stmt,
3595 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3596 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3597 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
0185abae 3598 }
3599
fb85abff 3600 return induc_def;
3601}
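
/* Illustrative aside (not GCC internals; guarded out of compilation): a
   plain-C sketch of the def-use cycle built above for a 4-lane integer
   IV with initial value X = 10 and step S = 3.  vec_init holds the
   first VF values, vec_step holds VF*S in every lane, and each
   vectorized iteration performs vec_iv += vec_step.  */
#if 0
#include <stdio.h>

int
main (void)
{
  enum { NUNITS = 4 };
  int x = 10, s = 3;		/* init_expr and step_expr.  */
  int vec_iv[NUNITS], vec_step[NUNITS];

  for (int i = 0; i < NUNITS; i++)
    {
      vec_iv[i] = x + i * s;	/* vec_init = [X, X+S, X+2*S, X+3*S].  */
      vec_step[i] = NUNITS * s;	/* vec_step = [VF*S, VF*S, VF*S, VF*S].  */
    }

  for (int iter = 0; iter < 3; iter++)
    {
      printf ("iter %d: [%d, %d, %d, %d]\n", iter,
	      vec_iv[0], vec_iv[1], vec_iv[2], vec_iv[3]);
      for (int i = 0; i < NUNITS; i++)
	vec_iv[i] += vec_step[i];	/* vec_loop = vec_iv + vec_step.  */
    }
  return 0;
}
#endif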
3602
3603
3604/* Function get_initial_def_for_reduction
3605
3606 Input:
3607 STMT - a stmt that performs a reduction operation in the loop.
3608 INIT_VAL - the initial value of the reduction variable
3609
3610 Output:
3611 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3612 of the reduction (used for adjusting the epilog - see below).
3613 Return a vector variable, initialized according to the operation that STMT
3614 performs. This vector will be used as the initial value of the
3615 vector of partial results.
3616
3617 Option1 (adjust in epilog): Initialize the vector as follows:
0df23b96 3618 add/bit or/xor: [0,0,...,0,0]
3619 mult/bit and: [1,1,...,1,1]
3620 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
fb85abff 3621 and when necessary (e.g. add/mult case) let the caller know
3622 that it needs to adjust the result by init_val.
3623
3624 Option2: Initialize the vector as follows:
0df23b96 3625 add/bit or/xor: [init_val,0,0,...,0]
3626 mult/bit and: [init_val,1,1,...,1]
3627 min/max/cond_expr: [init_val,init_val,...,init_val]
fb85abff 3628 and no adjustments are needed.
3629
3630 For example, for the following code:
3631
3632 s = init_val;
3633 for (i=0;i<n;i++)
3634 s = s + a[i];
3635
3636 STMT is 's = s + a[i]', and the reduction variable is 's'.
3637 For a vector of 4 units, we want to return either [0,0,0,init_val],
3638 or [0,0,0,0] and let the caller know that it needs to adjust
3639 the result at the end by 'init_val'.
3640
 3641 FORNOW, we use Option1 (the 'adjust in epilog' scheme) when
7aa0d350 3642 ADJUSTMENT_DEF is not NULL, because its initialization vector is
 3643 simpler (same element in all entries), and Option2 otherwise.
48e1416a 3644
fb85abff 3645 A cost model should help decide between these two schemes. */
3646
3647tree
48e1416a 3648get_initial_def_for_reduction (gimple stmt, tree init_val,
7aa0d350 3649 tree *adjustment_def)
fb85abff 3650{
3651 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3652 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3653 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1efcacec 3654 tree scalar_type = TREE_TYPE (init_val);
3655 tree vectype = get_vectype_for_scalar_type (scalar_type);
3656 int nunits;
fb85abff 3657 enum tree_code code = gimple_assign_rhs_code (stmt);
fb85abff 3658 tree def_for_init;
3659 tree init_def;
fadf62f4 3660 tree *elts;
fb85abff 3661 int i;
48e1416a 3662 bool nested_in_vect_loop = false;
7aa0d350 3663 tree init_value;
3664 REAL_VALUE_TYPE real_init_val = dconst0;
3665 int int_init_val = 0;
c0a0357c 3666 gimple def_stmt = NULL;
fb85abff 3667
1efcacec 3668 gcc_assert (vectype);
3669 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3670
3671 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3672 || SCALAR_FLOAT_TYPE_P (scalar_type));
7aa0d350 3673
fb85abff 3674 if (nested_in_vect_loop_p (loop, stmt))
3675 nested_in_vect_loop = true;
3676 else
3677 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3678
7aa0d350 3679 /* In case of double reduction we only create a vector variable to be put
282bf14c 3680 in the reduction phi node. The actual statement creation is done in
7aa0d350 3681 vect_create_epilog_for_reduction. */
c0a0357c 3682 if (adjustment_def && nested_in_vect_loop
3683 && TREE_CODE (init_val) == SSA_NAME
3684 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3685 && gimple_code (def_stmt) == GIMPLE_PHI
3686 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
48e1416a 3687 && vinfo_for_stmt (def_stmt)
3688 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
7aa0d350 3689 == vect_double_reduction_def)
3690 {
3691 *adjustment_def = NULL;
3692 return vect_create_destination_var (init_val, vectype);
3693 }
fb85abff 3694
7aa0d350 3695 if (TREE_CONSTANT (init_val))
3696 {
3697 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3698 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3699 else
f9ae6f95 3700 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
7aa0d350 3701 }
3702 else
3703 init_value = init_val;
fb85abff 3704
7aa0d350 3705 switch (code)
3706 {
3707 case WIDEN_SUM_EXPR:
3708 case DOT_PROD_EXPR:
a2287001 3709 case SAD_EXPR:
7aa0d350 3710 case PLUS_EXPR:
3711 case MINUS_EXPR:
3712 case BIT_IOR_EXPR:
3713 case BIT_XOR_EXPR:
3714 case MULT_EXPR:
3715 case BIT_AND_EXPR:
48e1416a 3716 /* ADJUSTMENT_DEF is NULL when called from
7aa0d350 3717 vect_create_epilog_for_reduction to vectorize double reduction. */
3718 if (adjustment_def)
3719 {
3720 if (nested_in_vect_loop)
48e1416a 3721 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
7aa0d350 3722 NULL);
3723 else
3724 *adjustment_def = init_val;
3725 }
3726
b036fcd8 3727 if (code == MULT_EXPR)
7aa0d350 3728 {
3729 real_init_val = dconst1;
3730 int_init_val = 1;
3731 }
3732
b036fcd8 3733 if (code == BIT_AND_EXPR)
3734 int_init_val = -1;
3735
7aa0d350 3736 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3737 def_for_init = build_real (scalar_type, real_init_val);
3738 else
3739 def_for_init = build_int_cst (scalar_type, int_init_val);
3740
48e1416a 3741 /* Fill all vector elements but the first with '0' or '1'.  */
fadf62f4 3742 elts = XALLOCAVEC (tree, nunits);
7aa0d350 3743 for (i = nunits - 2; i >= 0; --i)
fadf62f4 3744 elts[i + 1] = def_for_init;
7aa0d350 3745
3746 /* Option1: the first element is '0' or '1' as well. */
3747 if (adjustment_def)
3748 {
fadf62f4 3749 elts[0] = def_for_init;
3750 init_def = build_vector (vectype, elts);
7aa0d350 3751 break;
3752 }
3753
3754 /* Option2: the first element is INIT_VAL. */
fadf62f4 3755 elts[0] = init_val;
7aa0d350 3756 if (TREE_CONSTANT (init_val))
fadf62f4 3757 init_def = build_vector (vectype, elts);
7aa0d350 3758 else
fadf62f4 3759 {
f1f41a6c 3760 vec<constructor_elt, va_gc> *v;
3761 vec_alloc (v, nunits);
fadf62f4 3762 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3763 for (i = 1; i < nunits; ++i)
3764 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3765 init_def = build_constructor (vectype, v);
3766 }
7aa0d350 3767
3768 break;
3769
3770 case MIN_EXPR:
3771 case MAX_EXPR:
0df23b96 3772 case COND_EXPR:
7aa0d350 3773 if (adjustment_def)
3774 {
3775 *adjustment_def = NULL_TREE;
3776 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3777 break;
3778 }
3779
b797154e 3780 init_def = build_vector_from_val (vectype, init_value);
7aa0d350 3781 break;
3782
3783 default:
3784 gcc_unreachable ();
3785 }
fb85abff 3786
3787 return init_def;
3788}
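
/* Illustrative aside (not GCC internals; guarded out of compilation): a
   plain-C comparison of the two initialization schemes above for a
   4-lane sum reduction with init_val == 5 over eight elements.  Option1
   starts from [0,0,0,0] and adds init_val during the epilogue; Option2
   starts from [init_val,0,0,0] and needs no adjustment.  Both produce
   init_val plus the sum of the input, 41 here.  */
#if 0
#include <stdio.h>

int
main (void)
{
  enum { NUNITS = 4, N = 8 };
  int a[N] = { 1, 2, 3, 4, 5, 6, 7, 8 };
  int init_val = 5;
  int opt1[NUNITS] = { 0, 0, 0, 0 };		/* Option1.  */
  int opt2[NUNITS] = { init_val, 0, 0, 0 };	/* Option2.  */

  /* The strip-mined loop accumulates per-lane partial sums.  */
  for (int i = 0; i < N; i += NUNITS)
    for (int j = 0; j < NUNITS; j++)
      {
	opt1[j] += a[i + j];
	opt2[j] += a[i + j];
      }

  /* Epilogue: reduce the lanes; Option1 additionally adds init_val.  */
  int s1 = 0, s2 = 0;
  for (int j = 0; j < NUNITS; j++)
    {
      s1 += opt1[j];
      s2 += opt2[j];
    }
  s1 += init_val;				/* ADJUSTMENT_DEF.  */

  printf ("option1 = %d, option2 = %d\n", s1, s2);	/* Both print 41.  */
  return 0;
}
#endif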
3789
3790
3791/* Function vect_create_epilog_for_reduction
48e1416a 3792
fb85abff 3793 Create code at the loop-epilog to finalize the result of a reduction
eefa05c8 3794 computation.
3795
 3796 VECT_DEFS is the list of vectors of partial results, i.e., the lhs's of
 3797 the vector reduction statements.
3798 STMT is the scalar reduction stmt that is being vectorized.
fb85abff 3799 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
282bf14c 3800 number of elements that we can fit in a vectype (nunits). In this case
fb85abff 3801 we have to generate more than one vector stmt - i.e., we need to "unroll"
3802 the vector stmt by a factor VF/nunits. For more details see documentation
3803 in vectorizable_operation.
eefa05c8 3804 REDUC_CODE is the tree-code for the epilog reduction.
3805 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3806 computation.
3807 REDUC_INDEX is the index of the operand in the right hand side of the
ade2ac53 3808 statement that is defined by REDUCTION_PHI.
7aa0d350 3809 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
eefa05c8 3810 SLP_NODE is an SLP node containing a group of reduction statements. The
3811 first one in this group is STMT.
fb85abff 3812
3813 This function:
eefa05c8 3814 1. Creates the reduction def-use cycles: sets the arguments for
3815 REDUCTION_PHIS:
fb85abff 3816 The loop-entry argument is the vectorized initial-value of the reduction.
eefa05c8 3817 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3818 sums.
3819 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3820 by applying the operation specified by REDUC_CODE if available, or by
fb85abff 3821 other means (whole-vector shifts or a scalar loop).
48e1416a 3822 The function also creates a new phi node at the loop exit to preserve
fb85abff 3823 loop-closed form, as illustrated below.
48e1416a 3824
fb85abff 3825 The flow at the entry to this function:
48e1416a 3826
fb85abff 3827 loop:
3828 vec_def = phi <null, null> # REDUCTION_PHI
3829 VECT_DEF = vector_stmt # vectorized form of STMT
3830 s_loop = scalar_stmt # (scalar) STMT
3831 loop_exit:
3832 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3833 use <s_out0>
3834 use <s_out0>
3835
3836 The above is transformed by this function into:
3837
3838 loop:
3839 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3840 VECT_DEF = vector_stmt # vectorized form of STMT
48e1416a 3841 s_loop = scalar_stmt # (scalar) STMT
fb85abff 3842 loop_exit:
3843 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3844 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3845 v_out2 = reduce <v_out1>
3846 s_out3 = extract_field <v_out2, 0>
3847 s_out4 = adjust_result <s_out3>
3848 use <s_out4>
3849 use <s_out4>
3850*/
3851
3852static void
f1f41a6c 3853vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
eefa05c8 3854 int ncopies, enum tree_code reduc_code,
f1f41a6c 3855 vec<gimple> reduction_phis,
eefa05c8 3856 int reduc_index, bool double_reduc,
3857 slp_tree slp_node)
fb85abff 3858{
3859 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3860 stmt_vec_info prev_phi_info;
3861 tree vectype;
3754d046 3862 machine_mode mode;
fb85abff 3863 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7aa0d350 3864 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
fb85abff 3865 basic_block exit_bb;
3866 tree scalar_dest;
3867 tree scalar_type;
3868 gimple new_phi = NULL, phi;
3869 gimple_stmt_iterator exit_gsi;
3870 tree vec_dest;
eefa05c8 3871 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
fb85abff 3872 gimple epilog_stmt = NULL;
eefa05c8 3873 enum tree_code code = gimple_assign_rhs_code (stmt);
fb85abff 3874 gimple exit_phi;
f018d957 3875 tree bitsize, bitpos;
eefa05c8 3876 tree adjustment_def = NULL;
3877 tree vec_initial_def = NULL;
3878 tree reduction_op, expr, def;
3879 tree orig_name, scalar_result;
b219ece3 3880 imm_use_iterator imm_iter, phi_imm_iter;
3881 use_operand_p use_p, phi_use_p;
fb85abff 3882 bool extract_scalar_result = false;
eefa05c8 3883 gimple use_stmt, orig_stmt, reduction_phi = NULL;
fb85abff 3884 bool nested_in_vect_loop = false;
c2078b80 3885 auto_vec<gimple> new_phis;
3886 auto_vec<gimple> inner_phis;
fb85abff 3887 enum vect_def_type dt = vect_unknown_def_type;
3888 int j, i;
c2078b80 3889 auto_vec<tree> scalar_results;
47deb25f 3890 unsigned int group_size = 1, k, ratio;
c2078b80 3891 auto_vec<tree> vec_initial_defs;
3892 auto_vec<gimple> phis;
39a5d6b1 3893 bool slp_reduc = false;
3894 tree new_phi_result;
58045f90 3895 gimple inner_phi = NULL;
eefa05c8 3896
3897 if (slp_node)
f1f41a6c 3898 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
48e1416a 3899
fb85abff 3900 if (nested_in_vect_loop_p (loop, stmt))
3901 {
7aa0d350 3902 outer_loop = loop;
fb85abff 3903 loop = loop->inner;
3904 nested_in_vect_loop = true;
eefa05c8 3905 gcc_assert (!slp_node);
fb85abff 3906 }
48e1416a 3907
fb85abff 3908 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3909 {
3910 case GIMPLE_SINGLE_RHS:
48e1416a 3911 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
c86930b0 3912 == ternary_op);
ade2ac53 3913 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
fb85abff 3914 break;
3915 case GIMPLE_UNARY_RHS:
3916 reduction_op = gimple_assign_rhs1 (stmt);
3917 break;
3918 case GIMPLE_BINARY_RHS:
48e1416a 3919 reduction_op = reduc_index ?
ade2ac53 3920 gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
fb85abff 3921 break;
c86930b0 3922 case GIMPLE_TERNARY_RHS:
3923 reduction_op = gimple_op (stmt, reduc_index + 1);
3924 break;
fb85abff 3925 default:
3926 gcc_unreachable ();
3927 }
3928
3929 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3930 gcc_assert (vectype);
3931 mode = TYPE_MODE (vectype);
3932
eefa05c8 3933 /* 1. Create the reduction def-use cycle:
3934 Set the arguments of REDUCTION_PHIS, i.e., transform
48e1416a 3935
eefa05c8 3936 loop:
3937 vec_def = phi <null, null> # REDUCTION_PHI
3938 VECT_DEF = vector_stmt # vectorized form of STMT
3939 ...
fb85abff 3940
eefa05c8 3941 into:
3942
3943 loop:
3944 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3945 VECT_DEF = vector_stmt # vectorized form of STMT
3946 ...
3947
3948 (in case of SLP, do it for all the phis). */
3949
3950 /* Get the loop-entry arguments. */
3951 if (slp_node)
b0f64919 3952 vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
3953 NULL, slp_node, reduc_index);
eefa05c8 3954 else
3955 {
f1f41a6c 3956 vec_initial_defs.create (1);
eefa05c8 3957 /* For the case of reduction, vect_get_vec_def_for_operand returns
3958 the scalar def before the loop, that defines the initial value
3959 of the reduction variable. */
3960 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3961 &adjustment_def);
f1f41a6c 3962 vec_initial_defs.quick_push (vec_initial_def);
eefa05c8 3963 }
3964
3965 /* Set phi nodes arguments. */
f1f41a6c 3966 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
fb85abff 3967 {
aae8f543 3968 tree vec_init_def, def;
3969 gimple_seq stmts;
3970 vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
3971 true, NULL_TREE);
3972 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
3973 def = vect_defs[i];
eefa05c8 3974 for (j = 0; j < ncopies; j++)
3975 {
3976 /* Set the loop-entry arg of the reduction-phi. */
3977 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
60d535d2 3978 UNKNOWN_LOCATION);
fb85abff 3979
eefa05c8 3980 /* Set the loop-latch arg for the reduction-phi. */
3981 if (j > 0)
3982 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
fb85abff 3983
60d535d2 3984 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
fb85abff 3985
6d8fb6cf 3986 if (dump_enabled_p ())
eefa05c8 3987 {
7bd765d4 3988 dump_printf_loc (MSG_NOTE, vect_location,
3989 "transform reduction: created def-use cycle: ");
3990 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
3991 dump_printf (MSG_NOTE, "\n");
3992 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
78bb46f5 3993 dump_printf (MSG_NOTE, "\n");
eefa05c8 3994 }
3995
3996 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3997 }
fb85abff 3998 }
3999
eefa05c8 4000 /* 2. Create epilog code.
4001 The reduction epilog code operates across the elements of the vector
4002 of partial results computed by the vectorized loop.
4003 The reduction epilog code consists of:
fb85abff 4004
eefa05c8 4005 step 1: compute the scalar result in a vector (v_out2)
4006 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4007 step 3: adjust the scalar result (s_out3) if needed.
4008
 4009 Step 1 can be accomplished using one of the following three schemes:
fb85abff 4010 (scheme 1) using reduc_code, if available.
4011 (scheme 2) using whole-vector shifts, if available.
48e1416a 4012 (scheme 3) using a scalar loop. In this case steps 1+2 above are
fb85abff 4013 combined.
48e1416a 4014
fb85abff 4015 The overall epilog code looks like this:
4016
4017 s_out0 = phi <s_loop> # original EXIT_PHI
4018 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4019 v_out2 = reduce <v_out1> # step 1
4020 s_out3 = extract_field <v_out2, 0> # step 2
4021 s_out4 = adjust_result <s_out3> # step 3
4022
4023 (step 3 is optional, and steps 1 and 2 may be combined).
eefa05c8 4024 Lastly, the uses of s_out0 are replaced by s_out4. */
fb85abff 4025
fb85abff 4026
eefa05c8 4027 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4028 v_out1 = phi <VECT_DEF>
4029 Store them in NEW_PHIS. */
fb85abff 4030
4031 exit_bb = single_exit (loop)->dest;
fb85abff 4032 prev_phi_info = NULL;
f1f41a6c 4033 new_phis.create (vect_defs.length ());
4034 FOR_EACH_VEC_ELT (vect_defs, i, def)
fb85abff 4035 {
eefa05c8 4036 for (j = 0; j < ncopies; j++)
4037 {
874117c8 4038 tree new_def = copy_ssa_name (def, NULL);
4039 phi = create_phi_node (new_def, exit_bb);
eefa05c8 4040 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
4041 if (j == 0)
f1f41a6c 4042 new_phis.quick_push (phi);
eefa05c8 4043 else
4044 {
4045 def = vect_get_vec_def_for_stmt_copy (dt, def);
4046 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4047 }
4048
4049 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4050 prev_phi_info = vinfo_for_stmt (phi);
4051 }
fb85abff 4052 }
ade2ac53 4053
b219ece3 4054 /* The epilogue is created for the outer-loop, i.e., for the loop being
58045f90 4055 vectorized. Create exit phis for the outer loop. */
b219ece3 4056 if (double_reduc)
4057 {
4058 loop = outer_loop;
4059 exit_bb = single_exit (loop)->dest;
f1f41a6c 4060 inner_phis.create (vect_defs.length ());
4061 FOR_EACH_VEC_ELT (new_phis, i, phi)
58045f90 4062 {
874117c8 4063 tree new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
4064 gimple outer_phi = create_phi_node (new_result, exit_bb);
58045f90 4065 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4066 PHI_RESULT (phi));
4067 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4068 loop_vinfo, NULL));
f1f41a6c 4069 inner_phis.quick_push (phi);
4070 new_phis[i] = outer_phi;
58045f90 4071 prev_phi_info = vinfo_for_stmt (outer_phi);
4072 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4073 {
4074 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
874117c8 4075 new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
4076 outer_phi = create_phi_node (new_result, exit_bb);
58045f90 4077 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4078 PHI_RESULT (phi));
4079 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4080 loop_vinfo, NULL));
4081 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4082 prev_phi_info = vinfo_for_stmt (outer_phi);
4083 }
4084 }
b219ece3 4085 }
4086
fb85abff 4087 exit_gsi = gsi_after_labels (exit_bb);
4088
48e1416a 4089 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
fb85abff 4090 (i.e. when reduc_code is not available) and in the final adjustment
4091 code (if needed). Also get the original scalar reduction variable as
48e1416a 4092 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4093 represents a reduction pattern), the tree-code and scalar-def are
4094 taken from the original stmt that the pattern-stmt (STMT) replaces.
fb85abff 4095 Otherwise (it is a regular reduction) - the tree-code and scalar-def
48e1416a 4096 are taken from STMT. */
fb85abff 4097
4098 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4099 if (!orig_stmt)
4100 {
4101 /* Regular reduction */
4102 orig_stmt = stmt;
4103 }
4104 else
4105 {
4106 /* Reduction pattern */
4107 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4108 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4109 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4110 }
ade2ac53 4111
fb85abff 4112 code = gimple_assign_rhs_code (orig_stmt);
eefa05c8 4113 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4114 partial results are added and not subtracted. */
4115 if (code == MINUS_EXPR)
4116 code = PLUS_EXPR;
4117
fb85abff 4118 scalar_dest = gimple_assign_lhs (orig_stmt);
4119 scalar_type = TREE_TYPE (scalar_dest);
f1f41a6c 4120 scalar_results.create (group_size);
fb85abff 4121 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4122 bitsize = TYPE_SIZE (scalar_type);
fb85abff 4123
fb85abff 4124 /* In case this is a reduction in an inner-loop while vectorizing an outer
4125 loop - we don't need to extract a single scalar result at the end of the
7aa0d350 4126 inner-loop (unless it is double reduction, i.e., the use of reduction is
282bf14c 4127 outside the outer-loop). The final vector of partial results will be used
7aa0d350 4128 in the vectorized outer-loop, or reduced to a scalar result at the end of
4129 the outer-loop. */
4130 if (nested_in_vect_loop && !double_reduc)
fb85abff 4131 goto vect_finalize_reduction;
4132
39a5d6b1 4133 /* SLP reduction without reduction chain, e.g.,
4134 # a1 = phi <a2, a0>
4135 # b1 = phi <b2, b0>
4136 a2 = operation (a1)
4137 b2 = operation (b1) */
4138 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4139
4140 /* In case of reduction chain, e.g.,
4141 # a1 = phi <a3, a0>
4142 a2 = operation (a1)
4143 a3 = operation (a2),
4144
4145 we may end up with more than one vector result. Here we reduce them to
4146 one vector. */
4147 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4148 {
f1f41a6c 4149 tree first_vect = PHI_RESULT (new_phis[0]);
39a5d6b1 4150 tree tmp;
2f4ce795 4151 gimple new_vec_stmt = NULL;
39a5d6b1 4152
4153 vec_dest = vect_create_destination_var (scalar_dest, vectype);
f1f41a6c 4154 for (k = 1; k < new_phis.length (); k++)
39a5d6b1 4155 {
f1f41a6c 4156 gimple next_phi = new_phis[k];
39a5d6b1 4157 tree second_vect = PHI_RESULT (next_phi);
39a5d6b1 4158
4159 tmp = build2 (code, vectype, first_vect, second_vect);
4160 new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4161 first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4162 gimple_assign_set_lhs (new_vec_stmt, first_vect);
4163 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4164 }
4165
4166 new_phi_result = first_vect;
2f4ce795 4167 if (new_vec_stmt)
4168 {
f1f41a6c 4169 new_phis.truncate (0);
4170 new_phis.safe_push (new_vec_stmt);
2f4ce795 4171 }
39a5d6b1 4172 }
4173 else
f1f41a6c 4174 new_phi_result = PHI_RESULT (new_phis[0]);
39a5d6b1 4175
fb85abff 4176 /* 2.3 Create the reduction code, using one of the three schemes described
eefa05c8 4177 above. In SLP we simply need to extract all the elements from the
 4178 vector (without reducing them), so we use per-element extracts. */
39a5d6b1 4179 if (reduc_code != ERROR_MARK && !slp_reduc)
fb85abff 4180 {
4181 tree tmp;
7ba68b18 4182 tree vec_elem_type;
fb85abff 4183
4184 /*** Case 1: Create:
eefa05c8 4185 v_out2 = reduc_expr <v_out1> */
fb85abff 4186
6d8fb6cf 4187 if (dump_enabled_p ())
7bd765d4 4188 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 4189 "Reduce using direct vector reduction.\n");
fb85abff 4190
7ba68b18 4191 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4192 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4193 {
4194 tree tmp_dest =
4195 vect_create_destination_var (scalar_dest, vec_elem_type);
4196 tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
4197 epilog_stmt = gimple_build_assign (tmp_dest, tmp);
4198 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4199 gimple_assign_set_lhs (epilog_stmt, new_temp);
4200 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4201
4202 tmp = build1 (NOP_EXPR, scalar_type, new_temp);
4203 }
4204 else
4205 tmp = build1 (reduc_code, scalar_type, new_phi_result);
4206 epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
4207 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
fb85abff 4208 gimple_assign_set_lhs (epilog_stmt, new_temp);
4209 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
7ba68b18 4210 scalar_results.safe_push (new_temp);
fb85abff 4211 }
4212 else
4213 {
bc620c5c 4214 enum tree_code shift_code = ERROR_MARK;
fb85abff 4215 bool have_whole_vector_shift = true;
4216 int bit_offset;
e913b5cd 4217 int element_bitsize = tree_to_uhwi (bitsize);
4218 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
fb85abff 4219 tree vec_temp;
4220
d6bf3b14 4221 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
eefa05c8 4222 shift_code = VEC_RSHIFT_EXPR;
fb85abff 4223 else
eefa05c8 4224 have_whole_vector_shift = false;
fb85abff 4225
4226 /* Regardless of whether we have a whole vector shift, if we're
eefa05c8 4227 emulating the operation via tree-vect-generic, we don't want
4228 to use it. Only the first round of the reduction is likely
4229 to still be profitable via emulation. */
fb85abff 4230 /* ??? It might be better to emit a reduction tree code here, so that
eefa05c8 4231 tree-vect-generic can expand the first round via bit tricks. */
fb85abff 4232 if (!VECTOR_MODE_P (mode))
eefa05c8 4233 have_whole_vector_shift = false;
fb85abff 4234 else
fb85abff 4235 {
eefa05c8 4236 optab optab = optab_for_tree_code (code, vectype, optab_default);
d6bf3b14 4237 if (optab_handler (optab, mode) == CODE_FOR_nothing)
eefa05c8 4238 have_whole_vector_shift = false;
4239 }
fb85abff 4240
39a5d6b1 4241 if (have_whole_vector_shift && !slp_reduc)
eefa05c8 4242 {
4243 /*** Case 2: Create:
4244 for (offset = VS/2; offset >= element_size; offset/=2)
4245 {
4246 Create: va' = vec_shift <va, offset>
4247 Create: va = vop <va, va'>
4248 } */
fb85abff 4249
6d8fb6cf 4250 if (dump_enabled_p ())
7bd765d4 4251 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 4252 "Reduce using vector shifts\n");
eefa05c8 4253
4254 vec_dest = vect_create_destination_var (scalar_dest, vectype);
39a5d6b1 4255 new_temp = new_phi_result;
eefa05c8 4256 for (bit_offset = vec_size_in_bits/2;
4257 bit_offset >= element_bitsize;
4258 bit_offset /= 2)
4259 {
4260 tree bitpos = size_int (bit_offset);
4261
4262 epilog_stmt = gimple_build_assign_with_ops (shift_code,
4263 vec_dest, new_temp, bitpos);
4264 new_name = make_ssa_name (vec_dest, epilog_stmt);
4265 gimple_assign_set_lhs (epilog_stmt, new_name);
4266 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4267
4268 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
4269 new_name, new_temp);
4270 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4271 gimple_assign_set_lhs (epilog_stmt, new_temp);
4272 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4273 }
fb85abff 4274
eefa05c8 4275 extract_scalar_result = true;
4276 }
fb85abff 4277 else
4278 {
eefa05c8 4279 tree rhs;
4280
4281 /*** Case 3: Create:
4282 s = extract_field <v_out2, 0>
4283 for (offset = element_size;
4284 offset < vector_size;
4285 offset += element_size;)
4286 {
4287 Create: s' = extract_field <v_out2, offset>
4288 Create: s = op <s, s'> // For non SLP cases
4289 } */
fb85abff 4290
6d8fb6cf 4291 if (dump_enabled_p ())
7bd765d4 4292 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 4293 "Reduce using scalar code.\n");
fb85abff 4294
e913b5cd 4295 vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
f1f41a6c 4296 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
eefa05c8 4297 {
2f4ce795 4298 if (gimple_code (new_phi) == GIMPLE_PHI)
4299 vec_temp = PHI_RESULT (new_phi);
4300 else
4301 vec_temp = gimple_assign_lhs (new_phi);
eefa05c8 4302 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4303 bitsize_zero_node);
4304 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4305 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4306 gimple_assign_set_lhs (epilog_stmt, new_temp);
4307 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4308
4309 /* In SLP we don't need to apply reduction operation, so we just
4310 collect s' values in SCALAR_RESULTS. */
39a5d6b1 4311 if (slp_reduc)
f1f41a6c 4312 scalar_results.safe_push (new_temp);
eefa05c8 4313
4314 for (bit_offset = element_bitsize;
4315 bit_offset < vec_size_in_bits;
4316 bit_offset += element_bitsize)
4317 {
4318 tree bitpos = bitsize_int (bit_offset);
4319 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4320 bitsize, bitpos);
4321
4322 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4323 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4324 gimple_assign_set_lhs (epilog_stmt, new_name);
4325 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4326
39a5d6b1 4327 if (slp_reduc)
eefa05c8 4328 {
4329 /* In SLP we don't need to apply reduction operation, so
4330 we just collect s' values in SCALAR_RESULTS. */
4331 new_temp = new_name;
f1f41a6c 4332 scalar_results.safe_push (new_name);
eefa05c8 4333 }
4334 else
4335 {
4336 epilog_stmt = gimple_build_assign_with_ops (code,
4337 new_scalar_dest, new_name, new_temp);
4338 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4339 gimple_assign_set_lhs (epilog_stmt, new_temp);
4340 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4341 }
4342 }
4343 }
4344
 4345 /* The only case where we need to reduce scalar results in SLP is
282bf14c 4346 unrolling.  If the size of SCALAR_RESULTS is greater than
eefa05c8 4347 GROUP_SIZE, we reduce them by combining elements modulo
 4348 GROUP_SIZE. */
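	  /* E.g. (illustrative): with GROUP_SIZE == 2 and four scalar
	     results, the loop below folds scalar_results[2] into
	     scalar_results[0] and scalar_results[3] into
	     scalar_results[1].  */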
39a5d6b1 4349 if (slp_reduc)
eefa05c8 4350 {
4351 tree res, first_res, new_res;
4352 gimple new_stmt;
4353
4354 /* Reduce multiple scalar results in case of SLP unrolling. */
f1f41a6c 4355 for (j = group_size; scalar_results.iterate (j, &res);
eefa05c8 4356 j++)
4357 {
f1f41a6c 4358 first_res = scalar_results[j % group_size];
eefa05c8 4359 new_stmt = gimple_build_assign_with_ops (code,
4360 new_scalar_dest, first_res, res);
4361 new_res = make_ssa_name (new_scalar_dest, new_stmt);
4362 gimple_assign_set_lhs (new_stmt, new_res);
4363 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
f1f41a6c 4364 scalar_results[j % group_size] = new_res;
eefa05c8 4365 }
4366 }
4367 else
4368 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
f1f41a6c 4369 scalar_results.safe_push (new_temp);
eefa05c8 4370
4371 extract_scalar_result = false;
4372 }
fb85abff 4373 }
4374
4375 /* 2.4 Extract the final scalar result. Create:
eefa05c8 4376 s_out3 = extract_field <v_out2, bitpos> */
48e1416a 4377
fb85abff 4378 if (extract_scalar_result)
4379 {
4380 tree rhs;
4381
6d8fb6cf 4382 if (dump_enabled_p ())
7bd765d4 4383 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 4384 "extract scalar result\n");
fb85abff 4385
4386 if (BYTES_BIG_ENDIAN)
eefa05c8 4387 bitpos = size_binop (MULT_EXPR,
4388 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
4389 TYPE_SIZE (scalar_type));
fb85abff 4390 else
eefa05c8 4391 bitpos = bitsize_zero_node;
fb85abff 4392
4393 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
4394 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4395 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4396 gimple_assign_set_lhs (epilog_stmt, new_temp);
4397 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
f1f41a6c 4398 scalar_results.safe_push (new_temp);
fb85abff 4399 }
eefa05c8 4400
fb85abff 4401vect_finalize_reduction:
4402
b219ece3 4403 if (double_reduc)
4404 loop = loop->inner;
4405
fb85abff 4406 /* 2.5 Adjust the final result by the initial value of the reduction
4407 variable. (When such adjustment is not needed, then
4408 'adjustment_def' is zero). For example, if code is PLUS we create:
4409 new_temp = loop_exit_def + adjustment_def */
4410
4411 if (adjustment_def)
4412 {
39a5d6b1 4413 gcc_assert (!slp_reduc);
fb85abff 4414 if (nested_in_vect_loop)
4415 {
f1f41a6c 4416 new_phi = new_phis[0];
fb85abff 4417 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4418 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4419 new_dest = vect_create_destination_var (scalar_dest, vectype);
4420 }
4421 else
4422 {
f1f41a6c 4423 new_temp = scalar_results[0];
fb85abff 4424 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4425 expr = build2 (code, scalar_type, new_temp, adjustment_def);
4426 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4427 }
ade2ac53 4428
fb85abff 4429 epilog_stmt = gimple_build_assign (new_dest, expr);
4430 new_temp = make_ssa_name (new_dest, epilog_stmt);
4431 gimple_assign_set_lhs (epilog_stmt, new_temp);
fb85abff 4432 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
eefa05c8 4433 if (nested_in_vect_loop)
4434 {
4435 set_vinfo_for_stmt (epilog_stmt,
4436 new_stmt_vec_info (epilog_stmt, loop_vinfo,
4437 NULL));
4438 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4439 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4440
4441 if (!double_reduc)
f1f41a6c 4442 scalar_results.quick_push (new_temp);
eefa05c8 4443 else
f1f41a6c 4444 scalar_results[0] = new_temp;
eefa05c8 4445 }
4446 else
f1f41a6c 4447 scalar_results[0] = new_temp;
eefa05c8 4448
f1f41a6c 4449 new_phis[0] = epilog_stmt;
fb85abff 4450 }
4451
282bf14c 4452 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
eefa05c8 4453 phis with new adjusted scalar results, i.e., replace use <s_out0>
4454 with use <s_out4>.
fb85abff 4455
eefa05c8 4456 Transform:
4457 loop_exit:
4458 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4459 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4460 v_out2 = reduce <v_out1>
4461 s_out3 = extract_field <v_out2, 0>
4462 s_out4 = adjust_result <s_out3>
4463 use <s_out0>
4464 use <s_out0>
4465
4466 into:
fb85abff 4467
eefa05c8 4468 loop_exit:
4469 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4470 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4471 v_out2 = reduce <v_out1>
4472 s_out3 = extract_field <v_out2, 0>
4473 s_out4 = adjust_result <s_out3>
47deb25f 4474 use <s_out4>
4475 use <s_out4> */
eefa05c8 4476
39a5d6b1 4477
 4478 /* In an SLP reduction chain we reduce vector results into one vector if
 4479 necessary, hence we set GROUP_SIZE to 1 here.  SCALAR_DEST is the LHS of
4480 the last stmt in the reduction chain, since we are looking for the loop
4481 exit phi node. */
4482 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4483 {
f1f41a6c 4484 scalar_dest = gimple_assign_lhs (
4485 SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
39a5d6b1 4486 group_size = 1;
4487 }
4488
eefa05c8 4489 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
282bf14c 4490 case GROUP_SIZE is greater than the vectorization factor).  Therefore, we
4491 need to match SCALAR_RESULTS with corresponding statements. The first
eefa05c8 4492 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4493 the first vector stmt, etc.
4494 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
f1f41a6c 4495 if (group_size > new_phis.length ())
47deb25f 4496 {
f1f41a6c 4497 ratio = group_size / new_phis.length ();
4498 gcc_assert (!(group_size % new_phis.length ()));
47deb25f 4499 }
4500 else
4501 ratio = 1;
eefa05c8 4502
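  /* For instance (illustrative): with GROUP_SIZE == 4 and two vector
     statements in NEW_PHIS, RATIO == 2, so the loop below pairs
     scalar_results[0..1] with new_phis[0]/reduction_phis[0] and
     scalar_results[2..3] with new_phis[1]/reduction_phis[1].  */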
4503 for (k = 0; k < group_size; k++)
fb85abff 4504 {
eefa05c8 4505 if (k % ratio == 0)
4506 {
f1f41a6c 4507 epilog_stmt = new_phis[k / ratio];
4508 reduction_phi = reduction_phis[k / ratio];
58045f90 4509 if (double_reduc)
f1f41a6c 4510 inner_phi = inner_phis[k / ratio];
eefa05c8 4511 }
7aa0d350 4512
39a5d6b1 4513 if (slp_reduc)
eefa05c8 4514 {
f1f41a6c 4515 gimple current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
fb85abff 4516
eefa05c8 4517 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4518 /* SLP statements can't participate in patterns. */
4519 gcc_assert (!orig_stmt);
4520 scalar_dest = gimple_assign_lhs (current_stmt);
4521 }
4522
f1f41a6c 4523 phis.create (3);
eefa05c8 4524 /* Find the loop-closed-use at the loop exit of the original scalar
282bf14c 4525 result. (The reduction result is expected to have two immediate uses -
eefa05c8 4526 one at the latch block, and one at the loop exit). */
4527 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
f898e094 4528 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
4529 && !is_gimple_debug (USE_STMT (use_p)))
f1f41a6c 4530 phis.safe_push (USE_STMT (use_p));
eefa05c8 4531
1d4bc0bb 4532 /* While we expect to have found an exit_phi because of loop-closed-ssa
 4533 form, we can end up without one if the scalar cycle is dead. */
eefa05c8 4534
f1f41a6c 4535 FOR_EACH_VEC_ELT (phis, i, exit_phi)
eefa05c8 4536 {
4537 if (outer_loop)
7aa0d350 4538 {
eefa05c8 4539 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4540 gimple vect_phi;
4541
 4542 /* FORNOW. We do not yet support the case that an inner-loop
 4543 reduction is not used in the outer-loop (but only outside the
 4544 outer-loop), unless it is a double reduction. */
4545 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4546 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4547 || double_reduc);
4548
4549 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4550 if (!double_reduc
4551 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4552 != vect_double_reduction_def)
7aa0d350 4553 continue;
4554
eefa05c8 4555 /* Handle double reduction:
7aa0d350 4556
eefa05c8 4557 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
4558 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4559 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
4560 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
7aa0d350 4561
eefa05c8 4562 At that point the regular reduction (stmt2 and stmt3) is
4563 already vectorized, as well as the exit phi node, stmt4.
4564 Here we vectorize the phi node of double reduction, stmt1, and
4565 update all relevant statements. */
7aa0d350 4566
eefa05c8 4567 /* Go through all the uses of s2 to find double reduction phi
4568 node, i.e., stmt1 above. */
4569 orig_name = PHI_RESULT (exit_phi);
4570 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
7aa0d350 4571 {
f83623cc 4572 stmt_vec_info use_stmt_vinfo;
eefa05c8 4573 stmt_vec_info new_phi_vinfo;
4574 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4575 basic_block bb = gimple_bb (use_stmt);
4576 gimple use;
4577
 4578 /* Check that USE_STMT is really a double reduction phi
4579 node. */
4580 if (gimple_code (use_stmt) != GIMPLE_PHI
4581 || gimple_phi_num_args (use_stmt) != 2
eefa05c8 4582 || bb->loop_father != outer_loop)
4583 continue;
f83623cc 4584 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4585 if (!use_stmt_vinfo
4586 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4587 != vect_double_reduction_def)
4588 continue;
eefa05c8 4589
4590 /* Create vector phi node for double reduction:
4591 vs1 = phi <vs0, vs2>
4592 vs1 was created previously in this function by a call to
4593 vect_get_vec_def_for_operand and is stored in
4594 vec_initial_def;
58045f90 4595 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
eefa05c8 4596 vs0 is created here. */
4597
4598 /* Create vector phi node. */
4599 vect_phi = create_phi_node (vec_initial_def, bb);
4600 new_phi_vinfo = new_stmt_vec_info (vect_phi,
4601 loop_vec_info_for_loop (outer_loop), NULL);
4602 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4603
4604 /* Create vs0 - initial def of the double reduction phi. */
4605 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4606 loop_preheader_edge (outer_loop));
4607 init_def = get_initial_def_for_reduction (stmt,
4608 preheader_arg, NULL);
4609 vect_phi_init = vect_init_vector (use_stmt, init_def,
4610 vectype, NULL);
4611
4612 /* Update phi node arguments with vs0 and vs2. */
4613 add_phi_arg (vect_phi, vect_phi_init,
4614 loop_preheader_edge (outer_loop),
60d535d2 4615 UNKNOWN_LOCATION);
58045f90 4616 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
60d535d2 4617 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
6d8fb6cf 4618 if (dump_enabled_p ())
eefa05c8 4619 {
7bd765d4 4620 dump_printf_loc (MSG_NOTE, vect_location,
4621 "created double reduction phi node: ");
4622 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
78bb46f5 4623 dump_printf (MSG_NOTE, "\n");
eefa05c8 4624 }
4625
4626 vect_phi_res = PHI_RESULT (vect_phi);
4627
4628 /* Replace the use, i.e., set the correct vs1 in the regular
282bf14c 4629 reduction phi node. FORNOW, NCOPIES is always 1, so the
eefa05c8 4630 loop is redundant. */
4631 use = reduction_phi;
4632 for (j = 0; j < ncopies; j++)
4633 {
4634 edge pr_edge = loop_preheader_edge (loop);
4635 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4636 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4637 }
7aa0d350 4638 }
4639 }
b219ece3 4640 }
4641
f1f41a6c 4642 phis.release ();
b219ece3 4643 if (nested_in_vect_loop)
4644 {
4645 if (double_reduc)
4646 loop = outer_loop;
4647 else
4648 continue;
4649 }
4650
f1f41a6c 4651 phis.create (3);
b219ece3 4652 /* Find the loop-closed-use at the loop exit of the original scalar
282bf14c 4653 result. (The reduction result is expected to have two immediate uses,
4654 one at the latch block, and one at the loop exit). For double
b219ece3 4655 reductions we are looking for exit phis of the outer loop. */
4656 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4657 {
4658 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
f898e094 4659 {
4660 if (!is_gimple_debug (USE_STMT (use_p)))
4661 phis.safe_push (USE_STMT (use_p));
4662 }
b219ece3 4663 else
4664 {
4665 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4666 {
4667 tree phi_res = PHI_RESULT (USE_STMT (use_p));
4668
4669 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4670 {
4671 if (!flow_bb_inside_loop_p (loop,
f898e094 4672 gimple_bb (USE_STMT (phi_use_p)))
4673 && !is_gimple_debug (USE_STMT (phi_use_p)))
f1f41a6c 4674 phis.safe_push (USE_STMT (phi_use_p));
b219ece3 4675 }
4676 }
4677 }
4678 }
fb85abff 4679
f1f41a6c 4680 FOR_EACH_VEC_ELT (phis, i, exit_phi)
b219ece3 4681 {
eefa05c8 4682 /* Replace the uses: */
4683 orig_name = PHI_RESULT (exit_phi);
f1f41a6c 4684 scalar_result = scalar_results[k];
eefa05c8 4685 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4686 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4687 SET_USE (use_p, scalar_result);
4688 }
4689
f1f41a6c 4690 phis.release ();
fb85abff 4691 }
6ae8a044 4692}
fb85abff 4693
4694
4695/* Function vectorizable_reduction.
4696
4697 Check if STMT performs a reduction operation that can be vectorized.
4698 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
ade2ac53 4699 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
fb85abff 4700 Return FALSE if not a vectorizable STMT, TRUE otherwise.
4701
48e1416a 4702 This function also handles reduction idioms (patterns) that have been
282bf14c 4703 recognized in advance during vect_pattern_recog. In this case, STMT may be
fb85abff 4704 of this form:
4705 X = pattern_expr (arg0, arg1, ..., X)
 4706 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4707 sequence that had been detected and replaced by the pattern-stmt (STMT).
48e1416a 4708
fb85abff 4709 In some cases of reduction patterns, the type of the reduction variable X is
4710 different than the type of the other arguments of STMT.
4711 In such cases, the vectype that is used when transforming STMT into a vector
4712 stmt is different than the vectype that is used to determine the
48e1416a 4713 vectorization factor, because it consists of a different number of elements
fb85abff 4714 than the actual number of elements that are being operated upon in parallel.
4715
4716 For example, consider an accumulation of shorts into an int accumulator.
4717 On some targets it's possible to vectorize this pattern operating on 8
4718 shorts at a time (hence, the vectype for purposes of determining the
4719 vectorization factor should be V8HI); on the other hand, the vectype that
4720 is used to create the vector form is actually V4SI (the type of the result).
4721
4722 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
 4723 indicates the actual level of parallelism (V8HI in the example), so
282bf14c 4724 that the right vectorization factor is derived.  This vectype
fb85abff 4725 corresponds to the type of arguments to the reduction stmt, and should *NOT*
282bf14c 4726 be used to create the vectorized stmt. The right vectype for the vectorized
fb85abff 4727 stmt is obtained from the type of the result X:
4728 get_vectype_for_scalar_type (TREE_TYPE (X))
4729
4730 This means that, contrary to "regular" reductions (or "regular" stmts in
4731 general), the following equation:
4732 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4733 does *NOT* necessarily hold for reduction patterns. */
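
/* For illustration, a scalar source loop that leads to such a widen-sum
   reduction pattern could be (a sketch; the pattern itself is recognized
   earlier, in tree-vect-patterns.c):

     short a[N];
     int sum = 0;
     for (i = 0; i < N; i++)
       sum += a[i];

   The loop operates on shorts (giving vectype_in, e.g. V8HI) while the
   accumulated result SUM is an int (giving vectype_out, e.g. V4SI).  */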
4734
4735bool
4736vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
eefa05c8 4737 gimple *vec_stmt, slp_tree slp_node)
fb85abff 4738{
4739 tree vec_dest;
4740 tree scalar_dest;
4741 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4742 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
b334cbba 4743 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4744 tree vectype_in = NULL_TREE;
fb85abff 4745 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4746 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8458f4ca 4747 enum tree_code code, orig_code, epilog_reduc_code;
3754d046 4748 machine_mode vec_mode;
fb85abff 4749 int op_type;
4750 optab optab, reduc_optab;
4751 tree new_temp = NULL_TREE;
4752 tree def;
4753 gimple def_stmt;
4754 enum vect_def_type dt;
4755 gimple new_phi = NULL;
4756 tree scalar_type;
4757 bool is_simple_use;
4758 gimple orig_stmt;
4759 stmt_vec_info orig_stmt_info;
4760 tree expr = NULL_TREE;
4761 int i;
b334cbba 4762 int ncopies;
fb85abff 4763 int epilog_copies;
4764 stmt_vec_info prev_stmt_info, prev_phi_info;
fb85abff 4765 bool single_defuse_cycle = false;
0df23b96 4766 tree reduc_def = NULL_TREE;
fb85abff 4767 gimple new_stmt = NULL;
4768 int j;
4769 tree ops[3];
ade2ac53 4770 bool nested_cycle = false, found_nested_cycle_def = false;
4771 gimple reduc_def_stmt = NULL;
 4772 /* The default is that the reduction variable is the last in the statement. */
4773 int reduc_index = 2;
7aa0d350 4774 bool double_reduc = false, dummy;
4775 basic_block def_bb;
c0a0357c 4776 struct loop * def_stmt_loop, *outer_loop = NULL;
7aa0d350 4777 tree def_arg;
c0a0357c 4778 gimple def_arg_stmt;
c2078b80 4779 auto_vec<tree> vec_oprnds0;
4780 auto_vec<tree> vec_oprnds1;
4781 auto_vec<tree> vect_defs;
4782 auto_vec<gimple> phis;
eefa05c8 4783 int vec_num;
d42d0fe0 4784 tree def0, def1, tem, op0, op1 = NULL_TREE;
fb85abff 4785
39a5d6b1 4786 /* In case of a reduction chain we switch to the first stmt in the chain, but
 4787 we don't update STMT_INFO, since only the last stmt is marked as a reduction
 4788 and has reduction properties. */
4789 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4790 stmt = GROUP_FIRST_ELEMENT (stmt_info);
4791
fb85abff 4792 if (nested_in_vect_loop_p (loop, stmt))
ade2ac53 4793 {
c0a0357c 4794 outer_loop = loop;
ade2ac53 4795 loop = loop->inner;
4796 nested_cycle = true;
4797 }
fb85abff 4798
fb85abff 4799 /* 1. Is vectorizable reduction? */
39a5d6b1 4800 /* Not supportable if the reduction variable is used in the loop, unless
4801 it's a reduction chain. */
4802 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4803 && !GROUP_FIRST_ELEMENT (stmt_info))
fb85abff 4804 return false;
4805
 4806 /* Reductions that are not used even in an enclosing outer-loop
 4807 are expected to be "live" (used out of the loop). */
f083cd24 4808 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
fb85abff 4809 && !STMT_VINFO_LIVE_P (stmt_info))
4810 return false;
4811
4812 /* Make sure it was already recognized as a reduction computation. */
ade2ac53 4813 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4814 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
fb85abff 4815 return false;
4816
48e1416a 4817 /* 2. Has this been recognized as a reduction pattern?
fb85abff 4818
4819 Check if STMT represents a pattern that has been recognized
4820 in earlier analysis stages. For stmts that represent a pattern,
4821 the STMT_VINFO_RELATED_STMT field records the last stmt in
4822 the original sequence that constitutes the pattern. */
4823
4824 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4825 if (orig_stmt)
4826 {
4827 orig_stmt_info = vinfo_for_stmt (orig_stmt);
fb85abff 4828 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4829 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4830 }
48e1416a 4831
282bf14c 4832 /* 3. Check the operands of the operation. The first operands are defined
fb85abff 4833 inside the loop body. The last operand is the reduction variable,
4834 which is defined by the loop-header-phi. */
4835
4836 gcc_assert (is_gimple_assign (stmt));
4837
09e31a48 4838 /* Flatten RHS. */
fb85abff 4839 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4840 {
4841 case GIMPLE_SINGLE_RHS:
4842 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4843 if (op_type == ternary_op)
4844 {
4845 tree rhs = gimple_assign_rhs1 (stmt);
4846 ops[0] = TREE_OPERAND (rhs, 0);
4847 ops[1] = TREE_OPERAND (rhs, 1);
4848 ops[2] = TREE_OPERAND (rhs, 2);
4849 code = TREE_CODE (rhs);
4850 }
4851 else
4852 return false;
4853 break;
4854
4855 case GIMPLE_BINARY_RHS:
4856 code = gimple_assign_rhs_code (stmt);
4857 op_type = TREE_CODE_LENGTH (code);
4858 gcc_assert (op_type == binary_op);
4859 ops[0] = gimple_assign_rhs1 (stmt);
4860 ops[1] = gimple_assign_rhs2 (stmt);
4861 break;
4862
c86930b0 4863 case GIMPLE_TERNARY_RHS:
4864 code = gimple_assign_rhs_code (stmt);
4865 op_type = TREE_CODE_LENGTH (code);
4866 gcc_assert (op_type == ternary_op);
4867 ops[0] = gimple_assign_rhs1 (stmt);
4868 ops[1] = gimple_assign_rhs2 (stmt);
4869 ops[2] = gimple_assign_rhs3 (stmt);
4870 break;
4871
fb85abff 4872 case GIMPLE_UNARY_RHS:
4873 return false;
4874
4875 default:
4876 gcc_unreachable ();
4877 }
4878
f2104a54 4879 if (code == COND_EXPR && slp_node)
4880 return false;
4881
fb85abff 4882 scalar_dest = gimple_assign_lhs (stmt);
4883 scalar_type = TREE_TYPE (scalar_dest);
48e1416a 4884 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
fb85abff 4885 && !SCALAR_FLOAT_TYPE_P (scalar_type))
4886 return false;
4887
6960a794 4888 /* Do not try to vectorize bit-precision reductions. */
4889 if ((TYPE_PRECISION (scalar_type)
4890 != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4891 return false;
4892
fb85abff 4893 /* All uses but the last are expected to be defined in the loop.
282bf14c 4894 The last use is the reduction variable.  In case of a nested cycle this
ade2ac53 4895 assumption is not true: we use reduc_index to record the index of the
4896 reduction variable. */
a82fc9c6 4897 for (i = 0; i < op_type - 1; i++)
fb85abff 4898 {
0df23b96 4899 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
4900 if (i == 0 && code == COND_EXPR)
4901 continue;
4902
bed8b93b 4903 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
b334cbba 4904 &def_stmt, &def, &dt, &tem);
4905 if (!vectype_in)
4906 vectype_in = tem;
fb85abff 4907 gcc_assert (is_simple_use);
39a5d6b1 4908
f083cd24 4909 if (dt != vect_internal_def
4910 && dt != vect_external_def
fb85abff 4911 && dt != vect_constant_def
ade2ac53 4912 && dt != vect_induction_def
0df23b96 4913 && !(dt == vect_nested_cycle && nested_cycle))
fb85abff 4914 return false;
ade2ac53 4915
4916 if (dt == vect_nested_cycle)
4917 {
4918 found_nested_cycle_def = true;
4919 reduc_def_stmt = def_stmt;
4920 reduc_index = i;
4921 }
fb85abff 4922 }
4923
bed8b93b 4924 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4925 &def_stmt, &def, &dt, &tem);
fae41702 4926 if (!vectype_in)
4927 vectype_in = tem;
fb85abff 4928 gcc_assert (is_simple_use);
a82fc9c6 4929 if (!(dt == vect_reduction_def
4930 || dt == vect_nested_cycle
4931 || ((dt == vect_internal_def || dt == vect_external_def
4932 || dt == vect_constant_def || dt == vect_induction_def)
4933 && nested_cycle && found_nested_cycle_def)))
4934 {
4935 /* For pattern recognized stmts, orig_stmt might be a reduction,
4936 but some helper statements for the pattern might not, or
4937 might be COND_EXPRs with reduction uses in the condition. */
4938 gcc_assert (orig_stmt);
4939 return false;
4940 }
ade2ac53 4941 if (!found_nested_cycle_def)
4942 reduc_def_stmt = def_stmt;
4943
4944 gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
48e1416a 4945 if (orig_stmt)
4946 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4947 reduc_def_stmt,
4948 !nested_cycle,
7aa0d350 4949 &dummy));
fb85abff 4950 else
39a5d6b1 4951 {
4952 gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4953 !nested_cycle, &dummy);
4954 /* We changed STMT to be the first stmt in reduction chain, hence we
4955 check that in this case the first element in the chain is STMT. */
4956 gcc_assert (stmt == tmp
4957 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
4958 }
48e1416a 4959
ade2ac53 4960 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
fb85abff 4961 return false;
4962
bc937a44 4963 if (slp_node || PURE_SLP_STMT (stmt_info))
eefa05c8 4964 ncopies = 1;
4965 else
4966 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4967 / TYPE_VECTOR_SUBPARTS (vectype_in));
b334cbba 4968
b334cbba 4969 gcc_assert (ncopies >= 1);
4970
4971 vec_mode = TYPE_MODE (vectype_in);
fb85abff 4972
0df23b96 4973 if (code == COND_EXPR)
fb85abff 4974 {
f2104a54 4975 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
0df23b96 4976 {
6d8fb6cf 4977 if (dump_enabled_p ())
7bd765d4 4978 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 4979 "unsupported condition in reduction\n");
0df23b96 4980
4981 return false;
4982 }
fb85abff 4983 }
0df23b96 4984 else
fb85abff 4985 {
0df23b96 4986 /* 4. Supportable by target? */
fb85abff 4987
2d788f29 4988 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
4989 || code == LROTATE_EXPR || code == RROTATE_EXPR)
4990 {
4991 /* Shifts and rotates are only supported by vectorizable_shifts,
4992 not vectorizable_reduction. */
4993 if (dump_enabled_p ())
4994 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 4995 "unsupported shift or rotation.\n");
2d788f29 4996 return false;
4997 }
4998
0df23b96 4999 /* 4.1. check support for the operation in the loop */
b334cbba 5000 optab = optab_for_tree_code (code, vectype_in, optab_default);
0df23b96 5001 if (!optab)
5002 {
6d8fb6cf 5003 if (dump_enabled_p ())
7bd765d4 5004 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 5005 "no optab.\n");
0df23b96 5006
5007 return false;
5008 }
5009
d6bf3b14 5010 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
0df23b96 5011 {
6d8fb6cf 5012 if (dump_enabled_p ())
78bb46f5 5013 dump_printf (MSG_NOTE, "op not supported by target.\n");
0df23b96 5014
5015 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
5016 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5017 < vect_min_worthwhile_factor (code))
5018 return false;
5019
6d8fb6cf 5020 if (dump_enabled_p ())
78bb46f5 5021 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
0df23b96 5022 }
5023
5024 /* Worthwhile without SIMD support? */
b334cbba 5025 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
0df23b96 5026 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5027 < vect_min_worthwhile_factor (code))
5028 {
6d8fb6cf 5029 if (dump_enabled_p ())
7bd765d4 5030 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 5031 "not worthwhile without SIMD support.\n");
0df23b96 5032
5033 return false;
5034 }
fb85abff 5035 }
5036
5037 /* 4.2. Check support for the epilog operation.
5038
5039 If STMT represents a reduction pattern, then the type of the
5040 reduction variable may be different than the type of the rest
5041 of the arguments. For example, consider the case of accumulation
 5042 of shorts into an int accumulator.  The original code:
5043 S1: int_a = (int) short_a;
5044 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
5045
5046 was replaced with:
5047 STMT: int_acc = widen_sum <short_a, int_acc>
5048
5049 This means that:
48e1416a 5050 1. The tree-code that is used to create the vector operation in the
5051 epilog code (that reduces the partial results) is not the
5052 tree-code of STMT, but is rather the tree-code of the original
282bf14c 5053 stmt from the pattern that STMT is replacing. I.e, in the example
48e1416a 5054 above we want to use 'widen_sum' in the loop, but 'plus' in the
fb85abff 5055 epilog.
5056 2. The type (mode) we use to check available target support
48e1416a 5057 for the vector operation to be created in the *epilog*, is
5058 determined by the type of the reduction variable (in the example
d6bf3b14 5059 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
fb85abff 5060 However the type (mode) we use to check available target support
5061 for the vector operation to be created *inside the loop*, is
5062 determined by the type of the other arguments to STMT (in the
d6bf3b14 5063 example we'd check this: optab_handler (widen_sum_optab,
5064 vect_short_mode)).
48e1416a 5065
5066 This is contrary to "regular" reductions, in which the types of all
5067 the arguments are the same as the type of the reduction variable.
5068 For "regular" reductions we can therefore use the same vector type
fb85abff 5069 (and also the same tree-code) when generating the epilog code and
5070 when generating the code inside the loop. */
5071
5072 if (orig_stmt)
5073 {
5074 /* This is a reduction pattern: get the vectype from the type of the
5075 reduction variable, and get the tree-code from orig_stmt. */
5076 orig_code = gimple_assign_rhs_code (orig_stmt);
b334cbba 5077 gcc_assert (vectype_out);
5078 vec_mode = TYPE_MODE (vectype_out);
fb85abff 5079 }
5080 else
5081 {
 5082 /* Regular reduction: the same vectype and tree-code as used for
 5083 the vector code inside the loop can be used for the epilog code. */
5084 orig_code = code;
5085 }
5086
c0a0357c 5087 if (nested_cycle)
5088 {
5089 def_bb = gimple_bb (reduc_def_stmt);
5090 def_stmt_loop = def_bb->loop_father;
5091 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5092 loop_preheader_edge (def_stmt_loop));
5093 if (TREE_CODE (def_arg) == SSA_NAME
5094 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5095 && gimple_code (def_arg_stmt) == GIMPLE_PHI
5096 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5097 && vinfo_for_stmt (def_arg_stmt)
5098 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5099 == vect_double_reduction_def)
5100 double_reduc = true;
5101 }
7aa0d350 5102
0df23b96 5103 epilog_reduc_code = ERROR_MARK;
5104 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5105 {
b334cbba 5106 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
0df23b96 5107 optab_default);
5108 if (!reduc_optab)
5109 {
6d8fb6cf 5110 if (dump_enabled_p ())
7bd765d4 5111 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 5112 "no optab for reduction.\n");
0df23b96 5113
5114 epilog_reduc_code = ERROR_MARK;
5115 }
f3d76545 5116 else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
0df23b96 5117 {
f3d76545 5118 optab = scalar_reduc_to_vector (reduc_optab, vectype_out);
5119 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5120 {
5121 if (dump_enabled_p ())
5122 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5123 "reduc op not supported by target.\n");
48e1416a 5124
f3d76545 5125 epilog_reduc_code = ERROR_MARK;
5126 }
0df23b96 5127 }
5128 }
5129 else
5130 {
5131 if (!nested_cycle || double_reduc)
5132 {
6d8fb6cf 5133 if (dump_enabled_p ())
7bd765d4 5134 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 5135 "no reduc code for scalar code.\n");
0df23b96 5136
5137 return false;
5138 }
5139 }
5140
7aa0d350 5141 if (double_reduc && ncopies > 1)
5142 {
6d8fb6cf 5143 if (dump_enabled_p ())
7bd765d4 5144 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 5145 "multiple types in double reduction\n");
7aa0d350 5146
5147 return false;
5148 }
48e1416a 5149
f0c50415 5150 /* In case of a widening multiplication by a constant, we update the type
5151 of the constant to be the type of the other operand. We check that the
5152 constant fits the type in the pattern recognition pass. */
5153 if (code == DOT_PROD_EXPR
5154 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5155 {
5156 if (TREE_CODE (ops[0]) == INTEGER_CST)
5157 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5158 else if (TREE_CODE (ops[1]) == INTEGER_CST)
5159 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5160 else
5161 {
6d8fb6cf 5162 if (dump_enabled_p ())
7bd765d4 5163 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 5164 "invalid types in dot-prod\n");
f0c50415 5165
5166 return false;
5167 }
5168 }
5169
fb85abff 5170 if (!vec_stmt) /* transformation not required. */
5171 {
fb85abff 5172 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
5173 return false;
2814125e 5174 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
fb85abff 5175 return true;
5176 }
5177
5178 /** Transform. **/
5179
6d8fb6cf 5180 if (dump_enabled_p ())
78bb46f5 5181 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
fb85abff 5182
0df23b96 5183 /* FORNOW: Multiple types are not supported for condition. */
5184 if (code == COND_EXPR)
5185 gcc_assert (ncopies == 1);
5186
fb85abff 5187 /* Create the destination vector */
b334cbba 5188 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
fb85abff 5189
5190 /* In case the vectorization factor (VF) is bigger than the number
5191 of elements that we can fit in a vectype (nunits), we have to generate
 5192 more than one vector stmt - i.e., we need to "unroll" the
 5193 vector stmt by a factor of VF/nunits.  For more details see the documentation
5194 in vectorizable_operation. */
5195
5196 /* If the reduction is used in an outer loop we need to generate
5197 VF intermediate results, like so (e.g. for ncopies=2):
5198 r0 = phi (init, r0)
5199 r1 = phi (init, r1)
5200 r0 = x0 + r0;
5201 r1 = x1 + r1;
5202 (i.e. we generate VF results in 2 registers).
5203 In this case we have a separate def-use cycle for each copy, and therefore
5204 for each copy we get the vector def for the reduction variable from the
5205 respective phi node created for this copy.
5206
5207 Otherwise (the reduction is unused in the loop nest), we can combine
5208 together intermediate results, like so (e.g. for ncopies=2):
5209 r = phi (init, r)
5210 r = x0 + r;
5211 r = x1 + r;
5212 (i.e. we generate VF/2 results in a single register).
5213 In this case for each copy we get the vector def for the reduction variable
5214 from the vectorized reduction operation generated in the previous iteration.
5215 */
5216
f083cd24 5217 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
fb85abff 5218 {
5219 single_defuse_cycle = true;
5220 epilog_copies = 1;
5221 }
5222 else
5223 epilog_copies = ncopies;
5224
5225 prev_stmt_info = NULL;
5226 prev_phi_info = NULL;
eefa05c8 5227 if (slp_node)
5228 {
5229 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5230 gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
5231 == TYPE_VECTOR_SUBPARTS (vectype_in));
5232 }
5233 else
5234 {
5235 vec_num = 1;
f1f41a6c 5236 vec_oprnds0.create (1);
eefa05c8 5237 if (op_type == ternary_op)
f1f41a6c 5238 vec_oprnds1.create (1);
eefa05c8 5239 }
5240
f1f41a6c 5241 phis.create (vec_num);
5242 vect_defs.create (vec_num);
eefa05c8 5243 if (!slp_node)
f1f41a6c 5244 vect_defs.quick_push (NULL_TREE);
eefa05c8 5245
fb85abff 5246 for (j = 0; j < ncopies; j++)
5247 {
5248 if (j == 0 || !single_defuse_cycle)
5249 {
eefa05c8 5250 for (i = 0; i < vec_num; i++)
5251 {
5252 /* Create the reduction-phi that defines the reduction
5253 operand. */
5254 new_phi = create_phi_node (vec_dest, loop->header);
5255 set_vinfo_for_stmt (new_phi,
5256 new_stmt_vec_info (new_phi, loop_vinfo,
5257 NULL));
5258 if (j == 0 || slp_node)
f1f41a6c 5259 phis.quick_push (new_phi);
eefa05c8 5260 }
5261 }
fb85abff 5262
0df23b96 5263 if (code == COND_EXPR)
5264 {
eefa05c8 5265 gcc_assert (!slp_node);
5266 vectorizable_condition (stmt, gsi, vec_stmt,
f1f41a6c 5267 PHI_RESULT (phis[0]),
f2104a54 5268 reduc_index, NULL);
0df23b96 5269 /* Multiple types are not supported for condition. */
5270 break;
5271 }
5272
fb85abff 5273 /* Handle uses. */
5274 if (j == 0)
5275 {
09e31a48 5276 op0 = ops[!reduc_index];
5277 if (op_type == ternary_op)
5278 {
5279 if (reduc_index == 0)
5280 op1 = ops[2];
5281 else
5282 op1 = ops[1];
5283 }
5284
eefa05c8 5285 if (slp_node)
b0f64919 5286 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5287 slp_node, -1);
eefa05c8 5288 else
fb85abff 5289 {
eefa05c8 5290 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5291 stmt, NULL);
f1f41a6c 5292 vec_oprnds0.quick_push (loop_vec_def0);
eefa05c8 5293 if (op_type == ternary_op)
5294 {
09e31a48 5295 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5296 NULL);
f1f41a6c 5297 vec_oprnds1.quick_push (loop_vec_def1);
eefa05c8 5298 }
fb85abff 5299 }
fb85abff 5300 }
5301 else
5302 {
eefa05c8 5303 if (!slp_node)
5304 {
d42d0fe0 5305 enum vect_def_type dt;
5306 gimple dummy_stmt;
5307 tree dummy;
5308
bed8b93b 5309 vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
d42d0fe0 5310 &dummy_stmt, &dummy, &dt);
5311 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5312 loop_vec_def0);
f1f41a6c 5313 vec_oprnds0[0] = loop_vec_def0;
eefa05c8 5314 if (op_type == ternary_op)
5315 {
bed8b93b 5316 vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
d42d0fe0 5317 &dummy, &dt);
eefa05c8 5318 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5319 loop_vec_def1);
f1f41a6c 5320 vec_oprnds1[0] = loop_vec_def1;
eefa05c8 5321 }
5322 }
fb85abff 5323
eefa05c8 5324 if (single_defuse_cycle)
5325 reduc_def = gimple_assign_lhs (new_stmt);
fb85abff 5326
eefa05c8 5327 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
fb85abff 5328 }
5329
f1f41a6c 5330 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
ade2ac53 5331 {
eefa05c8 5332 if (slp_node)
f1f41a6c 5333 reduc_def = PHI_RESULT (phis[i]);
ade2ac53 5334 else
eefa05c8 5335 {
5336 if (!single_defuse_cycle || j == 0)
5337 reduc_def = PHI_RESULT (new_phi);
5338 }
5339
5340 def1 = ((op_type == ternary_op)
f1f41a6c 5341 ? vec_oprnds1[i] : NULL);
eefa05c8 5342 if (op_type == binary_op)
5343 {
5344 if (reduc_index == 0)
5345 expr = build2 (code, vectype_out, reduc_def, def0);
5346 else
5347 expr = build2 (code, vectype_out, def0, reduc_def);
5348 }
48e1416a 5349 else
ade2ac53 5350 {
eefa05c8 5351 if (reduc_index == 0)
5352 expr = build3 (code, vectype_out, reduc_def, def0, def1);
ade2ac53 5353 else
eefa05c8 5354 {
5355 if (reduc_index == 1)
5356 expr = build3 (code, vectype_out, def0, reduc_def, def1);
5357 else
5358 expr = build3 (code, vectype_out, def0, def1, reduc_def);
5359 }
5360 }
5361
5362 new_stmt = gimple_build_assign (vec_dest, expr);
5363 new_temp = make_ssa_name (vec_dest, new_stmt);
5364 gimple_assign_set_lhs (new_stmt, new_temp);
5365 vect_finish_stmt_generation (stmt, new_stmt, gsi);
39a5d6b1 5366
eefa05c8 5367 if (slp_node)
5368 {
f1f41a6c 5369 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5370 vect_defs.quick_push (new_temp);
ade2ac53 5371 }
eefa05c8 5372 else
f1f41a6c 5373 vect_defs[0] = new_temp;
ade2ac53 5374 }
5375
eefa05c8 5376 if (slp_node)
5377 continue;
48e1416a 5378
fb85abff 5379 if (j == 0)
5380 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5381 else
5382 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
0df23b96 5383
fb85abff 5384 prev_stmt_info = vinfo_for_stmt (new_stmt);
5385 prev_phi_info = vinfo_for_stmt (new_phi);
5386 }
5387
5388 /* Finalize the reduction-phi (set its arguments) and create the
5389 epilog reduction code. */
eefa05c8 5390 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5391 {
5392 new_temp = gimple_assign_lhs (*vec_stmt);
f1f41a6c 5393 vect_defs[0] = new_temp;
eefa05c8 5394 }
5395
5396 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5397 epilog_reduc_code, phis, reduc_index,
5398 double_reduc, slp_node);
5399
fb85abff 5400 return true;
5401}
5402
5403/* Function vect_min_worthwhile_factor.
5404
5405 For a loop where we could vectorize the operation indicated by CODE,
5406 return the minimum vectorization factor that makes it worthwhile
5407 to use generic vectors. */
5408int
5409vect_min_worthwhile_factor (enum tree_code code)
5410{
5411 switch (code)
5412 {
5413 case PLUS_EXPR:
5414 case MINUS_EXPR:
5415 case NEGATE_EXPR:
5416 return 4;
5417
5418 case BIT_AND_EXPR:
5419 case BIT_IOR_EXPR:
5420 case BIT_XOR_EXPR:
5421 case BIT_NOT_EXPR:
5422 return 2;
5423
5424 default:
5425 return INT_MAX;
5426 }
5427}
5428
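/* E.g. by the table above: emulating a PLUS_EXPR in word mode only pays
   off for a vectorization factor of at least 4, the cheap bitwise
   operations already at 2, and everything else never (INT_MAX); compare
   the checks against vect_min_worthwhile_factor in vectorizable_reduction
   above.  */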
5429
5430/* Function vectorizable_induction
5431
5432 Check if PHI performs an induction computation that can be vectorized.
5433 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5434 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5435 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
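
/* For example (a sketch): for the induction variable I in

     for (i = 0; i < n; i++)
       a[i] = i;

   the loop-header phi of I is vectorized (with V4SI, ncopies == 1) into
   a vector phi whose preheader value is {0, 1, 2, 3} and whose latch
   value is that phi plus the step vector {4, 4, 4, 4}; the vector defs
   are built by get_initial_def_for_induction.  */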
5436
5437bool
5438vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5439 gimple *vec_stmt)
5440{
5441 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5442 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5443 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5444 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5445 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5446 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5447 tree vec_def;
5448
5449 gcc_assert (ncopies >= 1);
02a2bdca 5450 /* FORNOW. These restrictions should be relaxed. */
5451 if (nested_in_vect_loop_p (loop, phi))
fb85abff 5452 {
02a2bdca 5453 imm_use_iterator imm_iter;
5454 use_operand_p use_p;
5455 gimple exit_phi;
5456 edge latch_e;
5457 tree loop_arg;
5458
5459 if (ncopies > 1)
5460 {
6d8fb6cf 5461 if (dump_enabled_p ())
7bd765d4 5462 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 5463 "multiple types in nested loop.\n");
02a2bdca 5464 return false;
5465 }
5466
5467 exit_phi = NULL;
5468 latch_e = loop_latch_edge (loop->inner);
5469 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5470 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5471 {
0b308eee 5472 gimple use_stmt = USE_STMT (use_p);
5473 if (is_gimple_debug (use_stmt))
5474 continue;
5475
5476 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
02a2bdca 5477 {
0b308eee 5478 exit_phi = use_stmt;
02a2bdca 5479 break;
5480 }
5481 }
5482 if (exit_phi)
5483 {
5484 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5485 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5486 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5487 {
6d8fb6cf 5488 if (dump_enabled_p ())
78bb46f5 5489 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7bd765d4 5490 "inner-loop induction only used outside "
78bb46f5 5491 "of the outer vectorized loop.\n");
02a2bdca 5492 return false;
5493 }
5494 }
fb85abff 5495 }
5496
5497 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5498 return false;
5499
5500 /* FORNOW: SLP not supported. */
5501 if (STMT_SLP_TYPE (stmt_info))
5502 return false;
5503
5504 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5505
5506 if (gimple_code (phi) != GIMPLE_PHI)
5507 return false;
5508
5509 if (!vec_stmt) /* transformation not required. */
5510 {
5511 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
6d8fb6cf 5512 if (dump_enabled_p ())
7bd765d4 5513 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 5514 "=== vectorizable_induction ===\n");
fb85abff 5515 vect_model_induction_cost (stmt_info, ncopies);
5516 return true;
5517 }
5518
5519 /** Transform. **/
5520
6d8fb6cf 5521 if (dump_enabled_p ())
78bb46f5 5522 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
fb85abff 5523
5524 vec_def = get_initial_def_for_induction (phi);
5525 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5526 return true;
5527}
5528
5529/* Function vectorizable_live_operation.
5530
282bf14c 5531 STMT computes a value that is used outside the loop. Check if
fb85abff 5532 it can be supported. */
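
/* E.g. (a sketch):

     for (i = 0; i < n; i++)
       {
         a[i] = b[i];
         t = x + y;        (x and y loop-invariant)
       }
     ... = t;              (used after the loop)

   T is "live": FORNOW such stmts are supported only when all operands
   of the defining stmt are invariant, so the scalar stmt can remain in
   place and the last value it computes be used directly.  */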
5533
5534bool
5535vectorizable_live_operation (gimple stmt,
5536 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
3d483a94 5537 gimple *vec_stmt)
fb85abff 5538{
5539 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5540 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5541 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5542 int i;
5543 int op_type;
5544 tree op;
5545 tree def;
5546 gimple def_stmt;
48e1416a 5547 enum vect_def_type dt;
fb85abff 5548 enum tree_code code;
5549 enum gimple_rhs_class rhs_class;
5550
5551 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5552
5553 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5554 return false;
5555
5556 if (!is_gimple_assign (stmt))
3d483a94 5557 {
5558 if (gimple_call_internal_p (stmt)
5559 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
5560 && gimple_call_lhs (stmt)
5561 && loop->simduid
5562 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
5563 && loop->simduid
5564 == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
5565 {
5566 edge e = single_exit (loop);
5567 basic_block merge_bb = e->dest;
5568 imm_use_iterator imm_iter;
5569 use_operand_p use_p;
5570 tree lhs = gimple_call_lhs (stmt);
5571
5572 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
5573 {
5574 gimple use_stmt = USE_STMT (use_p);
5575 if (gimple_code (use_stmt) == GIMPLE_PHI
0b308eee 5576 && gimple_bb (use_stmt) == merge_bb)
3d483a94 5577 {
5578 if (vec_stmt)
5579 {
5580 tree vfm1
5581 = build_int_cst (unsigned_type_node,
5582 loop_vinfo->vectorization_factor - 1);
5583 SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
5584 }
5585 return true;
5586 }
5587 }
5588 }
5589
5590 return false;
5591 }
fb85abff 5592
5593 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5594 return false;
5595
5596 /* FORNOW. CHECKME. */
5597 if (nested_in_vect_loop_p (loop, stmt))
5598 return false;
5599
5600 code = gimple_assign_rhs_code (stmt);
5601 op_type = TREE_CODE_LENGTH (code);
5602 rhs_class = get_gimple_rhs_class (code);
5603 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5604 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5605
282bf14c 5606 /* FORNOW: support only if all uses are invariant. This means
fb85abff 5607 that the scalar operations can remain in place, unvectorized.
5608 The original last scalar value that they compute will be used. */
5609
5610 for (i = 0; i < op_type; i++)
5611 {
5612 if (rhs_class == GIMPLE_SINGLE_RHS)
5613 op = TREE_OPERAND (gimple_op (stmt, 1), i);
5614 else
5615 op = gimple_op (stmt, i + 1);
37545e54 5616 if (op
bed8b93b 5617 && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5618 &dt))
fb85abff 5619 {
6d8fb6cf 5620 if (dump_enabled_p ())
7bd765d4 5621 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
78bb46f5 5622 "use not simple.\n");
fb85abff 5623 return false;
5624 }
5625
f083cd24 5626 if (dt != vect_external_def && dt != vect_constant_def)
fb85abff 5627 return false;
5628 }
5629
5630 /* No transformation is required for the cases we currently support. */
5631 return true;
5632}
5633
4c48884e 5634/* Kill any debug uses outside LOOP of SSA names defined in STMT. */
5635
5636static void
5637vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5638{
5639 ssa_op_iter op_iter;
5640 imm_use_iterator imm_iter;
5641 def_operand_p def_p;
5642 gimple ustmt;
5643
5644 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5645 {
5646 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5647 {
5648 basic_block bb;
5649
5650 if (!is_gimple_debug (ustmt))
5651 continue;
5652
5653 bb = gimple_bb (ustmt);
5654
5655 if (!flow_bb_inside_loop_p (loop, bb))
5656 {
5657 if (gimple_debug_bind_p (ustmt))
5658 {
6d8fb6cf 5659 if (dump_enabled_p ())
7bd765d4 5660 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 5661 "killing debug use\n");
4c48884e 5662
5663 gimple_debug_bind_reset_value (ustmt);
5664 update_stmt (ustmt);
5665 }
5666 else
5667 gcc_unreachable ();
5668 }
5669 }
5670 }
5671}
5672
782fd1d1 5673
5674/* This function builds ni_name = number of iterations. Statements
313a5120 5675 are emitted on the loop preheader edge. */
782fd1d1 5676
5677static tree
313a5120 5678vect_build_loop_niters (loop_vec_info loop_vinfo)
782fd1d1 5679{
782fd1d1 5680 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
313a5120 5681 if (TREE_CODE (ni) == INTEGER_CST)
5682 return ni;
5683 else
5684 {
5685 tree ni_name, var;
5686 gimple_seq stmts = NULL;
5687 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
782fd1d1 5688
313a5120 5689 var = create_tmp_var (TREE_TYPE (ni), "niters");
5690 ni_name = force_gimple_operand (ni, &stmts, false, var);
5691 if (stmts)
5692 gsi_insert_seq_on_edge_immediate (pe, stmts);
782fd1d1 5693
313a5120 5694 return ni_name;
5695 }
782fd1d1 5696}
5697
5698
5699/* This function generates the following statements:
5700
 313a5120 5701 ni_name = number of iterations the loop executes
5702 ratio = ni_name / vf
5703 ratio_mult_vf_name = ratio * vf
782fd1d1 5704
313a5120 5705 and places them on the loop preheader edge. */
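
/* E.g. (a sketch) for VF == 4 and no peeling for gaps, the code emitted
   on the preheader edge for a loop executing N times is:

     bnd = ((N - 4) >> 2) + 1;        ratio (always at least 1)
     ratio_mult_vf = bnd << 2;        iterations of the vectorized loop

   See the overflow note below for why ratio is not simply N >> 2.  */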
782fd1d1 5706
5707static void
5708vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
5709 tree ni_name,
5710 tree *ratio_mult_vf_name_ptr,
313a5120 5711 tree *ratio_name_ptr)
782fd1d1 5712{
782fd1d1 5713 tree ni_minus_gap_name;
5714 tree var;
5715 tree ratio_name;
5716 tree ratio_mult_vf_name;
782fd1d1 5717 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
313a5120 5718 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
782fd1d1 5719 tree log_vf;
5720
796f6cba 5721 log_vf = build_int_cst (TREE_TYPE (ni_name), exact_log2 (vf));
782fd1d1 5722
 5723 /* If an epilogue loop is required because of data accesses with gaps, we
5724 subtract one iteration from the total number of iterations here for
5725 correct calculation of RATIO. */
5726 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5727 {
5728 ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5729 ni_name,
5730 build_one_cst (TREE_TYPE (ni_name)));
5731 if (!is_gimple_val (ni_minus_gap_name))
5732 {
796f6cba 5733 var = create_tmp_var (TREE_TYPE (ni_name), "ni_gap");
313a5120 5734 gimple stmts = NULL;
782fd1d1 5735 ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
5736 true, var);
313a5120 5737 gsi_insert_seq_on_edge_immediate (pe, stmts);
782fd1d1 5738 }
5739 }
5740 else
5741 ni_minus_gap_name = ni_name;
5742
5743 /* Create: ratio = ni >> log2(vf) */
796f6cba 5744 /* ??? As we have ni == number of latch executions + 1, ni could
 5745 have overflowed to zero.  So avoid computing ratio based on ni,
 5746 but compute it using the fact that we know ratio will be at least
 5747 one, thus via (ni - vf) >> log2(vf) + 1.  E.g. with vf == 4 and
 ni wrapped to 0 in a 32-bit type (i.e. 2^32 iterations),
 ((0 - 4) >> 2) + 1 == 0x40000000 == 2^32 / 4, the correct ratio.  */
5748 ratio_name
5749 = fold_build2 (PLUS_EXPR, TREE_TYPE (ni_name),
5750 fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name),
5751 fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5752 ni_minus_gap_name,
5753 build_int_cst
5754 (TREE_TYPE (ni_name), vf)),
5755 log_vf),
5756 build_int_cst (TREE_TYPE (ni_name), 1));
782fd1d1 5757 if (!is_gimple_val (ratio_name))
5758 {
796f6cba 5759 var = create_tmp_var (TREE_TYPE (ni_name), "bnd");
313a5120 5760 gimple stmts = NULL;
782fd1d1 5761 ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
313a5120 5762 gsi_insert_seq_on_edge_immediate (pe, stmts);
782fd1d1 5763 }
5764 *ratio_name_ptr = ratio_name;
5765
5766 /* Create: ratio_mult_vf = ratio << log2 (vf). */
5767
5768 if (ratio_mult_vf_name_ptr)
5769 {
5770 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
5771 ratio_name, log_vf);
5772 if (!is_gimple_val (ratio_mult_vf_name))
5773 {
796f6cba 5774 var = create_tmp_var (TREE_TYPE (ni_name), "ratio_mult_vf");
313a5120 5775 gimple stmts = NULL;
782fd1d1 5776 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
5777 true, var);
313a5120 5778 gsi_insert_seq_on_edge_immediate (pe, stmts);
782fd1d1 5779 }
5780 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
5781 }
5782
5783 return;
5784}
5785
5786
fb85abff 5787/* Function vect_transform_loop.
5788
5789 The analysis phase has determined that the loop is vectorizable.
 5790 Vectorize the loop - create vectorized stmts to replace the scalar
5791 stmts in the loop, and update the loop exit condition. */
5792
5793void
5794vect_transform_loop (loop_vec_info loop_vinfo)
5795{
5796 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5797 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5798 int nbbs = loop->num_nodes;
5799 gimple_stmt_iterator si;
5800 int i;
5801 tree ratio = NULL;
5802 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
ee612634 5803 bool grouped_store;
fb85abff 5804 bool slp_scheduled = false;
18937389 5805 gimple stmt, pattern_stmt;
5806 gimple_seq pattern_def_seq = NULL;
e3a19533 5807 gimple_stmt_iterator pattern_def_si = gsi_none ();
18937389 5808 bool transform_pattern_stmt = false;
13b31e0b 5809 bool check_profitability = false;
e7430948 5810 int th;
d3f1934c 5811 /* Record number of iterations before we started tampering with the profile. */
5812 gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
fb85abff 5813
6d8fb6cf 5814 if (dump_enabled_p ())
78bb46f5 5815 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
fb85abff 5816
d3f1934c 5817 /* If the profile is imprecise, we have a chance to fix it up. */
5818 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5819 expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5820
e7430948 5821 /* Use the more conservative vectorization threshold.  If the number
 5822 of iterations is constant, assume the cost check has been performed
 5823 by our caller.  If the threshold makes all loops profitable that
 5824 run at least the vectorization factor number of times, checking
 5825 is pointless, too. */
004a94a5 5826 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
e7430948 5827 if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5828 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5829 {
6d8fb6cf 5830 if (dump_enabled_p ())
7bd765d4 5831 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 5832 "Profitability threshold is %d loop iterations.\n",
5833 th);
e7430948 5834 check_profitability = true;
5835 }
5836
2cd0995e 5837 /* Version the loop first, if required, so the profitability check
5838 comes first. */
23a3430d 5839
2cd0995e 5840 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5841 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
e7430948 5842 {
2cd0995e 5843 vect_loop_versioning (loop_vinfo, th, check_profitability);
e7430948 5844 check_profitability = false;
5845 }
23a3430d 5846
313a5120 5847 tree ni_name = vect_build_loop_niters (loop_vinfo);
5848 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
5849
2cd0995e 5850 /* Peel the loop if there are data refs with unknown alignment.
 313a5120 5851 Only one data ref with unknown alignment that is a store is allowed. */
2cd0995e 5852
313a5120 5853 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
e7430948 5854 {
782fd1d1 5855 vect_do_peeling_for_alignment (loop_vinfo, ni_name,
5856 th, check_profitability);
e7430948 5857 check_profitability = false;
313a5120 5858 /* The above adjusts LOOP_VINFO_NITERS, so cause ni_name to
5859 be re-computed. */
5860 ni_name = NULL_TREE;
e7430948 5861 }
fb85abff 5862
fb85abff 5863 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5864 compile time constant), or it is a constant that doesn't divide by the
5865 vectorization factor, then an epilog loop needs to be created.
5866 We therefore duplicate the loop: the original loop will be vectorized,
282bf14c 5867 and will compute the first (n/VF) iterations. The second copy of the loop
fb85abff 5868 will remain scalar and will compute the remaining (n%VF) iterations.
5869 (VF is the vectorization factor). */
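  /* E.g. with n == 103 and VF == 4, the vectorized loop computes the
     first 100 iterations (ratio == 25 vector iterations) and the scalar
     epilog loop the remaining 3.  */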
5870
313a5120 5871 if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
c8a2b4ff 5872 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
782fd1d1 5873 {
313a5120 5874 tree ratio_mult_vf;
5875 if (!ni_name)
5876 ni_name = vect_build_loop_niters (loop_vinfo);
782fd1d1 5877 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
313a5120 5878 &ratio);
782fd1d1 5879 vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
5880 th, check_profitability);
5881 }
c8a2b4ff 5882 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
fb85abff 5883 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5884 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
c8a2b4ff 5885 else
5886 {
313a5120 5887 if (!ni_name)
5888 ni_name = vect_build_loop_niters (loop_vinfo);
5889 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
c8a2b4ff 5890 }
fb85abff 5891
5892 /* 1) Make sure the loop header has exactly two entries
5893 2) Make sure we have a preheader basic block. */
5894
5895 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5896
5897 split_edge (loop_preheader_edge (loop));
5898
 5899 /* FORNOW: the vectorizer supports only loops whose body consists
48e1416a 5900 of one basic block (header + empty latch).  When the vectorizer
 5901 supports more involved loop forms, the order in which the BBs are
fb85abff 5902 traversed will need to be reconsidered. */
5903
5904 for (i = 0; i < nbbs; i++)
5905 {
5906 basic_block bb = bbs[i];
5907 stmt_vec_info stmt_info;
5908 gimple phi;
5909
5910 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
5911 {
5912 phi = gsi_stmt (si);
6d8fb6cf 5913 if (dump_enabled_p ())
fb85abff 5914 {
7bd765d4 5915 dump_printf_loc (MSG_NOTE, vect_location,
5916 "------>vectorizing phi: ");
5917 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
78bb46f5 5918 dump_printf (MSG_NOTE, "\n");
fb85abff 5919 }
5920 stmt_info = vinfo_for_stmt (phi);
5921 if (!stmt_info)
5922 continue;
5923
12e7ff4f 5924 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5925 vect_loop_kill_debug_uses (loop, phi);
5926
fb85abff 5927 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5928 && !STMT_VINFO_LIVE_P (stmt_info))
12e7ff4f 5929 continue;
fb85abff 5930
bb4b5e0f 5931 if (STMT_VINFO_VECTYPE (stmt_info)
5932 && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5933 != (unsigned HOST_WIDE_INT) vectorization_factor)
6d8fb6cf 5934 && dump_enabled_p ())
78bb46f5 5935 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
fb85abff 5936
5937 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
5938 {
6d8fb6cf 5939 if (dump_enabled_p ())
78bb46f5 5940 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
fb85abff 5941 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
5942 }
5943 }
5944
8bf58742 5945 pattern_stmt = NULL;
5946 for (si = gsi_start_bb (bb); !gsi_end_p (si) || transform_pattern_stmt;)
fb85abff 5947 {
fb85abff 5948 bool is_store;
5949
8bf58742 5950 if (transform_pattern_stmt)
18937389 5951 stmt = pattern_stmt;
8bf58742 5952 else
8911f4de 5953 {
5954 stmt = gsi_stmt (si);
5955 /* During vectorization remove existing clobber stmts. */
5956 if (gimple_clobber_p (stmt))
5957 {
5958 unlink_stmt_vdef (stmt);
5959 gsi_remove (&si, true);
5960 release_defs (stmt);
5961 continue;
5962 }
5963 }
8bf58742 5964
6d8fb6cf 5965 if (dump_enabled_p ())
fb85abff 5966 {
7bd765d4 5967 dump_printf_loc (MSG_NOTE, vect_location,
5968 "------>vectorizing statement: ");
5969 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
78bb46f5 5970 dump_printf (MSG_NOTE, "\n");
48e1416a 5971 }
fb85abff 5972
5973 stmt_info = vinfo_for_stmt (stmt);
5974
5975 /* vector stmts created in the outer-loop during vectorization of
5976 stmts in an inner-loop may not have a stmt_info, and do not
5977 need to be vectorized. */
5978 if (!stmt_info)
5979 {
5980 gsi_next (&si);
5981 continue;
5982 }
5983
12e7ff4f 5984 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5985 vect_loop_kill_debug_uses (loop, stmt);
5986
fb85abff 5987 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5988 && !STMT_VINFO_LIVE_P (stmt_info))
cfdcf183 5989 {
5990 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5991 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5992 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5993 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5994 {
5995 stmt = pattern_stmt;
5996 stmt_info = vinfo_for_stmt (stmt);
5997 }
5998 else
5999 {
6000 gsi_next (&si);
6001 continue;
6002 }
fb85abff 6003 }
8bf58742 6004 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6005 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6006 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6007 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6008 transform_pattern_stmt = true;
fb85abff 6009
18937389 6010 /* If the pattern statement has def stmts, vectorize them too. */
6011 if (is_pattern_stmt_p (stmt_info))
6012 {
6013 if (pattern_def_seq == NULL)
6014 {
6015 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
6016 pattern_def_si = gsi_start (pattern_def_seq);
6017 }
6018 else if (!gsi_end_p (pattern_def_si))
6019 gsi_next (&pattern_def_si);
6020 if (pattern_def_seq != NULL)
6021 {
6022 gimple pattern_def_stmt = NULL;
6023 stmt_vec_info pattern_def_stmt_info = NULL;
45eea33f 6024
18937389 6025 while (!gsi_end_p (pattern_def_si))
6026 {
6027 pattern_def_stmt = gsi_stmt (pattern_def_si);
6028 pattern_def_stmt_info
6029 = vinfo_for_stmt (pattern_def_stmt);
6030 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
6031 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
6032 break;
6033 gsi_next (&pattern_def_si);
6034 }
6035
6036 if (!gsi_end_p (pattern_def_si))
6037 {
6d8fb6cf 6038 if (dump_enabled_p ())
18937389 6039 {
7bd765d4 6040 dump_printf_loc (MSG_NOTE, vect_location,
6041 "==> vectorizing pattern def "
6042 "stmt: ");
6043 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
6044 pattern_def_stmt, 0);
78bb46f5 6045 dump_printf (MSG_NOTE, "\n");
18937389 6046 }
6047
6048 stmt = pattern_def_stmt;
6049 stmt_info = pattern_def_stmt_info;
6050 }
6051 else
6052 {
e3a19533 6053 pattern_def_si = gsi_none ();
18937389 6054 transform_pattern_stmt = false;
6055 }
6056 }
6057 else
6058 transform_pattern_stmt = false;
45eea33f 6059 }
6060
d09768a4 6061 if (STMT_VINFO_VECTYPE (stmt_info))
6062 {
6063 unsigned int nunits
6064 = (unsigned int)
6065 TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
6066 if (!STMT_SLP_TYPE (stmt_info)
6067 && nunits != (unsigned int) vectorization_factor
6068 && dump_enabled_p ())
 6069 /* For SLP, VF is set according to the unrolling factor, and not
 6070 to the vector size, hence this print is not valid for SLP. */
6071 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6072 }
fb85abff 6073
6074 /* SLP. Schedule all the SLP instances when the first SLP stmt is
6075 reached. */
6076 if (STMT_SLP_TYPE (stmt_info))
6077 {
6078 if (!slp_scheduled)
6079 {
6080 slp_scheduled = true;
6081
6d8fb6cf 6082 if (dump_enabled_p ())
7bd765d4 6083 dump_printf_loc (MSG_NOTE, vect_location,
78bb46f5 6084 "=== scheduling SLP instances ===\n");
fb85abff 6085
37545e54 6086 vect_schedule_slp (loop_vinfo, NULL);
fb85abff 6087 }
6088
6089 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
1065dd4e 6090 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
fb85abff 6091 {
18937389 6092 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6093 {
6094 pattern_def_seq = NULL;
6095 gsi_next (&si);
6096 }
6097 continue;
fb85abff 6098 }
6099 }
48e1416a 6100
fb85abff 6101 /* -------- vectorize statement ------------ */
6d8fb6cf 6102 if (dump_enabled_p ())
78bb46f5 6103 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
fb85abff 6104
ee612634 6105 grouped_store = false;
6106 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
fb85abff 6107 if (is_store)
6108 {
ee612634 6109 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
fb85abff 6110 {
6111 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6112 interleaving chain was completed - free all the stores in
6113 the chain. */
3b515af5 6114 gsi_next (&si);
21009880 6115 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
fb85abff 6116 }
6117 else
6118 {
6119 /* Free the attached stmt_vec_info and remove the stmt. */
bc8a8451 6120 gimple store = gsi_stmt (si);
6121 free_stmt_vec_info (store);
6122 unlink_stmt_vdef (store);
fb85abff 6123 gsi_remove (&si, true);
bc8a8451 6124 release_defs (store);
fb85abff 6125 }
8bf58742 6126
512cbd67 6127 /* Stores can only appear at the end of pattern statements. */
6128 gcc_assert (!transform_pattern_stmt);
6129 pattern_def_seq = NULL;
6130 }
6131 else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
18937389 6132 {
6133 pattern_def_seq = NULL;
6134 gsi_next (&si);
6135 }
fb85abff 6136 } /* stmts in BB */
6137 } /* BBs in loop */
6138
6139 slpeel_make_loop_iterate_ntimes (loop, ratio);
6140
d3f1934c 6141 /* Reduce loop iterations by the vectorization factor. */
f9d4b7f4 6142 scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
d3f1934c 6143 expected_iterations / vectorization_factor);
6144 loop->nb_iterations_upper_bound
796b6678 6145 = wi::udiv_floor (loop->nb_iterations_upper_bound, vectorization_factor);
d3f1934c 6146 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
796b6678 6147 && loop->nb_iterations_upper_bound != 0)
e913b5cd 6148 loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - 1;
d3f1934c 6149 if (loop->any_estimate)
6150 {
6151 loop->nb_iterations_estimate
796b6678 6152 = wi::udiv_floor (loop->nb_iterations_estimate, vectorization_factor);
d3f1934c 6153 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
796b6678 6154 && loop->nb_iterations_estimate != 0)
e913b5cd 6155 loop->nb_iterations_estimate = loop->nb_iterations_estimate - 1;
d3f1934c 6156 }
6157
6d8fb6cf 6158 if (dump_enabled_p ())
b055bc88 6159 {
a21425b5 6160 dump_printf_loc (MSG_NOTE, vect_location,
b055bc88 6161 "LOOP VECTORIZED\n");
6162 if (loop->inner)
6163 dump_printf_loc (MSG_NOTE, vect_location,
6164 "OUTER LOOP VECTORIZED\n");
78bb46f5 6165 dump_printf (MSG_NOTE, "\n");
b055bc88 6166 }
fb85abff 6167}