]>
Commit | Line | Data |
---|---|---|
ebfd146a IR |
1 | /* Loop Vectorization |
2 | Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009 Free Software | |
3 | Foundation, Inc. | |
b8698a0f | 4 | Contributed by Dorit Naishlos <dorit@il.ibm.com> and |
ebfd146a IR |
5 | Ira Rosen <irar@il.ibm.com> |
6 | ||
7 | This file is part of GCC. | |
8 | ||
9 | GCC is free software; you can redistribute it and/or modify it under | |
10 | the terms of the GNU General Public License as published by the Free | |
11 | Software Foundation; either version 3, or (at your option) any later | |
12 | version. | |
13 | ||
14 | GCC is distributed in the hope that it will be useful, but WITHOUT ANY | |
15 | WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
16 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | |
17 | for more details. | |
18 | ||
19 | You should have received a copy of the GNU General Public License | |
20 | along with GCC; see the file COPYING3. If not see | |
21 | <http://www.gnu.org/licenses/>. */ | |
22 | ||
23 | #include "config.h" | |
24 | #include "system.h" | |
25 | #include "coretypes.h" | |
26 | #include "tm.h" | |
27 | #include "ggc.h" | |
28 | #include "tree.h" | |
29 | #include "basic-block.h" | |
30 | #include "diagnostic.h" | |
31 | #include "tree-flow.h" | |
32 | #include "tree-dump.h" | |
33 | #include "cfgloop.h" | |
34 | #include "cfglayout.h" | |
35 | #include "expr.h" | |
36 | #include "recog.h" | |
37 | #include "optabs.h" | |
38 | #include "params.h" | |
39 | #include "toplev.h" | |
40 | #include "tree-chrec.h" | |
41 | #include "tree-scalar-evolution.h" | |
42 | #include "tree-vectorizer.h" | |
43 | ||
44 | /* Loop Vectorization Pass. | |
45 | ||
b8698a0f | 46 | This pass tries to vectorize loops. |
ebfd146a IR |
47 | |
48 | For example, the vectorizer transforms the following simple loop: | |
49 | ||
50 | short a[N]; short b[N]; short c[N]; int i; | |
51 | ||
52 | for (i=0; i<N; i++){ | |
53 | a[i] = b[i] + c[i]; | |
54 | } | |
55 | ||
56 | as if it was manually vectorized by rewriting the source code into: | |
57 | ||
58 | typedef int __attribute__((mode(V8HI))) v8hi; | |
59 | short a[N]; short b[N]; short c[N]; int i; | |
60 | v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c; | |
61 | v8hi va, vb, vc; | |
62 | ||
63 | for (i=0; i<N/8; i++){ | |
64 | vb = pb[i]; | |
65 | vc = pc[i]; | |
66 | va = vb + vc; | |
67 | pa[i] = va; | |
68 | } | |
69 | ||
70 | The main entry to this pass is vectorize_loops(), in which | |
71 | the vectorizer applies a set of analyses on a given set of loops, | |
72 | followed by the actual vectorization transformation for the loops that | |
73 | had successfully passed the analysis phase. | |
74 | Throughout this pass we make a distinction between two types of | |
75 | data: scalars (which are represented by SSA_NAMES), and memory references | |
76 | ("data-refs"). These two types of data require different handling both | |
77 | during analysis and transformation. The types of data-refs that the | |
78 | vectorizer currently supports are ARRAY_REFS which base is an array DECL | |
79 | (not a pointer), and INDIRECT_REFS through pointers; both array and pointer | |
80 | accesses are required to have a simple (consecutive) access pattern. | |
81 | ||
82 | Analysis phase: | |
83 | =============== | |
84 | The driver for the analysis phase is vect_analyze_loop(). | |
85 | It applies a set of analyses, some of which rely on the scalar evolution | |
86 | analyzer (scev) developed by Sebastian Pop. | |
87 | ||
88 | During the analysis phase the vectorizer records some information | |
89 | per stmt in a "stmt_vec_info" struct which is attached to each stmt in the | |
90 | loop, as well as general information about the loop as a whole, which is | |
91 | recorded in a "loop_vec_info" struct attached to each loop. | |
92 | ||
93 | Transformation phase: | |
94 | ===================== | |
95 | The loop transformation phase scans all the stmts in the loop, and | |
96 | creates a vector stmt (or a sequence of stmts) for each scalar stmt S in | |
97 | the loop that needs to be vectorized. It inserts the vector code sequence | |
98 | just before the scalar stmt S, and records a pointer to the vector code | |
99 | in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct | |
100 | attached to S). This pointer will be used for the vectorization of following | |
101 | stmts which use the def of stmt S. Stmt S is removed if it writes to memory; | |
102 | otherwise, we rely on dead code elimination for removing it. | |
103 | ||
104 | For example, say stmt S1 was vectorized into stmt VS1: | |
105 | ||
106 | VS1: vb = px[i]; | |
107 | S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1 | |
108 | S2: a = b; | |
109 | ||
110 | To vectorize stmt S2, the vectorizer first finds the stmt that defines | |
111 | the operand 'b' (S1), and gets the relevant vector def 'vb' from the | |
112 | vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The | |
113 | resulting sequence would be: | |
114 | ||
115 | VS1: vb = px[i]; | |
116 | S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1 | |
117 | VS2: va = vb; | |
118 | S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2 | |
119 | ||
120 | Operands that are not SSA_NAMEs, are data-refs that appear in | |
121 | load/store operations (like 'x[i]' in S1), and are handled differently. | |
122 | ||
123 | Target modeling: | |
124 | ================= | |
125 | Currently the only target specific information that is used is the | |
126 | size of the vector (in bytes) - "UNITS_PER_SIMD_WORD". Targets that can | |
127 | support different sizes of vectors, for now will need to specify one value | |
128 | for "UNITS_PER_SIMD_WORD". More flexibility will be added in the future. | |
129 | ||
130 | Since we only vectorize operations which vector form can be | |
131 | expressed using existing tree codes, to verify that an operation is | |
132 | supported, the vectorizer checks the relevant optab at the relevant | |
133 | machine_mode (e.g, optab_handler (add_optab, V8HImode)->insn_code). If | |
134 | the value found is CODE_FOR_nothing, then there's no target support, and | |
135 | we can't vectorize the stmt. | |
136 | ||
137 | For additional information on this project see: | |
138 | http://gcc.gnu.org/projects/tree-ssa/vectorization.html | |
139 | */ | |
140 | ||
141 | /* Function vect_determine_vectorization_factor | |
142 | ||
143 | Determine the vectorization factor (VF). VF is the number of data elements | |
144 | that are operated upon in parallel in a single iteration of the vectorized | |
145 | loop. For example, when vectorizing a loop that operates on 4byte elements, | |
146 | on a target with vector size (VS) 16byte, the VF is set to 4, since 4 | |
147 | elements can fit in a single vector register. | |
148 | ||
149 | We currently support vectorization of loops in which all types operated upon | |
150 | are of the same size. Therefore this function currently sets VF according to | |
151 | the size of the types operated upon, and fails if there are multiple sizes | |
152 | in the loop. | |
153 | ||
154 | VF is also the factor by which the loop iterations are strip-mined, e.g.: | |
155 | original loop: | |
156 | for (i=0; i<N; i++){ | |
157 | a[i] = b[i] + c[i]; | |
158 | } | |
159 | ||
160 | vectorized loop: | |
161 | for (i=0; i<N; i+=VF){ | |
162 | a[i:VF] = b[i:VF] + c[i:VF]; | |
163 | } | |
164 | */ | |
165 | ||
166 | static bool | |
167 | vect_determine_vectorization_factor (loop_vec_info loop_vinfo) | |
168 | { | |
169 | struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
170 | basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); | |
171 | int nbbs = loop->num_nodes; | |
172 | gimple_stmt_iterator si; | |
173 | unsigned int vectorization_factor = 0; | |
174 | tree scalar_type; | |
175 | gimple phi; | |
176 | tree vectype; | |
177 | unsigned int nunits; | |
178 | stmt_vec_info stmt_info; | |
179 | int i; | |
180 | HOST_WIDE_INT dummy; | |
181 | ||
182 | if (vect_print_dump_info (REPORT_DETAILS)) | |
183 | fprintf (vect_dump, "=== vect_determine_vectorization_factor ==="); | |
184 | ||
185 | for (i = 0; i < nbbs; i++) | |
186 | { | |
187 | basic_block bb = bbs[i]; | |
188 | ||
189 | for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si)) | |
190 | { | |
191 | phi = gsi_stmt (si); | |
192 | stmt_info = vinfo_for_stmt (phi); | |
193 | if (vect_print_dump_info (REPORT_DETAILS)) | |
194 | { | |
195 | fprintf (vect_dump, "==> examining phi: "); | |
196 | print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM); | |
197 | } | |
198 | ||
199 | gcc_assert (stmt_info); | |
200 | ||
201 | if (STMT_VINFO_RELEVANT_P (stmt_info)) | |
202 | { | |
203 | gcc_assert (!STMT_VINFO_VECTYPE (stmt_info)); | |
204 | scalar_type = TREE_TYPE (PHI_RESULT (phi)); | |
205 | ||
206 | if (vect_print_dump_info (REPORT_DETAILS)) | |
207 | { | |
208 | fprintf (vect_dump, "get vectype for scalar type: "); | |
209 | print_generic_expr (vect_dump, scalar_type, TDF_SLIM); | |
210 | } | |
211 | ||
212 | vectype = get_vectype_for_scalar_type (scalar_type); | |
213 | if (!vectype) | |
214 | { | |
8644a673 | 215 | if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS)) |
ebfd146a IR |
216 | { |
217 | fprintf (vect_dump, | |
218 | "not vectorized: unsupported data-type "); | |
219 | print_generic_expr (vect_dump, scalar_type, TDF_SLIM); | |
220 | } | |
221 | return false; | |
222 | } | |
223 | STMT_VINFO_VECTYPE (stmt_info) = vectype; | |
224 | ||
225 | if (vect_print_dump_info (REPORT_DETAILS)) | |
226 | { | |
227 | fprintf (vect_dump, "vectype: "); | |
228 | print_generic_expr (vect_dump, vectype, TDF_SLIM); | |
229 | } | |
230 | ||
231 | nunits = TYPE_VECTOR_SUBPARTS (vectype); | |
232 | if (vect_print_dump_info (REPORT_DETAILS)) | |
233 | fprintf (vect_dump, "nunits = %d", nunits); | |
234 | ||
235 | if (!vectorization_factor | |
236 | || (nunits > vectorization_factor)) | |
237 | vectorization_factor = nunits; | |
238 | } | |
239 | } | |
240 | ||
241 | for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si)) | |
242 | { | |
243 | gimple stmt = gsi_stmt (si); | |
244 | stmt_info = vinfo_for_stmt (stmt); | |
245 | ||
246 | if (vect_print_dump_info (REPORT_DETAILS)) | |
247 | { | |
248 | fprintf (vect_dump, "==> examining statement: "); | |
249 | print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); | |
250 | } | |
251 | ||
252 | gcc_assert (stmt_info); | |
253 | ||
254 | /* skip stmts which do not need to be vectorized. */ | |
255 | if (!STMT_VINFO_RELEVANT_P (stmt_info) | |
256 | && !STMT_VINFO_LIVE_P (stmt_info)) | |
257 | { | |
258 | if (vect_print_dump_info (REPORT_DETAILS)) | |
259 | fprintf (vect_dump, "skip."); | |
260 | continue; | |
261 | } | |
262 | ||
263 | if (gimple_get_lhs (stmt) == NULL_TREE) | |
264 | { | |
8644a673 | 265 | if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS)) |
ebfd146a IR |
266 | { |
267 | fprintf (vect_dump, "not vectorized: irregular stmt."); | |
268 | print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); | |
269 | } | |
270 | return false; | |
271 | } | |
272 | ||
273 | if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt)))) | |
274 | { | |
8644a673 | 275 | if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS)) |
ebfd146a IR |
276 | { |
277 | fprintf (vect_dump, "not vectorized: vector stmt in loop:"); | |
278 | print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); | |
279 | } | |
280 | return false; | |
281 | } | |
282 | ||
283 | if (STMT_VINFO_VECTYPE (stmt_info)) | |
284 | { | |
b8698a0f | 285 | /* The only case when a vectype had been already set is for stmts |
ebfd146a IR |
286 | that contain a dataref, or for "pattern-stmts" (stmts generated |
287 | by the vectorizer to represent/replace a certain idiom). */ | |
b8698a0f | 288 | gcc_assert (STMT_VINFO_DATA_REF (stmt_info) |
ebfd146a IR |
289 | || is_pattern_stmt_p (stmt_info)); |
290 | vectype = STMT_VINFO_VECTYPE (stmt_info); | |
291 | } | |
292 | else | |
293 | { | |
06066f92 | 294 | gcc_assert (!STMT_VINFO_DATA_REF (stmt_info) |
ebfd146a IR |
295 | && !is_pattern_stmt_p (stmt_info)); |
296 | ||
b8698a0f | 297 | scalar_type = vect_get_smallest_scalar_type (stmt, &dummy, |
ebfd146a IR |
298 | &dummy); |
299 | if (vect_print_dump_info (REPORT_DETAILS)) | |
300 | { | |
301 | fprintf (vect_dump, "get vectype for scalar type: "); | |
302 | print_generic_expr (vect_dump, scalar_type, TDF_SLIM); | |
303 | } | |
304 | ||
305 | vectype = get_vectype_for_scalar_type (scalar_type); | |
306 | if (!vectype) | |
307 | { | |
8644a673 | 308 | if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS)) |
ebfd146a | 309 | { |
b8698a0f | 310 | fprintf (vect_dump, |
ebfd146a IR |
311 | "not vectorized: unsupported data-type "); |
312 | print_generic_expr (vect_dump, scalar_type, TDF_SLIM); | |
313 | } | |
314 | return false; | |
315 | } | |
316 | STMT_VINFO_VECTYPE (stmt_info) = vectype; | |
317 | } | |
318 | ||
319 | if (vect_print_dump_info (REPORT_DETAILS)) | |
320 | { | |
321 | fprintf (vect_dump, "vectype: "); | |
322 | print_generic_expr (vect_dump, vectype, TDF_SLIM); | |
323 | } | |
324 | ||
325 | nunits = TYPE_VECTOR_SUBPARTS (vectype); | |
326 | if (vect_print_dump_info (REPORT_DETAILS)) | |
327 | fprintf (vect_dump, "nunits = %d", nunits); | |
328 | ||
329 | if (!vectorization_factor | |
330 | || (nunits > vectorization_factor)) | |
331 | vectorization_factor = nunits; | |
332 | ||
333 | } | |
334 | } | |
335 | ||
336 | /* TODO: Analyze cost. Decide if worth while to vectorize. */ | |
337 | if (vect_print_dump_info (REPORT_DETAILS)) | |
338 | fprintf (vect_dump, "vectorization factor = %d", vectorization_factor); | |
339 | if (vectorization_factor <= 1) | |
340 | { | |
8644a673 | 341 | if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS)) |
ebfd146a IR |
342 | fprintf (vect_dump, "not vectorized: unsupported data-type"); |
343 | return false; | |
344 | } | |
345 | LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor; | |
346 | ||
347 | return true; | |
348 | } | |
349 | ||
350 | ||
351 | /* Function vect_is_simple_iv_evolution. | |
352 | ||
353 | FORNOW: A simple evolution of an induction variables in the loop is | |
354 | considered a polynomial evolution with constant step. */ | |
355 | ||
356 | static bool | |
357 | vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init, | |
358 | tree * step) | |
359 | { | |
360 | tree init_expr; | |
361 | tree step_expr; | |
362 | tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb); | |
363 | ||
364 | /* When there is no evolution in this loop, the evolution function | |
365 | is not "simple". */ | |
366 | if (evolution_part == NULL_TREE) | |
367 | return false; | |
368 | ||
369 | /* When the evolution is a polynomial of degree >= 2 | |
370 | the evolution function is not "simple". */ | |
371 | if (tree_is_chrec (evolution_part)) | |
372 | return false; | |
373 | ||
374 | step_expr = evolution_part; | |
375 | init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb)); | |
376 | ||
377 | if (vect_print_dump_info (REPORT_DETAILS)) | |
378 | { | |
379 | fprintf (vect_dump, "step: "); | |
380 | print_generic_expr (vect_dump, step_expr, TDF_SLIM); | |
381 | fprintf (vect_dump, ", init: "); | |
382 | print_generic_expr (vect_dump, init_expr, TDF_SLIM); | |
383 | } | |
384 | ||
385 | *init = init_expr; | |
386 | *step = step_expr; | |
387 | ||
388 | if (TREE_CODE (step_expr) != INTEGER_CST) | |
389 | { | |
390 | if (vect_print_dump_info (REPORT_DETAILS)) | |
391 | fprintf (vect_dump, "step unknown."); | |
392 | return false; | |
393 | } | |
394 | ||
395 | return true; | |
396 | } | |
397 | ||
398 | /* Function vect_analyze_scalar_cycles_1. | |
399 | ||
400 | Examine the cross iteration def-use cycles of scalar variables | |
401 | in LOOP. LOOP_VINFO represents the loop that is now being | |
402 | considered for vectorization (can be LOOP, or an outer-loop | |
403 | enclosing LOOP). */ | |
404 | ||
405 | static void | |
406 | vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop) | |
407 | { | |
408 | basic_block bb = loop->header; | |
409 | tree dumy; | |
410 | VEC(gimple,heap) *worklist = VEC_alloc (gimple, heap, 64); | |
411 | gimple_stmt_iterator gsi; | |
06066f92 | 412 | bool double_reduc; |
ebfd146a IR |
413 | |
414 | if (vect_print_dump_info (REPORT_DETAILS)) | |
415 | fprintf (vect_dump, "=== vect_analyze_scalar_cycles ==="); | |
416 | ||
7c5222ff | 417 | /* First - identify all inductions. Reduction detection assumes that all the |
b8698a0f | 418 | inductions have been identified, therefore, this order must not be |
7c5222ff | 419 | changed. */ |
ebfd146a IR |
420 | for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi)) |
421 | { | |
422 | gimple phi = gsi_stmt (gsi); | |
423 | tree access_fn = NULL; | |
424 | tree def = PHI_RESULT (phi); | |
425 | stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi); | |
426 | ||
427 | if (vect_print_dump_info (REPORT_DETAILS)) | |
428 | { | |
429 | fprintf (vect_dump, "Analyze phi: "); | |
430 | print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM); | |
431 | } | |
432 | ||
433 | /* Skip virtual phi's. The data dependences that are associated with | |
434 | virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */ | |
435 | if (!is_gimple_reg (SSA_NAME_VAR (def))) | |
436 | continue; | |
437 | ||
438 | STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type; | |
439 | ||
440 | /* Analyze the evolution function. */ | |
441 | access_fn = analyze_scalar_evolution (loop, def); | |
442 | if (access_fn && vect_print_dump_info (REPORT_DETAILS)) | |
443 | { | |
444 | fprintf (vect_dump, "Access function of PHI: "); | |
445 | print_generic_expr (vect_dump, access_fn, TDF_SLIM); | |
446 | } | |
447 | ||
448 | if (!access_fn | |
b8698a0f | 449 | || !vect_is_simple_iv_evolution (loop->num, access_fn, &dumy, &dumy)) |
ebfd146a | 450 | { |
b8698a0f | 451 | VEC_safe_push (gimple, heap, worklist, phi); |
ebfd146a IR |
452 | continue; |
453 | } | |
454 | ||
455 | if (vect_print_dump_info (REPORT_DETAILS)) | |
456 | fprintf (vect_dump, "Detected induction."); | |
457 | STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def; | |
458 | } | |
459 | ||
460 | ||
7c5222ff | 461 | /* Second - identify all reductions and nested cycles. */ |
ebfd146a IR |
462 | while (VEC_length (gimple, worklist) > 0) |
463 | { | |
464 | gimple phi = VEC_pop (gimple, worklist); | |
465 | tree def = PHI_RESULT (phi); | |
466 | stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi); | |
467 | gimple reduc_stmt; | |
7c5222ff | 468 | bool nested_cycle; |
ebfd146a IR |
469 | |
470 | if (vect_print_dump_info (REPORT_DETAILS)) | |
b8698a0f | 471 | { |
ebfd146a IR |
472 | fprintf (vect_dump, "Analyze phi: "); |
473 | print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM); | |
474 | } | |
475 | ||
476 | gcc_assert (is_gimple_reg (SSA_NAME_VAR (def))); | |
477 | gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type); | |
478 | ||
7c5222ff | 479 | nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo)); |
b8698a0f | 480 | reduc_stmt = vect_is_simple_reduction (loop_vinfo, phi, !nested_cycle, |
06066f92 | 481 | &double_reduc); |
ebfd146a IR |
482 | if (reduc_stmt) |
483 | { | |
06066f92 | 484 | if (double_reduc) |
7c5222ff IR |
485 | { |
486 | if (vect_print_dump_info (REPORT_DETAILS)) | |
06066f92 | 487 | fprintf (vect_dump, "Detected double reduction."); |
7c5222ff | 488 | |
06066f92 | 489 | STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def; |
7c5222ff | 490 | STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) = |
06066f92 | 491 | vect_double_reduction_def; |
7c5222ff | 492 | } |
b8698a0f | 493 | else |
7c5222ff | 494 | { |
06066f92 IR |
495 | if (nested_cycle) |
496 | { | |
497 | if (vect_print_dump_info (REPORT_DETAILS)) | |
498 | fprintf (vect_dump, "Detected vectorizable nested cycle."); | |
7c5222ff | 499 | |
06066f92 IR |
500 | STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle; |
501 | STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) = | |
502 | vect_nested_cycle; | |
503 | } | |
504 | else | |
505 | { | |
506 | if (vect_print_dump_info (REPORT_DETAILS)) | |
507 | fprintf (vect_dump, "Detected reduction."); | |
508 | ||
509 | STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def; | |
510 | STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) = | |
511 | vect_reduction_def; | |
512 | } | |
7c5222ff | 513 | } |
ebfd146a IR |
514 | } |
515 | else | |
516 | if (vect_print_dump_info (REPORT_DETAILS)) | |
517 | fprintf (vect_dump, "Unknown def-use cycle pattern."); | |
518 | } | |
519 | ||
520 | VEC_free (gimple, heap, worklist); | |
ebfd146a IR |
521 | } |
522 | ||
523 | ||
524 | /* Function vect_analyze_scalar_cycles. | |
525 | ||
526 | Examine the cross iteration def-use cycles of scalar variables, by | |
b8698a0f | 527 | analyzing the loop-header PHIs of scalar variables; Classify each |
ebfd146a IR |
528 | cycle as one of the following: invariant, induction, reduction, unknown. |
529 | We do that for the loop represented by LOOP_VINFO, and also to its | |
530 | inner-loop, if exists. | |
531 | Examples for scalar cycles: | |
532 | ||
533 | Example1: reduction: | |
534 | ||
535 | loop1: | |
536 | for (i=0; i<N; i++) | |
537 | sum += a[i]; | |
538 | ||
539 | Example2: induction: | |
540 | ||
541 | loop2: | |
542 | for (i=0; i<N; i++) | |
543 | a[i] = i; */ | |
544 | ||
545 | static void | |
546 | vect_analyze_scalar_cycles (loop_vec_info loop_vinfo) | |
547 | { | |
548 | struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
549 | ||
550 | vect_analyze_scalar_cycles_1 (loop_vinfo, loop); | |
551 | ||
552 | /* When vectorizing an outer-loop, the inner-loop is executed sequentially. | |
553 | Reductions in such inner-loop therefore have different properties than | |
554 | the reductions in the nest that gets vectorized: | |
555 | 1. When vectorized, they are executed in the same order as in the original | |
556 | scalar loop, so we can't change the order of computation when | |
557 | vectorizing them. | |
b8698a0f | 558 | 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the |
ebfd146a IR |
559 | current checks are too strict. */ |
560 | ||
561 | if (loop->inner) | |
562 | vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner); | |
563 | } | |
564 | ||
ebfd146a IR |
565 | /* Function vect_get_loop_niters. |
566 | ||
567 | Determine how many iterations the loop is executed. | |
568 | If an expression that represents the number of iterations | |
569 | can be constructed, place it in NUMBER_OF_ITERATIONS. | |
570 | Return the loop exit condition. */ | |
571 | ||
572 | static gimple | |
573 | vect_get_loop_niters (struct loop *loop, tree *number_of_iterations) | |
574 | { | |
575 | tree niters; | |
576 | ||
577 | if (vect_print_dump_info (REPORT_DETAILS)) | |
578 | fprintf (vect_dump, "=== get_loop_niters ==="); | |
579 | ||
580 | niters = number_of_exit_cond_executions (loop); | |
581 | ||
582 | if (niters != NULL_TREE | |
583 | && niters != chrec_dont_know) | |
584 | { | |
585 | *number_of_iterations = niters; | |
586 | ||
587 | if (vect_print_dump_info (REPORT_DETAILS)) | |
8644a673 IR |
588 | { |
589 | fprintf (vect_dump, "==> get_loop_niters:" ); | |
590 | print_generic_expr (vect_dump, *number_of_iterations, TDF_SLIM); | |
591 | } | |
ebfd146a IR |
592 | } |
593 | ||
594 | return get_loop_exit_condition (loop); | |
595 | } | |
596 | ||
597 | ||
598 | /* Function bb_in_loop_p | |
599 | ||
600 | Used as predicate for dfs order traversal of the loop bbs. */ | |
601 | ||
602 | static bool | |
603 | bb_in_loop_p (const_basic_block bb, const void *data) | |
604 | { | |
605 | const struct loop *const loop = (const struct loop *)data; | |
606 | if (flow_bb_inside_loop_p (loop, bb)) | |
607 | return true; | |
608 | return false; | |
609 | } | |
610 | ||
611 | ||
612 | /* Function new_loop_vec_info. | |
613 | ||
614 | Create and initialize a new loop_vec_info struct for LOOP, as well as | |
615 | stmt_vec_info structs for all the stmts in LOOP. */ | |
616 | ||
617 | static loop_vec_info | |
618 | new_loop_vec_info (struct loop *loop) | |
619 | { | |
620 | loop_vec_info res; | |
621 | basic_block *bbs; | |
622 | gimple_stmt_iterator si; | |
623 | unsigned int i, nbbs; | |
624 | ||
625 | res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info)); | |
626 | LOOP_VINFO_LOOP (res) = loop; | |
627 | ||
628 | bbs = get_loop_body (loop); | |
629 | ||
630 | /* Create/Update stmt_info for all stmts in the loop. */ | |
631 | for (i = 0; i < loop->num_nodes; i++) | |
632 | { | |
633 | basic_block bb = bbs[i]; | |
634 | ||
635 | /* BBs in a nested inner-loop will have been already processed (because | |
636 | we will have called vect_analyze_loop_form for any nested inner-loop). | |
637 | Therefore, for stmts in an inner-loop we just want to update the | |
638 | STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new | |
639 | loop_info of the outer-loop we are currently considering to vectorize | |
640 | (instead of the loop_info of the inner-loop). | |
641 | For stmts in other BBs we need to create a stmt_info from scratch. */ | |
642 | if (bb->loop_father != loop) | |
643 | { | |
644 | /* Inner-loop bb. */ | |
645 | gcc_assert (loop->inner && bb->loop_father == loop->inner); | |
646 | for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si)) | |
647 | { | |
648 | gimple phi = gsi_stmt (si); | |
649 | stmt_vec_info stmt_info = vinfo_for_stmt (phi); | |
650 | loop_vec_info inner_loop_vinfo = | |
651 | STMT_VINFO_LOOP_VINFO (stmt_info); | |
652 | gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo)); | |
653 | STMT_VINFO_LOOP_VINFO (stmt_info) = res; | |
654 | } | |
655 | for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si)) | |
656 | { | |
657 | gimple stmt = gsi_stmt (si); | |
658 | stmt_vec_info stmt_info = vinfo_for_stmt (stmt); | |
659 | loop_vec_info inner_loop_vinfo = | |
660 | STMT_VINFO_LOOP_VINFO (stmt_info); | |
661 | gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo)); | |
662 | STMT_VINFO_LOOP_VINFO (stmt_info) = res; | |
663 | } | |
664 | } | |
665 | else | |
666 | { | |
667 | /* bb in current nest. */ | |
668 | for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si)) | |
669 | { | |
670 | gimple phi = gsi_stmt (si); | |
671 | gimple_set_uid (phi, 0); | |
a70d6342 | 672 | set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL)); |
ebfd146a IR |
673 | } |
674 | ||
675 | for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si)) | |
676 | { | |
677 | gimple stmt = gsi_stmt (si); | |
678 | gimple_set_uid (stmt, 0); | |
a70d6342 | 679 | set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL)); |
ebfd146a IR |
680 | } |
681 | } | |
682 | } | |
683 | ||
684 | /* CHECKME: We want to visit all BBs before their successors (except for | |
685 | latch blocks, for which this assertion wouldn't hold). In the simple | |
686 | case of the loop forms we allow, a dfs order of the BBs would the same | |
687 | as reversed postorder traversal, so we are safe. */ | |
688 | ||
689 | free (bbs); | |
690 | bbs = XCNEWVEC (basic_block, loop->num_nodes); | |
691 | nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p, | |
692 | bbs, loop->num_nodes, loop); | |
693 | gcc_assert (nbbs == loop->num_nodes); | |
694 | ||
695 | LOOP_VINFO_BBS (res) = bbs; | |
696 | LOOP_VINFO_NITERS (res) = NULL; | |
697 | LOOP_VINFO_NITERS_UNCHANGED (res) = NULL; | |
698 | LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0; | |
699 | LOOP_VINFO_VECTORIZABLE_P (res) = 0; | |
700 | LOOP_PEELING_FOR_ALIGNMENT (res) = 0; | |
701 | LOOP_VINFO_VECT_FACTOR (res) = 0; | |
702 | LOOP_VINFO_DATAREFS (res) = VEC_alloc (data_reference_p, heap, 10); | |
703 | LOOP_VINFO_DDRS (res) = VEC_alloc (ddr_p, heap, 10 * 10); | |
704 | LOOP_VINFO_UNALIGNED_DR (res) = NULL; | |
705 | LOOP_VINFO_MAY_MISALIGN_STMTS (res) = | |
706 | VEC_alloc (gimple, heap, | |
707 | PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS)); | |
708 | LOOP_VINFO_MAY_ALIAS_DDRS (res) = | |
709 | VEC_alloc (ddr_p, heap, | |
710 | PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS)); | |
711 | LOOP_VINFO_STRIDED_STORES (res) = VEC_alloc (gimple, heap, 10); | |
712 | LOOP_VINFO_SLP_INSTANCES (res) = VEC_alloc (slp_instance, heap, 10); | |
713 | LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1; | |
714 | ||
715 | return res; | |
716 | } | |
717 | ||
718 | ||
719 | /* Function destroy_loop_vec_info. | |
720 | ||
721 | Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the | |
722 | stmts in the loop. */ | |
723 | ||
724 | void | |
725 | destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts) | |
726 | { | |
727 | struct loop *loop; | |
728 | basic_block *bbs; | |
729 | int nbbs; | |
730 | gimple_stmt_iterator si; | |
731 | int j; | |
732 | VEC (slp_instance, heap) *slp_instances; | |
733 | slp_instance instance; | |
734 | ||
735 | if (!loop_vinfo) | |
736 | return; | |
737 | ||
738 | loop = LOOP_VINFO_LOOP (loop_vinfo); | |
739 | ||
740 | bbs = LOOP_VINFO_BBS (loop_vinfo); | |
741 | nbbs = loop->num_nodes; | |
742 | ||
743 | if (!clean_stmts) | |
744 | { | |
745 | free (LOOP_VINFO_BBS (loop_vinfo)); | |
746 | free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo)); | |
747 | free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo)); | |
748 | VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)); | |
749 | ||
750 | free (loop_vinfo); | |
751 | loop->aux = NULL; | |
752 | return; | |
753 | } | |
754 | ||
755 | for (j = 0; j < nbbs; j++) | |
756 | { | |
757 | basic_block bb = bbs[j]; | |
758 | for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si)) | |
759 | free_stmt_vec_info (gsi_stmt (si)); | |
760 | ||
761 | for (si = gsi_start_bb (bb); !gsi_end_p (si); ) | |
762 | { | |
763 | gimple stmt = gsi_stmt (si); | |
764 | stmt_vec_info stmt_info = vinfo_for_stmt (stmt); | |
765 | ||
766 | if (stmt_info) | |
767 | { | |
768 | /* Check if this is a "pattern stmt" (introduced by the | |
769 | vectorizer during the pattern recognition pass). */ | |
770 | bool remove_stmt_p = false; | |
771 | gimple orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info); | |
772 | if (orig_stmt) | |
773 | { | |
774 | stmt_vec_info orig_stmt_info = vinfo_for_stmt (orig_stmt); | |
775 | if (orig_stmt_info | |
776 | && STMT_VINFO_IN_PATTERN_P (orig_stmt_info)) | |
777 | remove_stmt_p = true; | |
778 | } | |
779 | ||
780 | /* Free stmt_vec_info. */ | |
781 | free_stmt_vec_info (stmt); | |
782 | ||
783 | /* Remove dead "pattern stmts". */ | |
784 | if (remove_stmt_p) | |
785 | gsi_remove (&si, true); | |
786 | } | |
787 | gsi_next (&si); | |
788 | } | |
789 | } | |
790 | ||
791 | free (LOOP_VINFO_BBS (loop_vinfo)); | |
792 | free_data_refs (LOOP_VINFO_DATAREFS (loop_vinfo)); | |
793 | free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo)); | |
794 | VEC_free (gimple, heap, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)); | |
795 | VEC_free (ddr_p, heap, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)); | |
796 | slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo); | |
797 | for (j = 0; VEC_iterate (slp_instance, slp_instances, j, instance); j++) | |
798 | vect_free_slp_instance (instance); | |
799 | ||
800 | VEC_free (slp_instance, heap, LOOP_VINFO_SLP_INSTANCES (loop_vinfo)); | |
801 | VEC_free (gimple, heap, LOOP_VINFO_STRIDED_STORES (loop_vinfo)); | |
802 | ||
803 | free (loop_vinfo); | |
804 | loop->aux = NULL; | |
805 | } | |
806 | ||
807 | ||
808 | /* Function vect_analyze_loop_1. | |
809 | ||
810 | Apply a set of analyses on LOOP, and create a loop_vec_info struct | |
811 | for it. The different analyses will record information in the | |
812 | loop_vec_info struct. This is a subset of the analyses applied in | |
813 | vect_analyze_loop, to be applied on an inner-loop nested in the loop | |
814 | that is now considered for (outer-loop) vectorization. */ | |
815 | ||
816 | static loop_vec_info | |
817 | vect_analyze_loop_1 (struct loop *loop) | |
818 | { | |
819 | loop_vec_info loop_vinfo; | |
820 | ||
821 | if (vect_print_dump_info (REPORT_DETAILS)) | |
822 | fprintf (vect_dump, "===== analyze_loop_nest_1 ====="); | |
823 | ||
824 | /* Check the CFG characteristics of the loop (nesting, entry/exit, etc. */ | |
825 | ||
826 | loop_vinfo = vect_analyze_loop_form (loop); | |
827 | if (!loop_vinfo) | |
828 | { | |
829 | if (vect_print_dump_info (REPORT_DETAILS)) | |
830 | fprintf (vect_dump, "bad inner-loop form."); | |
831 | return NULL; | |
832 | } | |
833 | ||
834 | return loop_vinfo; | |
835 | } | |
836 | ||
837 | ||
838 | /* Function vect_analyze_loop_form. | |
839 | ||
840 | Verify that certain CFG restrictions hold, including: | |
841 | - the loop has a pre-header | |
842 | - the loop has a single entry and exit | |
843 | - the loop exit condition is simple enough, and the number of iterations | |
844 | can be analyzed (a countable loop). */ | |
845 | ||
846 | loop_vec_info | |
847 | vect_analyze_loop_form (struct loop *loop) | |
848 | { | |
849 | loop_vec_info loop_vinfo; | |
850 | gimple loop_cond; | |
851 | tree number_of_iterations = NULL; | |
852 | loop_vec_info inner_loop_vinfo = NULL; | |
853 | ||
854 | if (vect_print_dump_info (REPORT_DETAILS)) | |
855 | fprintf (vect_dump, "=== vect_analyze_loop_form ==="); | |
856 | ||
857 | /* Different restrictions apply when we are considering an inner-most loop, | |
b8698a0f | 858 | vs. an outer (nested) loop. |
ebfd146a IR |
859 | (FORNOW. May want to relax some of these restrictions in the future). */ |
860 | ||
861 | if (!loop->inner) | |
862 | { | |
b8698a0f L |
863 | /* Inner-most loop. We currently require that the number of BBs is |
864 | exactly 2 (the header and latch). Vectorizable inner-most loops | |
ebfd146a IR |
865 | look like this: |
866 | ||
867 | (pre-header) | |
868 | | | |
869 | header <--------+ | |
870 | | | | | |
871 | | +--> latch --+ | |
872 | | | |
873 | (exit-bb) */ | |
874 | ||
875 | if (loop->num_nodes != 2) | |
876 | { | |
877 | if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS)) | |
e9dbe7bb | 878 | fprintf (vect_dump, "not vectorized: control flow in loop."); |
ebfd146a IR |
879 | return NULL; |
880 | } | |
881 | ||
882 | if (empty_block_p (loop->header)) | |
883 | { | |
884 | if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS)) | |
885 | fprintf (vect_dump, "not vectorized: empty loop."); | |
886 | return NULL; | |
887 | } | |
888 | } | |
889 | else | |
890 | { | |
891 | struct loop *innerloop = loop->inner; | |
0f900dfa | 892 | edge entryedge; |
ebfd146a IR |
893 | |
894 | /* Nested loop. We currently require that the loop is doubly-nested, | |
b8698a0f | 895 | contains a single inner loop, and the number of BBs is exactly 5. |
ebfd146a IR |
896 | Vectorizable outer-loops look like this: |
897 | ||
898 | (pre-header) | |
899 | | | |
900 | header <---+ | |
901 | | | | |
902 | inner-loop | | |
903 | | | | |
904 | tail ------+ | |
b8698a0f | 905 | | |
ebfd146a IR |
906 | (exit-bb) |
907 | ||
908 | The inner-loop has the properties expected of inner-most loops | |
909 | as described above. */ | |
910 | ||
911 | if ((loop->inner)->inner || (loop->inner)->next) | |
912 | { | |
913 | if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS)) | |
914 | fprintf (vect_dump, "not vectorized: multiple nested loops."); | |
915 | return NULL; | |
916 | } | |
917 | ||
918 | /* Analyze the inner-loop. */ | |
919 | inner_loop_vinfo = vect_analyze_loop_1 (loop->inner); | |
920 | if (!inner_loop_vinfo) | |
921 | { | |
922 | if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS)) | |
923 | fprintf (vect_dump, "not vectorized: Bad inner loop."); | |
924 | return NULL; | |
925 | } | |
926 | ||
927 | if (!expr_invariant_in_loop_p (loop, | |
928 | LOOP_VINFO_NITERS (inner_loop_vinfo))) | |
929 | { | |
930 | if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS)) | |
931 | fprintf (vect_dump, | |
932 | "not vectorized: inner-loop count not invariant."); | |
933 | destroy_loop_vec_info (inner_loop_vinfo, true); | |
934 | return NULL; | |
935 | } | |
936 | ||
b8698a0f | 937 | if (loop->num_nodes != 5) |
ebfd146a IR |
938 | { |
939 | if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS)) | |
e9dbe7bb | 940 | fprintf (vect_dump, "not vectorized: control flow in loop."); |
ebfd146a IR |
941 | destroy_loop_vec_info (inner_loop_vinfo, true); |
942 | return NULL; | |
943 | } | |
944 | ||
945 | gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2); | |
ebfd146a IR |
946 | entryedge = EDGE_PRED (innerloop->header, 0); |
947 | if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch) | |
0f900dfa | 948 | entryedge = EDGE_PRED (innerloop->header, 1); |
b8698a0f | 949 | |
ebfd146a IR |
950 | if (entryedge->src != loop->header |
951 | || !single_exit (innerloop) | |
952 | || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src) | |
953 | { | |
954 | if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS)) | |
955 | fprintf (vect_dump, "not vectorized: unsupported outerloop form."); | |
956 | destroy_loop_vec_info (inner_loop_vinfo, true); | |
957 | return NULL; | |
958 | } | |
959 | ||
960 | if (vect_print_dump_info (REPORT_DETAILS)) | |
961 | fprintf (vect_dump, "Considering outer-loop vectorization."); | |
962 | } | |
b8698a0f L |
963 | |
964 | if (!single_exit (loop) | |
ebfd146a IR |
965 | || EDGE_COUNT (loop->header->preds) != 2) |
966 | { | |
967 | if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS)) | |
968 | { | |
969 | if (!single_exit (loop)) | |
970 | fprintf (vect_dump, "not vectorized: multiple exits."); | |
971 | else if (EDGE_COUNT (loop->header->preds) != 2) | |
972 | fprintf (vect_dump, "not vectorized: too many incoming edges."); | |
973 | } | |
974 | if (inner_loop_vinfo) | |
975 | destroy_loop_vec_info (inner_loop_vinfo, true); | |
976 | return NULL; | |
977 | } | |
978 | ||
979 | /* We assume that the loop exit condition is at the end of the loop. i.e, | |
980 | that the loop is represented as a do-while (with a proper if-guard | |
981 | before the loop if needed), where the loop header contains all the | |
982 | executable statements, and the latch is empty. */ | |
983 | if (!empty_block_p (loop->latch) | |
984 | || phi_nodes (loop->latch)) | |
985 | { | |
986 | if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS)) | |
987 | fprintf (vect_dump, "not vectorized: unexpected loop form."); | |
988 | if (inner_loop_vinfo) | |
989 | destroy_loop_vec_info (inner_loop_vinfo, true); | |
990 | return NULL; | |
991 | } | |
992 | ||
993 | /* Make sure there exists a single-predecessor exit bb: */ | |
994 | if (!single_pred_p (single_exit (loop)->dest)) | |
995 | { | |
996 | edge e = single_exit (loop); | |
997 | if (!(e->flags & EDGE_ABNORMAL)) | |
998 | { | |
999 | split_loop_exit_edge (e); | |
1000 | if (vect_print_dump_info (REPORT_DETAILS)) | |
1001 | fprintf (vect_dump, "split exit edge."); | |
1002 | } | |
1003 | else | |
1004 | { | |
1005 | if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS)) | |
1006 | fprintf (vect_dump, "not vectorized: abnormal loop exit edge."); | |
1007 | if (inner_loop_vinfo) | |
1008 | destroy_loop_vec_info (inner_loop_vinfo, true); | |
1009 | return NULL; | |
1010 | } | |
1011 | } | |
1012 | ||
1013 | loop_cond = vect_get_loop_niters (loop, &number_of_iterations); | |
1014 | if (!loop_cond) | |
1015 | { | |
1016 | if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS)) | |
1017 | fprintf (vect_dump, "not vectorized: complicated exit condition."); | |
1018 | if (inner_loop_vinfo) | |
1019 | destroy_loop_vec_info (inner_loop_vinfo, true); | |
1020 | return NULL; | |
1021 | } | |
b8698a0f L |
1022 | |
1023 | if (!number_of_iterations) | |
ebfd146a IR |
1024 | { |
1025 | if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS)) | |
b8698a0f | 1026 | fprintf (vect_dump, |
ebfd146a IR |
1027 | "not vectorized: number of iterations cannot be computed."); |
1028 | if (inner_loop_vinfo) | |
1029 | destroy_loop_vec_info (inner_loop_vinfo, true); | |
1030 | return NULL; | |
1031 | } | |
1032 | ||
1033 | if (chrec_contains_undetermined (number_of_iterations)) | |
1034 | { | |
1035 | if (vect_print_dump_info (REPORT_BAD_FORM_LOOPS)) | |
1036 | fprintf (vect_dump, "Infinite number of iterations."); | |
1037 | if (inner_loop_vinfo) | |
1038 | destroy_loop_vec_info (inner_loop_vinfo, true); | |
1039 | return NULL; | |
1040 | } | |
1041 | ||
1042 | if (!NITERS_KNOWN_P (number_of_iterations)) | |
1043 | { | |
1044 | if (vect_print_dump_info (REPORT_DETAILS)) | |
1045 | { | |
1046 | fprintf (vect_dump, "Symbolic number of iterations is "); | |
1047 | print_generic_expr (vect_dump, number_of_iterations, TDF_DETAILS); | |
1048 | } | |
1049 | } | |
1050 | else if (TREE_INT_CST_LOW (number_of_iterations) == 0) | |
1051 | { | |
8644a673 | 1052 | if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS)) |
ebfd146a IR |
1053 | fprintf (vect_dump, "not vectorized: number of iterations = 0."); |
1054 | if (inner_loop_vinfo) | |
1055 | destroy_loop_vec_info (inner_loop_vinfo, false); | |
1056 | return NULL; | |
1057 | } | |
1058 | ||
1059 | loop_vinfo = new_loop_vec_info (loop); | |
1060 | LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations; | |
1061 | LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations; | |
1062 | ||
1063 | STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type; | |
1064 | ||
1065 | /* CHECKME: May want to keep it around it in the future. */ | |
1066 | if (inner_loop_vinfo) | |
1067 | destroy_loop_vec_info (inner_loop_vinfo, false); | |
1068 | ||
1069 | gcc_assert (!loop->aux); | |
1070 | loop->aux = loop_vinfo; | |
1071 | return loop_vinfo; | |
1072 | } | |
1073 | ||
8644a673 IR |
1074 | |
1075 | /* Function vect_analyze_loop_operations. | |
1076 | ||
1077 | Scan the loop stmts and make sure they are all vectorizable. */ | |
1078 | ||
1079 | static bool | |
1080 | vect_analyze_loop_operations (loop_vec_info loop_vinfo) | |
1081 | { | |
1082 | struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
1083 | basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); | |
1084 | int nbbs = loop->num_nodes; | |
1085 | gimple_stmt_iterator si; | |
1086 | unsigned int vectorization_factor = 0; | |
1087 | int i; | |
1088 | gimple phi; | |
1089 | stmt_vec_info stmt_info; | |
1090 | bool need_to_vectorize = false; | |
1091 | int min_profitable_iters; | |
1092 | int min_scalar_loop_bound; | |
1093 | unsigned int th; | |
1094 | bool only_slp_in_loop = true, ok; | |
1095 | ||
1096 | if (vect_print_dump_info (REPORT_DETAILS)) | |
1097 | fprintf (vect_dump, "=== vect_analyze_loop_operations ==="); | |
1098 | ||
1099 | gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo)); | |
1100 | vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); | |
1101 | ||
1102 | for (i = 0; i < nbbs; i++) | |
1103 | { | |
1104 | basic_block bb = bbs[i]; | |
1105 | ||
1106 | for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si)) | |
1107 | { | |
1108 | phi = gsi_stmt (si); | |
1109 | ok = true; | |
1110 | ||
1111 | stmt_info = vinfo_for_stmt (phi); | |
1112 | if (vect_print_dump_info (REPORT_DETAILS)) | |
1113 | { | |
1114 | fprintf (vect_dump, "examining phi: "); | |
1115 | print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM); | |
1116 | } | |
1117 | ||
1118 | if (! is_loop_header_bb_p (bb)) | |
1119 | { | |
1120 | /* inner-loop loop-closed exit phi in outer-loop vectorization | |
1121 | (i.e. a phi in the tail of the outer-loop). | |
1122 | FORNOW: we currently don't support the case that these phis | |
06066f92 | 1123 | are not used in the outerloop (unless it is double reduction, |
b8698a0f | 1124 | i.e., this phi is vect_reduction_def), cause this case |
06066f92 IR |
1125 | requires to actually do something here. */ |
1126 | if ((!STMT_VINFO_RELEVANT_P (stmt_info) | |
1127 | || STMT_VINFO_LIVE_P (stmt_info)) | |
b8698a0f | 1128 | && STMT_VINFO_DEF_TYPE (stmt_info) |
06066f92 | 1129 | != vect_double_reduction_def) |
8644a673 IR |
1130 | { |
1131 | if (vect_print_dump_info (REPORT_DETAILS)) | |
1132 | fprintf (vect_dump, | |
1133 | "Unsupported loop-closed phi in outer-loop."); | |
1134 | return false; | |
1135 | } | |
1136 | continue; | |
1137 | } | |
1138 | ||
1139 | gcc_assert (stmt_info); | |
1140 | ||
1141 | if (STMT_VINFO_LIVE_P (stmt_info)) | |
1142 | { | |
1143 | /* FORNOW: not yet supported. */ | |
1144 | if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS)) | |
1145 | fprintf (vect_dump, "not vectorized: value used after loop."); | |
1146 | return false; | |
1147 | } | |
1148 | ||
1149 | if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope | |
1150 | && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def) | |
1151 | { | |
1152 | /* A scalar-dependence cycle that we don't support. */ | |
1153 | if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS)) | |
1154 | fprintf (vect_dump, "not vectorized: scalar dependence cycle."); | |
1155 | return false; | |
1156 | } | |
1157 | ||
1158 | if (STMT_VINFO_RELEVANT_P (stmt_info)) | |
1159 | { | |
1160 | need_to_vectorize = true; | |
1161 | if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def) | |
1162 | ok = vectorizable_induction (phi, NULL, NULL); | |
1163 | } | |
1164 | ||
1165 | if (!ok) | |
1166 | { | |
1167 | if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS)) | |
1168 | { | |
1169 | fprintf (vect_dump, | |
1170 | "not vectorized: relevant phi not supported: "); | |
1171 | print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM); | |
1172 | } | |
1173 | return false; | |
1174 | } | |
1175 | } | |
1176 | ||
1177 | for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si)) | |
1178 | { | |
1179 | gimple stmt = gsi_stmt (si); | |
1180 | stmt_vec_info stmt_info = vinfo_for_stmt (stmt); | |
1181 | ||
1182 | gcc_assert (stmt_info); | |
1183 | ||
a70d6342 | 1184 | if (!vect_analyze_stmt (stmt, &need_to_vectorize, NULL)) |
8644a673 IR |
1185 | return false; |
1186 | ||
1187 | if (STMT_VINFO_RELEVANT_P (stmt_info) && !PURE_SLP_STMT (stmt_info)) | |
1188 | /* STMT needs both SLP and loop-based vectorization. */ | |
1189 | only_slp_in_loop = false; | |
b8698a0f | 1190 | } |
8644a673 IR |
1191 | } /* bbs */ |
1192 | ||
1193 | /* All operations in the loop are either irrelevant (deal with loop | |
1194 | control, or dead), or only used outside the loop and can be moved | |
1195 | out of the loop (e.g. invariants, inductions). The loop can be | |
1196 | optimized away by scalar optimizations. We're better off not | |
1197 | touching this loop. */ | |
1198 | if (!need_to_vectorize) | |
1199 | { | |
1200 | if (vect_print_dump_info (REPORT_DETAILS)) | |
1201 | fprintf (vect_dump, | |
1202 | "All the computation can be taken out of the loop."); | |
1203 | if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS)) | |
1204 | fprintf (vect_dump, | |
1205 | "not vectorized: redundant loop. no profit to vectorize."); | |
1206 | return false; | |
1207 | } | |
1208 | ||
1209 | /* If all the stmts in the loop can be SLPed, we perform only SLP, and | |
1210 | vectorization factor of the loop is the unrolling factor required by the | |
1211 | SLP instances. If that unrolling factor is 1, we say, that we perform | |
1212 | pure SLP on loop - cross iteration parallelism is not exploited. */ | |
1213 | if (only_slp_in_loop) | |
1214 | vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo); | |
1215 | else | |
1216 | vectorization_factor = least_common_multiple (vectorization_factor, | |
1217 | LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo)); | |
1218 | ||
1219 | LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor; | |
1220 | ||
1221 | if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) | |
1222 | && vect_print_dump_info (REPORT_DETAILS)) | |
1223 | fprintf (vect_dump, | |
1224 | "vectorization_factor = %d, niters = " HOST_WIDE_INT_PRINT_DEC, | |
1225 | vectorization_factor, LOOP_VINFO_INT_NITERS (loop_vinfo)); | |
1226 | ||
1227 | if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) | |
1228 | && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor)) | |
1229 | { | |
1230 | if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS)) | |
1231 | fprintf (vect_dump, "not vectorized: iteration count too small."); | |
1232 | if (vect_print_dump_info (REPORT_DETAILS)) | |
1233 | fprintf (vect_dump,"not vectorized: iteration count smaller than " | |
1234 | "vectorization factor."); | |
1235 | return false; | |
1236 | } | |
1237 | ||
1238 | /* Analyze cost. Decide if worth while to vectorize. */ | |
1239 | ||
1240 | /* Once VF is set, SLP costs should be updated since the number of created | |
1241 | vector stmts depends on VF. */ | |
1242 | vect_update_slp_costs_according_to_vf (loop_vinfo); | |
1243 | ||
1244 | min_profitable_iters = vect_estimate_min_profitable_iters (loop_vinfo); | |
1245 | LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters; | |
1246 | ||
1247 | if (min_profitable_iters < 0) | |
1248 | { | |
1249 | if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS)) | |
1250 | fprintf (vect_dump, "not vectorized: vectorization not profitable."); | |
1251 | if (vect_print_dump_info (REPORT_DETAILS)) | |
1252 | fprintf (vect_dump, "not vectorized: vector version will never be " | |
1253 | "profitable."); | |
1254 | return false; | |
1255 | } | |
1256 | ||
1257 | min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND) | |
1258 | * vectorization_factor) - 1); | |
1259 | ||
1260 | /* Use the cost model only if it is more conservative than user specified | |
1261 | threshold. */ | |
1262 | ||
1263 | th = (unsigned) min_scalar_loop_bound; | |
1264 | if (min_profitable_iters | |
1265 | && (!min_scalar_loop_bound | |
1266 | || min_profitable_iters > min_scalar_loop_bound)) | |
1267 | th = (unsigned) min_profitable_iters; | |
1268 | ||
1269 | if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) | |
1270 | && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th) | |
1271 | { | |
1272 | if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS)) | |
1273 | fprintf (vect_dump, "not vectorized: vectorization not " | |
1274 | "profitable."); | |
1275 | if (vect_print_dump_info (REPORT_DETAILS)) | |
1276 | fprintf (vect_dump, "not vectorized: iteration count smaller than " | |
1277 | "user specified loop bound parameter or minimum " | |
1278 | "profitable iterations (whichever is more conservative)."); | |
1279 | return false; | |
1280 | } | |
1281 | ||
1282 | if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) | |
1283 | || LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0 | |
1284 | || LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo)) | |
1285 | { | |
1286 | if (vect_print_dump_info (REPORT_DETAILS)) | |
1287 | fprintf (vect_dump, "epilog loop required."); | |
1288 | if (!vect_can_advance_ivs_p (loop_vinfo)) | |
1289 | { | |
1290 | if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS)) | |
1291 | fprintf (vect_dump, | |
1292 | "not vectorized: can't create epilog loop 1."); | |
1293 | return false; | |
1294 | } | |
1295 | if (!slpeel_can_duplicate_loop_p (loop, single_exit (loop))) | |
1296 | { | |
1297 | if (vect_print_dump_info (REPORT_UNVECTORIZED_LOCATIONS)) | |
1298 | fprintf (vect_dump, | |
1299 | "not vectorized: can't create epilog loop 2."); | |
1300 | return false; | |
1301 | } | |
1302 | } | |
1303 | ||
1304 | return true; | |
1305 | } | |
1306 | ||
1307 | ||
ebfd146a IR |
1308 | /* Function vect_analyze_loop. |
1309 | ||
1310 | Apply a set of analyses on LOOP, and create a loop_vec_info struct | |
1311 | for it. The different analyses will record information in the | |
1312 | loop_vec_info struct. */ | |
1313 | loop_vec_info | |
1314 | vect_analyze_loop (struct loop *loop) | |
1315 | { | |
1316 | bool ok; | |
1317 | loop_vec_info loop_vinfo; | |
1318 | ||
1319 | if (vect_print_dump_info (REPORT_DETAILS)) | |
1320 | fprintf (vect_dump, "===== analyze_loop_nest ====="); | |
1321 | ||
b8698a0f | 1322 | if (loop_outer (loop) |
ebfd146a IR |
1323 | && loop_vec_info_for_loop (loop_outer (loop)) |
1324 | && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop)))) | |
1325 | { | |
1326 | if (vect_print_dump_info (REPORT_DETAILS)) | |
1327 | fprintf (vect_dump, "outer-loop already vectorized."); | |
1328 | return NULL; | |
1329 | } | |
1330 | ||
1331 | /* Check the CFG characteristics of the loop (nesting, entry/exit, etc. */ | |
1332 | ||
1333 | loop_vinfo = vect_analyze_loop_form (loop); | |
1334 | if (!loop_vinfo) | |
1335 | { | |
1336 | if (vect_print_dump_info (REPORT_DETAILS)) | |
1337 | fprintf (vect_dump, "bad loop form."); | |
1338 | return NULL; | |
1339 | } | |
1340 | ||
1341 | /* Find all data references in the loop (which correspond to vdefs/vuses) | |
1342 | and analyze their evolution in the loop. | |
1343 | ||
1344 | FORNOW: Handle only simple, array references, which | |
1345 | alignment can be forced, and aligned pointer-references. */ | |
1346 | ||
a70d6342 | 1347 | ok = vect_analyze_data_refs (loop_vinfo, NULL); |
ebfd146a IR |
1348 | if (!ok) |
1349 | { | |
1350 | if (vect_print_dump_info (REPORT_DETAILS)) | |
1351 | fprintf (vect_dump, "bad data references."); | |
1352 | destroy_loop_vec_info (loop_vinfo, true); | |
1353 | return NULL; | |
1354 | } | |
1355 | ||
1356 | /* Classify all cross-iteration scalar data-flow cycles. | |
1357 | Cross-iteration cycles caused by virtual phis are analyzed separately. */ | |
1358 | ||
1359 | vect_analyze_scalar_cycles (loop_vinfo); | |
1360 | ||
1361 | vect_pattern_recog (loop_vinfo); | |
1362 | ||
1363 | /* Data-flow analysis to detect stmts that do not need to be vectorized. */ | |
1364 | ||
1365 | ok = vect_mark_stmts_to_be_vectorized (loop_vinfo); | |
1366 | if (!ok) | |
1367 | { | |
1368 | if (vect_print_dump_info (REPORT_DETAILS)) | |
1369 | fprintf (vect_dump, "unexpected pattern."); | |
1370 | destroy_loop_vec_info (loop_vinfo, true); | |
1371 | return NULL; | |
1372 | } | |
1373 | ||
1374 | /* Analyze the alignment of the data-refs in the loop. | |
1375 | Fail if a data reference is found that cannot be vectorized. */ | |
1376 | ||
a70d6342 | 1377 | ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL); |
ebfd146a IR |
1378 | if (!ok) |
1379 | { | |
1380 | if (vect_print_dump_info (REPORT_DETAILS)) | |
1381 | fprintf (vect_dump, "bad data alignment."); | |
1382 | destroy_loop_vec_info (loop_vinfo, true); | |
1383 | return NULL; | |
1384 | } | |
1385 | ||
1386 | ok = vect_determine_vectorization_factor (loop_vinfo); | |
1387 | if (!ok) | |
1388 | { | |
1389 | if (vect_print_dump_info (REPORT_DETAILS)) | |
1390 | fprintf (vect_dump, "can't determine vectorization factor."); | |
1391 | destroy_loop_vec_info (loop_vinfo, true); | |
1392 | return NULL; | |
1393 | } | |
1394 | ||
b8698a0f | 1395 | /* Analyze data dependences between the data-refs in the loop. |
ebfd146a IR |
1396 | FORNOW: fail at the first data dependence that we encounter. */ |
1397 | ||
a70d6342 | 1398 | ok = vect_analyze_data_ref_dependences (loop_vinfo, NULL); |
ebfd146a IR |
1399 | if (!ok) |
1400 | { | |
1401 | if (vect_print_dump_info (REPORT_DETAILS)) | |
1402 | fprintf (vect_dump, "bad data dependence."); | |
1403 | destroy_loop_vec_info (loop_vinfo, true); | |
1404 | return NULL; | |
1405 | } | |
1406 | ||
1407 | /* Analyze the access patterns of the data-refs in the loop (consecutive, | |
1408 | complex, etc.). FORNOW: Only handle consecutive access pattern. */ | |
1409 | ||
a70d6342 | 1410 | ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL); |
ebfd146a IR |
1411 | if (!ok) |
1412 | { | |
1413 | if (vect_print_dump_info (REPORT_DETAILS)) | |
1414 | fprintf (vect_dump, "bad data access."); | |
1415 | destroy_loop_vec_info (loop_vinfo, true); | |
1416 | return NULL; | |
1417 | } | |
1418 | ||
1419 | /* Prune the list of ddrs to be tested at run-time by versioning for alias. | |
1420 | It is important to call pruning after vect_analyze_data_ref_accesses, | |
1421 | since we use grouping information gathered by interleaving analysis. */ | |
1422 | ok = vect_prune_runtime_alias_test_list (loop_vinfo); | |
1423 | if (!ok) | |
1424 | { | |
1425 | if (vect_print_dump_info (REPORT_DETAILS)) | |
1426 | fprintf (vect_dump, "too long list of versioning for alias " | |
1427 | "run-time tests."); | |
1428 | destroy_loop_vec_info (loop_vinfo, true); | |
1429 | return NULL; | |
1430 | } | |
1431 | ||
1432 | /* Check the SLP opportunities in the loop, analyze and build SLP trees. */ | |
a70d6342 | 1433 | ok = vect_analyze_slp (loop_vinfo, NULL); |
ebfd146a IR |
1434 | if (ok) |
1435 | { | |
1436 | /* Decide which possible SLP instances to SLP. */ | |
1437 | vect_make_slp_decision (loop_vinfo); | |
1438 | ||
1439 | /* Find stmts that need to be both vectorized and SLPed. */ | |
1440 | vect_detect_hybrid_slp (loop_vinfo); | |
1441 | } | |
1442 | ||
1443 | /* This pass will decide on using loop versioning and/or loop peeling in | |
1444 | order to enhance the alignment of data references in the loop. */ | |
1445 | ||
1446 | ok = vect_enhance_data_refs_alignment (loop_vinfo); | |
1447 | if (!ok) | |
1448 | { | |
1449 | if (vect_print_dump_info (REPORT_DETAILS)) | |
1450 | fprintf (vect_dump, "bad data alignment."); | |
1451 | destroy_loop_vec_info (loop_vinfo, true); | |
1452 | return NULL; | |
1453 | } | |
1454 | ||
1455 | /* Scan all the operations in the loop and make sure they are | |
1456 | vectorizable. */ | |
1457 | ||
8644a673 | 1458 | ok = vect_analyze_loop_operations (loop_vinfo); |
ebfd146a IR |
1459 | if (!ok) |
1460 | { | |
1461 | if (vect_print_dump_info (REPORT_DETAILS)) | |
1462 | fprintf (vect_dump, "bad operation or unsupported loop bound."); | |
1463 | destroy_loop_vec_info (loop_vinfo, true); | |
1464 | return NULL; | |
1465 | } | |
1466 | ||
1467 | LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1; | |
1468 | ||
1469 | return loop_vinfo; | |
1470 | } | |
1471 | ||
1472 | ||
1473 | /* Function reduction_code_for_scalar_code | |
1474 | ||
1475 | Input: | |
1476 | CODE - tree_code of a reduction operations. | |
1477 | ||
1478 | Output: | |
1479 | REDUC_CODE - the corresponding tree-code to be used to reduce the | |
1480 | vector of partial results into a single scalar result (which | |
06066f92 IR |
1481 | will also reside in a vector) or ERROR_MARK if the operation is |
1482 | a supported reduction operation, but does not have such tree-code. | |
ebfd146a | 1483 | |
06066f92 | 1484 | Return FALSE if CODE currently cannot be vectorized as reduction. */ |
ebfd146a IR |
1485 | |
1486 | static bool | |
1487 | reduction_code_for_scalar_code (enum tree_code code, | |
1488 | enum tree_code *reduc_code) | |
1489 | { | |
1490 | switch (code) | |
06066f92 IR |
1491 | { |
1492 | case MAX_EXPR: | |
1493 | *reduc_code = REDUC_MAX_EXPR; | |
1494 | return true; | |
ebfd146a | 1495 | |
06066f92 IR |
1496 | case MIN_EXPR: |
1497 | *reduc_code = REDUC_MIN_EXPR; | |
1498 | return true; | |
ebfd146a | 1499 | |
06066f92 IR |
1500 | case PLUS_EXPR: |
1501 | *reduc_code = REDUC_PLUS_EXPR; | |
1502 | return true; | |
ebfd146a | 1503 | |
06066f92 IR |
1504 | case MULT_EXPR: |
1505 | case MINUS_EXPR: | |
1506 | case BIT_IOR_EXPR: | |
1507 | case BIT_XOR_EXPR: | |
1508 | case BIT_AND_EXPR: | |
1509 | *reduc_code = ERROR_MARK; | |
1510 | return true; | |
1511 | ||
1512 | default: | |
1513 | return false; | |
1514 | } | |
ebfd146a IR |
1515 | } |
1516 | ||
1517 | ||
1518 | /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement | |
1519 | STMT is printed with a message MSG. */ | |
1520 | ||
1521 | static void | |
1522 | report_vect_op (gimple stmt, const char *msg) | |
1523 | { | |
1524 | fprintf (vect_dump, "%s", msg); | |
1525 | print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); | |
1526 | } | |
1527 | ||
1528 | ||
1529 | /* Function vect_is_simple_reduction | |
1530 | ||
06066f92 | 1531 | (1) Detect a cross-iteration def-use cycle that represents a simple |
ebfd146a IR |
1532 | reduction computation. We look for the following pattern: |
1533 | ||
1534 | loop_header: | |
1535 | a1 = phi < a0, a2 > | |
1536 | a3 = ... | |
1537 | a2 = operation (a3, a1) | |
b8698a0f | 1538 | |
ebfd146a | 1539 | such that: |
b8698a0f | 1540 | 1. operation is commutative and associative and it is safe to |
7c5222ff | 1541 | change the order of the computation (if CHECK_REDUCTION is true) |
ebfd146a IR |
1542 | 2. no uses for a2 in the loop (a2 is used out of the loop) |
1543 | 3. no uses of a1 in the loop besides the reduction operation. | |
1544 | ||
1545 | Condition 1 is tested here. | |
b8698a0f | 1546 | Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized. |
7c5222ff | 1547 | |
b8698a0f L |
1548 | (2) Detect a cross-iteration def-use cycle in nested loops, i.e., |
1549 | nested cycles, if CHECK_REDUCTION is false. | |
06066f92 IR |
1550 | |
1551 | (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double | |
1552 | reductions: | |
1553 | ||
1554 | a1 = phi < a0, a2 > | |
1555 | inner loop (def of a3) | |
b8698a0f | 1556 | a2 = phi < a3 > |
06066f92 | 1557 | */ |
ebfd146a IR |
1558 | |
1559 | gimple | |
b8698a0f | 1560 | vect_is_simple_reduction (loop_vec_info loop_info, gimple phi, |
06066f92 | 1561 | bool check_reduction, bool *double_reduc) |
ebfd146a IR |
1562 | { |
1563 | struct loop *loop = (gimple_bb (phi))->loop_father; | |
1564 | struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info); | |
1565 | edge latch_e = loop_latch_edge (loop); | |
1566 | tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e); | |
4bbe8262 | 1567 | gimple def_stmt, def1 = NULL, def2 = NULL; |
ebfd146a | 1568 | enum tree_code code; |
4bbe8262 | 1569 | tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE; |
ebfd146a IR |
1570 | tree type; |
1571 | int nloop_uses; | |
1572 | tree name; | |
1573 | imm_use_iterator imm_iter; | |
1574 | use_operand_p use_p; | |
06066f92 IR |
1575 | bool phi_def; |
1576 | ||
1577 | *double_reduc = false; | |
ebfd146a | 1578 | |
7c5222ff IR |
1579 | /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization, |
1580 | otherwise, we assume outer loop vectorization. */ | |
b8698a0f | 1581 | gcc_assert ((check_reduction && loop == vect_loop) |
7c5222ff | 1582 | || (!check_reduction && flow_loop_nested_p (vect_loop, loop))); |
ebfd146a IR |
1583 | |
1584 | name = PHI_RESULT (phi); | |
1585 | nloop_uses = 0; | |
1586 | FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name) | |
1587 | { | |
1588 | gimple use_stmt = USE_STMT (use_p); | |
b5b8b0ac AO |
1589 | if (is_gimple_debug (use_stmt)) |
1590 | continue; | |
ebfd146a IR |
1591 | if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)) |
1592 | && vinfo_for_stmt (use_stmt) | |
1593 | && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt))) | |
1594 | nloop_uses++; | |
1595 | if (nloop_uses > 1) | |
1596 | { | |
1597 | if (vect_print_dump_info (REPORT_DETAILS)) | |
1598 | fprintf (vect_dump, "reduction used in loop."); | |
1599 | return NULL; | |
1600 | } | |
1601 | } | |
1602 | ||
1603 | if (TREE_CODE (loop_arg) != SSA_NAME) | |
1604 | { | |
1605 | if (vect_print_dump_info (REPORT_DETAILS)) | |
1606 | { | |
1607 | fprintf (vect_dump, "reduction: not ssa_name: "); | |
1608 | print_generic_expr (vect_dump, loop_arg, TDF_SLIM); | |
1609 | } | |
1610 | return NULL; | |
1611 | } | |
1612 | ||
1613 | def_stmt = SSA_NAME_DEF_STMT (loop_arg); | |
1614 | if (!def_stmt) | |
1615 | { | |
1616 | if (vect_print_dump_info (REPORT_DETAILS)) | |
1617 | fprintf (vect_dump, "reduction: no def_stmt."); | |
1618 | return NULL; | |
1619 | } | |
1620 | ||
06066f92 | 1621 | if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI) |
ebfd146a IR |
1622 | { |
1623 | if (vect_print_dump_info (REPORT_DETAILS)) | |
1624 | print_gimple_stmt (vect_dump, def_stmt, 0, TDF_SLIM); | |
1625 | return NULL; | |
1626 | } | |
1627 | ||
06066f92 IR |
1628 | if (is_gimple_assign (def_stmt)) |
1629 | { | |
1630 | name = gimple_assign_lhs (def_stmt); | |
1631 | phi_def = false; | |
1632 | } | |
1633 | else | |
1634 | { | |
1635 | name = PHI_RESULT (def_stmt); | |
1636 | phi_def = true; | |
1637 | } | |
1638 | ||
ebfd146a IR |
1639 | nloop_uses = 0; |
1640 | FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name) | |
1641 | { | |
1642 | gimple use_stmt = USE_STMT (use_p); | |
b5b8b0ac AO |
1643 | if (is_gimple_debug (use_stmt)) |
1644 | continue; | |
ebfd146a IR |
1645 | if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)) |
1646 | && vinfo_for_stmt (use_stmt) | |
1647 | && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt))) | |
1648 | nloop_uses++; | |
1649 | if (nloop_uses > 1) | |
1650 | { | |
1651 | if (vect_print_dump_info (REPORT_DETAILS)) | |
1652 | fprintf (vect_dump, "reduction used in loop."); | |
1653 | return NULL; | |
1654 | } | |
1655 | } | |
1656 | ||
06066f92 IR |
1657 | /* If DEF_STMT is a phi node itself, we expect it to have a single argument |
1658 | defined in the inner loop. */ | |
1659 | if (phi_def) | |
1660 | { | |
1661 | op1 = PHI_ARG_DEF (def_stmt, 0); | |
1662 | ||
1663 | if (gimple_phi_num_args (def_stmt) != 1 | |
1664 | || TREE_CODE (op1) != SSA_NAME) | |
1665 | { | |
1666 | if (vect_print_dump_info (REPORT_DETAILS)) | |
1667 | fprintf (vect_dump, "unsupported phi node definition."); | |
1668 | ||
1669 | return NULL; | |
1670 | } | |
1671 | ||
b8698a0f L |
1672 | def1 = SSA_NAME_DEF_STMT (op1); |
1673 | if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)) | |
06066f92 IR |
1674 | && loop->inner |
1675 | && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1)) | |
1676 | && is_gimple_assign (def1)) | |
1677 | { | |
1678 | if (vect_print_dump_info (REPORT_DETAILS)) | |
1679 | report_vect_op (def_stmt, "detected double reduction: "); | |
b8698a0f | 1680 | |
06066f92 IR |
1681 | *double_reduc = true; |
1682 | return def_stmt; | |
1683 | } | |
1684 | ||
1685 | return NULL; | |
1686 | } | |
1687 | ||
ebfd146a IR |
1688 | code = gimple_assign_rhs_code (def_stmt); |
1689 | ||
b8698a0f | 1690 | if (check_reduction |
7c5222ff | 1691 | && (!commutative_tree_code (code) || !associative_tree_code (code))) |
ebfd146a IR |
1692 | { |
1693 | if (vect_print_dump_info (REPORT_DETAILS)) | |
1694 | report_vect_op (def_stmt, "reduction: not commutative/associative: "); | |
1695 | return NULL; | |
1696 | } | |
1697 | ||
b8698a0f | 1698 | if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS) |
ebfd146a | 1699 | { |
4bbe8262 IR |
1700 | if (code != COND_EXPR) |
1701 | { | |
1702 | if (vect_print_dump_info (REPORT_DETAILS)) | |
1703 | report_vect_op (def_stmt, "reduction: not binary operation: "); | |
ebfd146a | 1704 | |
4bbe8262 IR |
1705 | return NULL; |
1706 | } | |
1707 | ||
6f4454fc IR |
1708 | op3 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 0); |
1709 | if (COMPARISON_CLASS_P (op3)) | |
1710 | { | |
1711 | op4 = TREE_OPERAND (op3, 1); | |
1712 | op3 = TREE_OPERAND (op3, 0); | |
b8698a0f L |
1713 | } |
1714 | ||
4bbe8262 IR |
1715 | op1 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 1); |
1716 | op2 = TREE_OPERAND (gimple_assign_rhs1 (def_stmt), 2); | |
1717 | ||
1718 | if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME) | |
1719 | { | |
1720 | if (vect_print_dump_info (REPORT_DETAILS)) | |
1721 | report_vect_op (def_stmt, "reduction: uses not ssa_names: "); | |
1722 | ||
1723 | return NULL; | |
1724 | } | |
ebfd146a | 1725 | } |
4bbe8262 IR |
1726 | else |
1727 | { | |
1728 | op1 = gimple_assign_rhs1 (def_stmt); | |
1729 | op2 = gimple_assign_rhs2 (def_stmt); | |
1730 | ||
1731 | if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME) | |
1732 | { | |
1733 | if (vect_print_dump_info (REPORT_DETAILS)) | |
1734 | report_vect_op (def_stmt, "reduction: uses not ssa_names: "); | |
1735 | ||
1736 | return NULL; | |
1737 | } | |
1738 | } | |
ebfd146a | 1739 | |
ebfd146a | 1740 | type = TREE_TYPE (gimple_assign_lhs (def_stmt)); |
4bbe8262 | 1741 | if ((TREE_CODE (op1) == SSA_NAME |
9600efe1 | 1742 | && !types_compatible_p (type,TREE_TYPE (op1))) |
4bbe8262 | 1743 | || (TREE_CODE (op2) == SSA_NAME |
9600efe1 | 1744 | && !types_compatible_p (type, TREE_TYPE (op2))) |
4bbe8262 | 1745 | || (op3 && TREE_CODE (op3) == SSA_NAME |
9600efe1 | 1746 | && !types_compatible_p (type, TREE_TYPE (op3))) |
4bbe8262 | 1747 | || (op4 && TREE_CODE (op4) == SSA_NAME |
9600efe1 | 1748 | && !types_compatible_p (type, TREE_TYPE (op4)))) |
ebfd146a IR |
1749 | { |
1750 | if (vect_print_dump_info (REPORT_DETAILS)) | |
1751 | { | |
1752 | fprintf (vect_dump, "reduction: multiple types: operation type: "); | |
1753 | print_generic_expr (vect_dump, type, TDF_SLIM); | |
1754 | fprintf (vect_dump, ", operands types: "); | |
1755 | print_generic_expr (vect_dump, TREE_TYPE (op1), TDF_SLIM); | |
1756 | fprintf (vect_dump, ","); | |
1757 | print_generic_expr (vect_dump, TREE_TYPE (op2), TDF_SLIM); | |
6f4454fc | 1758 | if (op3) |
4bbe8262 IR |
1759 | { |
1760 | fprintf (vect_dump, ","); | |
1761 | print_generic_expr (vect_dump, TREE_TYPE (op3), TDF_SLIM); | |
6f4454fc IR |
1762 | } |
1763 | ||
1764 | if (op4) | |
1765 | { | |
4bbe8262 IR |
1766 | fprintf (vect_dump, ","); |
1767 | print_generic_expr (vect_dump, TREE_TYPE (op4), TDF_SLIM); | |
1768 | } | |
ebfd146a | 1769 | } |
4bbe8262 | 1770 | |
ebfd146a IR |
1771 | return NULL; |
1772 | } | |
1773 | ||
b8698a0f | 1774 | /* Check that it's ok to change the order of the computation. |
7c5222ff | 1775 | Generally, when vectorizing a reduction we change the order of the |
ebfd146a | 1776 | computation. This may change the behavior of the program in some |
b8698a0f | 1777 | cases, so we need to check that this is ok. One exception is when |
ebfd146a IR |
1778 | vectorizing an outer-loop: the inner-loop is executed sequentially, |
1779 | and therefore vectorizing reductions in the inner-loop during | |
1780 | outer-loop vectorization is safe. */ | |
1781 | ||
1782 | /* CHECKME: check for !flag_finite_math_only too? */ | |
1783 | if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math | |
b8698a0f | 1784 | && check_reduction) |
ebfd146a IR |
1785 | { |
1786 | /* Changing the order of operations changes the semantics. */ | |
1787 | if (vect_print_dump_info (REPORT_DETAILS)) | |
1788 | report_vect_op (def_stmt, "reduction: unsafe fp math optimization: "); | |
1789 | return NULL; | |
1790 | } | |
1791 | else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type) | |
7c5222ff | 1792 | && check_reduction) |
ebfd146a IR |
1793 | { |
1794 | /* Changing the order of operations changes the semantics. */ | |
1795 | if (vect_print_dump_info (REPORT_DETAILS)) | |
1796 | report_vect_op (def_stmt, "reduction: unsafe int math optimization: "); | |
1797 | return NULL; | |
1798 | } | |
7c5222ff | 1799 | else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction) |
ebfd146a IR |
1800 | { |
1801 | /* Changing the order of operations changes the semantics. */ | |
1802 | if (vect_print_dump_info (REPORT_DETAILS)) | |
b8698a0f | 1803 | report_vect_op (def_stmt, |
ebfd146a IR |
1804 | "reduction: unsafe fixed-point math optimization: "); |
1805 | return NULL; | |
1806 | } | |
1807 | ||
7c5222ff | 1808 | /* Reduction is safe. We're dealing with one of the following: |
ebfd146a | 1809 | 1) integer arithmetic and no trapv |
7c5222ff IR |
1810 | 2) floating point arithmetic, and special flags permit this optimization |
1811 | 3) nested cycle (i.e., outer loop vectorization). */ | |
4bbe8262 IR |
1812 | if (TREE_CODE (op1) == SSA_NAME) |
1813 | def1 = SSA_NAME_DEF_STMT (op1); | |
1814 | ||
1815 | if (TREE_CODE (op2) == SSA_NAME) | |
1816 | def2 = SSA_NAME_DEF_STMT (op2); | |
1817 | ||
b8698a0f | 1818 | if (code != COND_EXPR |
4bbe8262 | 1819 | && (!def1 || !def2 || gimple_nop_p (def1) || gimple_nop_p (def2))) |
ebfd146a IR |
1820 | { |
1821 | if (vect_print_dump_info (REPORT_DETAILS)) | |
1822 | report_vect_op (def_stmt, "reduction: no defs for operands: "); | |
1823 | return NULL; | |
1824 | } | |
1825 | ||
ebfd146a | 1826 | /* Check that one def is the reduction def, defined by PHI, |
8644a673 | 1827 | the other def is either defined in the loop ("vect_internal_def"), |
ebfd146a IR |
1828 | or it's an induction (defined by a loop-header phi-node). */ |
1829 | ||
4bbe8262 IR |
1830 | if (def2 && def2 == phi |
1831 | && (code == COND_EXPR | |
1832 | || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1)) | |
1833 | && (is_gimple_assign (def1) | |
b8698a0f | 1834 | || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1)) |
4bbe8262 IR |
1835 | == vect_induction_def |
1836 | || (gimple_code (def1) == GIMPLE_PHI | |
b8698a0f | 1837 | && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1)) |
4bbe8262 IR |
1838 | == vect_internal_def |
1839 | && !is_loop_header_bb_p (gimple_bb (def1))))))) | |
ebfd146a IR |
1840 | { |
1841 | if (vect_print_dump_info (REPORT_DETAILS)) | |
7c5222ff | 1842 | report_vect_op (def_stmt, "detected reduction: "); |
ebfd146a IR |
1843 | return def_stmt; |
1844 | } | |
4bbe8262 IR |
1845 | else if (def1 && def1 == phi |
1846 | && (code == COND_EXPR | |
1847 | || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2)) | |
1848 | && (is_gimple_assign (def2) | |
1849 | || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2)) | |
1850 | == vect_induction_def | |
1851 | || (gimple_code (def2) == GIMPLE_PHI | |
b8698a0f | 1852 | && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2)) |
4bbe8262 IR |
1853 | == vect_internal_def |
1854 | && !is_loop_header_bb_p (gimple_bb (def2))))))) | |
ebfd146a | 1855 | { |
7c5222ff IR |
1856 | if (check_reduction) |
1857 | { | |
1858 | /* Swap operands (just for simplicity - so that the rest of the code | |
1859 | can assume that the reduction variable is always the last (second) | |
1860 | argument). */ | |
1861 | if (vect_print_dump_info (REPORT_DETAILS)) | |
1862 | report_vect_op (def_stmt, | |
1863 | "detected reduction: need to swap operands: "); | |
1864 | ||
1865 | swap_tree_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt), | |
1866 | gimple_assign_rhs2_ptr (def_stmt)); | |
1867 | } | |
1868 | else | |
1869 | { | |
1870 | if (vect_print_dump_info (REPORT_DETAILS)) | |
1871 | report_vect_op (def_stmt, "detected reduction: "); | |
1872 | } | |
1873 | ||
ebfd146a IR |
1874 | return def_stmt; |
1875 | } | |
1876 | else | |
1877 | { | |
1878 | if (vect_print_dump_info (REPORT_DETAILS)) | |
7c5222ff IR |
1879 | report_vect_op (def_stmt, "reduction: unknown pattern: "); |
1880 | ||
ebfd146a IR |
1881 | return NULL; |
1882 | } | |
1883 | } | |
1884 | ||
1885 | ||
1886 | /* Function vect_estimate_min_profitable_iters | |
1887 | ||
1888 | Return the number of iterations required for the vector version of the | |
1889 | loop to be profitable relative to the cost of the scalar version of the | |
1890 | loop. | |
1891 | ||
1892 | TODO: Take profile info into account before making vectorization | |
1893 | decisions, if available. */ | |
1894 | ||
1895 | int | |
1896 | vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo) | |
1897 | { | |
1898 | int i; | |
1899 | int min_profitable_iters; | |
1900 | int peel_iters_prologue; | |
1901 | int peel_iters_epilogue; | |
1902 | int vec_inside_cost = 0; | |
1903 | int vec_outside_cost = 0; | |
1904 | int scalar_single_iter_cost = 0; | |
1905 | int scalar_outside_cost = 0; | |
1906 | int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); | |
1907 | struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
1908 | basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); | |
1909 | int nbbs = loop->num_nodes; | |
1910 | int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo); | |
1911 | int peel_guard_costs = 0; | |
1912 | int innerloop_iters = 0, factor; | |
1913 | VEC (slp_instance, heap) *slp_instances; | |
1914 | slp_instance instance; | |
1915 | ||
1916 | /* Cost model disabled. */ | |
1917 | if (!flag_vect_cost_model) | |
1918 | { | |
1919 | if (vect_print_dump_info (REPORT_COST)) | |
b8698a0f | 1920 | fprintf (vect_dump, "cost model disabled."); |
ebfd146a IR |
1921 | return 0; |
1922 | } | |
1923 | ||
1924 | /* Requires loop versioning tests to handle misalignment. */ | |
e9dbe7bb | 1925 | if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)) |
ebfd146a IR |
1926 | { |
1927 | /* FIXME: Make cost depend on complexity of individual check. */ | |
1928 | vec_outside_cost += | |
1929 | VEC_length (gimple, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)); | |
1930 | if (vect_print_dump_info (REPORT_COST)) | |
1931 | fprintf (vect_dump, "cost model: Adding cost of checks for loop " | |
1932 | "versioning to treat misalignment.\n"); | |
1933 | } | |
1934 | ||
e9dbe7bb IR |
1935 | /* Requires loop versioning with alias checks. */ |
1936 | if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)) | |
ebfd146a IR |
1937 | { |
1938 | /* FIXME: Make cost depend on complexity of individual check. */ | |
1939 | vec_outside_cost += | |
1940 | VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)); | |
1941 | if (vect_print_dump_info (REPORT_COST)) | |
1942 | fprintf (vect_dump, "cost model: Adding cost of checks for loop " | |
1943 | "versioning aliasing.\n"); | |
1944 | } | |
1945 | ||
e9dbe7bb IR |
1946 | if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo) |
1947 | || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)) | |
1948 | vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST; | |
ebfd146a IR |
1949 | |
1950 | /* Count statements in scalar loop. Using this as scalar cost for a single | |
1951 | iteration for now. | |
1952 | ||
1953 | TODO: Add outer loop support. | |
1954 | ||
1955 | TODO: Consider assigning different costs to different scalar | |
1956 | statements. */ | |
1957 | ||
1958 | /* FORNOW. */ | |
1959 | if (loop->inner) | |
1960 | innerloop_iters = 50; /* FIXME */ | |
1961 | ||
1962 | for (i = 0; i < nbbs; i++) | |
1963 | { | |
1964 | gimple_stmt_iterator si; | |
1965 | basic_block bb = bbs[i]; | |
1966 | ||
1967 | if (bb->loop_father == loop->inner) | |
1968 | factor = innerloop_iters; | |
1969 | else | |
1970 | factor = 1; | |
1971 | ||
1972 | for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si)) | |
1973 | { | |
1974 | gimple stmt = gsi_stmt (si); | |
1975 | stmt_vec_info stmt_info = vinfo_for_stmt (stmt); | |
1976 | /* Skip stmts that are not vectorized inside the loop. */ | |
1977 | if (!STMT_VINFO_RELEVANT_P (stmt_info) | |
1978 | && (!STMT_VINFO_LIVE_P (stmt_info) | |
1979 | || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def)) | |
1980 | continue; | |
1981 | scalar_single_iter_cost += cost_for_stmt (stmt) * factor; | |
1982 | vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor; | |
1983 | /* FIXME: for stmts in the inner-loop in outer-loop vectorization, | |
1984 | some of the "outside" costs are generated inside the outer-loop. */ | |
1985 | vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info); | |
1986 | } | |
1987 | } | |
1988 | ||
1989 | /* Add additional cost for the peeled instructions in prologue and epilogue | |
1990 | loop. | |
1991 | ||
1992 | FORNOW: If we don't know the value of peel_iters for prologue or epilogue | |
1993 | at compile-time - we assume it's vf/2 (the worst would be vf-1). | |
1994 | ||
1995 | TODO: Build an expression that represents peel_iters for prologue and | |
1996 | epilogue to be used in a run-time test. */ | |
1997 | ||
1998 | if (byte_misalign < 0) | |
1999 | { | |
2000 | peel_iters_prologue = vf/2; | |
2001 | if (vect_print_dump_info (REPORT_COST)) | |
2002 | fprintf (vect_dump, "cost model: " | |
2003 | "prologue peel iters set to vf/2."); | |
2004 | ||
2005 | /* If peeling for alignment is unknown, loop bound of main loop becomes | |
2006 | unknown. */ | |
2007 | peel_iters_epilogue = vf/2; | |
2008 | if (vect_print_dump_info (REPORT_COST)) | |
2009 | fprintf (vect_dump, "cost model: " | |
2010 | "epilogue peel iters set to vf/2 because " | |
2011 | "peeling for alignment is unknown ."); | |
2012 | ||
2013 | /* If peeled iterations are unknown, count a taken branch and a not taken | |
2014 | branch per peeled loop. Even if scalar loop iterations are known, | |
2015 | vector iterations are not known since peeled prologue iterations are | |
2016 | not known. Hence guards remain the same. */ | |
2017 | peel_guard_costs += 2 * (TARG_COND_TAKEN_BRANCH_COST | |
2018 | + TARG_COND_NOT_TAKEN_BRANCH_COST); | |
2019 | } | |
b8698a0f | 2020 | else |
ebfd146a IR |
2021 | { |
2022 | if (byte_misalign) | |
2023 | { | |
2024 | struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo); | |
2025 | int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr)))); | |
2026 | tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr))); | |
2027 | int nelements = TYPE_VECTOR_SUBPARTS (vectype); | |
2028 | ||
2029 | peel_iters_prologue = nelements - (byte_misalign / element_size); | |
2030 | } | |
2031 | else | |
2032 | peel_iters_prologue = 0; | |
2033 | ||
2034 | if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)) | |
2035 | { | |
2036 | peel_iters_epilogue = vf/2; | |
2037 | if (vect_print_dump_info (REPORT_COST)) | |
2038 | fprintf (vect_dump, "cost model: " | |
2039 | "epilogue peel iters set to vf/2 because " | |
2040 | "loop iterations are unknown ."); | |
2041 | ||
2042 | /* If peeled iterations are known but number of scalar loop | |
2043 | iterations are unknown, count a taken branch per peeled loop. */ | |
2044 | peel_guard_costs += 2 * TARG_COND_TAKEN_BRANCH_COST; | |
2045 | ||
2046 | } | |
b8698a0f | 2047 | else |
ebfd146a IR |
2048 | { |
2049 | int niters = LOOP_VINFO_INT_NITERS (loop_vinfo); | |
b8698a0f | 2050 | peel_iters_prologue = niters < peel_iters_prologue ? |
ebfd146a IR |
2051 | niters : peel_iters_prologue; |
2052 | peel_iters_epilogue = (niters - peel_iters_prologue) % vf; | |
2053 | } | |
2054 | } | |
2055 | ||
2056 | vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost) | |
2057 | + (peel_iters_epilogue * scalar_single_iter_cost) | |
2058 | + peel_guard_costs; | |
2059 | ||
2060 | /* FORNOW: The scalar outside cost is incremented in one of the | |
2061 | following ways: | |
2062 | ||
2063 | 1. The vectorizer checks for alignment and aliasing and generates | |
2064 | a condition that allows dynamic vectorization. A cost model | |
2065 | check is ANDED with the versioning condition. Hence scalar code | |
2066 | path now has the added cost of the versioning check. | |
2067 | ||
2068 | if (cost > th & versioning_check) | |
2069 | jmp to vector code | |
2070 | ||
2071 | Hence run-time scalar is incremented by not-taken branch cost. | |
2072 | ||
2073 | 2. The vectorizer then checks if a prologue is required. If the | |
2074 | cost model check was not done before during versioning, it has to | |
2075 | be done before the prologue check. | |
2076 | ||
2077 | if (cost <= th) | |
2078 | prologue = scalar_iters | |
2079 | if (prologue == 0) | |
2080 | jmp to vector code | |
2081 | else | |
2082 | execute prologue | |
2083 | if (prologue == num_iters) | |
2084 | go to exit | |
2085 | ||
2086 | Hence the run-time scalar cost is incremented by a taken branch, | |
2087 | plus a not-taken branch, plus a taken branch cost. | |
2088 | ||
2089 | 3. The vectorizer then checks if an epilogue is required. If the | |
2090 | cost model check was not done before during prologue check, it | |
2091 | has to be done with the epilogue check. | |
2092 | ||
2093 | if (prologue == 0) | |
2094 | jmp to vector code | |
2095 | else | |
2096 | execute prologue | |
2097 | if (prologue == num_iters) | |
2098 | go to exit | |
2099 | vector code: | |
2100 | if ((cost <= th) | (scalar_iters-prologue-epilogue == 0)) | |
2101 | jmp to epilogue | |
2102 | ||
2103 | Hence the run-time scalar cost should be incremented by 2 taken | |
2104 | branches. | |
2105 | ||
2106 | TODO: The back end may reorder the BBS's differently and reverse | |
2107 | conditions/branch directions. Change the estimates below to | |
2108 | something more reasonable. */ | |
2109 | ||
2110 | /* If the number of iterations is known and we do not do versioning, we can | |
2111 | decide whether to vectorize at compile time. Hence the scalar version | |
2112 | do not carry cost model guard costs. */ | |
2113 | if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) | |
e9dbe7bb IR |
2114 | || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo) |
2115 | || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)) | |
ebfd146a IR |
2116 | { |
2117 | /* Cost model check occurs at versioning. */ | |
e9dbe7bb IR |
2118 | if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo) |
2119 | || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)) | |
ebfd146a IR |
2120 | scalar_outside_cost += TARG_COND_NOT_TAKEN_BRANCH_COST; |
2121 | else | |
2122 | { | |
2123 | /* Cost model check occurs at prologue generation. */ | |
2124 | if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0) | |
2125 | scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST | |
2126 | + TARG_COND_NOT_TAKEN_BRANCH_COST; | |
2127 | /* Cost model check occurs at epilogue generation. */ | |
2128 | else | |
2129 | scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST; | |
2130 | } | |
2131 | } | |
2132 | ||
2133 | /* Add SLP costs. */ | |
2134 | slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo); | |
2135 | for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++) | |
2136 | { | |
2137 | vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance); | |
2138 | vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance); | |
2139 | } | |
2140 | ||
b8698a0f | 2141 | /* Calculate number of iterations required to make the vector version |
ebfd146a | 2142 | profitable, relative to the loop bodies only. The following condition |
b8698a0f | 2143 | must hold true: |
ebfd146a IR |
2144 | SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC |
2145 | where | |
2146 | SIC = scalar iteration cost, VIC = vector iteration cost, | |
2147 | VOC = vector outside cost, VF = vectorization factor, | |
2148 | PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations | |
2149 | SOC = scalar outside cost for run time cost model check. */ | |
2150 | ||
2151 | if ((scalar_single_iter_cost * vf) > vec_inside_cost) | |
2152 | { | |
2153 | if (vec_outside_cost <= 0) | |
2154 | min_profitable_iters = 1; | |
2155 | else | |
2156 | { | |
2157 | min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf | |
2158 | - vec_inside_cost * peel_iters_prologue | |
2159 | - vec_inside_cost * peel_iters_epilogue) | |
2160 | / ((scalar_single_iter_cost * vf) | |
2161 | - vec_inside_cost); | |
2162 | ||
2163 | if ((scalar_single_iter_cost * vf * min_profitable_iters) | |
2164 | <= ((vec_inside_cost * min_profitable_iters) | |
2165 | + ((vec_outside_cost - scalar_outside_cost) * vf))) | |
2166 | min_profitable_iters++; | |
2167 | } | |
2168 | } | |
2169 | /* vector version will never be profitable. */ | |
2170 | else | |
2171 | { | |
2172 | if (vect_print_dump_info (REPORT_COST)) | |
2173 | fprintf (vect_dump, "cost model: vector iteration cost = %d " | |
2174 | "is divisible by scalar iteration cost = %d by a factor " | |
2175 | "greater than or equal to the vectorization factor = %d .", | |
2176 | vec_inside_cost, scalar_single_iter_cost, vf); | |
2177 | return -1; | |
2178 | } | |
2179 | ||
2180 | if (vect_print_dump_info (REPORT_COST)) | |
2181 | { | |
2182 | fprintf (vect_dump, "Cost model analysis: \n"); | |
2183 | fprintf (vect_dump, " Vector inside of loop cost: %d\n", | |
2184 | vec_inside_cost); | |
2185 | fprintf (vect_dump, " Vector outside of loop cost: %d\n", | |
2186 | vec_outside_cost); | |
2187 | fprintf (vect_dump, " Scalar iteration cost: %d\n", | |
2188 | scalar_single_iter_cost); | |
2189 | fprintf (vect_dump, " Scalar outside cost: %d\n", scalar_outside_cost); | |
2190 | fprintf (vect_dump, " prologue iterations: %d\n", | |
2191 | peel_iters_prologue); | |
2192 | fprintf (vect_dump, " epilogue iterations: %d\n", | |
2193 | peel_iters_epilogue); | |
2194 | fprintf (vect_dump, " Calculated minimum iters for profitability: %d\n", | |
2195 | min_profitable_iters); | |
2196 | } | |
2197 | ||
b8698a0f | 2198 | min_profitable_iters = |
ebfd146a IR |
2199 | min_profitable_iters < vf ? vf : min_profitable_iters; |
2200 | ||
2201 | /* Because the condition we create is: | |
2202 | if (niters <= min_profitable_iters) | |
2203 | then skip the vectorized loop. */ | |
2204 | min_profitable_iters--; | |
2205 | ||
2206 | if (vect_print_dump_info (REPORT_COST)) | |
2207 | fprintf (vect_dump, " Profitability threshold = %d\n", | |
2208 | min_profitable_iters); | |
b8698a0f | 2209 | |
ebfd146a IR |
2210 | return min_profitable_iters; |
2211 | } | |
2212 | ||
2213 | ||
b8698a0f | 2214 | /* TODO: Close dependency between vect_model_*_cost and vectorizable_* |
ebfd146a | 2215 | functions. Design better to avoid maintenance issues. */ |
ebfd146a | 2216 | |
b8698a0f L |
2217 | /* Function vect_model_reduction_cost. |
2218 | ||
2219 | Models cost for a reduction operation, including the vector ops | |
ebfd146a IR |
2220 | generated within the strip-mine loop, the initial definition before |
2221 | the loop, and the epilogue code that must be generated. */ | |
2222 | ||
b8698a0f | 2223 | static bool |
ebfd146a IR |
2224 | vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code, |
2225 | int ncopies) | |
2226 | { | |
2227 | int outer_cost = 0; | |
2228 | enum tree_code code; | |
2229 | optab optab; | |
2230 | tree vectype; | |
2231 | gimple stmt, orig_stmt; | |
2232 | tree reduction_op; | |
2233 | enum machine_mode mode; | |
2234 | loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); | |
2235 | struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
2236 | ||
2237 | ||
2238 | /* Cost of reduction op inside loop. */ | |
2239 | STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) += ncopies * TARG_VEC_STMT_COST; | |
2240 | ||
2241 | stmt = STMT_VINFO_STMT (stmt_info); | |
2242 | ||
2243 | switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt))) | |
2244 | { | |
2245 | case GIMPLE_SINGLE_RHS: | |
2246 | gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op); | |
2247 | reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2); | |
2248 | break; | |
2249 | case GIMPLE_UNARY_RHS: | |
2250 | reduction_op = gimple_assign_rhs1 (stmt); | |
2251 | break; | |
2252 | case GIMPLE_BINARY_RHS: | |
2253 | reduction_op = gimple_assign_rhs2 (stmt); | |
2254 | break; | |
2255 | default: | |
2256 | gcc_unreachable (); | |
2257 | } | |
2258 | ||
2259 | vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op)); | |
2260 | if (!vectype) | |
2261 | { | |
2262 | if (vect_print_dump_info (REPORT_COST)) | |
2263 | { | |
2264 | fprintf (vect_dump, "unsupported data-type "); | |
2265 | print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM); | |
2266 | } | |
2267 | return false; | |
2268 | } | |
b8698a0f | 2269 | |
ebfd146a IR |
2270 | mode = TYPE_MODE (vectype); |
2271 | orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info); | |
2272 | ||
b8698a0f | 2273 | if (!orig_stmt) |
ebfd146a IR |
2274 | orig_stmt = STMT_VINFO_STMT (stmt_info); |
2275 | ||
2276 | code = gimple_assign_rhs_code (orig_stmt); | |
2277 | ||
2278 | /* Add in cost for initial definition. */ | |
2279 | outer_cost += TARG_SCALAR_TO_VEC_COST; | |
2280 | ||
2281 | /* Determine cost of epilogue code. | |
2282 | ||
2283 | We have a reduction operator that will reduce the vector in one statement. | |
2284 | Also requires scalar extract. */ | |
2285 | ||
2286 | if (!nested_in_vect_loop_p (loop, orig_stmt)) | |
2287 | { | |
32e8bb8e | 2288 | if (reduc_code != ERROR_MARK) |
ebfd146a | 2289 | outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST; |
b8698a0f | 2290 | else |
ebfd146a IR |
2291 | { |
2292 | int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1); | |
2293 | tree bitsize = | |
2294 | TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt))); | |
2295 | int element_bitsize = tree_low_cst (bitsize, 1); | |
2296 | int nelements = vec_size_in_bits / element_bitsize; | |
2297 | ||
2298 | optab = optab_for_tree_code (code, vectype, optab_default); | |
2299 | ||
2300 | /* We have a whole vector shift available. */ | |
2301 | if (VECTOR_MODE_P (mode) | |
2302 | && optab_handler (optab, mode)->insn_code != CODE_FOR_nothing | |
2303 | && optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing) | |
2304 | /* Final reduction via vector shifts and the reduction operator. Also | |
2305 | requires scalar extract. */ | |
2306 | outer_cost += ((exact_log2(nelements) * 2) * TARG_VEC_STMT_COST | |
b8698a0f | 2307 | + TARG_VEC_TO_SCALAR_COST); |
ebfd146a IR |
2308 | else |
2309 | /* Use extracts and reduction op for final reduction. For N elements, | |
2310 | we have N extracts and N-1 reduction ops. */ | |
2311 | outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST); | |
2312 | } | |
2313 | } | |
2314 | ||
2315 | STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost; | |
2316 | ||
2317 | if (vect_print_dump_info (REPORT_COST)) | |
2318 | fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, " | |
2319 | "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info), | |
2320 | STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info)); | |
2321 | ||
2322 | return true; | |
2323 | } | |
2324 | ||
2325 | ||
2326 | /* Function vect_model_induction_cost. | |
2327 | ||
2328 | Models cost for induction operations. */ | |
2329 | ||
2330 | static void | |
2331 | vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies) | |
2332 | { | |
2333 | /* loop cost for vec_loop. */ | |
2334 | STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST; | |
2335 | /* prologue cost for vec_init and vec_step. */ | |
2336 | STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST; | |
b8698a0f | 2337 | |
ebfd146a IR |
2338 | if (vect_print_dump_info (REPORT_COST)) |
2339 | fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, " | |
2340 | "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info), | |
2341 | STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info)); | |
2342 | } | |
2343 | ||
2344 | ||
2345 | /* Function get_initial_def_for_induction | |
2346 | ||
2347 | Input: | |
2348 | STMT - a stmt that performs an induction operation in the loop. | |
2349 | IV_PHI - the initial value of the induction variable | |
2350 | ||
2351 | Output: | |
2352 | Return a vector variable, initialized with the first VF values of | |
2353 | the induction variable. E.g., for an iv with IV_PHI='X' and | |
b8698a0f | 2354 | evolution S, for a vector of 4 units, we want to return: |
ebfd146a IR |
2355 | [X, X + S, X + 2*S, X + 3*S]. */ |
2356 | ||
2357 | static tree | |
2358 | get_initial_def_for_induction (gimple iv_phi) | |
2359 | { | |
2360 | stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi); | |
2361 | loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo); | |
2362 | struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
2363 | tree scalar_type = TREE_TYPE (gimple_phi_result (iv_phi)); | |
b8698a0f | 2364 | tree vectype; |
ebfd146a IR |
2365 | int nunits; |
2366 | edge pe = loop_preheader_edge (loop); | |
2367 | struct loop *iv_loop; | |
2368 | basic_block new_bb; | |
2369 | tree vec, vec_init, vec_step, t; | |
2370 | tree access_fn; | |
2371 | tree new_var; | |
2372 | tree new_name; | |
2373 | gimple init_stmt, induction_phi, new_stmt; | |
2374 | tree induc_def, vec_def, vec_dest; | |
2375 | tree init_expr, step_expr; | |
2376 | int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); | |
2377 | int i; | |
2378 | bool ok; | |
2379 | int ncopies; | |
2380 | tree expr; | |
2381 | stmt_vec_info phi_info = vinfo_for_stmt (iv_phi); | |
2382 | bool nested_in_vect_loop = false; | |
2383 | gimple_seq stmts = NULL; | |
2384 | imm_use_iterator imm_iter; | |
2385 | use_operand_p use_p; | |
2386 | gimple exit_phi; | |
2387 | edge latch_e; | |
2388 | tree loop_arg; | |
2389 | gimple_stmt_iterator si; | |
2390 | basic_block bb = gimple_bb (iv_phi); | |
795bd26a | 2391 | tree stepvectype; |
ebfd146a IR |
2392 | |
2393 | vectype = get_vectype_for_scalar_type (scalar_type); | |
2394 | gcc_assert (vectype); | |
2395 | nunits = TYPE_VECTOR_SUBPARTS (vectype); | |
2396 | ncopies = vf / nunits; | |
2397 | ||
2398 | gcc_assert (phi_info); | |
2399 | gcc_assert (ncopies >= 1); | |
2400 | ||
2401 | /* Find the first insertion point in the BB. */ | |
2402 | si = gsi_after_labels (bb); | |
2403 | ||
795bd26a | 2404 | if (INTEGRAL_TYPE_P (scalar_type)) |
ebfd146a | 2405 | step_expr = build_int_cst (scalar_type, 0); |
795bd26a RG |
2406 | else if (POINTER_TYPE_P (scalar_type)) |
2407 | step_expr = build_int_cst (sizetype, 0); | |
ebfd146a IR |
2408 | else |
2409 | step_expr = build_real (scalar_type, dconst0); | |
2410 | ||
2411 | /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */ | |
2412 | if (nested_in_vect_loop_p (loop, iv_phi)) | |
2413 | { | |
2414 | nested_in_vect_loop = true; | |
2415 | iv_loop = loop->inner; | |
2416 | } | |
2417 | else | |
2418 | iv_loop = loop; | |
2419 | gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father); | |
2420 | ||
2421 | latch_e = loop_latch_edge (iv_loop); | |
2422 | loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e); | |
2423 | ||
2424 | access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi)); | |
2425 | gcc_assert (access_fn); | |
2426 | ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn, | |
06066f92 | 2427 | &init_expr, &step_expr); |
ebfd146a IR |
2428 | gcc_assert (ok); |
2429 | pe = loop_preheader_edge (iv_loop); | |
2430 | ||
2431 | /* Create the vector that holds the initial_value of the induction. */ | |
2432 | if (nested_in_vect_loop) | |
2433 | { | |
2434 | /* iv_loop is nested in the loop to be vectorized. init_expr had already | |
2435 | been created during vectorization of previous stmts; We obtain it from | |
2436 | the STMT_VINFO_VEC_STMT of the defining stmt. */ | |
b8698a0f | 2437 | tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi, |
06066f92 | 2438 | loop_preheader_edge (iv_loop)); |
ebfd146a IR |
2439 | vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL); |
2440 | } | |
2441 | else | |
2442 | { | |
2443 | /* iv_loop is the loop to be vectorized. Create: | |
2444 | vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */ | |
2445 | new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_"); | |
2446 | add_referenced_var (new_var); | |
2447 | ||
2448 | new_name = force_gimple_operand (init_expr, &stmts, false, new_var); | |
2449 | if (stmts) | |
2450 | { | |
2451 | new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts); | |
2452 | gcc_assert (!new_bb); | |
2453 | } | |
2454 | ||
2455 | t = NULL_TREE; | |
2456 | t = tree_cons (NULL_TREE, init_expr, t); | |
2457 | for (i = 1; i < nunits; i++) | |
2458 | { | |
2459 | /* Create: new_name_i = new_name + step_expr */ | |
2460 | enum tree_code code = POINTER_TYPE_P (scalar_type) | |
2461 | ? POINTER_PLUS_EXPR : PLUS_EXPR; | |
2462 | init_stmt = gimple_build_assign_with_ops (code, new_var, | |
2463 | new_name, step_expr); | |
2464 | new_name = make_ssa_name (new_var, init_stmt); | |
2465 | gimple_assign_set_lhs (init_stmt, new_name); | |
2466 | ||
2467 | new_bb = gsi_insert_on_edge_immediate (pe, init_stmt); | |
2468 | gcc_assert (!new_bb); | |
2469 | ||
2470 | if (vect_print_dump_info (REPORT_DETAILS)) | |
2471 | { | |
2472 | fprintf (vect_dump, "created new init_stmt: "); | |
2473 | print_gimple_stmt (vect_dump, init_stmt, 0, TDF_SLIM); | |
2474 | } | |
2475 | t = tree_cons (NULL_TREE, new_name, t); | |
2476 | } | |
2477 | /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */ | |
2478 | vec = build_constructor_from_list (vectype, nreverse (t)); | |
2479 | vec_init = vect_init_vector (iv_phi, vec, vectype, NULL); | |
2480 | } | |
2481 | ||
2482 | ||
2483 | /* Create the vector that holds the step of the induction. */ | |
2484 | if (nested_in_vect_loop) | |
2485 | /* iv_loop is nested in the loop to be vectorized. Generate: | |
2486 | vec_step = [S, S, S, S] */ | |
2487 | new_name = step_expr; | |
2488 | else | |
2489 | { | |
2490 | /* iv_loop is the loop to be vectorized. Generate: | |
2491 | vec_step = [VF*S, VF*S, VF*S, VF*S] */ | |
795bd26a RG |
2492 | expr = build_int_cst (TREE_TYPE (step_expr), vf); |
2493 | new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr), | |
2494 | expr, step_expr); | |
ebfd146a IR |
2495 | } |
2496 | ||
2497 | t = NULL_TREE; | |
2498 | for (i = 0; i < nunits; i++) | |
2499 | t = tree_cons (NULL_TREE, unshare_expr (new_name), t); | |
2500 | gcc_assert (CONSTANT_CLASS_P (new_name)); | |
795bd26a RG |
2501 | stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name)); |
2502 | gcc_assert (stepvectype); | |
2503 | vec = build_vector (stepvectype, t); | |
2504 | vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL); | |
ebfd146a IR |
2505 | |
2506 | ||
2507 | /* Create the following def-use cycle: | |
2508 | loop prolog: | |
2509 | vec_init = ... | |
2510 | vec_step = ... | |
2511 | loop: | |
2512 | vec_iv = PHI <vec_init, vec_loop> | |
2513 | ... | |
2514 | STMT | |
2515 | ... | |
2516 | vec_loop = vec_iv + vec_step; */ | |
2517 | ||
2518 | /* Create the induction-phi that defines the induction-operand. */ | |
2519 | vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"); | |
2520 | add_referenced_var (vec_dest); | |
2521 | induction_phi = create_phi_node (vec_dest, iv_loop->header); | |
2522 | set_vinfo_for_stmt (induction_phi, | |
a70d6342 | 2523 | new_stmt_vec_info (induction_phi, loop_vinfo, NULL)); |
ebfd146a IR |
2524 | induc_def = PHI_RESULT (induction_phi); |
2525 | ||
2526 | /* Create the iv update inside the loop */ | |
2527 | new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest, | |
2528 | induc_def, vec_step); | |
2529 | vec_def = make_ssa_name (vec_dest, new_stmt); | |
2530 | gimple_assign_set_lhs (new_stmt, vec_def); | |
2531 | gsi_insert_before (&si, new_stmt, GSI_SAME_STMT); | |
b8698a0f | 2532 | set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo, |
a70d6342 | 2533 | NULL)); |
ebfd146a IR |
2534 | |
2535 | /* Set the arguments of the phi node: */ | |
f5045c96 | 2536 | add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION); |
b8698a0f | 2537 | add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop), |
f5045c96 | 2538 | UNKNOWN_LOCATION); |
ebfd146a IR |
2539 | |
2540 | ||
2541 | /* In case that vectorization factor (VF) is bigger than the number | |
2542 | of elements that we can fit in a vectype (nunits), we have to generate | |
2543 | more than one vector stmt - i.e - we need to "unroll" the | |
2544 | vector stmt by a factor VF/nunits. For more details see documentation | |
2545 | in vectorizable_operation. */ | |
b8698a0f | 2546 | |
ebfd146a IR |
2547 | if (ncopies > 1) |
2548 | { | |
2549 | stmt_vec_info prev_stmt_vinfo; | |
2550 | /* FORNOW. This restriction should be relaxed. */ | |
2551 | gcc_assert (!nested_in_vect_loop); | |
2552 | ||
2553 | /* Create the vector that holds the step of the induction. */ | |
795bd26a RG |
2554 | expr = build_int_cst (TREE_TYPE (step_expr), nunits); |
2555 | new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr), | |
2556 | expr, step_expr); | |
ebfd146a IR |
2557 | t = NULL_TREE; |
2558 | for (i = 0; i < nunits; i++) | |
2559 | t = tree_cons (NULL_TREE, unshare_expr (new_name), t); | |
2560 | gcc_assert (CONSTANT_CLASS_P (new_name)); | |
795bd26a RG |
2561 | vec = build_vector (stepvectype, t); |
2562 | vec_step = vect_init_vector (iv_phi, vec, stepvectype, NULL); | |
ebfd146a IR |
2563 | |
2564 | vec_def = induc_def; | |
2565 | prev_stmt_vinfo = vinfo_for_stmt (induction_phi); | |
2566 | for (i = 1; i < ncopies; i++) | |
2567 | { | |
2568 | /* vec_i = vec_prev + vec_step */ | |
2569 | new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest, | |
2570 | vec_def, vec_step); | |
2571 | vec_def = make_ssa_name (vec_dest, new_stmt); | |
2572 | gimple_assign_set_lhs (new_stmt, vec_def); | |
2573 | ||
2574 | gsi_insert_before (&si, new_stmt, GSI_SAME_STMT); | |
2575 | set_vinfo_for_stmt (new_stmt, | |
a70d6342 | 2576 | new_stmt_vec_info (new_stmt, loop_vinfo, NULL)); |
ebfd146a | 2577 | STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt; |
b8698a0f | 2578 | prev_stmt_vinfo = vinfo_for_stmt (new_stmt); |
ebfd146a IR |
2579 | } |
2580 | } | |
2581 | ||
2582 | if (nested_in_vect_loop) | |
2583 | { | |
2584 | /* Find the loop-closed exit-phi of the induction, and record | |
2585 | the final vector of induction results: */ | |
2586 | exit_phi = NULL; | |
2587 | FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg) | |
2588 | { | |
2589 | if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (USE_STMT (use_p)))) | |
2590 | { | |
2591 | exit_phi = USE_STMT (use_p); | |
2592 | break; | |
2593 | } | |
2594 | } | |
b8698a0f | 2595 | if (exit_phi) |
ebfd146a IR |
2596 | { |
2597 | stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi); | |
2598 | /* FORNOW. Currently not supporting the case that an inner-loop induction | |
2599 | is not used in the outer-loop (i.e. only outside the outer-loop). */ | |
2600 | gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo) | |
2601 | && !STMT_VINFO_LIVE_P (stmt_vinfo)); | |
2602 | ||
2603 | STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt; | |
2604 | if (vect_print_dump_info (REPORT_DETAILS)) | |
2605 | { | |
2606 | fprintf (vect_dump, "vector of inductions after inner-loop:"); | |
2607 | print_gimple_stmt (vect_dump, new_stmt, 0, TDF_SLIM); | |
2608 | } | |
2609 | } | |
2610 | } | |
2611 | ||
2612 | ||
2613 | if (vect_print_dump_info (REPORT_DETAILS)) | |
2614 | { | |
2615 | fprintf (vect_dump, "transform induction: created def-use cycle: "); | |
2616 | print_gimple_stmt (vect_dump, induction_phi, 0, TDF_SLIM); | |
2617 | fprintf (vect_dump, "\n"); | |
2618 | print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (vec_def), 0, TDF_SLIM); | |
2619 | } | |
2620 | ||
2621 | STMT_VINFO_VEC_STMT (phi_info) = induction_phi; | |
2622 | return induc_def; | |
2623 | } | |
2624 | ||
2625 | ||
2626 | /* Function get_initial_def_for_reduction | |
2627 | ||
2628 | Input: | |
2629 | STMT - a stmt that performs a reduction operation in the loop. | |
2630 | INIT_VAL - the initial value of the reduction variable | |
2631 | ||
2632 | Output: | |
2633 | ADJUSTMENT_DEF - a tree that holds a value to be added to the final result | |
2634 | of the reduction (used for adjusting the epilog - see below). | |
2635 | Return a vector variable, initialized according to the operation that STMT | |
2636 | performs. This vector will be used as the initial value of the | |
2637 | vector of partial results. | |
2638 | ||
2639 | Option1 (adjust in epilog): Initialize the vector as follows: | |
4bbe8262 IR |
2640 | add/bit or/xor: [0,0,...,0,0] |
2641 | mult/bit and: [1,1,...,1,1] | |
2642 | min/max/cond_expr: [init_val,init_val,..,init_val,init_val] | |
ebfd146a IR |
2643 | and when necessary (e.g. add/mult case) let the caller know |
2644 | that it needs to adjust the result by init_val. | |
2645 | ||
2646 | Option2: Initialize the vector as follows: | |
4bbe8262 IR |
2647 | add/bit or/xor: [init_val,0,0,...,0] |
2648 | mult/bit and: [init_val,1,1,...,1] | |
2649 | min/max/cond_expr: [init_val,init_val,...,init_val] | |
ebfd146a IR |
2650 | and no adjustments are needed. |
2651 | ||
2652 | For example, for the following code: | |
2653 | ||
2654 | s = init_val; | |
2655 | for (i=0;i<n;i++) | |
2656 | s = s + a[i]; | |
2657 | ||
2658 | STMT is 's = s + a[i]', and the reduction variable is 's'. | |
2659 | For a vector of 4 units, we want to return either [0,0,0,init_val], | |
2660 | or [0,0,0,0] and let the caller know that it needs to adjust | |
2661 | the result at the end by 'init_val'. | |
2662 | ||
2663 | FORNOW, we are using the 'adjust in epilog' scheme, because this way the | |
06066f92 IR |
2664 | initialization vector is simpler (same element in all entries), if |
2665 | ADJUSTMENT_DEF is not NULL, and Option2 otherwise. | |
b8698a0f | 2666 | |
ebfd146a IR |
2667 | A cost model should help decide between these two schemes. */ |
2668 | ||
2669 | tree | |
b8698a0f | 2670 | get_initial_def_for_reduction (gimple stmt, tree init_val, |
06066f92 | 2671 | tree *adjustment_def) |
ebfd146a IR |
2672 | { |
2673 | stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt); | |
2674 | loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo); | |
2675 | struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
550918ca RG |
2676 | tree scalar_type = TREE_TYPE (init_val); |
2677 | tree vectype = get_vectype_for_scalar_type (scalar_type); | |
2678 | int nunits; | |
ebfd146a | 2679 | enum tree_code code = gimple_assign_rhs_code (stmt); |
ebfd146a IR |
2680 | tree def_for_init; |
2681 | tree init_def; | |
2682 | tree t = NULL_TREE; | |
2683 | int i; | |
b8698a0f | 2684 | bool nested_in_vect_loop = false; |
06066f92 IR |
2685 | tree init_value; |
2686 | REAL_VALUE_TYPE real_init_val = dconst0; | |
2687 | int int_init_val = 0; | |
2f3e235b | 2688 | gimple def_stmt = NULL; |
ebfd146a | 2689 | |
550918ca RG |
2690 | gcc_assert (vectype); |
2691 | nunits = TYPE_VECTOR_SUBPARTS (vectype); | |
2692 | ||
2693 | gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type) | |
2694 | || SCALAR_FLOAT_TYPE_P (scalar_type)); | |
06066f92 | 2695 | |
ebfd146a IR |
2696 | if (nested_in_vect_loop_p (loop, stmt)) |
2697 | nested_in_vect_loop = true; | |
2698 | else | |
2699 | gcc_assert (loop == (gimple_bb (stmt))->loop_father); | |
2700 | ||
06066f92 IR |
2701 | /* In case of double reduction we only create a vector variable to be put |
2702 | in the reduction phi node. The actual statement creation is done in | |
2703 | vect_create_epilog_for_reduction. */ | |
2f3e235b IR |
2704 | if (adjustment_def && nested_in_vect_loop |
2705 | && TREE_CODE (init_val) == SSA_NAME | |
2706 | && (def_stmt = SSA_NAME_DEF_STMT (init_val)) | |
2707 | && gimple_code (def_stmt) == GIMPLE_PHI | |
2708 | && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)) | |
b8698a0f L |
2709 | && vinfo_for_stmt (def_stmt) |
2710 | && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt)) | |
06066f92 IR |
2711 | == vect_double_reduction_def) |
2712 | { | |
2713 | *adjustment_def = NULL; | |
2714 | return vect_create_destination_var (init_val, vectype); | |
2715 | } | |
ebfd146a | 2716 | |
06066f92 IR |
2717 | if (TREE_CONSTANT (init_val)) |
2718 | { | |
2719 | if (SCALAR_FLOAT_TYPE_P (scalar_type)) | |
2720 | init_value = build_real (scalar_type, TREE_REAL_CST (init_val)); | |
2721 | else | |
2722 | init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val)); | |
2723 | } | |
2724 | else | |
2725 | init_value = init_val; | |
ebfd146a | 2726 | |
06066f92 IR |
2727 | switch (code) |
2728 | { | |
2729 | case WIDEN_SUM_EXPR: | |
2730 | case DOT_PROD_EXPR: | |
2731 | case PLUS_EXPR: | |
2732 | case MINUS_EXPR: | |
2733 | case BIT_IOR_EXPR: | |
2734 | case BIT_XOR_EXPR: | |
2735 | case MULT_EXPR: | |
2736 | case BIT_AND_EXPR: | |
b8698a0f | 2737 | /* ADJUSMENT_DEF is NULL when called from |
06066f92 IR |
2738 | vect_create_epilog_for_reduction to vectorize double reduction. */ |
2739 | if (adjustment_def) | |
2740 | { | |
2741 | if (nested_in_vect_loop) | |
b8698a0f | 2742 | *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt, |
06066f92 IR |
2743 | NULL); |
2744 | else | |
2745 | *adjustment_def = init_val; | |
2746 | } | |
2747 | ||
2748 | if (code == MULT_EXPR || code == BIT_AND_EXPR) | |
2749 | { | |
2750 | real_init_val = dconst1; | |
2751 | int_init_val = 1; | |
2752 | } | |
2753 | ||
2754 | if (SCALAR_FLOAT_TYPE_P (scalar_type)) | |
2755 | def_for_init = build_real (scalar_type, real_init_val); | |
2756 | else | |
2757 | def_for_init = build_int_cst (scalar_type, int_init_val); | |
2758 | ||
b8698a0f | 2759 | /* Create a vector of '0' or '1' except the first element. */ |
06066f92 IR |
2760 | for (i = nunits - 2; i >= 0; --i) |
2761 | t = tree_cons (NULL_TREE, def_for_init, t); | |
2762 | ||
2763 | /* Option1: the first element is '0' or '1' as well. */ | |
2764 | if (adjustment_def) | |
2765 | { | |
2766 | t = tree_cons (NULL_TREE, def_for_init, t); | |
2767 | init_def = build_vector (vectype, t); | |
2768 | break; | |
2769 | } | |
2770 | ||
2771 | /* Option2: the first element is INIT_VAL. */ | |
2772 | t = tree_cons (NULL_TREE, init_value, t); | |
2773 | if (TREE_CONSTANT (init_val)) | |
2774 | init_def = build_vector (vectype, t); | |
2775 | else | |
2776 | init_def = build_constructor_from_list (vectype, t); | |
2777 | ||
2778 | break; | |
2779 | ||
2780 | case MIN_EXPR: | |
2781 | case MAX_EXPR: | |
4bbe8262 | 2782 | case COND_EXPR: |
06066f92 IR |
2783 | if (adjustment_def) |
2784 | { | |
2785 | *adjustment_def = NULL_TREE; | |
2786 | init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL); | |
2787 | break; | |
2788 | } | |
2789 | ||
2790 | for (i = nunits - 1; i >= 0; --i) | |
2791 | t = tree_cons (NULL_TREE, init_value, t); | |
2792 | ||
2793 | if (TREE_CONSTANT (init_val)) | |
2794 | init_def = build_vector (vectype, t); | |
2795 | else | |
2796 | init_def = build_constructor_from_list (vectype, t); | |
2797 | ||
2798 | break; | |
2799 | ||
2800 | default: | |
2801 | gcc_unreachable (); | |
2802 | } | |
ebfd146a IR |
2803 | |
2804 | return init_def; | |
2805 | } | |
2806 | ||
2807 | ||
2808 | /* Function vect_create_epilog_for_reduction | |
b8698a0f | 2809 | |
ebfd146a | 2810 | Create code at the loop-epilog to finalize the result of a reduction |
b8698a0f L |
2811 | computation. |
2812 | ||
2813 | VECT_DEF is a vector of partial results. | |
ebfd146a IR |
2814 | REDUC_CODE is the tree-code for the epilog reduction. |
2815 | NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the | |
2816 | number of elements that we can fit in a vectype (nunits). In this case | |
2817 | we have to generate more than one vector stmt - i.e - we need to "unroll" | |
2818 | the vector stmt by a factor VF/nunits. For more details see documentation | |
2819 | in vectorizable_operation. | |
2820 | STMT is the scalar reduction stmt that is being vectorized. | |
2821 | REDUCTION_PHI is the phi-node that carries the reduction computation. | |
b8698a0f | 2822 | REDUC_INDEX is the index of the operand in the right hand side of the |
7c5222ff | 2823 | statement that is defined by REDUCTION_PHI. |
06066f92 | 2824 | DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled. |
ebfd146a IR |
2825 | |
2826 | This function: | |
b8698a0f | 2827 | 1. Creates the reduction def-use cycle: sets the arguments for |
ebfd146a IR |
2828 | REDUCTION_PHI: |
2829 | The loop-entry argument is the vectorized initial-value of the reduction. | |
2830 | The loop-latch argument is VECT_DEF - the vector of partial sums. | |
2831 | 2. "Reduces" the vector of partial results VECT_DEF into a single result, | |
b8698a0f | 2832 | by applying the operation specified by REDUC_CODE if available, or by |
ebfd146a | 2833 | other means (whole-vector shifts or a scalar loop). |
b8698a0f | 2834 | The function also creates a new phi node at the loop exit to preserve |
ebfd146a | 2835 | loop-closed form, as illustrated below. |
b8698a0f | 2836 | |
ebfd146a | 2837 | The flow at the entry to this function: |
b8698a0f | 2838 | |
ebfd146a IR |
2839 | loop: |
2840 | vec_def = phi <null, null> # REDUCTION_PHI | |
2841 | VECT_DEF = vector_stmt # vectorized form of STMT | |
2842 | s_loop = scalar_stmt # (scalar) STMT | |
2843 | loop_exit: | |
2844 | s_out0 = phi <s_loop> # (scalar) EXIT_PHI | |
2845 | use <s_out0> | |
2846 | use <s_out0> | |
2847 | ||
2848 | The above is transformed by this function into: | |
2849 | ||
2850 | loop: | |
2851 | vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI | |
2852 | VECT_DEF = vector_stmt # vectorized form of STMT | |
b8698a0f | 2853 | s_loop = scalar_stmt # (scalar) STMT |
ebfd146a IR |
2854 | loop_exit: |
2855 | s_out0 = phi <s_loop> # (scalar) EXIT_PHI | |
2856 | v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI | |
2857 | v_out2 = reduce <v_out1> | |
2858 | s_out3 = extract_field <v_out2, 0> | |
2859 | s_out4 = adjust_result <s_out3> | |
2860 | use <s_out4> | |
2861 | use <s_out4> | |
2862 | */ | |
2863 | ||
2864 | static void | |
2865 | vect_create_epilog_for_reduction (tree vect_def, gimple stmt, | |
2866 | int ncopies, | |
2867 | enum tree_code reduc_code, | |
7c5222ff | 2868 | gimple reduction_phi, |
b8698a0f | 2869 | int reduc_index, |
06066f92 | 2870 | bool double_reduc) |
ebfd146a IR |
2871 | { |
2872 | stmt_vec_info stmt_info = vinfo_for_stmt (stmt); | |
2873 | stmt_vec_info prev_phi_info; | |
2874 | tree vectype; | |
2875 | enum machine_mode mode; | |
2876 | loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); | |
06066f92 | 2877 | struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL; |
ebfd146a IR |
2878 | basic_block exit_bb; |
2879 | tree scalar_dest; | |
2880 | tree scalar_type; | |
2881 | gimple new_phi = NULL, phi; | |
2882 | gimple_stmt_iterator exit_gsi; | |
2883 | tree vec_dest; | |
2884 | tree new_temp = NULL_TREE; | |
2885 | tree new_name; | |
2886 | gimple epilog_stmt = NULL; | |
2887 | tree new_scalar_dest, new_dest; | |
2888 | gimple exit_phi; | |
0f900dfa | 2889 | tree bitsize, bitpos; |
ebfd146a IR |
2890 | enum tree_code code = gimple_assign_rhs_code (stmt); |
2891 | tree adjustment_def; | |
2892 | tree vec_initial_def, def; | |
2893 | tree orig_name; | |
2894 | imm_use_iterator imm_iter; | |
2895 | use_operand_p use_p; | |
2896 | bool extract_scalar_result = false; | |
2897 | tree reduction_op, expr; | |
2898 | gimple orig_stmt; | |
2899 | gimple use_stmt; | |
2900 | bool nested_in_vect_loop = false; | |
2901 | VEC(gimple,heap) *phis = NULL; | |
2902 | enum vect_def_type dt = vect_unknown_def_type; | |
2903 | int j, i; | |
b8698a0f | 2904 | |
ebfd146a IR |
2905 | if (nested_in_vect_loop_p (loop, stmt)) |
2906 | { | |
06066f92 | 2907 | outer_loop = loop; |
ebfd146a IR |
2908 | loop = loop->inner; |
2909 | nested_in_vect_loop = true; | |
2910 | } | |
b8698a0f | 2911 | |
ebfd146a IR |
2912 | switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt))) |
2913 | { | |
2914 | case GIMPLE_SINGLE_RHS: | |
b8698a0f | 2915 | gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) |
7c5222ff IR |
2916 | == ternary_op); |
2917 | reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index); | |
ebfd146a IR |
2918 | break; |
2919 | case GIMPLE_UNARY_RHS: | |
2920 | reduction_op = gimple_assign_rhs1 (stmt); | |
2921 | break; | |
2922 | case GIMPLE_BINARY_RHS: | |
b8698a0f | 2923 | reduction_op = reduc_index ? |
7c5222ff | 2924 | gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt); |
ebfd146a IR |
2925 | break; |
2926 | default: | |
2927 | gcc_unreachable (); | |
2928 | } | |
2929 | ||
2930 | vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op)); | |
2931 | gcc_assert (vectype); | |
2932 | mode = TYPE_MODE (vectype); | |
2933 | ||
2934 | /*** 1. Create the reduction def-use cycle ***/ | |
b8698a0f | 2935 | |
ebfd146a IR |
2936 | /* For the case of reduction, vect_get_vec_def_for_operand returns |
2937 | the scalar def before the loop, that defines the initial value | |
2938 | of the reduction variable. */ | |
2939 | vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt, | |
06066f92 | 2940 | &adjustment_def); |
ebfd146a IR |
2941 | |
2942 | phi = reduction_phi; | |
2943 | def = vect_def; | |
2944 | for (j = 0; j < ncopies; j++) | |
2945 | { | |
2946 | /* 1.1 set the loop-entry arg of the reduction-phi: */ | |
b8698a0f | 2947 | add_phi_arg (phi, vec_initial_def, loop_preheader_edge (loop), |
f5045c96 | 2948 | UNKNOWN_LOCATION); |
ebfd146a IR |
2949 | |
2950 | /* 1.2 set the loop-latch arg for the reduction-phi: */ | |
2951 | if (j > 0) | |
2952 | def = vect_get_vec_def_for_stmt_copy (dt, def); | |
f5045c96 | 2953 | add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION); |
ebfd146a IR |
2954 | |
2955 | if (vect_print_dump_info (REPORT_DETAILS)) | |
2956 | { | |
2957 | fprintf (vect_dump, "transform reduction: created def-use cycle: "); | |
2958 | print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM); | |
06066f92 IR |
2959 | fprintf (vect_dump, "\n"); |
2960 | print_gimple_stmt (vect_dump, SSA_NAME_DEF_STMT (def), 0, TDF_SLIM); | |
ebfd146a IR |
2961 | } |
2962 | ||
2963 | phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)); | |
2964 | } | |
2965 | ||
2966 | /*** 2. Create epilog code | |
2967 | The reduction epilog code operates across the elements of the vector | |
2968 | of partial results computed by the vectorized loop. | |
2969 | The reduction epilog code consists of: | |
2970 | step 1: compute the scalar result in a vector (v_out2) | |
2971 | step 2: extract the scalar result (s_out3) from the vector (v_out2) | |
2972 | step 3: adjust the scalar result (s_out3) if needed. | |
2973 | ||
2974 | Step 1 can be accomplished using one the following three schemes: | |
2975 | (scheme 1) using reduc_code, if available. | |
2976 | (scheme 2) using whole-vector shifts, if available. | |
b8698a0f | 2977 | (scheme 3) using a scalar loop. In this case steps 1+2 above are |
ebfd146a | 2978 | combined. |
b8698a0f | 2979 | |
ebfd146a IR |
2980 | The overall epilog code looks like this: |
2981 | ||
2982 | s_out0 = phi <s_loop> # original EXIT_PHI | |
2983 | v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI | |
2984 | v_out2 = reduce <v_out1> # step 1 | |
2985 | s_out3 = extract_field <v_out2, 0> # step 2 | |
2986 | s_out4 = adjust_result <s_out3> # step 3 | |
2987 | ||
2988 | (step 3 is optional, and steps 1 and 2 may be combined). | |
2989 | Lastly, the uses of s_out0 are replaced by s_out4. | |
2990 | ||
2991 | ***/ | |
2992 | ||
2993 | /* 2.1 Create new loop-exit-phi to preserve loop-closed form: | |
2994 | v_out1 = phi <v_loop> */ | |
2995 | ||
2996 | exit_bb = single_exit (loop)->dest; | |
2997 | def = vect_def; | |
2998 | prev_phi_info = NULL; | |
2999 | for (j = 0; j < ncopies; j++) | |
3000 | { | |
3001 | phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb); | |
a70d6342 | 3002 | set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL)); |
ebfd146a IR |
3003 | if (j == 0) |
3004 | new_phi = phi; | |
3005 | else | |
3006 | { | |
3007 | def = vect_get_vec_def_for_stmt_copy (dt, def); | |
3008 | STMT_VINFO_RELATED_STMT (prev_phi_info) = phi; | |
3009 | } | |
3010 | SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def); | |
3011 | prev_phi_info = vinfo_for_stmt (phi); | |
3012 | } | |
7c5222ff | 3013 | |
ebfd146a IR |
3014 | exit_gsi = gsi_after_labels (exit_bb); |
3015 | ||
b8698a0f | 3016 | /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3 |
ebfd146a IR |
3017 | (i.e. when reduc_code is not available) and in the final adjustment |
3018 | code (if needed). Also get the original scalar reduction variable as | |
b8698a0f L |
3019 | defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it |
3020 | represents a reduction pattern), the tree-code and scalar-def are | |
3021 | taken from the original stmt that the pattern-stmt (STMT) replaces. | |
ebfd146a | 3022 | Otherwise (it is a regular reduction) - the tree-code and scalar-def |
b8698a0f | 3023 | are taken from STMT. */ |
ebfd146a IR |
3024 | |
3025 | orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info); | |
3026 | if (!orig_stmt) | |
3027 | { | |
3028 | /* Regular reduction */ | |
3029 | orig_stmt = stmt; | |
3030 | } | |
3031 | else | |
3032 | { | |
3033 | /* Reduction pattern */ | |
3034 | stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt); | |
3035 | gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo)); | |
3036 | gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt); | |
3037 | } | |
7c5222ff | 3038 | |
ebfd146a IR |
3039 | code = gimple_assign_rhs_code (orig_stmt); |
3040 | scalar_dest = gimple_assign_lhs (orig_stmt); | |
3041 | scalar_type = TREE_TYPE (scalar_dest); | |
3042 | new_scalar_dest = vect_create_destination_var (scalar_dest, NULL); | |
3043 | bitsize = TYPE_SIZE (scalar_type); | |
ebfd146a | 3044 | |
06066f92 IR |
3045 | /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore, |
3046 | partial results are added and not subtracted. */ | |
3047 | if (code == MINUS_EXPR) | |
3048 | code = PLUS_EXPR; | |
ebfd146a IR |
3049 | |
3050 | /* In case this is a reduction in an inner-loop while vectorizing an outer | |
3051 | loop - we don't need to extract a single scalar result at the end of the | |
06066f92 | 3052 | inner-loop (unless it is double reduction, i.e., the use of reduction is |
b8698a0f | 3053 | outside the outer-loop). The final vector of partial results will be used |
06066f92 IR |
3054 | in the vectorized outer-loop, or reduced to a scalar result at the end of |
3055 | the outer-loop. */ | |
3056 | if (nested_in_vect_loop && !double_reduc) | |
ebfd146a IR |
3057 | goto vect_finalize_reduction; |
3058 | ||
06066f92 IR |
3059 | /* The epilogue is created for the outer-loop, i.e., for the loop being |
3060 | vectorized. */ | |
3061 | if (double_reduc) | |
3062 | loop = outer_loop; | |
3063 | ||
ebfd146a IR |
3064 | /* FORNOW */ |
3065 | gcc_assert (ncopies == 1); | |
3066 | ||
3067 | /* 2.3 Create the reduction code, using one of the three schemes described | |
3068 | above. */ | |
3069 | ||
32e8bb8e | 3070 | if (reduc_code != ERROR_MARK) |
ebfd146a IR |
3071 | { |
3072 | tree tmp; | |
3073 | ||
3074 | /*** Case 1: Create: | |
3075 | v_out2 = reduc_expr <v_out1> */ | |
3076 | ||
3077 | if (vect_print_dump_info (REPORT_DETAILS)) | |
3078 | fprintf (vect_dump, "Reduce using direct vector reduction."); | |
3079 | ||
3080 | vec_dest = vect_create_destination_var (scalar_dest, vectype); | |
3081 | tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi)); | |
3082 | epilog_stmt = gimple_build_assign (vec_dest, tmp); | |
3083 | new_temp = make_ssa_name (vec_dest, epilog_stmt); | |
3084 | gimple_assign_set_lhs (epilog_stmt, new_temp); | |
3085 | gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); | |
3086 | ||
3087 | extract_scalar_result = true; | |
3088 | } | |
3089 | else | |
3090 | { | |
81f40b79 | 3091 | enum tree_code shift_code = ERROR_MARK; |
ebfd146a IR |
3092 | bool have_whole_vector_shift = true; |
3093 | int bit_offset; | |
3094 | int element_bitsize = tree_low_cst (bitsize, 1); | |
3095 | int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1); | |
3096 | tree vec_temp; | |
3097 | ||
3098 | if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing) | |
3099 | shift_code = VEC_RSHIFT_EXPR; | |
3100 | else | |
3101 | have_whole_vector_shift = false; | |
3102 | ||
3103 | /* Regardless of whether we have a whole vector shift, if we're | |
3104 | emulating the operation via tree-vect-generic, we don't want | |
3105 | to use it. Only the first round of the reduction is likely | |
3106 | to still be profitable via emulation. */ | |
3107 | /* ??? It might be better to emit a reduction tree code here, so that | |
3108 | tree-vect-generic can expand the first round via bit tricks. */ | |
3109 | if (!VECTOR_MODE_P (mode)) | |
3110 | have_whole_vector_shift = false; | |
3111 | else | |
3112 | { | |
3113 | optab optab = optab_for_tree_code (code, vectype, optab_default); | |
3114 | if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing) | |
3115 | have_whole_vector_shift = false; | |
3116 | } | |
3117 | ||
3118 | if (have_whole_vector_shift) | |
3119 | { | |
3120 | /*** Case 2: Create: | |
3121 | for (offset = VS/2; offset >= element_size; offset/=2) | |
3122 | { | |
3123 | Create: va' = vec_shift <va, offset> | |
3124 | Create: va = vop <va, va'> | |
3125 | } */ | |
3126 | ||
3127 | if (vect_print_dump_info (REPORT_DETAILS)) | |
3128 | fprintf (vect_dump, "Reduce using vector shifts"); | |
3129 | ||
3130 | vec_dest = vect_create_destination_var (scalar_dest, vectype); | |
3131 | new_temp = PHI_RESULT (new_phi); | |
3132 | ||
3133 | for (bit_offset = vec_size_in_bits/2; | |
3134 | bit_offset >= element_bitsize; | |
3135 | bit_offset /= 2) | |
3136 | { | |
3137 | tree bitpos = size_int (bit_offset); | |
b8698a0f | 3138 | |
ebfd146a IR |
3139 | epilog_stmt = gimple_build_assign_with_ops (shift_code, vec_dest, |
3140 | new_temp, bitpos); | |
3141 | new_name = make_ssa_name (vec_dest, epilog_stmt); | |
3142 | gimple_assign_set_lhs (epilog_stmt, new_name); | |
3143 | gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); | |
3144 | ||
3145 | epilog_stmt = gimple_build_assign_with_ops (code, vec_dest, | |
3146 | new_name, new_temp); | |
3147 | new_temp = make_ssa_name (vec_dest, epilog_stmt); | |
3148 | gimple_assign_set_lhs (epilog_stmt, new_temp); | |
3149 | gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); | |
3150 | } | |
3151 | ||
3152 | extract_scalar_result = true; | |
3153 | } | |
3154 | else | |
3155 | { | |
3156 | tree rhs; | |
3157 | ||
b8698a0f | 3158 | /*** Case 3: Create: |
ebfd146a | 3159 | s = extract_field <v_out2, 0> |
b8698a0f L |
3160 | for (offset = element_size; |
3161 | offset < vector_size; | |
ebfd146a IR |
3162 | offset += element_size;) |
3163 | { | |
3164 | Create: s' = extract_field <v_out2, offset> | |
3165 | Create: s = op <s, s'> | |
3166 | } */ | |
3167 | ||
3168 | if (vect_print_dump_info (REPORT_DETAILS)) | |
3169 | fprintf (vect_dump, "Reduce using scalar code. "); | |
3170 | ||
3171 | vec_temp = PHI_RESULT (new_phi); | |
3172 | vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1); | |
3173 | rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize, | |
3174 | bitsize_zero_node); | |
3175 | epilog_stmt = gimple_build_assign (new_scalar_dest, rhs); | |
3176 | new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); | |
3177 | gimple_assign_set_lhs (epilog_stmt, new_temp); | |
3178 | gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); | |
b8698a0f | 3179 | |
ebfd146a IR |
3180 | for (bit_offset = element_bitsize; |
3181 | bit_offset < vec_size_in_bits; | |
3182 | bit_offset += element_bitsize) | |
b8698a0f | 3183 | { |
ebfd146a IR |
3184 | tree bitpos = bitsize_int (bit_offset); |
3185 | tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize, | |
3186 | bitpos); | |
b8698a0f | 3187 | |
ebfd146a IR |
3188 | epilog_stmt = gimple_build_assign (new_scalar_dest, rhs); |
3189 | new_name = make_ssa_name (new_scalar_dest, epilog_stmt); | |
3190 | gimple_assign_set_lhs (epilog_stmt, new_name); | |
3191 | gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); | |
3192 | ||
3193 | epilog_stmt = gimple_build_assign_with_ops (code, | |
3194 | new_scalar_dest, | |
3195 | new_name, new_temp); | |
3196 | new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); | |
3197 | gimple_assign_set_lhs (epilog_stmt, new_temp); | |
3198 | gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); | |
3199 | } | |
3200 | ||
3201 | extract_scalar_result = false; | |
3202 | } | |
3203 | } | |
3204 | ||
3205 | /* 2.4 Extract the final scalar result. Create: | |
3206 | s_out3 = extract_field <v_out2, bitpos> */ | |
b8698a0f | 3207 | |
ebfd146a IR |
3208 | if (extract_scalar_result) |
3209 | { | |
3210 | tree rhs; | |
3211 | ||
06066f92 | 3212 | gcc_assert (!nested_in_vect_loop || double_reduc); |
ebfd146a IR |
3213 | if (vect_print_dump_info (REPORT_DETAILS)) |
3214 | fprintf (vect_dump, "extract scalar result"); | |
3215 | ||
3216 | if (BYTES_BIG_ENDIAN) | |
3217 | bitpos = size_binop (MULT_EXPR, | |
3218 | bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1), | |
3219 | TYPE_SIZE (scalar_type)); | |
3220 | else | |
3221 | bitpos = bitsize_zero_node; | |
3222 | ||
3223 | rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos); | |
3224 | epilog_stmt = gimple_build_assign (new_scalar_dest, rhs); | |
3225 | new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); | |
3226 | gimple_assign_set_lhs (epilog_stmt, new_temp); | |
3227 | gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); | |
3228 | } | |
3229 | ||
3230 | vect_finalize_reduction: | |
3231 | ||
06066f92 IR |
3232 | if (double_reduc) |
3233 | loop = loop->inner; | |
3234 | ||
ebfd146a IR |
3235 | /* 2.5 Adjust the final result by the initial value of the reduction |
3236 | variable. (When such adjustment is not needed, then | |
3237 | 'adjustment_def' is zero). For example, if code is PLUS we create: | |
3238 | new_temp = loop_exit_def + adjustment_def */ | |
3239 | ||
3240 | if (adjustment_def) | |
3241 | { | |
3242 | if (nested_in_vect_loop) | |
3243 | { | |
3244 | gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE); | |
3245 | expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def); | |
3246 | new_dest = vect_create_destination_var (scalar_dest, vectype); | |
3247 | } | |
3248 | else | |
3249 | { | |
3250 | gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE); | |
3251 | expr = build2 (code, scalar_type, new_temp, adjustment_def); | |
3252 | new_dest = vect_create_destination_var (scalar_dest, scalar_type); | |
3253 | } | |
7c5222ff | 3254 | |
ebfd146a IR |
3255 | epilog_stmt = gimple_build_assign (new_dest, expr); |
3256 | new_temp = make_ssa_name (new_dest, epilog_stmt); | |
3257 | gimple_assign_set_lhs (epilog_stmt, new_temp); | |
3258 | SSA_NAME_DEF_STMT (new_temp) = epilog_stmt; | |
3259 | gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT); | |
3260 | } | |
3261 | ||
3262 | ||
3263 | /* 2.6 Handle the loop-exit phi */ | |
3264 | ||
3265 | /* Replace uses of s_out0 with uses of s_out3: | |
3266 | Find the loop-closed-use at the loop exit of the original scalar result. | |
b8698a0f | 3267 | (The reduction result is expected to have two immediate uses - one at the |
ebfd146a IR |
3268 | latch block, and one at the loop exit). */ |
3269 | phis = VEC_alloc (gimple, heap, 10); | |
3270 | FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest) | |
3271 | { | |
3272 | if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))) | |
3273 | { | |
3274 | exit_phi = USE_STMT (use_p); | |
3275 | VEC_quick_push (gimple, phis, exit_phi); | |
3276 | } | |
3277 | } | |
06066f92 | 3278 | |
ebfd146a IR |
3279 | /* We expect to have found an exit_phi because of loop-closed-ssa form. */ |
3280 | gcc_assert (!VEC_empty (gimple, phis)); | |
3281 | ||
3282 | for (i = 0; VEC_iterate (gimple, phis, i, exit_phi); i++) | |
3283 | { | |
3284 | if (nested_in_vect_loop) | |
3285 | { | |
3286 | stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi); | |
06066f92 | 3287 | gimple vect_phi; |
ebfd146a IR |
3288 | |
3289 | /* FORNOW. Currently not supporting the case that an inner-loop | |
3290 | reduction is not used in the outer-loop (but only outside the | |
06066f92 | 3291 | outer-loop), unless it is double reduction. */ |
b8698a0f | 3292 | gcc_assert ((STMT_VINFO_RELEVANT_P (stmt_vinfo) |
06066f92 | 3293 | && !STMT_VINFO_LIVE_P (stmt_vinfo)) || double_reduc); |
ebfd146a IR |
3294 | |
3295 | epilog_stmt = adjustment_def ? epilog_stmt : new_phi; | |
3296 | STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt; | |
b8698a0f L |
3297 | set_vinfo_for_stmt (epilog_stmt, |
3298 | new_stmt_vec_info (epilog_stmt, loop_vinfo, | |
a70d6342 | 3299 | NULL)); |
ebfd146a IR |
3300 | if (adjustment_def) |
3301 | STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) = | |
3302 | STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi)); | |
06066f92 | 3303 | |
b8698a0f | 3304 | if (!double_reduc |
06066f92 IR |
3305 | || STMT_VINFO_DEF_TYPE (stmt_vinfo) != vect_double_reduction_def) |
3306 | continue; | |
3307 | ||
b8698a0f | 3308 | /* Handle double reduction: |
06066f92 IR |
3309 | |
3310 | stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop) | |
3311 | stmt2: s3 = phi <s1, s4> - (regular) reduction phi (inner loop) | |
3312 | stmt3: s4 = use (s3) - (regular) reduction stmt (inner loop) | |
3313 | stmt4: s2 = phi <s4> - double reduction stmt (outer loop) | |
3314 | ||
b8698a0f | 3315 | At that point the regular reduction (stmt2 and stmt3) is already |
06066f92 IR |
3316 | vectorized, as well as the exit phi node, stmt4. |
3317 | Here we vectorize the phi node of double reduction, stmt1, and | |
3318 | update all relevant statements. */ | |
3319 | ||
b8698a0f | 3320 | /* Go through all the uses of s2 to find double reduction phi node, |
06066f92 IR |
3321 | i.e., stmt1 above. */ |
3322 | orig_name = PHI_RESULT (exit_phi); | |
3323 | FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name) | |
3324 | { | |
3325 | stmt_vec_info use_stmt_vinfo = vinfo_for_stmt (use_stmt); | |
3326 | stmt_vec_info new_phi_vinfo; | |
3327 | tree vect_phi_init, preheader_arg, vect_phi_res, init_def; | |
3328 | basic_block bb = gimple_bb (use_stmt); | |
3329 | gimple use; | |
3330 | ||
3331 | /* Check that USE_STMT is really double reduction phi node. */ | |
3332 | if (gimple_code (use_stmt) != GIMPLE_PHI | |
3333 | || gimple_phi_num_args (use_stmt) != 2 | |
3334 | || !use_stmt_vinfo | |
b8698a0f | 3335 | || STMT_VINFO_DEF_TYPE (use_stmt_vinfo) |
06066f92 IR |
3336 | != vect_double_reduction_def |
3337 | || bb->loop_father != outer_loop) | |
3338 | continue; | |
3339 | ||
b8698a0f L |
3340 | /* Create vector phi node for double reduction: |
3341 | vs1 = phi <vs0, vs2> | |
06066f92 IR |
3342 | vs1 was created previously in this function by a call to |
3343 | vect_get_vec_def_for_operand and is stored in vec_initial_def; | |
3344 | vs2 is defined by EPILOG_STMT, the vectorized EXIT_PHI; | |
3345 | vs0 is created here. */ | |
3346 | ||
3347 | /* Create vector phi node. */ | |
3348 | vect_phi = create_phi_node (vec_initial_def, bb); | |
b8698a0f | 3349 | new_phi_vinfo = new_stmt_vec_info (vect_phi, |
06066f92 IR |
3350 | loop_vec_info_for_loop (outer_loop), NULL); |
3351 | set_vinfo_for_stmt (vect_phi, new_phi_vinfo); | |
3352 | ||
b8698a0f L |
3353 | /* Create vs0 - initial def of the double reduction phi. */ |
3354 | preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt, | |
3355 | loop_preheader_edge (outer_loop)); | |
06066f92 IR |
3356 | init_def = get_initial_def_for_reduction (stmt, preheader_arg, |
3357 | NULL); | |
3358 | vect_phi_init = vect_init_vector (use_stmt, init_def, vectype, | |
3359 | NULL); | |
b8698a0f | 3360 | |
06066f92 | 3361 | /* Update phi node arguments with vs0 and vs2. */ |
b8698a0f | 3362 | add_phi_arg (vect_phi, vect_phi_init, |
f5045c96 | 3363 | loop_preheader_edge (outer_loop), UNKNOWN_LOCATION); |
b8698a0f | 3364 | add_phi_arg (vect_phi, PHI_RESULT (epilog_stmt), |
f5045c96 | 3365 | loop_latch_edge (outer_loop), UNKNOWN_LOCATION); |
06066f92 IR |
3366 | if (vect_print_dump_info (REPORT_DETAILS)) |
3367 | { | |
3368 | fprintf (vect_dump, "created double reduction phi node: "); | |
3369 | print_gimple_stmt (vect_dump, vect_phi, 0, TDF_SLIM); | |
3370 | } | |
3371 | ||
3372 | vect_phi_res = PHI_RESULT (vect_phi); | |
3373 | ||
3374 | /* Replace the use, i.e., set the correct vs1 in the regular | |
3375 | reduction phi node. FORNOW, NCOPIES is always 1, so the loop | |
b8698a0f | 3376 | is redundant. */ |
06066f92 IR |
3377 | use = reduction_phi; |
3378 | for (j = 0; j < ncopies; j++) | |
3379 | { | |
3380 | edge pr_edge = loop_preheader_edge (loop); | |
b8698a0f | 3381 | SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res); |
06066f92 IR |
3382 | use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use)); |
3383 | } | |
3384 | } | |
ebfd146a IR |
3385 | } |
3386 | ||
3387 | /* Replace the uses: */ | |
3388 | orig_name = PHI_RESULT (exit_phi); | |
3389 | FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name) | |
3390 | FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) | |
3391 | SET_USE (use_p, new_temp); | |
3392 | } | |
06066f92 | 3393 | |
ebfd146a | 3394 | VEC_free (gimple, heap, phis); |
b8698a0f | 3395 | } |
ebfd146a IR |
3396 | |
3397 | ||
3398 | /* Function vectorizable_reduction. | |
3399 | ||
3400 | Check if STMT performs a reduction operation that can be vectorized. | |
3401 | If VEC_STMT is also passed, vectorize the STMT: create a vectorized | |
7c5222ff | 3402 | stmt to replace it, put it in VEC_STMT, and insert it at GSI. |
ebfd146a IR |
3403 | Return FALSE if not a vectorizable STMT, TRUE otherwise. |
3404 | ||
b8698a0f | 3405 | This function also handles reduction idioms (patterns) that have been |
ebfd146a IR |
3406 | recognized in advance during vect_pattern_recog. In this case, STMT may be |
3407 | of this form: | |
3408 | X = pattern_expr (arg0, arg1, ..., X) | |
3409 | and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original | |
3410 | sequence that had been detected and replaced by the pattern-stmt (STMT). | |
b8698a0f | 3411 | |
ebfd146a IR |
3412 | In some cases of reduction patterns, the type of the reduction variable X is |
3413 | different than the type of the other arguments of STMT. | |
3414 | In such cases, the vectype that is used when transforming STMT into a vector | |
3415 | stmt is different than the vectype that is used to determine the | |
b8698a0f | 3416 | vectorization factor, because it consists of a different number of elements |
ebfd146a IR |
3417 | than the actual number of elements that are being operated upon in parallel. |
3418 | ||
3419 | For example, consider an accumulation of shorts into an int accumulator. | |
3420 | On some targets it's possible to vectorize this pattern operating on 8 | |
3421 | shorts at a time (hence, the vectype for purposes of determining the | |
3422 | vectorization factor should be V8HI); on the other hand, the vectype that | |
3423 | is used to create the vector form is actually V4SI (the type of the result). | |
3424 | ||
3425 | Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that | |
3426 | indicates what is the actual level of parallelism (V8HI in the example), so | |
3427 | that the right vectorization factor would be derived. This vectype | |
3428 | corresponds to the type of arguments to the reduction stmt, and should *NOT* | |
3429 | be used to create the vectorized stmt. The right vectype for the vectorized | |
3430 | stmt is obtained from the type of the result X: | |
3431 | get_vectype_for_scalar_type (TREE_TYPE (X)) | |
3432 | ||
3433 | This means that, contrary to "regular" reductions (or "regular" stmts in | |
3434 | general), the following equation: | |
3435 | STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X)) | |
3436 | does *NOT* necessarily hold for reduction patterns. */ | |
3437 | ||
3438 | bool | |
3439 | vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi, | |
3440 | gimple *vec_stmt) | |
3441 | { | |
3442 | tree vec_dest; | |
3443 | tree scalar_dest; | |
3444 | tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE; | |
3445 | stmt_vec_info stmt_info = vinfo_for_stmt (stmt); | |
3446 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); | |
3447 | loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); | |
3448 | struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
32e8bb8e | 3449 | enum tree_code code, orig_code, epilog_reduc_code; |
ebfd146a IR |
3450 | enum machine_mode vec_mode; |
3451 | int op_type; | |
3452 | optab optab, reduc_optab; | |
3453 | tree new_temp = NULL_TREE; | |
3454 | tree def; | |
3455 | gimple def_stmt; | |
3456 | enum vect_def_type dt; | |
3457 | gimple new_phi = NULL; | |
3458 | tree scalar_type; | |
3459 | bool is_simple_use; | |
3460 | gimple orig_stmt; | |
3461 | stmt_vec_info orig_stmt_info; | |
3462 | tree expr = NULL_TREE; | |
3463 | int i; | |
3464 | int nunits = TYPE_VECTOR_SUBPARTS (vectype); | |
3465 | int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits; | |
3466 | int epilog_copies; | |
3467 | stmt_vec_info prev_stmt_info, prev_phi_info; | |
3468 | gimple first_phi = NULL; | |
3469 | bool single_defuse_cycle = false; | |
4bbe8262 | 3470 | tree reduc_def = NULL_TREE; |
ebfd146a IR |
3471 | gimple new_stmt = NULL; |
3472 | int j; | |
3473 | tree ops[3]; | |
7c5222ff IR |
3474 | bool nested_cycle = false, found_nested_cycle_def = false; |
3475 | gimple reduc_def_stmt = NULL; | |
3476 | /* The default is that the reduction variable is the last in statement. */ | |
3477 | int reduc_index = 2; | |
06066f92 IR |
3478 | bool double_reduc = false, dummy; |
3479 | basic_block def_bb; | |
2f3e235b | 3480 | struct loop * def_stmt_loop, *outer_loop = NULL; |
06066f92 | 3481 | tree def_arg; |
2f3e235b | 3482 | gimple def_arg_stmt; |
ebfd146a IR |
3483 | |
3484 | if (nested_in_vect_loop_p (loop, stmt)) | |
7c5222ff | 3485 | { |
2f3e235b | 3486 | outer_loop = loop; |
7c5222ff IR |
3487 | loop = loop->inner; |
3488 | nested_cycle = true; | |
3489 | } | |
ebfd146a IR |
3490 | |
3491 | gcc_assert (ncopies >= 1); | |
3492 | ||
3493 | /* FORNOW: SLP not supported. */ | |
3494 | if (STMT_SLP_TYPE (stmt_info)) | |
3495 | return false; | |
3496 | ||
3497 | /* 1. Is vectorizable reduction? */ | |
ebfd146a IR |
3498 | /* Not supportable if the reduction variable is used in the loop. */ |
3499 | if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer) | |
3500 | return false; | |
3501 | ||
3502 | /* Reductions that are not used even in an enclosing outer-loop, | |
3503 | are expected to be "live" (used out of the loop). */ | |
8644a673 | 3504 | if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope |
ebfd146a IR |
3505 | && !STMT_VINFO_LIVE_P (stmt_info)) |
3506 | return false; | |
3507 | ||
3508 | /* Make sure it was already recognized as a reduction computation. */ | |
7c5222ff IR |
3509 | if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def |
3510 | && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle) | |
ebfd146a IR |
3511 | return false; |
3512 | ||
b8698a0f | 3513 | /* 2. Has this been recognized as a reduction pattern? |
ebfd146a IR |
3514 | |
3515 | Check if STMT represents a pattern that has been recognized | |
3516 | in earlier analysis stages. For stmts that represent a pattern, | |
3517 | the STMT_VINFO_RELATED_STMT field records the last stmt in | |
3518 | the original sequence that constitutes the pattern. */ | |
3519 | ||
3520 | orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info); | |
3521 | if (orig_stmt) | |
3522 | { | |
3523 | orig_stmt_info = vinfo_for_stmt (orig_stmt); | |
3524 | gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt); | |
3525 | gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)); | |
3526 | gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info)); | |
3527 | } | |
b8698a0f | 3528 | |
ebfd146a IR |
3529 | /* 3. Check the operands of the operation. The first operands are defined |
3530 | inside the loop body. The last operand is the reduction variable, | |
3531 | which is defined by the loop-header-phi. */ | |
3532 | ||
3533 | gcc_assert (is_gimple_assign (stmt)); | |
3534 | ||
3535 | /* Flatten RHS */ | |
3536 | switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt))) | |
3537 | { | |
3538 | case GIMPLE_SINGLE_RHS: | |
3539 | op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)); | |
3540 | if (op_type == ternary_op) | |
3541 | { | |
3542 | tree rhs = gimple_assign_rhs1 (stmt); | |
3543 | ops[0] = TREE_OPERAND (rhs, 0); | |
3544 | ops[1] = TREE_OPERAND (rhs, 1); | |
3545 | ops[2] = TREE_OPERAND (rhs, 2); | |
3546 | code = TREE_CODE (rhs); | |
3547 | } | |
3548 | else | |
3549 | return false; | |
3550 | break; | |
3551 | ||
3552 | case GIMPLE_BINARY_RHS: | |
3553 | code = gimple_assign_rhs_code (stmt); | |
3554 | op_type = TREE_CODE_LENGTH (code); | |
3555 | gcc_assert (op_type == binary_op); | |
3556 | ops[0] = gimple_assign_rhs1 (stmt); | |
3557 | ops[1] = gimple_assign_rhs2 (stmt); | |
3558 | break; | |
3559 | ||
3560 | case GIMPLE_UNARY_RHS: | |
3561 | return false; | |
3562 | ||
3563 | default: | |
3564 | gcc_unreachable (); | |
3565 | } | |
3566 | ||
3567 | scalar_dest = gimple_assign_lhs (stmt); | |
3568 | scalar_type = TREE_TYPE (scalar_dest); | |
b8698a0f | 3569 | if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type) |
ebfd146a IR |
3570 | && !SCALAR_FLOAT_TYPE_P (scalar_type)) |
3571 | return false; | |
3572 | ||
3573 | /* All uses but the last are expected to be defined in the loop. | |
b8698a0f | 3574 | The last use is the reduction variable. In case of nested cycle this |
7c5222ff IR |
3575 | assumption is not true: we use reduc_index to record the index of the |
3576 | reduction variable. */ | |
ebfd146a IR |
3577 | for (i = 0; i < op_type-1; i++) |
3578 | { | |
4bbe8262 IR |
3579 | /* The condition of COND_EXPR is checked in vectorizable_condition(). */ |
3580 | if (i == 0 && code == COND_EXPR) | |
3581 | continue; | |
3582 | ||
a70d6342 | 3583 | is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, NULL, &def_stmt, |
ebfd146a IR |
3584 | &def, &dt); |
3585 | gcc_assert (is_simple_use); | |
8644a673 IR |
3586 | if (dt != vect_internal_def |
3587 | && dt != vect_external_def | |
ebfd146a | 3588 | && dt != vect_constant_def |
7c5222ff | 3589 | && dt != vect_induction_def |
4bbe8262 | 3590 | && !(dt == vect_nested_cycle && nested_cycle)) |
ebfd146a | 3591 | return false; |
7c5222ff IR |
3592 | |
3593 | if (dt == vect_nested_cycle) | |
3594 | { | |
3595 | found_nested_cycle_def = true; | |
3596 | reduc_def_stmt = def_stmt; | |
3597 | reduc_index = i; | |
3598 | } | |
ebfd146a IR |
3599 | } |
3600 | ||
b8698a0f | 3601 | is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, NULL, &def_stmt, |
a70d6342 | 3602 | &def, &dt); |
ebfd146a | 3603 | gcc_assert (is_simple_use); |
7c5222ff IR |
3604 | gcc_assert (dt == vect_reduction_def |
3605 | || dt == vect_nested_cycle | |
b8698a0f | 3606 | || ((dt == vect_internal_def || dt == vect_external_def |
7c5222ff | 3607 | || dt == vect_constant_def || dt == vect_induction_def) |
b8698a0f | 3608 | && nested_cycle && found_nested_cycle_def)); |
7c5222ff IR |
3609 | if (!found_nested_cycle_def) |
3610 | reduc_def_stmt = def_stmt; | |
3611 | ||
3612 | gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI); | |
b8698a0f L |
3613 | if (orig_stmt) |
3614 | gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo, | |
3615 | reduc_def_stmt, | |
3616 | !nested_cycle, | |
06066f92 | 3617 | &dummy)); |
ebfd146a | 3618 | else |
b8698a0f | 3619 | gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, reduc_def_stmt, |
06066f92 | 3620 | !nested_cycle, &dummy)); |
b8698a0f | 3621 | |
7c5222ff | 3622 | if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt))) |
ebfd146a IR |
3623 | return false; |
3624 | ||
4bbe8262 | 3625 | vec_mode = TYPE_MODE (vectype); |
ebfd146a | 3626 | |
4bbe8262 | 3627 | if (code == COND_EXPR) |
ebfd146a | 3628 | { |
4bbe8262 IR |
3629 | if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0)) |
3630 | { | |
3631 | if (vect_print_dump_info (REPORT_DETAILS)) | |
3632 | fprintf (vect_dump, "unsupported condition in reduction"); | |
3633 | ||
3634 | return false; | |
3635 | } | |
ebfd146a | 3636 | } |
4bbe8262 | 3637 | else |
ebfd146a | 3638 | { |
4bbe8262 | 3639 | /* 4. Supportable by target? */ |
ebfd146a | 3640 | |
4bbe8262 IR |
3641 | /* 4.1. check support for the operation in the loop */ |
3642 | optab = optab_for_tree_code (code, vectype, optab_default); | |
3643 | if (!optab) | |
3644 | { | |
3645 | if (vect_print_dump_info (REPORT_DETAILS)) | |
3646 | fprintf (vect_dump, "no optab."); | |
3647 | ||
3648 | return false; | |
3649 | } | |
3650 | ||
3651 | if (optab_handler (optab, vec_mode)->insn_code == CODE_FOR_nothing) | |
3652 | { | |
3653 | if (vect_print_dump_info (REPORT_DETAILS)) | |
3654 | fprintf (vect_dump, "op not supported by target."); | |
3655 | ||
3656 | if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD | |
3657 | || LOOP_VINFO_VECT_FACTOR (loop_vinfo) | |
3658 | < vect_min_worthwhile_factor (code)) | |
3659 | return false; | |
3660 | ||
3661 | if (vect_print_dump_info (REPORT_DETAILS)) | |
3662 | fprintf (vect_dump, "proceeding using word mode."); | |
3663 | } | |
3664 | ||
3665 | /* Worthwhile without SIMD support? */ | |
3666 | if (!VECTOR_MODE_P (TYPE_MODE (vectype)) | |
3667 | && LOOP_VINFO_VECT_FACTOR (loop_vinfo) | |
3668 | < vect_min_worthwhile_factor (code)) | |
3669 | { | |
3670 | if (vect_print_dump_info (REPORT_DETAILS)) | |
3671 | fprintf (vect_dump, "not worthwhile without SIMD support."); | |
3672 | ||
3673 | return false; | |
3674 | } | |
ebfd146a IR |
3675 | } |
3676 | ||
3677 | /* 4.2. Check support for the epilog operation. | |
3678 | ||
3679 | If STMT represents a reduction pattern, then the type of the | |
3680 | reduction variable may be different than the type of the rest | |
3681 | of the arguments. For example, consider the case of accumulation | |
3682 | of shorts into an int accumulator; The original code: | |
3683 | S1: int_a = (int) short_a; | |
3684 | orig_stmt-> S2: int_acc = plus <int_a ,int_acc>; | |
3685 | ||
3686 | was replaced with: | |
3687 | STMT: int_acc = widen_sum <short_a, int_acc> | |
3688 | ||
3689 | This means that: | |
b8698a0f L |
3690 | 1. The tree-code that is used to create the vector operation in the |
3691 | epilog code (that reduces the partial results) is not the | |
3692 | tree-code of STMT, but is rather the tree-code of the original | |
3693 | stmt from the pattern that STMT is replacing. I.e, in the example | |
3694 | above we want to use 'widen_sum' in the loop, but 'plus' in the | |
ebfd146a IR |
3695 | epilog. |
3696 | 2. The type (mode) we use to check available target support | |
b8698a0f L |
3697 | for the vector operation to be created in the *epilog*, is |
3698 | determined by the type of the reduction variable (in the example | |
ebfd146a IR |
3699 | above we'd check this: plus_optab[vect_int_mode]). |
3700 | However the type (mode) we use to check available target support | |
3701 | for the vector operation to be created *inside the loop*, is | |
3702 | determined by the type of the other arguments to STMT (in the | |
3703 | example we'd check this: widen_sum_optab[vect_short_mode]). | |
b8698a0f L |
3704 | |
3705 | This is contrary to "regular" reductions, in which the types of all | |
3706 | the arguments are the same as the type of the reduction variable. | |
3707 | For "regular" reductions we can therefore use the same vector type | |
ebfd146a IR |
3708 | (and also the same tree-code) when generating the epilog code and |
3709 | when generating the code inside the loop. */ | |
3710 | ||
3711 | if (orig_stmt) | |
3712 | { | |
3713 | /* This is a reduction pattern: get the vectype from the type of the | |
3714 | reduction variable, and get the tree-code from orig_stmt. */ | |
3715 | orig_code = gimple_assign_rhs_code (orig_stmt); | |
3716 | vectype = get_vectype_for_scalar_type (TREE_TYPE (def)); | |
3717 | if (!vectype) | |
3718 | { | |
3719 | if (vect_print_dump_info (REPORT_DETAILS)) | |
3720 | { | |
3721 | fprintf (vect_dump, "unsupported data-type "); | |
3722 | print_generic_expr (vect_dump, TREE_TYPE (def), TDF_SLIM); | |
3723 | } | |
3724 | return false; | |
3725 | } | |
3726 | ||
3727 | vec_mode = TYPE_MODE (vectype); | |
3728 | } | |
3729 | else | |
3730 | { | |
3731 | /* Regular reduction: use the same vectype and tree-code as used for | |
3732 | the vector code inside the loop can be used for the epilog code. */ | |
3733 | orig_code = code; | |
3734 | } | |
3735 | ||
2f3e235b IR |
3736 | if (nested_cycle) |
3737 | { | |
3738 | def_bb = gimple_bb (reduc_def_stmt); | |
3739 | def_stmt_loop = def_bb->loop_father; | |
3740 | def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt, | |
3741 | loop_preheader_edge (def_stmt_loop)); | |
3742 | if (TREE_CODE (def_arg) == SSA_NAME | |
3743 | && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg)) | |
3744 | && gimple_code (def_arg_stmt) == GIMPLE_PHI | |
3745 | && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt)) | |
3746 | && vinfo_for_stmt (def_arg_stmt) | |
3747 | && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt)) | |
3748 | == vect_double_reduction_def) | |
3749 | double_reduc = true; | |
3750 | } | |
06066f92 | 3751 | |
4bbe8262 IR |
3752 | epilog_reduc_code = ERROR_MARK; |
3753 | if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code)) | |
3754 | { | |
b8698a0f | 3755 | reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype, |
4bbe8262 IR |
3756 | optab_default); |
3757 | if (!reduc_optab) | |
3758 | { | |
3759 | if (vect_print_dump_info (REPORT_DETAILS)) | |
3760 | fprintf (vect_dump, "no optab for reduction."); | |
3761 | ||
3762 | epilog_reduc_code = ERROR_MARK; | |
3763 | } | |
3764 | ||
3765 | if (reduc_optab | |
b8698a0f | 3766 | && optab_handler (reduc_optab, vec_mode)->insn_code |
4bbe8262 IR |
3767 | == CODE_FOR_nothing) |
3768 | { | |
3769 | if (vect_print_dump_info (REPORT_DETAILS)) | |
3770 | fprintf (vect_dump, "reduc op not supported by target."); | |
b8698a0f | 3771 | |
4bbe8262 IR |
3772 | epilog_reduc_code = ERROR_MARK; |
3773 | } | |
3774 | } | |
3775 | else | |
3776 | { | |
3777 | if (!nested_cycle || double_reduc) | |
3778 | { | |
3779 | if (vect_print_dump_info (REPORT_DETAILS)) | |
3780 | fprintf (vect_dump, "no reduc code for scalar code."); | |
3781 | ||
3782 | return false; | |
3783 | } | |
3784 | } | |
3785 | ||
06066f92 IR |
3786 | if (double_reduc && ncopies > 1) |
3787 | { | |
3788 | if (vect_print_dump_info (REPORT_DETAILS)) | |
3789 | fprintf (vect_dump, "multiple types in double reduction"); | |
3790 | ||
3791 | return false; | |
3792 | } | |
b8698a0f | 3793 | |
ebfd146a IR |
3794 | if (!vec_stmt) /* transformation not required. */ |
3795 | { | |
3796 | STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type; | |
3797 | if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies)) | |
3798 | return false; | |
3799 | return true; | |
3800 | } | |
3801 | ||
3802 | /** Transform. **/ | |
3803 | ||
3804 | if (vect_print_dump_info (REPORT_DETAILS)) | |
3805 | fprintf (vect_dump, "transform reduction."); | |
3806 | ||
4bbe8262 IR |
3807 | /* FORNOW: Multiple types are not supported for condition. */ |
3808 | if (code == COND_EXPR) | |
3809 | gcc_assert (ncopies == 1); | |
3810 | ||
ebfd146a IR |
3811 | /* Create the destination vector */ |
3812 | vec_dest = vect_create_destination_var (scalar_dest, vectype); | |
3813 | ||
3814 | /* In case the vectorization factor (VF) is bigger than the number | |
3815 | of elements that we can fit in a vectype (nunits), we have to generate | |
3816 | more than one vector stmt - i.e - we need to "unroll" the | |
3817 | vector stmt by a factor VF/nunits. For more details see documentation | |
3818 | in vectorizable_operation. */ | |
3819 | ||
3820 | /* If the reduction is used in an outer loop we need to generate | |
3821 | VF intermediate results, like so (e.g. for ncopies=2): | |
3822 | r0 = phi (init, r0) | |
3823 | r1 = phi (init, r1) | |
3824 | r0 = x0 + r0; | |
3825 | r1 = x1 + r1; | |
3826 | (i.e. we generate VF results in 2 registers). | |
3827 | In this case we have a separate def-use cycle for each copy, and therefore | |
3828 | for each copy we get the vector def for the reduction variable from the | |
3829 | respective phi node created for this copy. | |
3830 | ||
3831 | Otherwise (the reduction is unused in the loop nest), we can combine | |
3832 | together intermediate results, like so (e.g. for ncopies=2): | |
3833 | r = phi (init, r) | |
3834 | r = x0 + r; | |
3835 | r = x1 + r; | |
3836 | (i.e. we generate VF/2 results in a single register). | |
3837 | In this case for each copy we get the vector def for the reduction variable | |
3838 | from the vectorized reduction operation generated in the previous iteration. | |
3839 | */ | |
3840 | ||
8644a673 | 3841 | if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope) |
ebfd146a IR |
3842 | { |
3843 | single_defuse_cycle = true; | |
3844 | epilog_copies = 1; | |
3845 | } | |
3846 | else | |
3847 | epilog_copies = ncopies; | |
3848 | ||
3849 | prev_stmt_info = NULL; | |
3850 | prev_phi_info = NULL; | |
3851 | for (j = 0; j < ncopies; j++) | |
3852 | { | |
3853 | if (j == 0 || !single_defuse_cycle) | |
3854 | { | |
3855 | /* Create the reduction-phi that defines the reduction-operand. */ | |
3856 | new_phi = create_phi_node (vec_dest, loop->header); | |
b8698a0f | 3857 | set_vinfo_for_stmt (new_phi, new_stmt_vec_info (new_phi, loop_vinfo, |
a70d6342 | 3858 | NULL)); |
4bbe8262 IR |
3859 | /* Get the vector def for the reduction variable from the phi |
3860 | node. */ | |
3861 | reduc_def = PHI_RESULT (new_phi); | |
ebfd146a IR |
3862 | } |
3863 | ||
4bbe8262 IR |
3864 | if (code == COND_EXPR) |
3865 | { | |
3866 | first_phi = new_phi; | |
3867 | vectorizable_condition (stmt, gsi, vec_stmt, reduc_def, reduc_index); | |
3868 | /* Multiple types are not supported for condition. */ | |
3869 | break; | |
3870 | } | |
3871 | ||
ebfd146a IR |
3872 | /* Handle uses. */ |
3873 | if (j == 0) | |
3874 | { | |
b8698a0f | 3875 | loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index], |
7c5222ff | 3876 | stmt, NULL); |
ebfd146a IR |
3877 | if (op_type == ternary_op) |
3878 | { | |
7c5222ff | 3879 | if (reduc_index == 0) |
b8698a0f | 3880 | loop_vec_def1 = vect_get_vec_def_for_operand (ops[2], stmt, |
7c5222ff IR |
3881 | NULL); |
3882 | else | |
b8698a0f | 3883 | loop_vec_def1 = vect_get_vec_def_for_operand (ops[1], stmt, |
7c5222ff | 3884 | NULL); |
ebfd146a IR |
3885 | } |
3886 | ||
b8698a0f | 3887 | /* Get the vector def for the reduction variable from the phi |
7c5222ff | 3888 | node. */ |
ebfd146a IR |
3889 | first_phi = new_phi; |
3890 | } | |
3891 | else | |
3892 | { | |
3893 | enum vect_def_type dt = vect_unknown_def_type; /* Dummy */ | |
3894 | loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0); | |
3895 | if (op_type == ternary_op) | |
3896 | loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1); | |
3897 | ||
3898 | if (single_defuse_cycle) | |
3899 | reduc_def = gimple_assign_lhs (new_stmt); | |
3900 | else | |
3901 | reduc_def = PHI_RESULT (new_phi); | |
3902 | ||
3903 | STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi; | |
3904 | } | |
3905 | ||
4bbe8262 | 3906 | /* Arguments are ready. Create the new vector stmt. */ |
ebfd146a | 3907 | if (op_type == binary_op) |
7c5222ff IR |
3908 | { |
3909 | if (reduc_index == 0) | |
3910 | expr = build2 (code, vectype, reduc_def, loop_vec_def0); | |
3911 | else | |
3912 | expr = build2 (code, vectype, loop_vec_def0, reduc_def); | |
3913 | } | |
ebfd146a | 3914 | else |
7c5222ff IR |
3915 | { |
3916 | if (reduc_index == 0) | |
b8698a0f | 3917 | expr = build3 (code, vectype, reduc_def, loop_vec_def0, |
7c5222ff | 3918 | loop_vec_def1); |
b8698a0f | 3919 | else |
7c5222ff IR |
3920 | { |
3921 | if (reduc_index == 1) | |
b8698a0f | 3922 | expr = build3 (code, vectype, loop_vec_def0, reduc_def, |
7c5222ff IR |
3923 | loop_vec_def1); |
3924 | else | |
b8698a0f | 3925 | expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1, |
7c5222ff IR |
3926 | reduc_def); |
3927 | } | |
3928 | } | |
3929 | ||
ebfd146a IR |
3930 | new_stmt = gimple_build_assign (vec_dest, expr); |
3931 | new_temp = make_ssa_name (vec_dest, new_stmt); | |
3932 | gimple_assign_set_lhs (new_stmt, new_temp); | |
3933 | vect_finish_stmt_generation (stmt, new_stmt, gsi); | |
b8698a0f | 3934 | |
ebfd146a IR |
3935 | if (j == 0) |
3936 | STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt; | |
3937 | else | |
3938 | STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt; | |
4bbe8262 | 3939 | |
ebfd146a IR |
3940 | prev_stmt_info = vinfo_for_stmt (new_stmt); |
3941 | prev_phi_info = vinfo_for_stmt (new_phi); | |
3942 | } | |
3943 | ||
3944 | /* Finalize the reduction-phi (set its arguments) and create the | |
3945 | epilog reduction code. */ | |
4bbe8262 | 3946 | if (!single_defuse_cycle || code == COND_EXPR) |
ebfd146a | 3947 | new_temp = gimple_assign_lhs (*vec_stmt); |
06066f92 | 3948 | |
ebfd146a | 3949 | vect_create_epilog_for_reduction (new_temp, stmt, epilog_copies, |
06066f92 IR |
3950 | epilog_reduc_code, first_phi, reduc_index, |
3951 | double_reduc); | |
ebfd146a IR |
3952 | return true; |
3953 | } | |
3954 | ||
3955 | /* Function vect_min_worthwhile_factor. | |
3956 | ||
3957 | For a loop where we could vectorize the operation indicated by CODE, | |
3958 | return the minimum vectorization factor that makes it worthwhile | |
3959 | to use generic vectors. */ | |
3960 | int | |
3961 | vect_min_worthwhile_factor (enum tree_code code) | |
3962 | { | |
3963 | switch (code) | |
3964 | { | |
3965 | case PLUS_EXPR: | |
3966 | case MINUS_EXPR: | |
3967 | case NEGATE_EXPR: | |
3968 | return 4; | |
3969 | ||
3970 | case BIT_AND_EXPR: | |
3971 | case BIT_IOR_EXPR: | |
3972 | case BIT_XOR_EXPR: | |
3973 | case BIT_NOT_EXPR: | |
3974 | return 2; | |
3975 | ||
3976 | default: | |
3977 | return INT_MAX; | |
3978 | } | |
3979 | } | |
3980 | ||
3981 | ||
3982 | /* Function vectorizable_induction | |
3983 | ||
3984 | Check if PHI performs an induction computation that can be vectorized. | |
3985 | If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized | |
3986 | phi to replace it, put it in VEC_STMT, and add it to the same basic block. | |
3987 | Return FALSE if not a vectorizable STMT, TRUE otherwise. */ | |
3988 | ||
3989 | bool | |
3990 | vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED, | |
3991 | gimple *vec_stmt) | |
3992 | { | |
3993 | stmt_vec_info stmt_info = vinfo_for_stmt (phi); | |
3994 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); | |
3995 | loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); | |
3996 | struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
3997 | int nunits = TYPE_VECTOR_SUBPARTS (vectype); | |
3998 | int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits; | |
3999 | tree vec_def; | |
4000 | ||
4001 | gcc_assert (ncopies >= 1); | |
4002 | /* FORNOW. This restriction should be relaxed. */ | |
4003 | if (nested_in_vect_loop_p (loop, phi) && ncopies > 1) | |
4004 | { | |
4005 | if (vect_print_dump_info (REPORT_DETAILS)) | |
4006 | fprintf (vect_dump, "multiple types in nested loop."); | |
4007 | return false; | |
4008 | } | |
4009 | ||
4010 | if (!STMT_VINFO_RELEVANT_P (stmt_info)) | |
4011 | return false; | |
4012 | ||
4013 | /* FORNOW: SLP not supported. */ | |
4014 | if (STMT_SLP_TYPE (stmt_info)) | |
4015 | return false; | |
4016 | ||
4017 | gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def); | |
4018 | ||
4019 | if (gimple_code (phi) != GIMPLE_PHI) | |
4020 | return false; | |
4021 | ||
4022 | if (!vec_stmt) /* transformation not required. */ | |
4023 | { | |
4024 | STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type; | |
4025 | if (vect_print_dump_info (REPORT_DETAILS)) | |
4026 | fprintf (vect_dump, "=== vectorizable_induction ==="); | |
4027 | vect_model_induction_cost (stmt_info, ncopies); | |
4028 | return true; | |
4029 | } | |
4030 | ||
4031 | /** Transform. **/ | |
4032 | ||
4033 | if (vect_print_dump_info (REPORT_DETAILS)) | |
4034 | fprintf (vect_dump, "transform induction phi."); | |
4035 | ||
4036 | vec_def = get_initial_def_for_induction (phi); | |
4037 | *vec_stmt = SSA_NAME_DEF_STMT (vec_def); | |
4038 | return true; | |
4039 | } | |
4040 | ||
4041 | /* Function vectorizable_live_operation. | |
4042 | ||
b8698a0f | 4043 | STMT computes a value that is used outside the loop. Check if |
ebfd146a IR |
4044 | it can be supported. */ |
4045 | ||
4046 | bool | |
4047 | vectorizable_live_operation (gimple stmt, | |
4048 | gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED, | |
4049 | gimple *vec_stmt ATTRIBUTE_UNUSED) | |
4050 | { | |
4051 | stmt_vec_info stmt_info = vinfo_for_stmt (stmt); | |
4052 | loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); | |
4053 | struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
4054 | int i; | |
4055 | int op_type; | |
4056 | tree op; | |
4057 | tree def; | |
4058 | gimple def_stmt; | |
b8698a0f | 4059 | enum vect_def_type dt; |
ebfd146a IR |
4060 | enum tree_code code; |
4061 | enum gimple_rhs_class rhs_class; | |
4062 | ||
4063 | gcc_assert (STMT_VINFO_LIVE_P (stmt_info)); | |
4064 | ||
4065 | if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def) | |
4066 | return false; | |
4067 | ||
4068 | if (!is_gimple_assign (stmt)) | |
4069 | return false; | |
4070 | ||
4071 | if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME) | |
4072 | return false; | |
4073 | ||
4074 | /* FORNOW. CHECKME. */ | |
4075 | if (nested_in_vect_loop_p (loop, stmt)) | |
4076 | return false; | |
4077 | ||
4078 | code = gimple_assign_rhs_code (stmt); | |
4079 | op_type = TREE_CODE_LENGTH (code); | |
4080 | rhs_class = get_gimple_rhs_class (code); | |
4081 | gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op); | |
4082 | gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op); | |
4083 | ||
4084 | /* FORNOW: support only if all uses are invariant. This means | |
4085 | that the scalar operations can remain in place, unvectorized. | |
4086 | The original last scalar value that they compute will be used. */ | |
4087 | ||
4088 | for (i = 0; i < op_type; i++) | |
4089 | { | |
4090 | if (rhs_class == GIMPLE_SINGLE_RHS) | |
4091 | op = TREE_OPERAND (gimple_op (stmt, 1), i); | |
4092 | else | |
4093 | op = gimple_op (stmt, i + 1); | |
a70d6342 IR |
4094 | if (op |
4095 | && !vect_is_simple_use (op, loop_vinfo, NULL, &def_stmt, &def, &dt)) | |
ebfd146a IR |
4096 | { |
4097 | if (vect_print_dump_info (REPORT_DETAILS)) | |
4098 | fprintf (vect_dump, "use not simple."); | |
4099 | return false; | |
4100 | } | |
4101 | ||
8644a673 | 4102 | if (dt != vect_external_def && dt != vect_constant_def) |
ebfd146a IR |
4103 | return false; |
4104 | } | |
4105 | ||
4106 | /* No transformation is required for the cases we currently support. */ | |
4107 | return true; | |
4108 | } | |
4109 | ||
a83452e9 AO |
4110 | /* Kill any debug uses outside LOOP of SSA names defined in STMT. */ |
4111 | ||
4112 | static void | |
4113 | vect_loop_kill_debug_uses (struct loop *loop, gimple stmt) | |
4114 | { | |
4115 | ssa_op_iter op_iter; | |
4116 | imm_use_iterator imm_iter; | |
4117 | def_operand_p def_p; | |
4118 | gimple ustmt; | |
4119 | ||
4120 | FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF) | |
4121 | { | |
4122 | FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p)) | |
4123 | { | |
4124 | basic_block bb; | |
4125 | ||
4126 | if (!is_gimple_debug (ustmt)) | |
4127 | continue; | |
4128 | ||
4129 | bb = gimple_bb (ustmt); | |
4130 | ||
4131 | if (!flow_bb_inside_loop_p (loop, bb)) | |
4132 | { | |
4133 | if (gimple_debug_bind_p (ustmt)) | |
4134 | { | |
4135 | if (vect_print_dump_info (REPORT_DETAILS)) | |
4136 | fprintf (vect_dump, "killing debug use"); | |
4137 | ||
4138 | gimple_debug_bind_reset_value (ustmt); | |
4139 | update_stmt (ustmt); | |
4140 | } | |
4141 | else | |
4142 | gcc_unreachable (); | |
4143 | } | |
4144 | } | |
4145 | } | |
4146 | } | |
4147 | ||
ebfd146a IR |
4148 | /* Function vect_transform_loop. |
4149 | ||
4150 | The analysis phase has determined that the loop is vectorizable. | |
4151 | Vectorize the loop - created vectorized stmts to replace the scalar | |
4152 | stmts in the loop, and update the loop exit condition. */ | |
4153 | ||
4154 | void | |
4155 | vect_transform_loop (loop_vec_info loop_vinfo) | |
4156 | { | |
4157 | struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
4158 | basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); | |
4159 | int nbbs = loop->num_nodes; | |
4160 | gimple_stmt_iterator si; | |
4161 | int i; | |
4162 | tree ratio = NULL; | |
4163 | int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); | |
4164 | bool strided_store; | |
4165 | bool slp_scheduled = false; | |
4166 | unsigned int nunits; | |
86290011 RG |
4167 | tree cond_expr = NULL_TREE; |
4168 | gimple_seq cond_expr_stmt_list = NULL; | |
4169 | bool do_peeling_for_loop_bound; | |
ebfd146a IR |
4170 | |
4171 | if (vect_print_dump_info (REPORT_DETAILS)) | |
4172 | fprintf (vect_dump, "=== vec_transform_loop ==="); | |
4173 | ||
86290011 RG |
4174 | /* Peel the loop if there are data refs with unknown alignment. |
4175 | Only one data ref with unknown store is allowed. */ | |
4176 | ||
4177 | if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo)) | |
4178 | vect_do_peeling_for_alignment (loop_vinfo); | |
4179 | ||
4180 | do_peeling_for_loop_bound | |
4181 | = (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) | |
4182 | || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) | |
4183 | && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0)); | |
4184 | ||
e9dbe7bb IR |
4185 | if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo) |
4186 | || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)) | |
86290011 RG |
4187 | vect_loop_versioning (loop_vinfo, |
4188 | !do_peeling_for_loop_bound, | |
4189 | &cond_expr, &cond_expr_stmt_list); | |
ebfd146a | 4190 | |
ebfd146a IR |
4191 | /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a |
4192 | compile time constant), or it is a constant that doesn't divide by the | |
4193 | vectorization factor, then an epilog loop needs to be created. | |
4194 | We therefore duplicate the loop: the original loop will be vectorized, | |
4195 | and will compute the first (n/VF) iterations. The second copy of the loop | |
4196 | will remain scalar and will compute the remaining (n%VF) iterations. | |
4197 | (VF is the vectorization factor). */ | |
4198 | ||
86290011 RG |
4199 | if (do_peeling_for_loop_bound) |
4200 | vect_do_peeling_for_loop_bound (loop_vinfo, &ratio, | |
4201 | cond_expr, cond_expr_stmt_list); | |
ebfd146a IR |
4202 | else |
4203 | ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)), | |
4204 | LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor); | |
4205 | ||
4206 | /* 1) Make sure the loop header has exactly two entries | |
4207 | 2) Make sure we have a preheader basic block. */ | |
4208 | ||
4209 | gcc_assert (EDGE_COUNT (loop->header->preds) == 2); | |
4210 | ||
4211 | split_edge (loop_preheader_edge (loop)); | |
4212 | ||
4213 | /* FORNOW: the vectorizer supports only loops which body consist | |
b8698a0f L |
4214 | of one basic block (header + empty latch). When the vectorizer will |
4215 | support more involved loop forms, the order by which the BBs are | |
ebfd146a IR |
4216 | traversed need to be reconsidered. */ |
4217 | ||
4218 | for (i = 0; i < nbbs; i++) | |
4219 | { | |
4220 | basic_block bb = bbs[i]; | |
4221 | stmt_vec_info stmt_info; | |
4222 | gimple phi; | |
4223 | ||
4224 | for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si)) | |
4225 | { | |
4226 | phi = gsi_stmt (si); | |
4227 | if (vect_print_dump_info (REPORT_DETAILS)) | |
4228 | { | |
4229 | fprintf (vect_dump, "------>vectorizing phi: "); | |
4230 | print_gimple_stmt (vect_dump, phi, 0, TDF_SLIM); | |
4231 | } | |
4232 | stmt_info = vinfo_for_stmt (phi); | |
4233 | if (!stmt_info) | |
4234 | continue; | |
4235 | ||
4236 | if (!STMT_VINFO_RELEVANT_P (stmt_info) | |
4237 | && !STMT_VINFO_LIVE_P (stmt_info)) | |
a83452e9 AO |
4238 | { |
4239 | if (MAY_HAVE_DEBUG_STMTS) | |
4240 | vect_loop_kill_debug_uses (loop, phi); | |
4241 | continue; | |
4242 | } | |
ebfd146a IR |
4243 | |
4244 | if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)) | |
4245 | != (unsigned HOST_WIDE_INT) vectorization_factor) | |
4246 | && vect_print_dump_info (REPORT_DETAILS)) | |
4247 | fprintf (vect_dump, "multiple-types."); | |
4248 | ||
4249 | if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def) | |
4250 | { | |
4251 | if (vect_print_dump_info (REPORT_DETAILS)) | |
4252 | fprintf (vect_dump, "transform phi."); | |
4253 | vect_transform_stmt (phi, NULL, NULL, NULL, NULL); | |
4254 | } | |
4255 | } | |
4256 | ||
4257 | for (si = gsi_start_bb (bb); !gsi_end_p (si);) | |
4258 | { | |
4259 | gimple stmt = gsi_stmt (si); | |
4260 | bool is_store; | |
4261 | ||
4262 | if (vect_print_dump_info (REPORT_DETAILS)) | |
4263 | { | |
4264 | fprintf (vect_dump, "------>vectorizing statement: "); | |
4265 | print_gimple_stmt (vect_dump, stmt, 0, TDF_SLIM); | |
b8698a0f | 4266 | } |
ebfd146a IR |
4267 | |
4268 | stmt_info = vinfo_for_stmt (stmt); | |
4269 | ||
4270 | /* vector stmts created in the outer-loop during vectorization of | |
4271 | stmts in an inner-loop may not have a stmt_info, and do not | |
4272 | need to be vectorized. */ | |
4273 | if (!stmt_info) | |
4274 | { | |
4275 | gsi_next (&si); | |
4276 | continue; | |
4277 | } | |
4278 | ||
4279 | if (!STMT_VINFO_RELEVANT_P (stmt_info) | |
4280 | && !STMT_VINFO_LIVE_P (stmt_info)) | |
4281 | { | |
a83452e9 AO |
4282 | if (MAY_HAVE_DEBUG_STMTS) |
4283 | vect_loop_kill_debug_uses (loop, stmt); | |
ebfd146a IR |
4284 | gsi_next (&si); |
4285 | continue; | |
4286 | } | |
4287 | ||
4288 | gcc_assert (STMT_VINFO_VECTYPE (stmt_info)); | |
4289 | nunits = | |
4290 | (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)); | |
4291 | if (!STMT_SLP_TYPE (stmt_info) | |
4292 | && nunits != (unsigned int) vectorization_factor | |
4293 | && vect_print_dump_info (REPORT_DETAILS)) | |
4294 | /* For SLP VF is set according to unrolling factor, and not to | |
4295 | vector size, hence for SLP this print is not valid. */ | |
4296 | fprintf (vect_dump, "multiple-types."); | |
4297 | ||
4298 | /* SLP. Schedule all the SLP instances when the first SLP stmt is | |
4299 | reached. */ | |
4300 | if (STMT_SLP_TYPE (stmt_info)) | |
4301 | { | |
4302 | if (!slp_scheduled) | |
4303 | { | |
4304 | slp_scheduled = true; | |
4305 | ||
4306 | if (vect_print_dump_info (REPORT_DETAILS)) | |
4307 | fprintf (vect_dump, "=== scheduling SLP instances ==="); | |
4308 | ||
a70d6342 | 4309 | vect_schedule_slp (loop_vinfo, NULL); |
ebfd146a IR |
4310 | } |
4311 | ||
4312 | /* Hybrid SLP stmts must be vectorized in addition to SLP. */ | |
c4551b28 | 4313 | if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info)) |
ebfd146a IR |
4314 | { |
4315 | gsi_next (&si); | |
4316 | continue; | |
4317 | } | |
4318 | } | |
b8698a0f | 4319 | |
ebfd146a IR |
4320 | /* -------- vectorize statement ------------ */ |
4321 | if (vect_print_dump_info (REPORT_DETAILS)) | |
4322 | fprintf (vect_dump, "transform statement."); | |
4323 | ||
4324 | strided_store = false; | |
4325 | is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL, NULL); | |
4326 | if (is_store) | |
4327 | { | |
4328 | if (STMT_VINFO_STRIDED_ACCESS (stmt_info)) | |
4329 | { | |
4330 | /* Interleaving. If IS_STORE is TRUE, the vectorization of the | |
4331 | interleaving chain was completed - free all the stores in | |
4332 | the chain. */ | |
4333 | vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info)); | |
4334 | gsi_remove (&si, true); | |
4335 | continue; | |
4336 | } | |
4337 | else | |
4338 | { | |
4339 | /* Free the attached stmt_vec_info and remove the stmt. */ | |
4340 | free_stmt_vec_info (stmt); | |
4341 | gsi_remove (&si, true); | |
4342 | continue; | |
4343 | } | |
4344 | } | |
4345 | gsi_next (&si); | |
4346 | } /* stmts in BB */ | |
4347 | } /* BBs in loop */ | |
4348 | ||
4349 | slpeel_make_loop_iterate_ntimes (loop, ratio); | |
4350 | ||
ebfd146a IR |
4351 | /* The memory tags and pointers in vectorized statements need to |
4352 | have their SSA forms updated. FIXME, why can't this be delayed | |
4353 | until all the loops have been transformed? */ | |
4354 | update_ssa (TODO_update_ssa); | |
4355 | ||
8644a673 | 4356 | if (vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS)) |
ebfd146a | 4357 | fprintf (vect_dump, "LOOP VECTORIZED."); |
8644a673 | 4358 | if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOCATIONS)) |
ebfd146a IR |
4359 | fprintf (vect_dump, "OUTER LOOP VECTORIZED."); |
4360 | } |