/* Transformation Utilities for Loop Vectorization.
   Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
   Contributed by Dorit Naishlos <dorit@il.ibm.com>

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "tm.h"
#include "ggc.h"
#include "tree.h"
#include "target.h"
#include "rtl.h"
#include "basic-block.h"
#include "diagnostic.h"
#include "tree-flow.h"
#include "tree-dump.h"
#include "timevar.h"
#include "cfgloop.h"
#include "expr.h"
#include "optabs.h"
#include "params.h"
#include "recog.h"
#include "tree-data-ref.h"
#include "tree-chrec.h"
#include "tree-scalar-evolution.h"
#include "tree-vectorizer.h"
#include "langhooks.h"
#include "tree-pass.h"
#include "toplev.h"
#include "real.h"

/* Utility functions for the code transformation.  */
static bool vect_transform_stmt (tree, block_stmt_iterator *, bool *, slp_tree);
static tree vect_create_destination_var (tree, tree);
static tree vect_create_data_ref_ptr
  (tree, struct loop*, tree, tree *, tree *, bool, tree, bool *);
static tree vect_create_addr_base_for_vector_ref
  (tree, tree *, tree, struct loop *);
static tree vect_get_new_vect_var (tree, enum vect_var_kind, const char *);
static tree vect_get_vec_def_for_operand (tree, tree, tree *);
static tree vect_init_vector (tree, tree, tree, block_stmt_iterator *);
static void vect_finish_stmt_generation
  (tree stmt, tree vec_stmt, block_stmt_iterator *);
static bool vect_is_simple_cond (tree, loop_vec_info);
static void vect_create_epilog_for_reduction (tree, tree, enum tree_code, tree);
static tree get_initial_def_for_reduction (tree, tree, tree *);

/* Utility function dealing with loop peeling (not peeling itself).  */
static void vect_generate_tmps_on_preheader
  (loop_vec_info, tree *, tree *, tree *);
static tree vect_build_loop_niters (loop_vec_info);
static void vect_update_ivs_after_vectorizer (loop_vec_info, tree, edge);
static tree vect_gen_niters_for_prolog_loop (loop_vec_info, tree);
static void vect_update_init_of_dr (struct data_reference *, tree niters);
static void vect_update_inits_of_drs (loop_vec_info, tree);
static int vect_min_worthwhile_factor (enum tree_code);


static int
cost_for_stmt (tree stmt)
{
  stmt_vec_info stmt_info = vinfo_for_stmt (stmt);

  switch (STMT_VINFO_TYPE (stmt_info))
    {
    case load_vec_info_type:
      return TARG_SCALAR_LOAD_COST;
    case store_vec_info_type:
      return TARG_SCALAR_STORE_COST;
    case op_vec_info_type:
    case condition_vec_info_type:
    case assignment_vec_info_type:
    case reduc_vec_info_type:
    case induc_vec_info_type:
    case type_promotion_vec_info_type:
    case type_demotion_vec_info_type:
    case type_conversion_vec_info_type:
    case call_vec_info_type:
      return TARG_SCALAR_STMT_COST;
    case undef_vec_info_type:
    default:
      gcc_unreachable ();
    }
}


/* Function vect_estimate_min_profitable_iters

   Return the number of iterations required for the vector version of the
   loop to be profitable relative to the cost of the scalar version of the
   loop.

   TODO: Take profile info into account before making vectorization
   decisions, if available.  */

int
vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo)
{
  int i;
  int min_profitable_iters;
  int peel_iters_prologue;
  int peel_iters_epilogue;
  int vec_inside_cost = 0;
  int vec_outside_cost = 0;
  int scalar_single_iter_cost = 0;
  int scalar_outside_cost = 0;
  bool runtime_test = false;
  int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo);
  int peel_guard_costs = 0;
  int innerloop_iters = 0, factor;
  VEC (slp_instance, heap) *slp_instances;
  slp_instance instance;

  /* Cost model disabled.  */
  if (!flag_vect_cost_model)
    {
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model disabled.");
      return 0;
    }

  /* If the number of iterations is unknown, or the
     peeling-for-misalignment amount is unknown, we will have to generate
     a runtime test to test the loop count against the threshold.  */
  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || (byte_misalign < 0))
    runtime_test = true;

  /* Requires loop versioning tests to handle misalignment.  */

  if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)))
    {
      /* FIXME: Make cost depend on complexity of individual check.  */
      vec_outside_cost +=
        VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo));
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: Adding cost of checks for loop "
                 "versioning to treat misalignment.\n");
    }

  if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
    {
      /* FIXME: Make cost depend on complexity of individual check.  */
      vec_outside_cost +=
        VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo));
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: Adding cost of checks for loop "
                 "versioning aliasing.\n");
    }

  if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
      || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
    {
      vec_outside_cost += TARG_COND_TAKEN_BRANCH_COST;
    }

  /* Count statements in scalar loop.  Using this as scalar cost for a single
     iteration for now.

     TODO: Add outer loop support.

     TODO: Consider assigning different costs to different scalar
     statements.  */

  /* FORNOW.  */
  if (loop->inner)
    innerloop_iters = 50; /* FIXME */

  for (i = 0; i < nbbs; i++)
    {
      block_stmt_iterator si;
      basic_block bb = bbs[i];

      if (bb->loop_father == loop->inner)
        factor = innerloop_iters;
      else
        factor = 1;

      for (si = bsi_start (bb); !bsi_end_p (si); bsi_next (&si))
        {
          tree stmt = bsi_stmt (si);
          stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
          /* Skip stmts that are not vectorized inside the loop.  */
          if (!STMT_VINFO_RELEVANT_P (stmt_info)
              && (!STMT_VINFO_LIVE_P (stmt_info)
                  || STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def))
            continue;
          scalar_single_iter_cost += cost_for_stmt (stmt) * factor;
          vec_inside_cost += STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) * factor;
          /* FIXME: for stmts in the inner-loop in outer-loop vectorization,
             some of the "outside" costs are generated inside the outer-loop.  */
          vec_outside_cost += STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info);
        }
    }

  /* Add additional cost for the peeled instructions in prologue and epilogue
     loop.

     FORNOW: If we don't know the value of peel_iters for prologue or epilogue
     at compile-time - we assume it's vf/2 (the worst would be vf-1).

     TODO: Build an expression that represents peel_iters for prologue and
     epilogue to be used in a run-time test.  */

  if (byte_misalign < 0)
    {
      peel_iters_prologue = vf/2;
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: "
                 "prologue peel iters set to vf/2.");

      /* If peeling for alignment is unknown, loop bound of main loop becomes
         unknown.  */
      peel_iters_epilogue = vf/2;
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: "
                 "epilogue peel iters set to vf/2 because "
                 "peeling for alignment is unknown.");

      /* If peeled iterations are unknown, count a taken branch and a not taken
         branch per peeled loop.  Even if scalar loop iterations are known,
         vector iterations are not known since peeled prologue iterations are
         not known.  Hence guards remain the same.  */
      peel_guard_costs += 2 * (TARG_COND_TAKEN_BRANCH_COST
                               + TARG_COND_NOT_TAKEN_BRANCH_COST);

    }
  else
    {
      if (byte_misalign)
        {
          struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
          int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr))));
          tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
          int nelements = TYPE_VECTOR_SUBPARTS (vectype);

          peel_iters_prologue = nelements - (byte_misalign / element_size);
        }
      else
        peel_iters_prologue = 0;

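      /* Worked example (illustrative only, not part of the original source):
         for a V4SI access (nelements = 4, element_size = 4 bytes) with a
         known misalignment of 8 bytes, the prologue must peel
         4 - (8 / 4) = 2 scalar iterations to reach an aligned address.  */
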
      if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
        {
          peel_iters_epilogue = vf/2;
          if (vect_print_dump_info (REPORT_COST))
            fprintf (vect_dump, "cost model: "
                     "epilogue peel iters set to vf/2 because "
                     "loop iterations are unknown.");

          /* If peeled iterations are known but number of scalar loop
             iterations are unknown, count a taken branch per peeled loop.  */
          peel_guard_costs += 2 * TARG_COND_TAKEN_BRANCH_COST;

        }
      else
        {
          int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
          peel_iters_prologue = niters < peel_iters_prologue ?
                                niters : peel_iters_prologue;
          peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
        }
    }

  vec_outside_cost += (peel_iters_prologue * scalar_single_iter_cost)
                      + (peel_iters_epilogue * scalar_single_iter_cost)
                      + peel_guard_costs;

  /* FORNOW: The scalar outside cost is incremented in one of the
     following ways:

     1. The vectorizer checks for alignment and aliasing and generates
     a condition that allows dynamic vectorization.  A cost model
     check is ANDED with the versioning condition.  Hence scalar code
     path now has the added cost of the versioning check.

       if (cost > th & versioning_check)
         jmp to vector code

     Hence run-time scalar is incremented by not-taken branch cost.

     2. The vectorizer then checks if a prologue is required.  If the
     cost model check was not done before during versioning, it has to
     be done before the prologue check.

       if (cost <= th)
         prologue = scalar_iters
       if (prologue == 0)
         jmp to vector code
       else
         execute prologue
       if (prologue == num_iters)
         go to exit

     Hence the run-time scalar cost is incremented by a taken branch,
     plus a not-taken branch, plus a taken branch cost.

     3. The vectorizer then checks if an epilogue is required.  If the
     cost model check was not done before during prologue check, it
     has to be done with the epilogue check.

       if (prologue == 0)
         jmp to vector code
       else
         execute prologue
       if (prologue == num_iters)
         go to exit
       vector code:
         if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
           jmp to epilogue

     Hence the run-time scalar cost should be incremented by 2 taken
     branches.

     TODO: The back end may reorder the BBS's differently and reverse
     conditions/branch directions.  Change the estimates below to
     something more reasonable.  */

  if (runtime_test)
    {
      /* Cost model check occurs at versioning.  */
      if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))
          || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)))
        scalar_outside_cost += TARG_COND_NOT_TAKEN_BRANCH_COST;
      else
        {
          /* Cost model occurs at prologue generation.  */
          if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
            scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST
              + TARG_COND_NOT_TAKEN_BRANCH_COST;
          /* Cost model check occurs at epilogue generation.  */
          else
            scalar_outside_cost += 2 * TARG_COND_TAKEN_BRANCH_COST;
        }
    }

  /* Add SLP costs.  */
  slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
  for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++)
    {
      vec_outside_cost += SLP_INSTANCE_OUTSIDE_OF_LOOP_COST (instance);
      vec_inside_cost += SLP_INSTANCE_INSIDE_OF_LOOP_COST (instance);
    }

  /* Calculate number of iterations required to make the vector version
     profitable, relative to the loop bodies only.  The following condition
     must hold true:
     SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
     where
     SIC = scalar iteration cost, VIC = vector iteration cost,
     VOC = vector outside cost, VF = vectorization factor,
     PL_ITERS = prologue iterations, EP_ITERS = epilogue iterations
     SOC = scalar outside cost for run time cost model check.  */

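  /* Worked example (illustrative only, not part of the original source):
     with SIC = 4, VIC = 6, VOC = 14, SOC = 0, VF = 4 and
     PL_ITERS = EP_ITERS = 0, the inequality becomes
     4 * niters > 6 * (niters/4) + 14, which first holds for niters > 5.6,
     i.e. at least 6 scalar iterations.  The code below solves the same
     inequality symbolically (scaled by VF) to obtain min_profitable_iters.  */
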
  if ((scalar_single_iter_cost * vf) > vec_inside_cost)
    {
      if (vec_outside_cost <= 0)
        min_profitable_iters = 1;
      else
        {
          min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
                                  - vec_inside_cost * peel_iters_prologue
                                  - vec_inside_cost * peel_iters_epilogue)
                                 / ((scalar_single_iter_cost * vf)
                                    - vec_inside_cost);

          if ((scalar_single_iter_cost * vf * min_profitable_iters)
              <= ((vec_inside_cost * min_profitable_iters)
                  + ((vec_outside_cost - scalar_outside_cost) * vf)))
            min_profitable_iters++;
        }
    }
  /* vector version will never be profitable.  */
  else
    {
      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "cost model: vector iteration cost = %d "
                 "is divisible by scalar iteration cost = %d by a factor "
                 "greater than or equal to the vectorization factor = %d .",
                 vec_inside_cost, scalar_single_iter_cost, vf);
      return -1;
    }

  if (vect_print_dump_info (REPORT_COST))
    {
      fprintf (vect_dump, "Cost model analysis: \n");
      fprintf (vect_dump, "  Vector inside of loop cost: %d\n",
               vec_inside_cost);
      fprintf (vect_dump, "  Vector outside of loop cost: %d\n",
               vec_outside_cost);
      fprintf (vect_dump, "  Scalar iteration cost: %d\n",
               scalar_single_iter_cost);
      fprintf (vect_dump, "  Scalar outside cost: %d\n", scalar_outside_cost);
      fprintf (vect_dump, "  prologue iterations: %d\n",
               peel_iters_prologue);
      fprintf (vect_dump, "  epilogue iterations: %d\n",
               peel_iters_epilogue);
      fprintf (vect_dump, "  Calculated minimum iters for profitability: %d\n",
               min_profitable_iters);
    }

  min_profitable_iters =
        min_profitable_iters < vf ? vf : min_profitable_iters;

  /* Because the condition we create is:
     if (niters <= min_profitable_iters)
       then skip the vectorized loop.  */
  min_profitable_iters--;

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "  Profitability threshold = %d\n",
             min_profitable_iters);

  return min_profitable_iters;
}


/* TODO: Close dependency between vect_model_*_cost and vectorizable_*
   functions.  Design better to avoid maintenance issues.  */

/* Function vect_model_reduction_cost.

   Models cost for a reduction operation, including the vector ops
   generated within the strip-mine loop, the initial definition before
   the loop, and the epilogue code that must be generated.  */

static bool
vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
                           int ncopies)
{
  int outer_cost = 0;
  enum tree_code code;
  optab optab;
  tree vectype;
  tree orig_stmt;
  tree reduction_op;
  enum machine_mode mode;
  tree operation = GIMPLE_STMT_OPERAND (STMT_VINFO_STMT (stmt_info), 1);
  int op_type = TREE_CODE_LENGTH (TREE_CODE (operation));
  loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
  struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Cost of reduction op inside loop.  */
  STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) += ncopies * TARG_VEC_STMT_COST;

  reduction_op = TREE_OPERAND (operation, op_type-1);
  vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
  if (!vectype)
    {
      if (vect_print_dump_info (REPORT_COST))
        {
          fprintf (vect_dump, "unsupported data-type ");
          print_generic_expr (vect_dump, TREE_TYPE (reduction_op), TDF_SLIM);
        }
      return false;
    }

  mode = TYPE_MODE (vectype);
  orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);

  if (!orig_stmt)
    orig_stmt = STMT_VINFO_STMT (stmt_info);

  code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1));

  /* Add in cost for initial definition.  */
  outer_cost += TARG_SCALAR_TO_VEC_COST;

  /* Determine cost of epilogue code.

     We have a reduction operator that will reduce the vector in one statement.
     Also requires scalar extract.  */

  if (!nested_in_vect_loop_p (loop, orig_stmt))
    {
      if (reduc_code < NUM_TREE_CODES)
        outer_cost += TARG_VEC_STMT_COST + TARG_VEC_TO_SCALAR_COST;
      else
        {
          int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1);
          tree bitsize =
            TYPE_SIZE (TREE_TYPE (GIMPLE_STMT_OPERAND (orig_stmt, 0)));
          int element_bitsize = tree_low_cst (bitsize, 1);
          int nelements = vec_size_in_bits / element_bitsize;

          optab = optab_for_tree_code (code, vectype);

          /* We have a whole vector shift available.  */
          if (VECTOR_MODE_P (mode)
              && optab_handler (optab, mode)->insn_code != CODE_FOR_nothing
              && optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing)
            /* Final reduction via vector shifts and the reduction operator.  Also
               requires scalar extract.  */
            outer_cost += ((exact_log2(nelements) * 2) * TARG_VEC_STMT_COST
                           + TARG_VEC_TO_SCALAR_COST);
          else
            /* Use extracts and reduction op for final reduction.  For N elements,
               we have N extracts and N-1 reduction ops.  */
            outer_cost += ((nelements + nelements - 1) * TARG_VEC_STMT_COST);
        }
    }
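
  /* Illustrative numbers (added for clarity, not in the original source):
     for a V4SF reduction that cannot use a single reduc_code statement,
     the whole-vector-shift path above costs exact_log2 (4) * 2 = 4 vector
     statements plus one vector-to-scalar extract, while the fallback path
     costs 4 extracts + 3 reduction ops = 7 vector statements.  */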

  STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = outer_cost;

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_reduction_cost: inside_cost = %d, "
             "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
             STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));

  return true;
}


/* Function vect_model_induction_cost.

   Models cost for induction operations.  */

static void
vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
{
  /* loop cost for vec_loop.  */
  STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info) = ncopies * TARG_VEC_STMT_COST;
  /* prologue cost for vec_init and vec_step.  */
  STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info) = 2 * TARG_SCALAR_TO_VEC_COST;

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_induction_cost: inside_cost = %d, "
             "outside_cost = %d .", STMT_VINFO_INSIDE_OF_LOOP_COST (stmt_info),
             STMT_VINFO_OUTSIDE_OF_LOOP_COST (stmt_info));
}


/* Function vect_model_simple_cost.

   Models cost for simple operations, i.e. those that only emit ncopies of a
   single op.  Right now, this does not account for multiple insns that could
   be generated for the single vector op.  We will handle that shortly.  */

void
vect_model_simple_cost (stmt_vec_info stmt_info, int ncopies,
                        enum vect_def_type *dt, slp_tree slp_node)
{
  int i;
  int inside_cost = 0, outside_cost = 0;

  inside_cost = ncopies * TARG_VEC_STMT_COST;

  /* FORNOW: Assuming maximum 2 args per stmt.  */
  for (i = 0; i < 2; i++)
    {
      if (dt[i] == vect_constant_def || dt[i] == vect_invariant_def)
        outside_cost += TARG_SCALAR_TO_VEC_COST;
    }

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_simple_cost: inside_cost = %d, "
             "outside_cost = %d .", inside_cost, outside_cost);

  /* Set the costs either in STMT_INFO or SLP_NODE (if exists).  */
  stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
  stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
}


/* Function vect_cost_strided_group_size

   For strided load or store, return the group_size only if it is the first
   load or store of a group, else return 1.  This ensures that group size is
   only returned once per group.  */

static int
vect_cost_strided_group_size (stmt_vec_info stmt_info)
{
  tree first_stmt = DR_GROUP_FIRST_DR (stmt_info);

  if (first_stmt == STMT_VINFO_STMT (stmt_info))
    return DR_GROUP_SIZE (stmt_info);

  return 1;
}


/* Function vect_model_store_cost

   Models cost for stores.  In the case of strided accesses, one access
   has the overhead of the strided access attributed to it.  */

void
vect_model_store_cost (stmt_vec_info stmt_info, int ncopies,
                       enum vect_def_type dt, slp_tree slp_node)
{
  int group_size;
  int inside_cost = 0, outside_cost = 0;

  if (dt == vect_constant_def || dt == vect_invariant_def)
    outside_cost = TARG_SCALAR_TO_VEC_COST;

  /* Strided access?  */
  if (DR_GROUP_FIRST_DR (stmt_info))
    group_size = vect_cost_strided_group_size (stmt_info);
  /* Not a strided access.  */
  else
    group_size = 1;

  /* Is this an access in a group of stores, which provide strided access?
     If so, add in the cost of the permutes.  */
  if (group_size > 1)
    {
      /* Uses a high and low interleave operation for each needed permute.  */
      inside_cost = ncopies * exact_log2(group_size) * group_size
                    * TARG_VEC_STMT_COST;

      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "vect_model_store_cost: strided group_size = %d .",
                 group_size);

    }
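
  /* Illustrative numbers (added for clarity, not in the original source):
     interleaving a group of 4 stores needs exact_log2 (4) * 4 = 8 high/low
     interleave operations per copy, so with ncopies = 1 the strided
     overhead computed above is 8 * TARG_VEC_STMT_COST.  */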

  /* Costs of the stores.  */
  inside_cost += ncopies * TARG_VEC_STORE_COST;

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_store_cost: inside_cost = %d, "
             "outside_cost = %d .", inside_cost, outside_cost);

  /* Set the costs either in STMT_INFO or SLP_NODE (if exists).  */
  stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
  stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
}


/* Function vect_model_load_cost

   Models cost for loads.  In the case of strided accesses, the last access
   has the overhead of the strided access attributed to it.  Since unaligned
   accesses are supported for loads, we also account for the costs of the
   access scheme chosen.  */

void
vect_model_load_cost (stmt_vec_info stmt_info, int ncopies, slp_tree slp_node)

{
  int group_size;
  int alignment_support_cheme;
  tree first_stmt;
  struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr;
  int inside_cost = 0, outside_cost = 0;

  /* Strided accesses?  */
  first_stmt = DR_GROUP_FIRST_DR (stmt_info);
  if (first_stmt && !slp_node)
    {
      group_size = vect_cost_strided_group_size (stmt_info);
      first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt));
    }
  /* Not a strided access.  */
  else
    {
      group_size = 1;
      first_dr = dr;
    }

  alignment_support_cheme = vect_supportable_dr_alignment (first_dr);

  /* Is this an access in a group of loads providing strided access?
     If so, add in the cost of the permutes.  */
  if (group_size > 1)
    {
      /* Uses even and odd extract operations for each needed permute.  */
      inside_cost = ncopies * exact_log2(group_size) * group_size
                    * TARG_VEC_STMT_COST;

      if (vect_print_dump_info (REPORT_COST))
        fprintf (vect_dump, "vect_model_load_cost: strided group_size = %d .",
                 group_size);

    }

  /* The loads themselves.  */
  switch (alignment_support_cheme)
    {
    case dr_aligned:
      {
        inside_cost += ncopies * TARG_VEC_LOAD_COST;

        if (vect_print_dump_info (REPORT_COST))
          fprintf (vect_dump, "vect_model_load_cost: aligned.");

        break;
      }
    case dr_unaligned_supported:
      {
        /* Here, we assign an additional cost for the unaligned load.  */
        inside_cost += ncopies * TARG_VEC_UNALIGNED_LOAD_COST;

        if (vect_print_dump_info (REPORT_COST))
          fprintf (vect_dump, "vect_model_load_cost: unaligned supported by "
                   "hardware.");

        break;
      }
    case dr_explicit_realign:
      {
        inside_cost += ncopies * (2*TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);

        /* FIXME: If the misalignment remains fixed across the iterations of
           the containing loop, the following cost should be added to the
           outside costs.  */
        if (targetm.vectorize.builtin_mask_for_load)
          inside_cost += TARG_VEC_STMT_COST;

        break;
      }
    case dr_explicit_realign_optimized:
      {
        if (vect_print_dump_info (REPORT_COST))
          fprintf (vect_dump, "vect_model_load_cost: unaligned software "
                   "pipelined.");

        /* Unaligned software pipeline has a load of an address, an initial
           load, and possibly a mask operation to "prime" the loop.  However,
           if this is an access in a group of loads, which provide strided
           access, then the above cost should only be considered for one
           access in the group.  Inside the loop, there is a load op
           and a realignment op.  */

        if ((!DR_GROUP_FIRST_DR (stmt_info)) || group_size > 1 || slp_node)
          {
            outside_cost = 2*TARG_VEC_STMT_COST;
            if (targetm.vectorize.builtin_mask_for_load)
              outside_cost += TARG_VEC_STMT_COST;
          }

        inside_cost += ncopies * (TARG_VEC_LOAD_COST + TARG_VEC_STMT_COST);

        break;
      }

    default:
      gcc_unreachable ();
    }

  if (vect_print_dump_info (REPORT_COST))
    fprintf (vect_dump, "vect_model_load_cost: inside_cost = %d, "
             "outside_cost = %d .", inside_cost, outside_cost);

  /* Set the costs either in STMT_INFO or SLP_NODE (if exists).  */
  stmt_vinfo_set_inside_of_loop_cost (stmt_info, slp_node, inside_cost);
  stmt_vinfo_set_outside_of_loop_cost (stmt_info, slp_node, outside_cost);
}


/* Function vect_get_new_vect_var.

   Returns a name for a new variable.  The current naming scheme appends the
   prefix "vect_" or "vect_p" (depending on the value of VAR_KIND) to
   the name of vectorizer generated variables, and appends that to NAME if
   provided.  */
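
/* Illustrative note (added for clarity, not part of the original comment):
   with VAR_KIND == vect_pointer_var and NAME == "in", the code below builds
   the name "vect_pin" via concat (prefix, name, NULL); when NAME is NULL the
   temporary is simply named after the prefix.  */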
778 | ||
779 | static tree | |
780 | vect_get_new_vect_var (tree type, enum vect_var_kind var_kind, const char *name) | |
781 | { | |
782 | const char *prefix; | |
f7064d11 DN |
783 | tree new_vect_var; |
784 | ||
61d3cdbb DN |
785 | switch (var_kind) |
786 | { | |
787 | case vect_simple_var: | |
788 | prefix = "vect_"; | |
789 | break; | |
790 | case vect_scalar_var: | |
791 | prefix = "stmp_"; | |
792 | break; | |
793 | case vect_pointer_var: | |
f7064d11 | 794 | prefix = "vect_p"; |
61d3cdbb DN |
795 | break; |
796 | default: | |
797 | gcc_unreachable (); | |
798 | } | |
f7064d11 | 799 | |
f7064d11 | 800 | if (name) |
639d3040 DM |
801 | { |
802 | char* tmp = concat (prefix, name, NULL); | |
803 | new_vect_var = create_tmp_var (type, tmp); | |
804 | free (tmp); | |
805 | } | |
f7064d11 DN |
806 | else |
807 | new_vect_var = create_tmp_var (type, prefix); | |
808 | ||
0890b981 AP |
809 | /* Mark vector typed variable as a gimple register variable. */ |
810 | if (TREE_CODE (type) == VECTOR_TYPE) | |
811 | DECL_GIMPLE_REG_P (new_vect_var) = true; | |
812 | ||
f7064d11 DN |
813 | return new_vect_var; |
814 | } | |
815 | ||
816 | ||
f7064d11 DN |
817 | /* Function vect_create_addr_base_for_vector_ref. |
818 | ||
819 | Create an expression that computes the address of the first memory location | |
820 | that will be accessed for a data reference. | |
821 | ||
822 | Input: | |
823 | STMT: The statement containing the data reference. | |
824 | NEW_STMT_LIST: Must be initialized to NULL_TREE or a statement list. | |
825 | OFFSET: Optional. If supplied, it is be added to the initial address. | |
468c2ac0 DN |
826 | LOOP: Specify relative to which loop-nest should the address be computed. |
827 | For example, when the dataref is in an inner-loop nested in an | |
828 | outer-loop that is now being vectorized, LOOP can be either the | |
829 | outer-loop, or the inner-loop. The first memory location accessed | |
830 | by the following dataref ('in' points to short): | |
831 | ||
832 | for (i=0; i<N; i++) | |
833 | for (j=0; j<M; j++) | |
834 | s += in[i+j] | |
835 | ||
836 | is as follows: | |
837 | if LOOP=i_loop: &in (relative to i_loop) | |
838 | if LOOP=j_loop: &in+i*2B (relative to j_loop) | |
f7064d11 DN |
839 | |
840 | Output: | |
841 | 1. Return an SSA_NAME whose value is the address of the memory location of | |
842 | the first vector of the data reference. | |
843 | 2. If new_stmt_list is not NULL_TREE after return then the caller must insert | |
844 | these statement(s) which define the returned SSA_NAME. | |
845 | ||
846 | FORNOW: We are only handling array accesses with step 1. */ | |
847 | ||
848 | static tree | |
849 | vect_create_addr_base_for_vector_ref (tree stmt, | |
850 | tree *new_stmt_list, | |
468c2ac0 DN |
851 | tree offset, |
852 | struct loop *loop) | |
f7064d11 DN |
853 | { |
854 | stmt_vec_info stmt_info = vinfo_for_stmt (stmt); | |
855 | struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); | |
468c2ac0 DN |
856 | struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father; |
857 | tree data_ref_base = unshare_expr (DR_BASE_ADDRESS (dr)); | |
858 | tree base_name; | |
06cb4f79 | 859 | tree data_ref_base_var; |
06cb4f79 | 860 | tree new_base_stmt; |
f7064d11 | 861 | tree vec_stmt; |
f7064d11 DN |
862 | tree addr_base, addr_expr; |
863 | tree dest, new_stmt; | |
86a07404 IR |
864 | tree base_offset = unshare_expr (DR_OFFSET (dr)); |
865 | tree init = unshare_expr (DR_INIT (dr)); | |
4090db01 | 866 | tree vect_ptr_type, addr_expr2; |
468c2ac0 DN |
867 | tree step = TYPE_SIZE_UNIT (TREE_TYPE (DR_REF (dr))); |
868 | ||
869 | gcc_assert (loop); | |
870 | if (loop != containing_loop) | |
871 | { | |
872 | loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); | |
873 | struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
874 | ||
875 | gcc_assert (nested_in_vect_loop_p (loop, stmt)); | |
876 | ||
877 | data_ref_base = unshare_expr (STMT_VINFO_DR_BASE_ADDRESS (stmt_info)); | |
878 | base_offset = unshare_expr (STMT_VINFO_DR_OFFSET (stmt_info)); | |
879 | init = unshare_expr (STMT_VINFO_DR_INIT (stmt_info)); | |
880 | } | |
881 | ||
06cb4f79 | 882 | /* Create data_ref_base */ |
468c2ac0 DN |
883 | base_name = build_fold_indirect_ref (data_ref_base); |
884 | data_ref_base_var = create_tmp_var (TREE_TYPE (data_ref_base), "batmp"); | |
06cb4f79 | 885 | add_referenced_var (data_ref_base_var); |
468c2ac0 | 886 | data_ref_base = force_gimple_operand (data_ref_base, &new_base_stmt, |
06cb4f79 JS |
887 | true, data_ref_base_var); |
888 | append_to_statement_list_force(new_base_stmt, new_stmt_list); | |
f7064d11 DN |
889 | |
890 | /* Create base_offset */ | |
86a07404 | 891 | base_offset = size_binop (PLUS_EXPR, base_offset, init); |
5be014d5 | 892 | base_offset = fold_convert (sizetype, base_offset); |
f7064d11 | 893 | dest = create_tmp_var (TREE_TYPE (base_offset), "base_off"); |
f004ab02 | 894 | add_referenced_var (dest); |
06cb4f79 | 895 | base_offset = force_gimple_operand (base_offset, &new_stmt, true, dest); |
f7064d11 DN |
896 | append_to_statement_list_force (new_stmt, new_stmt_list); |
897 | ||
898 | if (offset) | |
899 | { | |
5be014d5 | 900 | tree tmp = create_tmp_var (sizetype, "offset"); |
98b44b0e | 901 | |
f004ab02 | 902 | add_referenced_var (tmp); |
98b44b0e | 903 | offset = fold_build2 (MULT_EXPR, TREE_TYPE (offset), offset, step); |
987b67bc KH |
904 | base_offset = fold_build2 (PLUS_EXPR, TREE_TYPE (base_offset), |
905 | base_offset, offset); | |
8115817b | 906 | base_offset = force_gimple_operand (base_offset, &new_stmt, false, tmp); |
f7064d11 DN |
907 | append_to_statement_list_force (new_stmt, new_stmt_list); |
908 | } | |
909 | ||
910 | /* base + base_offset */ | |
468c2ac0 DN |
911 | addr_base = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (data_ref_base), |
912 | data_ref_base, base_offset); | |
f7064d11 | 913 | |
4090db01 IR |
914 | vect_ptr_type = build_pointer_type (STMT_VINFO_VECTYPE (stmt_info)); |
915 | ||
f7064d11 | 916 | /* addr_expr = addr_base */ |
4090db01 | 917 | addr_expr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, |
f7064d11 | 918 | get_name (base_name)); |
f004ab02 | 919 | add_referenced_var (addr_expr); |
4090db01 IR |
920 | vec_stmt = fold_convert (vect_ptr_type, addr_base); |
921 | addr_expr2 = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, | |
922 | get_name (base_name)); | |
923 | add_referenced_var (addr_expr2); | |
924 | vec_stmt = force_gimple_operand (vec_stmt, &new_stmt, false, addr_expr2); | |
925 | append_to_statement_list_force (new_stmt, new_stmt_list); | |
f7064d11 | 926 | |
00518cb1 | 927 | if (vect_print_dump_info (REPORT_DETAILS)) |
f7064d11 DN |
928 | { |
929 | fprintf (vect_dump, "created "); | |
930 | print_generic_expr (vect_dump, vec_stmt, TDF_SLIM); | |
931 | } | |
4090db01 | 932 | return vec_stmt; |
f7064d11 DN |
933 | } |
934 | ||
935 | ||
f7064d11 DN |
936 | /* Function vect_create_data_ref_ptr. |
937 | ||
89d67cca DN |
938 | Create a new pointer to vector type (vp), that points to the first location |
939 | accessed in the loop by STMT, along with the def-use update chain to | |
940 | appropriately advance the pointer through the loop iterations. Also set | |
941 | aliasing information for the pointer. This vector pointer is used by the | |
d9987fb4 | 942 | callers to this function to create a memory reference expression for vector |
89d67cca | 943 | load/store access. |
f7064d11 DN |
944 | |
945 | Input: | |
946 | 1. STMT: a stmt that references memory. Expected to be of the form | |
07beea0d AH |
947 | GIMPLE_MODIFY_STMT <name, data-ref> or |
948 | GIMPLE_MODIFY_STMT <data-ref, name>. | |
468c2ac0 | 949 | 2. AT_LOOP: the loop where the vector memref is to be created. |
f7064d11 DN |
950 | 3. OFFSET (optional): an offset to be added to the initial address accessed |
951 | by the data-ref in STMT. | |
952 | 4. ONLY_INIT: indicate if vp is to be updated in the loop, or remain | |
953 | pointing to the initial address. | |
4090db01 | 954 | 5. TYPE: if not NULL indicates the required type of the data-ref |
f7064d11 DN |
955 | |
956 | Output: | |
957 | 1. Declare a new ptr to vector_type, and have it point to the base of the | |
958 | data reference (initial addressed accessed by the data reference). | |
959 | For example, for vector of type V8HI, the following code is generated: | |
960 | ||
961 | v8hi *vp; | |
962 | vp = (v8hi *)initial_address; | |
963 | ||
964 | if OFFSET is not supplied: | |
965 | initial_address = &a[init]; | |
966 | if OFFSET is supplied: | |
967 | initial_address = &a[init + OFFSET]; | |
968 | ||
969 | Return the initial_address in INITIAL_ADDRESS. | |
970 | ||
89d67cca DN |
971 | 2. If ONLY_INIT is true, just return the initial pointer. Otherwise, also |
972 | update the pointer in each iteration of the loop. | |
f7064d11 | 973 | |
89d67cca DN |
974 | Return the increment stmt that updates the pointer in PTR_INCR. |
975 | ||
468c2ac0 DN |
976 | 3. Set INV_P to true if the access pattern of the data reference in the |
977 | vectorized loop is invariant. Set it to false otherwise. | |
978 | ||
979 | 4. Return the pointer. */ | |
f7064d11 DN |
980 | |
981 | static tree | |
468c2ac0 | 982 | vect_create_data_ref_ptr (tree stmt, struct loop *at_loop, |
89d67cca | 983 | tree offset, tree *initial_address, tree *ptr_incr, |
468c2ac0 | 984 | bool only_init, tree type, bool *inv_p) |
f7064d11 DN |
985 | { |
986 | tree base_name; | |
987 | stmt_vec_info stmt_info = vinfo_for_stmt (stmt); | |
988 | loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); | |
989 | struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
468c2ac0 DN |
990 | bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt); |
991 | struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father; | |
f7064d11 DN |
992 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); |
993 | tree vect_ptr_type; | |
994 | tree vect_ptr; | |
995 | tree tag; | |
f7064d11 DN |
996 | tree new_temp; |
997 | tree vec_stmt; | |
998 | tree new_stmt_list = NULL_TREE; | |
468c2ac0 | 999 | edge pe; |
f7064d11 DN |
1000 | basic_block new_bb; |
1001 | tree vect_ptr_init; | |
86a07404 | 1002 | struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); |
468c2ac0 DN |
1003 | tree vptr; |
1004 | block_stmt_iterator incr_bsi; | |
1005 | bool insert_after; | |
1006 | tree indx_before_incr, indx_after_incr; | |
1007 | tree incr; | |
1008 | tree step; | |
1009 | ||
1010 | /* Check the step (evolution) of the load in LOOP, and record | |
1011 | whether it's invariant. */ | |
1012 | if (nested_in_vect_loop) | |
1013 | step = STMT_VINFO_DR_STEP (stmt_info); | |
1014 | else | |
1015 | step = DR_STEP (STMT_VINFO_DATA_REF (stmt_info)); | |
1016 | ||
1017 | if (tree_int_cst_compare (step, size_zero_node) == 0) | |
1018 | *inv_p = true; | |
1019 | else | |
1020 | *inv_p = false; | |
f7064d11 | 1021 | |
468c2ac0 DN |
1022 | /* Create an expression for the first address accessed by this load |
1023 | in LOOP. */ | |
86a07404 | 1024 | base_name = build_fold_indirect_ref (unshare_expr (DR_BASE_ADDRESS (dr))); |
f7064d11 | 1025 | |
00518cb1 | 1026 | if (vect_print_dump_info (REPORT_DETAILS)) |
f7064d11 DN |
1027 | { |
1028 | tree data_ref_base = base_name; | |
c12cc930 | 1029 | fprintf (vect_dump, "create vector-pointer variable to type: "); |
f7064d11 DN |
1030 | print_generic_expr (vect_dump, vectype, TDF_SLIM); |
1031 | if (TREE_CODE (data_ref_base) == VAR_DECL) | |
1032 | fprintf (vect_dump, " vectorizing a one dimensional array ref: "); | |
1033 | else if (TREE_CODE (data_ref_base) == ARRAY_REF) | |
1034 | fprintf (vect_dump, " vectorizing a multidimensional array ref: "); | |
1035 | else if (TREE_CODE (data_ref_base) == COMPONENT_REF) | |
1036 | fprintf (vect_dump, " vectorizing a record based array ref: "); | |
1037 | else if (TREE_CODE (data_ref_base) == SSA_NAME) | |
1038 | fprintf (vect_dump, " vectorizing a pointer ref: "); | |
1039 | print_generic_expr (vect_dump, base_name, TDF_SLIM); | |
1040 | } | |
1041 | ||
1042 | /** (1) Create the new vector-pointer variable: **/ | |
4090db01 IR |
1043 | if (type) |
1044 | vect_ptr_type = build_pointer_type (type); | |
1045 | else | |
1046 | vect_ptr_type = build_pointer_type (vectype); | |
f7064d11 DN |
1047 | vect_ptr = vect_get_new_vect_var (vect_ptr_type, vect_pointer_var, |
1048 | get_name (base_name)); | |
f004ab02 | 1049 | add_referenced_var (vect_ptr); |
4090db01 | 1050 | |
8bb46326 | 1051 | /** (2) Add aliasing information to the new vector-pointer: |
86a07404 | 1052 | (The points-to info (DR_PTR_INFO) may be defined later.) **/ |
f7064d11 | 1053 | |
3cb960c7 | 1054 | tag = DR_SYMBOL_TAG (dr); |
f7064d11 | 1055 | gcc_assert (tag); |
c75ab022 | 1056 | |
18cd8a03 | 1057 | /* If tag is a variable (and NOT_A_TAG) than a new symbol memory |
9cf5a7e3 | 1058 | tag must be created with tag added to its may alias list. */ |
326eda4b | 1059 | if (!MTAG_P (tag)) |
cc0968b0 | 1060 | new_type_alias (vect_ptr, tag, DR_REF (dr)); |
0bca51f0 | 1061 | else |
38635499 | 1062 | set_symbol_mem_tag (vect_ptr, tag); |
9cf5a7e3 | 1063 | |
86a07404 | 1064 | var_ann (vect_ptr)->subvars = DR_SUBVARS (dr); |
f7064d11 | 1065 | |
468c2ac0 DN |
1066 | /** Note: If the dataref is in an inner-loop nested in LOOP, and we are |
1067 | vectorizing LOOP (i.e. outer-loop vectorization), we need to create two | |
1068 | def-use update cycles for the pointer: One relative to the outer-loop | |
1069 | (LOOP), which is what steps (3) and (4) below do. The other is relative | |
1070 | to the inner-loop (which is the inner-most loop containing the dataref), | |
1071 | and this is done be step (5) below. | |
1072 | ||
1073 | When vectorizing inner-most loops, the vectorized loop (LOOP) is also the | |
1074 | inner-most loop, and so steps (3),(4) work the same, and step (5) is | |
1075 | redundant. Steps (3),(4) create the following: | |
1076 | ||
1077 | vp0 = &base_addr; | |
1078 | LOOP: vp1 = phi(vp0,vp2) | |
1079 | ... | |
1080 | ... | |
1081 | vp2 = vp1 + step | |
1082 | goto LOOP | |
1083 | ||
1084 | If there is an inner-loop nested in loop, then step (5) will also be | |
1085 | applied, and an additional update in the inner-loop will be created: | |
1086 | ||
1087 | vp0 = &base_addr; | |
1088 | LOOP: vp1 = phi(vp0,vp2) | |
1089 | ... | |
1090 | inner: vp3 = phi(vp1,vp4) | |
1091 | vp4 = vp3 + inner_step | |
1092 | if () goto inner | |
1093 | ... | |
1094 | vp2 = vp1 + step | |
1095 | if () goto LOOP */ | |
1096 | ||
f7064d11 DN |
1097 | /** (3) Calculate the initial address the vector-pointer, and set |
1098 | the vector-pointer to point to it before the loop: **/ | |
1099 | ||
1100 | /* Create: (&(base[init_val+offset]) in the loop preheader. */ | |
468c2ac0 | 1101 | |
f7064d11 | 1102 | new_temp = vect_create_addr_base_for_vector_ref (stmt, &new_stmt_list, |
468c2ac0 | 1103 | offset, loop); |
f7064d11 DN |
1104 | pe = loop_preheader_edge (loop); |
1105 | new_bb = bsi_insert_on_edge_immediate (pe, new_stmt_list); | |
1106 | gcc_assert (!new_bb); | |
1107 | *initial_address = new_temp; | |
1108 | ||
1109 | /* Create: p = (vectype *) initial_base */ | |
1110 | vec_stmt = fold_convert (vect_ptr_type, new_temp); | |
ebb07520 | 1111 | vec_stmt = build_gimple_modify_stmt (vect_ptr, vec_stmt); |
99c09897 | 1112 | vect_ptr_init = make_ssa_name (vect_ptr, vec_stmt); |
07beea0d | 1113 | GIMPLE_STMT_OPERAND (vec_stmt, 0) = vect_ptr_init; |
f7064d11 DN |
1114 | new_bb = bsi_insert_on_edge_immediate (pe, vec_stmt); |
1115 | gcc_assert (!new_bb); | |
f7064d11 DN |
1116 | |
1117 | ||
468c2ac0 DN |
1118 | /** (4) Handle the updating of the vector-pointer inside the loop. |
1119 | This is needed when ONLY_INIT is false, and also when AT_LOOP | |
1120 | is the inner-loop nested in LOOP (during outer-loop vectorization). | |
1121 | **/ | |
f7064d11 | 1122 | |
468c2ac0 | 1123 | if (only_init && at_loop == loop) /* No update in loop is required. */ |
8bb46326 DN |
1124 | { |
1125 | /* Copy the points-to information if it exists. */ | |
86a07404 IR |
1126 | if (DR_PTR_INFO (dr)) |
1127 | duplicate_ssa_name_ptr_info (vect_ptr_init, DR_PTR_INFO (dr)); | |
468c2ac0 | 1128 | vptr = vect_ptr_init; |
8bb46326 | 1129 | } |
99c09897 RH |
1130 | else |
1131 | { | |
468c2ac0 DN |
1132 | /* The step of the vector pointer is the Vector Size. */ |
1133 | tree step = TYPE_SIZE_UNIT (vectype); | |
1134 | /* One exception to the above is when the scalar step of the load in | |
1135 | LOOP is zero. In this case the step here is also zero. */ | |
1136 | if (*inv_p) | |
1137 | step = size_zero_node; | |
99c09897 RH |
1138 | |
1139 | standard_iv_increment_position (loop, &incr_bsi, &insert_after); | |
468c2ac0 | 1140 | |
99c09897 | 1141 | create_iv (vect_ptr_init, |
468c2ac0 | 1142 | fold_convert (vect_ptr_type, step), |
99c09897 RH |
1143 | NULL_TREE, loop, &incr_bsi, insert_after, |
1144 | &indx_before_incr, &indx_after_incr); | |
1145 | incr = bsi_stmt (incr_bsi); | |
93c094b5 | 1146 | set_stmt_info (stmt_ann (incr), |
99c09897 | 1147 | new_stmt_vec_info (incr, loop_vinfo)); |
f7064d11 | 1148 | |
99c09897 RH |
1149 | /* Copy the points-to information if it exists. */ |
1150 | if (DR_PTR_INFO (dr)) | |
1151 | { | |
1152 | duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr)); | |
1153 | duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr)); | |
1154 | } | |
1155 | merge_alias_info (vect_ptr_init, indx_before_incr); | |
1156 | merge_alias_info (vect_ptr_init, indx_after_incr); | |
89d67cca DN |
1157 | if (ptr_incr) |
1158 | *ptr_incr = incr; | |
f7064d11 | 1159 | |
468c2ac0 DN |
1160 | vptr = indx_before_incr; |
1161 | } | |
1162 | ||
1163 | if (!nested_in_vect_loop || only_init) | |
1164 | return vptr; | |
1165 | ||
1166 | ||
1167 | /** (5) Handle the updating of the vector-pointer inside the inner-loop | |
1168 | nested in LOOP, if exists: **/ | |
1169 | ||
1170 | gcc_assert (nested_in_vect_loop); | |
1171 | if (!only_init) | |
1172 | { | |
1173 | standard_iv_increment_position (containing_loop, &incr_bsi, | |
1174 | &insert_after); | |
1175 | create_iv (vptr, fold_convert (vect_ptr_type, DR_STEP (dr)), NULL_TREE, | |
1176 | containing_loop, &incr_bsi, insert_after, &indx_before_incr, | |
1177 | &indx_after_incr); | |
1178 | incr = bsi_stmt (incr_bsi); | |
1179 | set_stmt_info (stmt_ann (incr), new_stmt_vec_info (incr, loop_vinfo)); | |
1180 | ||
1181 | /* Copy the points-to information if it exists. */ | |
1182 | if (DR_PTR_INFO (dr)) | |
1183 | { | |
1184 | duplicate_ssa_name_ptr_info (indx_before_incr, DR_PTR_INFO (dr)); | |
1185 | duplicate_ssa_name_ptr_info (indx_after_incr, DR_PTR_INFO (dr)); | |
1186 | } | |
1187 | merge_alias_info (vect_ptr_init, indx_before_incr); | |
1188 | merge_alias_info (vect_ptr_init, indx_after_incr); | |
1189 | if (ptr_incr) | |
1190 | *ptr_incr = incr; | |
1191 | ||
1192 | return indx_before_incr; | |
99c09897 | 1193 | } |
468c2ac0 DN |
1194 | else |
1195 | gcc_unreachable (); | |
f7064d11 DN |
1196 | } |
1197 | ||
1198 | ||
89d67cca DN |
1199 | /* Function bump_vector_ptr |
1200 | ||
468c2ac0 DN |
1201 | Increment a pointer (to a vector type) by vector-size. If requested, |
1202 | i.e. if PTR-INCR is given, then also connect the new increment stmt | |
1203 | to the existing def-use update-chain of the pointer, by modifying | |
1204 | the PTR_INCR as illustrated below: | |
89d67cca DN |
1205 | |
1206 | The pointer def-use update-chain before this function: | |
1207 | DATAREF_PTR = phi (p_0, p_2) | |
1208 | .... | |
1209 | PTR_INCR: p_2 = DATAREF_PTR + step | |
1210 | ||
1211 | The pointer def-use update-chain after this function: | |
1212 | DATAREF_PTR = phi (p_0, p_2) | |
1213 | .... | |
468c2ac0 | 1214 | NEW_DATAREF_PTR = DATAREF_PTR + BUMP |
89d67cca DN |
1215 | .... |
1216 | PTR_INCR: p_2 = NEW_DATAREF_PTR + step | |
1217 | ||
1218 | Input: | |
1219 | DATAREF_PTR - ssa_name of a pointer (to vector type) that is being updated | |
1220 | in the loop. | |
468c2ac0 DN |
1221 | PTR_INCR - optional. The stmt that updates the pointer in each iteration of |
1222 | the loop. The increment amount across iterations is expected | |
1223 | to be vector_size. | |
89d67cca DN |
1224 | BSI - location where the new update stmt is to be placed. |
1225 | STMT - the original scalar memory-access stmt that is being vectorized. | |
468c2ac0 DN |
1226 | BUMP - optional. The offset by which to bump the pointer. If not given, |
1227 | the offset is assumed to be vector_size. | |
89d67cca DN |
1228 | |
1229 | Output: Return NEW_DATAREF_PTR as illustrated above. | |
1230 | ||
1231 | */ | |
1232 | ||
1233 | static tree | |
1234 | bump_vector_ptr (tree dataref_ptr, tree ptr_incr, block_stmt_iterator *bsi, | |
468c2ac0 | 1235 | tree stmt, tree bump) |
89d67cca DN |
1236 | { |
1237 | stmt_vec_info stmt_info = vinfo_for_stmt (stmt); | |
1238 | struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info); | |
1239 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); | |
1240 | tree vptr_type = TREE_TYPE (dataref_ptr); | |
1241 | tree ptr_var = SSA_NAME_VAR (dataref_ptr); | |
5be014d5 | 1242 | tree update = TYPE_SIZE_UNIT (vectype); |
89d67cca DN |
1243 | tree incr_stmt; |
1244 | ssa_op_iter iter; | |
1245 | use_operand_p use_p; | |
1246 | tree new_dataref_ptr; | |
1247 | ||
468c2ac0 DN |
1248 | if (bump) |
1249 | update = bump; | |
1250 | ||
ebb07520 | 1251 | incr_stmt = build_gimple_modify_stmt (ptr_var, |
5be014d5 | 1252 | build2 (POINTER_PLUS_EXPR, vptr_type, |
ebb07520 | 1253 | dataref_ptr, update)); |
89d67cca | 1254 | new_dataref_ptr = make_ssa_name (ptr_var, incr_stmt); |
07beea0d | 1255 | GIMPLE_STMT_OPERAND (incr_stmt, 0) = new_dataref_ptr; |
89d67cca DN |
1256 | vect_finish_stmt_generation (stmt, incr_stmt, bsi); |
1257 | ||
468c2ac0 DN |
1258 | /* Copy the points-to information if it exists. */ |
1259 | if (DR_PTR_INFO (dr)) | |
1260 | duplicate_ssa_name_ptr_info (new_dataref_ptr, DR_PTR_INFO (dr)); | |
1261 | merge_alias_info (new_dataref_ptr, dataref_ptr); | |
1262 | ||
1263 | if (!ptr_incr) | |
1264 | return new_dataref_ptr; | |
1265 | ||
89d67cca DN |
1266 | /* Update the vector-pointer's cross-iteration increment. */ |
1267 | FOR_EACH_SSA_USE_OPERAND (use_p, ptr_incr, iter, SSA_OP_USE) | |
1268 | { | |
1269 | tree use = USE_FROM_PTR (use_p); | |
1270 | ||
1271 | if (use == dataref_ptr) | |
1272 | SET_USE (use_p, new_dataref_ptr); | |
1273 | else | |
1274 | gcc_assert (tree_int_cst_compare (use, update) == 0); | |
1275 | } | |
1276 | ||
89d67cca DN |
1277 | return new_dataref_ptr; |
1278 | } | |
1279 | ||
1280 | ||
f7064d11 DN |
1281 | /* Function vect_create_destination_var. |
1282 | ||
1283 | Create a new temporary of type VECTYPE. */ | |
1284 | ||
1285 | static tree | |
1286 | vect_create_destination_var (tree scalar_dest, tree vectype) | |
1287 | { | |
1288 | tree vec_dest; | |
1289 | const char *new_name; | |
61d3cdbb DN |
1290 | tree type; |
1291 | enum vect_var_kind kind; | |
1292 | ||
1293 | kind = vectype ? vect_simple_var : vect_scalar_var; | |
1294 | type = vectype ? vectype : TREE_TYPE (scalar_dest); | |
f7064d11 DN |
1295 | |
1296 | gcc_assert (TREE_CODE (scalar_dest) == SSA_NAME); | |
1297 | ||
1298 | new_name = get_name (scalar_dest); | |
1299 | if (!new_name) | |
1300 | new_name = "var_"; | |
fbf798fc | 1301 | vec_dest = vect_get_new_vect_var (type, kind, new_name); |
f004ab02 | 1302 | add_referenced_var (vec_dest); |
f7064d11 DN |
1303 | |
1304 | return vec_dest; | |
1305 | } | |
1306 | ||
1307 | ||
1308 | /* Function vect_init_vector. | |
1309 | ||
1310 | Insert a new stmt (INIT_STMT) that initializes a new vector variable with | |
468c2ac0 DN |
1311 | the vector elements of VECTOR_VAR. Place the initialization at BSI if it |
1312 | is not NULL. Otherwise, place the initialization at the loop preheader. | |
1313 | Return the DEF of INIT_STMT. | |
1314 | It will be used in the vectorization of STMT. */ | |
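/* For illustration only (names and values assumed): for
   VECTOR_VAR = {3, 3, 3, 3} and BSI == NULL, a stmt of the form

     vect_cst_1 = { 3, 3, 3, 3 };

   is inserted on the preheader edge of the relevant loop, and the new
   ssa-name vect_cst_1 is returned.  */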
f7064d11 DN |
1315 | |
1316 | static tree | |
468c2ac0 DN |
1317 | vect_init_vector (tree stmt, tree vector_var, tree vector_type, |
1318 | block_stmt_iterator *bsi) | |
f7064d11 DN |
1319 | { |
1320 | stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt); | |
f7064d11 DN |
1321 | tree new_var; |
1322 | tree init_stmt; | |
f7064d11 DN |
1323 | tree vec_oprnd; |
1324 | edge pe; | |
1325 | tree new_temp; | |
1326 | basic_block new_bb; | |
1327 | ||
4090db01 | 1328 | new_var = vect_get_new_vect_var (vector_type, vect_simple_var, "cst_"); |
f004ab02 | 1329 | add_referenced_var (new_var); |
ebb07520 | 1330 | init_stmt = build_gimple_modify_stmt (new_var, vector_var); |
f7064d11 | 1331 | new_temp = make_ssa_name (new_var, init_stmt); |
07beea0d | 1332 | GIMPLE_STMT_OPERAND (init_stmt, 0) = new_temp; |
f7064d11 | 1333 | |
468c2ac0 DN |
1334 | if (bsi) |
1335 | vect_finish_stmt_generation (stmt, init_stmt, bsi); | |
1336 | else | |
1337 | { | |
1338 | loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo); | |
1339 | struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
1340 | ||
1341 | if (nested_in_vect_loop_p (loop, stmt)) | |
1342 | loop = loop->inner; | |
1343 | pe = loop_preheader_edge (loop); | |
1344 | new_bb = bsi_insert_on_edge_immediate (pe, init_stmt); | |
1345 | gcc_assert (!new_bb); | |
1346 | } | |
f7064d11 | 1347 | |
00518cb1 | 1348 | if (vect_print_dump_info (REPORT_DETAILS)) |
f7064d11 DN |
1349 | { |
1350 | fprintf (vect_dump, "created new init_stmt: "); | |
1351 | print_generic_expr (vect_dump, init_stmt, TDF_SLIM); | |
1352 | } | |
1353 | ||
07beea0d | 1354 | vec_oprnd = GIMPLE_STMT_OPERAND (init_stmt, 0); |
f7064d11 DN |
1355 | return vec_oprnd; |
1356 | } | |
1357 | ||
1358 | ||
805e2059 IR |
1359 | /* For constant and loop invariant defs of SLP_NODE this function returns |
1360 | (vector) defs (VEC_OPRNDS) that will be used in the vectorized stmts. | |
1361 | OP_NUM determines if we gather defs for operand 0 or operand 1 of the scalar | |
1362 | stmts. */ | |
1363 | ||
1364 | static void | |
1365 | vect_get_constant_vectors (slp_tree slp_node, VEC(tree,heap) **vec_oprnds, | |
1366 | unsigned int op_num) | |
1367 | { | |
1368 | VEC (tree, heap) *stmts = SLP_TREE_SCALAR_STMTS (slp_node); | |
1369 | tree stmt = VEC_index (tree, stmts, 0); | |
1370 | stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt); | |
1371 | tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo); | |
1372 | int nunits = TYPE_VECTOR_SUBPARTS (vectype); | |
1373 | tree vec_cst; | |
1374 | tree t = NULL_TREE; | |
1375 | int j, number_of_places_left_in_vector; | |
1376 | tree vector_type; | |
1377 | tree op, vop, operation; | |
1378 | int group_size = VEC_length (tree, stmts); | |
1379 | unsigned int vec_num, i; | |
1380 | int number_of_copies = 1; | |
1381 | bool is_store = false; | |
1382 | unsigned int number_of_vectors = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); | |
1383 | VEC (tree, heap) *voprnds = VEC_alloc (tree, heap, number_of_vectors); | |
c563bcd1 | 1384 | bool constant_p; |
805e2059 IR |
1385 | |
1386 | if (STMT_VINFO_DATA_REF (stmt_vinfo)) | |
1387 | is_store = true; | |
1388 | ||
1389 | /* NUMBER_OF_COPIES is the number of times we need to use the same values in | |
1390 | created vectors. It is greater than 1 if unrolling is performed. | |
1391 | ||
1392 | For example, we have two scalar operands, s1 and s2 (e.g., group of | |
1393 | strided accesses of size two), while NUNITS is four (i.e., four scalars | |
1394 | of this type can be packed in a vector). The output vector will contain | |
1395 | two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES | |
1396 | will be 2). | |
1397 | ||
1398 | If GROUP_SIZE > NUNITS, the scalars will be split into several vectors | |
1399 | containing the operands. | |
1400 | ||
1401 | For example, NUNITS is four as before, and the group size is 8 | |
1402 | (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and | |
1403 | {s5, s6, s7, s8}. */ | |
1404 | ||
1405 | number_of_copies = least_common_multiple (nunits, group_size) / group_size; | |
1406 | ||
1407 | number_of_places_left_in_vector = nunits; | |
c563bcd1 | 1408 | constant_p = true; |
805e2059 IR |
1409 | for (j = 0; j < number_of_copies; j++) |
1410 | { | |
1411 | for (i = group_size - 1; VEC_iterate (tree, stmts, i, stmt); i--) | |
1412 | { | |
1413 | operation = GIMPLE_STMT_OPERAND (stmt, 1); | |
1414 | if (is_store) | |
1415 | op = operation; | |
1416 | else | |
1417 | op = TREE_OPERAND (operation, op_num); | |
c563bcd1 JJ |
1418 | if (!CONSTANT_CLASS_P (op)) |
1419 | constant_p = false; | |
805e2059 IR |
1420 | |
1421 | /* Create 'vect_ = {op0,op1,...,opn}'. */ | |
1422 | t = tree_cons (NULL_TREE, op, t); | |
1423 | ||
1424 | number_of_places_left_in_vector--; | |
1425 | ||
1426 | if (number_of_places_left_in_vector == 0) | |
1427 | { | |
1428 | number_of_places_left_in_vector = nunits; | |
1429 | ||
1430 | vector_type = get_vectype_for_scalar_type (TREE_TYPE (op)); | |
20e545c3 | 1431 | gcc_assert (vector_type); |
c563bcd1 JJ |
1432 | if (constant_p) |
1433 | vec_cst = build_vector (vector_type, t); | |
1434 | else | |
1435 | vec_cst = build_constructor_from_list (vector_type, t); | |
1436 | constant_p = true; | |
805e2059 IR |
1437 | VEC_quick_push (tree, voprnds, |
1438 | vect_init_vector (stmt, vec_cst, vector_type, | |
1439 | NULL)); | |
1440 | t = NULL_TREE; | |
1441 | } | |
1442 | } | |
1443 | } | |
1444 | ||
1445 | /* Since the vectors are created in the reverse order, we should invert | |
1446 | them. */ | |
1447 | vec_num = VEC_length (tree, voprnds); | |
1448 | for (j = vec_num - 1; j >= 0; j--) | |
1449 | { | |
1450 | vop = VEC_index (tree, voprnds, j); | |
1451 | VEC_quick_push (tree, *vec_oprnds, vop); | |
1452 | } | |
1453 | ||
1454 | VEC_free (tree, heap, voprnds); | |
1455 | ||
1456 | /* In case that VF is greater than the unrolling factor needed for the SLP | |
1457 | group of stmts, NUMBER_OF_VECTORS to be created is greater than | |
1458 | NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have | |
1459 | to replicate the vectors. */ | |
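/* Illustrative numbers (assumed, not computed here): with NUNITS == 4,
   GROUP_SIZE == 2 and NUMBER_OF_VECTORS == 4, the loops above built the
   single vector {s1, s2, s1, s2}; the loop below pushes that same vector
   three more times so that four vector defs are available.  */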
1460 | while (number_of_vectors > VEC_length (tree, *vec_oprnds)) | |
1461 | { | |
1462 | for (i = 0; VEC_iterate (tree, *vec_oprnds, i, vop) && i < vec_num; i++) | |
1463 | VEC_quick_push (tree, *vec_oprnds, vop); | |
1464 | } | |
1465 | } | |
1466 | ||
1467 | ||
84fbffb2 | 1468 | /* Get vectorized definitions from SLP_NODE that contains corresponding |
805e2059 IR |
1469 | vectorized def-stmts. */ |
1470 | ||
1471 | static void | |
1472 | vect_get_slp_vect_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds) | |
1473 | { | |
1474 | tree vec_oprnd; | |
1475 | tree vec_def_stmt; | |
1476 | unsigned int i; | |
1477 | ||
1478 | gcc_assert (SLP_TREE_VEC_STMTS (slp_node)); | |
1479 | ||
1480 | for (i = 0; | |
1481 | VEC_iterate (tree, SLP_TREE_VEC_STMTS (slp_node), i, vec_def_stmt); | |
1482 | i++) | |
1483 | { | |
1484 | gcc_assert (vec_def_stmt); | |
1485 | vec_oprnd = GIMPLE_STMT_OPERAND (vec_def_stmt, 0); | |
1486 | VEC_quick_push (tree, *vec_oprnds, vec_oprnd); | |
1487 | } | |
1488 | } | |
1489 | ||
1490 | ||
1491 | /* Get vectorized definitions for SLP_NODE. | |
1492 | If the scalar definitions are loop invariants or constants, collect them and | |
1493 | call vect_get_constant_vectors() to create vector stmts. | |
1494 | Otherwise, the def-stmts must be already vectorized and the vectorized stmts | |
1495 | must be stored in the LEFT/RIGHT node of SLP_NODE, and we call | |
f8f8fee8 IR |
1496 | vect_get_slp_vect_defs() to retrieve them. |
1497 | If VEC_OPRNDS1 is NULL, don't get vector defs for the second operand (from | |
1498 | the right node). This is used when the second operand must remain scalar. */ | |
805e2059 IR |
1499 | |
1500 | static void | |
1501 | vect_get_slp_defs (slp_tree slp_node, VEC (tree,heap) **vec_oprnds0, | |
1502 | VEC (tree,heap) **vec_oprnds1) | |
1503 | { | |
1504 | tree operation, first_stmt; | |
1505 | ||
1506 | /* Allocate memory for vectorized defs. */ | |
1507 | *vec_oprnds0 = VEC_alloc (tree, heap, | |
1508 | SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node)); | |
1509 | ||
1510 | /* SLP_NODE corresponds either to a group of stores or to a group of | |
1511 | unary/binary operations. We don't call this function for loads. */ | |
1512 | if (SLP_TREE_LEFT (slp_node)) | |
1513 | /* The defs are already vectorized. */ | |
1514 | vect_get_slp_vect_defs (SLP_TREE_LEFT (slp_node), vec_oprnds0); | |
1515 | else | |
1516 | /* Build vectors from scalar defs. */ | |
1517 | vect_get_constant_vectors (slp_node, vec_oprnds0, 0); | |
1518 | ||
1519 | first_stmt = VEC_index (tree, SLP_TREE_SCALAR_STMTS (slp_node), 0); | |
1520 | if (STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt))) | |
1521 | /* Since we don't call this function with loads, this is a group of | |
1522 | stores. */ | |
1523 | return; | |
1524 | ||
1525 | operation = GIMPLE_STMT_OPERAND (first_stmt, 1); | |
f8f8fee8 | 1526 | if (TREE_OPERAND_LENGTH (operation) == unary_op || !vec_oprnds1) |
805e2059 IR |
1527 | return; |
1528 | ||
1529 | *vec_oprnds1 = VEC_alloc (tree, heap, | |
1530 | SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node)); | |
1531 | ||
1532 | if (SLP_TREE_RIGHT (slp_node)) | |
1533 | /* The defs are already vectorized. */ | |
1534 | vect_get_slp_vect_defs (SLP_TREE_RIGHT (slp_node), vec_oprnds1); | |
1535 | else | |
1536 | /* Build vectors from scalar defs. */ | |
1537 | vect_get_constant_vectors (slp_node, vec_oprnds1, 1); | |
1538 | } | |
1539 | ||
1540 | ||
fbf798fc DN |
1541 | /* Function get_initial_def_for_induction |
1542 | ||
1543 | Input: | |
d29de1bf | 1544 | IV_PHI - the loop-header phi node that defines the induction variable; | |
fbf798fc DN |
1545 | its loop-entry argument is the initial value of the induction. | |
1546 | ||
1547 | Output: | |
1548 | Return a vector variable, initialized with the first VF values of | |
1549 | the induction variable. E.g., for an iv with initial value 'X' and | |
1550 | evolution S, for a vector of 4 units, we want to return: | |
1551 | [X, X + S, X + 2*S, X + 3*S]. */ | |
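/* Concrete illustration (values assumed): for X = 0, S = 1 and
   VF = nunits = 4, with IV_PHI in the loop being vectorized (not in a
   nested inner-loop), the code below emits on the preheader edge:

     vec_init = { 0, 1, 2, 3 };
     vec_step = { 4, 4, 4, 4 };       <-- [VF*S, VF*S, VF*S, VF*S]

   and inside the loop:

     vec_iv   = PHI <vec_init (preheader), vec_loop (latch)>
     vec_loop = vec_iv + vec_step;  */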
1552 | ||
1553 | static tree | |
cd38ca7f | 1554 | get_initial_def_for_induction (tree iv_phi) |
fbf798fc | 1555 | { |
cd38ca7f | 1556 | stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi); |
fbf798fc DN |
1557 | loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo); |
1558 | struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
3d95caa4 | 1559 | tree scalar_type = TREE_TYPE (PHI_RESULT_TREE (iv_phi)); |
20e545c3 IR |
1560 | tree vectype; |
1561 | int nunits; | |
fbf798fc | 1562 | edge pe = loop_preheader_edge (loop); |
d29de1bf | 1563 | struct loop *iv_loop; |
fbf798fc | 1564 | basic_block new_bb; |
fbf798fc DN |
1565 | tree vec, vec_init, vec_step, t; |
1566 | tree access_fn; | |
1567 | tree new_var; | |
1568 | tree new_name; | |
1569 | tree init_stmt; | |
1570 | tree induction_phi, induc_def, new_stmt, vec_def, vec_dest; | |
1571 | tree init_expr, step_expr; | |
1572 | int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); | |
1573 | int i; | |
1574 | bool ok; | |
20e545c3 | 1575 | int ncopies; |
fbf798fc DN |
1576 | tree expr; |
1577 | stmt_vec_info phi_info = vinfo_for_stmt (iv_phi); | |
d29de1bf | 1578 | bool nested_in_vect_loop = false; |
c492dc9a | 1579 | tree stmts; |
d29de1bf DN |
1580 | imm_use_iterator imm_iter; |
1581 | use_operand_p use_p; | |
1582 | tree exit_phi; | |
1583 | edge latch_e; | |
1584 | tree loop_arg; | |
cd38ca7f DN |
1585 | block_stmt_iterator si; |
1586 | basic_block bb = bb_for_stmt (iv_phi); | |
fbf798fc | 1587 | |
20e545c3 IR |
1588 | vectype = get_vectype_for_scalar_type (scalar_type); |
1589 | gcc_assert (vectype); | |
1590 | nunits = TYPE_VECTOR_SUBPARTS (vectype); | |
1591 | ncopies = vf / nunits; | |
1592 | ||
fbf798fc | 1593 | gcc_assert (phi_info); |
cd38ca7f | 1594 | gcc_assert (ncopies >= 1); |
fbf798fc | 1595 | |
cd38ca7f DN |
1596 | /* Find the first insertion point in the BB. */ |
1597 | si = bsi_after_labels (bb); | |
fbf798fc | 1598 | |
d29de1bf DN |
1599 | if (INTEGRAL_TYPE_P (scalar_type)) |
1600 | step_expr = build_int_cst (scalar_type, 0); | |
1601 | else | |
1602 | step_expr = build_real (scalar_type, dconst0); | |
1603 | ||
1604 | /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */ | |
1605 | if (nested_in_vect_loop_p (loop, iv_phi)) | |
1606 | { | |
1607 | nested_in_vect_loop = true; | |
1608 | iv_loop = loop->inner; | |
1609 | } | |
1610 | else | |
1611 | iv_loop = loop; | |
1612 | gcc_assert (iv_loop == (bb_for_stmt (iv_phi))->loop_father); | |
1613 | ||
1614 | latch_e = loop_latch_edge (iv_loop); | |
1615 | loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e); | |
1616 | ||
1617 | access_fn = analyze_scalar_evolution (iv_loop, PHI_RESULT (iv_phi)); | |
fbf798fc | 1618 | gcc_assert (access_fn); |
d29de1bf DN |
1619 | ok = vect_is_simple_iv_evolution (iv_loop->num, access_fn, |
1620 | &init_expr, &step_expr); | |
fbf798fc | 1621 | gcc_assert (ok); |
d29de1bf | 1622 | pe = loop_preheader_edge (iv_loop); |
fbf798fc DN |
1623 | |
1624 | /* Create the vector that holds the initial_value of the induction. */ | |
d29de1bf | 1625 | if (nested_in_vect_loop) |
c492dc9a | 1626 | { |
d29de1bf DN |
1627 | /* iv_loop is nested in the loop to be vectorized. init_expr has already | |
1628 | been created during vectorization of previous stmts; we obtain it from | |
1629 | the STMT_VINFO_VEC_STMT of the defining stmt. */ | |
1630 | tree iv_def = PHI_ARG_DEF_FROM_EDGE (iv_phi, loop_preheader_edge (iv_loop)); | |
1631 | vec_init = vect_get_vec_def_for_operand (iv_def, iv_phi, NULL); | |
c492dc9a | 1632 | } |
d29de1bf | 1633 | else |
fbf798fc | 1634 | { |
d29de1bf DN |
1635 | /* iv_loop is the loop to be vectorized. Create: |
1636 | vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */ | |
1637 | new_var = vect_get_new_vect_var (scalar_type, vect_scalar_var, "var_"); | |
1638 | add_referenced_var (new_var); | |
ebb07520 | 1639 | |
d29de1bf DN |
1640 | new_name = force_gimple_operand (init_expr, &stmts, false, new_var); |
1641 | if (stmts) | |
1642 | { | |
1643 | new_bb = bsi_insert_on_edge_immediate (pe, stmts); | |
1644 | gcc_assert (!new_bb); | |
1645 | } | |
fbf798fc | 1646 | |
d29de1bf DN |
1647 | t = NULL_TREE; |
1648 | t = tree_cons (NULL_TREE, init_expr, t); | |
1649 | for (i = 1; i < nunits; i++) | |
1650 | { | |
1651 | tree tmp; | |
fbf798fc | 1652 | |
d29de1bf DN |
1653 | /* Create: new_name_i = new_name + step_expr */ |
1654 | tmp = fold_build2 (PLUS_EXPR, scalar_type, new_name, step_expr); | |
1655 | init_stmt = build_gimple_modify_stmt (new_var, tmp); | |
1656 | new_name = make_ssa_name (new_var, init_stmt); | |
1657 | GIMPLE_STMT_OPERAND (init_stmt, 0) = new_name; | |
1658 | ||
1659 | new_bb = bsi_insert_on_edge_immediate (pe, init_stmt); | |
1660 | gcc_assert (!new_bb); | |
1661 | ||
1662 | if (vect_print_dump_info (REPORT_DETAILS)) | |
1663 | { | |
1664 | fprintf (vect_dump, "created new init_stmt: "); | |
1665 | print_generic_expr (vect_dump, init_stmt, TDF_SLIM); | |
1666 | } | |
1667 | t = tree_cons (NULL_TREE, new_name, t); | |
1668 | } | |
1669 | /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */ | |
1670 | vec = build_constructor_from_list (vectype, nreverse (t)); | |
468c2ac0 | 1671 | vec_init = vect_init_vector (iv_phi, vec, vectype, NULL); |
fbf798fc | 1672 | } |
fbf798fc DN |
1673 | |
1674 | ||
1675 | /* Create the vector that holds the step of the induction. */ | |
d29de1bf DN |
1676 | if (nested_in_vect_loop) |
1677 | /* iv_loop is nested in the loop to be vectorized. Generate: | |
1678 | vec_step = [S, S, S, S] */ | |
1679 | new_name = step_expr; | |
1680 | else | |
1681 | { | |
1682 | /* iv_loop is the loop to be vectorized. Generate: | |
1683 | vec_step = [VF*S, VF*S, VF*S, VF*S] */ | |
1684 | expr = build_int_cst (scalar_type, vf); | |
1685 | new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr); | |
1686 | } | |
1687 | ||
fbf798fc DN |
1688 | t = NULL_TREE; |
1689 | for (i = 0; i < nunits; i++) | |
1690 | t = tree_cons (NULL_TREE, unshare_expr (new_name), t); | |
c563bcd1 JJ |
1691 | gcc_assert (CONSTANT_CLASS_P (new_name)); |
1692 | vec = build_vector (vectype, t); | |
468c2ac0 | 1693 | vec_step = vect_init_vector (iv_phi, vec, vectype, NULL); |
fbf798fc DN |
1694 | |
1695 | ||
1696 | /* Create the following def-use cycle: | |
1697 | loop prolog: | |
d29de1bf DN |
1698 | vec_init = ... |
1699 | vec_step = ... | |
fbf798fc DN |
1700 | loop: |
1701 | vec_iv = PHI <vec_init, vec_loop> | |
1702 | ... | |
1703 | STMT | |
1704 | ... | |
1705 | vec_loop = vec_iv + vec_step; */ | |
1706 | ||
1707 | /* Create the induction-phi that defines the induction-operand. */ | |
1708 | vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"); | |
1709 | add_referenced_var (vec_dest); | |
d29de1bf | 1710 | induction_phi = create_phi_node (vec_dest, iv_loop->header); |
fbf798fc DN |
1711 | set_stmt_info (get_stmt_ann (induction_phi), |
1712 | new_stmt_vec_info (induction_phi, loop_vinfo)); | |
1713 | induc_def = PHI_RESULT (induction_phi); | |
1714 | ||
1715 | /* Create the iv update inside the loop */ | |
ebb07520 RS |
1716 | new_stmt = build_gimple_modify_stmt (NULL_TREE, |
1717 | build2 (PLUS_EXPR, vectype, | |
1718 | induc_def, vec_step)); | |
fbf798fc DN |
1719 | vec_def = make_ssa_name (vec_dest, new_stmt); |
1720 | GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def; | |
d29de1bf DN |
1721 | bsi_insert_before (&si, new_stmt, BSI_SAME_STMT); |
1722 | set_stmt_info (get_stmt_ann (new_stmt), | |
1723 | new_stmt_vec_info (new_stmt, loop_vinfo)); | |
fbf798fc DN |
1724 | |
1725 | /* Set the arguments of the phi node: */ | |
d29de1bf DN |
1726 | add_phi_arg (induction_phi, vec_init, pe); |
1727 | add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop)); | |
fbf798fc DN |
1728 | |
1729 | ||
d29de1bf | 1730 | /* In case the vectorization factor (VF) is bigger than the number | |
fbf798fc DN |
1731 | of elements that we can fit in a vectype (nunits), we have to generate |
1732 | more than one vector stmt, i.e., we need to "unroll" the | |
1733 | vector stmt by a factor VF/nunits. For more details see documentation | |
1734 | in vectorizable_operation. */ | |
1735 | ||
1736 | if (ncopies > 1) | |
1737 | { | |
1738 | stmt_vec_info prev_stmt_vinfo; | |
d29de1bf DN |
1739 | /* FORNOW. This restriction should be relaxed. */ |
1740 | gcc_assert (!nested_in_vect_loop); | |
fbf798fc DN |
1741 | |
1742 | /* Create the vector that holds the step of the induction. */ | |
1743 | expr = build_int_cst (scalar_type, nunits); | |
1744 | new_name = fold_build2 (MULT_EXPR, scalar_type, expr, step_expr); | |
1745 | t = NULL_TREE; | |
1746 | for (i = 0; i < nunits; i++) | |
1747 | t = tree_cons (NULL_TREE, unshare_expr (new_name), t); | |
c563bcd1 JJ |
1748 | gcc_assert (CONSTANT_CLASS_P (new_name)); |
1749 | vec = build_vector (vectype, t); | |
468c2ac0 | 1750 | vec_step = vect_init_vector (iv_phi, vec, vectype, NULL); |
fbf798fc DN |
1751 | |
1752 | vec_def = induc_def; | |
1753 | prev_stmt_vinfo = vinfo_for_stmt (induction_phi); | |
1754 | for (i = 1; i < ncopies; i++) | |
1755 | { | |
ebb07520 RS |
1756 | tree tmp; |
1757 | ||
d29de1bf | 1758 | /* vec_i = vec_prev + vec_step */ |
ebb07520 RS |
1759 | tmp = build2 (PLUS_EXPR, vectype, vec_def, vec_step); |
1760 | new_stmt = build_gimple_modify_stmt (NULL_TREE, tmp); | |
fbf798fc DN |
1761 | vec_def = make_ssa_name (vec_dest, new_stmt); |
1762 | GIMPLE_STMT_OPERAND (new_stmt, 0) = vec_def; | |
d29de1bf DN |
1763 | bsi_insert_before (&si, new_stmt, BSI_SAME_STMT); |
1764 | set_stmt_info (get_stmt_ann (new_stmt), | |
1765 | new_stmt_vec_info (new_stmt, loop_vinfo)); | |
fbf798fc DN |
1766 | STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt; |
1767 | prev_stmt_vinfo = vinfo_for_stmt (new_stmt); | |
1768 | } | |
1769 | } | |
1770 | ||
d29de1bf DN |
1771 | if (nested_in_vect_loop) |
1772 | { | |
1773 | /* Find the loop-closed exit-phi of the induction, and record | |
1774 | the final vector of induction results: */ | |
1775 | exit_phi = NULL; | |
1776 | FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg) | |
1777 | { | |
1778 | if (!flow_bb_inside_loop_p (iv_loop, bb_for_stmt (USE_STMT (use_p)))) | |
1779 | { | |
1780 | exit_phi = USE_STMT (use_p); | |
1781 | break; | |
1782 | } | |
1783 | } | |
1784 | if (exit_phi) | |
1785 | { | |
1786 | stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi); | |
1787 | /* FORNOW. Currently not supporting the case that an inner-loop induction | |
1788 | is not used in the outer-loop (i.e. only outside the outer-loop). */ | |
1789 | gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo) | |
1790 | && !STMT_VINFO_LIVE_P (stmt_vinfo)); | |
1791 | ||
1792 | STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt; | |
1793 | if (vect_print_dump_info (REPORT_DETAILS)) | |
1794 | { | |
1795 | fprintf (vect_dump, "vector of inductions after inner-loop:"); | |
1796 | print_generic_expr (vect_dump, new_stmt, TDF_SLIM); | |
1797 | } | |
1798 | } | |
1799 | } | |
1800 | ||
1801 | ||
fbf798fc DN |
1802 | if (vect_print_dump_info (REPORT_DETAILS)) |
1803 | { | |
1804 | fprintf (vect_dump, "transform induction: created def-use cycle:"); | |
1805 | print_generic_expr (vect_dump, induction_phi, TDF_SLIM); | |
1806 | fprintf (vect_dump, "\n"); | |
1807 | print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vec_def), TDF_SLIM); | |
1808 | } | |
1809 | ||
1810 | STMT_VINFO_VEC_STMT (phi_info) = induction_phi; | |
1811 | return induc_def; | |
1812 | } | |
1813 | ||
1814 | ||
f7064d11 DN |
1815 | /* Function vect_get_vec_def_for_operand. |
1816 | ||
1817 | OP is an operand in STMT. This function returns a (vector) def that will be | |
1818 | used in the vectorized stmt for STMT. | |
1819 | ||
1820 | In the case that OP is an SSA_NAME which is defined in the loop, then | |
1821 | STMT_VINFO_VEC_STMT of the defining stmt holds the relevant def. | |
1822 | ||
1823 | In case OP is an invariant or constant, a new stmt that creates a vector def | |
1824 | needs to be introduced. */ | |
1825 | ||
1826 | static tree | |
61d3cdbb | 1827 | vect_get_vec_def_for_operand (tree op, tree stmt, tree *scalar_def) |
f7064d11 DN |
1828 | { |
1829 | tree vec_oprnd; | |
1830 | tree vec_stmt; | |
1831 | tree def_stmt; | |
1832 | stmt_vec_info def_stmt_info = NULL; | |
1833 | stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt); | |
1834 | tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo); | |
57d1677d | 1835 | int nunits = TYPE_VECTOR_SUBPARTS (vectype); |
f7064d11 | 1836 | loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo); |
f7064d11 | 1837 | tree vec_inv; |
88088c03 | 1838 | tree vec_cst; |
f7064d11 DN |
1839 | tree t = NULL_TREE; |
1840 | tree def; | |
1841 | int i; | |
88088c03 DN |
1842 | enum vect_def_type dt; |
1843 | bool is_simple_use; | |
4090db01 | 1844 | tree vector_type; |
f7064d11 | 1845 | |
00518cb1 | 1846 | if (vect_print_dump_info (REPORT_DETAILS)) |
f7064d11 DN |
1847 | { |
1848 | fprintf (vect_dump, "vect_get_vec_def_for_operand: "); | |
1849 | print_generic_expr (vect_dump, op, TDF_SLIM); | |
1850 | } | |
1851 | ||
88088c03 DN |
1852 | is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt); |
1853 | gcc_assert (is_simple_use); | |
00518cb1 | 1854 | if (vect_print_dump_info (REPORT_DETAILS)) |
f7064d11 | 1855 | { |
88088c03 | 1856 | if (def) |
f7064d11 | 1857 | { |
88088c03 DN |
1858 | fprintf (vect_dump, "def = "); |
1859 | print_generic_expr (vect_dump, def, TDF_SLIM); | |
1860 | } | |
1861 | if (def_stmt) | |
1862 | { | |
1863 | fprintf (vect_dump, " def_stmt = "); | |
1864 | print_generic_expr (vect_dump, def_stmt, TDF_SLIM); | |
f7064d11 | 1865 | } |
f7064d11 DN |
1866 | } |
1867 | ||
88088c03 | 1868 | switch (dt) |
f7064d11 | 1869 | { |
88088c03 DN |
1870 | /* Case 1: operand is a constant. */ |
1871 | case vect_constant_def: | |
1872 | { | |
61d3cdbb DN |
1873 | if (scalar_def) |
1874 | *scalar_def = op; | |
1875 | ||
88088c03 | 1876 | /* Create 'vect_cst_ = {cst,cst,...,cst}' */ |
00518cb1 | 1877 | if (vect_print_dump_info (REPORT_DETAILS)) |
88088c03 DN |
1878 | fprintf (vect_dump, "Create vector_cst. nunits = %d", nunits); |
1879 | ||
1880 | for (i = nunits - 1; i >= 0; --i) | |
1881 | { | |
1882 | t = tree_cons (NULL_TREE, op, t); | |
1883 | } | |
4090db01 | 1884 | vector_type = get_vectype_for_scalar_type (TREE_TYPE (op)); |
20e545c3 | 1885 | gcc_assert (vector_type); |
4090db01 IR |
1886 | vec_cst = build_vector (vector_type, t); |
1887 | ||
468c2ac0 | 1888 | return vect_init_vector (stmt, vec_cst, vector_type, NULL); |
88088c03 DN |
1889 | } |
1890 | ||
1891 | /* Case 2: operand is defined outside the loop - loop invariant. */ | |
1892 | case vect_invariant_def: | |
1893 | { | |
61d3cdbb DN |
1894 | if (scalar_def) |
1895 | *scalar_def = def; | |
1896 | ||
88088c03 | 1897 | /* Create 'vec_inv = {inv,inv,..,inv}' */ |
00518cb1 | 1898 | if (vect_print_dump_info (REPORT_DETAILS)) |
88088c03 DN |
1899 | fprintf (vect_dump, "Create vector_inv."); |
1900 | ||
1901 | for (i = nunits - 1; i >= 0; --i) | |
1902 | { | |
1903 | t = tree_cons (NULL_TREE, def, t); | |
1904 | } | |
1905 | ||
4038c495 | 1906 | /* FIXME: use build_constructor directly. */ |
4090db01 | 1907 | vector_type = get_vectype_for_scalar_type (TREE_TYPE (def)); |
20e545c3 | 1908 | gcc_assert (vector_type); |
4090db01 | 1909 | vec_inv = build_constructor_from_list (vector_type, t); |
468c2ac0 | 1910 | return vect_init_vector (stmt, vec_inv, vector_type, NULL); |
88088c03 DN |
1911 | } |
1912 | ||
1913 | /* Case 3: operand is defined inside the loop. */ | |
1914 | case vect_loop_def: | |
1915 | { | |
61d3cdbb DN |
1916 | if (scalar_def) |
1917 | *scalar_def = def_stmt; | |
1918 | ||
88088c03 DN |
1919 | /* Get the def from the vectorized stmt. */ |
1920 | def_stmt_info = vinfo_for_stmt (def_stmt); | |
1921 | vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info); | |
1922 | gcc_assert (vec_stmt); | |
d29de1bf DN |
1923 | if (TREE_CODE (vec_stmt) == PHI_NODE) |
1924 | vec_oprnd = PHI_RESULT (vec_stmt); | |
1925 | else | |
1926 | vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt, 0); | |
88088c03 DN |
1927 | return vec_oprnd; |
1928 | } | |
1929 | ||
61d3cdbb DN |
1930 | /* Case 4: operand is defined by a loop header phi - reduction */ |
1931 | case vect_reduction_def: | |
1932 | { | |
d29de1bf DN |
1933 | struct loop *loop; |
1934 | ||
61d3cdbb | 1935 | gcc_assert (TREE_CODE (def_stmt) == PHI_NODE); |
d29de1bf | 1936 | loop = (bb_for_stmt (def_stmt))->loop_father; |
61d3cdbb DN |
1937 | |
1938 | /* Get the def before the loop */ | |
1939 | op = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop)); | |
1940 | return get_initial_def_for_reduction (stmt, op, scalar_def); | |
1941 | } | |
1942 | ||
1943 | /* Case 5: operand is defined by loop-header phi - induction. */ | |
88088c03 DN |
1944 | case vect_induction_def: |
1945 | { | |
fbf798fc DN |
1946 | gcc_assert (TREE_CODE (def_stmt) == PHI_NODE); |
1947 | ||
d29de1bf DN |
1948 | /* Get the def from the vectorized stmt. */ |
1949 | def_stmt_info = vinfo_for_stmt (def_stmt); | |
1950 | vec_stmt = STMT_VINFO_VEC_STMT (def_stmt_info); | |
1951 | gcc_assert (vec_stmt && (TREE_CODE (vec_stmt) == PHI_NODE)); | |
1952 | vec_oprnd = PHI_RESULT (vec_stmt); | |
1953 | return vec_oprnd; | |
88088c03 | 1954 | } |
f7064d11 | 1955 | |
f7064d11 | 1956 | default: |
88088c03 | 1957 | gcc_unreachable (); |
f7064d11 | 1958 | } |
f7064d11 DN |
1959 | } |
1960 | ||
1961 | ||
89d67cca DN |
1962 | /* Function vect_get_vec_def_for_stmt_copy |
1963 | ||
1964 | Return a vector-def for an operand. This function is used when the | |
1965 | vectorized stmt to be created (by the caller to this function) is a "copy" | |
1966 | created in case the vectorized result cannot fit in one vector, and several | |
1967 | copies of the vector-stmt are required. In this case the vector-def is | |
8115817b | 1968 | retrieved from the vector stmt recorded in the STMT_VINFO_RELATED_STMT field |
89d67cca DN |
1969 | of the stmt that defines VEC_OPRND. |
1970 | DT is the type of the vector def VEC_OPRND. | |
1971 | ||
1972 | Context: | |
1973 | In case the vectorization factor (VF) is bigger than the number | |
1974 | of elements that can fit in a vectype (nunits), we have to generate | |
1975 | more than one vector stmt to vectorize the scalar stmt. This situation | |
1976 | arises when there are multiple data-types operated upon in the loop; the | |
1977 | smallest data-type determines the VF, and as a result, when vectorizing | |
1978 | stmts operating on wider types we need to create 'VF/nunits' "copies" of the | |
1979 | vector stmt (each computing a vector of 'nunits' results, and together | |
1980 | computing 'VF' results in each iteration). This function is called when | |
2f8e468b | 1981 | vectorizing such a stmt (e.g. vectorizing S2 in the illustration below, in |
878aa817 | 1982 | which VF=16 and nunits=4, so the number of copies required is 4): |
89d67cca DN |
1983 | |
1984 | scalar stmt: vectorized into: STMT_VINFO_RELATED_STMT | |
1985 | ||
1986 | S1: x = load VS1.0: vx.0 = memref0 VS1.1 | |
1987 | VS1.1: vx.1 = memref1 VS1.2 | |
1988 | VS1.2: vx.2 = memref2 VS1.3 | |
1989 | VS1.3: vx.3 = memref3 | |
1990 | ||
1991 | S2: z = x + ... VSnew.0: vz0 = vx.0 + ... VSnew.1 | |
1992 | VSnew.1: vz1 = vx.1 + ... VSnew.2 | |
1993 | VSnew.2: vz2 = vx.2 + ... VSnew.3 | |
1994 | VSnew.3: vz3 = vx.3 + ... | |
1995 | ||
1996 | The vectorization of S1 is explained in vectorizable_load. | |
1997 | The vectorization of S2: | |
1998 | To create the first vector-stmt out of the 4 copies - VSnew.0 - | |
1999 | the function 'vect_get_vec_def_for_operand' is called to | |
2000 | get the relevant vector-def for each operand of S2. For operand x it | |
2001 | returns the vector-def 'vx.0'. | |
2002 | ||
2003 | To create the remaining copies of the vector-stmt (VSnew.j), this | |
2004 | function is called to get the relevant vector-def for each operand. It is | |
2005 | obtained from the respective VS1.j stmt, which is recorded in the | |
2006 | STMT_VINFO_RELATED_STMT field of the stmt that defines VEC_OPRND. | |
2007 | ||
2008 | For example, to obtain the vector-def 'vx.1' in order to create the | |
2009 | vector stmt 'VSnew.1', this function is called with VEC_OPRND='vx.0'. | |
2010 | Given 'vx0' we obtain the stmt that defines it ('VS1.0'); from the | |
2011 | STMT_VINFO_RELATED_STMT field of 'VS1.0' we obtain the next copy - 'VS1.1', | |
2012 | and return its def ('vx.1'). | |
2013 | Overall, to create the above sequence this function will be called 3 times: | |
2014 | vx.1 = vect_get_vec_def_for_stmt_copy (dt, vx.0); | |
2015 | vx.2 = vect_get_vec_def_for_stmt_copy (dt, vx.1); | |
2016 | vx.3 = vect_get_vec_def_for_stmt_copy (dt, vx.2); */ | |
2017 | ||
2018 | static tree | |
2019 | vect_get_vec_def_for_stmt_copy (enum vect_def_type dt, tree vec_oprnd) | |
2020 | { | |
2021 | tree vec_stmt_for_operand; | |
2022 | stmt_vec_info def_stmt_info; | |
2023 | ||
fbf798fc DN |
2024 | /* Do nothing; can reuse same def. */ |
2025 | if (dt == vect_invariant_def || dt == vect_constant_def ) | |
2026 | return vec_oprnd; | |
89d67cca DN |
2027 | |
2028 | vec_stmt_for_operand = SSA_NAME_DEF_STMT (vec_oprnd); | |
2029 | def_stmt_info = vinfo_for_stmt (vec_stmt_for_operand); | |
2030 | gcc_assert (def_stmt_info); | |
2031 | vec_stmt_for_operand = STMT_VINFO_RELATED_STMT (def_stmt_info); | |
2032 | gcc_assert (vec_stmt_for_operand); | |
07beea0d | 2033 | vec_oprnd = GIMPLE_STMT_OPERAND (vec_stmt_for_operand, 0); |
89d67cca DN |
2034 | return vec_oprnd; |
2035 | } | |
2036 | ||
2037 | ||
805e2059 IR |
2038 | /* Get vectorized definitions for the operands to create a copy of an original |
2039 | stmt. See vect_get_vec_def_for_stmt_copy() for details. */ | |
2040 | ||
2041 | static void | |
2042 | vect_get_vec_defs_for_stmt_copy (enum vect_def_type *dt, | |
2043 | VEC(tree,heap) **vec_oprnds0, | |
2044 | VEC(tree,heap) **vec_oprnds1) | |
2045 | { | |
2046 | tree vec_oprnd = VEC_pop (tree, *vec_oprnds0); | |
2047 | ||
2048 | vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd); | |
2049 | VEC_quick_push (tree, *vec_oprnds0, vec_oprnd); | |
2050 | ||
4934454b | 2051 | if (vec_oprnds1 && *vec_oprnds1) |
805e2059 IR |
2052 | { |
2053 | vec_oprnd = VEC_pop (tree, *vec_oprnds1); | |
2054 | vec_oprnd = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd); | |
2055 | VEC_quick_push (tree, *vec_oprnds1, vec_oprnd); | |
2056 | } | |
2057 | } | |
2058 | ||
2059 | ||
2060 | /* Get vectorized definitions for OP0 and OP1, or SLP_NODE if it is not NULL. */ | |
2061 | ||
2062 | static void | |
2063 | vect_get_vec_defs (tree op0, tree op1, tree stmt, VEC(tree,heap) **vec_oprnds0, | |
2064 | VEC(tree,heap) **vec_oprnds1, slp_tree slp_node) | |
2065 | { | |
2066 | if (slp_node) | |
2067 | vect_get_slp_defs (slp_node, vec_oprnds0, vec_oprnds1); | |
2068 | else | |
2069 | { | |
2070 | tree vec_oprnd; | |
2071 | ||
2072 | *vec_oprnds0 = VEC_alloc (tree, heap, 1); | |
2073 | vec_oprnd = vect_get_vec_def_for_operand (op0, stmt, NULL); | |
2074 | VEC_quick_push (tree, *vec_oprnds0, vec_oprnd); | |
2075 | ||
2076 | if (op1) | |
2077 | { | |
2078 | *vec_oprnds1 = VEC_alloc (tree, heap, 1); | |
2079 | vec_oprnd = vect_get_vec_def_for_operand (op1, stmt, NULL); | |
2080 | VEC_quick_push (tree, *vec_oprnds1, vec_oprnd); | |
2081 | } | |
2082 | } | |
2083 | } | |
2084 | ||
2085 | ||
f7064d11 DN |
2086 | /* Function vect_finish_stmt_generation. |
2087 | ||
2088 | Insert a new stmt. */ | |
2089 | ||
2090 | static void | |
89d67cca DN |
2091 | vect_finish_stmt_generation (tree stmt, tree vec_stmt, |
2092 | block_stmt_iterator *bsi) | |
f7064d11 | 2093 | { |
89d67cca DN |
2094 | stmt_vec_info stmt_info = vinfo_for_stmt (stmt); |
2095 | loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); | |
2096 | ||
d29de1bf DN |
2097 | gcc_assert (stmt == bsi_stmt (*bsi)); |
2098 | gcc_assert (TREE_CODE (stmt) != LABEL_EXPR); | |
2099 | ||
f7064d11 | 2100 | bsi_insert_before (bsi, vec_stmt, BSI_SAME_STMT); |
d29de1bf | 2101 | |
89d67cca DN |
2102 | set_stmt_info (get_stmt_ann (vec_stmt), |
2103 | new_stmt_vec_info (vec_stmt, loop_vinfo)); | |
f7064d11 | 2104 | |
00518cb1 | 2105 | if (vect_print_dump_info (REPORT_DETAILS)) |
f7064d11 DN |
2106 | { |
2107 | fprintf (vect_dump, "add new stmt: "); | |
2108 | print_generic_expr (vect_dump, vec_stmt, TDF_SLIM); | |
2109 | } | |
2110 | ||
f7064d11 DN |
2111 | /* Make sure bsi points to the stmt that is being vectorized. */ |
2112 | gcc_assert (stmt == bsi_stmt (*bsi)); | |
f7064d11 | 2113 | |
dbce1570 | 2114 | SET_EXPR_LOCATION (vec_stmt, EXPR_LOCATION (stmt)); |
f7064d11 DN |
2115 | } |
2116 | ||
2117 | ||
61d3cdbb DN |
2118 | /* Function get_initial_def_for_reduction |
2119 | ||
2120 | Input: | |
2121 | STMT - a stmt that performs a reduction operation in the loop. | |
2122 | INIT_VAL - the initial value of the reduction variable | |
2123 | ||
2124 | Output: | |
f7c1d73d DN |
2125 | ADJUSTMENT_DEF - a tree that holds a value to be added to the final result |
2126 | of the reduction (used for adjusting the epilog - see below). | |
61d3cdbb | 2127 | Return a vector variable, initialized according to the operation that STMT |
f7c1d73d DN |
2128 | performs. This vector will be used as the initial value of the |
2129 | vector of partial results. | |
61d3cdbb | 2130 | |
f7c1d73d | 2131 | Option1 (adjust in epilog): Initialize the vector as follows: |
61d3cdbb DN |
2132 | add: [0,0,...,0,0] |
2133 | mult: [1,1,...,1,1] | |
2134 | min/max: [init_val,init_val,..,init_val,init_val] | |
2135 | bit and/or: [init_val,init_val,..,init_val,init_val] | |
f7c1d73d | 2136 | and when necessary (e.g. add/mult case) let the caller know |
61d3cdbb DN |
2137 | that it needs to adjust the result by init_val. |
2138 | ||
2139 | Option2: Initialize the vector as follows: | |
2140 | add: [0,0,...,0,init_val] | |
2141 | mult: [1,1,...,1,init_val] | |
2142 | min/max: [init_val,init_val,...,init_val] | |
2143 | bit and/or: [init_val,init_val,...,init_val] | |
2144 | and no adjustments are needed. | |
2145 | ||
2146 | For example, for the following code: | |
2147 | ||
2148 | s = init_val; | |
2149 | for (i=0;i<n;i++) | |
2150 | s = s + a[i]; | |
2151 | ||
2152 | STMT is 's = s + a[i]', and the reduction variable is 's'. | |
2153 | For a vector of 4 units, we want to return either [0,0,0,init_val], | |
2154 | or [0,0,0,0] and let the caller know that it needs to adjust | |
2155 | the result at the end by 'init_val'. | |
2156 | ||
f7c1d73d DN |
2157 | FORNOW, we are using the 'adjust in epilog' scheme, because this way the |
2158 | initialization vector is simpler (same element in all entries). | |
2159 | A cost model should help decide between these two schemes. */ | |
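/* Illustration (numbers assumed): for the PLUS_EXPR example above with
   init_val = 5 and a 4-element vector, the 'adjust in epilog' scheme
   returns init_def = {0, 0, 0, 0} and sets *ADJUSTMENT_DEF to the scalar 5
   (to a vector of 5's when the reduction sits in a nested inner-loop), so
   the epilog adds 5 back to the final sum.  For a MIN_EXPR/MAX_EXPR
   reduction the function instead returns {5, 5, 5, 5} and sets
   *ADJUSTMENT_DEF to NULL_TREE.  */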
61d3cdbb DN |
2160 | |
2161 | static tree | |
f7c1d73d | 2162 | get_initial_def_for_reduction (tree stmt, tree init_val, tree *adjustment_def) |
61d3cdbb DN |
2163 | { |
2164 | stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt); | |
d29de1bf DN |
2165 | loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo); |
2166 | struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
61d3cdbb | 2167 | tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo); |
f58e9734 | 2168 | int nunits = TYPE_VECTOR_SUBPARTS (vectype); |
07beea0d | 2169 | enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1)); |
61d3cdbb | 2170 | tree type = TREE_TYPE (init_val); |
f7c1d73d DN |
2171 | tree vecdef; |
2172 | tree def_for_init; | |
2173 | tree init_def; | |
2174 | tree t = NULL_TREE; | |
61d3cdbb | 2175 | int i; |
4090db01 | 2176 | tree vector_type; |
d29de1bf | 2177 | bool nested_in_vect_loop = false; |
61d3cdbb | 2178 | |
a0aa00d7 | 2179 | gcc_assert (POINTER_TYPE_P (type) || INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)); |
d29de1bf DN |
2180 | if (nested_in_vect_loop_p (loop, stmt)) |
2181 | nested_in_vect_loop = true; | |
2182 | else | |
2183 | gcc_assert (loop == (bb_for_stmt (stmt))->loop_father); | |
2184 | ||
f7c1d73d | 2185 | vecdef = vect_get_vec_def_for_operand (init_val, stmt, NULL); |
61d3cdbb DN |
2186 | |
2187 | switch (code) | |
2188 | { | |
20f06221 DN |
2189 | case WIDEN_SUM_EXPR: |
2190 | case DOT_PROD_EXPR: | |
61d3cdbb | 2191 | case PLUS_EXPR: |
a0aa00d7 DN |
2192 | if (nested_in_vect_loop) |
2193 | *adjustment_def = vecdef; | |
9009820b | 2194 | else |
a0aa00d7 DN |
2195 | *adjustment_def = init_val; |
2196 | /* Create a vector of zeros for init_def. */ | |
2197 | if (SCALAR_FLOAT_TYPE_P (type)) | |
f7c1d73d | 2198 | def_for_init = build_real (type, dconst0); |
a0aa00d7 DN |
2199 | else |
2200 | def_for_init = build_int_cst (type, 0); | |
2201 | for (i = nunits - 1; i >= 0; --i) | |
2202 | t = tree_cons (NULL_TREE, def_for_init, t); | |
f7c1d73d | 2203 | vector_type = get_vectype_for_scalar_type (TREE_TYPE (def_for_init)); |
20e545c3 | 2204 | gcc_assert (vector_type); |
f7c1d73d | 2205 | init_def = build_vector (vector_type, t); |
61d3cdbb DN |
2206 | break; |
2207 | ||
2208 | case MIN_EXPR: | |
2209 | case MAX_EXPR: | |
f7c1d73d DN |
2210 | *adjustment_def = NULL_TREE; |
2211 | init_def = vecdef; | |
61d3cdbb DN |
2212 | break; |
2213 | ||
2214 | default: | |
2215 | gcc_unreachable (); | |
2216 | } | |
2217 | ||
f7c1d73d | 2218 | return init_def; |
61d3cdbb DN |
2219 | } |
2220 | ||
2221 | ||
20f06221 | 2222 | /* Function vect_create_epilog_for_reduction |
61d3cdbb DN |
2223 | |
2224 | Create code at the loop-epilog to finalize the result of a reduction | |
20f06221 | 2225 | computation. |
61d3cdbb | 2226 | |
20f06221 DN |
2227 | VECT_DEF is a vector of partial results. |
2228 | REDUC_CODE is the tree-code for the epilog reduction. | |
2229 | STMT is the scalar reduction stmt that is being vectorized. | |
61d3cdbb | 2230 | REDUCTION_PHI is the phi-node that carries the reduction computation. |
61d3cdbb | 2231 | |
20f06221 | 2232 | This function: |
ea2c620c | 2233 | 1. Creates the reduction def-use cycle: sets the arguments for |
20f06221 DN |
2234 | REDUCTION_PHI: |
2235 | The loop-entry argument is the vectorized initial-value of the reduction. | |
2236 | The loop-latch argument is VECT_DEF - the vector of partial sums. | |
2237 | 2. "Reduces" the vector of partial results VECT_DEF into a single result, | |
2238 | by applying the operation specified by REDUC_CODE if available, or by | |
2239 | other means (whole-vector shifts or a scalar loop). | |
2240 | The function also creates a new phi node at the loop exit to preserve | |
2241 | loop-closed form, as illustrated below. | |
2242 | ||
2243 | The flow at the entry to this function: | |
61d3cdbb DN |
2244 | |
2245 | loop: | |
20f06221 | 2246 | vec_def = phi <null, null> # REDUCTION_PHI |
8115817b | 2247 | VECT_DEF = vector_stmt # vectorized form of STMT |
20f06221 | 2248 | s_loop = scalar_stmt # (scalar) STMT |
61d3cdbb | 2249 | loop_exit: |
20f06221 | 2250 | s_out0 = phi <s_loop> # (scalar) EXIT_PHI |
61d3cdbb DN |
2251 | use <s_out0> |
2252 | use <s_out0> | |
2253 | ||
20f06221 | 2254 | The above is transformed by this function into: |
61d3cdbb DN |
2255 | |
2256 | loop: | |
20f06221 DN |
2257 | vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI |
2258 | VECT_DEF = vector_stmt # vectorized form of STMT | |
2259 | s_loop = scalar_stmt # (scalar) STMT | |
61d3cdbb | 2260 | loop_exit: |
20f06221 DN |
2261 | s_out0 = phi <s_loop> # (scalar) EXIT_PHI |
2262 | v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI | |
2263 | v_out2 = reduce <v_out1> | |
61d3cdbb | 2264 | s_out3 = extract_field <v_out2, 0> |
20f06221 DN |
2265 | s_out4 = adjust_result <s_out3> |
2266 | use <s_out4> | |
2267 | use <s_out4> | |
61d3cdbb DN |
2268 | */ |
2269 | ||
2270 | static void | |
20f06221 | 2271 | vect_create_epilog_for_reduction (tree vect_def, tree stmt, |
61d3cdbb DN |
2272 | enum tree_code reduc_code, tree reduction_phi) |
2273 | { | |
2274 | stmt_vec_info stmt_info = vinfo_for_stmt (stmt); | |
20f06221 DN |
2275 | tree vectype; |
2276 | enum machine_mode mode; | |
61d3cdbb DN |
2277 | loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); |
2278 | struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
2279 | basic_block exit_bb; | |
20f06221 DN |
2280 | tree scalar_dest; |
2281 | tree scalar_type; | |
61d3cdbb DN |
2282 | tree new_phi; |
2283 | block_stmt_iterator exit_bsi; | |
2284 | tree vec_dest; | |
d29de1bf | 2285 | tree new_temp = NULL_TREE; |
a6b46ba2 | 2286 | tree new_name; |
d29de1bf DN |
2287 | tree epilog_stmt = NULL_TREE; |
2288 | tree new_scalar_dest, exit_phi, new_dest; | |
a6b46ba2 | 2289 | tree bitsize, bitpos, bytesize; |
07beea0d | 2290 | enum tree_code code = TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 1)); |
d29de1bf | 2291 | tree adjustment_def; |
61d3cdbb DN |
2292 | tree vec_initial_def; |
2293 | tree orig_name; | |
2294 | imm_use_iterator imm_iter; | |
2295 | use_operand_p use_p; | |
d29de1bf DN |
2296 | bool extract_scalar_result = false; |
2297 | tree reduction_op, expr; | |
20f06221 | 2298 | tree orig_stmt; |
6c00f606 | 2299 | tree use_stmt; |
07beea0d | 2300 | tree operation = GIMPLE_STMT_OPERAND (stmt, 1); |
d29de1bf | 2301 | bool nested_in_vect_loop = false; |
20f06221 | 2302 | int op_type; |
71f4a023 DN |
2303 | VEC(tree,heap) *phis = NULL; |
2304 | int i; | |
61d3cdbb | 2305 | |
d29de1bf DN |
2306 | if (nested_in_vect_loop_p (loop, stmt)) |
2307 | { | |
2308 | loop = loop->inner; | |
2309 | nested_in_vect_loop = true; | |
2310 | } | |
2311 | ||
5039610b | 2312 | op_type = TREE_OPERAND_LENGTH (operation); |
20f06221 DN |
2313 | reduction_op = TREE_OPERAND (operation, op_type-1); |
2314 | vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op)); | |
20e545c3 | 2315 | gcc_assert (vectype); |
20f06221 DN |
2316 | mode = TYPE_MODE (vectype); |
2317 | ||
61d3cdbb DN |
2318 | /*** 1. Create the reduction def-use cycle ***/ |
2319 | ||
2320 | /* 1.1 set the loop-entry arg of the reduction-phi: */ | |
2321 | /* For the case of reduction, vect_get_vec_def_for_operand returns | |
2322 | the scalar def before the loop, that defines the initial value | |
2323 | of the reduction variable. */ | |
2324 | vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt, | |
d29de1bf | 2325 | &adjustment_def); |
61d3cdbb DN |
2326 | add_phi_arg (reduction_phi, vec_initial_def, loop_preheader_edge (loop)); |
2327 | ||
61d3cdbb DN |
2328 | /* 1.2 set the loop-latch arg for the reduction-phi: */ |
2329 | add_phi_arg (reduction_phi, vect_def, loop_latch_edge (loop)); | |
2330 | ||
00518cb1 | 2331 | if (vect_print_dump_info (REPORT_DETAILS)) |
61d3cdbb DN |
2332 | { |
2333 | fprintf (vect_dump, "transform reduction: created def-use cycle:"); | |
2334 | print_generic_expr (vect_dump, reduction_phi, TDF_SLIM); | |
2335 | fprintf (vect_dump, "\n"); | |
2336 | print_generic_expr (vect_dump, SSA_NAME_DEF_STMT (vect_def), TDF_SLIM); | |
2337 | } | |
2338 | ||
2339 | ||
20f06221 DN |
2340 | /*** 2. Create epilog code |
2341 | The reduction epilog code operates across the elements of the vector | |
2342 | of partial results computed by the vectorized loop. | |
2343 | The reduction epilog code consists of: | |
2344 | step 1: compute the scalar result in a vector (v_out2) | |
2345 | step 2: extract the scalar result (s_out3) from the vector (v_out2) | |
2346 | step 3: adjust the scalar result (s_out3) if needed. | |
2347 | ||
2348 | Step 1 can be accomplished using one of the following three schemes: | |
2349 | (scheme 1) using reduc_code, if available. | |
2350 | (scheme 2) using whole-vector shifts, if available. | |
2351 | (scheme 3) using a scalar loop. In this case steps 1+2 above are | |
2352 | combined. | |
2353 | ||
2354 | The overall epilog code looks like this: | |
2355 | ||
2356 | s_out0 = phi <s_loop> # original EXIT_PHI | |
2357 | v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI | |
2358 | v_out2 = reduce <v_out1> # step 1 | |
2359 | s_out3 = extract_field <v_out2, 0> # step 2 | |
2360 | s_out4 = adjust_result <s_out3> # step 3 | |
2361 | ||
2362 | (step 3 is optional, and steps 1 and 2 may be combined). | |
2363 | Lastly, the uses of s_out0 are replaced by s_out4. | |
2364 | ||
2365 | ***/ | |
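/* A worked illustration (assuming a 4-element, 128-bit vector of 32-bit
   ints and a PLUS reduction; names are for exposition only).  Scheme 2
   generates:

     v.1 = vec_shift <v_out1, 64>   # shift right by half the vector
     v.2 = v_out1 + v.1
     v.3 = vec_shift <v.2, 32>      # shift right by one element
     v.4 = v.2 + v.3

   after which step 2 extracts the scalar sum from a single element of v.4
   (bit position 0 on little-endian targets).  Scheme 3 instead extracts
   all four elements and adds them with scalar code.  */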
61d3cdbb DN |
2366 | |
2367 | /* 2.1 Create new loop-exit-phi to preserve loop-closed form: | |
2368 | v_out1 = phi <v_loop> */ | |
2369 | ||
ac8f6c69 | 2370 | exit_bb = single_exit (loop)->dest; |
61d3cdbb | 2371 | new_phi = create_phi_node (SSA_NAME_VAR (vect_def), exit_bb); |
ac8f6c69 | 2372 | SET_PHI_ARG_DEF (new_phi, single_exit (loop)->dest_idx, vect_def); |
8b11009b | 2373 | exit_bsi = bsi_after_labels (exit_bb); |
61d3cdbb | 2374 | |
20f06221 | 2375 | /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3 |
8115817b UB |
2376 | (i.e. when reduc_code is not available) and in the final adjustment |
2377 | code (if needed). Also get the original scalar reduction variable as | |
20f06221 DN |
2378 | defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it |
2379 | represents a reduction pattern), the tree-code and scalar-def are | |
2380 | taken from the original stmt that the pattern-stmt (STMT) replaces. | |
2381 | Otherwise (it is a regular reduction) - the tree-code and scalar-def | |
2382 | are taken from STMT. */ | |
2383 | ||
2384 | orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info); | |
2385 | if (!orig_stmt) | |
2386 | { | |
2387 | /* Regular reduction */ | |
2388 | orig_stmt = stmt; | |
2389 | } | |
2390 | else | |
2391 | { | |
2392 | /* Reduction pattern */ | |
2393 | stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt); | |
2394 | gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo)); | |
2395 | gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt); | |
2396 | } | |
07beea0d AH |
2397 | code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1)); |
2398 | scalar_dest = GIMPLE_STMT_OPERAND (orig_stmt, 0); | |
20f06221 | 2399 | scalar_type = TREE_TYPE (scalar_dest); |
a6b46ba2 DN |
2400 | new_scalar_dest = vect_create_destination_var (scalar_dest, NULL); |
2401 | bitsize = TYPE_SIZE (scalar_type); | |
2402 | bytesize = TYPE_SIZE_UNIT (scalar_type); | |
61d3cdbb | 2403 | |
d29de1bf DN |
2404 | |
2405 | /* In case this is a reduction in an inner-loop while vectorizing an outer | |
2406 | loop - we don't need to extract a single scalar result at the end of the | |
2407 | inner-loop. The final vector of partial results will be used in the | |
2408 | vectorized outer-loop, or reduced to a scalar result at the end of the | |
2409 | outer-loop. */ | |
2410 | if (nested_in_vect_loop) | |
2411 | goto vect_finalize_reduction; | |
2412 | ||
20f06221 DN |
2413 | /* 2.3 Create the reduction code, using one of the three schemes described |
2414 | above. */ | |
61d3cdbb | 2415 | |
a6b46ba2 | 2416 | if (reduc_code < NUM_TREE_CODES) |
61d3cdbb | 2417 | { |
ebb07520 RS |
2418 | tree tmp; |
2419 | ||
a6b46ba2 DN |
2420 | /*** Case 1: Create: |
2421 | v_out2 = reduc_expr <v_out1> */ | |
61d3cdbb | 2422 | |
00518cb1 | 2423 | if (vect_print_dump_info (REPORT_DETAILS)) |
a6b46ba2 | 2424 | fprintf (vect_dump, "Reduce using direct vector reduction."); |
61d3cdbb | 2425 | |
a6b46ba2 | 2426 | vec_dest = vect_create_destination_var (scalar_dest, vectype); |
ebb07520 RS |
2427 | tmp = build1 (reduc_code, vectype, PHI_RESULT (new_phi)); |
2428 | epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp); | |
a6b46ba2 | 2429 | new_temp = make_ssa_name (vec_dest, epilog_stmt); |
07beea0d | 2430 | GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp; |
8b11009b | 2431 | bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT); |
61d3cdbb | 2432 | |
a6b46ba2 | 2433 | extract_scalar_result = true; |
a6b46ba2 DN |
2434 | } |
2435 | else | |
2436 | { | |
dfea6c85 | 2437 | enum tree_code shift_code = 0; |
a6b46ba2 | 2438 | bool have_whole_vector_shift = true; |
a6b46ba2 DN |
2439 | int bit_offset; |
2440 | int element_bitsize = tree_low_cst (bitsize, 1); | |
2441 | int vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1); | |
2442 | tree vec_temp; | |
2443 | ||
166cdb08 | 2444 | if (optab_handler (vec_shr_optab, mode)->insn_code != CODE_FOR_nothing) |
a6b46ba2 | 2445 | shift_code = VEC_RSHIFT_EXPR; |
a6b46ba2 DN |
2446 | else |
2447 | have_whole_vector_shift = false; | |
2448 | ||
afc1ab61 RH |
2449 | /* Regardless of whether we have a whole vector shift, if we're |
2450 | emulating the operation via tree-vect-generic, we don't want | |
2451 | to use it. Only the first round of the reduction is likely | |
2452 | to still be profitable via emulation. */ | |
2453 | /* ??? It might be better to emit a reduction tree code here, so that | |
2454 | tree-vect-generic can expand the first round via bit tricks. */ | |
2455 | if (!VECTOR_MODE_P (mode)) | |
2456 | have_whole_vector_shift = false; | |
2457 | else | |
2458 | { | |
2459 | optab optab = optab_for_tree_code (code, vectype); | |
166cdb08 | 2460 | if (optab_handler (optab, mode)->insn_code == CODE_FOR_nothing) |
afc1ab61 RH |
2461 | have_whole_vector_shift = false; |
2462 | } | |
2463 | ||
a6b46ba2 DN |
2464 | if (have_whole_vector_shift) |
2465 | { | |
20f06221 | 2466 | /*** Case 2: Create: |
a6b46ba2 DN |
2467 | for (offset = VS/2; offset >= element_size; offset/=2) |
2468 | { | |
2469 | Create: va' = vec_shift <va, offset> | |
2470 | Create: va = vop <va, va'> | |
2471 | } */ | |
2472 | ||
00518cb1 | 2473 | if (vect_print_dump_info (REPORT_DETAILS)) |
a6b46ba2 DN |
2474 | fprintf (vect_dump, "Reduce using vector shifts"); |
2475 | ||
2476 | vec_dest = vect_create_destination_var (scalar_dest, vectype); | |
2477 | new_temp = PHI_RESULT (new_phi); | |
2478 | ||
2479 | for (bit_offset = vec_size_in_bits/2; | |
2480 | bit_offset >= element_bitsize; | |
2481 | bit_offset /= 2) | |
2482 | { | |
2483 | tree bitpos = size_int (bit_offset); | |
ebb07520 RS |
2484 | tree tmp = build2 (shift_code, vectype, new_temp, bitpos); |
2485 | epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp); | |
a6b46ba2 | 2486 | new_name = make_ssa_name (vec_dest, epilog_stmt); |
07beea0d | 2487 | GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name; |
8b11009b | 2488 | bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT); |
a6b46ba2 | 2489 | |
ebb07520 RS |
2490 | tmp = build2 (code, vectype, new_name, new_temp); |
2491 | epilog_stmt = build_gimple_modify_stmt (vec_dest, tmp); | |
a6b46ba2 | 2492 | new_temp = make_ssa_name (vec_dest, epilog_stmt); |
07beea0d | 2493 | GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp; |
8b11009b | 2494 | bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT); |
a6b46ba2 DN |
2495 | } |
2496 | ||
2497 | extract_scalar_result = true; | |
a6b46ba2 DN |
2498 | } |
2499 | else | |
2500 | { | |
429268fc DN |
2501 | tree rhs; |
2502 | ||
20f06221 | 2503 | /*** Case 3: Create: |
429268fc | 2504 | s = extract_field <v_out2, 0> |
20f06221 DN |
2505 | for (offset = element_size; |
2506 | offset < vector_size; | |
2507 | offset += element_size;) | |
a6b46ba2 DN |
2508 | { |
2509 | Create: s' = extract_field <v_out2, offset> | |
2510 | Create: s = op <s, s'> | |
2511 | } */ | |
2512 | ||
00518cb1 | 2513 | if (vect_print_dump_info (REPORT_DETAILS)) |
a6b46ba2 DN |
2514 | fprintf (vect_dump, "Reduce using scalar code. "); |
2515 | ||
2516 | vec_temp = PHI_RESULT (new_phi); | |
2517 | vec_size_in_bits = tree_low_cst (TYPE_SIZE (vectype), 1); | |
429268fc DN |
2518 | rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize, |
2519 | bitsize_zero_node); | |
429268fc | 2520 | BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type); |
ebb07520 | 2521 | epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs); |
429268fc | 2522 | new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); |
07beea0d | 2523 | GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp; |
8b11009b | 2524 | bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT); |
a6b46ba2 | 2525 | |
429268fc | 2526 | for (bit_offset = element_bitsize; |
a6b46ba2 DN |
2527 | bit_offset < vec_size_in_bits; |
2528 | bit_offset += element_bitsize) | |
2529 | { | |
ebb07520 | 2530 | tree tmp; |
a6b46ba2 | 2531 | tree bitpos = bitsize_int (bit_offset); |
0ed414a4 DN |
2532 | tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize, |
2533 | bitpos); | |
2534 | ||
2535 | BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type); | |
8115817b | 2536 | epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs); |
a6b46ba2 | 2537 | new_name = make_ssa_name (new_scalar_dest, epilog_stmt); |
07beea0d | 2538 | GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_name; |
8b11009b | 2539 | bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT); |
a6b46ba2 | 2540 | |
ebb07520 RS |
2541 | tmp = build2 (code, scalar_type, new_name, new_temp); |
2542 | epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, tmp); | |
a6b46ba2 | 2543 | new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); |
07beea0d | 2544 | GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp; |
8b11009b | 2545 | bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT); |
a6b46ba2 DN |
2546 | } |
2547 | ||
2548 | extract_scalar_result = false; | |
a6b46ba2 DN |
2549 | } |
2550 | } | |
61d3cdbb | 2551 | |
20f06221 | 2552 | /* 2.4 Extract the final scalar result. Create: |
a6b46ba2 | 2553 | s_out3 = extract_field <v_out2, bitpos> */ |
61d3cdbb | 2554 | |
a6b46ba2 DN |
2555 | if (extract_scalar_result) |
2556 | { | |
0ed414a4 DN |
2557 | tree rhs; |
2558 | ||
d29de1bf | 2559 | gcc_assert (!nested_in_vect_loop); |
00518cb1 | 2560 | if (vect_print_dump_info (REPORT_DETAILS)) |
a6b46ba2 DN |
2561 | fprintf (vect_dump, "extract scalar result"); |
2562 | ||
578578a5 | 2563 | if (BYTES_BIG_ENDIAN) |
a6b46ba2 DN |
2564 | bitpos = size_binop (MULT_EXPR, |
2565 | bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1), | |
2566 | TYPE_SIZE (scalar_type)); | |
2567 | else | |
2568 | bitpos = bitsize_zero_node; | |
2569 | ||
0ed414a4 DN |
2570 | rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos); |
2571 | BIT_FIELD_REF_UNSIGNED (rhs) = TYPE_UNSIGNED (scalar_type); | |
ebb07520 | 2572 | epilog_stmt = build_gimple_modify_stmt (new_scalar_dest, rhs); |
a6b46ba2 | 2573 | new_temp = make_ssa_name (new_scalar_dest, epilog_stmt); |
07beea0d | 2574 | GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp; |
8b11009b | 2575 | bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT); |
a6b46ba2 | 2576 | } |
61d3cdbb | 2577 | |
d29de1bf DN |
2578 | vect_finalize_reduction: |
2579 | ||
2580 | /* 2.5 Adjust the final result by the initial value of the reduction | |
20f06221 | 2581 | variable. (When such adjustment is not needed, then |
d29de1bf DN |
2582 | 'adjustment_def' is zero). For example, if code is PLUS we create: |
2583 | new_temp = loop_exit_def + adjustment_def */ | |
61d3cdbb | 2584 | |
d29de1bf | 2585 | if (adjustment_def) |
a6b46ba2 | 2586 | { |
d29de1bf DN |
2587 | if (nested_in_vect_loop) |
2588 | { | |
2589 | gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE); | |
2590 | expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def); | |
2591 | new_dest = vect_create_destination_var (scalar_dest, vectype); | |
2592 | } | |
2593 | else | |
2594 | { | |
2595 | gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE); | |
2596 | expr = build2 (code, scalar_type, new_temp, adjustment_def); | |
2597 | new_dest = vect_create_destination_var (scalar_dest, scalar_type); | |
2598 | } | |
2599 | epilog_stmt = build_gimple_modify_stmt (new_dest, expr); | |
2600 | new_temp = make_ssa_name (new_dest, epilog_stmt); | |
07beea0d | 2601 | GIMPLE_STMT_OPERAND (epilog_stmt, 0) = new_temp; |
8b11009b | 2602 | bsi_insert_before (&exit_bsi, epilog_stmt, BSI_SAME_STMT); |
a6b46ba2 | 2603 | } |
61d3cdbb | 2604 | |
a6b46ba2 | 2605 | |
d29de1bf DN |
2606 | /* 2.6 Handle the loop-exit phi */ |
2607 | ||
2608 | /* Replace uses of s_out0 with uses of s_out3: | |
2609 | Find the loop-closed-use at the loop exit of the original scalar result. | |
20f06221 DN |
2610 | (The reduction result is expected to have two immediate uses - one at the |
2611 | latch block, and one at the loop exit). */ | |
71f4a023 | 2612 | phis = VEC_alloc (tree, heap, 10); |
61d3cdbb DN |
2613 | FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest) |
2614 | { | |
2615 | if (!flow_bb_inside_loop_p (loop, bb_for_stmt (USE_STMT (use_p)))) | |
a6b46ba2 DN |
2616 | { |
2617 | exit_phi = USE_STMT (use_p); | |
71f4a023 | 2618 | VEC_quick_push (tree, phis, exit_phi); |
a6b46ba2 | 2619 | } |
61d3cdbb | 2620 | } |
20f06221 | 2621 | /* We expect to have found an exit_phi because of loop-closed-ssa form. */ |
71f4a023 | 2622 | gcc_assert (!VEC_empty (tree, phis)); |
d29de1bf | 2623 | |
71f4a023 | 2624 | for (i = 0; VEC_iterate (tree, phis, i, exit_phi); i++) |
d29de1bf | 2625 | { |
71f4a023 DN |
2626 | if (nested_in_vect_loop) |
2627 | { | |
2628 | stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi); | |
d29de1bf | 2629 | |
71f4a023 DN |
2630 | /* FORNOW. Currently not supporting the case that an inner-loop reduction |
2631 | is not used in the outer-loop (but only outside the outer-loop). */ | |
2632 | gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo) | |
2633 | && !STMT_VINFO_LIVE_P (stmt_vinfo)); | |
d29de1bf | 2634 | |
71f4a023 DN |
2635 | epilog_stmt = adjustment_def ? epilog_stmt : new_phi; |
2636 | STMT_VINFO_VEC_STMT (stmt_vinfo) = epilog_stmt; | |
2637 | set_stmt_info (get_stmt_ann (epilog_stmt), | |
2638 | new_stmt_vec_info (epilog_stmt, loop_vinfo)); | |
2639 | continue; | |
2640 | } | |
d29de1bf | 2641 | |
71f4a023 DN |
2642 | /* Replace the uses: */ |
2643 | orig_name = PHI_RESULT (exit_phi); | |
2644 | FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name) | |
2645 | FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter) | |
2646 | SET_USE (use_p, new_temp); | |
d29de1bf | 2647 | } |
71f4a023 | 2648 | VEC_free (tree, heap, phis); |
61d3cdbb DN |
2649 | } |
2650 | ||
2651 | ||
2652 | /* Function vectorizable_reduction. | |
2653 | ||
2654 | Check if STMT performs a reduction operation that can be vectorized. | |
2655 | If VEC_STMT is also passed, vectorize the STMT: create a vectorized | |
2656 | stmt to replace it, put it in VEC_STMT, and insert it at BSI. | |
20f06221 DN |
2657 | Return FALSE if not a vectorizable STMT, TRUE otherwise. |
2658 | ||
2659 | This function also handles reduction idioms (patterns) that have been | |
2660 | recognized in advance during vect_pattern_recog. In this case, STMT may be | |
2661 | of this form: | |
2662 | X = pattern_expr (arg0, arg1, ..., X) | |
2663 | and its STMT_VINFO_RELATED_STMT points to the last stmt in the original |
2664 | sequence that had been detected and replaced by the pattern-stmt (STMT). | |
2665 | ||
8115817b | 2666 | In some cases of reduction patterns, the type of the reduction variable X is |
20f06221 DN |
2667 | different than the type of the other arguments of STMT. |
2668 | In such cases, the vectype that is used when transforming STMT into a vector | |
8115817b | 2669 | stmt is different than the vectype that is used to determine the |
20f06221 DN |
2670 | vectorization factor, because it consists of a different number of elements |
2671 | than the actual number of elements that are being operated upon in parallel. | |
2672 | ||
8115817b | 2673 | For example, consider an accumulation of shorts into an int accumulator. |
20f06221 DN |
2674 | On some targets it's possible to vectorize this pattern operating on 8 |
2675 | shorts at a time (hence, the vectype for purposes of determining the | |
2676 | vectorization factor should be V8HI); on the other hand, the vectype that | |
8115817b | 2677 | is used to create the vector form is actually V4SI (the type of the result). |
20f06221 | 2678 | |
8115817b UB |
2679 | Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that |
2680 | indicates what is the actual level of parallelism (V8HI in the example), so | |
2681 | that the right vectorization factor would be derived. This vectype | |
2682 | corresponds to the type of arguments to the reduction stmt, and should *NOT* | |
20f06221 | 2683 | be used to create the vectorized stmt. The right vectype for the vectorized |
8115817b | 2684 | stmt is obtained from the type of the result X: |
20f06221 DN |
2685 | get_vectype_for_scalar_type (TREE_TYPE (X)) |
2686 | ||
8115817b | 2687 | This means that, contrary to "regular" reductions (or "regular" stmts in |
20f06221 DN |
2688 | general), the following equation: |
2689 | STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X)) | |
2690 | does *NOT* necessarily hold for reduction patterns. */ | |
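To make the shorts-into-int discussion above concrete, a hedged source-level sketch follows; the names (in, sum_shorts) and the V8HI/V4SI modes are assumptions, and whether the widen_sum pattern actually fires depends on the target.

  short in[256];

  int
  sum_shorts (void)
  {
    int i, acc = 0;           /* acc is the reduction variable X (int)    */
    for (i = 0; i < 256; i++)
      acc += in[i];           /* short operand, int accumulator: the case
                                 where the VF comes from V8HI while the
                                 vectorized stmt produces V4SI values     */
    return acc;
  }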
61d3cdbb DN |
2691 | |
2692 | bool | |
2693 | vectorizable_reduction (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) | |
2694 | { | |
2695 | tree vec_dest; | |
2696 | tree scalar_dest; | |
20f06221 | 2697 | tree op; |
89d67cca | 2698 | tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE; |
61d3cdbb DN |
2699 | stmt_vec_info stmt_info = vinfo_for_stmt (stmt); |
2700 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); | |
2701 | loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); | |
2702 | struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
2703 | tree operation; | |
20f06221 | 2704 | enum tree_code code, orig_code, epilog_reduc_code = 0; |
61d3cdbb DN |
2705 | enum machine_mode vec_mode; |
2706 | int op_type; | |
2707 | optab optab, reduc_optab; | |
89d67cca | 2708 | tree new_temp = NULL_TREE; |
20f06221 DN |
2709 | tree def, def_stmt; |
2710 | enum vect_def_type dt; | |
61d3cdbb DN |
2711 | tree new_phi; |
2712 | tree scalar_type; | |
20f06221 DN |
2713 | bool is_simple_use; |
2714 | tree orig_stmt; | |
2715 | stmt_vec_info orig_stmt_info; | |
2716 | tree expr = NULL_TREE; | |
2717 | int i; | |
89d67cca DN |
2718 | int nunits = TYPE_VECTOR_SUBPARTS (vectype); |
2719 | int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits; | |
2720 | stmt_vec_info prev_stmt_info; | |
2721 | tree reduc_def; | |
2722 | tree new_stmt = NULL_TREE; | |
2723 | int j; | |
2724 | ||
d29de1bf DN |
2725 | if (nested_in_vect_loop_p (loop, stmt)) |
2726 | { | |
2727 | loop = loop->inner; | |
2728 | /* FORNOW. This restriction should be relaxed. */ | |
2729 | if (ncopies > 1) | |
2730 | { | |
2731 | if (vect_print_dump_info (REPORT_DETAILS)) | |
2732 | fprintf (vect_dump, "multiple types in nested loop."); | |
2733 | return false; | |
2734 | } | |
2735 | } | |
2736 | ||
89d67cca | 2737 | gcc_assert (ncopies >= 1); |
61d3cdbb | 2738 | |
805e2059 IR |
2739 | /* FORNOW: SLP not supported. */ |
2740 | if (STMT_SLP_TYPE (stmt_info)) | |
2741 | return false; | |
2742 | ||
20f06221 | 2743 | /* 1. Is vectorizable reduction? */ |
61d3cdbb DN |
2744 | |
2745 | /* Not supportable if the reduction variable is used in the loop. */ | |
d29de1bf | 2746 | if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer) |
61d3cdbb DN |
2747 | return false; |
2748 | ||
d29de1bf DN |
2749 | /* Reductions that are not used even in an enclosing outer-loop
2750 | are expected to be "live" (used out of the loop). */ | |
2751 | if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_loop | |
2752 | && !STMT_VINFO_LIVE_P (stmt_info)) | |
61d3cdbb DN |
2753 | return false; |
2754 | ||
20f06221 | 2755 | /* Make sure it was already recognized as a reduction computation. */ |
61d3cdbb DN |
2756 | if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def) |
2757 | return false; | |
2758 | ||
20f06221 DN |
2759 | /* 2. Has this been recognized as a reduction pattern? |
2760 | ||
2761 | Check if STMT represents a pattern that has been recognized | |
2762 | in earlier analysis stages. For stmts that represent a pattern, | |
2763 | the STMT_VINFO_RELATED_STMT field records the last stmt in | |
2764 | the original sequence that constitutes the pattern. */ | |
2765 | ||
2766 | orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info); | |
2767 | if (orig_stmt) | |
2768 | { | |
2769 | orig_stmt_info = vinfo_for_stmt (orig_stmt); | |
2770 | gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt); | |
2771 | gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info)); | |
2772 | gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info)); | |
2773 | } | |
2774 | ||
2775 | /* 3. Check the operands of the operation. The first operands are defined | |
2776 | inside the loop body. The last operand is the reduction variable, | |
2777 | which is defined by the loop-header-phi. */ | |
2778 | ||
07beea0d | 2779 | gcc_assert (TREE_CODE (stmt) == GIMPLE_MODIFY_STMT); |
61d3cdbb | 2780 | |
07beea0d | 2781 | operation = GIMPLE_STMT_OPERAND (stmt, 1); |
61d3cdbb | 2782 | code = TREE_CODE (operation); |
5039610b | 2783 | op_type = TREE_OPERAND_LENGTH (operation); |
20f06221 | 2784 | if (op_type != binary_op && op_type != ternary_op) |
61d3cdbb | 2785 | return false; |
07beea0d | 2786 | scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0); |
61d3cdbb | 2787 | scalar_type = TREE_TYPE (scalar_dest); |
a0aa00d7 DN |
2788 | if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type) |
2789 | && !SCALAR_FLOAT_TYPE_P (scalar_type)) | |
2790 | return false; | |
61d3cdbb | 2791 | |
20f06221 DN |
2792 | /* All uses but the last are expected to be defined in the loop. |
2793 | The last use is the reduction variable. */ | |
2794 | for (i = 0; i < op_type-1; i++) | |
2795 | { | |
2796 | op = TREE_OPERAND (operation, i); | |
2797 | is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt); | |
2798 | gcc_assert (is_simple_use); | |
fbf798fc DN |
2799 | if (dt != vect_loop_def |
2800 | && dt != vect_invariant_def | |
2801 | && dt != vect_constant_def | |
2802 | && dt != vect_induction_def) | |
2803 | return false; | |
20f06221 | 2804 | } |
61d3cdbb | 2805 | |
20f06221 DN |
2806 | op = TREE_OPERAND (operation, i); |
2807 | is_simple_use = vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt); | |
2808 | gcc_assert (is_simple_use); | |
2809 | gcc_assert (dt == vect_reduction_def); | |
2810 | gcc_assert (TREE_CODE (def_stmt) == PHI_NODE); | |
2811 | if (orig_stmt) | |
d29de1bf | 2812 | gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo, def_stmt)); |
20f06221 | 2813 | else |
d29de1bf | 2814 | gcc_assert (stmt == vect_is_simple_reduction (loop_vinfo, def_stmt)); |
20f06221 DN |
2815 | |
2816 | if (STMT_VINFO_LIVE_P (vinfo_for_stmt (def_stmt))) | |
2817 | return false; | |
61d3cdbb | 2818 | |
20f06221 | 2819 | /* 4. Supportable by target? */ |
61d3cdbb | 2820 | |
20f06221 | 2821 | /* 4.1. check support for the operation in the loop */ |
61d3cdbb DN |
2822 | optab = optab_for_tree_code (code, vectype); |
2823 | if (!optab) | |
2824 | { | |
00518cb1 | 2825 | if (vect_print_dump_info (REPORT_DETAILS)) |
61d3cdbb DN |
2826 | fprintf (vect_dump, "no optab."); |
2827 | return false; | |
2828 | } | |
2829 | vec_mode = TYPE_MODE (vectype); | |
166cdb08 | 2830 | if (optab_handler (optab, vec_mode)->insn_code == CODE_FOR_nothing) |
61d3cdbb | 2831 | { |
00518cb1 | 2832 | if (vect_print_dump_info (REPORT_DETAILS)) |
61d3cdbb | 2833 | fprintf (vect_dump, "op not supported by target."); |
afc1ab61 RH |
2834 | if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD |
2835 | || LOOP_VINFO_VECT_FACTOR (loop_vinfo) | |
2836 | < vect_min_worthwhile_factor (code)) | |
2837 | return false; | |
00518cb1 | 2838 | if (vect_print_dump_info (REPORT_DETAILS)) |
afc1ab61 RH |
2839 | fprintf (vect_dump, "proceeding using word mode."); |
2840 | } | |
2841 | ||
2842 | /* Worthwhile without SIMD support? */ | |
2843 | if (!VECTOR_MODE_P (TYPE_MODE (vectype)) | |
2844 | && LOOP_VINFO_VECT_FACTOR (loop_vinfo) | |
2845 | < vect_min_worthwhile_factor (code)) | |
2846 | { | |
00518cb1 | 2847 | if (vect_print_dump_info (REPORT_DETAILS)) |
afc1ab61 | 2848 | fprintf (vect_dump, "not worthwhile without SIMD support."); |
61d3cdbb DN |
2849 | return false; |
2850 | } | |
2851 | ||
20f06221 DN |
2852 | /* 4.2. Check support for the epilog operation. |
2853 | ||
2854 | If STMT represents a reduction pattern, then the type of the | |
2855 | reduction variable may be different than the type of the rest | |
2856 | of the arguments. For example, consider the case of accumulation | |
2857 | of shorts into an int accumulator; The original code: | |
2858 | S1: int_a = (int) short_a; | |
2859 | orig_stmt-> S2: int_acc = plus <int_a ,int_acc>; | |
2860 | ||
2861 | was replaced with: | |
2862 | STMT: int_acc = widen_sum <short_a, int_acc> | |
2863 | ||
2864 | This means that: | |
2865 | 1. The tree-code that is used to create the vector operation in the | |
2866 | epilog code (that reduces the partial results) is not the | |
2867 | tree-code of STMT, but is rather the tree-code of the original | |
2868 | stmt from the pattern that STMT is replacing. I.e., in the example
2869 | above we want to use 'widen_sum' in the loop, but 'plus' in the | |
2870 | epilog. | |
2871 | 2. The type (mode) we use to check available target support | |
2872 | for the vector operation to be created in the *epilog*, is | |
2873 | determined by the type of the reduction variable (in the example | |
2874 | above we'd check this: plus_optab[vect_int_mode]). | |
2875 | However the type (mode) we use to check available target support | |
2876 | for the vector operation to be created *inside the loop*, is | |
2877 | determined by the type of the other arguments to STMT (in the | |
2878 | example we'd check this: widen_sum_optab[vect_short_mode]). | |
2879 | ||
2880 | This is contrary to "regular" reductions, in which the types of all | |
2881 | the arguments are the same as the type of the reduction variable. | |
2882 | For "regular" reductions we can therefore use the same vector type | |
2883 | (and also the same tree-code) when generating the epilog code and | |
2884 | when generating the code inside the loop. */ | |
2885 | ||
2886 | if (orig_stmt) | |
2887 | { | |
2888 | /* This is a reduction pattern: get the vectype from the type of the | |
2889 | reduction variable, and get the tree-code from orig_stmt. */ | |
07beea0d | 2890 | orig_code = TREE_CODE (GIMPLE_STMT_OPERAND (orig_stmt, 1)); |
20f06221 | 2891 | vectype = get_vectype_for_scalar_type (TREE_TYPE (def)); |
20e545c3 IR |
2892 | if (!vectype) |
2893 | { | |
2894 | if (vect_print_dump_info (REPORT_DETAILS)) | |
2895 | { | |
2896 | fprintf (vect_dump, "unsupported data-type "); | |
2897 | print_generic_expr (vect_dump, TREE_TYPE (def), TDF_SLIM); | |
2898 | } | |
2899 | return false; | |
2900 | } | |
2901 | ||
20f06221 DN |
2902 | vec_mode = TYPE_MODE (vectype); |
2903 | } | |
2904 | else | |
2905 | { | |
2906 | /* Regular reduction: the vectype and tree-code that are used for
2907 | the vector code inside the loop can also be used for the epilog code. */ |
2908 | orig_code = code; | |
2909 | } | |
2910 | ||
2911 | if (!reduction_code_for_scalar_code (orig_code, &epilog_reduc_code)) | |
61d3cdbb | 2912 | return false; |
20f06221 | 2913 | reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype); |
61d3cdbb DN |
2914 | if (!reduc_optab) |
2915 | { | |
00518cb1 | 2916 | if (vect_print_dump_info (REPORT_DETAILS)) |
61d3cdbb | 2917 | fprintf (vect_dump, "no optab for reduction."); |
20f06221 | 2918 | epilog_reduc_code = NUM_TREE_CODES; |
61d3cdbb | 2919 | } |
166cdb08 | 2920 | if (optab_handler (reduc_optab, vec_mode)->insn_code == CODE_FOR_nothing) |
61d3cdbb | 2921 | { |
00518cb1 | 2922 | if (vect_print_dump_info (REPORT_DETAILS)) |
a6b46ba2 | 2923 | fprintf (vect_dump, "reduc op not supported by target."); |
20f06221 | 2924 | epilog_reduc_code = NUM_TREE_CODES; |
61d3cdbb DN |
2925 | } |
2926 | ||
2927 | if (!vec_stmt) /* transformation not required. */ | |
2928 | { | |
2929 | STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type; | |
20e545c3 IR |
2930 | if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies)) |
2931 | return false; | |
61d3cdbb DN |
2932 | return true; |
2933 | } | |
2934 | ||
2935 | /** Transform. **/ | |
2936 | ||
00518cb1 | 2937 | if (vect_print_dump_info (REPORT_DETAILS)) |
61d3cdbb DN |
2938 | fprintf (vect_dump, "transform reduction."); |
2939 | ||
2940 | /* Create the destination vector */ | |
2941 | vec_dest = vect_create_destination_var (scalar_dest, vectype); | |
2942 | ||
61d3cdbb DN |
2943 | /* Create the reduction-phi that defines the reduction-operand. */ |
2944 | new_phi = create_phi_node (vec_dest, loop->header); | |
2945 | ||
89d67cca DN |
2946 | /* In case the vectorization factor (VF) is bigger than the number |
2947 | of elements that we can fit in a vectype (nunits), we have to generate | |
2948 | more than one vector stmt - i.e - we need to "unroll" the | |
2949 | vector stmt by a factor VF/nunits. For more details see documentation | |
2950 | in vectorizable_operation. */ | |
2951 | ||
2952 | prev_stmt_info = NULL; | |
2953 | for (j = 0; j < ncopies; j++) | |
20f06221 | 2954 | { |
89d67cca DN |
2955 | /* Handle uses. */ |
2956 | if (j == 0) | |
2957 | { | |
2958 | op = TREE_OPERAND (operation, 0); | |
2959 | loop_vec_def0 = vect_get_vec_def_for_operand (op, stmt, NULL); | |
2960 | if (op_type == ternary_op) | |
2961 | { | |
2962 | op = TREE_OPERAND (operation, 1); | |
2963 | loop_vec_def1 = vect_get_vec_def_for_operand (op, stmt, NULL); | |
2964 | } | |
8115817b | 2965 | |
89d67cca DN |
2966 | /* Get the vector def for the reduction variable from the phi node */ |
2967 | reduc_def = PHI_RESULT (new_phi); | |
2968 | } | |
2969 | else | |
2970 | { | |
2971 | enum vect_def_type dt = vect_unknown_def_type; /* Dummy */ | |
2972 | loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def0); | |
2973 | if (op_type == ternary_op) | |
2974 | loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt, loop_vec_def1); | |
8115817b | 2975 | |
89d67cca DN |
2976 | /* Get the vector def for the reduction variable from the vectorized |
2977 | reduction operation generated in the previous iteration (j-1) */ | |
07beea0d | 2978 | reduc_def = GIMPLE_STMT_OPERAND (new_stmt, 0);
89d67cca | 2979 | } |
8115817b | 2980 | |
89d67cca | 2981 | /* Arguments are ready. Create the new vector stmt. */
89d67cca DN |
2982 | if (op_type == binary_op) |
2983 | expr = build2 (code, vectype, loop_vec_def0, reduc_def); | |
2984 | else | |
2985 | expr = build3 (code, vectype, loop_vec_def0, loop_vec_def1, | |
8115817b | 2986 | reduc_def); |
ebb07520 | 2987 | new_stmt = build_gimple_modify_stmt (vec_dest, expr); |
89d67cca | 2988 | new_temp = make_ssa_name (vec_dest, new_stmt); |
07beea0d | 2989 | GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp; |
89d67cca | 2990 | vect_finish_stmt_generation (stmt, new_stmt, bsi); |
8115817b | 2991 | |
89d67cca DN |
2992 | if (j == 0) |
2993 | STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt; | |
2994 | else | |
2995 | STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt; | |
2996 | prev_stmt_info = vinfo_for_stmt (new_stmt); | |
20f06221 | 2997 | } |
8115817b | 2998 | |
61d3cdbb DN |
2999 | /* Finalize the reduction-phi (set its arguments) and create the
3000 | epilog reduction code. */ | |
8115817b | 3001 | vect_create_epilog_for_reduction (new_temp, stmt, epilog_reduc_code, new_phi); |
61d3cdbb DN |
3002 | return true; |
3003 | } | |
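For reference, a minimal loop of the kind that takes the plain (non-pattern) path through vectorizable_reduction; the names are illustrative only, and the loop is a sketch rather than a testsuite case.

  int a[1024];

  int
  sum_ints (void)
  {
    int i, s = 0;             /* s is defined by the loop-header phi      */
    for (i = 0; i < 1024; i++)
      s = s + a[i];           /* binary_op whose last operand is the
                                 reduction variable                       */
    return s;                 /* loop-exit use handled by the epilog      */
  }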
3004 | ||
2505a3f2 | 3005 | /* Checks if CALL can be vectorized in type VECTYPE. Returns |
b95becfc RG |
3006 | a function declaration if the target has a vectorized version |
3007 | of the function, or NULL_TREE if the function cannot be vectorized. */ | |
2505a3f2 | 3008 | |
b95becfc RG |
3009 | tree |
3010 | vectorizable_function (tree call, tree vectype_out, tree vectype_in) | |
2505a3f2 RG |
3011 | { |
3012 | tree fndecl = get_callee_fndecl (call); | |
b95becfc | 3013 | enum built_in_function code; |
2505a3f2 RG |
3014 | |
3015 | /* We only handle functions that do not read or clobber memory -- i.e. | |
3016 | const or novops ones. */ | |
3017 | if (!(call_expr_flags (call) & (ECF_CONST | ECF_NOVOPS))) | |
b95becfc | 3018 | return NULL_TREE; |
2505a3f2 RG |
3019 | |
3020 | if (!fndecl | |
3021 | || TREE_CODE (fndecl) != FUNCTION_DECL | |
3022 | || !DECL_BUILT_IN (fndecl)) | |
b95becfc | 3023 | return NULL_TREE; |
2505a3f2 | 3024 | |
b95becfc RG |
3025 | code = DECL_FUNCTION_CODE (fndecl); |
3026 | return targetm.vectorize.builtin_vectorized_function (code, vectype_out, | |
3027 | vectype_in); | |
2505a3f2 RG |
3028 | } |
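A hedged example of the kind of call vectorizable_function can accept: a builtin with no memory side effects. Whether a vector variant exists is entirely up to the target hook, and sqrtf is only treated as const when errno handling is disabled, so this is a sketch, not a guarantee.

  float x[1024], y[1024];

  void
  vec_sqrt (void)
  {
    int i;
    for (i = 0; i < 1024; i++)
      y[i] = __builtin_sqrtf (x[i]);  /* const/novops only without errno,
                                         e.g. under -fno-math-errno       */
  }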
3029 | ||
3030 | /* Function vectorizable_call. | |
3031 | ||
3032 | Check if STMT performs a function call that can be vectorized. | |
3033 | If VEC_STMT is also passed, vectorize the STMT: create a vectorized | |
3034 | stmt to replace it, put it in VEC_STMT, and insert it at BSI. | |
3035 | Return FALSE if not a vectorizable STMT, TRUE otherwise. */ | |
3036 | ||
3037 | bool | |
3038 | vectorizable_call (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) | |
3039 | { | |
3040 | tree vec_dest; | |
3041 | tree scalar_dest; | |
3042 | tree operation; | |
5039610b | 3043 | tree op, type; |
b40c4f68 | 3044 | tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE; |
b95becfc RG |
3045 | stmt_vec_info stmt_info = vinfo_for_stmt (stmt), prev_stmt_info; |
3046 | tree vectype_out, vectype_in; | |
b40c4f68 UB |
3047 | int nunits_in; |
3048 | int nunits_out; | |
2505a3f2 | 3049 | loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); |
d29de1bf | 3050 | struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
b95becfc | 3051 | tree fndecl, rhs, new_temp, def, def_stmt, rhs_type, lhs_type; |
3a70f3ef | 3052 | enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type}; |
b40c4f68 | 3053 | tree new_stmt; |
b95becfc | 3054 | int ncopies, j, nargs; |
5039610b | 3055 | call_expr_arg_iterator iter; |
b40c4f68 UB |
3056 | tree vargs; |
3057 | enum { NARROW, NONE, WIDEN } modifier; | |
2505a3f2 | 3058 | |
60555ced DN |
3059 | if (!STMT_VINFO_RELEVANT_P (stmt_info)) |
3060 | return false; | |
3061 | ||
3062 | if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def) | |
3063 | return false; | |
3064 | ||
805e2059 IR |
3065 | /* FORNOW: SLP not supported. */ |
3066 | if (STMT_SLP_TYPE (stmt_info)) | |
3067 | return false; | |
3068 | ||
2505a3f2 | 3069 | /* Is STMT a vectorizable call? */ |
07beea0d | 3070 | if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT) |
2505a3f2 RG |
3071 | return false; |
3072 | ||
07beea0d | 3073 | if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME) |
2505a3f2 RG |
3074 | return false; |
3075 | ||
07beea0d | 3076 | operation = GIMPLE_STMT_OPERAND (stmt, 1); |
2505a3f2 RG |
3077 | if (TREE_CODE (operation) != CALL_EXPR) |
3078 | return false; | |
b95becfc RG |
3079 | |
3080 | /* Process function arguments. */ | |
3081 | rhs_type = NULL_TREE; | |
5039610b SL |
3082 | nargs = 0; |
3083 | FOR_EACH_CALL_EXPR_ARG (op, iter, operation) | |
b95becfc | 3084 | { |
b95becfc RG |
3085 | /* Bail out if the function has more than two arguments; we
3086 | do not have interesting builtin functions to vectorize with | |
3087 | more than two arguments. */ | |
b40c4f68 | 3088 | if (nargs >= 2) |
b95becfc RG |
3089 | return false; |
3090 | ||
3091 | /* We can only handle calls with arguments of the same type. */ | |
3092 | if (rhs_type | |
3093 | && rhs_type != TREE_TYPE (op)) | |
3094 | { | |
3095 | if (vect_print_dump_info (REPORT_DETAILS)) | |
3096 | fprintf (vect_dump, "argument types differ."); | |
3097 | return false; | |
3098 | } | |
3099 | rhs_type = TREE_TYPE (op); | |
3100 | ||
b40c4f68 | 3101 | if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[nargs])) |
b95becfc RG |
3102 | { |
3103 | if (vect_print_dump_info (REPORT_DETAILS)) | |
3104 | fprintf (vect_dump, "use not simple."); | |
3105 | return false; | |
3106 | } | |
b40c4f68 UB |
3107 | |
3108 | ++nargs; | |
b95becfc RG |
3109 | } |
3110 | ||
3111 | /* No arguments is also not good. */ | |
3112 | if (nargs == 0) | |
3113 | return false; | |
3114 | ||
3115 | vectype_in = get_vectype_for_scalar_type (rhs_type); | |
6d3bf849 UB |
3116 | if (!vectype_in) |
3117 | return false; | |
b40c4f68 | 3118 | nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in); |
b95becfc RG |
3119 | |
3120 | lhs_type = TREE_TYPE (GIMPLE_STMT_OPERAND (stmt, 0)); | |
3121 | vectype_out = get_vectype_for_scalar_type (lhs_type); | |
6d3bf849 UB |
3122 | if (!vectype_out) |
3123 | return false; | |
b40c4f68 UB |
3124 | nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out); |
3125 | ||
3126 | /* FORNOW */ | |
3127 | if (nunits_in == nunits_out / 2) | |
3128 | modifier = NARROW; | |
3129 | else if (nunits_out == nunits_in) | |
3130 | modifier = NONE; | |
3131 | else if (nunits_out == nunits_in / 2) | |
3132 | modifier = WIDEN; | |
3133 | else | |
b95becfc RG |
3134 | return false; |
3135 | ||
2505a3f2 RG |
3136 | /* For now, we only vectorize functions if a target specific builtin |
3137 | is available. TODO -- in some cases, it might be profitable to | |
3138 | insert the calls for pieces of the vector, in order to be able | |
3139 | to vectorize other operations in the loop. */ | |
b95becfc RG |
3140 | fndecl = vectorizable_function (operation, vectype_out, vectype_in); |
3141 | if (fndecl == NULL_TREE) | |
2505a3f2 RG |
3142 | { |
3143 | if (vect_print_dump_info (REPORT_DETAILS)) | |
3144 | fprintf (vect_dump, "function is not vectorizable."); | |
3145 | ||
3146 | return false; | |
3147 | } | |
2505a3f2 | 3148 | |
b95becfc | 3149 | gcc_assert (ZERO_SSA_OPERANDS (stmt, SSA_OP_ALL_VIRTUALS)); |
2505a3f2 | 3150 | |
b40c4f68 UB |
3151 | if (modifier == NARROW) |
3152 | ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out; | |
3153 | else | |
3154 | ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in; | |
3155 | ||
3156 | /* Sanity check: make sure that at least one copy of the vectorized stmt | |
3157 | needs to be generated. */ | |
3158 | gcc_assert (ncopies >= 1); | |
792ed98b | 3159 | |
d29de1bf DN |
3160 | /* FORNOW. This restriction should be relaxed. */ |
3161 | if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1) | |
3162 | { | |
3163 | if (vect_print_dump_info (REPORT_DETAILS)) | |
3164 | fprintf (vect_dump, "multiple types in nested loop."); | |
3165 | return false; | |
3166 | } | |
3167 | ||
2505a3f2 RG |
3168 | if (!vec_stmt) /* transformation not required. */ |
3169 | { | |
3170 | STMT_VINFO_TYPE (stmt_info) = call_vec_info_type; | |
792ed98b HJ |
3171 | if (vect_print_dump_info (REPORT_DETAILS)) |
3172 | fprintf (vect_dump, "=== vectorizable_call ==="); | |
805e2059 | 3173 | vect_model_simple_cost (stmt_info, ncopies, dt, NULL); |
2505a3f2 RG |
3174 | return true; |
3175 | } | |
3176 | ||
3177 | /** Transform. **/ | |
3178 | ||
3179 | if (vect_print_dump_info (REPORT_DETAILS)) | |
3180 | fprintf (vect_dump, "transform operation."); | |
3181 | ||
d29de1bf DN |
3182 | /* FORNOW. This restriction should be relaxed. */ |
3183 | if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1) | |
3184 | { | |
3185 | if (vect_print_dump_info (REPORT_DETAILS)) | |
3186 | fprintf (vect_dump, "multiple types in nested loop."); | |
3187 | return false; | |
3188 | } | |
3189 | ||
2505a3f2 | 3190 | /* Handle def. */ |
07beea0d | 3191 | scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0); |
b95becfc | 3192 | vec_dest = vect_create_destination_var (scalar_dest, vectype_out); |
2505a3f2 | 3193 | |
b95becfc | 3194 | prev_stmt_info = NULL; |
b40c4f68 | 3195 | switch (modifier) |
2505a3f2 | 3196 | { |
b40c4f68 UB |
3197 | case NONE: |
3198 | for (j = 0; j < ncopies; ++j) | |
b95becfc | 3199 | { |
b40c4f68 UB |
3200 | /* Build argument list for the vectorized call. */ |
3201 | /* FIXME: Rewrite this so that it doesn't | |
3202 | construct a temporary list. */ | |
3203 | vargs = NULL_TREE; | |
3204 | nargs = 0; | |
3205 | FOR_EACH_CALL_EXPR_ARG (op, iter, operation) | |
3206 | { | |
3207 | if (j == 0) | |
3208 | vec_oprnd0 | |
3209 | = vect_get_vec_def_for_operand (op, stmt, NULL); | |
3210 | else | |
3211 | vec_oprnd0 | |
3212 | = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0); | |
3213 | ||
3214 | vargs = tree_cons (NULL_TREE, vec_oprnd0, vargs); | |
3215 | ||
3216 | ++nargs; | |
3217 | } | |
3218 | vargs = nreverse (vargs); | |
3219 | ||
3220 | rhs = build_function_call_expr (fndecl, vargs); | |
3221 | new_stmt = build_gimple_modify_stmt (vec_dest, rhs); | |
3222 | new_temp = make_ssa_name (vec_dest, new_stmt); | |
3223 | GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp; | |
3224 | ||
3225 | vect_finish_stmt_generation (stmt, new_stmt, bsi); | |
b95becfc RG |
3226 | |
3227 | if (j == 0) | |
b40c4f68 | 3228 | STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt; |
b95becfc | 3229 | else |
b40c4f68 | 3230 | STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt; |
b95becfc | 3231 | |
b40c4f68 | 3232 | prev_stmt_info = vinfo_for_stmt (new_stmt); |
b95becfc | 3233 | } |
b95becfc | 3234 | |
b40c4f68 | 3235 | break; |
2505a3f2 | 3236 | |
b40c4f68 UB |
3237 | case NARROW: |
3238 | for (j = 0; j < ncopies; ++j) | |
3239 | { | |
3240 | /* Build argument list for the vectorized call. */ | |
3241 | /* FIXME: Rewrite this so that it doesn't | |
3242 | construct a temporary list. */ | |
3243 | vargs = NULL_TREE; | |
3244 | nargs = 0; | |
3245 | FOR_EACH_CALL_EXPR_ARG (op, iter, operation) | |
3246 | { | |
3247 | if (j == 0) | |
3248 | { | |
3249 | vec_oprnd0 | |
3250 | = vect_get_vec_def_for_operand (op, stmt, NULL); | |
3251 | vec_oprnd1 | |
3252 | = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0); | |
3253 | } | |
3254 | else | |
3255 | { | |
3256 | vec_oprnd0 | |
3257 | = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd1); | |
3258 | vec_oprnd1 | |
3259 | = vect_get_vec_def_for_stmt_copy (dt[nargs], vec_oprnd0); | |
3260 | } | |
3261 | ||
3262 | vargs = tree_cons (NULL_TREE, vec_oprnd0, vargs); | |
3263 | vargs = tree_cons (NULL_TREE, vec_oprnd1, vargs); | |
3264 | ||
3265 | ++nargs; | |
3266 | } | |
3267 | vargs = nreverse (vargs); | |
2505a3f2 | 3268 | |
b40c4f68 UB |
3269 | rhs = build_function_call_expr (fndecl, vargs); |
3270 | new_stmt = build_gimple_modify_stmt (vec_dest, rhs); | |
3271 | new_temp = make_ssa_name (vec_dest, new_stmt); | |
3272 | GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp; | |
3273 | ||
3274 | vect_finish_stmt_generation (stmt, new_stmt, bsi); | |
3275 | ||
3276 | if (j == 0) | |
3277 | STMT_VINFO_VEC_STMT (stmt_info) = new_stmt; | |
3278 | else | |
3279 | STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt; | |
3280 | ||
3281 | prev_stmt_info = vinfo_for_stmt (new_stmt); | |
3282 | } | |
3283 | ||
3284 | *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info); | |
3285 | ||
3286 | break; | |
3287 | ||
3288 | case WIDEN: | |
3289 | /* No current target implements this case. */ | |
3290 | return false; | |
b95becfc | 3291 | } |
2505a3f2 | 3292 | |
b40c4f68 UB |
3293 | /* The call in STMT might prevent it from being removed in dce. |
3294 | However, we cannot remove it here, due to the way the ssa name
3295 | it defines is mapped to the new definition. So just replace the
3296 | rhs of the statement with something harmless. */ |
2505a3f2 | 3297 | type = TREE_TYPE (scalar_dest); |
9f919563 | 3298 | GIMPLE_STMT_OPERAND (stmt, 1) = fold_convert (type, integer_zero_node); |
1344284e | 3299 | update_stmt (stmt); |
2505a3f2 RG |
3300 | |
3301 | return true; | |
3302 | } | |
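Similarly, a sketch of a two-argument call that would take the NONE-modifier path of vectorizable_call (both arguments share one type and nunits_in equals nunits_out); copysignf is merely a plausible candidate, and support depends on the target hook.

  float a2[256], b2[256], c2[256];

  void
  vec_copysign (void)
  {
    int i;
    for (i = 0; i < 256; i++)
      c2[i] = __builtin_copysignf (a2[i], b2[i]);  /* two same-type args  */
  }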
3303 | ||
61d3cdbb | 3304 | |
d9987fb4 UB |
3305 | /* Function vect_gen_widened_results_half |
3306 | ||
3307 | Create a vector stmt whose code, type, number of arguments, and result | |
3308 | variable are CODE, VECTYPE, OP_TYPE, and VEC_DEST, and its arguments are | |
3309 | VEC_OPRND0 and VEC_OPRND1. The new vector stmt is to be inserted at BSI. | |
3310 | In the case that CODE is a CALL_EXPR, this means that a call to DECL | |
3311 | needs to be created (DECL is a function-decl of a target-builtin). | |
3312 | STMT is the original scalar stmt that we are vectorizing. */ | |
3313 | ||
3314 | static tree | |
3315 | vect_gen_widened_results_half (enum tree_code code, tree vectype, tree decl, | |
3316 | tree vec_oprnd0, tree vec_oprnd1, int op_type, | |
3317 | tree vec_dest, block_stmt_iterator *bsi, | |
3318 | tree stmt) | |
3319 | { | |
3320 | tree expr; | |
3321 | tree new_stmt; | |
3322 | tree new_temp; | |
3323 | tree sym; | |
3324 | ssa_op_iter iter; | |
3325 | ||
3326 | /* Generate half of the widened result: */ | |
3327 | if (code == CALL_EXPR) | |
3328 | { | |
3329 | /* Target specific support */ | |
3330 | if (op_type == binary_op) | |
3331 | expr = build_call_expr (decl, 2, vec_oprnd0, vec_oprnd1); | |
3332 | else | |
3333 | expr = build_call_expr (decl, 1, vec_oprnd0); | |
3334 | } | |
3335 | else | |
3336 | { | |
3337 | /* Generic support */ | |
3338 | gcc_assert (op_type == TREE_CODE_LENGTH (code)); | |
3339 | if (op_type == binary_op) | |
3340 | expr = build2 (code, vectype, vec_oprnd0, vec_oprnd1); | |
3341 | else | |
3342 | expr = build1 (code, vectype, vec_oprnd0); | |
3343 | } | |
3344 | new_stmt = build_gimple_modify_stmt (vec_dest, expr); | |
3345 | new_temp = make_ssa_name (vec_dest, new_stmt); | |
3346 | GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp; | |
3347 | vect_finish_stmt_generation (stmt, new_stmt, bsi); | |
3348 | ||
3349 | if (code == CALL_EXPR) | |
3350 | { | |
3351 | FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, SSA_OP_ALL_VIRTUALS) | |
3352 | { | |
3353 | if (TREE_CODE (sym) == SSA_NAME) | |
3354 | sym = SSA_NAME_VAR (sym); | |
3355 | mark_sym_for_renaming (sym); | |
3356 | } | |
3357 | } | |
3358 | ||
3359 | return new_stmt; | |
3360 | } | |
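As an illustration of when vect_gen_widened_results_half is invoked twice per copy, consider a conversion whose result elements are twice as wide as its inputs; the V4SI/V2DF modes below are an assumption about the target, and the names are illustrative.

  int isrc[512];
  double ddst[512];

  void
  int_to_double (void)
  {
    int i;
    for (i = 0; i < 512; i++)
      ddst[i] = (double) isrc[i];  /* FLOAT_EXPR; with V4SI inputs each
                                      copy yields a low and a high V2DF
                                      half (code1/decl1 and code2/decl2)  */
  }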
3361 | ||
3362 | ||
805e2059 IR |
3363 | /* Check if STMT performs a conversion operation that can be vectorized.
3364 | If VEC_STMT is also passed, vectorize the STMT: create a vectorized | |
3365 | stmt to replace it, put it in VEC_STMT, and insert it at BSI. | |
3366 | Return FALSE if not a vectorizable STMT, TRUE otherwise. */ | |
f57d17f1 TM |
3367 | |
3368 | bool | |
805e2059 IR |
3369 | vectorizable_conversion (tree stmt, block_stmt_iterator *bsi, |
3370 | tree *vec_stmt, slp_tree slp_node) | |
f57d17f1 TM |
3371 | { |
3372 | tree vec_dest; | |
3373 | tree scalar_dest; | |
3374 | tree operation; | |
3375 | tree op0; | |
d9987fb4 | 3376 | tree vec_oprnd0 = NULL_TREE, vec_oprnd1 = NULL_TREE; |
f57d17f1 TM |
3377 | stmt_vec_info stmt_info = vinfo_for_stmt (stmt); |
3378 | loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); | |
d29de1bf | 3379 | struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
8ff43db0 | 3380 | enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK; |
d9987fb4 | 3381 | tree decl1 = NULL_TREE, decl2 = NULL_TREE; |
f57d17f1 TM |
3382 | tree new_temp; |
3383 | tree def, def_stmt; | |
805e2059 IR |
3384 | enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type}; |
3385 | tree new_stmt = NULL_TREE; | |
d9987fb4 | 3386 | stmt_vec_info prev_stmt_info; |
f57d17f1 TM |
3387 | int nunits_in; |
3388 | int nunits_out; | |
f57d17f1 | 3389 | tree vectype_out, vectype_in; |
d9987fb4 UB |
3390 | int ncopies, j; |
3391 | tree expr; | |
f57d17f1 | 3392 | tree rhs_type, lhs_type; |
5039610b | 3393 | tree builtin_decl; |
d9987fb4 | 3394 | enum { NARROW, NONE, WIDEN } modifier; |
805e2059 IR |
3395 | int i; |
3396 | VEC(tree,heap) *vec_oprnds0 = NULL; | |
3397 | tree vop0; | |
f57d17f1 TM |
3398 | |
3399 | /* Is STMT a vectorizable conversion? */ | |
3400 | ||
3401 | if (!STMT_VINFO_RELEVANT_P (stmt_info)) | |
3402 | return false; | |
3403 | ||
60555ced DN |
3404 | if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def) |
3405 | return false; | |
f57d17f1 | 3406 | |
f57d17f1 TM |
3407 | if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT) |
3408 | return false; | |
3409 | ||
3410 | if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME) | |
3411 | return false; | |
3412 | ||
3413 | operation = GIMPLE_STMT_OPERAND (stmt, 1); | |
3414 | code = TREE_CODE (operation); | |
3415 | if (code != FIX_TRUNC_EXPR && code != FLOAT_EXPR) | |
3416 | return false; | |
3417 | ||
805e2059 | 3418 | /* Check types of lhs and rhs. */ |
f57d17f1 TM |
3419 | op0 = TREE_OPERAND (operation, 0); |
3420 | rhs_type = TREE_TYPE (op0); | |
3421 | vectype_in = get_vectype_for_scalar_type (rhs_type); | |
4934454b DN |
3422 | if (!vectype_in) |
3423 | return false; | |
f57d17f1 TM |
3424 | nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in); |
3425 | ||
3426 | scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0); | |
3427 | lhs_type = TREE_TYPE (scalar_dest); | |
3428 | vectype_out = get_vectype_for_scalar_type (lhs_type); | |
4934454b DN |
3429 | if (!vectype_out) |
3430 | return false; | |
f57d17f1 TM |
3431 | nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out); |
3432 | ||
d9987fb4 UB |
3433 | /* FORNOW */ |
3434 | if (nunits_in == nunits_out / 2) | |
3435 | modifier = NARROW; | |
3436 | else if (nunits_out == nunits_in) | |
3437 | modifier = NONE; | |
3438 | else if (nunits_out == nunits_in / 2) | |
3439 | modifier = WIDEN; | |
3440 | else | |
f57d17f1 TM |
3441 | return false; |
3442 | ||
d9987fb4 UB |
3443 | if (modifier == NONE) |
3444 | gcc_assert (STMT_VINFO_VECTYPE (stmt_info) == vectype_out); | |
3445 | ||
805e2059 | 3446 | /* Bail out if the types are both integral or non-integral. */ |
f57d17f1 TM |
3447 | if ((INTEGRAL_TYPE_P (rhs_type) && INTEGRAL_TYPE_P (lhs_type)) |
3448 | || (!INTEGRAL_TYPE_P (rhs_type) && !INTEGRAL_TYPE_P (lhs_type))) | |
3449 | return false; | |
3450 | ||
d9987fb4 UB |
3451 | if (modifier == NARROW) |
3452 | ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out; | |
3453 | else | |
3454 | ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in; | |
3455 | ||
805e2059 IR |
3456 | /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies |
3457 | this, so we can safely override NCOPIES with 1 here. */ | |
3458 | if (slp_node) | |
3459 | ncopies = 1; | |
3460 | ||
f57d17f1 TM |
3461 | /* Sanity check: make sure that at least one copy of the vectorized stmt |
3462 | needs to be generated. */ | |
f57d17f1 TM |
3463 | gcc_assert (ncopies >= 1); |
3464 | ||
d29de1bf DN |
3465 | /* FORNOW. This restriction should be relaxed. */ |
3466 | if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1) | |
3467 | { | |
3468 | if (vect_print_dump_info (REPORT_DETAILS)) | |
3469 | fprintf (vect_dump, "multiple types in nested loop."); | |
3470 | return false; | |
3471 | } | |
3472 | ||
d9987fb4 | 3473 | /* Check the operands of the operation. */ |
805e2059 | 3474 | if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0])) |
f57d17f1 TM |
3475 | { |
3476 | if (vect_print_dump_info (REPORT_DETAILS)) | |
3477 | fprintf (vect_dump, "use not simple."); | |
3478 | return false; | |
3479 | } | |
3480 | ||
3481 | /* Supportable by target? */ | |
d9987fb4 UB |
3482 | if ((modifier == NONE |
3483 | && !targetm.vectorize.builtin_conversion (code, vectype_in)) | |
3484 | || (modifier == WIDEN | |
3485 | && !supportable_widening_operation (code, stmt, vectype_in, | |
3486 | &decl1, &decl2, | |
3487 | &code1, &code2)) | |
3488 | || (modifier == NARROW | |
3489 | && !supportable_narrowing_operation (code, stmt, vectype_in, | |
3490 | &code1))) | |
f57d17f1 TM |
3491 | { |
3492 | if (vect_print_dump_info (REPORT_DETAILS)) | |
3493 | fprintf (vect_dump, "op not supported by target."); | |
3494 | return false; | |
3495 | } | |
3496 | ||
d9987fb4 | 3497 | if (modifier != NONE) |
805e2059 IR |
3498 | { |
3499 | STMT_VINFO_VECTYPE (stmt_info) = vectype_in; | |
3500 | /* FORNOW: SLP not supported. */ | |
3501 | if (STMT_SLP_TYPE (stmt_info)) | |
3502 | return false; | |
3503 | } | |
d9987fb4 | 3504 | |
f57d17f1 TM |
3505 | if (!vec_stmt) /* transformation not required. */ |
3506 | { | |
3507 | STMT_VINFO_TYPE (stmt_info) = type_conversion_vec_info_type; | |
3508 | return true; | |
3509 | } | |
3510 | ||
d9987fb4 | 3511 | /** Transform. **/ |
f57d17f1 TM |
3512 | if (vect_print_dump_info (REPORT_DETAILS)) |
3513 | fprintf (vect_dump, "transform conversion."); | |
3514 | ||
3515 | /* Handle def. */ | |
3516 | vec_dest = vect_create_destination_var (scalar_dest, vectype_out); | |
3517 | ||
805e2059 IR |
3518 | if (modifier == NONE && !slp_node) |
3519 | vec_oprnds0 = VEC_alloc (tree, heap, 1); | |
3520 | ||
f57d17f1 | 3521 | prev_stmt_info = NULL; |
d9987fb4 | 3522 | switch (modifier) |
f57d17f1 | 3523 | { |
d9987fb4 UB |
3524 | case NONE: |
3525 | for (j = 0; j < ncopies; j++) | |
3526 | { | |
3527 | tree sym; | |
3528 | ssa_op_iter iter; | |
f57d17f1 | 3529 | |
d9987fb4 | 3530 | if (j == 0) |
805e2059 | 3531 | vect_get_vec_defs (op0, NULL, stmt, &vec_oprnds0, NULL, slp_node); |
d9987fb4 | 3532 | else |
805e2059 | 3533 | vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, NULL); |
d9987fb4 UB |
3534 | |
3535 | builtin_decl = | |
3536 | targetm.vectorize.builtin_conversion (code, vectype_in); | |
805e2059 IR |
3537 | for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++) |
3538 | { | |
3539 | new_stmt = build_call_expr (builtin_decl, 1, vop0); | |
d9987fb4 | 3540 | |
805e2059 IR |
3541 | /* Arguments are ready. Create the new vector stmt. */
3542 | new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt); | |
3543 | new_temp = make_ssa_name (vec_dest, new_stmt); | |
3544 | GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp; | |
3545 | vect_finish_stmt_generation (stmt, new_stmt, bsi); | |
3546 | FOR_EACH_SSA_TREE_OPERAND (sym, new_stmt, iter, | |
3547 | SSA_OP_ALL_VIRTUALS) | |
3548 | { | |
3549 | if (TREE_CODE (sym) == SSA_NAME) | |
3550 | sym = SSA_NAME_VAR (sym); | |
3551 | mark_sym_for_renaming (sym); | |
3552 | } | |
3553 | if (slp_node) | |
3554 | VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), new_stmt); | |
d9987fb4 | 3555 | } |
f57d17f1 | 3556 | |
d9987fb4 UB |
3557 | if (j == 0) |
3558 | STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt; | |
3559 | else | |
3560 | STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt; | |
3561 | prev_stmt_info = vinfo_for_stmt (new_stmt); | |
3562 | } | |
3563 | break; | |
3564 | ||
3565 | case WIDEN: | |
3566 | /* In case the vectorization factor (VF) is bigger than the number | |
3567 | of elements that we can fit in a vectype (nunits), we have to | |
3568 | generate more than one vector stmt - i.e. - we need to "unroll"
3569 | the vector stmt by a factor VF/nunits. */ | |
3570 | for (j = 0; j < ncopies; j++) | |
3571 | { | |
3572 | if (j == 0) | |
3573 | vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL); | |
3574 | else | |
805e2059 | 3575 | vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0); |
f57d17f1 | 3576 | |
d9987fb4 | 3577 | STMT_VINFO_VECTYPE (stmt_info) = vectype_in; |
f57d17f1 | 3578 | |
d9987fb4 UB |
3579 | /* Generate first half of the widened result: */ |
3580 | new_stmt | |
3581 | = vect_gen_widened_results_half (code1, vectype_out, decl1, | |
3582 | vec_oprnd0, vec_oprnd1, | |
3583 | unary_op, vec_dest, bsi, stmt); | |
3584 | if (j == 0) | |
3585 | STMT_VINFO_VEC_STMT (stmt_info) = new_stmt; | |
3586 | else | |
3587 | STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt; | |
3588 | prev_stmt_info = vinfo_for_stmt (new_stmt); | |
3589 | ||
3590 | /* Generate second half of the widened result: */ | |
3591 | new_stmt | |
3592 | = vect_gen_widened_results_half (code2, vectype_out, decl2, | |
3593 | vec_oprnd0, vec_oprnd1, | |
3594 | unary_op, vec_dest, bsi, stmt); | |
3595 | STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt; | |
3596 | prev_stmt_info = vinfo_for_stmt (new_stmt); | |
3597 | } | |
3598 | break; | |
3599 | ||
3600 | case NARROW: | |
3601 | /* In case the vectorization factor (VF) is bigger than the number | |
3602 | of elements that we can fit in a vectype (nunits), we have to | |
3603 | generate more than one vector stmt - i.e. - we need to "unroll"
3604 | the vector stmt by a factor VF/nunits. */ | |
3605 | for (j = 0; j < ncopies; j++) | |
3606 | { | |
3607 | /* Handle uses. */ | |
3608 | if (j == 0) | |
3609 | { | |
3610 | vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL); | |
805e2059 | 3611 | vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0); |
d9987fb4 UB |
3612 | } |
3613 | else | |
3614 | { | |
805e2059 IR |
3615 | vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd1); |
3616 | vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0); | |
d9987fb4 UB |
3617 | } |
3618 | ||
3619 | /* Arguments are ready. Create the new vector stmt. */ | |
3620 | expr = build2 (code1, vectype_out, vec_oprnd0, vec_oprnd1); | |
3621 | new_stmt = build_gimple_modify_stmt (vec_dest, expr); | |
3622 | new_temp = make_ssa_name (vec_dest, new_stmt); | |
3623 | GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp; | |
3624 | vect_finish_stmt_generation (stmt, new_stmt, bsi); | |
3625 | ||
3626 | if (j == 0) | |
3627 | STMT_VINFO_VEC_STMT (stmt_info) = new_stmt; | |
3628 | else | |
3629 | STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt; | |
3630 | ||
3631 | prev_stmt_info = vinfo_for_stmt (new_stmt); | |
3632 | } | |
3633 | ||
3634 | *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info); | |
f57d17f1 | 3635 | } |
805e2059 | 3636 | |
f57d17f1 TM |
3637 | return true; |
3638 | } | |
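And a same-width conversion sketch for the NONE modifier of vectorizable_conversion, which relies on targetm.vectorize.builtin_conversion providing a suitable builtin (target-dependent); the V4SI/V4SF modes are assumed.

  int csrc[1024];
  float cdst[1024];

  void
  int_to_float (void)
  {
    int i;
    for (i = 0; i < 1024; i++)
      cdst[i] = (float) csrc[i];  /* FLOAT_EXPR; nunits_in == nunits_out,
                                     so modifier == NONE                  */
  }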
3639 | ||
3640 | ||
f7064d11 DN |
3641 | /* Function vectorizable_assignment. |
3642 | ||
3643 | Check if STMT performs an assignment (copy) that can be vectorized. | |
3644 | If VEC_STMT is also passed, vectorize the STMT: create a vectorized | |
3645 | stmt to replace it, put it in VEC_STMT, and insert it at BSI. | |
3646 | Return FALSE if not a vectorizable STMT, TRUE otherwise. */ | |
3647 | ||
3648 | bool | |
805e2059 IR |
3649 | vectorizable_assignment (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt, |
3650 | slp_tree slp_node) | |
f7064d11 DN |
3651 | { |
3652 | tree vec_dest; | |
3653 | tree scalar_dest; | |
3654 | tree op; | |
f7064d11 DN |
3655 | stmt_vec_info stmt_info = vinfo_for_stmt (stmt); |
3656 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); | |
3657 | loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); | |
3658 | tree new_temp; | |
88088c03 | 3659 | tree def, def_stmt; |
3a70f3ef | 3660 | enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type}; |
89d67cca DN |
3661 | int nunits = TYPE_VECTOR_SUBPARTS (vectype); |
3662 | int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits; | |
805e2059 IR |
3663 | int i; |
3664 | VEC(tree,heap) *vec_oprnds = NULL; | |
3665 | tree vop; | |
89d67cca DN |
3666 | |
3667 | gcc_assert (ncopies >= 1); | |
3668 | if (ncopies > 1) | |
3669 | return false; /* FORNOW */ | |
f7064d11 | 3670 | |
88088c03 DN |
3671 | if (!STMT_VINFO_RELEVANT_P (stmt_info)) |
3672 | return false; | |
3673 | ||
60555ced DN |
3674 | if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def) |
3675 | return false; | |
3676 | ||
60555ced | 3677 | /* Is vectorizable assignment? */ |
07beea0d | 3678 | if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT) |
f7064d11 DN |
3679 | return false; |
3680 | ||
07beea0d | 3681 | scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0); |
f7064d11 DN |
3682 | if (TREE_CODE (scalar_dest) != SSA_NAME) |
3683 | return false; | |
3684 | ||
07beea0d | 3685 | op = GIMPLE_STMT_OPERAND (stmt, 1); |
3a70f3ef | 3686 | if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt[0])) |
f7064d11 | 3687 | { |
00518cb1 | 3688 | if (vect_print_dump_info (REPORT_DETAILS)) |
f7064d11 DN |
3689 | fprintf (vect_dump, "use not simple."); |
3690 | return false; | |
3691 | } | |
3692 | ||
3693 | if (!vec_stmt) /* transformation not required. */ | |
3694 | { | |
3695 | STMT_VINFO_TYPE (stmt_info) = assignment_vec_info_type; | |
792ed98b HJ |
3696 | if (vect_print_dump_info (REPORT_DETAILS)) |
3697 | fprintf (vect_dump, "=== vectorizable_assignment ==="); | |
805e2059 | 3698 | vect_model_simple_cost (stmt_info, ncopies, dt, NULL); |
f7064d11 DN |
3699 | return true; |
3700 | } | |
3701 | ||
3702 | /** Transform. **/ | |
00518cb1 | 3703 | if (vect_print_dump_info (REPORT_DETAILS)) |
f7064d11 DN |
3704 | fprintf (vect_dump, "transform assignment."); |
3705 | ||
3706 | /* Handle def. */ | |
3707 | vec_dest = vect_create_destination_var (scalar_dest, vectype); | |
3708 | ||
3709 | /* Handle use. */ | |
805e2059 | 3710 | vect_get_vec_defs (op, NULL, stmt, &vec_oprnds, NULL, slp_node); |
f7064d11 DN |
3711 | |
3712 | /* Arguments are ready. Create the new vector stmt. */
805e2059 IR |
3713 | for (i = 0; VEC_iterate (tree, vec_oprnds, i, vop); i++) |
3714 | { | |
3715 | *vec_stmt = build_gimple_modify_stmt (vec_dest, vop); | |
3716 | new_temp = make_ssa_name (vec_dest, *vec_stmt); | |
3717 | GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp; | |
3718 | vect_finish_stmt_generation (stmt, *vec_stmt, bsi); | |
3719 | STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt; | |
3720 | ||
3721 | if (slp_node) | |
3722 | VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), *vec_stmt); | |
3723 | } | |
f7064d11 | 3724 | |
805e2059 | 3725 | VEC_free (tree, heap, vec_oprnds); |
f7064d11 DN |
3726 | return true; |
3727 | } | |
3728 | ||
3729 | ||
c4336539 PB |
3730 | /* Function vect_min_worthwhile_factor. |
3731 | ||
3732 | For a loop where we could vectorize the operation indicated by CODE, | |
3733 | return the minimum vectorization factor that makes it worthwhile | |
3734 | to use generic vectors. */ | |
3735 | static int | |
3736 | vect_min_worthwhile_factor (enum tree_code code) | |
3737 | { | |
3738 | switch (code) | |
3739 | { | |
3740 | case PLUS_EXPR: | |
3741 | case MINUS_EXPR: | |
3742 | case NEGATE_EXPR: | |
3743 | return 4; | |
3744 | ||
3745 | case BIT_AND_EXPR: | |
3746 | case BIT_IOR_EXPR: | |
3747 | case BIT_XOR_EXPR: | |
3748 | case BIT_NOT_EXPR: | |
3749 | return 2; | |
3750 | ||
3751 | default: | |
3752 | return INT_MAX; | |
3753 | } | |
3754 | } | |
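A short worked illustration of how these thresholds combine with the vectorization-factor checks earlier in this file; the scenarios are hypothetical, while the thresholds follow directly from the switch above.

  /* VF = 2, PLUS_EXPR    : threshold 4, 2 < 4  -> not worthwhile without SIMD */
  /* VF = 4, PLUS_EXPR    : threshold 4, 4 >= 4 -> proceed using word mode     */
  /* VF = 2, BIT_AND_EXPR : threshold 2, 2 >= 2 -> proceed using word mode     */
  /* any VF, other codes  : threshold INT_MAX   -> never without SIMD support  */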
3755 | ||
88088c03 | 3756 | |
cd38ca7f DN |
3757 | /* Function vectorizable_induction |
3758 | ||
3759 | Check if PHI performs an induction computation that can be vectorized. | |
3760 | If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized | |
3761 | phi to replace it, put it in VEC_STMT, and add it to the same basic block. | |
3762 | Return FALSE if not a vectorizable STMT, TRUE otherwise. */ | |
3763 | ||
3764 | bool | |
3765 | vectorizable_induction (tree phi, block_stmt_iterator *bsi ATTRIBUTE_UNUSED, | |
3766 | tree *vec_stmt) | |
3767 | { | |
3768 | stmt_vec_info stmt_info = vinfo_for_stmt (phi); | |
3769 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); | |
3770 | loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); | |
3771 | int nunits = TYPE_VECTOR_SUBPARTS (vectype); | |
3772 | int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits; | |
3773 | tree vec_def; | |
3774 | ||
3775 | gcc_assert (ncopies >= 1); | |
3776 | ||
3777 | if (!STMT_VINFO_RELEVANT_P (stmt_info)) | |
3778 | return false; | |
3779 | ||
805e2059 IR |
3780 | /* FORNOW: SLP not supported. */ |
3781 | if (STMT_SLP_TYPE (stmt_info)) | |
3782 | return false; | |
3783 | ||
cd38ca7f DN |
3784 | gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def); |
3785 | ||
cd38ca7f DN |
3786 | if (TREE_CODE (phi) != PHI_NODE) |
3787 | return false; | |
3788 | ||
3789 | if (!vec_stmt) /* transformation not required. */ | |
3790 | { | |
3791 | STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type; | |
792ed98b HJ |
3792 | if (vect_print_dump_info (REPORT_DETAILS)) |
3793 | fprintf (vect_dump, "=== vectorizable_induction ==="); | |
3794 | vect_model_induction_cost (stmt_info, ncopies); | |
cd38ca7f DN |
3795 | return true; |
3796 | } | |
3797 | ||
3798 | /** Transform. **/ | |
3799 | ||
3800 | if (vect_print_dump_info (REPORT_DETAILS)) | |
3801 | fprintf (vect_dump, "transform induction phi."); | |
3802 | ||
3803 | vec_def = get_initial_def_for_induction (phi); | |
3804 | *vec_stmt = SSA_NAME_DEF_STMT (vec_def); | |
3805 | return true; | |
3806 | } | |
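A minimal loop whose loop-header phi is an induction in the sense handled by vectorizable_induction; the stored value of i is what makes the phi relevant, and the names are illustrative.

  int iota_out[256];

  void
  iota (void)
  {
    int i;
    for (i = 0; i < 256; i++)
      iota_out[i] = i;        /* i is used as a value, so its loop-header
                                 phi is vectorized via
                                 get_initial_def_for_induction            */
  }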
3807 | ||
3808 | ||
f7064d11 DN |
3809 | /* Function vectorizable_operation. |
3810 | ||
3811 | Check if STMT performs a binary or unary operation that can be vectorized. | |
3812 | If VEC_STMT is also passed, vectorize the STMT: create a vectorized | |
3813 | stmt to replace it, put it in VEC_STMT, and insert it at BSI. | |
3814 | Return FALSE if not a vectorizable STMT, TRUE otherwise. */ | |
3815 | ||
3816 | bool | |
805e2059 IR |
3817 | vectorizable_operation (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt, |
3818 | slp_tree slp_node) | |
f7064d11 DN |
3819 | { |
3820 | tree vec_dest; | |
3821 | tree scalar_dest; | |
3822 | tree operation; | |
3823 | tree op0, op1 = NULL; | |
805e2059 | 3824 | tree vec_oprnd1 = NULL_TREE; |
f7064d11 DN |
3825 | stmt_vec_info stmt_info = vinfo_for_stmt (stmt); |
3826 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); | |
3827 | loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); | |
d29de1bf | 3828 | struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
f7064d11 DN |
3829 | enum tree_code code; |
3830 | enum machine_mode vec_mode; | |
3831 | tree new_temp; | |
3832 | int op_type; | |
f7064d11 | 3833 | optab optab; |
b2d16a23 UB |
3834 | int icode; |
3835 | enum machine_mode optab_op2_mode; | |
88088c03 | 3836 | tree def, def_stmt; |
3a70f3ef | 3837 | enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type}; |
805e2059 | 3838 | tree new_stmt = NULL_TREE; |
89d67cca DN |
3839 | stmt_vec_info prev_stmt_info; |
3840 | int nunits_in = TYPE_VECTOR_SUBPARTS (vectype); | |
3841 | int nunits_out; | |
3842 | tree vectype_out; | |
3843 | int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in; | |
805e2059 IR |
3844 | int j, i; |
3845 | VEC(tree,heap) *vec_oprnds0 = NULL, *vec_oprnds1 = NULL; | |
3846 | tree vop0, vop1; | |
50d76c24 IR |
3847 | unsigned int k; |
3848 | bool scalar_shift_arg = false; | |
805e2059 IR |
3849 | |
3850 | /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies | |
3851 | this, so we can safely override NCOPIES with 1 here. */ | |
3852 | if (slp_node) | |
3853 | ncopies = 1; | |
89d67cca | 3854 | gcc_assert (ncopies >= 1); |
d29de1bf DN |
3855 | /* FORNOW. This restriction should be relaxed. */ |
3856 | if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1) | |
3857 | { | |
3858 | if (vect_print_dump_info (REPORT_DETAILS)) | |
3859 | fprintf (vect_dump, "multiple types in nested loop."); | |
3860 | return false; | |
3861 | } | |
f7064d11 | 3862 | |
88088c03 DN |
3863 | if (!STMT_VINFO_RELEVANT_P (stmt_info)) |
3864 | return false; | |
3865 | ||
60555ced DN |
3866 | if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def) |
3867 | return false; | |
88088c03 | 3868 | |
60555ced | 3869 | /* Is STMT a vectorizable binary/unary operation? */ |
07beea0d | 3870 | if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT) |
f7064d11 DN |
3871 | return false; |
3872 | ||
07beea0d | 3873 | if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME) |
f7064d11 DN |
3874 | return false; |
3875 | ||
07beea0d | 3876 | scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0); |
89d67cca | 3877 | vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest)); |
6d3bf849 UB |
3878 | if (!vectype_out) |
3879 | return false; | |
89d67cca DN |
3880 | nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out); |
3881 | if (nunits_out != nunits_in) | |
3882 | return false; | |
3883 | ||
07beea0d | 3884 | operation = GIMPLE_STMT_OPERAND (stmt, 1); |
f7064d11 | 3885 | code = TREE_CODE (operation); |
2caf766b AP |
3886 | |
3887 | /* For pointer addition, we should use the normal plus for | |
3888 | the vector addition. */ | |
3889 | if (code == POINTER_PLUS_EXPR) | |
3890 | code = PLUS_EXPR; | |
3891 | ||
f7064d11 DN |
3892 | optab = optab_for_tree_code (code, vectype); |
3893 | ||
3894 | /* Support only unary or binary operations. */ | |
5039610b | 3895 | op_type = TREE_OPERAND_LENGTH (operation); |
f7064d11 DN |
3896 | if (op_type != unary_op && op_type != binary_op) |
3897 | { | |
00518cb1 | 3898 | if (vect_print_dump_info (REPORT_DETAILS)) |
f7064d11 DN |
3899 | fprintf (vect_dump, "num. args = %d (not unary/binary op).", op_type); |
3900 | return false; | |
3901 | } | |
3902 | ||
89d67cca | 3903 | op0 = TREE_OPERAND (operation, 0); |
3a70f3ef | 3904 | if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0])) |
f7064d11 | 3905 | { |
89d67cca DN |
3906 | if (vect_print_dump_info (REPORT_DETAILS)) |
3907 | fprintf (vect_dump, "use not simple."); | |
3908 | return false; | |
3909 | } | |
8115817b | 3910 | |
89d67cca DN |
3911 | if (op_type == binary_op) |
3912 | { | |
3913 | op1 = TREE_OPERAND (operation, 1); | |
3a70f3ef | 3914 | if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1])) |
f7064d11 | 3915 | { |
00518cb1 | 3916 | if (vect_print_dump_info (REPORT_DETAILS)) |
f7064d11 DN |
3917 | fprintf (vect_dump, "use not simple."); |
3918 | return false; | |
89d67cca DN |
3919 | } |
3920 | } | |
f7064d11 DN |
3921 | |
3922 | /* Supportable by target? */ | |
3923 | if (!optab) | |
3924 | { | |
00518cb1 | 3925 | if (vect_print_dump_info (REPORT_DETAILS)) |
f7064d11 DN |
3926 | fprintf (vect_dump, "no optab."); |
3927 | return false; | |
3928 | } | |
3929 | vec_mode = TYPE_MODE (vectype); | |
166cdb08 | 3930 | icode = (int) optab_handler (optab, vec_mode)->insn_code; |
b2d16a23 | 3931 | if (icode == CODE_FOR_nothing) |
f7064d11 | 3932 | { |
00518cb1 | 3933 | if (vect_print_dump_info (REPORT_DETAILS)) |
f7064d11 | 3934 | fprintf (vect_dump, "op not supported by target."); |
712f1172 | 3935 | /* Check only during analysis. */ |
598b2024 | 3936 | if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD |
712f1172 IR |
3937 | || (LOOP_VINFO_VECT_FACTOR (loop_vinfo) |
3938 | < vect_min_worthwhile_factor (code) | |
3939 | && !vec_stmt)) | |
598b2024 | 3940 | return false; |
00518cb1 | 3941 | if (vect_print_dump_info (REPORT_DETAILS)) |
598b2024 | 3942 | fprintf (vect_dump, "proceeding using word mode."); |
f7064d11 DN |
3943 | } |
3944 | ||
712f1172 | 3945 | /* Worthwhile without SIMD support? Check only during analysis. */ |
c4336539 PB |
3946 | if (!VECTOR_MODE_P (TYPE_MODE (vectype)) |
3947 | && LOOP_VINFO_VECT_FACTOR (loop_vinfo) | |
712f1172 IR |
3948 | < vect_min_worthwhile_factor (code) |
3949 | && !vec_stmt) | |
c4336539 | 3950 | { |
00518cb1 | 3951 | if (vect_print_dump_info (REPORT_DETAILS)) |
c4336539 PB |
3952 | fprintf (vect_dump, "not worthwhile without SIMD support."); |
3953 | return false; | |
3954 | } | |
3955 | ||
b2d16a23 UB |
3956 | if (code == LSHIFT_EXPR || code == RSHIFT_EXPR) |
3957 | { | |
3958 | /* FORNOW: not yet supported. */ | |
3959 | if (!VECTOR_MODE_P (vec_mode)) | |
3960 | return false; | |
3961 | ||
3962 | /* Invariant argument is needed for a vector shift | |
3963 | by a scalar shift operand. */ | |
3964 | optab_op2_mode = insn_data[icode].operand[2].mode; | |
50d76c24 | 3965 | if (!VECTOR_MODE_P (optab_op2_mode)) |
b2d16a23 | 3966 | { |
50d76c24 IR |
3967 | if (dt[1] != vect_constant_def && dt[1] != vect_invariant_def) |
3968 | { | |
3969 | if (vect_print_dump_info (REPORT_DETAILS)) | |
3970 | fprintf (vect_dump, "operand mode requires invariant" | |
3971 | " argument."); | |
3972 | return false; | |
3973 | } | |
3974 | ||
3975 | scalar_shift_arg = true; | |
3976 | } | |
b2d16a23 UB |
3977 | } |
3978 | ||
f7064d11 DN |
3979 | if (!vec_stmt) /* transformation not required. */ |
3980 | { | |
3981 | STMT_VINFO_TYPE (stmt_info) = op_vec_info_type; | |
792ed98b HJ |
3982 | if (vect_print_dump_info (REPORT_DETAILS)) |
3983 | fprintf (vect_dump, "=== vectorizable_operation ==="); | |
805e2059 | 3984 | vect_model_simple_cost (stmt_info, ncopies, dt, NULL); |
f7064d11 DN |
3985 | return true; |
3986 | } | |
3987 | ||
3988 | /** Transform. **/ | |
3989 | ||
00518cb1 | 3990 | if (vect_print_dump_info (REPORT_DETAILS)) |
f7064d11 DN |
3991 | fprintf (vect_dump, "transform binary/unary operation."); |
3992 | ||
3993 | /* Handle def. */ | |
f7064d11 DN |
3994 | vec_dest = vect_create_destination_var (scalar_dest, vectype); |
3995 | ||
50d76c24 IR |
3996 | /* Allocate VECs for vector operands. In case of SLP, vector operands are |
3997 | created in the previous stages of the recursion, so no allocation is | |
3998 | needed, except for the case of shift with scalar shift argument. In that | |
3999 | case we store the scalar operand in VEC_OPRNDS1 for every vector stmt to | |
4000 | be created to vectorize the SLP group, i.e., SLP_NODE->VEC_STMTS_SIZE. | |
4001 | In case of loop-based vectorization we allocate VECs of size 1. We | |
4002 | allocate VEC_OPRNDS1 only in case of binary operation. */ | |
805e2059 | 4003 | if (!slp_node) |
50d76c24 IR |
4004 | { |
4005 | vec_oprnds0 = VEC_alloc (tree, heap, 1); | |
4006 | if (op_type == binary_op) | |
4007 | vec_oprnds1 = VEC_alloc (tree, heap, 1); | |
4008 | } | |
4009 | else if (scalar_shift_arg) | |
4010 | vec_oprnds1 = VEC_alloc (tree, heap, slp_node->vec_stmts_size); | |
805e2059 | 4011 | |
89d67cca DN |
4012 | /* In case the vectorization factor (VF) is bigger than the number |
4013 | of elements that we can fit in a vectype (nunits), we have to generate | |
4014 | more than one vector stmt - i.e - we need to "unroll" the | |
4015 | vector stmt by a factor VF/nunits. In doing so, we record a pointer | |
4016 | from one copy of the vector stmt to the next, in the field | |
4017 | STMT_VINFO_RELATED_STMT. This is necessary in order to allow following | |
4018 | stages to find the correct vector defs to be used when vectorizing | |
4019 | stmts that use the defs of the current stmt. The example below illustrates | |
4020 | the vectorization process when VF=16 and nunits=4 (i.e - we need to create | |
4021 | 4 vectorized stmts): | |
8115817b | 4022 | |
89d67cca DN |
4023 | before vectorization: |
4024 | RELATED_STMT VEC_STMT | |
4025 | S1: x = memref - - | |
4026 | S2: z = x + 1 - - | |
8115817b | 4027 | |
89d67cca DN |
4028 | step 1: vectorize stmt S1 (done in vectorizable_load. See more details |
4029 | there): | |
4030 | RELATED_STMT VEC_STMT | |
4031 | VS1_0: vx0 = memref0 VS1_1 - | |
4032 | VS1_1: vx1 = memref1 VS1_2 - | |
4033 | VS1_2: vx2 = memref2 VS1_3 - | |
4034 | VS1_3: vx3 = memref3 - - | |
4035 | S1: x = load - VS1_0 | |
4036 | S2: z = x + 1 - - | |
8115817b | 4037 | |
89d67cca DN |
4038 | step2: vectorize stmt S2 (done here): |
4039 | To vectorize stmt S2 we first need to find the relevant vector | |
4040 | def for the first operand 'x'. This is, as usual, obtained from | |
4041 | the vector stmt recorded in the STMT_VINFO_VEC_STMT of the stmt | |
4042 | that defines 'x' (S1). This way we find the stmt VS1_0, and the | |
4043 | relevant vector def 'vx0'. Having found 'vx0' we can generate | |
4044 | the vector stmt VS2_0, and as usual, record it in the | |
4045 | STMT_VINFO_VEC_STMT of stmt S2. | |
4046 | When creating the second copy (VS2_1), we obtain the relevant vector | |
4047 | def from the vector stmt recorded in the STMT_VINFO_RELATED_STMT of | |
4048 | stmt VS1_0. This way we find the stmt VS1_1 and the relevant | |
4049 | vector def 'vx1'. Using 'vx1' we create stmt VS2_1 and record a | |
4050 | pointer to it in the STMT_VINFO_RELATED_STMT of the vector stmt VS2_0. | |
4051 | Similarly when creating stmts VS2_2 and VS2_3. This is the resulting | |
4052 | chain of stmts and pointers: | |
4053 | RELATED_STMT VEC_STMT | |
4054 | VS1_0: vx0 = memref0 VS1_1 - | |
4055 | VS1_1: vx1 = memref1 VS1_2 - | |
4056 | VS1_2: vx2 = memref2 VS1_3 - | |
4057 | VS1_3: vx3 = memref3 - - | |
4058 | S1: x = load - VS1_0 | |
4059 | VS2_0: vz0 = vx0 + v1 VS2_1 - | |
4060 | VS2_1: vz1 = vx1 + v1 VS2_2 - | |
4061 | VS2_2: vz2 = vx2 + v1 VS2_3 - | |
4062 | VS2_3: vz3 = vx3 + v1 - - | |
4063 | S2: z = x + 1 - VS2_0 */ | |
8115817b | 4064 | |
89d67cca DN |
4065 | prev_stmt_info = NULL; |
4066 | for (j = 0; j < ncopies; j++) | |
4067 | { | |
4068 | /* Handle uses. */ | |
4069 | if (j == 0) | |
4070 | { | |
805e2059 | 4071 | if (op_type == binary_op |
f8f8fee8 | 4072 | && (code == LSHIFT_EXPR || code == RSHIFT_EXPR)) |
89d67cca | 4073 | { |
805e2059 IR |
4074 | /* Vector shl and shr insn patterns can be defined with scalar |
4075 | operand 2 (shift operand). In this case, use constant or loop | |
4076 | invariant op1 directly, without extending it to vector mode | |
4077 | first. */ | |
4078 | optab_op2_mode = insn_data[icode].operand[2].mode; | |
4079 | if (!VECTOR_MODE_P (optab_op2_mode)) | |
4080 | { | |
4081 | if (vect_print_dump_info (REPORT_DETAILS)) | |
4082 | fprintf (vect_dump, "operand 1 using scalar mode."); | |
4083 | vec_oprnd1 = op1; | |
4084 | VEC_quick_push (tree, vec_oprnds1, vec_oprnd1); | |
50d76c24 IR |
4085 | if (slp_node) |
4086 | { | |
4087 | /* Store vec_oprnd1 for every vector stmt to be created | |
4088 | for SLP_NODE. We check during the analysis that all the | |
4089 | shift arguments are the same. | |
4090 | TODO: Allow different constants for different vector | |
4091 | stmts generated for an SLP instance. */ | |
4092 | for (k = 0; k < slp_node->vec_stmts_size - 1; k++) | |
4093 | VEC_quick_push (tree, vec_oprnds1, vec_oprnd1); | |
4094 | } | |
805e2059 | 4095 | } |
89d67cca | 4096 | } |
805e2059 | 4097 | |
50d76c24 | 4098 | /* vec_oprnd1 is available if operand 1 should be of a scalar-type |
f8f8fee8 IR |
4099 | (a special case for certain kinds of vector shifts); otherwise,
4100 | operand 1 should be of a vector type (the usual case). */ | |
805e2059 IR |
4101 | if (op_type == binary_op && !vec_oprnd1) |
4102 | vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1, | |
4103 | slp_node); | |
4104 | else | |
f8f8fee8 | 4105 | vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL, |
805e2059 | 4106 | slp_node); |
89d67cca DN |
4107 | } |
4108 | else | |
805e2059 | 4109 | vect_get_vec_defs_for_stmt_copy (dt, &vec_oprnds0, &vec_oprnds1); |
89d67cca | 4110 | |
805e2059 IR |
4111 | /* Arguments are ready. Create the new vector stmt. */ |
4112 | for (i = 0; VEC_iterate (tree, vec_oprnds0, i, vop0); i++) | |
4113 | { | |
4114 | if (op_type == binary_op) | |
4115 | { | |
4116 | vop1 = VEC_index (tree, vec_oprnds1, i); | |
4117 | new_stmt = build_gimple_modify_stmt (vec_dest, | |
4118 | build2 (code, vectype, vop0, vop1)); | |
4119 | } | |
4120 | else | |
4121 | new_stmt = build_gimple_modify_stmt (vec_dest, | |
4122 | build1 (code, vectype, vop0)); | |
8115817b | 4123 | |
805e2059 IR |
4124 | new_temp = make_ssa_name (vec_dest, new_stmt); |
4125 | GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp; | |
4126 | vect_finish_stmt_generation (stmt, new_stmt, bsi); | |
4127 | if (slp_node) | |
4128 | VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), new_stmt); | |
4129 | } | |
8115817b | 4130 | |
89d67cca DN |
4131 | if (j == 0) |
4132 | STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt; | |
4133 | else | |
4134 | STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt; | |
4135 | prev_stmt_info = vinfo_for_stmt (new_stmt); | |
4136 | } | |
4137 | ||
805e2059 IR |
4138 | VEC_free (tree, heap, vec_oprnds0); |
4139 | if (vec_oprnds1) | |
4140 | VEC_free (tree, heap, vec_oprnds1); | |
4141 | ||
89d67cca DN |
4142 | return true; |
4143 | } | |
4144 | ||
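/* Illustrative sketch, not part of the vectorizer sources: the
   STMT_VINFO_VEC_STMT / STMT_VINFO_RELATED_STMT chaining documented in
   vectorizable_operation above, modelled as a standalone C program with
   a made-up struct (vect_stmt_sketch and its fields are assumptions of
   the example, not vectorizer types).  */

#include <stdio.h>
#include <stddef.h>

struct vect_stmt_sketch
{
  const char *name;
  struct vect_stmt_sketch *related;   /* plays the role of RELATED_STMT */
};

int
main (void)
{
  /* The four copies VS2_0..VS2_3 from the VF=16, nunits=4 example above.  */
  struct vect_stmt_sketch vs2_3 = { "VS2_3", NULL };
  struct vect_stmt_sketch vs2_2 = { "VS2_2", &vs2_3 };
  struct vect_stmt_sketch vs2_1 = { "VS2_1", &vs2_2 };
  struct vect_stmt_sketch vs2_0 = { "VS2_0", &vs2_1 };
  struct vect_stmt_sketch *vs;

  /* A stmt that uses the defs of S2 starts at VS2_0 (recorded in the
     STMT_VINFO_VEC_STMT of S2) and follows the links copy by copy to
     find the vector def that matches each of its own copies.  */
  for (vs = &vs2_0; vs; vs = vs->related)
    printf ("%s\n", vs->name);

  return 0;
}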
4145 | ||
4146 | /* Function vectorizable_type_demotion | |
8115817b | 4147 | |
89d67cca DN |
4148 | Check if STMT performs a binary or unary operation that involves |
4149 | type demotion, and if it can be vectorized. | |
4150 | If VEC_STMT is also passed, vectorize the STMT: create a vectorized | |
4151 | stmt to replace it, put it in VEC_STMT, and insert it at BSI. | |
4152 | Return FALSE if not a vectorizable STMT, TRUE otherwise. */ | |
8115817b | 4153 | |
89d67cca DN |
4154 | bool |
4155 | vectorizable_type_demotion (tree stmt, block_stmt_iterator *bsi, | |
d9987fb4 | 4156 | tree *vec_stmt) |
89d67cca DN |
4157 | { |
4158 | tree vec_dest; | |
4159 | tree scalar_dest; | |
4160 | tree operation; | |
4161 | tree op0; | |
4162 | tree vec_oprnd0=NULL, vec_oprnd1=NULL; | |
4163 | stmt_vec_info stmt_info = vinfo_for_stmt (stmt); | |
4164 | loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); | |
d29de1bf | 4165 | struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
8ff43db0 | 4166 | enum tree_code code, code1 = ERROR_MARK; |
89d67cca DN |
4167 | tree new_temp; |
4168 | tree def, def_stmt; | |
3a70f3ef | 4169 | enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type}; |
89d67cca DN |
4170 | tree new_stmt; |
4171 | stmt_vec_info prev_stmt_info; | |
4172 | int nunits_in; | |
4173 | int nunits_out; | |
4174 | tree vectype_out; | |
4175 | int ncopies; | |
4176 | int j; | |
4177 | tree expr; | |
4178 | tree vectype_in; | |
8115817b | 4179 | |
89d67cca DN |
4180 | if (!STMT_VINFO_RELEVANT_P (stmt_info)) |
4181 | return false; | |
60555ced DN |
4182 | |
4183 | if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def) | |
4184 | return false; | |
4185 | ||
60555ced | 4186 | /* Is STMT a vectorizable type-demotion operation? */ |
07beea0d | 4187 | if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT) |
89d67cca | 4188 | return false; |
8115817b | 4189 | |
07beea0d | 4190 | if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME) |
89d67cca | 4191 | return false; |
8115817b | 4192 | |
07beea0d | 4193 | operation = GIMPLE_STMT_OPERAND (stmt, 1); |
89d67cca DN |
4194 | code = TREE_CODE (operation); |
4195 | if (code != NOP_EXPR && code != CONVERT_EXPR) | |
4196 | return false; | |
8115817b | 4197 | |
89d67cca DN |
4198 | op0 = TREE_OPERAND (operation, 0); |
4199 | vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0)); | |
4934454b DN |
4200 | if (!vectype_in) |
4201 | return false; | |
89d67cca | 4202 | nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in); |
8115817b | 4203 | |
07beea0d | 4204 | scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0); |
459e691a | 4205 | vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest)); |
4934454b DN |
4206 | if (!vectype_out) |
4207 | return false; | |
89d67cca DN |
4208 | nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out); |
4209 | if (nunits_in != nunits_out / 2) /* FORNOW */ | |
4210 | return false; | |
8115817b | 4211 | |
89d67cca DN |
4212 | ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_out; |
4213 | gcc_assert (ncopies >= 1); | |
d29de1bf DN |
4214 | /* FORNOW. This restriction should be relaxed. */ |
4215 | if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1) | |
4216 | { | |
4217 | if (vect_print_dump_info (REPORT_DETAILS)) | |
4218 | fprintf (vect_dump, "multiple types in nested loop."); | |
4219 | return false; | |
4220 | } | |
878aa817 | 4221 | |
8115817b UB |
4222 | if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest)) |
4223 | && INTEGRAL_TYPE_P (TREE_TYPE (op0))) | |
4224 | || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest)) | |
4225 | && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0)) | |
4226 | && (code == NOP_EXPR || code == CONVERT_EXPR)))) | |
878aa817 | 4227 | return false; |
8115817b | 4228 | |
89d67cca | 4229 | /* Check the operands of the operation. */ |
3a70f3ef | 4230 | if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0])) |
89d67cca DN |
4231 | { |
4232 | if (vect_print_dump_info (REPORT_DETAILS)) | |
4233 | fprintf (vect_dump, "use not simple."); | |
4234 | return false; | |
4235 | } | |
8115817b | 4236 | |
89d67cca | 4237 | /* Supportable by target? */ |
d9987fb4 | 4238 | if (!supportable_narrowing_operation (code, stmt, vectype_in, &code1)) |
89d67cca | 4239 | return false; |
8115817b | 4240 | |
89d67cca | 4241 | STMT_VINFO_VECTYPE (stmt_info) = vectype_in; |
8115817b | 4242 | |
89d67cca DN |
4243 | if (!vec_stmt) /* transformation not required. */ |
4244 | { | |
4245 | STMT_VINFO_TYPE (stmt_info) = type_demotion_vec_info_type; | |
792ed98b HJ |
4246 | if (vect_print_dump_info (REPORT_DETAILS)) |
4247 | fprintf (vect_dump, "=== vectorizable_demotion ==="); | |
805e2059 | 4248 | vect_model_simple_cost (stmt_info, ncopies, dt, NULL); |
89d67cca DN |
4249 | return true; |
4250 | } | |
8115817b | 4251 | |
89d67cca | 4252 | /** Transform. **/ |
89d67cca DN |
4253 | if (vect_print_dump_info (REPORT_DETAILS)) |
4254 | fprintf (vect_dump, "transform type demotion operation. ncopies = %d.", | |
8115817b UB |
4255 | ncopies); |
4256 | ||
89d67cca DN |
4257 | /* Handle def. */ |
4258 | vec_dest = vect_create_destination_var (scalar_dest, vectype_out); | |
4259 | ||
4260 | /* In case the vectorization factor (VF) is bigger than the number | |
4261 | of elements that we can fit in a vectype (nunits), we have to generate | |
4262 | more than one vector stmt - i.e - we need to "unroll" the | |
4263 | vector stmt by a factor VF/nunits. */ | |
4264 | prev_stmt_info = NULL; | |
4265 | for (j = 0; j < ncopies; j++) | |
4266 | { | |
4267 | /* Handle uses. */ | |
4268 | if (j == 0) | |
4269 | { | |
89d67cca | 4270 | vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL); |
3a70f3ef | 4271 | vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0); |
89d67cca DN |
4272 | } |
4273 | else | |
4274 | { | |
3a70f3ef DN |
4275 | vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd1); |
4276 | vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0); | |
89d67cca | 4277 | } |
8115817b | 4278 | |
89d67cca | 4279 | /* Arguments are ready. Create the new vector stmt. */ |
d9987fb4 | 4280 | expr = build2 (code1, vectype_out, vec_oprnd0, vec_oprnd1); |
ebb07520 | 4281 | new_stmt = build_gimple_modify_stmt (vec_dest, expr); |
89d67cca | 4282 | new_temp = make_ssa_name (vec_dest, new_stmt); |
07beea0d | 4283 | GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp; |
89d67cca | 4284 | vect_finish_stmt_generation (stmt, new_stmt, bsi); |
8115817b | 4285 | |
89d67cca DN |
4286 | if (j == 0) |
4287 | STMT_VINFO_VEC_STMT (stmt_info) = new_stmt; | |
4288 | else | |
4289 | STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt; | |
8115817b | 4290 | |
89d67cca DN |
4291 | prev_stmt_info = vinfo_for_stmt (new_stmt); |
4292 | } | |
8115817b | 4293 | |
89d67cca DN |
4294 | *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info); |
4295 | return true; | |
4296 | } | |
4297 | ||
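/* Illustrative sketch, not part of the vectorizer sources: type demotion
   packs two input vectors into one narrower output vector, which is why
   every copy above consumes the pair vec_oprnd0/vec_oprnd1.  Plain arrays
   stand in for vectors, and the element order ignores target-specific
   lane conventions.  */

#include <stdio.h>

int
main (void)
{
  int in0[4] = { 0, 1, 2, 3 }, in1[4] = { 4, 5, 6, 7 };
  short out[8];
  int i;

  for (i = 0; i < 4; i++)
    {
      out[i] = (short) in0[i];        /* narrowed elements of operand 0 */
      out[i + 4] = (short) in1[i];    /* narrowed elements of operand 1 */
    }

  for (i = 0; i < 8; i++)
    printf ("%d ", out[i]);           /* prints 0 1 2 3 4 5 6 7 */
  printf ("\n");

  return 0;
}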
4298 | ||
89d67cca DN |
4299 | /* Function vectorizable_type_promotion |
4300 | ||
4301 | Check if STMT performs a binary or unary operation that involves | |
4302 | type promotion, and if it can be vectorized. | |
4303 | If VEC_STMT is also passed, vectorize the STMT: create a vectorized | |
4304 | stmt to replace it, put it in VEC_STMT, and insert it at BSI. | |
4305 | Return FALSE if not a vectorizable STMT, TRUE otherwise. */ | |
4306 | ||
4307 | bool | |
4308 | vectorizable_type_promotion (tree stmt, block_stmt_iterator *bsi, | |
4309 | tree *vec_stmt) | |
4310 | { | |
4311 | tree vec_dest; | |
4312 | tree scalar_dest; | |
4313 | tree operation; | |
4314 | tree op0, op1 = NULL; | |
4315 | tree vec_oprnd0=NULL, vec_oprnd1=NULL; | |
4316 | stmt_vec_info stmt_info = vinfo_for_stmt (stmt); | |
4317 | loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); | |
d29de1bf | 4318 | struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
8ff43db0 | 4319 | enum tree_code code, code1 = ERROR_MARK, code2 = ERROR_MARK; |
89d67cca DN |
4320 | tree decl1 = NULL_TREE, decl2 = NULL_TREE; |
4321 | int op_type; | |
4322 | tree def, def_stmt; | |
3a70f3ef | 4323 | enum vect_def_type dt[2] = {vect_unknown_def_type, vect_unknown_def_type}; |
89d67cca DN |
4324 | tree new_stmt; |
4325 | stmt_vec_info prev_stmt_info; | |
4326 | int nunits_in; | |
4327 | int nunits_out; | |
4328 | tree vectype_out; | |
4329 | int ncopies; | |
4330 | int j; | |
4331 | tree vectype_in; | |
4332 | ||
89d67cca DN |
4333 | if (!STMT_VINFO_RELEVANT_P (stmt_info)) |
4334 | return false; | |
4335 | ||
60555ced DN |
4336 | if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def) |
4337 | return false; | |
89d67cca | 4338 | |
60555ced | 4339 | /* Is STMT a vectorizable type-promotion operation? */ |
07beea0d | 4340 | if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT) |
89d67cca DN |
4341 | return false; |
4342 | ||
07beea0d | 4343 | if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME) |
89d67cca DN |
4344 | return false; |
4345 | ||
07beea0d | 4346 | operation = GIMPLE_STMT_OPERAND (stmt, 1); |
89d67cca | 4347 | code = TREE_CODE (operation); |
d9987fb4 UB |
4348 | if (code != NOP_EXPR && code != CONVERT_EXPR |
4349 | && code != WIDEN_MULT_EXPR) | |
89d67cca DN |
4350 | return false; |
4351 | ||
f7064d11 | 4352 | op0 = TREE_OPERAND (operation, 0); |
89d67cca | 4353 | vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op0)); |
4934454b DN |
4354 | if (!vectype_in) |
4355 | return false; | |
89d67cca | 4356 | nunits_in = TYPE_VECTOR_SUBPARTS (vectype_in); |
89d67cca | 4357 | |
07beea0d | 4358 | scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0); |
89d67cca | 4359 | vectype_out = get_vectype_for_scalar_type (TREE_TYPE (scalar_dest)); |
4934454b DN |
4360 | if (!vectype_out) |
4361 | return false; | |
89d67cca DN |
4362 | nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out); |
4363 | if (nunits_out != nunits_in / 2) /* FORNOW */ | |
4364 | return false; | |
4365 | ||
459e691a UB |
4366 | ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits_in; |
4367 | gcc_assert (ncopies >= 1); | |
d29de1bf DN |
4368 | /* FORNOW. This restriction should be relaxed. */ |
4369 | if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1) | |
4370 | { | |
4371 | if (vect_print_dump_info (REPORT_DETAILS)) | |
4372 | fprintf (vect_dump, "multiple types in nested loop."); | |
4373 | return false; | |
4374 | } | |
459e691a | 4375 | |
8115817b UB |
4376 | if (! ((INTEGRAL_TYPE_P (TREE_TYPE (scalar_dest)) |
4377 | && INTEGRAL_TYPE_P (TREE_TYPE (op0))) | |
4378 | || (SCALAR_FLOAT_TYPE_P (TREE_TYPE (scalar_dest)) | |
4379 | && SCALAR_FLOAT_TYPE_P (TREE_TYPE (op0)) | |
4380 | && (code == CONVERT_EXPR || code == NOP_EXPR)))) | |
878aa817 DN |
4381 | return false; |
4382 | ||
89d67cca | 4383 | /* Check the operands of the operation. */ |
3a70f3ef | 4384 | if (!vect_is_simple_use (op0, loop_vinfo, &def_stmt, &def, &dt[0])) |
89d67cca DN |
4385 | { |
4386 | if (vect_print_dump_info (REPORT_DETAILS)) | |
4387 | fprintf (vect_dump, "use not simple."); | |
4388 | return false; | |
4389 | } | |
f7064d11 | 4390 | |
89d67cca | 4391 | op_type = TREE_CODE_LENGTH (code); |
f7064d11 DN |
4392 | if (op_type == binary_op) |
4393 | { | |
4394 | op1 = TREE_OPERAND (operation, 1); | |
3a70f3ef | 4395 | if (!vect_is_simple_use (op1, loop_vinfo, &def_stmt, &def, &dt[1])) |
89d67cca DN |
4396 | { |
4397 | if (vect_print_dump_info (REPORT_DETAILS)) | |
4398 | fprintf (vect_dump, "use not simple."); | |
4399 | return false; | |
4400 | } | |
4401 | } | |
b2d16a23 | 4402 | |
89d67cca DN |
4403 | /* Supportable by target? */ |
4404 | if (!supportable_widening_operation (code, stmt, vectype_in, | |
4405 | &decl1, &decl2, &code1, &code2)) | |
4406 | return false; | |
b2d16a23 | 4407 | |
89d67cca | 4408 | STMT_VINFO_VECTYPE (stmt_info) = vectype_in; |
b2d16a23 | 4409 | |
89d67cca DN |
4410 | if (!vec_stmt) /* transformation not required. */ |
4411 | { | |
4412 | STMT_VINFO_TYPE (stmt_info) = type_promotion_vec_info_type; | |
792ed98b HJ |
4413 | if (vect_print_dump_info (REPORT_DETAILS)) |
4414 | fprintf (vect_dump, "=== vectorizable_promotion ==="); | |
805e2059 | 4415 | vect_model_simple_cost (stmt_info, 2*ncopies, dt, NULL); |
89d67cca | 4416 | return true; |
f7064d11 DN |
4417 | } |
4418 | ||
89d67cca | 4419 | /** Transform. **/ |
f7064d11 | 4420 | |
89d67cca DN |
4421 | if (vect_print_dump_info (REPORT_DETAILS)) |
4422 | fprintf (vect_dump, "transform type promotion operation. ncopies = %d.", | |
4423 | ncopies); | |
4424 | ||
4425 | /* Handle def. */ | |
4426 | vec_dest = vect_create_destination_var (scalar_dest, vectype_out); | |
4427 | ||
4428 | /* In case the vectorization factor (VF) is bigger than the number | |
4429 | of elements that we can fit in a vectype (nunits), we have to generate | |
4430 | more than one vector stmt - i.e - we need to "unroll" the | |
4431 | vector stmt by a factor VF/nunits. */ | |
f7064d11 | 4432 | |
89d67cca DN |
4433 | prev_stmt_info = NULL; |
4434 | for (j = 0; j < ncopies; j++) | |
4435 | { | |
4436 | /* Handle uses. */ | |
4437 | if (j == 0) | |
4438 | { | |
4439 | vec_oprnd0 = vect_get_vec_def_for_operand (op0, stmt, NULL); | |
4440 | if (op_type == binary_op) | |
4441 | vec_oprnd1 = vect_get_vec_def_for_operand (op1, stmt, NULL); | |
4442 | } | |
4443 | else | |
4444 | { | |
3a70f3ef | 4445 | vec_oprnd0 = vect_get_vec_def_for_stmt_copy (dt[0], vec_oprnd0); |
89d67cca | 4446 | if (op_type == binary_op) |
3a70f3ef | 4447 | vec_oprnd1 = vect_get_vec_def_for_stmt_copy (dt[1], vec_oprnd1); |
89d67cca DN |
4448 | } |
4449 | ||
4450 | /* Arguments are ready. Create the new vector stmt. We are creating | |
4451 | two vector defs because the widened result does not fit in one vector. | |
4452 | The vectorized stmt can be expressed as a call to a target builtin,
4453 | or using a tree-code. */
4454 | /* Generate first half of the widened result: */ | |
4455 | new_stmt = vect_gen_widened_results_half (code1, vectype_out, decl1, | |
4456 | vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt); | |
4457 | if (j == 0) | |
4458 | STMT_VINFO_VEC_STMT (stmt_info) = new_stmt; | |
4459 | else | |
4460 | STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt; | |
4461 | prev_stmt_info = vinfo_for_stmt (new_stmt); | |
4462 | ||
4463 | /* Generate second half of the widened result: */ | |
4464 | new_stmt = vect_gen_widened_results_half (code2, vectype_out, decl2, | |
4465 | vec_oprnd0, vec_oprnd1, op_type, vec_dest, bsi, stmt); | |
4466 | STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt; | |
4467 | prev_stmt_info = vinfo_for_stmt (new_stmt); | |
4468 | ||
4469 | } | |
4470 | ||
4471 | *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info); | |
f7064d11 DN |
4472 | return true; |
4473 | } | |
4474 | ||
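/* Illustrative sketch, not part of the vectorizer sources: type promotion
   widens one input vector into two output vectors (the code1/code2 halves
   generated for every copy above), because the widened elements no longer
   fit in a single vector.  Plain arrays stand in for vectors; which half
   is produced by which tree-code depends on the target and on endianness.  */

#include <stdio.h>

int
main (void)
{
  short in[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
  int out1[4], out2[4];
  int i;

  for (i = 0; i < 4; i++)
    {
      out1[i] = (int) in[i];          /* first half of the widened result */
      out2[i] = (int) in[i + 4];      /* second half of the widened result */
    }

  for (i = 0; i < 4; i++)
    printf ("%d ", out1[i]);
  for (i = 0; i < 4; i++)
    printf ("%d ", out2[i]);
  printf ("\n");                      /* prints 0 1 2 3 4 5 6 7 */

  return 0;
}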
4475 | ||
98b44b0e IR |
4476 | /* Function vect_strided_store_supported. |
4477 | ||
4478 | Returns TRUE if INTERLEAVE_HIGH and INTERLEAVE_LOW operations are supported,
4479 | and FALSE otherwise. */ | |
4480 | ||
4481 | static bool | |
4482 | vect_strided_store_supported (tree vectype) | |
4483 | { | |
4484 | optab interleave_high_optab, interleave_low_optab; | |
4485 | int mode; | |
4486 | ||
4487 | mode = (int) TYPE_MODE (vectype); | |
4488 | ||
4489 | /* Check that the operation is supported. */ | |
4490 | interleave_high_optab = optab_for_tree_code (VEC_INTERLEAVE_HIGH_EXPR, | |
4491 | vectype); | |
4492 | interleave_low_optab = optab_for_tree_code (VEC_INTERLEAVE_LOW_EXPR, | |
4493 | vectype); | |
4494 | if (!interleave_high_optab || !interleave_low_optab) | |
4495 | { | |
4496 | if (vect_print_dump_info (REPORT_DETAILS)) | |
4497 | fprintf (vect_dump, "no optab for interleave."); | |
4498 | return false; | |
4499 | } | |
4500 | ||
166cdb08 | 4501 | if (optab_handler (interleave_high_optab, mode)->insn_code |
98b44b0e | 4502 | == CODE_FOR_nothing |
166cdb08 | 4503 | || optab_handler (interleave_low_optab, mode)->insn_code |
98b44b0e IR |
4504 | == CODE_FOR_nothing) |
4505 | { | |
4506 | if (vect_print_dump_info (REPORT_DETAILS)) | |
4507 | fprintf (vect_dump, "interleave op not supported by target."); | |
4508 | return false; | |
4509 | } | |
805e2059 | 4510 | |
98b44b0e IR |
4511 | return true; |
4512 | } | |
4513 | ||
4514 | ||
4515 | /* Function vect_permute_store_chain. | |
4516 | ||
2f8e468b | 4517 | Given a chain of interleaved stores in DR_CHAIN of LENGTH that must be |
98b44b0e IR |
4518 | a power of 2, generate interleave_high/low stmts to reorder the data |
4519 | correctly for the stores. Return the final references for stores in | |
4520 | RESULT_CHAIN. | |
4521 | ||
4522 | E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8. | |
2f8e468b | 4523 | The input is 4 vectors each containing 8 elements. We assign a number to each |
98b44b0e IR |
4524 | element; the input sequence is:
4525 | ||
4526 | 1st vec: 0 1 2 3 4 5 6 7 | |
4527 | 2nd vec: 8 9 10 11 12 13 14 15 | |
4528 | 3rd vec: 16 17 18 19 20 21 22 23 | |
4529 | 4th vec: 24 25 26 27 28 29 30 31 | |
4530 | ||
4531 | The output sequence should be: | |
4532 | ||
4533 | 1st vec: 0 8 16 24 1 9 17 25 | |
4534 | 2nd vec: 2 10 18 26 3 11 19 27 | |
4535 | 3rd vec: 4 12 20 28 5 13 21 29
4536 | 4th vec: 6 14 22 30 7 15 23 31 | |
4537 | ||
4538 | i.e., we interleave the contents of the four vectors in their order. | |
4539 | ||
4540 | We use interleave_high/low instructions to create such output. The input of | |
4541 | each interleave_high/low operation is two vectors: | |
4542 | 1st vec 2nd vec | |
4543 | 0 1 2 3 4 5 6 7 | |
4544 | the even elements of the result vector are obtained left-to-right from the | |
4545 | high/low elements of the first vector. The odd elements of the result are | |
4546 | obtained left-to-right from the high/low elements of the second vector. | |
4547 | The output of interleave_high will be: 0 4 1 5 | |
4548 | and of interleave_low: 2 6 3 7 | |
4549 | ||
4550 | ||
2f8e468b | 4551 | The permutation is done in log2 (LENGTH) stages. In each stage interleave_high
98b44b0e IR |
4552 | and interleave_low stmts are created for each pair of vectors in DR_CHAIN, |
4553 | where the first argument is taken from the first half of DR_CHAIN and the | |
4554 | second argument from its second half.
4555 | In our example, | |
4556 | ||
4557 | I1: interleave_high (1st vec, 3rd vec) | |
4558 | I2: interleave_low (1st vec, 3rd vec) | |
4559 | I3: interleave_high (2nd vec, 4th vec) | |
4560 | I4: interleave_low (2nd vec, 4th vec) | |
4561 | ||
4562 | The output for the first stage is: | |
4563 | ||
4564 | I1: 0 16 1 17 2 18 3 19 | |
4565 | I2: 4 20 5 21 6 22 7 23 | |
4566 | I3: 8 24 9 25 10 26 11 27 | |
4567 | I4: 12 28 13 29 14 30 15 31 | |
4568 | ||
4569 | The output of the second stage, i.e. the final result is: | |
4570 | ||
4571 | I1: 0 8 16 24 1 9 17 25 | |
4572 | I2: 2 10 18 26 3 11 19 27 | |
4573 | I3: 4 12 20 28 5 13 21 29
4574 | I4: 6 14 22 30 7 15 23 31. */ | |
4575 | ||
4576 | static bool | |
4577 | vect_permute_store_chain (VEC(tree,heap) *dr_chain, | |
4578 | unsigned int length, | |
4579 | tree stmt, | |
4580 | block_stmt_iterator *bsi, | |
4581 | VEC(tree,heap) **result_chain) | |
4582 | { | |
4583 | tree perm_dest, perm_stmt, vect1, vect2, high, low; | |
4584 | tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); | |
ebb07520 | 4585 | tree scalar_dest, tmp; |
98b44b0e IR |
4586 | int i; |
4587 | unsigned int j; | |
4588 | VEC(tree,heap) *first, *second; | |
4589 | ||
ce133c3f | 4590 | scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0); |
98b44b0e IR |
4591 | first = VEC_alloc (tree, heap, length/2); |
4592 | second = VEC_alloc (tree, heap, length/2); | |
4593 | ||
4594 | /* Check that the operation is supported. */ | |
4595 | if (!vect_strided_store_supported (vectype)) | |
4596 | return false; | |
4597 | ||
4598 | *result_chain = VEC_copy (tree, heap, dr_chain); | |
4599 | ||
4600 | for (i = 0; i < exact_log2 (length); i++) | |
4601 | { | |
4602 | for (j = 0; j < length/2; j++) | |
4603 | { | |
4604 | vect1 = VEC_index (tree, dr_chain, j); | |
4605 | vect2 = VEC_index (tree, dr_chain, j+length/2); | |
4606 | ||
a3895f55 IR |
4607 | /* Create interleaving stmt: |
4608 | in the case of big endian: | |
4609 | high = interleave_high (vect1, vect2) | |
4610 | and in the case of little endian: | |
4611 | high = interleave_low (vect1, vect2). */ | |
98b44b0e | 4612 | perm_dest = create_tmp_var (vectype, "vect_inter_high"); |
0890b981 | 4613 | DECL_GIMPLE_REG_P (perm_dest) = 1; |
98b44b0e | 4614 | add_referenced_var (perm_dest); |
a3895f55 | 4615 | if (BYTES_BIG_ENDIAN) |
ebb07520 | 4616 | tmp = build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype, vect1, vect2); |
a3895f55 | 4617 | else |
ebb07520 RS |
4618 | tmp = build2 (VEC_INTERLEAVE_LOW_EXPR, vectype, vect1, vect2); |
4619 | perm_stmt = build_gimple_modify_stmt (perm_dest, tmp); | |
98b44b0e | 4620 | high = make_ssa_name (perm_dest, perm_stmt); |
07beea0d | 4621 | GIMPLE_STMT_OPERAND (perm_stmt, 0) = high; |
98b44b0e IR |
4622 | vect_finish_stmt_generation (stmt, perm_stmt, bsi); |
4623 | VEC_replace (tree, *result_chain, 2*j, high); | |
4624 | ||
a3895f55 IR |
4625 | /* Create interleaving stmt: |
4626 | in the case of big endian: | |
4627 | low = interleave_low (vect1, vect2) | |
4628 | and in the case of little endian: | |
4629 | low = interleave_high (vect1, vect2). */ | |
98b44b0e | 4630 | perm_dest = create_tmp_var (vectype, "vect_inter_low"); |
0890b981 | 4631 | DECL_GIMPLE_REG_P (perm_dest) = 1; |
98b44b0e | 4632 | add_referenced_var (perm_dest); |
a3895f55 | 4633 | if (BYTES_BIG_ENDIAN) |
ebb07520 | 4634 | tmp = build2 (VEC_INTERLEAVE_LOW_EXPR, vectype, vect1, vect2); |
a3895f55 | 4635 | else |
ebb07520 RS |
4636 | tmp = build2 (VEC_INTERLEAVE_HIGH_EXPR, vectype, vect1, vect2); |
4637 | perm_stmt = build_gimple_modify_stmt (perm_dest, tmp); | |
98b44b0e | 4638 | low = make_ssa_name (perm_dest, perm_stmt); |
07beea0d | 4639 | GIMPLE_STMT_OPERAND (perm_stmt, 0) = low; |
98b44b0e IR |
4640 | vect_finish_stmt_generation (stmt, perm_stmt, bsi); |
4641 | VEC_replace (tree, *result_chain, 2*j+1, low); | |
4642 | } | |
4643 | dr_chain = VEC_copy (tree, heap, *result_chain); | |
4644 | } | |
4645 | return true; | |
4646 | } | |
4647 | ||
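/* Illustrative sketch, not part of the vectorizer sources: the two-stage
   interleave documented in vect_permute_store_chain above, replayed as a
   standalone C program for LENGTH = 4 vectors of 8 elements.  The "high"
   half is taken to be the first half of a vector, matching the
   left-to-right notation used in the comment; actual targets may differ.  */

#include <stdio.h>
#include <string.h>

#define NVEC 4
#define NELT 8

static void
interleave (const int *a, const int *b, int *high, int *low)
{
  int k;

  for (k = 0; k < NELT / 2; k++)
    {
      high[2 * k] = a[k];                 /* interleave_high: high halves */
      high[2 * k + 1] = b[k];
      low[2 * k] = a[k + NELT / 2];       /* interleave_low: low halves */
      low[2 * k + 1] = b[k + NELT / 2];
    }
}

int
main (void)
{
  int chain[NVEC][NELT], result[NVEC][NELT];
  int i, j, stage;

  for (i = 0; i < NVEC; i++)
    for (j = 0; j < NELT; j++)
      chain[i][j] = i * NELT + j;         /* 1st vec: 0..7, 2nd vec: 8..15, ... */

  for (stage = 0; stage < 2; stage++)     /* log2 (LENGTH) stages */
    {
      for (j = 0; j < NVEC / 2; j++)
        interleave (chain[j], chain[j + NVEC / 2],
                    result[2 * j], result[2 * j + 1]);
      memcpy (chain, result, sizeof (chain));
    }

  for (i = 0; i < NVEC; i++)
    {
      for (j = 0; j < NELT; j++)
        printf ("%d ", chain[i][j]);
      printf ("\n");                      /* 0 8 16 24 1 9 17 25, etc. */
    }

  return 0;
}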
4648 | ||
f7064d11 DN |
4649 | /* Function vectorizable_store. |
4650 | ||
4651 | Check if STMT defines a non scalar data-ref (array/pointer/structure) that | |
4652 | can be vectorized. | |
4653 | If VEC_STMT is also passed, vectorize the STMT: create a vectorized | |
4654 | stmt to replace it, put it in VEC_STMT, and insert it at BSI. | |
4655 | Return FALSE if not a vectorizable STMT, TRUE otherwise. */ | |
4656 | ||
4657 | bool | |
805e2059 IR |
4658 | vectorizable_store (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt, |
4659 | slp_tree slp_node) | |
f7064d11 DN |
4660 | { |
4661 | tree scalar_dest; | |
4662 | tree data_ref; | |
4663 | tree op; | |
89d67cca | 4664 | tree vec_oprnd = NULL_TREE; |
f7064d11 | 4665 | stmt_vec_info stmt_info = vinfo_for_stmt (stmt); |
98b44b0e | 4666 | struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr = NULL; |
f7064d11 DN |
4667 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); |
4668 | loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); | |
d29de1bf | 4669 | struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
f7064d11 DN |
4670 | enum machine_mode vec_mode; |
4671 | tree dummy; | |
468c2ac0 | 4672 | enum dr_alignment_support alignment_support_scheme; |
88088c03 DN |
4673 | tree def, def_stmt; |
4674 | enum vect_def_type dt; | |
98b44b0e | 4675 | stmt_vec_info prev_stmt_info = NULL; |
89d67cca DN |
4676 | tree dataref_ptr = NULL_TREE; |
4677 | int nunits = TYPE_VECTOR_SUBPARTS (vectype); | |
4678 | int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits; | |
4679 | int j; | |
0bf2cf89 | 4680 | tree next_stmt, first_stmt = NULL_TREE; |
98b44b0e IR |
4681 | bool strided_store = false; |
4682 | unsigned int group_size, i; | |
4683 | VEC(tree,heap) *dr_chain = NULL, *oprnds = NULL, *result_chain = NULL; | |
468c2ac0 | 4684 | bool inv_p; |
805e2059 IR |
4685 | VEC(tree,heap) *vec_oprnds = NULL; |
4686 | bool slp = (slp_node != NULL); | |
4687 | stmt_vec_info first_stmt_vinfo; | |
4688 | unsigned int vec_num; | |
4689 | ||
4690 | /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies | |
4691 | this, so we can safely override NCOPIES with 1 here. */ | |
4692 | if (slp) | |
4693 | ncopies = 1; | |
468c2ac0 | 4694 | |
89d67cca | 4695 | gcc_assert (ncopies >= 1); |
468c2ac0 | 4696 | |
d29de1bf DN |
4697 | /* FORNOW. This restriction should be relaxed. */ |
4698 | if (nested_in_vect_loop_p (loop, stmt) && ncopies > 1) | |
4699 | { | |
4700 | if (vect_print_dump_info (REPORT_DETAILS)) | |
4701 | fprintf (vect_dump, "multiple types in nested loop."); | |
4702 | return false; | |
4703 | } | |
f7064d11 | 4704 | |
60555ced DN |
4705 | if (!STMT_VINFO_RELEVANT_P (stmt_info)) |
4706 | return false; | |
4707 | ||
4708 | if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def) | |
4709 | return false; | |
4710 | ||
f7064d11 DN |
4711 | /* Is vectorizable store? */ |
4712 | ||
07beea0d | 4713 | if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT) |
f7064d11 DN |
4714 | return false; |
4715 | ||
07beea0d | 4716 | scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0); |
f7064d11 | 4717 | if (TREE_CODE (scalar_dest) != ARRAY_REF |
98b44b0e | 4718 | && TREE_CODE (scalar_dest) != INDIRECT_REF |
805e2059 | 4719 | && !STMT_VINFO_STRIDED_ACCESS (stmt_info)) |
f7064d11 DN |
4720 | return false; |
4721 | ||
07beea0d | 4722 | op = GIMPLE_STMT_OPERAND (stmt, 1); |
88088c03 | 4723 | if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt)) |
f7064d11 | 4724 | { |
00518cb1 | 4725 | if (vect_print_dump_info (REPORT_DETAILS)) |
f7064d11 DN |
4726 | fprintf (vect_dump, "use not simple."); |
4727 | return false; | |
4728 | } | |
4729 | ||
4730 | vec_mode = TYPE_MODE (vectype); | |
4731 | /* FORNOW. In some cases can vectorize even if data-type not supported | |
4732 | (e.g. - array initialization with 0). */ | |
166cdb08 | 4733 | if (optab_handler (mov_optab, (int)vec_mode)->insn_code == CODE_FOR_nothing) |
f7064d11 DN |
4734 | return false; |
4735 | ||
4736 | if (!STMT_VINFO_DATA_REF (stmt_info)) | |
4737 | return false; | |
4738 | ||
805e2059 | 4739 | if (STMT_VINFO_STRIDED_ACCESS (stmt_info)) |
98b44b0e IR |
4740 | { |
4741 | strided_store = true; | |
0bf2cf89 | 4742 | first_stmt = DR_GROUP_FIRST_DR (stmt_info); |
805e2059 IR |
4743 | if (!vect_strided_store_supported (vectype) |
4744 | && !PURE_SLP_STMT (stmt_info) && !slp) | |
0bf2cf89 IR |
4745 | return false; |
4746 | ||
4747 | if (first_stmt == stmt) | |
4748 | { | |
4749 | /* STMT is the leader of the group. Check the operands of all the | |
4750 | stmts of the group. */ | |
4751 | next_stmt = DR_GROUP_NEXT_DR (stmt_info); | |
4752 | while (next_stmt) | |
4753 | { | |
4754 | op = GIMPLE_STMT_OPERAND (next_stmt, 1); | |
4755 | if (!vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt)) | |
4756 | { | |
4757 | if (vect_print_dump_info (REPORT_DETAILS)) | |
4758 | fprintf (vect_dump, "use not simple."); | |
4759 | return false; | |
4760 | } | |
4761 | next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt)); | |
4762 | } | |
4763 | } | |
98b44b0e | 4764 | } |
f7064d11 DN |
4765 | |
4766 | if (!vec_stmt) /* transformation not required. */ | |
4767 | { | |
4768 | STMT_VINFO_TYPE (stmt_info) = store_vec_info_type; | |
805e2059 IR |
4769 | if (!PURE_SLP_STMT (stmt_info)) |
4770 | vect_model_store_cost (stmt_info, ncopies, dt, NULL); | |
f7064d11 DN |
4771 | return true; |
4772 | } | |
4773 | ||
4774 | /** Transform. **/ | |
4775 | ||
98b44b0e IR |
4776 | if (strided_store) |
4777 | { | |
98b44b0e IR |
4778 | first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)); |
4779 | group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt)); | |
4780 | ||
4781 | DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt))++; | |
4782 | ||
468c2ac0 DN |
4783 | /* FORNOW */ |
4784 | gcc_assert (!nested_in_vect_loop_p (loop, stmt)); | |
4785 | ||
98b44b0e IR |
4786 | /* We vectorize all the stmts of the interleaving group when we |
4787 | reach the last stmt in the group. */ | |
4788 | if (DR_GROUP_STORE_COUNT (vinfo_for_stmt (first_stmt)) | |
805e2059 IR |
4789 | < DR_GROUP_SIZE (vinfo_for_stmt (first_stmt)) |
4790 | && !slp) | |
98b44b0e IR |
4791 | { |
4792 | *vec_stmt = NULL_TREE; | |
4793 | return true; | |
4794 | } | |
805e2059 IR |
4795 | |
4796 | if (slp) | |
4797 | strided_store = false; | |
4798 | ||
4799 | /* VEC_NUM is the number of vect stmts to be created for this group. */ | |
4800 | if (slp && SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node) < group_size) | |
4801 | vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); | |
4802 | else | |
4803 | vec_num = group_size; | |
98b44b0e IR |
4804 | } |
4805 | else | |
4806 | { | |
4807 | first_stmt = stmt; | |
4808 | first_dr = dr; | |
805e2059 IR |
4809 | group_size = vec_num = 1; |
4810 | first_stmt_vinfo = stmt_info; | |
98b44b0e IR |
4811 | } |
4812 | ||
792ed98b HJ |
4813 | if (vect_print_dump_info (REPORT_DETAILS)) |
4814 | fprintf (vect_dump, "transform store. ncopies = %d",ncopies); | |
4815 | ||
98b44b0e IR |
4816 | dr_chain = VEC_alloc (tree, heap, group_size); |
4817 | oprnds = VEC_alloc (tree, heap, group_size); | |
4818 | ||
468c2ac0 DN |
4819 | alignment_support_scheme = vect_supportable_dr_alignment (first_dr); |
4820 | gcc_assert (alignment_support_scheme); | |
4821 | gcc_assert (alignment_support_scheme == dr_aligned); /* FORNOW */ | |
f7064d11 | 4822 | |
89d67cca DN |
4823 | /* In case the vectorization factor (VF) is bigger than the number |
4824 | of elements that we can fit in a vectype (nunits), we have to generate | |
4825 | more than one vector stmt - i.e - we need to "unroll" the | |
4826 | vector stmt by a factor VF/nunits. For more details see documentation in | |
4827 | vect_get_vec_def_for_copy_stmt. */ | |
f7064d11 | 4828 | |
98b44b0e IR |
4829 | /* In case of interleaving (non-unit strided access): |
4830 | ||
4831 | S1: &base + 2 = x2 | |
4832 | S2: &base = x0 | |
4833 | S3: &base + 1 = x1 | |
4834 | S4: &base + 3 = x3 | |
4835 | ||
878aa817 | 4836 | We create vectorized stores starting from the base address (the access of the
98b44b0e IR |
4837 | first stmt in the chain (S2 in the above example)), when the last store stmt
4838 | of the chain (S4) is reached: | |
4839 | ||
4840 | VS1: &base = vx2 | |
4841 | VS2: &base + vec_size*1 = vx0 | |
4842 | VS3: &base + vec_size*2 = vx1 | |
4843 | VS4: &base + vec_size*3 = vx3 | |
4844 | ||
4845 | Then permutation statements are generated: | |
4846 | ||
4847 | VS5: vx5 = VEC_INTERLEAVE_HIGH_EXPR < vx0, vx3 > | |
4848 | VS6: vx6 = VEC_INTERLEAVE_LOW_EXPR < vx0, vx3 > | |
4849 | ... | |
4850 | ||
4851 | And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts | |
4852 | (the order of the data-refs in the output of vect_permute_store_chain | |
4853 | corresponds to the order of scalar stmts in the interleaving chain - see | |
2f8e468b | 4854 | the documentation of vect_permute_store_chain()). |
98b44b0e IR |
4855 | |
4856 | In case of both multiple types and interleaving, above vector stores and | |
4857 | permutation stmts are created for every copy. The result vector stmts are | |
4858 | put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding | |
4859 | STMT_VINFO_RELATED_STMT for the next copies. | |
4860 | */ | |
4861 | ||
89d67cca DN |
4862 | prev_stmt_info = NULL; |
4863 | for (j = 0; j < ncopies; j++) | |
4864 | { | |
4865 | tree new_stmt; | |
4866 | tree ptr_incr; | |
f7064d11 | 4867 | |
89d67cca DN |
4868 | if (j == 0) |
4869 | { | |
805e2059 IR |
4870 | if (slp) |
4871 | { | |
4872 | /* Get vectorized arguments for SLP_NODE. */ | |
4873 | vect_get_slp_defs (slp_node, &vec_oprnds, NULL); | |
4874 | ||
4875 | vec_oprnd = VEC_index (tree, vec_oprnds, 0); | |
4876 | } | |
4877 | else | |
4878 | { | |
4879 | /* For interleaved stores we collect vectorized defs for all the | |
4880 | stores in the group in DR_CHAIN and OPRNDS. DR_CHAIN is then | |
4881 | used as an input to vect_permute_store_chain(), and OPRNDS as | |
4882 | an input to vect_get_vec_def_for_stmt_copy() for the next copy. | |
4883 | ||
4884 | If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and | |
4885 | OPRNDS are of size 1. */ | |
4886 | next_stmt = first_stmt; | |
4887 | for (i = 0; i < group_size; i++) | |
4888 | { | |
4889 | /* Since gaps are not supported for interleaved stores, | |
4890 | GROUP_SIZE is the exact number of stmts in the chain. | |
4891 | Therefore, NEXT_STMT can't be NULL_TREE. In case that | |
4892 | there is no interleaving, GROUP_SIZE is 1, and only one | |
4893 | iteration of the loop will be executed. */ | |
4894 | gcc_assert (next_stmt); | |
4895 | op = GIMPLE_STMT_OPERAND (next_stmt, 1); | |
4896 | ||
4897 | vec_oprnd = vect_get_vec_def_for_operand (op, next_stmt, | |
4898 | NULL); | |
4899 | VEC_quick_push(tree, dr_chain, vec_oprnd); | |
4900 | VEC_quick_push(tree, oprnds, vec_oprnd); | |
4901 | next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt)); | |
4902 | } | |
98b44b0e | 4903 | } |
468c2ac0 | 4904 | dataref_ptr = vect_create_data_ref_ptr (first_stmt, NULL, NULL_TREE, |
4090db01 | 4905 | &dummy, &ptr_incr, false, |
468c2ac0 DN |
4906 | TREE_TYPE (vec_oprnd), &inv_p); |
4907 | gcc_assert (!inv_p); | |
89d67cca DN |
4908 | } |
4909 | else | |
4910 | { | |
805e2059 IR |
4911 | /* FORNOW SLP doesn't work for multiple types. */ |
4912 | gcc_assert (!slp); | |
4913 | ||
98b44b0e IR |
4914 | /* For interleaved stores we created vectorized defs for all the |
4915 | defs stored in OPRNDS in the previous iteration (previous copy). | |
4916 | DR_CHAIN is then used as an input to vect_permute_store_chain(), | |
8115817b | 4917 | and OPRNDS as an input to vect_get_vec_def_for_stmt_copy() for the |
98b44b0e IR |
4918 | next copy. |
4919 | If the store is not strided, GROUP_SIZE is 1, and DR_CHAIN and | |
878aa817 | 4920 | OPRNDS are of size 1. */ |
98b44b0e IR |
4921 | for (i = 0; i < group_size; i++) |
4922 | { | |
0bf2cf89 IR |
4923 | op = VEC_index (tree, oprnds, i); |
4924 | vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt); | |
4925 | vec_oprnd = vect_get_vec_def_for_stmt_copy (dt, op); | |
98b44b0e IR |
4926 | VEC_replace(tree, dr_chain, i, vec_oprnd); |
4927 | VEC_replace(tree, oprnds, i, vec_oprnd); | |
4928 | } | |
468c2ac0 DN |
4929 | dataref_ptr = |
4930 | bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, NULL_TREE); | |
89d67cca | 4931 | } |
f7064d11 | 4932 | |
98b44b0e IR |
4933 | if (strided_store) |
4934 | { | |
4935 | result_chain = VEC_alloc (tree, heap, group_size); | |
4936 | /* Permute. */ | |
4937 | if (!vect_permute_store_chain (dr_chain, group_size, stmt, bsi, | |
4938 | &result_chain)) | |
4939 | return false; | |
4940 | } | |
9cf5a7e3 | 4941 | |
98b44b0e | 4942 | next_stmt = first_stmt; |
805e2059 | 4943 | for (i = 0; i < vec_num; i++) |
89d67cca | 4944 | { |
805e2059 IR |
4945 | if (i > 0) |
4946 | /* Bump the vector pointer. */ | |
4947 | dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, | |
4948 | NULL_TREE); | |
4949 | ||
4950 | if (slp) | |
4951 | vec_oprnd = VEC_index (tree, vec_oprnds, i); | |
4952 | else if (strided_store) | |
4953 | /* For strided stores vectorized defs are interleaved in | |
4954 | vect_permute_store_chain(). */ | |
4955 | vec_oprnd = VEC_index (tree, result_chain, i); | |
98b44b0e IR |
4956 | |
4957 | data_ref = build_fold_indirect_ref (dataref_ptr); | |
4958 | /* Arguments are ready. Create the new vector stmt. */ | |
ebb07520 | 4959 | new_stmt = build_gimple_modify_stmt (data_ref, vec_oprnd); |
98b44b0e | 4960 | vect_finish_stmt_generation (stmt, new_stmt, bsi); |
cd7ae74d IR |
4961 | mark_symbols_for_renaming (new_stmt); |
4962 | ||
4963 | if (j == 0) | |
4964 | STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt; | |
98b44b0e | 4965 | else |
cd7ae74d | 4966 | STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt; |
89d67cca | 4967 | |
98b44b0e | 4968 | prev_stmt_info = vinfo_for_stmt (new_stmt); |
878aa817 | 4969 | next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt)); |
98b44b0e IR |
4970 | if (!next_stmt) |
4971 | break; | |
89d67cca | 4972 | } |
9cf5a7e3 KB |
4973 | } |
4974 | ||
f7064d11 DN |
4975 | return true; |
4976 | } | |
4977 | ||
4978 | ||
89d67cca DN |
4979 | /* Function vect_setup_realignment |
4980 | ||
4981 | This function is called when vectorizing an unaligned load using | |
468c2ac0 | 4982 | the dr_explicit_realign[_optimized] scheme. |
89d67cca DN |
4983 | This function generates the following code at the loop prolog: |
4984 | ||
4985 | p = initial_addr; | |
468c2ac0 | 4986 | x msq_init = *(floor(p)); # prolog load |
89d67cca DN |
4987 | realignment_token = call target_builtin; |
4988 | loop: | |
468c2ac0 DN |
4989 | x msq = phi (msq_init, ---) |
4990 | ||
4991 | The stmts marked with x are generated only for the case of | |
4992 | dr_explicit_realign_optimized. | |
89d67cca DN |
4993 | |
4994 | The code above sets up a new (vector) pointer, pointing to the first | |
4995 | location accessed by STMT, and a "floor-aligned" load using that pointer. | |
4996 | It also generates code to compute the "realignment-token" (if the relevant | |
4997 | target hook was defined), and creates a phi-node at the loop-header bb | |
4998 | whose arguments are the result of the prolog-load (created by this | |
4999 | function) and the result of a load that takes place in the loop (to be | |
5000 | created by the caller to this function). | |
468c2ac0 DN |
5001 | |
5002 | For the case of dr_explicit_realign_optimized: | |
89d67cca DN |
5003 | The caller to this function uses the phi-result (msq) to create the |
5004 | realignment code inside the loop, and sets up the missing phi argument, | |
5005 | as follows: | |
89d67cca DN |
5006 | loop: |
5007 | msq = phi (msq_init, lsq) | |
5008 | lsq = *(floor(p')); # load in loop | |
5009 | result = realign_load (msq, lsq, realignment_token); | |
5010 | ||
468c2ac0 DN |
5011 | For the case of dr_explicit_realign: |
5012 | loop: | |
5013 | msq = *(floor(p)); # load in loop | |
5014 | p' = p + (VS-1); | |
5015 | lsq = *(floor(p')); # load in loop | |
5016 | result = realign_load (msq, lsq, realignment_token); | |
5017 | ||
89d67cca DN |
5018 | Input: |
5019 | STMT - (scalar) load stmt to be vectorized. This load accesses | |
5020 | a memory location that may be unaligned. | |
5021 | BSI - place where new code is to be inserted. | |
468c2ac0 DN |
5022 | ALIGNMENT_SUPPORT_SCHEME - which of the two misalignment handling schemes |
5023 | is used. | |
89d67cca DN |
5024 | |
5025 | Output: | |
5026 | REALIGNMENT_TOKEN - the result of a call to the builtin_mask_for_load | |
5027 | target hook, if defined. | |
98b44b0e | 5028 | Return value - the result of the loop-header phi node. */ |
89d67cca DN |
5029 | |
5030 | static tree | |
5031 | vect_setup_realignment (tree stmt, block_stmt_iterator *bsi, | |
468c2ac0 DN |
5032 | tree *realignment_token, |
5033 | enum dr_alignment_support alignment_support_scheme, | |
5034 | tree init_addr, | |
5035 | struct loop **at_loop) | |
89d67cca DN |
5036 | { |
5037 | stmt_vec_info stmt_info = vinfo_for_stmt (stmt); | |
5038 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); | |
5039 | loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); | |
5040 | struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
468c2ac0 | 5041 | edge pe; |
ce133c3f | 5042 | tree scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0); |
89d67cca | 5043 | tree vec_dest; |
89d67cca DN |
5044 | tree inc; |
5045 | tree ptr; | |
5046 | tree data_ref; | |
5047 | tree new_stmt; | |
5048 | basic_block new_bb; | |
468c2ac0 | 5049 | tree msq_init = NULL_TREE; |
89d67cca DN |
5050 | tree new_temp; |
5051 | tree phi_stmt; | |
468c2ac0 DN |
5052 | tree msq = NULL_TREE; |
5053 | tree stmts = NULL_TREE; | |
5054 | bool inv_p; | |
5055 | bool compute_in_loop = false; | |
5056 | bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt); | |
5057 | struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father; | |
5058 | struct loop *loop_for_initial_load; | |
5059 | ||
5060 | gcc_assert (alignment_support_scheme == dr_explicit_realign | |
5061 | || alignment_support_scheme == dr_explicit_realign_optimized); | |
5062 | ||
5063 | /* We need to generate three things: | |
5064 | 1. the misalignment computation | |
5065 | 2. the extra vector load (for the optimized realignment scheme). | |
5066 | 3. the phi node for the two vectors from which the realignment is | |
5067 | done (for the optimized realignment scheme). | |
5068 | */ | |
5069 | ||
5070 | /* 1. Determine where to generate the misalignment computation. | |
5071 | ||
5072 | If INIT_ADDR is NULL_TREE, this indicates that the misalignment | |
5073 | calculation will be generated by this function, outside the loop (in the | |
5074 | preheader). Otherwise, INIT_ADDR had already been computed for us by the | |
5075 | caller, inside the loop. | |
5076 | ||
5077 | Background: If the misalignment remains fixed throughout the iterations of | |
5078 | the loop, then both realignment schemes are applicable, and also the | |
5079 | misalignment computation can be done outside LOOP. This is because we are | |
5080 | vectorizing LOOP, and so the memory accesses in LOOP advance in steps that | |
5081 | are a multiple of VS (the Vector Size), and therefore the misalignment in | |
5082 | different vectorized LOOP iterations is always the same. | |
5083 | The problem arises only if the memory access is in an inner-loop nested | |
5084 | inside LOOP, which is now being vectorized using outer-loop vectorization. | |
5085 | This is the only case when the misalignment of the memory access may not | |
15dc95cb | 5086 | remain fixed throughout the iterations of the inner-loop (as explained in |
468c2ac0 DN |
5087 | detail in vect_supportable_dr_alignment). In this case, not only is the |
5088 | optimized realignment scheme not applicable, but also the misalignment | |
5089 | computation (and generation of the realignment token that is passed to | |
5090 | REALIGN_LOAD) have to be done inside the loop. | |
5091 | ||
5092 | In short, INIT_ADDR indicates whether we are in a COMPUTE_IN_LOOP mode | |
5093 | or not, which in turn determines if the misalignment is computed inside | |
5094 | the inner-loop, or outside LOOP. */ | |
5095 | ||
5096 | if (init_addr != NULL_TREE) | |
5097 | { | |
5098 | compute_in_loop = true; | |
5099 | gcc_assert (alignment_support_scheme == dr_explicit_realign); | |
5100 | } | |
5101 | ||
5102 | ||
5103 | /* 2. Determine where to generate the extra vector load. | |
5104 | ||
5105 | For the optimized realignment scheme, instead of generating two vector | |
5106 | loads in each iteration, we generate a single extra vector load in the | |
5107 | preheader of the loop, and in each iteration reuse the result of the | |
5108 | vector load from the previous iteration. In case the memory access is in | |
5109 | an inner-loop nested inside LOOP, which is now being vectorized using | |
5110 | outer-loop vectorization, we need to determine whether this initial vector | |
5111 | load should be generated at the preheader of the inner-loop, or can be | |
5112 | generated at the preheader of LOOP. If the memory access has no evolution | |
5113 | in LOOP, it can be generated in the preheader of LOOP. Otherwise, it has | |
5114 | to be generated inside LOOP (in the preheader of the inner-loop). */ | |
89d67cca | 5115 | |
468c2ac0 DN |
5116 | if (nested_in_vect_loop) |
5117 | { | |
5118 | tree outerloop_step = STMT_VINFO_DR_STEP (stmt_info); | |
5119 | bool invariant_in_outerloop = | |
5120 | (tree_int_cst_compare (outerloop_step, size_zero_node) == 0); | |
5121 | loop_for_initial_load = (invariant_in_outerloop ? loop : loop->inner); | |
5122 | } | |
5123 | else | |
5124 | loop_for_initial_load = loop; | |
5125 | if (at_loop) | |
5126 | *at_loop = loop_for_initial_load; | |
5127 | ||
5128 | /* 3. For the case of the optimized realignment, create the first vector | |
5129 | load at the loop preheader. */ | |
5130 | ||
5131 | if (alignment_support_scheme == dr_explicit_realign_optimized) | |
5132 | { | |
5133 | /* Create msq_init = *(floor(p1)) in the loop preheader */ | |
5134 | ||
5135 | gcc_assert (!compute_in_loop); | |
5136 | pe = loop_preheader_edge (loop_for_initial_load); | |
5137 | vec_dest = vect_create_destination_var (scalar_dest, vectype); | |
5138 | ptr = vect_create_data_ref_ptr (stmt, loop_for_initial_load, NULL_TREE, | |
5139 | &init_addr, &inc, true, NULL_TREE, &inv_p); | |
5140 | data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr); | |
5141 | new_stmt = build_gimple_modify_stmt (vec_dest, data_ref); | |
5142 | new_temp = make_ssa_name (vec_dest, new_stmt); | |
5143 | GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp; | |
f10d132b | 5144 | mark_symbols_for_renaming (new_stmt); |
468c2ac0 DN |
5145 | new_bb = bsi_insert_on_edge_immediate (pe, new_stmt); |
5146 | gcc_assert (!new_bb); | |
5147 | msq_init = GIMPLE_STMT_OPERAND (new_stmt, 0); | |
5148 | } | |
5149 | ||
5150 | /* 4. Create realignment token using a target builtin, if available. | |
5151 | It is done either inside the containing loop, or before LOOP (as | |
5152 | determined above). */ | |
89d67cca | 5153 | |
89d67cca DN |
5154 | if (targetm.vectorize.builtin_mask_for_load) |
5155 | { | |
5156 | tree builtin_decl; | |
89d67cca | 5157 | |
468c2ac0 DN |
5158 | /* Compute INIT_ADDR - the initial address accessed by this memref. */
5159 | if (compute_in_loop) | |
5160 | gcc_assert (init_addr); /* already computed by the caller. */ | |
5161 | else | |
5162 | { | |
5163 | /* Generate the INIT_ADDR computation outside LOOP. */ | |
5164 | init_addr = vect_create_addr_base_for_vector_ref (stmt, &stmts, | |
5165 | NULL_TREE, loop); | |
5166 | pe = loop_preheader_edge (loop); | |
5167 | new_bb = bsi_insert_on_edge_immediate (pe, stmts); | |
5168 | gcc_assert (!new_bb); | |
5169 | } | |
5170 | ||
89d67cca | 5171 | builtin_decl = targetm.vectorize.builtin_mask_for_load (); |
5039610b | 5172 | new_stmt = build_call_expr (builtin_decl, 1, init_addr); |
4090db01 IR |
5173 | vec_dest = vect_create_destination_var (scalar_dest, |
5174 | TREE_TYPE (new_stmt)); | |
ebb07520 | 5175 | new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt); |
89d67cca | 5176 | new_temp = make_ssa_name (vec_dest, new_stmt); |
07beea0d | 5177 | GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp; |
468c2ac0 DN |
5178 | |
5179 | if (compute_in_loop) | |
5180 | bsi_insert_before (bsi, new_stmt, BSI_SAME_STMT); | |
5181 | else | |
5182 | { | |
5183 | /* Generate the misalignment computation outside LOOP. */ | |
5184 | pe = loop_preheader_edge (loop); | |
5185 | new_bb = bsi_insert_on_edge_immediate (pe, new_stmt); | |
5186 | gcc_assert (!new_bb); | |
5187 | } | |
5188 | ||
07beea0d | 5189 | *realignment_token = GIMPLE_STMT_OPERAND (new_stmt, 0); |
89d67cca DN |
5190 | |
5191 | /* The result of the CALL_EXPR to this builtin is determined from | |
5192 | the value of the parameter and no global variables are touched | |
5193 | which makes the builtin a "const" function. Requiring the | |
5194 | builtin to have the "const" attribute makes it unnecessary | |
5195 | to call mark_call_clobbered. */ | |
5196 | gcc_assert (TREE_READONLY (builtin_decl)); | |
5197 | } | |
5198 | ||
468c2ac0 DN |
5199 | if (alignment_support_scheme == dr_explicit_realign) |
5200 | return msq; | |
5201 | ||
5202 | gcc_assert (!compute_in_loop); | |
5203 | gcc_assert (alignment_support_scheme == dr_explicit_realign_optimized); | |
5204 | ||
5205 | ||
5206 | /* 5. Create msq = phi <msq_init, lsq> in loop */ | |
5207 | ||
5208 | pe = loop_preheader_edge (containing_loop); | |
89d67cca DN |
5209 | vec_dest = vect_create_destination_var (scalar_dest, vectype); |
5210 | msq = make_ssa_name (vec_dest, NULL_TREE); | |
468c2ac0 | 5211 | phi_stmt = create_phi_node (msq, containing_loop->header); |
89d67cca | 5212 | SSA_NAME_DEF_STMT (msq) = phi_stmt; |
468c2ac0 | 5213 | add_phi_arg (phi_stmt, msq_init, pe); |
89d67cca DN |
5214 | |
5215 | return msq; | |
5216 | } | |
5217 | ||
5218 | ||
98b44b0e IR |
5219 | /* Function vect_strided_load_supported. |
5220 | ||
5221 | Returns TRUE if EXTRACT_EVEN and EXTRACT_ODD operations are supported, | |
5222 | and FALSE otherwise. */ | |
5223 | ||
5224 | static bool | |
5225 | vect_strided_load_supported (tree vectype) | |
5226 | { | |
5227 | optab perm_even_optab, perm_odd_optab; | |
5228 | int mode; | |
5229 | ||
5230 | mode = (int) TYPE_MODE (vectype); | |
5231 | ||
5232 | perm_even_optab = optab_for_tree_code (VEC_EXTRACT_EVEN_EXPR, vectype); | |
5233 | if (!perm_even_optab) | |
5234 | { | |
5235 | if (vect_print_dump_info (REPORT_DETAILS)) | |
5236 | fprintf (vect_dump, "no optab for perm_even."); | |
5237 | return false; | |
5238 | } | |
5239 | ||
166cdb08 | 5240 | if (optab_handler (perm_even_optab, mode)->insn_code == CODE_FOR_nothing) |
98b44b0e IR |
5241 | { |
5242 | if (vect_print_dump_info (REPORT_DETAILS)) | |
5243 | fprintf (vect_dump, "perm_even op not supported by target."); | |
5244 | return false; | |
5245 | } | |
5246 | ||
5247 | perm_odd_optab = optab_for_tree_code (VEC_EXTRACT_ODD_EXPR, vectype); | |
5248 | if (!perm_odd_optab) | |
5249 | { | |
5250 | if (vect_print_dump_info (REPORT_DETAILS)) | |
5251 | fprintf (vect_dump, "no optab for perm_odd."); | |
5252 | return false; | |
5253 | } | |
5254 | ||
166cdb08 | 5255 | if (optab_handler (perm_odd_optab, mode)->insn_code == CODE_FOR_nothing) |
98b44b0e IR |
5256 | { |
5257 | if (vect_print_dump_info (REPORT_DETAILS)) | |
5258 | fprintf (vect_dump, "perm_odd op not supported by target."); | |
5259 | return false; | |
5260 | } | |
5261 | return true; | |
5262 | } | |
5263 | ||
5264 | ||
5265 | /* Function vect_permute_load_chain. | |
5266 | ||
5267 | Given a chain of interleaved loads in DR_CHAIN of LENGTH that must be | |
5268 | a power of 2, generate extract_even/odd stmts to reorder the input data | |
5269 | correctly. Return the final references for loads in RESULT_CHAIN. | |
5270 | ||
5271 | E.g., LENGTH is 4 and the scalar type is short, i.e., VF is 8. | |
2f8e468b | 5272 | The input is 4 vectors each containing 8 elements. We assign a number to each |
98b44b0e IR |
5273 | element; the input sequence is: | |
5274 | ||
5275 | 1st vec: 0 1 2 3 4 5 6 7 | |
5276 | 2nd vec: 8 9 10 11 12 13 14 15 | |
5277 | 3rd vec: 16 17 18 19 20 21 22 23 | |
5278 | 4th vec: 24 25 26 27 28 29 30 31 | |
5279 | ||
5280 | The output sequence should be: | |
5281 | ||
5282 | 1st vec: 0 4 8 12 16 20 24 28 | |
5283 | 2nd vec: 1 5 9 13 17 21 25 29 | |
5284 | 3rd vec: 2 6 10 14 18 22 26 30 | |
5285 | 4th vec: 3 7 11 15 19 23 27 31 | |
5286 | ||
5287 | i.e., the first output vector should contain the first elements of each | |
5288 | interleaving group, etc. | |
5289 | ||
5290 | We use extract_even/odd instructions to create such output. The input of each | |
5291 | extract_even/odd operation is two vectors | |
5292 | 1st vec 2nd vec | |
5293 | 0 1 2 3 4 5 6 7 | |
5294 | ||
5295 | and the output is the vector of extracted even/odd elements. The output of | |
5296 | extract_even will be: 0 2 4 6 | |
5297 | and of extract_odd: 1 3 5 7 | |
5298 | ||
5299 | ||
2f8e468b | 5300 | The permutation is done in log2 (LENGTH) stages. In each stage extract_even and
98b44b0e IR |
5301 | extract_odd stmts are created for each pair of vectors in DR_CHAIN in their |
5302 | order. In our example, | |
5303 | ||
5304 | E1: extract_even (1st vec, 2nd vec) | |
5305 | E2: extract_odd (1st vec, 2nd vec) | |
5306 | E3: extract_even (3rd vec, 4th vec) | |
5307 | E4: extract_odd (3rd vec, 4th vec) | |
5308 | ||
5309 | The output for the first stage will be: | |
5310 | ||
5311 | E1: 0 2 4 6 8 10 12 14 | |
5312 | E2: 1 3 5 7 9 11 13 15 | |
5313 | E3: 16 18 20 22 24 26 28 30 | |
5314 | E4: 17 19 21 23 25 27 29 31 | |
5315 | ||
5316 | In order to proceed and create the correct sequence for the next stage (or | |
5317 | for the correct output, if the second stage is the last one, as in our | |
5318 | example), we first put the output of extract_even operation and then the | |
5319 | output of extract_odd in RESULT_CHAIN (which is then copied to DR_CHAIN). | |
5320 | The input for the second stage is: | |
5321 | ||
5322 | 1st vec (E1): 0 2 4 6 8 10 12 14 | |
5323 | 2nd vec (E3): 16 18 20 22 24 26 28 30 | |
5324 | 3rd vec (E2): 1 3 5 7 9 11 13 15 | |
5325 | 4th vec (E4): 17 19 21 23 25 27 29 31 | |
5326 | ||
5327 | The output of the second stage: | |
5328 | ||
5329 | E1: 0 4 8 12 16 20 24 28 | |
5330 | E2: 2 6 10 14 18 22 26 30 | |
5331 | E3: 1 5 9 13 17 21 25 29 | |
5332 | E4: 3 7 11 15 19 23 27 31 | |
5333 | ||
5334 | And RESULT_CHAIN after reordering: | |
5335 | ||
5336 | 1st vec (E1): 0 4 8 12 16 20 24 28 | |
5337 | 2nd vec (E3): 1 5 9 13 17 21 25 29 | |
5338 | 3rd vec (E2): 2 6 10 14 18 22 26 30 | |
5339 | 4th vec (E4): 3 7 11 15 19 23 27 31. */ | |
5340 | ||
5341 | static bool | |
5342 | vect_permute_load_chain (VEC(tree,heap) *dr_chain, | |
5343 | unsigned int length, | |
5344 | tree stmt, | |
5345 | block_stmt_iterator *bsi, | |
5346 | VEC(tree,heap) **result_chain) | |
5347 | { | |
5348 | tree perm_dest, perm_stmt, data_ref, first_vect, second_vect; | |
5349 | tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (stmt)); | |
ebb07520 | 5350 | tree tmp; |
98b44b0e IR |
5351 | int i; |
5352 | unsigned int j; | |
5353 | ||
5354 | /* Check that the operation is supported. */ | |
5355 | if (!vect_strided_load_supported (vectype)) | |
5356 | return false; | |
5357 | ||
5358 | *result_chain = VEC_copy (tree, heap, dr_chain); | |
5359 | for (i = 0; i < exact_log2 (length); i++) | |
5360 | { | |
5361 | for (j = 0; j < length; j +=2) | |
5362 | { | |
5363 | first_vect = VEC_index (tree, dr_chain, j); | |
5364 | second_vect = VEC_index (tree, dr_chain, j+1); | |
5365 | ||
5366 | /* data_ref = permute_even (first_data_ref, second_data_ref); */ | |
5367 | perm_dest = create_tmp_var (vectype, "vect_perm_even"); | |
fc98ed56 | 5368 | DECL_GIMPLE_REG_P (perm_dest) = 1; |
98b44b0e | 5369 | add_referenced_var (perm_dest); |
ebb07520 RS |
5370 | |
5371 | tmp = build2 (VEC_EXTRACT_EVEN_EXPR, vectype, | |
5372 | first_vect, second_vect); | |
5373 | perm_stmt = build_gimple_modify_stmt (perm_dest, tmp); | |
98b44b0e IR |
5374 | |
5375 | data_ref = make_ssa_name (perm_dest, perm_stmt); | |
07beea0d | 5376 | GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref; |
98b44b0e | 5377 | vect_finish_stmt_generation (stmt, perm_stmt, bsi); |
cfaab3a9 | 5378 | mark_symbols_for_renaming (perm_stmt); |
98b44b0e IR |
5379 | |
5380 | VEC_replace (tree, *result_chain, j/2, data_ref); | |
5381 | ||
5382 | /* data_ref = permute_odd (first_data_ref, second_data_ref); */ | |
5383 | perm_dest = create_tmp_var (vectype, "vect_perm_odd"); | |
fc98ed56 | 5384 | DECL_GIMPLE_REG_P (perm_dest) = 1; |
98b44b0e IR |
5385 | add_referenced_var (perm_dest); |
5386 | ||
ebb07520 RS |
5387 | tmp = build2 (VEC_EXTRACT_ODD_EXPR, vectype, |
5388 | first_vect, second_vect); | |
5389 | perm_stmt = build_gimple_modify_stmt (perm_dest, tmp); | |
98b44b0e | 5390 | data_ref = make_ssa_name (perm_dest, perm_stmt); |
07beea0d | 5391 | GIMPLE_STMT_OPERAND (perm_stmt, 0) = data_ref; |
98b44b0e | 5392 | vect_finish_stmt_generation (stmt, perm_stmt, bsi); |
cfaab3a9 | 5393 | mark_symbols_for_renaming (perm_stmt); |
98b44b0e IR |
5394 | |
5395 | VEC_replace (tree, *result_chain, j/2+length/2, data_ref); | |
5396 | } | |
5397 | dr_chain = VEC_copy (tree, heap, *result_chain); | |
5398 | } | |
5399 | return true; | |
5400 | } | |
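/* Illustrative plain-C model of one extract_even/extract_odd step as used
   above (a sketch only, not part of the vectorizer; ELT and NUNITS stand in
   for the vector element type and the number of elements per vector).
   vect_permute_load_chain applies this operation exact_log2 (LENGTH) times,
   placing the even results in the first half of RESULT_CHAIN and the odd
   results in the second half.  */
#if 0
typedef short ELT;

static void
extract_even_odd (const ELT *v0, const ELT *v1, ELT *even, ELT *odd,
                  int nunits)
{
  int i;

  for (i = 0; i < nunits; i++)
    {
      /* Element 2*i of the concatenation <v0, v1> goes to EVEN,
         element 2*i + 1 goes to ODD.  */
      even[i] = (2 * i < nunits) ? v0[2 * i] : v1[2 * i - nunits];
      odd[i] = (2 * i + 1 < nunits) ? v0[2 * i + 1] : v1[2 * i + 1 - nunits];
    }
}
#endif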
5401 | ||
5402 | ||
5403 | /* Function vect_transform_strided_load. | |
5404 | ||
5405 | Given a chain of input interleaved data-refs (in DR_CHAIN), build statements | |
5406 | to perform their permutation and assign the resulting vectorized statements to | |
5407 | the scalar statements. | |
5408 | */ | |
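/* For example (illustrative source): if only x0 and x2 below are used,

     x0 = a[4*i];  x2 = a[4*i + 2];

   the interleaving group covers four elements per scalar iteration, with
   elements 1 and 3 unused (gaps).  Vector loads are still generated for the
   whole group; the permuted results are assigned only to the scalar stmts
   of x0 and x2, and the loads that correspond to the gaps are left for the
   dead code elimination pass to remove.  */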
5409 | ||
5410 | static bool | |
5411 | vect_transform_strided_load (tree stmt, VEC(tree,heap) *dr_chain, int size, | |
5412 | block_stmt_iterator *bsi) | |
5413 | { | |
5414 | stmt_vec_info stmt_info = vinfo_for_stmt (stmt); | |
5415 | tree first_stmt = DR_GROUP_FIRST_DR (stmt_info); | |
5416 | tree next_stmt, new_stmt; | |
5417 | VEC(tree,heap) *result_chain = NULL; | |
5418 | unsigned int i, gap_count; | |
5419 | tree tmp_data_ref; | |
5420 | ||
5421 | /* DR_CHAIN contains input data-refs that are a part of the interleaving. | |
5422 | RESULT_CHAIN is the output of vect_permute_load_chain; it contains permuted | |
5423 | vectors that are ready for vector computation. */ | |
5424 | result_chain = VEC_alloc (tree, heap, size); | |
5425 | /* Permute. */ | |
5426 | if (!vect_permute_load_chain (dr_chain, size, stmt, bsi, &result_chain)) | |
5427 | return false; | |
5428 | ||
5429 | /* Put a permuted data-ref in the VECTORIZED_STMT field. | |
5430 | Since we scan the chain starting from its first node, their order | |
5431 | corresponds to the order of data-refs in RESULT_CHAIN. */ | |
5432 | next_stmt = first_stmt; | |
5433 | gap_count = 1; | |
639d3040 | 5434 | for (i = 0; VEC_iterate (tree, result_chain, i, tmp_data_ref); i++) |
98b44b0e IR |
5435 | { |
5436 | if (!next_stmt) | |
5437 | break; | |
5438 | ||
5439 | /* Skip the gaps. Loads created for the gaps will be removed by dead | |
5440 | code elimination pass later. | |
5441 | DR_GROUP_GAP is the number of steps in elements from the previous | |
5442 | access (if there is no gap DR_GROUP_GAP is 1). We skip loads that | |
5443 | correspond to the gaps. | |
5444 | */ | |
5445 | if (gap_count < DR_GROUP_GAP (vinfo_for_stmt (next_stmt))) | |
5446 | { | |
5447 | gap_count++; | |
5448 | continue; | |
5449 | } | |
5450 | ||
5451 | while (next_stmt) | |
5452 | { | |
5453 | new_stmt = SSA_NAME_DEF_STMT (tmp_data_ref); | |
5454 | /* We assume that if VEC_STMT is not NULL, this is a case of multiple | |
5455 | copies, and we put the new vector statement in the first available | |
5456 | RELATED_STMT. */ | |
5457 | if (!STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt))) | |
5458 | STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)) = new_stmt; | |
5459 | else | |
5460 | { | |
5461 | tree prev_stmt = STMT_VINFO_VEC_STMT (vinfo_for_stmt (next_stmt)); | |
5462 | tree rel_stmt = STMT_VINFO_RELATED_STMT ( | |
5463 | vinfo_for_stmt (prev_stmt)); | |
5464 | while (rel_stmt) | |
5465 | { | |
5466 | prev_stmt = rel_stmt; | |
5467 | rel_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (rel_stmt)); | |
5468 | } | |
5469 | STMT_VINFO_RELATED_STMT (vinfo_for_stmt (prev_stmt)) = new_stmt; | |
5470 | } | |
5471 | next_stmt = DR_GROUP_NEXT_DR (vinfo_for_stmt (next_stmt)); | |
5472 | gap_count = 1; | |
5473 | /* If NEXT_STMT accesses the same DR as the previous statement, | |
5474 | put the same TMP_DATA_REF as its vectorized statement; otherwise | |
5475 | get the next data-ref from RESULT_CHAIN. */ | |
5476 | if (!next_stmt || !DR_GROUP_SAME_DR_STMT (vinfo_for_stmt (next_stmt))) | |
5477 | break; | |
5478 | } | |
5479 | } | |
5480 | return true; | |
5481 | } | |
5482 | ||
5483 | ||
f7064d11 DN |
5484 | /* vectorizable_load. |
5485 | ||
5486 | Check if STMT reads a non-scalar data-ref (array/pointer/structure) that | |
5487 | can be vectorized. | |
5488 | If VEC_STMT is also passed, vectorize the STMT: create a vectorized | |
5489 | stmt to replace it, put it in VEC_STMT, and insert it at BSI. | |
5490 | Return FALSE if not a vectorizable STMT, TRUE otherwise. */ | |
5491 | ||
5492 | bool | |
805e2059 IR |
5493 | vectorizable_load (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt, |
5494 | slp_tree slp_node) | |
f7064d11 DN |
5495 | { |
5496 | tree scalar_dest; | |
5497 | tree vec_dest = NULL; | |
5498 | tree data_ref = NULL; | |
5499 | tree op; | |
5500 | stmt_vec_info stmt_info = vinfo_for_stmt (stmt); | |
89d67cca DN |
5501 | stmt_vec_info prev_stmt_info; |
5502 | loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); | |
5503 | struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
468c2ac0 DN |
5504 | struct loop *containing_loop = (bb_for_stmt (stmt))->loop_father; |
5505 | bool nested_in_vect_loop = nested_in_vect_loop_p (loop, stmt); | |
98b44b0e | 5506 | struct data_reference *dr = STMT_VINFO_DATA_REF (stmt_info), *first_dr; |
f7064d11 DN |
5507 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); |
5508 | tree new_temp; | |
5509 | int mode; | |
98b44b0e | 5510 | tree new_stmt = NULL_TREE; |
f7064d11 | 5511 | tree dummy; |
468c2ac0 | 5512 | enum dr_alignment_support alignment_support_scheme; |
89d67cca DN |
5513 | tree dataref_ptr = NULL_TREE; |
5514 | tree ptr_incr; | |
5515 | int nunits = TYPE_VECTOR_SUBPARTS (vectype); | |
5516 | int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits; | |
98b44b0e | 5517 | int i, j, group_size; |
89d67cca DN |
5518 | tree msq = NULL_TREE, lsq; |
5519 | tree offset = NULL_TREE; | |
5520 | tree realignment_token = NULL_TREE; | |
468c2ac0 | 5521 | tree phi = NULL_TREE; |
98b44b0e IR |
5522 | VEC(tree,heap) *dr_chain = NULL; |
5523 | bool strided_load = false; | |
5524 | tree first_stmt; | |
468c2ac0 DN |
5525 | tree scalar_type; |
5526 | bool inv_p; | |
5527 | bool compute_in_loop = false; | |
5528 | struct loop *at_loop; | |
805e2059 IR |
5529 | int vec_num; |
5530 | bool slp = (slp_node != NULL); | |
5531 | ||
5532 | /* FORNOW: SLP with multiple types is not supported. The SLP analysis verifies | |
5533 | this, so we can safely override NCOPIES with 1 here. */ | |
5534 | if (slp) | |
5535 | ncopies = 1; | |
f7064d11 | 5536 | |
d29de1bf | 5537 | gcc_assert (ncopies >= 1); |
468c2ac0 | 5538 | |
d29de1bf | 5539 | /* FORNOW. This restriction should be relaxed. */ |
468c2ac0 | 5540 | if (nested_in_vect_loop && ncopies > 1) |
d29de1bf DN |
5541 | { |
5542 | if (vect_print_dump_info (REPORT_DETAILS)) | |
5543 | fprintf (vect_dump, "multiple types in nested loop."); | |
5544 | return false; | |
5545 | } | |
5546 | ||
88088c03 DN |
5547 | if (!STMT_VINFO_RELEVANT_P (stmt_info)) |
5548 | return false; | |
5549 | ||
60555ced DN |
5550 | if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def) |
5551 | return false; | |
88088c03 | 5552 | |
60555ced | 5553 | /* Is vectorizable load? */ |
07beea0d | 5554 | if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT) |
f7064d11 DN |
5555 | return false; |
5556 | ||
07beea0d | 5557 | scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0); |
f7064d11 DN |
5558 | if (TREE_CODE (scalar_dest) != SSA_NAME) |
5559 | return false; | |
5560 | ||
07beea0d | 5561 | op = GIMPLE_STMT_OPERAND (stmt, 1); |
98b44b0e IR |
5562 | if (TREE_CODE (op) != ARRAY_REF |
5563 | && TREE_CODE (op) != INDIRECT_REF | |
805e2059 | 5564 | && !STMT_VINFO_STRIDED_ACCESS (stmt_info)) |
f7064d11 DN |
5565 | return false; |
5566 | ||
5567 | if (!STMT_VINFO_DATA_REF (stmt_info)) | |
5568 | return false; | |
5569 | ||
468c2ac0 | 5570 | scalar_type = TREE_TYPE (DR_REF (dr)); |
f7064d11 DN |
5571 | mode = (int) TYPE_MODE (vectype); |
5572 | ||
5573 | /* FORNOW. In some cases we can vectorize even if the data-type is not supported | |
5574 | (e.g. - data copies). */ | |
166cdb08 | 5575 | if (optab_handler (mov_optab, mode)->insn_code == CODE_FOR_nothing) |
f7064d11 | 5576 | { |
00518cb1 | 5577 | if (vect_print_dump_info (REPORT_DETAILS)) |
f7064d11 DN |
5578 | fprintf (vect_dump, "Aligned load, but unsupported type."); |
5579 | return false; | |
5580 | } | |
5581 | ||
98b44b0e | 5582 | /* Check if the load is a part of an interleaving chain. */ |
805e2059 | 5583 | if (STMT_VINFO_STRIDED_ACCESS (stmt_info)) |
98b44b0e IR |
5584 | { |
5585 | strided_load = true; | |
468c2ac0 DN |
5586 | /* FORNOW */ |
5587 | gcc_assert (! nested_in_vect_loop); | |
98b44b0e IR |
5588 | |
5589 | /* Check if interleaving is supported. */ | |
805e2059 IR |
5590 | if (!vect_strided_load_supported (vectype) |
5591 | && !PURE_SLP_STMT (stmt_info) && !slp) | |
98b44b0e IR |
5592 | return false; |
5593 | } | |
5594 | ||
f7064d11 DN |
5595 | if (!vec_stmt) /* transformation not required. */ |
5596 | { | |
5597 | STMT_VINFO_TYPE (stmt_info) = load_vec_info_type; | |
805e2059 | 5598 | vect_model_load_cost (stmt_info, ncopies, NULL); |
f7064d11 DN |
5599 | return true; |
5600 | } | |
5601 | ||
00518cb1 | 5602 | if (vect_print_dump_info (REPORT_DETAILS)) |
f7064d11 DN |
5603 | fprintf (vect_dump, "transform load."); |
5604 | ||
792ed98b HJ |
5605 | /** Transform. **/ |
5606 | ||
98b44b0e IR |
5607 | if (strided_load) |
5608 | { | |
5609 | first_stmt = DR_GROUP_FIRST_DR (stmt_info); | |
5610 | /* Check if the chain of loads is already vectorized. */ | |
5611 | if (STMT_VINFO_VEC_STMT (vinfo_for_stmt (first_stmt))) | |
5612 | { | |
5613 | *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info); | |
5614 | return true; | |
5615 | } | |
5616 | first_dr = STMT_VINFO_DATA_REF (vinfo_for_stmt (first_stmt)); | |
5617 | group_size = DR_GROUP_SIZE (vinfo_for_stmt (first_stmt)); | |
5618 | dr_chain = VEC_alloc (tree, heap, group_size); | |
805e2059 IR |
5619 | |
5620 | /* VEC_NUM is the number of vect stmts to be created for this group. */ | |
5621 | if (slp) | |
5622 | { | |
5623 | strided_load = false; | |
5624 | vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node); | |
5625 | } | |
5626 | else | |
5627 | vec_num = group_size; | |
98b44b0e IR |
5628 | } |
5629 | else | |
5630 | { | |
5631 | first_stmt = stmt; | |
5632 | first_dr = dr; | |
805e2059 | 5633 | group_size = vec_num = 1; |
98b44b0e IR |
5634 | } |
5635 | ||
468c2ac0 DN |
5636 | alignment_support_scheme = vect_supportable_dr_alignment (first_dr); |
5637 | gcc_assert (alignment_support_scheme); | |
98b44b0e | 5638 | |
89d67cca DN |
5639 | /* In case the vectorization factor (VF) is bigger than the number |
5640 | of elements that we can fit in a vectype (nunits), we have to generate | |
5641 | more than one vector stmt - i.e - we need to "unroll" the | |
5642 | vector stmt by a factor VF/nunits. In doing so, we record a pointer | |
5643 | from one copy of the vector stmt to the next, in the field | |
5644 | STMT_VINFO_RELATED_STMT. This is necessary in order to allow following | |
5645 | stages to find the correct vector defs to be used when vectorizing | |
5646 | stmts that use the defs of the current stmt. The example below illustrates | |
5647 | the vectorization process when VF=16 and nunits=4 (i.e - we need to create | |
5648 | 4 vectorized stmts): | |
5649 | ||
5650 | before vectorization: | |
5651 | RELATED_STMT VEC_STMT | |
5652 | S1: x = memref - - | |
5653 | S2: z = x + 1 - - | |
5654 | ||
5655 | step 1: vectorize stmt S1: | |
5656 | We first create the vector stmt VS1_0, and, as usual, record a | |
5657 | pointer to it in the STMT_VINFO_VEC_STMT of the scalar stmt S1. | |
5658 | Next, we create the vector stmt VS1_1, and record a pointer to | |
5659 | it in the STMT_VINFO_RELATED_STMT of the vector stmt VS1_0. | |
5660 | Similarly, for VS1_2 and VS1_3. This is the resulting chain of | |
5661 | stmts and pointers: | |
5662 | RELATED_STMT VEC_STMT | |
5663 | VS1_0: vx0 = memref0 VS1_1 - | |
5664 | VS1_1: vx1 = memref1 VS1_2 - | |
5665 | VS1_2: vx2 = memref2 VS1_3 - | |
5666 | VS1_3: vx3 = memref3 - - | |
5667 | S1: x = load - VS1_0 | |
5668 | S2: z = x + 1 - - | |
5669 | ||
5670 | See in documentation in vect_get_vec_def_for_stmt_copy for how the | |
5671 | information we recorded in RELATED_STMT field is used to vectorize | |
5672 | stmt S2. */ | |
5673 | ||
98b44b0e IR |
5674 | /* In case of interleaving (non-unit strided access): |
5675 | ||
5676 | S1: x2 = &base + 2 | |
5677 | S2: x0 = &base | |
5678 | S3: x1 = &base + 1 | |
5679 | S4: x3 = &base + 3 | |
5680 | ||
5681 | Vectorized loads are created in the order of memory accesses | |
5682 | starting from the access of the first stmt of the chain: | |
5683 | ||
5684 | VS1: vx0 = &base | |
5685 | VS2: vx1 = &base + vec_size*1 | |
5686 | VS3: vx2 = &base + vec_size*2 | |
5687 | VS4: vx3 = &base + vec_size*3 | |
5688 | ||
5689 | Then permutation statements are generated: | |
5690 | ||
5691 | VS5: vx5 = VEC_EXTRACT_EVEN_EXPR < vx0, vx1 > | |
5692 | VS6: vx6 = VEC_EXTRACT_ODD_EXPR < vx0, vx1 > | |
5693 | ... | |
5694 | ||
5695 | And they are put in STMT_VINFO_VEC_STMT of the corresponding scalar stmts | |
5696 | (the order of the data-refs in the output of vect_permute_load_chain | |
5697 | corresponds to the order of scalar stmts in the interleaving chain - see | |
2f8e468b | 5698 | the documentation of vect_permute_load_chain()). |
98b44b0e IR |
5699 | The generation of permutation stmts and recording them in |
5700 | STMT_VINFO_VEC_STMT is done in vect_transform_strided_load(). | |
5701 | ||
5702 | In case of both multiple types and interleaving, the vector loads and | |
5703 | permutation stmts above are created for every copy. The result vector stmts | |
5704 | are put in STMT_VINFO_VEC_STMT for the first copy and in the corresponding | |
5705 | STMT_VINFO_RELATED_STMT for the next copies. */ | |
5706 | ||
89d67cca DN |
5707 | /* If the data reference is aligned (dr_aligned) or potentially unaligned |
5708 | on a target that supports unaligned accesses (dr_unaligned_supported) | |
5709 | we generate the following code: | |
f7064d11 DN |
5710 | p = initial_addr; |
5711 | indx = 0; | |
5712 | loop { | |
89d67cca | 5713 | p = p + indx * vectype_size; |
f7064d11 DN |
5714 | vec_dest = *(p); |
5715 | indx = indx + 1; | |
5716 | } | |
f7064d11 | 5717 | |
89d67cca | 5718 | Otherwise, the data reference is potentially unaligned on a target that |
468c2ac0 | 5719 | does not support unaligned accesses (dr_explicit_realign_optimized) - |
89d67cca DN |
5720 | then generate the following code, in which the data in each iteration is |
5721 | obtained by two vector loads, one from the previous iteration, and one | |
5722 | from the current iteration: | |
5723 | p1 = initial_addr; | |
5724 | msq_init = *(floor(p1)) | |
5725 | p2 = initial_addr + VS - 1; | |
5726 | realignment_token = call target_builtin; | |
5727 | indx = 0; | |
5728 | loop { | |
5729 | p2 = p2 + indx * vectype_size | |
5730 | lsq = *(floor(p2)) | |
5731 | vec_dest = realign_load (msq, lsq, realignment_token) | |
5732 | indx = indx + 1; | |
5733 | msq = lsq; | |
98b44b0e | 5734 | } */ |
89d67cca | 5735 | |
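   /* Conceptually (illustrative byte-element view; the real semantics are
      target-defined through REALIGNMENT_TOKEN, assumed here to encode the
      byte misalignment OFS of the original address), realign_load computes

        for (i = 0; i < vecsize; i++)
          vec_dest[i] = (OFS + i < vecsize) ? msq[OFS + i]
                                            : lsq[OFS + i - vecsize];

      i.e. the unaligned vector is stitched together from the two aligned
      vectors that straddle it.  */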
468c2ac0 DN |
5736 | /* If the misalignment remains the same throughout the execution of the |
5737 | loop, we can create the init_addr and permutation mask at the loop | |
5738 | preheader. Otherwise, it needs to be created inside the loop. | |
5739 | This can only occur when vectorizing memory accesses in the inner-loop | |
5740 | nested within an outer-loop that is being vectorized. */ | |
5741 | ||
5742 | if (nested_in_vect_loop_p (loop, stmt) | |
5743 | && (TREE_INT_CST_LOW (DR_STEP (dr)) % UNITS_PER_SIMD_WORD != 0)) | |
5744 | { | |
5745 | gcc_assert (alignment_support_scheme != dr_explicit_realign_optimized); | |
5746 | compute_in_loop = true; | |
5747 | } | |
5748 | ||
5749 | if ((alignment_support_scheme == dr_explicit_realign_optimized | |
5750 | || alignment_support_scheme == dr_explicit_realign) | |
5751 | && !compute_in_loop) | |
89d67cca | 5752 | { |
468c2ac0 DN |
5753 | msq = vect_setup_realignment (first_stmt, bsi, &realignment_token, |
5754 | alignment_support_scheme, NULL_TREE, | |
5755 | &at_loop); | |
5756 | if (alignment_support_scheme == dr_explicit_realign_optimized) | |
5757 | { | |
5758 | phi = SSA_NAME_DEF_STMT (msq); | |
5759 | offset = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1); | |
5760 | } | |
f7064d11 | 5761 | } |
468c2ac0 DN |
5762 | else |
5763 | at_loop = loop; | |
f7064d11 | 5764 | |
89d67cca DN |
5765 | prev_stmt_info = NULL; |
5766 | for (j = 0; j < ncopies; j++) | |
5767 | { | |
5768 | /* 1. Create the vector pointer update chain. */ | |
5769 | if (j == 0) | |
468c2ac0 DN |
5770 | dataref_ptr = vect_create_data_ref_ptr (first_stmt, |
5771 | at_loop, offset, | |
5772 | &dummy, &ptr_incr, false, | |
5773 | NULL_TREE, &inv_p); | |
89d67cca | 5774 | else |
468c2ac0 DN |
5775 | dataref_ptr = |
5776 | bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, NULL_TREE); | |
f7064d11 | 5777 | |
805e2059 | 5778 | for (i = 0; i < vec_num; i++) |
98b44b0e | 5779 | { |
805e2059 IR |
5780 | if (i > 0) |
5781 | dataref_ptr = bump_vector_ptr (dataref_ptr, ptr_incr, bsi, stmt, | |
5782 | NULL_TREE); | |
5783 | ||
98b44b0e | 5784 | /* 2. Create the vector-load in the loop. */ |
468c2ac0 | 5785 | switch (alignment_support_scheme) |
98b44b0e IR |
5786 | { |
5787 | case dr_aligned: | |
5788 | gcc_assert (aligned_access_p (first_dr)); | |
5789 | data_ref = build_fold_indirect_ref (dataref_ptr); | |
5790 | break; | |
5791 | case dr_unaligned_supported: | |
5792 | { | |
5793 | int mis = DR_MISALIGNMENT (first_dr); | |
5794 | tree tmis = (mis == -1 ? size_zero_node : size_int (mis)); | |
5795 | ||
98b44b0e IR |
5796 | tmis = size_binop (MULT_EXPR, tmis, size_int(BITS_PER_UNIT)); |
5797 | data_ref = | |
5798 | build2 (MISALIGNED_INDIRECT_REF, vectype, dataref_ptr, tmis); | |
5799 | break; | |
5800 | } | |
468c2ac0 DN |
5801 | case dr_explicit_realign: |
5802 | { | |
5803 | tree ptr, bump; | |
5804 | tree vs_minus_1 = size_int (TYPE_VECTOR_SUBPARTS (vectype) - 1); | |
5805 | ||
5806 | if (compute_in_loop) | |
5807 | msq = vect_setup_realignment (first_stmt, bsi, | |
5808 | &realignment_token, | |
5809 | dr_explicit_realign, | |
5810 | dataref_ptr, NULL); | |
5811 | ||
5812 | data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr); | |
5813 | vec_dest = vect_create_destination_var (scalar_dest, vectype); | |
5814 | new_stmt = build_gimple_modify_stmt (vec_dest, data_ref); | |
5815 | new_temp = make_ssa_name (vec_dest, new_stmt); | |
5816 | GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp; | |
5817 | vect_finish_stmt_generation (stmt, new_stmt, bsi); | |
5818 | copy_virtual_operands (new_stmt, stmt); | |
5819 | mark_symbols_for_renaming (new_stmt); | |
5820 | msq = new_temp; | |
5821 | ||
5822 | bump = size_binop (MULT_EXPR, vs_minus_1, | |
5823 | TYPE_SIZE_UNIT (scalar_type)); | |
5824 | ptr = bump_vector_ptr (dataref_ptr, NULL_TREE, bsi, stmt, bump); | |
5825 | data_ref = build1 (ALIGN_INDIRECT_REF, vectype, ptr); | |
5826 | break; | |
5827 | } | |
5828 | case dr_explicit_realign_optimized: | |
98b44b0e IR |
5829 | data_ref = build1 (ALIGN_INDIRECT_REF, vectype, dataref_ptr); |
5830 | break; | |
5831 | default: | |
5832 | gcc_unreachable (); | |
5833 | } | |
5834 | vec_dest = vect_create_destination_var (scalar_dest, vectype); | |
ebb07520 | 5835 | new_stmt = build_gimple_modify_stmt (vec_dest, data_ref); |
98b44b0e | 5836 | new_temp = make_ssa_name (vec_dest, new_stmt); |
07beea0d | 5837 | GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp; |
98b44b0e | 5838 | vect_finish_stmt_generation (stmt, new_stmt, bsi); |
cfaab3a9 | 5839 | mark_symbols_for_renaming (new_stmt); |
98b44b0e | 5840 | |
468c2ac0 DN |
5841 | /* 3. Handle explicit realignment if necessary/supported. Create in |
5842 | loop: vec_dest = realign_load (msq, lsq, realignment_token) */ | |
5843 | if (alignment_support_scheme == dr_explicit_realign_optimized | |
5844 | || alignment_support_scheme == dr_explicit_realign) | |
98b44b0e | 5845 | { |
07beea0d | 5846 | lsq = GIMPLE_STMT_OPERAND (new_stmt, 0); |
98b44b0e IR |
5847 | if (!realignment_token) |
5848 | realignment_token = dataref_ptr; | |
5849 | vec_dest = vect_create_destination_var (scalar_dest, vectype); | |
468c2ac0 DN |
5850 | new_stmt = build3 (REALIGN_LOAD_EXPR, vectype, msq, lsq, |
5851 | realignment_token); | |
ebb07520 | 5852 | new_stmt = build_gimple_modify_stmt (vec_dest, new_stmt); |
98b44b0e | 5853 | new_temp = make_ssa_name (vec_dest, new_stmt); |
07beea0d | 5854 | GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp; |
98b44b0e | 5855 | vect_finish_stmt_generation (stmt, new_stmt, bsi); |
468c2ac0 DN |
5856 | |
5857 | if (alignment_support_scheme == dr_explicit_realign_optimized) | |
5858 | { | |
805e2059 | 5859 | if (i == vec_num - 1 && j == ncopies - 1) |
468c2ac0 DN |
5860 | add_phi_arg (phi, lsq, loop_latch_edge (containing_loop)); |
5861 | msq = lsq; | |
5862 | } | |
5863 | } | |
5864 | ||
5865 | /* 4. Handle invariant-load. */ | |
5866 | if (inv_p) | |
5867 | { | |
5868 | gcc_assert (!strided_load); | |
5869 | gcc_assert (nested_in_vect_loop_p (loop, stmt)); | |
5870 | if (j == 0) | |
5871 | { | |
5872 | int k; | |
5873 | tree t = NULL_TREE; | |
5874 | tree vec_inv, bitpos, bitsize = TYPE_SIZE (scalar_type); | |
5875 | ||
5876 | /* CHECKME: bitpos depends on endianness? */ | |
5877 | bitpos = bitsize_zero_node; | |
5878 | vec_inv = build3 (BIT_FIELD_REF, scalar_type, new_temp, | |
5879 | bitsize, bitpos); | |
5880 | BIT_FIELD_REF_UNSIGNED (vec_inv) = | |
5881 | TYPE_UNSIGNED (scalar_type); | |
5882 | vec_dest = | |
5883 | vect_create_destination_var (scalar_dest, NULL_TREE); | |
5884 | new_stmt = build_gimple_modify_stmt (vec_dest, vec_inv); | |
5885 | new_temp = make_ssa_name (vec_dest, new_stmt); | |
5886 | GIMPLE_STMT_OPERAND (new_stmt, 0) = new_temp; | |
5887 | vect_finish_stmt_generation (stmt, new_stmt, bsi); | |
5888 | ||
5889 | for (k = nunits - 1; k >= 0; --k) | |
5890 | t = tree_cons (NULL_TREE, new_temp, t); | |
5891 | /* FIXME: use build_constructor directly. */ | |
5892 | vec_inv = build_constructor_from_list (vectype, t); | |
5893 | new_temp = vect_init_vector (stmt, vec_inv, vectype, bsi); | |
5894 | new_stmt = SSA_NAME_DEF_STMT (new_temp); | |
5895 | } | |
5896 | else | |
5897 | gcc_unreachable (); /* FORNOW. */ | |
98b44b0e | 5898 | } |
468c2ac0 | 5899 | |
805e2059 IR |
5900 | /* Collect vector loads and later create their permutation in |
5901 | vect_transform_strided_load (). */ | |
5902 | if (strided_load) | |
5903 | VEC_quick_push (tree, dr_chain, new_temp); | |
5904 | ||
5905 | /* Store vector loads in the corresponding SLP_NODE. */ | |
5906 | if (slp) | |
5907 | VEC_quick_push (tree, SLP_TREE_VEC_STMTS (slp_node), new_stmt); | |
98b44b0e | 5908 | } |
f7064d11 | 5909 | |
805e2059 IR |
5910 | /* FORNOW: SLP with multiple types is unsupported. */ |
5911 | if (slp) | |
5912 | return true; | |
5913 | ||
98b44b0e IR |
5914 | if (strided_load) |
5915 | { | |
5916 | if (!vect_transform_strided_load (stmt, dr_chain, group_size, bsi)) | |
5917 | return false; | |
5918 | *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info); | |
5919 | dr_chain = VEC_alloc (tree, heap, group_size); | |
5920 | } | |
89d67cca | 5921 | else |
98b44b0e IR |
5922 | { |
5923 | if (j == 0) | |
5924 | STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt; | |
5925 | else | |
5926 | STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt; | |
5927 | prev_stmt_info = vinfo_for_stmt (new_stmt); | |
5928 | } | |
f7064d11 | 5929 | } |
f7064d11 | 5930 | |
f7064d11 DN |
5931 | return true; |
5932 | } | |
5933 | ||
88088c03 DN |
5934 | |
5935 | /* Function vectorizable_live_operation. | |
5936 | ||
5937 | STMT computes a value that is used outside the loop. Check if | |
5938 | it can be supported. */ | |
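/* For example (illustrative source): in

     for (i = 0; i < n; i++)
       {
         t = c0 + c1;          <-- c0, c1 loop-invariant
         a[i] = t;
       }
     ... = t;                  <-- t is live after the loop

   all operands of the live stmt are invariant, so the scalar computation of
   t can stay in place and no vector code needs to be generated for it.  */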
5939 | ||
5940 | bool | |
5941 | vectorizable_live_operation (tree stmt, | |
5942 | block_stmt_iterator *bsi ATTRIBUTE_UNUSED, | |
5943 | tree *vec_stmt ATTRIBUTE_UNUSED) | |
5944 | { | |
5945 | tree operation; | |
5946 | stmt_vec_info stmt_info = vinfo_for_stmt (stmt); | |
5947 | loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); | |
d29de1bf | 5948 | struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
88088c03 | 5949 | int i; |
88088c03 DN |
5950 | int op_type; |
5951 | tree op; | |
5952 | tree def, def_stmt; | |
5953 | enum vect_def_type dt; | |
5954 | ||
60555ced DN |
5955 | gcc_assert (STMT_VINFO_LIVE_P (stmt_info)); |
5956 | ||
5957 | if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def) | |
88088c03 DN |
5958 | return false; |
5959 | ||
07beea0d | 5960 | if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT) |
88088c03 DN |
5961 | return false; |
5962 | ||
07beea0d | 5963 | if (TREE_CODE (GIMPLE_STMT_OPERAND (stmt, 0)) != SSA_NAME) |
88088c03 DN |
5964 | return false; |
5965 | ||
d29de1bf DN |
5966 | /* FORNOW. CHECKME. */ |
5967 | if (nested_in_vect_loop_p (loop, stmt)) | |
5968 | return false; | |
5969 | ||
07beea0d | 5970 | operation = GIMPLE_STMT_OPERAND (stmt, 1); |
5039610b | 5971 | op_type = TREE_OPERAND_LENGTH (operation); |
88088c03 DN |
5972 | |
5973 | /* FORNOW: support only if all uses are invariant. This means | |
5974 | that the scalar operations can remain in place, unvectorized. | |
5975 | The original last scalar value that they compute will be used. */ | |
5976 | ||
5977 | for (i = 0; i < op_type; i++) | |
5978 | { | |
5979 | op = TREE_OPERAND (operation, i); | |
7de5c6a4 | 5980 | if (op && !vect_is_simple_use (op, loop_vinfo, &def_stmt, &def, &dt)) |
88088c03 | 5981 | { |
00518cb1 | 5982 | if (vect_print_dump_info (REPORT_DETAILS)) |
88088c03 DN |
5983 | fprintf (vect_dump, "use not simple."); |
5984 | return false; | |
5985 | } | |
5986 | ||
5987 | if (dt != vect_invariant_def && dt != vect_constant_def) | |
5988 | return false; | |
5989 | } | |
5990 | ||
5991 | /* No transformation is required for the cases we currently support. */ | |
5992 | return true; | |
5993 | } | |
5994 | ||
5995 | ||
b52485c6 DP |
5996 | /* Function vect_is_simple_cond. |
5997 | ||
5998 | Input: | |
5999 | LOOP - the loop that is being vectorized. | |
6000 | COND - Condition that is checked for simple use. | |
6001 | ||
a82635d0 | 6002 | Returns whether a COND can be vectorized. Checks whether |
b52485c6 DP |
6003 | condition operands are supportable using vect_is_simple_use. */ | |
6004 | ||
6005 | static bool | |
6006 | vect_is_simple_cond (tree cond, loop_vec_info loop_vinfo) | |
6007 | { | |
6008 | tree lhs, rhs; | |
88088c03 DN |
6009 | tree def; |
6010 | enum vect_def_type dt; | |
b52485c6 | 6011 | |
7da4bf7d | 6012 | if (!COMPARISON_CLASS_P (cond)) |
b52485c6 DP |
6013 | return false; |
6014 | ||
6015 | lhs = TREE_OPERAND (cond, 0); | |
6016 | rhs = TREE_OPERAND (cond, 1); | |
6017 | ||
6018 | if (TREE_CODE (lhs) == SSA_NAME) | |
6019 | { | |
6020 | tree lhs_def_stmt = SSA_NAME_DEF_STMT (lhs); | |
88088c03 | 6021 | if (!vect_is_simple_use (lhs, loop_vinfo, &lhs_def_stmt, &def, &dt)) |
b52485c6 DP |
6022 | return false; |
6023 | } | |
325217ed CF |
6024 | else if (TREE_CODE (lhs) != INTEGER_CST && TREE_CODE (lhs) != REAL_CST |
6025 | && TREE_CODE (lhs) != FIXED_CST) | |
b52485c6 DP |
6026 | return false; |
6027 | ||
6028 | if (TREE_CODE (rhs) == SSA_NAME) | |
6029 | { | |
6030 | tree rhs_def_stmt = SSA_NAME_DEF_STMT (rhs); | |
88088c03 | 6031 | if (!vect_is_simple_use (rhs, loop_vinfo, &rhs_def_stmt, &def, &dt)) |
b52485c6 DP |
6032 | return false; |
6033 | } | |
325217ed CF |
6034 | else if (TREE_CODE (rhs) != INTEGER_CST && TREE_CODE (rhs) != REAL_CST |
6035 | && TREE_CODE (rhs) != FIXED_CST) | |
b52485c6 DP |
6036 | return false; |
6037 | ||
6038 | return true; | |
6039 | } | |
6040 | ||
6041 | /* vectorizable_condition. | |
6042 | ||
6043 | Check if STMT is a conditional modify expression that can be vectorized. | |
6044 | If VEC_STMT is also passed, vectorize the STMT: create a vectorized | |
6045 | stmt using VEC_COND_EXPR to replace it, put it in VEC_STMT, and insert it | |
6046 | at BSI. | |
6047 | ||
6048 | Return FALSE if not a vectorizable STMT, TRUE otherwise. */ | |
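/* For example (illustrative source and names):

     for (i = 0; i < n; i++)
       a[i] = b[i] < c[i] ? x : y;

   is handled by building, for each vector of iterations,

     vec_compare   = vb < vc;
     vec_cond_expr = VEC_COND_EXPR <vec_compare, vx, vy>;

   which mirrors the build2/build3 calls in the transformation below.  */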
6049 | ||
6050 | bool | |
6051 | vectorizable_condition (tree stmt, block_stmt_iterator *bsi, tree *vec_stmt) | |
6052 | { | |
6053 | tree scalar_dest = NULL_TREE; | |
6054 | tree vec_dest = NULL_TREE; | |
6055 | tree op = NULL_TREE; | |
6056 | tree cond_expr, then_clause, else_clause; | |
6057 | stmt_vec_info stmt_info = vinfo_for_stmt (stmt); | |
6058 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); | |
6059 | tree vec_cond_lhs, vec_cond_rhs, vec_then_clause, vec_else_clause; | |
6060 | tree vec_compare, vec_cond_expr; | |
6061 | tree new_temp; | |
6062 | loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info); | |
6063 | enum machine_mode vec_mode; | |
88088c03 DN |
6064 | tree def; |
6065 | enum vect_def_type dt; | |
89d67cca DN |
6066 | int nunits = TYPE_VECTOR_SUBPARTS (vectype); |
6067 | int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits; | |
6068 | ||
6069 | gcc_assert (ncopies >= 1); | |
6070 | if (ncopies > 1) | |
6071 | return false; /* FORNOW */ | |
b52485c6 DP |
6072 | |
6073 | if (!STMT_VINFO_RELEVANT_P (stmt_info)) | |
6074 | return false; | |
6075 | ||
60555ced DN |
6076 | if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_loop_def) |
6077 | return false; | |
88088c03 | 6078 | |
805e2059 IR |
6079 | /* FORNOW: SLP not supported. */ |
6080 | if (STMT_SLP_TYPE (stmt_info)) | |
6081 | return false; | |
6082 | ||
60555ced | 6083 | /* FORNOW: not yet supported. */ |
88088c03 DN |
6084 | if (STMT_VINFO_LIVE_P (stmt_info)) |
6085 | { | |
00518cb1 | 6086 | if (vect_print_dump_info (REPORT_DETAILS)) |
88088c03 DN |
6087 | fprintf (vect_dump, "value used after loop."); |
6088 | return false; | |
6089 | } | |
6090 | ||
60555ced | 6091 | /* Is vectorizable conditional operation? */ |
07beea0d | 6092 | if (TREE_CODE (stmt) != GIMPLE_MODIFY_STMT) |
b52485c6 DP |
6093 | return false; |
6094 | ||
07beea0d | 6095 | op = GIMPLE_STMT_OPERAND (stmt, 1); |
b52485c6 DP |
6096 | |
6097 | if (TREE_CODE (op) != COND_EXPR) | |
6098 | return false; | |
6099 | ||
6100 | cond_expr = TREE_OPERAND (op, 0); | |
6101 | then_clause = TREE_OPERAND (op, 1); | |
6102 | else_clause = TREE_OPERAND (op, 2); | |
6103 | ||
10b96810 AP |
6104 | if (!vect_is_simple_cond (cond_expr, loop_vinfo)) |
6105 | return false; | |
6106 | ||
75bfa678 RG |
6107 | /* We do not handle two different vector types for the condition |
6108 | and the values. */ | |
6109 | if (TREE_TYPE (TREE_OPERAND (cond_expr, 0)) != TREE_TYPE (vectype)) | |
6110 | return false; | |
6111 | ||
b52485c6 DP |
6112 | if (TREE_CODE (then_clause) == SSA_NAME) |
6113 | { | |
6114 | tree then_def_stmt = SSA_NAME_DEF_STMT (then_clause); | |
88088c03 DN |
6115 | if (!vect_is_simple_use (then_clause, loop_vinfo, |
6116 | &then_def_stmt, &def, &dt)) | |
b52485c6 DP |
6117 | return false; |
6118 | } | |
6119 | else if (TREE_CODE (then_clause) != INTEGER_CST | |
325217ed CF |
6120 | && TREE_CODE (then_clause) != REAL_CST |
6121 | && TREE_CODE (then_clause) != FIXED_CST) | |
b52485c6 DP |
6122 | return false; |
6123 | ||
6124 | if (TREE_CODE (else_clause) == SSA_NAME) | |
6125 | { | |
6126 | tree else_def_stmt = SSA_NAME_DEF_STMT (else_clause); | |
88088c03 DN |
6127 | if (!vect_is_simple_use (else_clause, loop_vinfo, |
6128 | &else_def_stmt, &def, &dt)) | |
b52485c6 DP |
6129 | return false; |
6130 | } | |
6131 | else if (TREE_CODE (else_clause) != INTEGER_CST | |
325217ed CF |
6132 | && TREE_CODE (else_clause) != REAL_CST |
6133 | && TREE_CODE (else_clause) != FIXED_CST) | |
b52485c6 DP |
6134 | return false; |
6135 | ||
6136 | ||
6137 | vec_mode = TYPE_MODE (vectype); | |
6138 | ||
6139 | if (!vec_stmt) | |
6140 | { | |
6141 | STMT_VINFO_TYPE (stmt_info) = condition_vec_info_type; | |
6142 | return expand_vec_cond_expr_p (op, vec_mode); | |
6143 | } | |
6144 | ||
6145 | /* Transform */ | |
6146 | ||
6147 | /* Handle def. */ | |
07beea0d | 6148 | scalar_dest = GIMPLE_STMT_OPERAND (stmt, 0); |
b52485c6 DP |
6149 | vec_dest = vect_create_destination_var (scalar_dest, vectype); |
6150 | ||
6151 | /* Handle cond expr. */ | |
6152 | vec_cond_lhs = | |
61d3cdbb | 6153 | vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 0), stmt, NULL); |
b52485c6 | 6154 | vec_cond_rhs = |
61d3cdbb DN |
6155 | vect_get_vec_def_for_operand (TREE_OPERAND (cond_expr, 1), stmt, NULL); |
6156 | vec_then_clause = vect_get_vec_def_for_operand (then_clause, stmt, NULL); | |
6157 | vec_else_clause = vect_get_vec_def_for_operand (else_clause, stmt, NULL); | |
b52485c6 DP |
6158 | |
6159 | /* Arguments are ready. Create the new vector stmt. */ | |
6160 | vec_compare = build2 (TREE_CODE (cond_expr), vectype, | |
6161 | vec_cond_lhs, vec_cond_rhs); | |
b4257cfc RG |
6162 | vec_cond_expr = build3 (VEC_COND_EXPR, vectype, |
6163 | vec_compare, vec_then_clause, vec_else_clause); | |
b52485c6 | 6164 | |
ebb07520 | 6165 | *vec_stmt = build_gimple_modify_stmt (vec_dest, vec_cond_expr); |
b52485c6 | 6166 | new_temp = make_ssa_name (vec_dest, *vec_stmt); |
07beea0d | 6167 | GIMPLE_STMT_OPERAND (*vec_stmt, 0) = new_temp; |
b52485c6 DP |
6168 | vect_finish_stmt_generation (stmt, *vec_stmt, bsi); |
6169 | ||
6170 | return true; | |
6171 | } | |
f7064d11 | 6172 | |
805e2059 | 6173 | |
f7064d11 DN |
6174 | /* Function vect_transform_stmt. |
6175 | ||
6176 | Create a vectorized stmt to replace STMT, and insert it at BSI. */ | |
6177 | ||
805e2059 IR |
6178 | static bool |
6179 | vect_transform_stmt (tree stmt, block_stmt_iterator *bsi, bool *strided_store, | |
6180 | slp_tree slp_node) | |
f7064d11 DN |
6181 | { |
6182 | bool is_store = false; | |
6183 | tree vec_stmt = NULL_TREE; | |
6184 | stmt_vec_info stmt_info = vinfo_for_stmt (stmt); | |
20f06221 | 6185 | tree orig_stmt_in_pattern; |
f7064d11 DN |
6186 | bool done; |
6187 | ||
60555ced | 6188 | switch (STMT_VINFO_TYPE (stmt_info)) |
f7064d11 | 6189 | { |
60555ced | 6190 | case type_demotion_vec_info_type: |
805e2059 | 6191 | gcc_assert (!slp_node); |
60555ced DN |
6192 | done = vectorizable_type_demotion (stmt, bsi, &vec_stmt); |
6193 | gcc_assert (done); | |
6194 | break; | |
8115817b | 6195 | |
60555ced | 6196 | case type_promotion_vec_info_type: |
805e2059 | 6197 | gcc_assert (!slp_node); |
60555ced DN |
6198 | done = vectorizable_type_promotion (stmt, bsi, &vec_stmt); |
6199 | gcc_assert (done); | |
6200 | break; | |
6201 | ||
6202 | case type_conversion_vec_info_type: | |
805e2059 | 6203 | done = vectorizable_conversion (stmt, bsi, &vec_stmt, slp_node); |
60555ced DN |
6204 | gcc_assert (done); |
6205 | break; | |
6206 | ||
cd38ca7f | 6207 | case induc_vec_info_type: |
805e2059 | 6208 | gcc_assert (!slp_node); |
cd38ca7f DN |
6209 | done = vectorizable_induction (stmt, bsi, &vec_stmt); |
6210 | gcc_assert (done); | |
6211 | break; | |
6212 | ||
60555ced | 6213 | case op_vec_info_type: |
805e2059 | 6214 | done = vectorizable_operation (stmt, bsi, &vec_stmt, slp_node); |
60555ced DN |
6215 | gcc_assert (done); |
6216 | break; | |
6217 | ||
6218 | case assignment_vec_info_type: | |
805e2059 | 6219 | done = vectorizable_assignment (stmt, bsi, &vec_stmt, slp_node); |
60555ced DN |
6220 | gcc_assert (done); |
6221 | break; | |
6222 | ||
6223 | case load_vec_info_type: | |
805e2059 | 6224 | done = vectorizable_load (stmt, bsi, &vec_stmt, slp_node); |
60555ced DN |
6225 | gcc_assert (done); |
6226 | break; | |
6227 | ||
6228 | case store_vec_info_type: | |
805e2059 | 6229 | done = vectorizable_store (stmt, bsi, &vec_stmt, slp_node); |
60555ced | 6230 | gcc_assert (done); |
805e2059 | 6231 | if (STMT_VINFO_STRIDED_ACCESS (stmt_info)) |
60555ced DN |
6232 | { |
6233 | /* In case of interleaving, the whole chain is vectorized when the | |
6234 | last store in the chain is reached. Store stmts before the last | |
6235 | one are skipped, and their stmt_vec_info shouldn't be freed | |
6236 | meanwhile. */ | |
6237 | *strided_store = true; | |
6238 | if (STMT_VINFO_VEC_STMT (stmt_info)) | |
6239 | is_store = true; | |
98b44b0e | 6240 | } |
60555ced DN |
6241 | else |
6242 | is_store = true; | |
6243 | break; | |
88088c03 | 6244 | |
60555ced | 6245 | case condition_vec_info_type: |
805e2059 | 6246 | gcc_assert (!slp_node); |
60555ced DN |
6247 | done = vectorizable_condition (stmt, bsi, &vec_stmt); |
6248 | gcc_assert (done); | |
6249 | break; | |
88088c03 | 6250 | |
60555ced | 6251 | case call_vec_info_type: |
805e2059 | 6252 | gcc_assert (!slp_node); |
60555ced DN |
6253 | done = vectorizable_call (stmt, bsi, &vec_stmt); |
6254 | break; | |
2505a3f2 | 6255 | |
60555ced | 6256 | case reduc_vec_info_type: |
805e2059 | 6257 | gcc_assert (!slp_node); |
60555ced DN |
6258 | done = vectorizable_reduction (stmt, bsi, &vec_stmt); |
6259 | gcc_assert (done); | |
6260 | break; | |
88088c03 | 6261 | |
60555ced DN |
6262 | default: |
6263 | if (!STMT_VINFO_LIVE_P (stmt_info)) | |
98b44b0e | 6264 | { |
60555ced DN |
6265 | if (vect_print_dump_info (REPORT_DETAILS)) |
6266 | fprintf (vect_dump, "stmt not supported."); | |
6267 | gcc_unreachable (); | |
98b44b0e | 6268 | } |
88088c03 | 6269 | } |
b52485c6 | 6270 | |
60555ced DN |
6271 | if (STMT_VINFO_LIVE_P (stmt_info) |
6272 | && STMT_VINFO_TYPE (stmt_info) != reduc_vec_info_type) | |
88088c03 | 6273 | { |
60555ced DN |
6274 | done = vectorizable_live_operation (stmt, bsi, &vec_stmt); |
6275 | gcc_assert (done); | |
6276 | } | |
6277 | ||
6278 | if (vec_stmt) | |
6279 | { | |
6280 | STMT_VINFO_VEC_STMT (stmt_info) = vec_stmt; | |
6281 | orig_stmt_in_pattern = STMT_VINFO_RELATED_STMT (stmt_info); | |
6282 | if (orig_stmt_in_pattern) | |
6283 | { | |
6284 | stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt_in_pattern); | |
6285 | /* STMT was inserted by the vectorizer to replace a computation idiom. | |
6286 | ORIG_STMT_IN_PATTERN is a stmt in the original sequence that | |
6287 | computed this idiom. We need to record a pointer to VEC_STMT in | |
6288 | the stmt_info of ORIG_STMT_IN_PATTERN. See more details in the | |
6289 | documentation of vect_pattern_recog. */ | |
6290 | if (STMT_VINFO_IN_PATTERN_P (stmt_vinfo)) | |
6291 | { | |
6292 | gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt); | |
6293 | STMT_VINFO_VEC_STMT (stmt_vinfo) = vec_stmt; | |
6294 | } | |
6295 | } | |
f7064d11 DN |
6296 | } |
6297 | ||
88088c03 | 6298 | return is_store; |
f7064d11 DN |
6299 | } |
6300 | ||
6301 | ||
6302 | /* This function builds ni_name = number of iterations the loop executes, | |
6303 | on the loop preheader. */ | |
6304 | ||
6305 | static tree | |
6306 | vect_build_loop_niters (loop_vec_info loop_vinfo) | |
6307 | { | |
6308 | tree ni_name, stmt, var; | |
6309 | edge pe; | |
6310 | struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
6311 | tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo)); | |
6312 | ||
6313 | var = create_tmp_var (TREE_TYPE (ni), "niters"); | |
f004ab02 | 6314 | add_referenced_var (var); |
f7064d11 DN |
6315 | ni_name = force_gimple_operand (ni, &stmt, false, var); |
6316 | ||
6317 | pe = loop_preheader_edge (loop); | |
6318 | if (stmt) | |
6319 | { | |
6320 | basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt); | |
6321 | gcc_assert (!new_bb); | |
6322 | } | |
6323 | ||
6324 | return ni_name; | |
6325 | } | |
6326 | ||
6327 | ||
6328 | /* This function generates the following statements: | |
6329 | ||
6330 | ni_name = number of iterations loop executes | |
6331 | ratio = ni_name / vf | |
6332 | ratio_mult_vf_name = ratio * vf | |
6333 | ||
6334 | and places them at the loop preheader edge. */ | |
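/* For example (illustrative numbers): with ni_name = 103 and vf = 4,
   ratio = 103 >> 2 = 25 and ratio_mult_vf_name = 25 << 2 = 100; the
   remaining 103 - 100 = 3 iterations are handled by the epilog loop
   created in vect_do_peeling_for_loop_bound.  */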
6335 | ||
6336 | static void | |
6337 | vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo, | |
6338 | tree *ni_name_ptr, | |
6339 | tree *ratio_mult_vf_name_ptr, | |
6340 | tree *ratio_name_ptr) | |
6341 | { | |
6342 | ||
6343 | edge pe; | |
6344 | basic_block new_bb; | |
6345 | tree stmt, ni_name; | |
6346 | tree var; | |
6347 | tree ratio_name; | |
6348 | tree ratio_mult_vf_name; | |
6349 | struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
6350 | tree ni = LOOP_VINFO_NITERS (loop_vinfo); | |
6351 | int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo); | |
99c09897 | 6352 | tree log_vf; |
f7064d11 DN |
6353 | |
6354 | pe = loop_preheader_edge (loop); | |
6355 | ||
6356 | /* Generate temporary variable that contains | |
6357 | number of iterations loop executes. */ | |
6358 | ||
6359 | ni_name = vect_build_loop_niters (loop_vinfo); | |
99c09897 | 6360 | log_vf = build_int_cst (TREE_TYPE (ni), exact_log2 (vf)); |
f7064d11 DN |
6361 | |
6362 | /* Create: ratio = ni >> log2(vf) */ | |
6363 | ||
80b4a8d9 ZD |
6364 | ratio_name = fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name), ni_name, log_vf); |
6365 | if (!is_gimple_val (ratio_name)) | |
6366 | { | |
6367 | var = create_tmp_var (TREE_TYPE (ni), "bnd"); | |
6368 | add_referenced_var (var); | |
f7064d11 | 6369 | |
80b4a8d9 ZD |
6370 | ratio_name = force_gimple_operand (ratio_name, &stmt, true, var); |
6371 | pe = loop_preheader_edge (loop); | |
6372 | new_bb = bsi_insert_on_edge_immediate (pe, stmt); | |
6373 | gcc_assert (!new_bb); | |
6374 | } | |
f7064d11 DN |
6375 | |
6376 | /* Create: ratio_mult_vf = ratio << log2 (vf). */ | |
6377 | ||
80b4a8d9 ZD |
6378 | ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name), |
6379 | ratio_name, log_vf); | |
6380 | if (!is_gimple_val (ratio_mult_vf_name)) | |
6381 | { | |
6382 | var = create_tmp_var (TREE_TYPE (ni), "ratio_mult_vf"); | |
6383 | add_referenced_var (var); | |
f7064d11 | 6384 | |
80b4a8d9 ZD |
6385 | ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmt, |
6386 | true, var); | |
6387 | pe = loop_preheader_edge (loop); | |
6388 | new_bb = bsi_insert_on_edge_immediate (pe, stmt); | |
6389 | gcc_assert (!new_bb); | |
6390 | } | |
f7064d11 DN |
6391 | |
6392 | *ni_name_ptr = ni_name; | |
6393 | *ratio_mult_vf_name_ptr = ratio_mult_vf_name; | |
6394 | *ratio_name_ptr = ratio_name; | |
6395 | ||
6396 | return; | |
6397 | } | |
6398 | ||
6399 | ||
6400 | /* Function vect_update_ivs_after_vectorizer. | |
6401 | ||
6402 | "Advance" the induction variables of LOOP to the value they should take | |
6403 | after the execution of LOOP. This is currently necessary because the | |
6404 | vectorizer does not handle induction variables that are used after the | |
6405 | loop. Such a situation occurs when the last iterations of LOOP are | |
6406 | peeled, because: | |
6407 | 1. We introduced new uses after LOOP for IVs that were not originally used | |
6408 | after LOOP: the IVs of LOOP are now used by an epilog loop. | |
6409 | 2. LOOP is going to be vectorized; this means that it will iterate N/VF | |
6410 | times, whereas the loop IVs should be bumped N times. | |
6411 | ||
6412 | Input: | |
6413 | - LOOP - a loop that is going to be vectorized. The last few iterations | |
6414 | of LOOP were peeled. | |
6415 | - NITERS - the number of iterations that LOOP executes (before it is | |
6416 | vectorized). i.e, the number of times the ivs should be bumped. | |
6417 | - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path | |
6418 | coming out from LOOP on which there are uses of the LOOP ivs | |
6419 | (this is the path from LOOP->exit to epilog_loop->preheader). | |
6420 | ||
6421 | The new definitions of the ivs are placed in LOOP->exit. | |
6422 | The phi args associated with the edge UPDATE_E in the bb | |
6423 | UPDATE_E->dest are updated accordingly. | |
6424 | ||
6425 | Assumption 1: Like the rest of the vectorizer, this function assumes | |
6426 | a single loop exit that has a single predecessor. | |
6427 | ||
6428 | Assumption 2: The phi nodes in the LOOP header and in update_bb are | |
6429 | organized in the same order. | |
6430 | ||
6431 | Assumption 3: The access function of the ivs is simple enough (see | |
6432 | vect_can_advance_ivs_p). This assumption will be relaxed in the future. | |
6433 | ||
6434 | Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path | |
6435 | coming out of LOOP on which the ivs of LOOP are used (this is the path | |
6436 | that leads to the epilog loop; other paths skip the epilog loop). This | |
6437 | path starts with the edge UPDATE_E, and its destination (denoted update_bb) | |
6438 | needs to have its phis updated. | |
6439 | */ | |
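/* For example (illustrative): an IV with access function {p_0, +, 4}_LOOP
   that is used on the epilog path must appear there with the value
   p_0 + 4 * NITERS; this is the expression NI built below (using
   POINTER_PLUS_EXPR with a sizetype offset when the IV is a pointer).  */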
6440 | ||
6441 | static void | |
6442 | vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo, tree niters, | |
6443 | edge update_e) | |
6444 | { | |
6445 | struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
ac8f6c69 | 6446 | basic_block exit_bb = single_exit (loop)->dest; |
f7064d11 DN |
6447 | tree phi, phi1; |
6448 | basic_block update_bb = update_e->dest; | |
6449 | ||
6450 | /* gcc_assert (vect_can_advance_ivs_p (loop_vinfo)); */ | |
6451 | ||
6452 | /* Make sure there exists a single-predecessor exit bb: */ | |
c5cbcccf | 6453 | gcc_assert (single_pred_p (exit_bb)); |
f7064d11 DN |
6454 | |
6455 | for (phi = phi_nodes (loop->header), phi1 = phi_nodes (update_bb); | |
6456 | phi && phi1; | |
6457 | phi = PHI_CHAIN (phi), phi1 = PHI_CHAIN (phi1)) | |
6458 | { | |
6459 | tree access_fn = NULL; | |
6460 | tree evolution_part; | |
6461 | tree init_expr; | |
6462 | tree step_expr; | |
c6540bde | 6463 | tree var, ni, ni_name; |
f7064d11 DN |
6464 | block_stmt_iterator last_bsi; |
6465 | ||
00518cb1 | 6466 | if (vect_print_dump_info (REPORT_DETAILS)) |
88088c03 DN |
6467 | { |
6468 | fprintf (vect_dump, "vect_update_ivs_after_vectorizer: phi: "); | |
6469 | print_generic_expr (vect_dump, phi, TDF_SLIM); | |
6470 | } | |
6471 | ||
f7064d11 DN |
6472 | /* Skip virtual phi's. */ |
6473 | if (!is_gimple_reg (SSA_NAME_VAR (PHI_RESULT (phi)))) | |
6474 | { | |
00518cb1 | 6475 | if (vect_print_dump_info (REPORT_DETAILS)) |
f7064d11 DN |
6476 | fprintf (vect_dump, "virtual phi. skip."); |
6477 | continue; | |
6478 | } | |
6479 | ||
61d3cdbb DN |
6480 | /* Skip reduction phis. */ |
6481 | if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (phi)) == vect_reduction_def) | |
6482 | { | |
00518cb1 | 6483 | if (vect_print_dump_info (REPORT_DETAILS)) |
61d3cdbb DN |
6484 | fprintf (vect_dump, "reduc phi. skip."); |
6485 | continue; | |
6486 | } | |
6487 | ||
f7064d11 DN |
6488 | access_fn = analyze_scalar_evolution (loop, PHI_RESULT (phi)); |
6489 | gcc_assert (access_fn); | |
6490 | evolution_part = | |
6491 | unshare_expr (evolution_part_in_loop_num (access_fn, loop->num)); | |
6492 | gcc_assert (evolution_part != NULL_TREE); | |
6493 | ||
6494 | /* FORNOW: We do not support IVs whose evolution function is a polynomial | |
6495 | of degree >= 2 or exponential. */ | |
6496 | gcc_assert (!tree_is_chrec (evolution_part)); | |
6497 | ||
6498 | step_expr = evolution_part; | |
6499 | init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, | |
6500 | loop->num)); | |
6501 | ||
5be014d5 AP |
6502 | if (POINTER_TYPE_P (TREE_TYPE (init_expr))) |
6503 | ni = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (init_expr), | |
6504 | init_expr, | |
6505 | fold_convert (sizetype, | |
6506 | fold_build2 (MULT_EXPR, TREE_TYPE (niters), | |
6507 | niters, step_expr))); | |
6508 | else | |
6509 | ni = fold_build2 (PLUS_EXPR, TREE_TYPE (init_expr), | |
6510 | fold_build2 (MULT_EXPR, TREE_TYPE (init_expr), | |
6511 | fold_convert (TREE_TYPE (init_expr), | |
6512 | niters), | |
6513 | step_expr), | |
6514 | init_expr); | |
6515 | ||
6516 | ||
f7064d11 DN |
6517 | |
6518 | var = create_tmp_var (TREE_TYPE (init_expr), "tmp"); | |
f004ab02 | 6519 | add_referenced_var (var); |
f7064d11 | 6520 | |
f7064d11 | 6521 | last_bsi = bsi_last (exit_bb); |
c6540bde ZD |
6522 | ni_name = force_gimple_operand_bsi (&last_bsi, ni, false, var, |
6523 | true, BSI_SAME_STMT); | |
6524 | ||
f7064d11 | 6525 | /* Fix phi expressions in the successor bb. */ |
f7064d11 DN |
6526 | SET_PHI_ARG_DEF (phi1, update_e->dest_idx, ni_name); |
6527 | } | |
6528 | } | |
6529 | ||
749cc4b1 HJ |
6530 | /* Return the more conservative threshold between the |
6531 | min_profitable_iters returned by the cost model and the user | |
6532 | specified threshold, if provided. */ | |
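/* For example (illustrative numbers): with PARAM_MIN_VECT_LOOP_BOUND = 8 and
   a vectorization factor of 4, min_scalar_loop_bound = 8 * 4 - 1 = 31; a
   cost model estimate of min_profitable_iters = 20 is then ignored (th = 31),
   whereas an estimate of 40 would be preferred (th = 40).  */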
6533 | ||
6534 | static unsigned int | |
6535 | conservative_cost_threshold (loop_vec_info loop_vinfo, | |
6536 | int min_profitable_iters) | |
6537 | { | |
6538 | unsigned int th; | |
6539 | int min_scalar_loop_bound; | |
6540 | ||
6541 | min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND) | |
6542 | * LOOP_VINFO_VECT_FACTOR (loop_vinfo)) - 1); | |
6543 | ||
6544 | /* Use the cost model only if it is more conservative than the |
6545 | user-specified threshold. */ |
6546 | th = (unsigned) min_scalar_loop_bound; | |
6547 | if (min_profitable_iters | |
6548 | && (!min_scalar_loop_bound | |
6549 | || min_profitable_iters > min_scalar_loop_bound)) | |
6550 | th = (unsigned) min_profitable_iters; | |
6551 | ||
f5adacc5 | 6552 | if (th && vect_print_dump_info (REPORT_COST)) |
749cc4b1 HJ |
6553 | fprintf (vect_dump, "Vectorization may not be profitable."); |
6554 | ||
6555 | return th; | |
6556 | } | |
f7064d11 DN |
6557 | |
6558 | /* Function vect_do_peeling_for_loop_bound | |
6559 | ||
6560 | Peel the last iterations of the loop represented by LOOP_VINFO. | |
6561 | The peeled iterations form a new epilog loop. Given that the loop now | |
6562 | iterates NITERS times, the new epilog loop iterates | |
6563 | NITERS % VECTORIZATION_FACTOR times. | |
6564 | ||
6565 | The original loop will later be made to iterate | |
6566 | NITERS / VECTORIZATION_FACTOR times (this value is placed into RATIO). */ | |
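/* For example, with NITERS == 103 and VECTORIZATION_FACTOR == 4, the
   vectorized loop executes 103 / 4 == 25 iterations and the epilog loop
   executes the remaining 103 % 4 == 3 scalar iterations.  */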
6567 | ||
6568 | static void | |
d73be268 | 6569 | vect_do_peeling_for_loop_bound (loop_vec_info loop_vinfo, tree *ratio) |
f7064d11 | 6570 | { |
f7064d11 DN |
6571 | tree ni_name, ratio_mult_vf_name; |
6572 | struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
6573 | struct loop *new_loop; | |
6574 | edge update_e; | |
70388d94 | 6575 | basic_block preheader; |
f7064d11 | 6576 | int loop_num; |
749cc4b1 HJ |
6577 | bool check_profitability = false; |
6578 | unsigned int th = 0; | |
792ed98b | 6579 | int min_profitable_iters; |
f7064d11 | 6580 | |
00518cb1 | 6581 | if (vect_print_dump_info (REPORT_DETAILS)) |
bb748329 | 6582 | fprintf (vect_dump, "=== vect_do_peeling_for_loop_bound ==="); |
f7064d11 | 6583 | |
9498a22f RH |
6584 | initialize_original_copy_tables (); |
6585 | ||
f7064d11 DN |
6586 | /* Generate the following variables on the preheader of the original loop: |
6587 | ||
6588 | ni_name = number of iterations the original loop executes |
6589 | ratio = ni_name / vf | |
6590 | ratio_mult_vf_name = ratio * vf */ | |
6591 | vect_generate_tmps_on_preheader (loop_vinfo, &ni_name, | |
6592 | &ratio_mult_vf_name, ratio); | |
6593 | ||
f7064d11 | 6594 | loop_num = loop->num; |
792ed98b | 6595 | |
749cc4b1 HJ |
6596 | /* Perform the cost model check here if it was not already done |
6597 | during loop versioning or peeling for alignment. */ |
6598 | if (!VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)) | |
6599 | && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo)) | |
6600 | && !LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo)) | |
6601 | { | |
6602 | check_profitability = true; | |
792ed98b | 6603 | |
749cc4b1 HJ |
6604 | /* Get profitability threshold for vectorized loop. */ |
6605 | min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo); | |
792ed98b | 6606 | |
749cc4b1 HJ |
6607 | th = conservative_cost_threshold (loop_vinfo, |
6608 | min_profitable_iters); | |
6609 | } | |
792ed98b | 6610 | |
d73be268 | 6611 | new_loop = slpeel_tree_peel_loop_to_edge (loop, single_exit (loop), |
792ed98b | 6612 | ratio_mult_vf_name, ni_name, false, |
749cc4b1 | 6613 | th, check_profitability); |
f7064d11 DN |
6614 | gcc_assert (new_loop); |
6615 | gcc_assert (loop_num == loop->num); | |
61d3cdbb | 6616 | #ifdef ENABLE_CHECKING |
f7064d11 DN |
6617 | slpeel_verify_cfg_after_peeling (loop, new_loop); |
6618 | #endif | |
6619 | ||
6620 | /* A guard that controls whether the new_loop is to be executed or skipped | |
6621 | is placed in LOOP->exit. LOOP->exit therefore has two successors - one | |
6622 | is the preheader of NEW_LOOP, where the IVs from LOOP are used. The other | |
6623 | is a bb after NEW_LOOP, where these IVs are not used. Find the edge that | |
6624 | is on the path where the LOOP IVs are used and need to be updated. */ | |
6625 | ||
70388d94 | 6626 | preheader = loop_preheader_edge (new_loop)->src; |
ac8f6c69 | 6627 | if (EDGE_PRED (preheader, 0)->src == single_exit (loop)->dest) |
70388d94 | 6628 | update_e = EDGE_PRED (preheader, 0); |
f7064d11 | 6629 | else |
70388d94 | 6630 | update_e = EDGE_PRED (preheader, 1); |
f7064d11 DN |
6631 | |
6632 | /* Update IVs of original loop as if they were advanced | |
6633 | by ratio_mult_vf_name steps. */ | |
6634 | vect_update_ivs_after_vectorizer (loop_vinfo, ratio_mult_vf_name, update_e); | |
6635 | ||
6636 | /* After peeling we have to reset scalar evolution analyzer. */ | |
6637 | scev_reset (); | |
6638 | ||
9498a22f | 6639 | free_original_copy_tables (); |
f7064d11 DN |
6640 | } |
6641 | ||
6642 | ||
6643 | /* Function vect_gen_niters_for_prolog_loop | |
6644 | ||
6645 | Set the number of iterations for the loop represented by LOOP_VINFO | |
6646 | to the minimum between LOOP_NITERS (the original iteration count of the loop) | |
5f55a1ba | 6647 | and the misalignment of DR - the data reference recorded in |
f7064d11 DN |
6648 | LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO). As a result, after the execution of |
6649 | this loop, the data reference DR will refer to an aligned location. | |
6650 | ||
6651 | The following computation is generated: | |
6652 | ||
5f55a1ba DN |
6653 | If the misalignment of DR is known at compile time: |
6654 | addr_mis = int mis = DR_MISALIGNMENT (dr); | |
6655 | Else, compute address misalignment in bytes: | |
6656 | addr_mis = addr & (vectype_size - 1) | |
f7064d11 DN |
6657 | |
6658 | prolog_niters = min ( LOOP_NITERS , (VF - addr_mis/elem_size)&(VF-1) ) | |
6659 | ||
6660 | (elem_size = element type size; an element is the scalar element | |
98b44b0e IR |
6661 | whose type is the inner type of the vectype) |
6662 | ||
6663 | For interleaving, | |
6664 | ||
6665 | prolog_niters = min ( LOOP_NITERS , | |
6666 | (VF/group_size - addr_mis/elem_size)&(VF/group_size-1) ) | |
6667 | where group_size is the size of the interleaved group. | |
cb9ed5d7 DN |
6668 | |
6669 | The above formulas assume that VF == number of elements in the vector. This | |
6670 | may not hold when there are multiple-types in the loop. | |
6671 | In this case, for some data-references in the loop the VF does not represent | |
6672 | the number of elements that fit in the vector. Therefore, instead of VF we | |
6673 | use TYPE_VECTOR_SUBPARTS. */ | |
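/* Worked example (illustrative numbers): with VF == 4, 4-byte elements
   (16-byte vectors), and a data reference whose address is misaligned by
   8 bytes, addr_mis/elem_size == 2, so
     prolog_niters = min (LOOP_NITERS, (4 - 2) & (4 - 1)) == min (LOOP_NITERS, 2);
   peeling 2 scalar iterations makes DR refer to an aligned address.  */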
f7064d11 DN |
6674 | |
6675 | static tree | |
6676 | vect_gen_niters_for_prolog_loop (loop_vec_info loop_vinfo, tree loop_niters) | |
6677 | { | |
6678 | struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo); | |
f7064d11 DN |
6679 | struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
6680 | tree var, stmt; | |
6681 | tree iters, iters_name; | |
6682 | edge pe; | |
6683 | basic_block new_bb; | |
6684 | tree dr_stmt = DR_STMT (dr); | |
6685 | stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt); | |
6686 | tree vectype = STMT_VINFO_VECTYPE (stmt_info); | |
6687 | int vectype_align = TYPE_ALIGN (vectype) / BITS_PER_UNIT; | |
f7064d11 | 6688 | tree niters_type = TREE_TYPE (loop_niters); |
98b44b0e IR |
6689 | int group_size = 1; |
6690 | int element_size = GET_MODE_SIZE (TYPE_MODE (TREE_TYPE (DR_REF (dr)))); | |
cb9ed5d7 | 6691 | int nelements = TYPE_VECTOR_SUBPARTS (vectype); |
98b44b0e | 6692 | |
805e2059 | 6693 | if (STMT_VINFO_STRIDED_ACCESS (stmt_info)) |
98b44b0e | 6694 | { |
2f8e468b | 6695 | /* For interleaved access, the element size must be multiplied by the size of |
98b44b0e IR |
6696 | the interleaved group. */ |
6697 | group_size = DR_GROUP_SIZE (vinfo_for_stmt ( | |
6698 | DR_GROUP_FIRST_DR (stmt_info))); | |
6699 | element_size *= group_size; | |
6700 | } | |
f7064d11 DN |
6701 | |
6702 | pe = loop_preheader_edge (loop); | |
f7064d11 | 6703 | |
5f55a1ba DN |
6704 | if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0) |
6705 | { | |
6706 | int byte_misalign = LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo); | |
5f55a1ba | 6707 | int elem_misalign = byte_misalign / element_size; |
f7064d11 | 6708 | |
00518cb1 | 6709 | if (vect_print_dump_info (REPORT_DETAILS)) |
5f55a1ba | 6710 | fprintf (vect_dump, "known alignment = %d.", byte_misalign); |
98b44b0e | 6711 | iters = build_int_cst (niters_type, |
cb9ed5d7 | 6712 | (nelements - elem_misalign)&(nelements/group_size-1)); |
5f55a1ba DN |
6713 | } |
6714 | else | |
6715 | { | |
6716 | tree new_stmts = NULL_TREE; | |
468c2ac0 DN |
6717 | tree start_addr = vect_create_addr_base_for_vector_ref (dr_stmt, |
6718 | &new_stmts, NULL_TREE, loop); | |
5f55a1ba DN |
6719 | tree ptr_type = TREE_TYPE (start_addr); |
6720 | tree size = TYPE_SIZE (ptr_type); | |
6721 | tree type = lang_hooks.types.type_for_size (tree_low_cst (size, 1), 1); | |
6722 | tree vectype_size_minus_1 = build_int_cst (type, vectype_align - 1); | |
6723 | tree elem_size_log = | |
cb9ed5d7 DN |
6724 | build_int_cst (type, exact_log2 (vectype_align/nelements)); |
6725 | tree nelements_minus_1 = build_int_cst (type, nelements - 1); | |
6726 | tree nelements_tree = build_int_cst (type, nelements); | |
5f55a1ba DN |
6727 | tree byte_misalign; |
6728 | tree elem_misalign; | |
6729 | ||
6730 | new_bb = bsi_insert_on_edge_immediate (pe, new_stmts); | |
6731 | gcc_assert (!new_bb); | |
f7064d11 | 6732 | |
5f55a1ba DN |
6733 | /* Create: byte_misalign = addr & (vectype_size - 1) */ |
6734 | byte_misalign = | |
5be014d5 | 6735 | fold_build2 (BIT_AND_EXPR, type, fold_convert (type, start_addr), vectype_size_minus_1); |
f7064d11 | 6736 | |
5f55a1ba DN |
6737 | /* Create: elem_misalign = byte_misalign / element_size */ |
6738 | elem_misalign = | |
80b4a8d9 | 6739 | fold_build2 (RSHIFT_EXPR, type, byte_misalign, elem_size_log); |
5f55a1ba | 6740 | |
cb9ed5d7 DN |
6741 | /* Create: (niters_type) (nelements - elem_misalign)&(nelements - 1) */ |
6742 | iters = fold_build2 (MINUS_EXPR, type, nelements_tree, elem_misalign); | |
6743 | iters = fold_build2 (BIT_AND_EXPR, type, iters, nelements_minus_1); | |
5f55a1ba DN |
6744 | iters = fold_convert (niters_type, iters); |
6745 | } | |
6746 | ||
f7064d11 DN |
6747 | /* Create: prolog_loop_niters = min (iters, loop_niters) */ |
6748 | /* If the loop bound is known at compile time we already verified that it is | |
6749 | greater than vf; since the misalignment ('iters') is at most vf, there's | |
6750 | no need to generate the MIN_EXPR in this case. */ | |
6751 | if (TREE_CODE (loop_niters) != INTEGER_CST) | |
80b4a8d9 | 6752 | iters = fold_build2 (MIN_EXPR, niters_type, iters, loop_niters); |
f7064d11 | 6753 | |
00518cb1 | 6754 | if (vect_print_dump_info (REPORT_DETAILS)) |
5f55a1ba DN |
6755 | { |
6756 | fprintf (vect_dump, "niters for prolog loop: "); | |
6757 | print_generic_expr (vect_dump, iters, TDF_SLIM); | |
6758 | } | |
6759 | ||
f7064d11 | 6760 | var = create_tmp_var (niters_type, "prolog_loop_niters"); |
f004ab02 | 6761 | add_referenced_var (var); |
f7064d11 DN |
6762 | iters_name = force_gimple_operand (iters, &stmt, false, var); |
6763 | ||
6764 | /* Insert stmt on loop preheader edge. */ | |
f7064d11 DN |
6765 | if (stmt) |
6766 | { | |
6767 | basic_block new_bb = bsi_insert_on_edge_immediate (pe, stmt); | |
6768 | gcc_assert (!new_bb); | |
6769 | } | |
6770 | ||
6771 | return iters_name; | |
6772 | } | |
6773 | ||
6774 | ||
5f55a1ba | 6775 | /* Function vect_update_init_of_dr |
f7064d11 DN |
6776 | |
6777 | NITERS iterations were peeled from LOOP. DR represents a data reference | |
6778 | in LOOP. This function updates the information recorded in DR to | |
6779 | account for the fact that the first NITERS iterations had already been | |
86a07404 | 6780 | executed. Specifically, it updates the OFFSET field of DR. */ |
f7064d11 DN |
6781 | |
6782 | static void | |
5f55a1ba | 6783 | vect_update_init_of_dr (struct data_reference *dr, tree niters) |
f7064d11 | 6784 | { |
86a07404 | 6785 | tree offset = DR_OFFSET (dr); |
f7064d11 | 6786 | |
86a07404 | 6787 | niters = fold_build2 (MULT_EXPR, TREE_TYPE (niters), niters, DR_STEP (dr)); |
987b67bc | 6788 | offset = fold_build2 (PLUS_EXPR, TREE_TYPE (offset), offset, niters); |
86a07404 | 6789 | DR_OFFSET (dr) = offset; |
f7064d11 DN |
6790 | } |
6791 | ||
6792 | ||
6793 | /* Function vect_update_inits_of_drs | |
6794 | ||
6795 | NITERS iterations were peeled from the loop represented by LOOP_VINFO. | |
6796 | This function updates the information recorded for the data references in | |
6797 | the loop to account for the fact that the first NITERS iterations had | |
98120f62 UB |
6798 | already been executed. Specifically, it updates the OFFSET field of |
6799 | each data_reference in the loop. */ |
f7064d11 DN |
6800 | |
6801 | static void | |
6802 | vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters) | |
6803 | { | |
6804 | unsigned int i; | |
ebf78a47 SP |
6805 | VEC (data_reference_p, heap) *datarefs = LOOP_VINFO_DATAREFS (loop_vinfo); |
6806 | struct data_reference *dr; | |
f7064d11 | 6807 | |
98120f62 | 6808 | if (vect_print_dump_info (REPORT_DETAILS)) |
f7064d11 DN |
6809 | fprintf (vect_dump, "=== vect_update_inits_of_drs ==="); |
6810 | ||
ebf78a47 SP |
6811 | for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++) |
6812 | vect_update_init_of_dr (dr, niters); | |
f7064d11 DN |
6813 | } |
6814 | ||
6815 | ||
6816 | /* Function vect_do_peeling_for_alignment | |
6817 | ||
6818 | Peel the first 'niters' iterations of the loop represented by LOOP_VINFO. | |
6819 | 'niters' is set to the misalignment of one of the data references in the | |
6820 | loop, thereby forcing it to refer to an aligned location at the beginning | |
6821 | of the execution of this loop. The data reference for which we are | |
6822 | peeling is recorded in LOOP_VINFO_UNALIGNED_DR. */ | |
6823 | ||
6824 | static void | |
d73be268 | 6825 | vect_do_peeling_for_alignment (loop_vec_info loop_vinfo) |
f7064d11 DN |
6826 | { |
6827 | struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
6828 | tree niters_of_prolog_loop, ni_name; | |
6829 | tree n_iters; | |
6830 | struct loop *new_loop; | |
749cc4b1 HJ |
6831 | bool check_profitability = false; |
6832 | unsigned int th = 0; | |
6833 | int min_profitable_iters; | |
f7064d11 | 6834 | |
00518cb1 | 6835 | if (vect_print_dump_info (REPORT_DETAILS)) |
f7064d11 DN |
6836 | fprintf (vect_dump, "=== vect_do_peeling_for_alignment ==="); |
6837 | ||
9498a22f RH |
6838 | initialize_original_copy_tables (); |
6839 | ||
f7064d11 DN |
6840 | ni_name = vect_build_loop_niters (loop_vinfo); |
6841 | niters_of_prolog_loop = vect_gen_niters_for_prolog_loop (loop_vinfo, ni_name); | |
6842 | ||
749cc4b1 HJ |
6843 | |
6844 | /* Perform the cost model check here if it was not done during loop versioning. */ |
6845 | if (!VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)) | |
6846 | && !VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo))) | |
6847 | { | |
6848 | check_profitability = true; | |
6849 | ||
6850 | /* Get profitability threshold for vectorized loop. */ | |
6851 | min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo); | |
6852 | ||
6853 | th = conservative_cost_threshold (loop_vinfo, | |
6854 | min_profitable_iters); | |
6855 | } | |
6856 | ||
f7064d11 | 6857 | /* Peel the prolog loop and iterate it niters_of_prolog_loop. */ |
749cc4b1 HJ |
6858 | new_loop = |
6859 | slpeel_tree_peel_loop_to_edge (loop, loop_preheader_edge (loop), | |
6860 | niters_of_prolog_loop, ni_name, true, | |
6861 | th, check_profitability); | |
6862 | ||
f7064d11 | 6863 | gcc_assert (new_loop); |
61d3cdbb | 6864 | #ifdef ENABLE_CHECKING |
f7064d11 DN |
6865 | slpeel_verify_cfg_after_peeling (new_loop, loop); |
6866 | #endif | |
6867 | ||
6868 | /* Update number of times loop executes. */ | |
6869 | n_iters = LOOP_VINFO_NITERS (loop_vinfo); | |
987b67bc KH |
6870 | LOOP_VINFO_NITERS (loop_vinfo) = fold_build2 (MINUS_EXPR, |
6871 | TREE_TYPE (n_iters), n_iters, niters_of_prolog_loop); | |
f7064d11 DN |
6872 | |
6873 | /* Update the init conditions of the access functions of all data refs. */ | |
6874 | vect_update_inits_of_drs (loop_vinfo, niters_of_prolog_loop); | |
6875 | ||
6876 | /* After peeling we have to reset scalar evolution analyzer. */ | |
6877 | scev_reset (); | |
6878 | ||
9498a22f | 6879 | free_original_copy_tables (); |
f7064d11 DN |
6880 | } |
6881 | ||
6882 | ||
c12cc930 KB |
6883 | /* Function vect_create_cond_for_align_checks. |
6884 | ||
6885 | Create a conditional expression that represents the alignment checks for | |
6886 | all data references (array element references) whose alignment must be |
6887 | checked at runtime. | |
6888 | ||
6889 | Input: | |
749cc4b1 HJ |
6890 | COND_EXPR - input conditional expression. New conditions will be chained |
6891 | with logical AND operation. | |
c12cc930 KB |
6892 | LOOP_VINFO - two fields of the loop information are used. |
6893 | LOOP_VINFO_PTR_MASK is the mask used to check the alignment. | |
6894 | LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked. | |
6895 | ||
6896 | Output: | |
6897 | COND_EXPR_STMT_LIST - statements needed to construct the conditional | |
6898 | expression. | |
6899 | The returned value is the conditional expression to be used in the if | |
6900 | statement that controls which version of the loop gets executed at runtime. | |
6901 | ||
6902 | The algorithm makes two assumptions: | |
6903 | 1) The number of bytes "n" in a vector is a power of 2. | |
6904 | 2) An address "a" is aligned if a%n is zero and that this | |
6905 | test can be done as a&(n-1) == 0. For example, for 16 | |
6906 | byte vectors the test is a&0xf == 0. */ | |
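/* As an illustration (following the temporary names created below), for two
   data references the generated sequence is roughly:
     addr2int0 = (int) addr_of_first_vector_0;
     addr2int1 = (int) addr_of_first_vector_1;
     orptrs1   = addr2int0 | addr2int1;
     andmask   = orptrs1 & mask;
   and the condition chained into *COND_EXPR is (andmask == 0).  */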
6907 | ||
749cc4b1 | 6908 | static void |
c12cc930 | 6909 | vect_create_cond_for_align_checks (loop_vec_info loop_vinfo, |
749cc4b1 | 6910 | tree *cond_expr, |
c12cc930 KB |
6911 | tree *cond_expr_stmt_list) |
6912 | { | |
468c2ac0 | 6913 | struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
c12cc930 KB |
6914 | VEC(tree,heap) *may_misalign_stmts |
6915 | = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo); | |
ebb07520 | 6916 | tree ref_stmt, tmp; |
c12cc930 KB |
6917 | int mask = LOOP_VINFO_PTR_MASK (loop_vinfo); |
6918 | tree mask_cst; | |
6919 | unsigned int i; | |
6920 | tree psize; | |
6921 | tree int_ptrsize_type; | |
6922 | char tmp_name[20]; | |
6923 | tree or_tmp_name = NULL_TREE; | |
6924 | tree and_tmp, and_tmp_name, and_stmt; | |
6925 | tree ptrsize_zero; | |
749cc4b1 | 6926 | tree part_cond_expr; |
c12cc930 KB |
6927 | |
6928 | /* Check that mask is one less than a power of 2, i.e., mask is | |
6929 | all zeros followed by all ones. */ | |
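/* E.g., mask == 0xf passes (0xf & 0x10 == 0), while mask == 0xe would fail
   (0xe & 0xf != 0).  */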
6930 | gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0)); | |
6931 | ||
6932 | /* CHECKME: what is the best integer or unsigned type to use to hold a | |
6933 | cast from a pointer value? */ | |
6934 | psize = TYPE_SIZE (ptr_type_node); | |
6935 | int_ptrsize_type | |
6936 | = lang_hooks.types.type_for_size (tree_low_cst (psize, 1), 0); | |
6937 | ||
6938 | /* Create the expression (mask & (addr_1 | ... | addr_n)), where addr_i is the |
6939 | address of the first vector of the i'th data reference. */ |
6940 | ||
6941 | for (i = 0; VEC_iterate (tree, may_misalign_stmts, i, ref_stmt); i++) | |
6942 | { | |
6943 | tree new_stmt_list = NULL_TREE; | |
6944 | tree addr_base; | |
6945 | tree addr_tmp, addr_tmp_name, addr_stmt; | |
6946 | tree or_tmp, new_or_tmp_name, or_stmt; | |
6947 | ||
6948 | /* create: addr_tmp = (int)(address_of_first_vector) */ | |
6949 | addr_base = vect_create_addr_base_for_vector_ref (ref_stmt, | |
468c2ac0 | 6950 | &new_stmt_list, NULL_TREE, loop); |
c12cc930 KB |
6951 | |
6952 | if (new_stmt_list != NULL_TREE) | |
6953 | append_to_statement_list_force (new_stmt_list, cond_expr_stmt_list); | |
6954 | ||
6955 | sprintf (tmp_name, "%s%d", "addr2int", i); | |
6956 | addr_tmp = create_tmp_var (int_ptrsize_type, tmp_name); | |
f004ab02 | 6957 | add_referenced_var (addr_tmp); |
c12cc930 KB |
6958 | addr_tmp_name = make_ssa_name (addr_tmp, NULL_TREE); |
6959 | addr_stmt = fold_convert (int_ptrsize_type, addr_base); | |
ebb07520 | 6960 | addr_stmt = build_gimple_modify_stmt (addr_tmp_name, addr_stmt); |
c12cc930 KB |
6961 | SSA_NAME_DEF_STMT (addr_tmp_name) = addr_stmt; |
6962 | append_to_statement_list_force (addr_stmt, cond_expr_stmt_list); | |
6963 | ||
6964 | /* The addresses are ORed together. */ |
6965 | ||
6966 | if (or_tmp_name != NULL_TREE) | |
6967 | { | |
6968 | /* create: or_tmp = or_tmp | addr_tmp */ | |
6969 | sprintf (tmp_name, "%s%d", "orptrs", i); | |
6970 | or_tmp = create_tmp_var (int_ptrsize_type, tmp_name); | |
f004ab02 | 6971 | add_referenced_var (or_tmp); |
c12cc930 | 6972 | new_or_tmp_name = make_ssa_name (or_tmp, NULL_TREE); |
ebb07520 RS |
6973 | tmp = build2 (BIT_IOR_EXPR, int_ptrsize_type, |
6974 | or_tmp_name, addr_tmp_name); | |
6975 | or_stmt = build_gimple_modify_stmt (new_or_tmp_name, tmp); | |
c12cc930 KB |
6976 | SSA_NAME_DEF_STMT (new_or_tmp_name) = or_stmt; |
6977 | append_to_statement_list_force (or_stmt, cond_expr_stmt_list); | |
6978 | or_tmp_name = new_or_tmp_name; | |
6979 | } | |
6980 | else | |
6981 | or_tmp_name = addr_tmp_name; | |
6982 | ||
6983 | } /* end for i */ | |
6984 | ||
6985 | mask_cst = build_int_cst (int_ptrsize_type, mask); | |
6986 | ||
6987 | /* create: and_tmp = or_tmp & mask */ | |
6988 | and_tmp = create_tmp_var (int_ptrsize_type, "andmask" ); | |
f004ab02 | 6989 | add_referenced_var (and_tmp); |
c12cc930 KB |
6990 | and_tmp_name = make_ssa_name (and_tmp, NULL_TREE); |
6991 | ||
ebb07520 RS |
6992 | tmp = build2 (BIT_AND_EXPR, int_ptrsize_type, or_tmp_name, mask_cst); |
6993 | and_stmt = build_gimple_modify_stmt (and_tmp_name, tmp); | |
c12cc930 KB |
6994 | SSA_NAME_DEF_STMT (and_tmp_name) = and_stmt; |
6995 | append_to_statement_list_force (and_stmt, cond_expr_stmt_list); | |
6996 | ||
6997 | /* Make and_tmp the left operand of the conditional test against zero. | |
c0220ea4 | 6998 | If and_tmp has a nonzero bit, then some address is unaligned. */ |
c12cc930 | 6999 | ptrsize_zero = build_int_cst (int_ptrsize_type, 0); |
749cc4b1 HJ |
7000 | part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node, |
7001 | and_tmp_name, ptrsize_zero); | |
7002 | if (*cond_expr) | |
7003 | *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node, | |
7004 | *cond_expr, part_cond_expr); | |
7005 | else | |
7006 | *cond_expr = part_cond_expr; | |
c12cc930 KB |
7007 | } |
7008 | ||
bc1edb77 VK |
7009 | /* Function vect_vfa_segment_size. |
7010 | ||
7011 | Create an expression that computes the size of the segment |
7012 | that will be accessed for a data reference. The function takes into |
7013 | account that realignment loads may access one more vector. |
7014 | ||
7015 | Input: | |
7016 | DR: The data reference. | |
7017 | VECT_FACTOR: vectorization factor. | |
7018 | ||
15dc95cb | 7019 | Return an expression whose value is the size of the segment which will be |
bc1edb77 VK |
7020 | accessed by DR. */ |
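/* For example (illustrative numbers), for a data reference with DR_STEP == 4
   and VECT_FACTOR == 4 the segment is 16 bytes; if the access uses the
   optimized explicit-realignment scheme, one extra vector
   (TYPE_SIZE_UNIT of the vectype) is added to the segment length.  */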
7021 | ||
7022 | static tree | |
7023 | vect_vfa_segment_size (struct data_reference *dr, tree vect_factor) | |
7024 | { | |
42cbdeac VK |
7025 | tree segment_length = fold_build2 (MULT_EXPR, integer_type_node, |
7026 | DR_STEP (dr), vect_factor); | |
bc1edb77 | 7027 | |
468c2ac0 | 7028 | if (vect_supportable_dr_alignment (dr) == dr_explicit_realign_optimized) |
bc1edb77 | 7029 | { |
42cbdeac VK |
7030 | tree vector_size = TYPE_SIZE_UNIT |
7031 | (STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)))); | |
bc1edb77 | 7032 | |
42cbdeac VK |
7033 | segment_length = fold_build2 (PLUS_EXPR, integer_type_node, |
7034 | segment_length, vector_size); | |
bc1edb77 | 7035 | } |
42cbdeac | 7036 | return fold_convert (sizetype, segment_length); |
bc1edb77 VK |
7037 | } |
7038 | ||
7039 | /* Function vect_create_cond_for_alias_checks. | |
7040 | ||
7041 | Create a conditional expression that represents the run-time checks for | |
7042 | overlapping of address ranges represented by a list of data references | |
7043 | relations passed as input. | |
7044 | ||
7045 | Input: | |
7046 | COND_EXPR - input conditional expression. New conditions will be chained | |
749cc4b1 | 7047 | with logical AND operation. |
bc1edb77 VK |
7048 | LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs |
7049 | to be checked. | |
7050 | ||
7051 | Output: | |
7052 | COND_EXPR - conditional expression. | |
7053 | COND_EXPR_STMT_LIST - statements needed to construct the conditional | |
7054 | expression. | |
42cbdeac VK |
7055 | |
7056 | ||
bc1edb77 VK |
7057 | The returned value is the conditional expression to be used in the if |
7058 | statement that controls which version of the loop gets executed at runtime. | |
7059 | */ | |
7060 | ||
7061 | static void | |
7062 | vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, | |
7063 | tree * cond_expr, | |
7064 | tree * cond_expr_stmt_list) | |
7065 | { | |
468c2ac0 | 7066 | struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); |
bc1edb77 VK |
7067 | VEC (ddr_p, heap) * may_alias_ddrs = |
7068 | LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo); | |
7069 | tree vect_factor = | |
7070 | build_int_cst (integer_type_node, LOOP_VINFO_VECT_FACTOR (loop_vinfo)); | |
7071 | ||
7072 | ddr_p ddr; | |
7073 | unsigned int i; | |
7074 | tree part_cond_expr; | |
7075 | ||
7076 | /* Create expression | |
7077 | ((store_ptr_0 + store_segment_length_0) < load_ptr_0) | |
7078 | || (load_ptr_0 + load_segment_length_0) < store_ptr_0)) | |
7079 | && | |
7080 | ... | |
7081 | && | |
7082 | ((store_ptr_n + store_segment_length_n) < load_ptr_n) | |
7083 | || (load_ptr_n + load_segment_length_n) < store_ptr_n)) */ | |
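/* For instance (illustrative names p and q), for one may-alias pair with
   16-byte segments the generated test is (p + 16 < q) || (q + 16 < p):
   when it holds, the two accessed segments cannot overlap, so the
   vectorized version may be executed.  */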
7084 | ||
7085 | if (VEC_empty (ddr_p, may_alias_ddrs)) | |
7086 | return; | |
7087 | ||
7088 | for (i = 0; VEC_iterate (ddr_p, may_alias_ddrs, i, ddr); i++) | |
7089 | { | |
42cbdeac VK |
7090 | struct data_reference *dr_a, *dr_b; |
7091 | tree dr_group_first_a, dr_group_first_b; | |
7092 | tree addr_base_a, addr_base_b; | |
7093 | tree segment_length_a, segment_length_b; | |
7094 | tree stmt_a, stmt_b; | |
bc1edb77 | 7095 | |
42cbdeac VK |
7096 | dr_a = DDR_A (ddr); |
7097 | stmt_a = DR_STMT (DDR_A (ddr)); | |
7098 | dr_group_first_a = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_a)); | |
7099 | if (dr_group_first_a) | |
7100 | { | |
7101 | stmt_a = dr_group_first_a; | |
7102 | dr_a = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_a)); | |
7103 | } | |
7104 | ||
7105 | dr_b = DDR_B (ddr); | |
7106 | stmt_b = DR_STMT (DDR_B (ddr)); | |
7107 | dr_group_first_b = DR_GROUP_FIRST_DR (vinfo_for_stmt (stmt_b)); | |
7108 | if (dr_group_first_b) | |
7109 | { | |
7110 | stmt_b = dr_group_first_b; | |
7111 | dr_b = STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt_b)); | |
7112 | } | |
7113 | ||
7114 | addr_base_a = | |
bc1edb77 | 7115 | vect_create_addr_base_for_vector_ref (stmt_a, cond_expr_stmt_list, |
468c2ac0 | 7116 | NULL_TREE, loop); |
42cbdeac | 7117 | addr_base_b = |
bc1edb77 | 7118 | vect_create_addr_base_for_vector_ref (stmt_b, cond_expr_stmt_list, |
468c2ac0 | 7119 | NULL_TREE, loop); |
bc1edb77 | 7120 | |
42cbdeac VK |
7121 | segment_length_a = vect_vfa_segment_size (dr_a, vect_factor); |
7122 | segment_length_b = vect_vfa_segment_size (dr_b, vect_factor); | |
bc1edb77 VK |
7123 | |
7124 | if (vect_print_dump_info (REPORT_DR_DETAILS)) | |
7125 | { | |
7126 | fprintf (vect_dump, | |
7127 | "create runtime check for data references "); | |
42cbdeac | 7128 | print_generic_expr (vect_dump, DR_REF (dr_a), TDF_SLIM); |
bc1edb77 | 7129 | fprintf (vect_dump, " and "); |
42cbdeac | 7130 | print_generic_expr (vect_dump, DR_REF (dr_b), TDF_SLIM); |
bc1edb77 VK |
7131 | } |
7132 | ||
7133 | ||
7134 | part_cond_expr = | |
7135 | fold_build2 (TRUTH_OR_EXPR, boolean_type_node, | |
7136 | fold_build2 (LT_EXPR, boolean_type_node, | |
7137 | fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_a), | |
7138 | addr_base_a, | |
7139 | segment_length_a), | |
7140 | addr_base_b), | |
7141 | fold_build2 (LT_EXPR, boolean_type_node, | |
7142 | fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (addr_base_b), | |
7143 | addr_base_b, | |
7144 | segment_length_b), | |
7145 | addr_base_a)); | |
7146 | ||
7147 | if (*cond_expr) | |
7148 | *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node, | |
7149 | *cond_expr, part_cond_expr); | |
7150 | else | |
7151 | *cond_expr = part_cond_expr; | |
7152 | } | |
7153 | if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS)) | |
7154 | fprintf (vect_dump, "created %u versioning for alias checks.\n", | |
7155 | VEC_length (ddr_p, may_alias_ddrs)); | |
7156 | ||
7157 | } | |
c12cc930 | 7158 | |
42cbdeac VK |
7159 | /* Function vect_loop_versioning. |
7160 | ||
7161 | If the loop has data references that may or may not be aligned and/or |
7162 | has data reference relations whose independence was not proven, then |
7163 | two versions of the loop need to be generated, one which is vectorized |
7164 | and one which isn't. A test is then generated to control which of the |
7165 | loops is executed. The test checks for the alignment of all of the |
7166 | data references that may or may not be aligned. An additional |
7167 | sequence of runtime tests is generated for each pair of DDRs whose |
7168 | independence was not proven. The vectorized version of the loop is |
749cc4b1 HJ |
7169 | executed only if both alias and alignment tests are passed. |
7170 | ||
7171 | The test generated to check which version of the loop is executed |
7172 | also checks the profitability threshold indicated by the |
7173 | cost model. */ |
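/* Sketch of the resulting guard (roughly):
     if (scalar_loop_iters > th
         && all may-misalign addresses are aligned
         && no pair of may-alias segments overlaps)
       ... vectorized loop ...
     else
       ... scalar loop ...  */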
42cbdeac VK |
7174 | |
7175 | static void | |
7176 | vect_loop_versioning (loop_vec_info loop_vinfo) | |
7177 | { | |
7178 | struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
7179 | struct loop *nloop; | |
7180 | tree cond_expr = NULL_TREE; | |
7181 | tree cond_expr_stmt_list = NULL_TREE; | |
7182 | basic_block condition_bb; | |
7183 | block_stmt_iterator cond_exp_bsi; | |
7184 | basic_block merge_bb; | |
7185 | basic_block new_exit_bb; | |
7186 | edge new_exit_e, e; | |
7187 | tree orig_phi, new_phi, arg; | |
7188 | unsigned prob = 4 * REG_BR_PROB_BASE / 5; | |
7189 | tree gimplify_stmt_list; | |
749cc4b1 HJ |
7190 | tree scalar_loop_iters = LOOP_VINFO_NITERS (loop_vinfo); |
7191 | int min_profitable_iters = 0; | |
7192 | unsigned int th; | |
42cbdeac | 7193 | |
749cc4b1 HJ |
7194 | /* Get profitability threshold for vectorized loop. */ |
7195 | min_profitable_iters = LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo); | |
7196 | ||
7197 | th = conservative_cost_threshold (loop_vinfo, | |
7198 | min_profitable_iters); | |
7199 | ||
7200 | cond_expr = | |
7201 | build2 (GT_EXPR, boolean_type_node, scalar_loop_iters, | |
7202 | build_int_cst (TREE_TYPE (scalar_loop_iters), th)); | |
7203 | ||
7204 | cond_expr = force_gimple_operand (cond_expr, &cond_expr_stmt_list, | |
7205 | false, NULL_TREE); | |
42cbdeac VK |
7206 | |
7207 | if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo))) | |
749cc4b1 HJ |
7208 | vect_create_cond_for_align_checks (loop_vinfo, &cond_expr, |
7209 | &cond_expr_stmt_list); | |
42cbdeac VK |
7210 | |
7211 | if (VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo))) | |
749cc4b1 HJ |
7212 | vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr, |
7213 | &cond_expr_stmt_list); | |
42cbdeac VK |
7214 | |
7215 | cond_expr = | |
7216 | fold_build2 (NE_EXPR, boolean_type_node, cond_expr, integer_zero_node); | |
7217 | cond_expr = | |
7218 | force_gimple_operand (cond_expr, &gimplify_stmt_list, true, | |
7219 | NULL_TREE); | |
7220 | append_to_statement_list (gimplify_stmt_list, &cond_expr_stmt_list); | |
7221 | ||
7222 | initialize_original_copy_tables (); | |
7223 | nloop = loop_version (loop, cond_expr, &condition_bb, | |
7224 | prob, prob, REG_BR_PROB_BASE - prob, true); | |
7225 | free_original_copy_tables(); | |
7226 | ||
7227 | /* Loop versioning violates an assumption we try to maintain during | |
7228 | vectorization - that the loop exit block has a single predecessor. | |
7229 | After versioning, the exit block of both loop versions is the same | |
7230 | basic block (i.e. it has two predecessors). Just in order to simplify | |
7231 | following transformations in the vectorizer, we fix this situation | |
7232 | here by adding a new (empty) block on the exit-edge of the loop, | |
7233 | with the proper loop-exit phis to maintain loop-closed-form. */ | |
7234 | ||
7235 | merge_bb = single_exit (loop)->dest; | |
7236 | gcc_assert (EDGE_COUNT (merge_bb->preds) == 2); | |
7237 | new_exit_bb = split_edge (single_exit (loop)); | |
7238 | new_exit_e = single_exit (loop); | |
7239 | e = EDGE_SUCC (new_exit_bb, 0); | |
7240 | ||
7241 | for (orig_phi = phi_nodes (merge_bb); orig_phi; | |
7242 | orig_phi = PHI_CHAIN (orig_phi)) | |
7243 | { | |
7244 | new_phi = create_phi_node (SSA_NAME_VAR (PHI_RESULT (orig_phi)), | |
7245 | new_exit_bb); | |
7246 | arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e); | |
7247 | add_phi_arg (new_phi, arg, new_exit_e); | |
7248 | SET_PHI_ARG_DEF (orig_phi, e->dest_idx, PHI_RESULT (new_phi)); | |
7249 | } | |
7250 | ||
7251 | /* End loop-exit-fixes after versioning. */ | |
7252 | ||
7253 | update_ssa (TODO_update_ssa); | |
7254 | if (cond_expr_stmt_list) | |
7255 | { | |
7256 | cond_exp_bsi = bsi_last (condition_bb); | |
7257 | bsi_insert_before (&cond_exp_bsi, cond_expr_stmt_list, BSI_SAME_STMT); | |
7258 | } | |
7259 | } | |
7260 | ||
805e2059 IR |
7261 | /* Remove a group of stores (for SLP or interleaving), free their |
7262 | stmt_vec_info. */ | |
7263 | ||
7264 | static void | |
7265 | vect_remove_stores (tree first_stmt) | |
7266 | { | |
7267 | stmt_ann_t ann; | |
7268 | tree next = first_stmt; | |
7269 | tree tmp; | |
7270 | stmt_vec_info next_stmt_info; | |
7271 | block_stmt_iterator next_si; | |
7272 | ||
7273 | while (next) | |
7274 | { | |
7275 | /* Free the attached stmt_vec_info and remove the stmt. */ | |
7276 | next_si = bsi_for_stmt (next); | |
7277 | bsi_remove (&next_si, true); | |
7278 | next_stmt_info = vinfo_for_stmt (next); | |
7279 | ann = stmt_ann (next); | |
7280 | tmp = DR_GROUP_NEXT_DR (next_stmt_info); | |
7281 | free (next_stmt_info); | |
7282 | set_stmt_info (ann, NULL); | |
7283 | next = tmp; | |
7284 | } | |
7285 | } | |
7286 | ||
7287 | ||
7288 | /* Vectorize SLP instance tree in postorder. */ | |
7289 | ||
7290 | static bool | |
7291 | vect_schedule_slp_instance (slp_tree node, unsigned int vec_stmts_size) | |
7292 | { | |
7293 | tree stmt; | |
7294 | bool strided_store, is_store; | |
7295 | block_stmt_iterator si; | |
7296 | stmt_vec_info stmt_info; | |
7297 | ||
7298 | if (!node) | |
7299 | return false; | |
7300 | ||
7301 | vect_schedule_slp_instance (SLP_TREE_LEFT (node), vec_stmts_size); | |
7302 | vect_schedule_slp_instance (SLP_TREE_RIGHT (node), vec_stmts_size); | |
7303 | ||
7304 | stmt = VEC_index(tree, SLP_TREE_SCALAR_STMTS (node), 0); | |
7305 | stmt_info = vinfo_for_stmt (stmt); | |
7306 | SLP_TREE_VEC_STMTS (node) = VEC_alloc (tree, heap, vec_stmts_size); | |
7307 | SLP_TREE_NUMBER_OF_VEC_STMTS (node) = vec_stmts_size; | |
7308 | ||
7309 | if (vect_print_dump_info (REPORT_DETAILS)) | |
7310 | { | |
7311 | fprintf (vect_dump, "------>vectorizing SLP node starting from: "); | |
7312 | print_generic_expr (vect_dump, stmt, TDF_SLIM); | |
7313 | } | |
7314 | ||
7315 | si = bsi_for_stmt (stmt); | |
7316 | is_store = vect_transform_stmt (stmt, &si, &strided_store, node); | |
7317 | if (is_store) | |
7318 | { | |
7319 | if (DR_GROUP_FIRST_DR (stmt_info)) | |
7320 | /* If IS_STORE is TRUE, the vectorization of the | |
7321 | interleaving chain was completed - free all the stores in | |
7322 | the chain. */ | |
7323 | vect_remove_stores (DR_GROUP_FIRST_DR (stmt_info)); | |
7324 | else | |
7325 | /* FORNOW: SLP originates only from strided stores. */ | |
7326 | gcc_unreachable (); | |
7327 | ||
7328 | return true; | |
7329 | } | |
7330 | ||
7331 | /* FORNOW: SLP originates only from strided stores. */ | |
7332 | return false; | |
7333 | } | |
7334 | ||
7335 | ||
7336 | static bool | |
7337 | vect_schedule_slp (loop_vec_info loop_vinfo, unsigned int nunits) | |
7338 | { | |
7339 | VEC (slp_instance, heap) *slp_instances = | |
7340 | LOOP_VINFO_SLP_INSTANCES (loop_vinfo); | |
7341 | slp_instance instance; | |
7342 | unsigned int vec_stmts_size; | |
7343 | unsigned int group_size, i; | |
7344 | unsigned int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); | |
7345 | bool is_store = false; | |
7346 | ||
7347 | for (i = 0; VEC_iterate (slp_instance, slp_instances, i, instance); i++) | |
7348 | { | |
7349 | group_size = SLP_INSTANCE_GROUP_SIZE (instance); | |
7350 | /* For each SLP instance calculate number of vector stmts to be created | |
7351 | for the scalar stmts in each node of the SLP tree. Number of vector | |
7352 | elements in one vector iteration is the number of scalar elements in | |
7353 | one scalar iteration (GROUP_SIZE) multiplied by VF divided by vector | |
7354 | size. */ | |
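/* E.g., with GROUP_SIZE == 2, VF == 8 and NUNITS == 4, each node of the
   SLP tree needs 2 * 8 / 4 == 4 vector stmts.  */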
7355 | vec_stmts_size = vectorization_factor * group_size / nunits; | |
7356 | ||
7357 | /* Schedule the tree of INSTANCE. */ | |
7358 | is_store = vect_schedule_slp_instance (SLP_INSTANCE_TREE (instance), | |
7359 | vec_stmts_size); | |
7360 | ||
7361 | if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS) | |
7362 | || vect_print_dump_info (REPORT_UNVECTORIZED_LOOPS)) | |
7363 | fprintf (vect_dump, "vectorizing stmts using SLP."); | |
7364 | } | |
7365 | ||
7366 | return is_store; | |
7367 | } | |
7368 | ||
f7064d11 DN |
7369 | /* Function vect_transform_loop. |
7370 | ||
7371 | The analysis phase has determined that the loop is vectorizable. | |
7372 | Vectorize the loop - created vectorized stmts to replace the scalar | |
7373 | stmts in the loop, and update the loop exit condition. */ | |
7374 | ||
7375 | void | |
d73be268 | 7376 | vect_transform_loop (loop_vec_info loop_vinfo) |
f7064d11 DN |
7377 | { |
7378 | struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo); | |
7379 | basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo); | |
7380 | int nbbs = loop->num_nodes; | |
8fca6de5 | 7381 | block_stmt_iterator si, next_si; |
f7064d11 DN |
7382 | int i; |
7383 | tree ratio = NULL; | |
7384 | int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo); | |
98b44b0e | 7385 | bool strided_store; |
805e2059 IR |
7386 | bool slp_scheduled = false; |
7387 | unsigned int nunits; | |
f7064d11 | 7388 | |
00518cb1 | 7389 | if (vect_print_dump_info (REPORT_DETAILS)) |
f7064d11 | 7390 | fprintf (vect_dump, "=== vect_transform_loop ==="); |
749cc4b1 HJ |
7391 | |
7392 | if (VEC_length (tree, LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo)) | |
7393 | || VEC_length (ddr_p, LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo))) | |
7394 | vect_loop_versioning (loop_vinfo); | |
c12cc930 | 7395 | |
6fc0bb99 | 7396 | /* CHECKME: we wouldn't need this if we called update_ssa once |
90ff949f | 7397 | for all loops. */ |
38635499 | 7398 | bitmap_zero (vect_memsyms_to_rename); |
90ff949f | 7399 | |
f7064d11 DN |
7400 | /* Peel the loop if there are data refs with unknown alignment. |
7401 | Only one data ref with unknown store is allowed. */ | |
7402 | ||
5f55a1ba | 7403 | if (LOOP_PEELING_FOR_ALIGNMENT (loop_vinfo)) |
d73be268 | 7404 | vect_do_peeling_for_alignment (loop_vinfo); |
f7064d11 DN |
7405 | |
7406 | /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a | |
7407 | compile time constant), or it is a constant that doesn't divide by the | |
7408 | vectorization factor, then an epilog loop needs to be created. | |
7409 | We therefore duplicate the loop: the original loop will be vectorized, | |
7410 | and will compute the first (n/VF) iterations. The second copy of the loop | |
7411 | will remain scalar and will compute the remaining (n%VF) iterations. | |
7412 | (VF is the vectorization factor). */ | |
7413 | ||
7414 | if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) | |
7415 | || (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) | |
7416 | && LOOP_VINFO_INT_NITERS (loop_vinfo) % vectorization_factor != 0)) | |
d73be268 | 7417 | vect_do_peeling_for_loop_bound (loop_vinfo, &ratio); |
f7064d11 DN |
7418 | else |
7419 | ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)), | |
7420 | LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor); | |
7421 | ||
7422 | /* 1) Make sure the loop header has exactly two entries | |
7423 | 2) Make sure we have a preheader basic block. */ | |
7424 | ||
7425 | gcc_assert (EDGE_COUNT (loop->header->preds) == 2); | |
7426 | ||
598ec7bd | 7427 | split_edge (loop_preheader_edge (loop)); |
f7064d11 DN |
7428 | |
7429 | /* FORNOW: the vectorizer supports only loops whose body consists |
7430 | of one basic block (header + empty latch). When the vectorizer |
7431 | supports more involved loop forms, the order in which the BBs are |
7432 | traversed needs to be reconsidered. */ |
7433 | ||
7434 | for (i = 0; i < nbbs; i++) | |
7435 | { | |
7436 | basic_block bb = bbs[i]; | |
cd38ca7f DN |
7437 | stmt_vec_info stmt_info; |
7438 | tree phi; | |
7439 | ||
7440 | for (phi = phi_nodes (bb); phi; phi = PHI_CHAIN (phi)) | |
7441 | { | |
7442 | if (vect_print_dump_info (REPORT_DETAILS)) | |
7443 | { | |
7444 | fprintf (vect_dump, "------>vectorizing phi: "); | |
7445 | print_generic_expr (vect_dump, phi, TDF_SLIM); | |
7446 | } | |
7447 | stmt_info = vinfo_for_stmt (phi); | |
7448 | if (!stmt_info) | |
7449 | continue; | |
805e2059 | 7450 | |
cd38ca7f DN |
7451 | if (!STMT_VINFO_RELEVANT_P (stmt_info) |
7452 | && !STMT_VINFO_LIVE_P (stmt_info)) | |
7453 | continue; | |
7454 | ||
7455 | if ((TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)) | |
7456 | != (unsigned HOST_WIDE_INT) vectorization_factor) | |
7457 | && vect_print_dump_info (REPORT_DETAILS)) | |
7458 | fprintf (vect_dump, "multiple-types."); | |
7459 | ||
7460 | if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def) | |
7461 | { | |
7462 | if (vect_print_dump_info (REPORT_DETAILS)) | |
7463 | fprintf (vect_dump, "transform phi."); | |
805e2059 | 7464 | vect_transform_stmt (phi, NULL, NULL, NULL); |
cd38ca7f DN |
7465 | } |
7466 | } | |
f7064d11 DN |
7467 | |
7468 | for (si = bsi_start (bb); !bsi_end_p (si);) | |
7469 | { | |
7470 | tree stmt = bsi_stmt (si); | |
f7064d11 DN |
7471 | bool is_store; |
7472 | ||
00518cb1 | 7473 | if (vect_print_dump_info (REPORT_DETAILS)) |
f7064d11 DN |
7474 | { |
7475 | fprintf (vect_dump, "------>vectorizing statement: "); | |
7476 | print_generic_expr (vect_dump, stmt, TDF_SLIM); | |
7477 | } | |
d29de1bf | 7478 | |
f7064d11 | 7479 | stmt_info = vinfo_for_stmt (stmt); |
d29de1bf DN |
7480 | |
7481 | /* vector stmts created in the outer-loop during vectorization of | |
7482 | stmts in an inner-loop may not have a stmt_info, and do not | |
7483 | need to be vectorized. */ | |
7484 | if (!stmt_info) | |
7485 | { | |
7486 | bsi_next (&si); | |
7487 | continue; | |
7488 | } | |
7489 | ||
61d3cdbb DN |
7490 | if (!STMT_VINFO_RELEVANT_P (stmt_info) |
7491 | && !STMT_VINFO_LIVE_P (stmt_info)) | |
f7064d11 DN |
7492 | { |
7493 | bsi_next (&si); | |
7494 | continue; | |
7495 | } | |
89d67cca | 7496 | |
28e44f4f | 7497 | gcc_assert (STMT_VINFO_VECTYPE (stmt_info)); |
805e2059 IR |
7498 | nunits = |
7499 | (unsigned int) TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)); | |
7500 | if (!STMT_SLP_TYPE (stmt_info) | |
7501 | && nunits != (unsigned int) vectorization_factor | |
7502 | && vect_print_dump_info (REPORT_DETAILS)) | |
7503 | /* For SLP VF is set according to unrolling factor, and not to | |
7504 | vector size, hence for SLP this print is not valid. */ | |
7505 | fprintf (vect_dump, "multiple-types."); | |
7506 | ||
7507 | /* SLP. Schedule all the SLP instances when the first SLP stmt is | |
7508 | reached. */ | |
7509 | if (STMT_SLP_TYPE (stmt_info)) | |
7510 | { | |
7511 | if (!slp_scheduled) | |
7512 | { | |
7513 | slp_scheduled = true; | |
7514 | ||
7515 | if (vect_print_dump_info (REPORT_DETAILS)) | |
7516 | fprintf (vect_dump, "=== scheduling SLP instances ==="); | |
61d3cdbb | 7517 | |
805e2059 IR |
7518 | is_store = vect_schedule_slp (loop_vinfo, nunits); |
7519 | ||
7520 | /* IS_STORE is true if STMT is a store. Stores cannot be of | |
7521 | hybrid SLP type. They are removed in | |
7522 | vect_schedule_slp_instance and their vinfo is destroyed. */ | |
7523 | if (is_store) | |
7524 | { | |
7525 | bsi_next (&si); | |
7526 | continue; | |
7527 | } | |
7528 | } | |
7529 | ||
7530 | /* Hybrid SLP stmts must be vectorized in addition to SLP. */ | |
7531 | if (PURE_SLP_STMT (stmt_info)) | |
7532 | { | |
7533 | bsi_next (&si); | |
7534 | continue; | |
7535 | } | |
7536 | } | |
7537 | ||
f7064d11 | 7538 | /* -------- vectorize statement ------------ */ |
00518cb1 | 7539 | if (vect_print_dump_info (REPORT_DETAILS)) |
f7064d11 DN |
7540 | fprintf (vect_dump, "transform statement."); |
7541 | ||
98b44b0e | 7542 | strided_store = false; |
805e2059 | 7543 | is_store = vect_transform_stmt (stmt, &si, &strided_store, NULL); |
98b44b0e IR |
7544 | if (is_store) |
7545 | { | |
7546 | stmt_ann_t ann; | |
805e2059 | 7547 | if (STMT_VINFO_STRIDED_ACCESS (stmt_info)) |
98b44b0e IR |
7548 | { |
7549 | /* Interleaving. If IS_STORE is TRUE, the vectorization of the | |
7550 | interleaving chain was completed - free all the stores in | |
7551 | the chain. */ | |
7552 | tree next = DR_GROUP_FIRST_DR (stmt_info); | |
7553 | tree tmp; | |
7554 | stmt_vec_info next_stmt_info; | |
7555 | ||
7556 | while (next) | |
7557 | { | |
8fca6de5 | 7558 | next_si = bsi_for_stmt (next); |
98b44b0e IR |
7559 | next_stmt_info = vinfo_for_stmt (next); |
7560 | /* Free the attached stmt_vec_info and remove the stmt. */ | |
7561 | ann = stmt_ann (next); | |
7562 | tmp = DR_GROUP_NEXT_DR (next_stmt_info); | |
7563 | free (next_stmt_info); | |
7564 | set_stmt_info (ann, NULL); | |
8fca6de5 | 7565 | bsi_remove (&next_si, true); |
98b44b0e IR |
7566 | next = tmp; |
7567 | } | |
7568 | bsi_remove (&si, true); | |
7569 | continue; | |
7570 | } | |
7571 | else | |
7572 | { | |
7573 | /* Free the attached stmt_vec_info and remove the stmt. */ | |
7574 | ann = stmt_ann (stmt); | |
7575 | free (stmt_info); | |
7576 | set_stmt_info (ann, NULL); | |
7577 | bsi_remove (&si, true); | |
7578 | continue; | |
7579 | } | |
f7064d11 | 7580 | } |
f7064d11 DN |
7581 | bsi_next (&si); |
7582 | } /* stmts in BB */ | |
7583 | } /* BBs in loop */ | |
7584 | ||
7585 | slpeel_make_loop_iterate_ntimes (loop, ratio); | |
7586 | ||
38635499 | 7587 | mark_set_for_renaming (vect_memsyms_to_rename); |
90ff949f | 7588 | |
84d65814 DN |
7589 | /* The memory tags and pointers in vectorized statements need to |
7590 | have their SSA forms updated. FIXME, why can't this be delayed | |
7591 | until all the loops have been transformed? */ | |
7592 | update_ssa (TODO_update_ssa); | |
7593 | ||
00518cb1 | 7594 | if (vect_print_dump_info (REPORT_VECTORIZED_LOOPS)) |
f7064d11 | 7595 | fprintf (vect_dump, "LOOP VECTORIZED."); |
d29de1bf DN |
7596 | if (loop->inner && vect_print_dump_info (REPORT_VECTORIZED_LOOPS)) |
7597 | fprintf (vect_dump, "OUTER LOOP VECTORIZED."); | |
f7064d11 | 7598 | } |