]>
Commit | Line | Data |
---|---|---|
1cc521f1 MM |
1 | /* Loop unroll-and-jam. |
2 | Copyright (C) 2017 Free Software Foundation, Inc. | |
3 | ||
4 | This file is part of GCC. | |
5 | ||
6 | GCC is free software; you can redistribute it and/or modify it | |
7 | under the terms of the GNU General Public License as published by the | |
8 | Free Software Foundation; either version 3, or (at your option) any | |
9 | later version. | |
10 | ||
11 | GCC is distributed in the hope that it will be useful, but WITHOUT | |
12 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
13 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | |
14 | for more details. | |
15 | ||
16 | You should have received a copy of the GNU General Public License | |
17 | along with GCC; see the file COPYING3. If not see | |
18 | <http://www.gnu.org/licenses/>. */ | |
19 | ||
20 | #include "config.h" | |
21 | #include "system.h" | |
22 | #include "coretypes.h" | |
23 | #include "params.h" | |
24 | #include "tree-pass.h" | |
25 | #include "backend.h" | |
26 | #include "tree.h" | |
27 | #include "gimple.h" | |
28 | #include "ssa.h" | |
29 | #include "fold-const.h" | |
30 | #include "tree-cfg.h" | |
31 | #include "tree-ssa.h" | |
32 | #include "tree-ssa-loop-niter.h" | |
33 | #include "tree-ssa-loop.h" | |
34 | #include "tree-ssa-loop-manip.h" | |
35 | #include "cfgloop.h" | |
36 | #include "tree-scalar-evolution.h" | |
37 | #include "gimple-iterator.h" | |
38 | #include "cfghooks.h" | |
39 | #include "tree-data-ref.h" | |
40 | #include "tree-ssa-loop-ivopts.h" | |
41 | #include "tree-vectorizer.h" | |
42 | ||
43 | /* Unroll and Jam transformation | |
44 | ||
45 | This is a combination of two transformations, where the second | |
46 | is not always valid. It's applicable if a loop nest has redundancies | |
47 | over the iterations of an outer loop while not having that with | |
48 | an inner loop. | |
49 | ||
50 | Given this nest: | |
51 | for (i) { | |
52 | for (j) { | |
53 | B(i,j) | |
54 | } | |
55 | } | |
56 | ||
57 | first unroll: | |
58 | for (i by 2) { | |
59 | for (j) { | |
60 | B(i,j) | |
61 | } | |
62 | for (j) { | |
63 | B(i+1,j) | |
64 | } | |
65 | } | |
66 | ||
67 | then fuse the two adjacent inner loops resulting from that: | |
68 | for (i by 2) { | |
69 | for (j) { | |
70 | B(i,j) | |
71 | B(i+1,j) | |
72 | } | |
73 | } | |
74 | ||
75 | As the order of evaluations of the body B changes this is valid | |
76 | only in certain situations: all distance vectors need to be forward. | |
77 | Additionally if there are multiple induction variables than just | |
78 | a counting control IV (j above) we can also deal with some situations. | |
79 | ||
80 | The validity is checked by unroll_jam_possible_p, and the data-dep | |
81 | testing below. | |
82 | ||
83 | A trivial example where the fusion is wrong would be when | |
84 | B(i,j) == x[j-1] = x[j]; | |
85 | for (i by 2) { | |
86 | for (j) { | |
87 | x[j-1] = x[j]; | |
88 | } | |
89 | for (j) { | |
90 | x[j-1] = x[j]; | |
91 | } | |
92 | } effect: move content to front by two elements | |
93 | --> | |
94 | for (i by 2) { | |
95 | for (j) { | |
96 | x[j-1] = x[j]; | |
97 | x[j-1] = x[j]; | |
98 | } | |
99 | } effect: move content to front by one element | |
100 | */ | |
101 | ||
102 | /* Modify the loop tree for the fact that all code once belonging | |
103 | to the OLD loop or the outer loop of OLD now is inside LOOP. */ | |
104 | ||
105 | static void | |
106 | merge_loop_tree (struct loop *loop, struct loop *old) | |
107 | { | |
108 | basic_block *bbs; | |
109 | int i, n; | |
110 | struct loop *subloop; | |
111 | edge e; | |
112 | edge_iterator ei; | |
113 | ||
114 | /* Find its nodes. */ | |
115 | bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun)); | |
116 | n = get_loop_body_with_size (loop, bbs, n_basic_blocks_for_fn (cfun)); | |
117 | ||
118 | for (i = 0; i < n; i++) | |
119 | { | |
120 | /* If the block was direct child of OLD loop it's now part | |
121 | of LOOP. If it was outside OLD, then it moved into LOOP | |
122 | as well. This avoids changing the loop father for BBs | |
123 | in inner loops of OLD. */ | |
124 | if (bbs[i]->loop_father == old | |
125 | || loop_depth (bbs[i]->loop_father) < loop_depth (old)) | |
126 | { | |
127 | remove_bb_from_loops (bbs[i]); | |
128 | add_bb_to_loop (bbs[i], loop); | |
129 | continue; | |
130 | } | |
131 | ||
132 | /* If we find a direct subloop of OLD, move it to LOOP. */ | |
133 | subloop = bbs[i]->loop_father; | |
134 | if (loop_outer (subloop) == old && subloop->header == bbs[i]) | |
135 | { | |
136 | flow_loop_tree_node_remove (subloop); | |
137 | flow_loop_tree_node_add (loop, subloop); | |
138 | } | |
139 | } | |
140 | ||
141 | /* Update the information about loop exit edges. */ | |
142 | for (i = 0; i < n; i++) | |
143 | { | |
144 | FOR_EACH_EDGE (e, ei, bbs[i]->succs) | |
145 | { | |
146 | rescan_loop_exit (e, false, false); | |
147 | } | |
148 | } | |
149 | ||
150 | loop->num_nodes = n; | |
151 | ||
152 | free (bbs); | |
153 | } | |
154 | ||
155 | /* BB exits the outer loop of an unroll-and-jam situation. | |
156 | Check if any statements therein would prevent the transformation. */ | |
157 | ||
158 | static bool | |
159 | bb_prevents_fusion_p (basic_block bb) | |
160 | { | |
161 | gimple_stmt_iterator gsi; | |
162 | /* BB is duplicated by outer unrolling and then all N-1 first copies | |
163 | move into the body of the fused inner loop. The last copy remains | |
164 | the exit block of the outer loop and is still outside the inner loop | |
165 | also after fusion. We can't allow this for some effects of BB: | |
166 | * stores or unknown side-effects prevent fusion | |
167 | * loads don't | |
168 | * computations into SSA names: these aren't problematic. Their | |
169 | result will be unused on the exit edges of the first N-1 copies | |
170 | (those aren't taken after unrolling). If they are used on the | |
171 | other edge (the one leading to the outer latch block) they are | |
172 | loop-carried (on the outer loop) and the Nth copy of BB will | |
173 | compute them again (i.e. the first N-1 copies will be dead). */ | |
174 | for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi)) | |
175 | { | |
176 | gimple *g = gsi_stmt (gsi); | |
177 | if (gimple_vdef (g) || gimple_has_side_effects (g)) | |
178 | return true; | |
179 | } | |
180 | return false; | |
181 | } | |
182 | ||
183 | /* Given an inner loop LOOP (of some OUTER loop) determine if | |
184 | we can safely fuse copies of it (generated by outer unrolling). | |
185 | If so return true, otherwise return false. */ | |
186 | ||
187 | static bool | |
188 | unroll_jam_possible_p (struct loop *outer, struct loop *loop) | |
189 | { | |
190 | basic_block *bbs; | |
191 | int i, n; | |
192 | struct tree_niter_desc niter; | |
193 | ||
194 | /* When fusing the loops we skip the latch block | |
195 | of the first one, so it mustn't have any effects to | |
196 | preserve. */ | |
197 | if (!empty_block_p (loop->latch)) | |
198 | return false; | |
199 | ||
200 | if (!single_exit (loop)) | |
201 | return false; | |
202 | ||
203 | /* We need a perfect nest. Quick check for adjacent inner loops. */ | |
204 | if (outer->inner != loop || loop->next) | |
205 | return false; | |
206 | ||
207 | /* The number of iterations of the inner loop must be loop invariant | |
208 | with respect to the outer loop. */ | |
209 | if (!number_of_iterations_exit (loop, single_exit (loop), &niter, | |
210 | false, true) | |
211 | || niter.cmp == ERROR_MARK | |
212 | || !integer_zerop (niter.may_be_zero) | |
213 | || !expr_invariant_in_loop_p (outer, niter.niter)) | |
214 | return false; | |
215 | ||
216 | /* And check blocks belonging to just outer loop. */ | |
217 | bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun)); | |
218 | n = get_loop_body_with_size (outer, bbs, n_basic_blocks_for_fn (cfun)); | |
219 | ||
220 | for (i = 0; i < n; i++) | |
221 | { | |
222 | if (bbs[i]->loop_father == outer | |
223 | && bbs[i] != outer->latch && bbs[i] != outer->header | |
224 | && (!loop_exits_from_bb_p (outer, bbs[i]) | |
225 | || bb_prevents_fusion_p (bbs[i]))) | |
226 | break; | |
227 | /* XXX Note that the above disallows head-controlled inner loops, | |
228 | that we usually have. The guard block would need to be accepted | |
229 | (invariant condition either entering or skipping the loop), | |
230 | without also accepting arbitrary control flow. When unswitching | |
231 | ran before us (as with -O3) this won't be a problem because its | |
232 | outer loop unswitching will have moved out the invariant condition. | |
233 | ||
234 | If we do that we need to extend fuse_loops() to cope with this | |
235 | by threading through the (still invariant) copied condition | |
236 | between the two loop copies. */ | |
237 | } | |
238 | free (bbs); | |
239 | if (i != n) | |
240 | return false; | |
241 | ||
242 | /* For now we can safely fuse copies of LOOP only if all | |
243 | loop carried variables are inductions (or the virtual op). | |
244 | ||
245 | We could handle reductions as well (the initial value in the second | |
246 | body would be the after-iter value of the first body) if it's over | |
247 | an associative and commutative operation. We wouldn't | |
248 | be able to handle unknown cycles. */ | |
249 | gphi_iterator psi; | |
250 | for (psi = gsi_start_phis (loop->header); !gsi_end_p (psi); gsi_next (&psi)) | |
251 | { | |
252 | affine_iv iv; | |
253 | tree op = gimple_phi_result (psi.phi ()); | |
254 | ||
255 | if (virtual_operand_p (op)) | |
256 | continue; | |
257 | if (!simple_iv (loop, loop, op, &iv, true)) | |
258 | return false; | |
259 | /* The inductions must be regular, loop invariant step and initial | |
260 | value. */ | |
261 | if (!expr_invariant_in_loop_p (outer, iv.step) | |
262 | || !expr_invariant_in_loop_p (outer, iv.base)) | |
263 | return false; | |
264 | /* XXX With more effort we could also be able to deal with inductions | |
265 | where the initial value is loop variant but a simple IV in the | |
266 | outer loop. The initial value for the second body would be | |
267 | the original initial value plus iv.base.step. The next value | |
268 | for the fused loop would be the original next value of the first | |
269 | copy, _not_ the next value of the second body. */ | |
270 | } | |
271 | ||
272 | return true; | |
273 | } | |
274 | ||
275 | /* Fuse LOOP with all further neighbors. The loops are expected to | |
276 | be in appropriate form. */ | |
277 | ||
278 | static void | |
279 | fuse_loops (struct loop *loop) | |
280 | { | |
281 | struct loop *next = loop->next; | |
282 | ||
283 | while (next) | |
284 | { | |
285 | edge e; | |
286 | ||
287 | remove_branch (single_pred_edge (loop->latch)); | |
288 | /* Make delete_basic_block not fiddle with the loop structure. */ | |
289 | basic_block oldlatch = loop->latch; | |
290 | loop->latch = NULL; | |
291 | delete_basic_block (oldlatch); | |
292 | e = redirect_edge_and_branch (loop_latch_edge (next), | |
293 | loop->header); | |
294 | loop->latch = e->src; | |
295 | flush_pending_stmts (e); | |
296 | ||
297 | gcc_assert (EDGE_COUNT (next->header->preds) == 1); | |
298 | ||
299 | /* The PHI nodes of the second body (single-argument now) | |
300 | need adjustments to use the right values: either directly | |
301 | the value of the corresponding PHI in the first copy or | |
302 | the one leaving the first body which unrolling did for us. | |
303 | ||
304 | See also unroll_jam_possible_p() for further possibilities. */ | |
305 | gphi_iterator psi_first, psi_second; | |
306 | e = single_pred_edge (next->header); | |
307 | for (psi_first = gsi_start_phis (loop->header), | |
308 | psi_second = gsi_start_phis (next->header); | |
309 | !gsi_end_p (psi_first); | |
310 | gsi_next (&psi_first), gsi_next (&psi_second)) | |
311 | { | |
312 | gphi *phi_first = psi_first.phi (); | |
313 | gphi *phi_second = psi_second.phi (); | |
314 | tree firstop = gimple_phi_result (phi_first); | |
315 | /* The virtual operand is correct already as it's | |
316 | always live at exit, hence has a LCSSA node and outer | |
317 | loop unrolling updated SSA form. */ | |
318 | if (virtual_operand_p (firstop)) | |
319 | continue; | |
320 | ||
321 | /* Due to unroll_jam_possible_p() we know that this is | |
322 | an induction. The second body goes over the same | |
323 | iteration space. */ | |
324 | add_phi_arg (phi_second, firstop, e, | |
325 | gimple_location (phi_first)); | |
326 | } | |
327 | gcc_assert (gsi_end_p (psi_second)); | |
328 | ||
329 | merge_loop_tree (loop, next); | |
330 | gcc_assert (!next->num_nodes); | |
331 | struct loop *ln = next->next; | |
332 | delete_loop (next); | |
333 | next = ln; | |
334 | } | |
335 | rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop); | |
336 | } | |
337 | ||
338 | /* Returns true if the distance in DDR can be determined and adjusts | |
339 | the unroll factor in *UNROLL to make unrolling valid for that distance. | |
340 | Otherwise return false. | |
341 | ||
342 | If this data dep can lead to a removed memory reference, increment | |
343 | *REMOVED and adjust *PROFIT_UNROLL to be the necessary unroll factor | |
344 | for this to happen. */ | |
345 | ||
346 | static bool | |
347 | adjust_unroll_factor (struct data_dependence_relation *ddr, | |
348 | unsigned *unroll, unsigned *profit_unroll, | |
349 | unsigned *removed) | |
350 | { | |
351 | bool ret = false; | |
352 | if (DDR_ARE_DEPENDENT (ddr) != chrec_known) | |
353 | { | |
354 | if (DDR_NUM_DIST_VECTS (ddr) == 0) | |
355 | return false; | |
356 | unsigned i; | |
357 | lambda_vector dist_v; | |
358 | FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v) | |
359 | { | |
360 | /* A distance (a,b) is at worst transformed into (a/N,b) by the | |
361 | unrolling (factor N), so the transformation is valid if | |
362 | a >= N, or b > 0, or b is zero and a > 0. Otherwise the unroll | |
363 | factor needs to be limited so that the first condition holds. | |
364 | That may limit the factor down to zero in the worst case. */ | |
365 | int dist = dist_v[0]; | |
366 | if (dist < 0) | |
367 | gcc_unreachable (); | |
368 | else if ((unsigned)dist >= *unroll) | |
369 | ; | |
370 | else if (lambda_vector_lexico_pos (dist_v + 1, DDR_NB_LOOPS (ddr) - 1) | |
371 | || (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1) | |
372 | && dist > 0)) | |
373 | ; | |
374 | else | |
375 | *unroll = dist; | |
376 | ||
377 | /* With a distance (a,0) it's always profitable to unroll-and-jam | |
378 | (by a+1), because one memory reference will go away. With | |
379 | (a,b) and b != 0 that's less clear. We will increase the | |
380 | number of streams without lowering the number of mem refs. | |
381 | So for now only handle the first situation. */ | |
382 | if (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)) | |
383 | { | |
384 | *profit_unroll = MAX (*profit_unroll, (unsigned)dist + 1); | |
385 | (*removed)++; | |
386 | } | |
387 | ||
388 | ret = true; | |
389 | } | |
390 | } | |
391 | return ret; | |
392 | } | |
393 | ||
394 | /* Main entry point for the unroll-and-jam transformation | |
395 | described above. */ | |
396 | ||
397 | static unsigned int | |
398 | tree_loop_unroll_and_jam (void) | |
399 | { | |
400 | struct loop *loop; | |
401 | bool changed = false; | |
402 | ||
403 | gcc_assert (scev_initialized_p ()); | |
404 | ||
405 | /* Go through all innermost loops. */ | |
406 | FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST) | |
407 | { | |
408 | struct loop *outer = loop_outer (loop); | |
409 | ||
410 | if (loop_depth (loop) < 2 | |
411 | || optimize_loop_nest_for_size_p (outer)) | |
412 | continue; | |
413 | ||
414 | if (!unroll_jam_possible_p (outer, loop)) | |
415 | continue; | |
416 | ||
417 | vec<data_reference_p> datarefs; | |
418 | vec<ddr_p> dependences; | |
419 | unsigned unroll_factor, profit_unroll, removed; | |
420 | struct tree_niter_desc desc; | |
421 | bool unroll = false; | |
422 | ||
423 | auto_vec<loop_p, 3> loop_nest; | |
424 | dependences.create (10); | |
425 | datarefs.create (10); | |
426 | if (!compute_data_dependences_for_loop (outer, true, &loop_nest, | |
427 | &datarefs, &dependences)) | |
428 | { | |
429 | if (dump_file && (dump_flags & TDF_DETAILS)) | |
430 | fprintf (dump_file, "Cannot analyze data dependencies\n"); | |
431 | free_data_refs (datarefs); | |
432 | free_dependence_relations (dependences); | |
433 | return false; | |
434 | } | |
435 | if (!datarefs.length ()) | |
436 | continue; | |
437 | ||
438 | if (dump_file && (dump_flags & TDF_DETAILS)) | |
439 | dump_data_dependence_relations (dump_file, dependences); | |
440 | ||
441 | unroll_factor = (unsigned)-1; | |
442 | profit_unroll = 1; | |
443 | removed = 0; | |
444 | ||
445 | /* Check all dependencies. */ | |
446 | unsigned i; | |
447 | struct data_dependence_relation *ddr; | |
448 | FOR_EACH_VEC_ELT (dependences, i, ddr) | |
449 | { | |
450 | struct data_reference *dra, *drb; | |
451 | ||
452 | /* If the refs are independend there's nothing to do. */ | |
453 | if (DDR_ARE_DEPENDENT (ddr) == chrec_known) | |
454 | continue; | |
455 | dra = DDR_A (ddr); | |
456 | drb = DDR_B (ddr); | |
457 | /* Nothing interesting for the self dependencies. */ | |
458 | if (dra == drb) | |
459 | continue; | |
460 | ||
461 | /* Now check the distance vector, for determining a sensible | |
462 | outer unroll factor, and for validity of merging the inner | |
463 | loop copies. */ | |
464 | if (!adjust_unroll_factor (ddr, &unroll_factor, &profit_unroll, | |
465 | &removed)) | |
466 | { | |
467 | /* Couldn't get the distance vector. For two reads that's | |
468 | harmless (we assume we should unroll). For at least | |
469 | one write this means we can't check the dependence direction | |
470 | and hence can't determine safety. */ | |
471 | ||
472 | if (DR_IS_WRITE (dra) || DR_IS_WRITE (drb)) | |
473 | { | |
474 | unroll_factor = 0; | |
475 | break; | |
476 | } | |
477 | } | |
478 | } | |
479 | ||
480 | /* We regard a user-specified minimum percentage of zero as a request | |
481 | to ignore all profitability concerns and apply the transformation | |
482 | always. */ | |
483 | if (!PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT)) | |
484 | profit_unroll = 2; | |
485 | else if (removed * 100 / datarefs.length () | |
486 | < (unsigned)PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT)) | |
487 | profit_unroll = 1; | |
488 | if (unroll_factor > profit_unroll) | |
489 | unroll_factor = profit_unroll; | |
490 | if (unroll_factor > (unsigned)PARAM_VALUE (PARAM_UNROLL_JAM_MAX_UNROLL)) | |
491 | unroll_factor = PARAM_VALUE (PARAM_UNROLL_JAM_MAX_UNROLL); | |
492 | unroll = (unroll_factor > 1 | |
493 | && can_unroll_loop_p (outer, unroll_factor, &desc)); | |
494 | ||
495 | if (unroll) | |
496 | { | |
497 | if (dump_enabled_p ()) | |
498 | dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | TDF_DETAILS, | |
499 | find_loop_location (outer), | |
500 | "applying unroll and jam with factor %d\n", | |
501 | unroll_factor); | |
502 | initialize_original_copy_tables (); | |
503 | tree_unroll_loop (outer, unroll_factor, single_dom_exit (outer), | |
504 | &desc); | |
505 | free_original_copy_tables (); | |
506 | fuse_loops (outer->inner); | |
507 | changed = true; | |
508 | } | |
509 | ||
510 | loop_nest.release (); | |
511 | free_dependence_relations (dependences); | |
512 | free_data_refs (datarefs); | |
513 | } | |
514 | ||
515 | if (changed) | |
516 | { | |
517 | scev_reset (); | |
518 | free_dominance_info (CDI_DOMINATORS); | |
519 | return TODO_cleanup_cfg; | |
520 | } | |
521 | return 0; | |
522 | } | |
523 | ||
524 | /* Pass boilerplate */ | |
525 | ||
526 | namespace { | |
527 | ||
528 | const pass_data pass_data_loop_jam = | |
529 | { | |
530 | GIMPLE_PASS, /* type */ | |
531 | "unrolljam", /* name */ | |
532 | OPTGROUP_LOOP, /* optinfo_flags */ | |
533 | TV_LOOP_JAM, /* tv_id */ | |
534 | PROP_cfg, /* properties_required */ | |
535 | 0, /* properties_provided */ | |
536 | 0, /* properties_destroyed */ | |
537 | 0, /* todo_flags_start */ | |
538 | 0, /* todo_flags_finish */ | |
539 | }; | |
540 | ||
541 | class pass_loop_jam : public gimple_opt_pass | |
542 | { | |
543 | public: | |
544 | pass_loop_jam (gcc::context *ctxt) | |
545 | : gimple_opt_pass (pass_data_loop_jam, ctxt) | |
546 | {} | |
547 | ||
548 | /* opt_pass methods: */ | |
549 | virtual bool gate (function *) { return flag_unroll_jam != 0; } | |
550 | virtual unsigned int execute (function *); | |
551 | ||
552 | }; | |
553 | ||
554 | unsigned int | |
555 | pass_loop_jam::execute (function *fun) | |
556 | { | |
557 | if (number_of_loops (fun) <= 1) | |
558 | return 0; | |
559 | ||
560 | return tree_loop_unroll_and_jam (); | |
561 | } | |
562 | ||
563 | } | |
564 | ||
565 | gimple_opt_pass * | |
566 | make_pass_loop_jam (gcc::context *ctxt) | |
567 | { | |
568 | return new pass_loop_jam (ctxt); | |
569 | } |