/* Loop unroll-and-jam.
   Copyright (C) 2017 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
Free Software Foundation; either version 3, or (at your option) any
later version.

GCC is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */

#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "params.h"
#include "tree-pass.h"
#include "backend.h"
#include "tree.h"
#include "gimple.h"
#include "ssa.h"
#include "fold-const.h"
#include "tree-cfg.h"
#include "tree-ssa.h"
#include "tree-ssa-loop-niter.h"
#include "tree-ssa-loop.h"
#include "tree-ssa-loop-manip.h"
#include "cfgloop.h"
#include "tree-scalar-evolution.h"
#include "gimple-iterator.h"
#include "cfghooks.h"
#include "tree-data-ref.h"
#include "tree-ssa-loop-ivopts.h"
#include "tree-vectorizer.h"

/* Unroll and Jam transformation

   This is a combination of two transformations, where the second
   is not always valid.  It's applicable if a loop nest has redundancies
   over the iterations of an outer loop while not having that with
   an inner loop.

   Given this nest:
       for (i) {
         for (j) {
           B(i,j)
         }
       }

   first unroll:
       for (i by 2) {
         for (j) {
           B(i,j)
         }
         for (j) {
           B(i+1,j)
         }
       }

   then fuse the two adjacent inner loops resulting from that:
       for (i by 2) {
         for (j) {
           B(i,j)
           B(i+1,j)
         }
       }

   As the order of evaluations of the body B changes this is valid
   only in certain situations: all distance vectors need to be forward.
   Additionally, if there are induction variables other than just a
   counting control IV (j above), we can still handle some situations.

   The validity is checked by unroll_jam_possible_p, and the data-dep
   testing below.

   A trivial example where the fusion is wrong would be when
   B(i,j) == x[j-1] = x[j];
       for (i by 2) {
         for (j) {
           x[j-1] = x[j];
         }
         for (j) {
           x[j-1] = x[j];
         }
       } effect: move content to front by two elements
       -->
       for (i by 2) {
         for (j) {
           x[j-1] = x[j];
           x[j-1] = x[j];
         }
       } effect: move content to front by one element
*/
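
/* For contrast, an illustrative example where the fusion is both valid
   and profitable would be
   B(i,j) == x[j] = x[j] + a[i];
   All dependences on x[j] have the form (a,0), i.e. they are carried by
   the outer loop only, so fusing is fine, and after fusion the x[j]
   accesses of the two copies hit the same location in the same fused
   iteration, so one memory reference can be reused (see
   adjust_unroll_factor below for how this drives profitability).  */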

/* Modify the loop tree for the fact that all code once belonging
   to the OLD loop or the outer loop of OLD now is inside LOOP.  */

static void
merge_loop_tree (struct loop *loop, struct loop *old)
{
  basic_block *bbs;
  int i, n;
  struct loop *subloop;
  edge e;
  edge_iterator ei;

  /* Find its nodes.  */
  bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun));
  n = get_loop_body_with_size (loop, bbs, n_basic_blocks_for_fn (cfun));

  for (i = 0; i < n; i++)
    {
      /* If the block was direct child of OLD loop it's now part
         of LOOP.  If it was outside OLD, then it moved into LOOP
         as well.  This avoids changing the loop father for BBs
         in inner loops of OLD.  */
      if (bbs[i]->loop_father == old
          || loop_depth (bbs[i]->loop_father) < loop_depth (old))
        {
          remove_bb_from_loops (bbs[i]);
          add_bb_to_loop (bbs[i], loop);
          continue;
        }

      /* If we find a direct subloop of OLD, move it to LOOP.  */
      subloop = bbs[i]->loop_father;
      if (loop_outer (subloop) == old && subloop->header == bbs[i])
        {
          flow_loop_tree_node_remove (subloop);
          flow_loop_tree_node_add (loop, subloop);
        }
    }

  /* Update the information about loop exit edges.  */
  for (i = 0; i < n; i++)
    {
      FOR_EACH_EDGE (e, ei, bbs[i]->succs)
        {
          rescan_loop_exit (e, false, false);
        }
    }

  loop->num_nodes = n;

  free (bbs);
}

/* BB exits the outer loop of an unroll-and-jam situation.
   Check if any statements therein would prevent the transformation.  */

static bool
bb_prevents_fusion_p (basic_block bb)
{
  gimple_stmt_iterator gsi;
  /* BB is duplicated by outer unrolling and then all N-1 first copies
     move into the body of the fused inner loop.  The last copy remains
     the exit block of the outer loop and is still outside the inner loop
     also after fusion.  Not every effect of BB can therefore be allowed:
       * stores or unknown side-effects prevent fusion
       * loads don't
       * computations into SSA names: these aren't problematic.  Their
         result will be unused on the exit edges of the first N-1 copies
         (those aren't taken after unrolling).  If they are used on the
         other edge (the one leading to the outer latch block) they are
         loop-carried (on the outer loop) and the Nth copy of BB will
         compute them again (i.e. the first N-1 copies will be dead).  */
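  /* For instance, a store like *p_1 = t_2 or a call such as printf ()
     in BB would make the first N-1 copies repeat an observable effect
     and so prevents fusion, while a pure computation like t_3 = t_2 + 1
     is fine by the reasoning above.  */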
  for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gimple *g = gsi_stmt (gsi);
      if (gimple_vdef (g) || gimple_has_side_effects (g))
        return true;
    }
  return false;
}

/* Given an inner loop LOOP (of some OUTER loop) determine if
   we can safely fuse copies of it (generated by outer unrolling).
   If so return true, otherwise return false.  */

static bool
unroll_jam_possible_p (struct loop *outer, struct loop *loop)
{
  basic_block *bbs;
  int i, n;
  struct tree_niter_desc niter;

  /* When fusing the loops we skip the latch block
     of the first one, so it mustn't have any effects to
     preserve.  */
  if (!empty_block_p (loop->latch))
    return false;

  if (!single_exit (loop))
    return false;

  /* We need a perfect nest.  Quick check for adjacent inner loops.  */
  if (outer->inner != loop || loop->next)
    return false;

  /* The number of iterations of the inner loop must be loop invariant
     with respect to the outer loop.  */
  if (!number_of_iterations_exit (loop, single_exit (loop), &niter,
                                  false, true)
      || niter.cmp == ERROR_MARK
      || !integer_zerop (niter.may_be_zero)
      || !expr_invariant_in_loop_p (outer, niter.niter))
    return false;

  /* And check the blocks belonging to just the outer loop.  */
  bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun));
  n = get_loop_body_with_size (outer, bbs, n_basic_blocks_for_fn (cfun));

  for (i = 0; i < n; i++)
    {
      if (bbs[i]->loop_father == outer
          && bbs[i] != outer->latch && bbs[i] != outer->header
          && (!loop_exits_from_bb_p (outer, bbs[i])
              || bb_prevents_fusion_p (bbs[i])))
        break;
      /* XXX Note that the above disallows head-controlled inner loops,
         which we usually have.  The guard block would need to be accepted
         (invariant condition either entering or skipping the loop),
         without also accepting arbitrary control flow.  When unswitching
         ran before us (as with -O3) this won't be a problem because its
         outer loop unswitching will have moved out the invariant condition.

         If we do that we need to extend fuse_loops() to cope with this
         by threading through the (still invariant) copied condition
         between the two loop copies.  */
    }
  free (bbs);
  if (i != n)
    return false;

  /* For now we can safely fuse copies of LOOP only if all
     loop carried variables are inductions (or the virtual op).

     We could handle reductions as well (the initial value in the second
     body would be the after-iter value of the first body) if it's over
     an associative and commutative operation.  We wouldn't
     be able to handle unknown cycles.  */
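  /* For instance, a nest whose body is a reduction like  sum += a[i][j]
     gives the inner loop header a PHI for sum whose cycle is not a
     simple affine IV, so the simple_iv check below fails for it and
     such nests are rejected for now.  */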
  gphi_iterator psi;
  for (psi = gsi_start_phis (loop->header); !gsi_end_p (psi); gsi_next (&psi))
    {
      affine_iv iv;
      tree op = gimple_phi_result (psi.phi ());

      if (virtual_operand_p (op))
        continue;
      if (!simple_iv (loop, loop, op, &iv, true))
        return false;
      /* The inductions must be regular, with loop invariant step and
         initial value.  */
      if (!expr_invariant_in_loop_p (outer, iv.step)
          || !expr_invariant_in_loop_p (outer, iv.base))
        return false;
      /* XXX With more effort we could also be able to deal with inductions
         where the initial value is loop variant but a simple IV in the
         outer loop.  The initial value for the second body would be
         the original initial value plus iv.base.step.  The next value
         for the fused loop would be the original next value of the first
         copy, _not_ the next value of the second body.  */
    }

  return true;
}

/* Fuse LOOP with all further neighbors.  The loops are expected to
   be in appropriate form.  */

static void
fuse_loops (struct loop *loop)
{
  struct loop *next = loop->next;

  while (next)
    {
      edge e;

      remove_branch (single_pred_edge (loop->latch));
      /* Make delete_basic_block not fiddle with the loop structure.  */
      basic_block oldlatch = loop->latch;
      loop->latch = NULL;
      delete_basic_block (oldlatch);
      e = redirect_edge_and_branch (loop_latch_edge (next),
                                    loop->header);
      loop->latch = e->src;
      flush_pending_stmts (e);

      gcc_assert (EDGE_COUNT (next->header->preds) == 1);

      /* The PHI nodes of the second body (single-argument now)
         need adjustments to use the right values: either directly
         the value of the corresponding PHI in the first copy or
         the one leaving the first body which unrolling did for us.

         See also unroll_jam_possible_p() for further possibilities.  */
      gphi_iterator psi_first, psi_second;
      e = single_pred_edge (next->header);
      for (psi_first = gsi_start_phis (loop->header),
           psi_second = gsi_start_phis (next->header);
           !gsi_end_p (psi_first);
           gsi_next (&psi_first), gsi_next (&psi_second))
        {
          gphi *phi_first = psi_first.phi ();
          gphi *phi_second = psi_second.phi ();
          tree firstop = gimple_phi_result (phi_first);
          /* The virtual operand is correct already as it's
             always live at exit, hence has a LCSSA node and outer
             loop unrolling updated SSA form.  */
          if (virtual_operand_p (firstop))
            continue;

          /* Due to unroll_jam_possible_p() we know that this is
             an induction.  The second body goes over the same
             iteration space.  */
          add_phi_arg (phi_second, firstop, e,
                       gimple_location (phi_first));
        }
      gcc_assert (gsi_end_p (psi_second));

      merge_loop_tree (loop, next);
      gcc_assert (!next->num_nodes);
      struct loop *ln = next->next;
      delete_loop (next);
      next = ln;
    }
  rewrite_into_loop_closed_ssa_1 (NULL, 0, SSA_OP_USE, loop);
}

/* Returns true if the distance in DDR can be determined and adjusts
   the unroll factor in *UNROLL to make unrolling valid for that distance.
   Otherwise returns false.

   If this data dep can lead to a removed memory reference, increment
   *REMOVED and adjust *PROFIT_UNROLL to be the necessary unroll factor
   for this to happen.  */

static bool
adjust_unroll_factor (struct data_dependence_relation *ddr,
                      unsigned *unroll, unsigned *profit_unroll,
                      unsigned *removed)
{
  bool ret = false;
  if (DDR_ARE_DEPENDENT (ddr) != chrec_known)
    {
      if (DDR_NUM_DIST_VECTS (ddr) == 0)
        return false;
      unsigned i;
      lambda_vector dist_v;
      FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
        {
          /* A distance (a,b) is at worst transformed into (a/N,b) by the
             unrolling (factor N), so the transformation is valid if
             a >= N, or b > 0, or b is zero and a > 0.  Otherwise the unroll
             factor needs to be limited so that the first condition holds.
             That may limit the factor down to zero in the worst case.  */
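          /* As a concrete illustration: with a requested factor of 8 a
             distance of (3,-1) satisfies none of the conditions, so
             *unroll gets limited to 3 below (making a >= N hold), whereas
             (3,0) or (3,1) is fine for any factor.  */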
          int dist = dist_v[0];
          if (dist < 0)
            gcc_unreachable ();
          else if ((unsigned)dist >= *unroll)
            ;
          else if (lambda_vector_lexico_pos (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)
                   || (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1)
                       && dist > 0))
            ;
          else
            *unroll = dist;

          /* With a distance (a,0) it's always profitable to unroll-and-jam
             (by a+1), because one memory reference will go away.  With
             (a,b) and b != 0 that's less clear.  We will increase the
             number of streams without lowering the number of mem refs.
             So for now only handle the first situation.  */
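          /* E.g. for a distance of (2,0), unrolling by 3 places both ends
             of the dependence (outer iterations i and i+2) into the same
             fused body, so one of the two references to that location can
             be reused; hence the dist + 1 below.  */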
          if (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1))
            {
              *profit_unroll = MAX (*profit_unroll, (unsigned)dist + 1);
              (*removed)++;
            }

          ret = true;
        }
    }
  return ret;
}

/* Main entry point for the unroll-and-jam transformation
   described above.  */

static unsigned int
tree_loop_unroll_and_jam (void)
{
  struct loop *loop;
  bool changed = false;

  gcc_assert (scev_initialized_p ());

  /* Go through all innermost loops.  */
  FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST)
    {
      struct loop *outer = loop_outer (loop);

      if (loop_depth (loop) < 2
          || optimize_loop_nest_for_size_p (outer))
        continue;

      if (!unroll_jam_possible_p (outer, loop))
        continue;

      vec<data_reference_p> datarefs;
      vec<ddr_p> dependences;
      unsigned unroll_factor, profit_unroll, removed;
      struct tree_niter_desc desc;
      bool unroll = false;

      auto_vec<loop_p, 3> loop_nest;
      dependences.create (10);
      datarefs.create (10);
      if (!compute_data_dependences_for_loop (outer, true, &loop_nest,
                                              &datarefs, &dependences))
        {
          if (dump_file && (dump_flags & TDF_DETAILS))
            fprintf (dump_file, "Cannot analyze data dependencies\n");
          free_data_refs (datarefs);
          free_dependence_relations (dependences);
          return false;
        }
      if (!datarefs.length ())
        continue;

      if (dump_file && (dump_flags & TDF_DETAILS))
        dump_data_dependence_relations (dump_file, dependences);

      unroll_factor = (unsigned)-1;
      profit_unroll = 1;
      removed = 0;

      /* Check all dependencies.  */
      unsigned i;
      struct data_dependence_relation *ddr;
      FOR_EACH_VEC_ELT (dependences, i, ddr)
        {
          struct data_reference *dra, *drb;

          /* If the refs are independent there's nothing to do.  */
          if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
            continue;
          dra = DDR_A (ddr);
          drb = DDR_B (ddr);
          /* Nothing interesting for the self dependencies.  */
          if (dra == drb)
            continue;

          /* Now check the distance vector, for determining a sensible
             outer unroll factor, and for validity of merging the inner
             loop copies.  */
          if (!adjust_unroll_factor (ddr, &unroll_factor, &profit_unroll,
                                     &removed))
            {
              /* Couldn't get the distance vector.  For two reads that's
                 harmless (we assume we should unroll).  For at least
                 one write this means we can't check the dependence direction
                 and hence can't determine safety.  */

              if (DR_IS_WRITE (dra) || DR_IS_WRITE (drb))
                {
                  unroll_factor = 0;
                  break;
                }
            }
        }

      /* We regard a user-specified minimum percentage of zero as a request
         to ignore all profitability concerns and apply the transformation
         always.  */
      if (!PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT))
        profit_unroll = 2;
      else if (removed * 100 / datarefs.length ()
               < (unsigned)PARAM_VALUE (PARAM_UNROLL_JAM_MIN_PERCENT))
        profit_unroll = 1;
      if (unroll_factor > profit_unroll)
        unroll_factor = profit_unroll;
      if (unroll_factor > (unsigned)PARAM_VALUE (PARAM_UNROLL_JAM_MAX_UNROLL))
        unroll_factor = PARAM_VALUE (PARAM_UNROLL_JAM_MAX_UNROLL);
      unroll = (unroll_factor > 1
                && can_unroll_loop_p (outer, unroll_factor, &desc));

      if (unroll)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | TDF_DETAILS,
                             find_loop_location (outer),
                             "applying unroll and jam with factor %d\n",
                             unroll_factor);
          initialize_original_copy_tables ();
          tree_unroll_loop (outer, unroll_factor, single_dom_exit (outer),
                            &desc);
          free_original_copy_tables ();
          fuse_loops (outer->inner);
          changed = true;
        }

      loop_nest.release ();
      free_dependence_relations (dependences);
      free_data_refs (datarefs);
    }

  if (changed)
    {
      scev_reset ();
      free_dominance_info (CDI_DOMINATORS);
      return TODO_cleanup_cfg;
    }
  return 0;
}

/* Pass boilerplate */

namespace {

const pass_data pass_data_loop_jam =
{
  GIMPLE_PASS, /* type */
  "unrolljam", /* name */
  OPTGROUP_LOOP, /* optinfo_flags */
  TV_LOOP_JAM, /* tv_id */
  PROP_cfg, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  0, /* todo_flags_finish */
};

class pass_loop_jam : public gimple_opt_pass
{
public:
  pass_loop_jam (gcc::context *ctxt)
    : gimple_opt_pass (pass_data_loop_jam, ctxt)
  {}

  /* opt_pass methods: */
  virtual bool gate (function *) { return flag_unroll_jam != 0; }
  virtual unsigned int execute (function *);

};

unsigned int
pass_loop_jam::execute (function *fun)
{
  if (number_of_loops (fun) <= 1)
    return 0;

  return tree_loop_unroll_and_jam ();
}

}

gimple_opt_pass *
make_pass_loop_jam (gcc::context *ctxt)
{
  return new pass_loop_jam (ctxt);
}