]>
Commit | Line | Data |
---|---|---|
60cfcb79 | 1 | /* Linear Loop transforms |
12c697cd | 2 | Copyright (C) 2003, 2004, 2005, 2007, 2008, 2009 |
3 | Free Software Foundation, Inc. | |
60cfcb79 | 4 | Contributed by Daniel Berlin <dberlin@dberlin.org>. |
5 | ||
6 | This file is part of GCC. | |
7 | ||
8 | GCC is free software; you can redistribute it and/or modify it under | |
9 | the terms of the GNU General Public License as published by the Free | |
8c4c00c1 | 10 | Software Foundation; either version 3, or (at your option) any later |
60cfcb79 | 11 | version. |
12 | ||
13 | GCC is distributed in the hope that it will be useful, but WITHOUT ANY | |
14 | WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
15 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | |
16 | for more details. | |
17 | ||
18 | You should have received a copy of the GNU General Public License | |
8c4c00c1 | 19 | along with GCC; see the file COPYING3. If not see |
20 | <http://www.gnu.org/licenses/>. */ | |
60cfcb79 | 21 | |
22 | ||
23 | #include "config.h" | |
24 | #include "system.h" | |
25 | #include "coretypes.h" | |
26 | #include "tm.h" | |
60cfcb79 | 27 | #include "ggc.h" |
28 | #include "tree.h" | |
29 | #include "target.h" | |
30 | ||
31 | #include "rtl.h" | |
32 | #include "basic-block.h" | |
33 | #include "diagnostic.h" | |
3026d558 | 34 | #include "obstack.h" |
60cfcb79 | 35 | #include "tree-flow.h" |
36 | #include "tree-dump.h" | |
37 | #include "timevar.h" | |
38 | #include "cfgloop.h" | |
39 | #include "expr.h" | |
40 | #include "optabs.h" | |
41 | #include "tree-chrec.h" | |
42 | #include "tree-data-ref.h" | |
43 | #include "tree-scalar-evolution.h" | |
44 | #include "tree-pass.h" | |
60cfcb79 | 45 | #include "lambda.h" |
46 | ||
47 | /* Linear loop transforms include any composition of interchange, | |
48 | scaling, skewing, and reversal. They are used to change the | |
49 | iteration order of loop nests in order to optimize data locality of | |
50 | traversals, or remove dependences that prevent | |
51 | parallelization/vectorization/etc. | |
52 | ||
53 | TODO: Determine reuse vectors/matrix and use it to determine optimal | |
54 | transform matrix for locality purposes. | |
55 | TODO: Completion of partial transforms. */ | |
56 | ||
331e13f9 | 57 | /* Gather statistics for loop interchange. LOOP is the loop being |
58 | considered. The first loop in the considered loop nest is | |
59 | FIRST_LOOP, and consequently, the index of the considered loop is | |
60 | obtained by loop_depth (LOOP) - loop_depth (FIRST_LOOP). | |
50caf588 | 61 | |
62 | Initializes: | |
63 | - DEPENDENCE_STEPS the sum of all the data dependence distances | |
331e13f9 | 64 | carried by loop LOOP, |
50caf588 | 65 | |
66 | - NB_DEPS_NOT_CARRIED_BY_LOOP the number of dependence relations | |
331e13f9 | 67 | for which the loop LOOP is not carrying any dependence, |
50caf588 | 68 | |
331e13f9 | 69 | - ACCESS_STRIDES the sum of all the strides in LOOP. |
50caf588 | 70 | |
71 | Example: for the following loop, | |
72 | ||
73 | | loop_1 runs 1335 times | |
74 | | loop_2 runs 1335 times | |
75 | | A[{{0, +, 1}_1, +, 1335}_2] | |
76 | | B[{{0, +, 1}_1, +, 1335}_2] | |
77 | | endloop_2 | |
78 | | A[{0, +, 1336}_1] | |
79 | | endloop_1 | |
80 | ||
81 | gather_interchange_stats (in loop_1) will return | |
82 | DEPENDENCE_STEPS = 3002 | |
83 | NB_DEPS_NOT_CARRIED_BY_LOOP = 5 | |
84 | ACCESS_STRIDES = 10694 | |
85 | ||
86 | gather_interchange_stats (in loop_2) will return | |
87 | DEPENDENCE_STEPS = 3000 | |
88 | NB_DEPS_NOT_CARRIED_BY_LOOP = 7 | |
89 | ACCESS_STRIDES = 8010 | |
84420feb | 90 | */ |
60cfcb79 | 91 | |
92 | static void | |
75a70cf9 | 93 | gather_interchange_stats (VEC (ddr_p, heap) *dependence_relations ATTRIBUTE_UNUSED, |
94 | VEC (data_reference_p, heap) *datarefs ATTRIBUTE_UNUSED, | |
95 | struct loop *loop ATTRIBUTE_UNUSED, | |
96 | struct loop *first_loop ATTRIBUTE_UNUSED, | |
97 | unsigned int *dependence_steps ATTRIBUTE_UNUSED, | |
98 | unsigned int *nb_deps_not_carried_by_loop ATTRIBUTE_UNUSED, | |
99 | double_int *access_strides ATTRIBUTE_UNUSED) | |
60cfcb79 | 100 | { |
1532ec98 | 101 | unsigned int i, j; |
41c7a324 | 102 | struct data_dependence_relation *ddr; |
103 | struct data_reference *dr; | |
60cfcb79 | 104 | |
50caf588 | 105 | *dependence_steps = 0; |
60cfcb79 | 106 | *nb_deps_not_carried_by_loop = 0; |
a75122be | 107 | *access_strides = double_int_zero; |
50caf588 | 108 | |
41c7a324 | 109 | for (i = 0; VEC_iterate (ddr_p, dependence_relations, i, ddr); i++) |
60cfcb79 | 110 | { |
84420feb | 111 | /* If we don't know anything about this dependence, or the distance |
112 | vector is NULL, or there is no dependence, then there is no reuse of | |
113 | data. */ | |
1532ec98 | 114 | if (DDR_ARE_DEPENDENT (ddr) == chrec_dont_know |
115 | || DDR_ARE_DEPENDENT (ddr) == chrec_known | |
116 | || DDR_NUM_DIST_VECTS (ddr) == 0) | |
84420feb | 117 | continue; |
60cfcb79 | 118 | |
1532ec98 | 119 | for (j = 0; j < DDR_NUM_DIST_VECTS (ddr); j++) |
120 | { | |
9e3536f4 | 121 | int dist = DDR_DIST_VECT (ddr, j)[loop_depth (loop) - loop_depth (first_loop)]; |
1532ec98 | 122 | |
123 | if (dist == 0) | |
124 | (*nb_deps_not_carried_by_loop) += 1; | |
125 | ||
126 | else if (dist < 0) | |
127 | (*dependence_steps) += -dist; | |
128 | ||
129 | else | |
130 | (*dependence_steps) += dist; | |
131 | } | |
50caf588 | 132 | } |
133 | ||
134 | /* Compute the access strides. */ | |
41c7a324 | 135 | for (i = 0; VEC_iterate (data_reference_p, datarefs, i, dr); i++) |
50caf588 | 136 | { |
137 | unsigned int it; | |
28d4f5a1 | 138 | tree ref = DR_REF (dr); |
75a70cf9 | 139 | gimple stmt = DR_STMT (dr); |
50caf588 | 140 | struct loop *stmt_loop = loop_containing_stmt (stmt); |
331e13f9 | 141 | struct loop *inner_loop = first_loop->inner; |
28d4f5a1 | 142 | |
331e13f9 | 143 | if (inner_loop != stmt_loop |
144 | && !flow_loop_nested_p (inner_loop, stmt_loop)) | |
50caf588 | 145 | continue; |
28d4f5a1 | 146 | |
147 | for (it = 0; it < DR_NUM_DIMENSIONS (dr); | |
148 | it++, ref = TREE_OPERAND (ref, 0)) | |
50caf588 | 149 | { |
b79b3386 | 150 | int num = am_vector_index_for_loop (DR_ACCESS_MATRIX (dr), loop->num); |
151 | int istride = AM_GET_ACCESS_MATRIX_ELEMENT (DR_ACCESS_MATRIX (dr), it, num); | |
28d4f5a1 | 152 | tree array_size = TYPE_SIZE (TREE_TYPE (ref)); |
a75122be | 153 | double_int dstride; |
28d4f5a1 | 154 | |
b79b3386 | 155 | if (array_size == NULL_TREE |
28d4f5a1 | 156 | || TREE_CODE (array_size) != INTEGER_CST) |
50caf588 | 157 | continue; |
28d4f5a1 | 158 | |
a75122be | 159 | dstride = double_int_mul (tree_to_double_int (array_size), |
b79b3386 | 160 | shwi_to_double_int (istride)); |
a75122be | 161 | (*access_strides) = double_int_add (*access_strides, dstride); |
50caf588 | 162 | } |
60cfcb79 | 163 | } |
164 | } | |
165 | ||
84420feb | 166 | /* Attempt to apply interchange transformations to TRANS to maximize the |
167 | spatial and temporal locality of the loop. | |
60cfcb79 | 168 | Returns the new transform matrix. The smaller the reuse vector |
50caf588 | 169 | distances in the inner loops, the fewer the cache misses. |
170 | FIRST_LOOP is the loop->num of the first loop in the analyzed loop | |
171 | nest. */ | |
172 | ||
60cfcb79 | 173 | |
174 | static lambda_trans_matrix | |
175 | try_interchange_loops (lambda_trans_matrix trans, | |
176 | unsigned int depth, | |
41c7a324 | 177 | VEC (ddr_p, heap) *dependence_relations, |
178 | VEC (data_reference_p, heap) *datarefs, | |
331e13f9 | 179 | struct loop *first_loop) |
60cfcb79 | 180 | { |
2fcf1fbb | 181 | bool res; |
331e13f9 | 182 | struct loop *loop_i; |
183 | struct loop *loop_j; | |
50caf588 | 184 | unsigned int dependence_steps_i, dependence_steps_j; |
a75122be | 185 | double_int access_strides_i, access_strides_j; |
2fcf1fbb | 186 | double_int small, large, nb_iter; |
187 | double_int l1_cache_size, l2_cache_size; | |
188 | int cmp; | |
60cfcb79 | 189 | unsigned int nb_deps_not_carried_by_i, nb_deps_not_carried_by_j; |
190 | struct data_dependence_relation *ddr; | |
191 | ||
e0cf06e1 | 192 | if (VEC_length (ddr_p, dependence_relations) == 0) |
193 | return trans; | |
194 | ||
60cfcb79 | 195 | /* When there is an unknown relation in the dependence_relations, we |
196 | know that it is no worth looking at this loop nest: give up. */ | |
41c7a324 | 197 | ddr = VEC_index (ddr_p, dependence_relations, 0); |
60cfcb79 | 198 | if (ddr == NULL || DDR_ARE_DEPENDENT (ddr) == chrec_dont_know) |
199 | return trans; | |
2fcf1fbb | 200 | |
201 | l1_cache_size = uhwi_to_double_int (L1_CACHE_SIZE * 1024); | |
202 | l2_cache_size = uhwi_to_double_int (L2_CACHE_SIZE * 1024); | |
203 | ||
60cfcb79 | 204 | /* LOOP_I is always the outer loop. */ |
331e13f9 | 205 | for (loop_j = first_loop->inner; |
206 | loop_j; | |
207 | loop_j = loop_j->inner) | |
208 | for (loop_i = first_loop; | |
9e3536f4 | 209 | loop_depth (loop_i) < loop_depth (loop_j); |
331e13f9 | 210 | loop_i = loop_i->inner) |
60cfcb79 | 211 | { |
50caf588 | 212 | gather_interchange_stats (dependence_relations, datarefs, |
213 | loop_i, first_loop, | |
214 | &dependence_steps_i, | |
215 | &nb_deps_not_carried_by_i, | |
216 | &access_strides_i); | |
217 | gather_interchange_stats (dependence_relations, datarefs, | |
218 | loop_j, first_loop, | |
219 | &dependence_steps_j, | |
220 | &nb_deps_not_carried_by_j, | |
221 | &access_strides_j); | |
60cfcb79 | 222 | |
223 | /* Heuristics for loop interchange profitability: | |
50caf588 | 224 | |
2fcf1fbb | 225 | 0. Don't transform if the smallest stride is larger than |
226 | the L2 cache, or if the largest stride multiplied by the | |
227 | number of iterations is smaller than the L1 cache. | |
228 | ||
50caf588 | 229 | 1. (spatial locality) Inner loops should have smallest |
230 | dependence steps. | |
231 | ||
232 | 2. (spatial locality) Inner loops should contain more | |
233 | dependence relations not carried by the loop. | |
234 | ||
2fcf1fbb | 235 | 3. (temporal locality) Inner loops should have smallest |
50caf588 | 236 | array access strides. |
60cfcb79 | 237 | */ |
2fcf1fbb | 238 | |
239 | cmp = double_int_ucmp (access_strides_i, access_strides_j); | |
240 | small = cmp < 0 ? access_strides_i : access_strides_j; | |
241 | large = cmp < 0 ? access_strides_j : access_strides_i; | |
242 | ||
243 | if (double_int_ucmp (small, l2_cache_size) > 0) | |
244 | continue; | |
245 | ||
246 | res = cmp < 0 ? | |
247 | estimated_loop_iterations (loop_j, false, &nb_iter): | |
248 | estimated_loop_iterations (loop_i, false, &nb_iter); | |
249 | large = double_int_mul (large, nb_iter); | |
250 | ||
251 | if (res && double_int_ucmp (large, l1_cache_size) < 0) | |
252 | continue; | |
253 | ||
50caf588 | 254 | if (dependence_steps_i < dependence_steps_j |
255 | || nb_deps_not_carried_by_i > nb_deps_not_carried_by_j | |
2fcf1fbb | 256 | || cmp < 0) |
60cfcb79 | 257 | { |
331e13f9 | 258 | lambda_matrix_row_exchange (LTM_MATRIX (trans), |
9e3536f4 | 259 | loop_depth (loop_i) - loop_depth (first_loop), |
260 | loop_depth (loop_j) - loop_depth (first_loop)); | |
60cfcb79 | 261 | /* Validate the resulting matrix. When the transformation |
50caf588 | 262 | is not valid, reverse to the previous transformation. */ |
60cfcb79 | 263 | if (!lambda_transform_legal_p (trans, depth, dependence_relations)) |
331e13f9 | 264 | lambda_matrix_row_exchange (LTM_MATRIX (trans), |
9e3536f4 | 265 | loop_depth (loop_i) - loop_depth (first_loop), |
266 | loop_depth (loop_j) - loop_depth (first_loop)); | |
60cfcb79 | 267 | } |
268 | } | |
50caf588 | 269 | |
60cfcb79 | 270 | return trans; |
271 | } | |
272 | ||
538dd0ca | 273 | /* Return the number of nested loops in LOOP_NEST, or 0 if the loops |
274 | are not perfectly nested. */ | |
275 | ||
255b6be7 | 276 | unsigned int |
538dd0ca | 277 | perfect_loop_nest_depth (struct loop *loop_nest) |
278 | { | |
279 | struct loop *temp; | |
280 | unsigned int depth = 1; | |
281 | ||
282 | /* If it's not a loop nest, we don't want it. We also don't handle | |
283 | sibling loops properly, which are loops of the following form: | |
284 | ||
285 | | for (i = 0; i < 50; i++) | |
286 | | { | |
287 | | for (j = 0; j < 50; j++) | |
288 | | { | |
289 | | ... | |
290 | | } | |
291 | | for (j = 0; j < 50; j++) | |
292 | | { | |
293 | | ... | |
294 | | } | |
295 | | } | |
296 | */ | |
297 | ||
298 | if (!loop_nest->inner || !single_exit (loop_nest)) | |
299 | return 0; | |
300 | ||
301 | for (temp = loop_nest->inner; temp; temp = temp->inner) | |
302 | { | |
303 | /* If we have a sibling loop or multiple exit edges, jump ship. */ | |
304 | if (temp->next || !single_exit (temp)) | |
305 | return 0; | |
306 | ||
307 | depth++; | |
308 | } | |
309 | ||
310 | return depth; | |
311 | } | |
312 | ||
7194de72 | 313 | /* Perform a set of linear transforms on loops. */ |
60cfcb79 | 314 | |
315 | void | |
7194de72 | 316 | linear_transform_loops (void) |
60cfcb79 | 317 | { |
dfe210e4 | 318 | bool modified = false; |
17519ba0 | 319 | loop_iterator li; |
81bd7518 | 320 | VEC(tree,heap) *oldivs = NULL; |
321 | VEC(tree,heap) *invariants = NULL; | |
b79b3386 | 322 | VEC(tree,heap) *lambda_parameters = NULL; |
75a70cf9 | 323 | VEC(gimple,heap) *remove_ivs = VEC_alloc (gimple, heap, 3); |
17519ba0 | 324 | struct loop *loop_nest; |
75a70cf9 | 325 | gimple oldiv_stmt; |
5aaf9e39 | 326 | unsigned i; |
3026d558 | 327 | |
17519ba0 | 328 | FOR_EACH_LOOP (li, loop_nest, 0) |
60cfcb79 | 329 | { |
330 | unsigned int depth = 0; | |
41c7a324 | 331 | VEC (ddr_p, heap) *dependence_relations; |
332 | VEC (data_reference_p, heap) *datarefs; | |
b79b3386 | 333 | |
60cfcb79 | 334 | lambda_loopnest before, after; |
335 | lambda_trans_matrix trans; | |
3026d558 | 336 | struct obstack lambda_obstack; |
2e54c85d | 337 | struct loop *loop; |
12c697cd | 338 | VEC(loop_p,heap) *nest; |
3026d558 | 339 | |
538dd0ca | 340 | depth = perfect_loop_nest_depth (loop_nest); |
341 | if (depth == 0) | |
60cfcb79 | 342 | continue; |
538dd0ca | 343 | |
12c697cd | 344 | nest = VEC_alloc (loop_p, heap, 3); |
2e54c85d | 345 | for (loop = loop_nest; loop; loop = loop->inner) |
346 | VEC_safe_push (loop_p, heap, nest, loop); | |
347 | ||
19af51e2 | 348 | gcc_obstack_init (&lambda_obstack); |
046bfc77 | 349 | VEC_truncate (tree, oldivs, 0); |
350 | VEC_truncate (tree, invariants, 0); | |
b79b3386 | 351 | VEC_truncate (tree, lambda_parameters, 0); |
60cfcb79 | 352 | |
41c7a324 | 353 | datarefs = VEC_alloc (data_reference_p, heap, 10); |
354 | dependence_relations = VEC_alloc (ddr_p, heap, 10 * 10); | |
b79b3386 | 355 | if (!compute_data_dependences_for_loop (loop_nest, true, &datarefs, |
356 | &dependence_relations)) | |
19af51e2 | 357 | goto free_and_continue; |
b79b3386 | 358 | |
359 | lambda_collect_parameters (datarefs, &lambda_parameters); | |
2e54c85d | 360 | if (!lambda_compute_access_matrices (datarefs, lambda_parameters, nest)) |
19af51e2 | 361 | goto free_and_continue; |
41c7a324 | 362 | |
60cfcb79 | 363 | if (dump_file && (dump_flags & TDF_DETAILS)) |
41c7a324 | 364 | dump_ddrs (dump_file, dependence_relations); |
60cfcb79 | 365 | |
60cfcb79 | 366 | /* Build the transformation matrix. */ |
367 | trans = lambda_trans_matrix_new (depth, depth); | |
368 | lambda_matrix_id (LTM_MATRIX (trans), depth); | |
50caf588 | 369 | trans = try_interchange_loops (trans, depth, dependence_relations, |
331e13f9 | 370 | datarefs, loop_nest); |
50caf588 | 371 | |
372 | if (lambda_trans_matrix_id_p (trans)) | |
373 | { | |
374 | if (dump_file) | |
375 | fprintf (dump_file, "Won't transform loop. Optimal transform is the identity transform\n"); | |
62761183 | 376 | goto free_and_continue; |
50caf588 | 377 | } |
60cfcb79 | 378 | |
379 | /* Check whether the transformation is legal. */ | |
380 | if (!lambda_transform_legal_p (trans, depth, dependence_relations)) | |
381 | { | |
382 | if (dump_file) | |
383 | fprintf (dump_file, "Can't transform loop, transform is illegal:\n"); | |
62761183 | 384 | goto free_and_continue; |
60cfcb79 | 385 | } |
41c7a324 | 386 | |
7194de72 | 387 | before = gcc_loopnest_to_lambda_loopnest (loop_nest, &oldivs, |
3026d558 | 388 | &invariants, &lambda_obstack); |
41c7a324 | 389 | |
60cfcb79 | 390 | if (!before) |
62761183 | 391 | goto free_and_continue; |
392 | ||
60cfcb79 | 393 | if (dump_file) |
394 | { | |
395 | fprintf (dump_file, "Before:\n"); | |
396 | print_lambda_loopnest (dump_file, before, 'i'); | |
397 | } | |
398 | ||
3026d558 | 399 | after = lambda_loopnest_transform (before, trans, &lambda_obstack); |
41c7a324 | 400 | |
60cfcb79 | 401 | if (dump_file) |
402 | { | |
403 | fprintf (dump_file, "After:\n"); | |
404 | print_lambda_loopnest (dump_file, after, 'u'); | |
405 | } | |
41c7a324 | 406 | |
60cfcb79 | 407 | lambda_loopnest_to_gcc_loopnest (loop_nest, oldivs, invariants, |
5aaf9e39 | 408 | &remove_ivs, |
3026d558 | 409 | after, trans, &lambda_obstack); |
dfe210e4 | 410 | modified = true; |
41c7a324 | 411 | |
557ef5d8 | 412 | if (dump_file) |
413 | fprintf (dump_file, "Successfully transformed loop.\n"); | |
41c7a324 | 414 | |
62761183 | 415 | free_and_continue: |
3026d558 | 416 | obstack_free (&lambda_obstack, NULL); |
60cfcb79 | 417 | free_dependence_relations (dependence_relations); |
418 | free_data_refs (datarefs); | |
2e54c85d | 419 | VEC_free (loop_p, heap, nest); |
60cfcb79 | 420 | } |
41c7a324 | 421 | |
75a70cf9 | 422 | for (i = 0; VEC_iterate (gimple, remove_ivs, i, oldiv_stmt); i++) |
5aaf9e39 | 423 | remove_iv (oldiv_stmt); |
424 | ||
81bd7518 | 425 | VEC_free (tree, heap, oldivs); |
426 | VEC_free (tree, heap, invariants); | |
75a70cf9 | 427 | VEC_free (gimple, heap, remove_ivs); |
557ef5d8 | 428 | scev_reset (); |
dfe210e4 | 429 | |
430 | if (modified) | |
431 | rewrite_into_loop_closed_ssa (NULL, TODO_update_ssa_full_phi); | |
60cfcb79 | 432 | } |