]>
Commit | Line | Data |
---|---|---|
c836de3f | 1 | /* Loop unrolling. |
d353bf18 | 2 | Copyright (C) 2002-2015 Free Software Foundation, Inc. |
ad5201d0 | 3 | |
4 | This file is part of GCC. | |
5 | ||
6 | GCC is free software; you can redistribute it and/or modify it under | |
7 | the terms of the GNU General Public License as published by the Free | |
8c4c00c1 | 8 | Software Foundation; either version 3, or (at your option) any later |
ad5201d0 | 9 | version. |
10 | ||
11 | GCC is distributed in the hope that it will be useful, but WITHOUT ANY | |
12 | WARRANTY; without even the implied warranty of MERCHANTABILITY or | |
13 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License | |
14 | for more details. | |
15 | ||
16 | You should have received a copy of the GNU General Public License | |
8c4c00c1 | 17 | along with GCC; see the file COPYING3. If not see |
18 | <http://www.gnu.org/licenses/>. */ | |
ad5201d0 | 19 | |
20 | #include "config.h" | |
21 | #include "system.h" | |
22 | #include "coretypes.h" | |
23 | #include "tm.h" | |
24 | #include "rtl.h" | |
b20a8bb4 | 25 | #include "hash-set.h" |
26 | #include "machmode.h" | |
27 | #include "vec.h" | |
28 | #include "double-int.h" | |
29 | #include "input.h" | |
30 | #include "alias.h" | |
31 | #include "symtab.h" | |
32 | #include "wide-int.h" | |
33 | #include "inchash.h" | |
41a8aa41 | 34 | #include "tree.h" |
ad5201d0 | 35 | #include "hard-reg-set.h" |
42fe97ed | 36 | #include "obstack.h" |
886c1262 | 37 | #include "profile.h" |
94ea8568 | 38 | #include "predict.h" |
94ea8568 | 39 | #include "function.h" |
40 | #include "dominance.h" | |
41 | #include "cfg.h" | |
42 | #include "cfgrtl.h" | |
ad5201d0 | 43 | #include "basic-block.h" |
44 | #include "cfgloop.h" | |
ad5201d0 | 45 | #include "params.h" |
34517c64 | 46 | #include "insn-codes.h" |
47 | #include "optabs.h" | |
d53441c8 | 48 | #include "hashtab.h" |
49 | #include "flags.h" | |
50 | #include "statistics.h" | |
51 | #include "real.h" | |
52 | #include "fixed-value.h" | |
53 | #include "insn-config.h" | |
54 | #include "expmed.h" | |
55 | #include "dojump.h" | |
56 | #include "explow.h" | |
57 | #include "calls.h" | |
58 | #include "emit-rtl.h" | |
59 | #include "varasm.h" | |
60 | #include "stmt.h" | |
ad5201d0 | 61 | #include "expr.h" |
d9dd21a8 | 62 | #include "hash-table.h" |
48e1416a | 63 | #include "recog.h" |
9ccaa774 | 64 | #include "target.h" |
b9ed1410 | 65 | #include "dumpfile.h" |
ad5201d0 | 66 | |
c836de3f | 67 | /* This pass performs loop unrolling. We only perform this |
68 | optimization on innermost loops (with single exception) because | |
ad5201d0 | 69 | the impact on performance is greatest here, and we want to avoid |
70 | unnecessary code size growth. The gain is caused by greater sequentiality | |
917bbcab | 71 | of code, better code to optimize for further passes and in some cases |
ad5201d0 | 72 | by fewer testings of exit conditions. The main problem is code growth, |
73 | that impacts performance negatively due to effect of caches. | |
74 | ||
75 | What we do: | |
76 | ||
ad5201d0 | 77 | -- unrolling of loops that roll constant times; this is almost always |
78 | win, as we get rid of exit condition tests. | |
79 | -- unrolling of loops that roll number of times that we can compute | |
80 | in runtime; we also get rid of exit condition tests here, but there | |
81 | is the extra expense for calculating the number of iterations | |
82 | -- simple unrolling of remaining loops; this is performed only if we | |
83 | are asked to, as the gain is questionable in this case and often | |
84 | it may even slow down the code | |
85 | For more detailed descriptions of each of those, see comments at | |
86 | appropriate function below. | |
87 | ||
88 | There is a lot of parameters (defined and described in params.def) that | |
c836de3f | 89 | control how much we unroll. |
ad5201d0 | 90 | |
91 | ??? A great problem is that we don't have a good way how to determine | |
92 | how many times we should unroll the loop; the experiments I have made | |
93 | showed that this choice may affect performance in order of several %. | |
94 | */ | |
95 | ||
a9989fb4 | 96 | /* Information about induction variables to split. */ |
97 | ||
98 | struct iv_to_split | |
99 | { | |
222dc1d3 | 100 | rtx_insn *insn; /* The insn in that the induction variable occurs. */ |
ef770610 | 101 | rtx orig_var; /* The variable (register) for the IV before split. */ |
a9989fb4 | 102 | rtx base_var; /* The variable on that the values in the further |
103 | iterations are based. */ | |
104 | rtx step; /* Step of the induction variable. */ | |
b635a1a3 | 105 | struct iv_to_split *next; /* Next entry in walking order. */ |
a9989fb4 | 106 | }; |
107 | ||
375bb675 | 108 | /* Information about accumulators to expand. */ |
109 | ||
110 | struct var_to_expand | |
111 | { | |
222dc1d3 | 112 | rtx_insn *insn; /* The insn in that the variable expansion occurs. */ |
dac49aa5 | 113 | rtx reg; /* The accumulator which is expanded. */ |
f1f41a6c | 114 | vec<rtx> var_expansions; /* The copies of the accumulator which is expanded. */ |
b635a1a3 | 115 | struct var_to_expand *next; /* Next entry in walking order. */ |
48e1416a | 116 | enum rtx_code op; /* The type of the accumulation - addition, subtraction |
375bb675 | 117 | or multiplication. */ |
118 | int expansion_count; /* Count the number of expansions generated so far. */ | |
119 | int reuse_expansion; /* The expansion we intend to reuse to expand | |
48e1416a | 120 | the accumulator. If REUSE_EXPANSION is 0 reuse |
121 | the original accumulator. Else use | |
375bb675 | 122 | var_expansions[REUSE_EXPANSION - 1]. */ |
123 | }; | |
124 | ||
d9dd21a8 | 125 | /* Hashtable helper for iv_to_split. */ |
126 | ||
127 | struct iv_split_hasher : typed_free_remove <iv_to_split> | |
128 | { | |
9969c043 | 129 | typedef iv_to_split *value_type; |
130 | typedef iv_to_split *compare_type; | |
131 | static inline hashval_t hash (const iv_to_split *); | |
132 | static inline bool equal (const iv_to_split *, const iv_to_split *); | |
d9dd21a8 | 133 | }; |
134 | ||
135 | ||
136 | /* A hash function for information about insns to split. */ | |
137 | ||
138 | inline hashval_t | |
9969c043 | 139 | iv_split_hasher::hash (const iv_to_split *ivts) |
d9dd21a8 | 140 | { |
141 | return (hashval_t) INSN_UID (ivts->insn); | |
142 | } | |
143 | ||
144 | /* An equality functions for information about insns to split. */ | |
145 | ||
146 | inline bool | |
9969c043 | 147 | iv_split_hasher::equal (const iv_to_split *i1, const iv_to_split *i2) |
d9dd21a8 | 148 | { |
149 | return i1->insn == i2->insn; | |
150 | } | |
151 | ||
152 | /* Hashtable helper for iv_to_split. */ | |
153 | ||
154 | struct var_expand_hasher : typed_free_remove <var_to_expand> | |
155 | { | |
9969c043 | 156 | typedef var_to_expand *value_type; |
157 | typedef var_to_expand *compare_type; | |
158 | static inline hashval_t hash (const var_to_expand *); | |
159 | static inline bool equal (const var_to_expand *, const var_to_expand *); | |
d9dd21a8 | 160 | }; |
161 | ||
162 | /* Return a hash for VES. */ | |
163 | ||
164 | inline hashval_t | |
9969c043 | 165 | var_expand_hasher::hash (const var_to_expand *ves) |
d9dd21a8 | 166 | { |
167 | return (hashval_t) INSN_UID (ves->insn); | |
168 | } | |
169 | ||
170 | /* Return true if I1 and I2 refer to the same instruction. */ | |
171 | ||
172 | inline bool | |
9969c043 | 173 | var_expand_hasher::equal (const var_to_expand *i1, const var_to_expand *i2) |
d9dd21a8 | 174 | { |
175 | return i1->insn == i2->insn; | |
176 | } | |
177 | ||
375bb675 | 178 | /* Information about optimization applied in |
179 | the unrolled loop. */ | |
180 | ||
181 | struct opt_info | |
a9989fb4 | 182 | { |
c1f445d2 | 183 | hash_table<iv_split_hasher> *insns_to_split; /* A hashtable of insns to |
d9dd21a8 | 184 | split. */ |
b635a1a3 | 185 | struct iv_to_split *iv_to_split_head; /* The first iv to split. */ |
186 | struct iv_to_split **iv_to_split_tail; /* Pointer to the tail of the list. */ | |
c1f445d2 | 187 | hash_table<var_expand_hasher> *insns_with_var_to_expand; /* A hashtable of |
d9dd21a8 | 188 | insns with accumulators to expand. */ |
b635a1a3 | 189 | struct var_to_expand *var_to_expand_head; /* The first var to expand. */ |
190 | struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the list. */ | |
375bb675 | 191 | unsigned first_new_block; /* The first basic block that was |
192 | duplicated. */ | |
193 | basic_block loop_exit; /* The loop exit basic block. */ | |
194 | basic_block loop_preheader; /* The loop preheader basic block. */ | |
a9989fb4 | 195 | }; |
196 | ||
0051c76a | 197 | static void decide_unroll_stupid (struct loop *, int); |
198 | static void decide_unroll_constant_iterations (struct loop *, int); | |
199 | static void decide_unroll_runtime_iterations (struct loop *, int); | |
7194de72 | 200 | static void unroll_loop_stupid (struct loop *); |
c836de3f | 201 | static void decide_unrolling (int); |
7194de72 | 202 | static void unroll_loop_constant_iterations (struct loop *); |
203 | static void unroll_loop_runtime_iterations (struct loop *); | |
375bb675 | 204 | static struct opt_info *analyze_insns_in_loop (struct loop *); |
205 | static void opt_info_start_duplication (struct opt_info *); | |
206 | static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool); | |
207 | static void free_opt_info (struct opt_info *); | |
222dc1d3 | 208 | static struct var_to_expand *analyze_insn_to_expand_var (struct loop*, rtx_insn *); |
3b3940d7 | 209 | static bool referenced_in_one_insn_in_loop_p (struct loop *, rtx, int *); |
3eeb4f9a | 210 | static struct iv_to_split *analyze_iv_to_split_insn (rtx_insn *); |
222dc1d3 | 211 | static void expand_var_during_unrolling (struct var_to_expand *, rtx_insn *); |
b635a1a3 | 212 | static void insert_var_expansion_initialization (struct var_to_expand *, |
213 | basic_block); | |
214 | static void combine_var_copies_in_loop_exit (struct var_to_expand *, | |
215 | basic_block); | |
375bb675 | 216 | static rtx get_expansion (struct var_to_expand *); |
ad5201d0 | 217 | |
c836de3f | 218 | /* Emit a message summarizing the unroll that will be |
f55775aa | 219 | performed for LOOP, along with the loop's location LOCUS, if |
220 | appropriate given the dump or -fopt-info settings. */ | |
221 | ||
222 | static void | |
c836de3f | 223 | report_unroll (struct loop *loop, location_t locus) |
f55775aa | 224 | { |
f55775aa | 225 | int report_flags = MSG_OPTIMIZED_LOCATIONS | TDF_RTL | TDF_DETAILS; |
226 | ||
830b796a | 227 | if (loop->lpt_decision.decision == LPT_NONE) |
228 | return; | |
229 | ||
f55775aa | 230 | if (!dump_enabled_p ()) |
231 | return; | |
232 | ||
c836de3f | 233 | dump_printf_loc (report_flags, locus, |
234 | "loop unrolled %d times", | |
235 | loop->lpt_decision.times); | |
f55775aa | 236 | if (profile_info) |
237 | dump_printf (report_flags, | |
c836de3f | 238 | " (header execution count %d)", |
f55775aa | 239 | (int)loop->header->count); |
f55775aa | 240 | |
241 | dump_printf (report_flags, "\n"); | |
242 | } | |
243 | ||
c836de3f | 244 | /* Decide whether unroll loops and how much. */ |
ad5201d0 | 245 | static void |
c836de3f | 246 | decide_unrolling (int flags) |
ad5201d0 | 247 | { |
17519ba0 | 248 | struct loop *loop; |
ad5201d0 | 249 | |
250 | /* Scan the loops, inner ones first. */ | |
f21d4d00 | 251 | FOR_EACH_LOOP (loop, LI_FROM_INNERMOST) |
ad5201d0 | 252 | { |
ad5201d0 | 253 | loop->lpt_decision.decision = LPT_NONE; |
f55775aa | 254 | location_t locus = get_loop_location (loop); |
ad5201d0 | 255 | |
f55775aa | 256 | if (dump_enabled_p ()) |
257 | dump_printf_loc (TDF_RTL, locus, | |
258 | ";; *** Considering loop %d at BB %d for " | |
c836de3f | 259 | "unrolling ***\n", |
f55775aa | 260 | loop->num, loop->header->index); |
ad5201d0 | 261 | |
262 | /* Do not peel cold areas. */ | |
0bfd8d5c | 263 | if (optimize_loop_for_size_p (loop)) |
ad5201d0 | 264 | { |
450d042a | 265 | if (dump_file) |
266 | fprintf (dump_file, ";; Not considering loop, cold area\n"); | |
ad5201d0 | 267 | continue; |
268 | } | |
269 | ||
270 | /* Can the loop be manipulated? */ | |
271 | if (!can_duplicate_loop_p (loop)) | |
272 | { | |
450d042a | 273 | if (dump_file) |
274 | fprintf (dump_file, | |
ad5201d0 | 275 | ";; Not considering loop, cannot duplicate\n"); |
ad5201d0 | 276 | continue; |
277 | } | |
278 | ||
279 | /* Skip non-innermost loops. */ | |
280 | if (loop->inner) | |
281 | { | |
450d042a | 282 | if (dump_file) |
283 | fprintf (dump_file, ";; Not considering loop, is not innermost\n"); | |
ad5201d0 | 284 | continue; |
285 | } | |
286 | ||
287 | loop->ninsns = num_loop_insns (loop); | |
288 | loop->av_ninsns = average_num_loop_insns (loop); | |
289 | ||
290 | /* Try transformations one by one in decreasing order of | |
291 | priority. */ | |
292 | ||
0051c76a | 293 | decide_unroll_constant_iterations (loop, flags); |
ad5201d0 | 294 | if (loop->lpt_decision.decision == LPT_NONE) |
0051c76a | 295 | decide_unroll_runtime_iterations (loop, flags); |
ad5201d0 | 296 | if (loop->lpt_decision.decision == LPT_NONE) |
0051c76a | 297 | decide_unroll_stupid (loop, flags); |
ad5201d0 | 298 | |
c836de3f | 299 | report_unroll (loop, locus); |
ad5201d0 | 300 | } |
ad5201d0 | 301 | } |
302 | ||
c836de3f | 303 | /* Unroll LOOPS. */ |
304 | void | |
305 | unroll_loops (int flags) | |
ad5201d0 | 306 | { |
c836de3f | 307 | struct loop *loop; |
308 | bool changed = false; | |
ad5201d0 | 309 | |
c836de3f | 310 | /* Now decide rest of unrolling. */ |
311 | decide_unrolling (flags); | |
ad5201d0 | 312 | |
c836de3f | 313 | /* Scan the loops, inner ones first. */ |
314 | FOR_EACH_LOOP (loop, LI_FROM_INNERMOST) | |
ad5201d0 | 315 | { |
c836de3f | 316 | /* And perform the appropriate transformations. */ |
317 | switch (loop->lpt_decision.decision) | |
3ad4992f | 318 | { |
c836de3f | 319 | case LPT_UNROLL_CONSTANT: |
320 | unroll_loop_constant_iterations (loop); | |
321 | changed = true; | |
322 | break; | |
323 | case LPT_UNROLL_RUNTIME: | |
324 | unroll_loop_runtime_iterations (loop); | |
325 | changed = true; | |
326 | break; | |
327 | case LPT_UNROLL_STUPID: | |
328 | unroll_loop_stupid (loop); | |
329 | changed = true; | |
330 | break; | |
331 | case LPT_NONE: | |
332 | break; | |
333 | default: | |
334 | gcc_unreachable (); | |
ad5201d0 | 335 | } |
ad5201d0 | 336 | } |
337 | ||
c836de3f | 338 | if (changed) |
339 | { | |
340 | calculate_dominance_info (CDI_DOMINATORS); | |
341 | fix_loop_structure (NULL); | |
342 | } | |
3ad4992f | 343 | |
c836de3f | 344 | iv_analysis_done (); |
345 | } | |
ad5201d0 | 346 | |
c836de3f | 347 | /* Check whether exit of the LOOP is at the end of loop body. */ |
3ad4992f | 348 | |
c836de3f | 349 | static bool |
350 | loop_exit_at_end_p (struct loop *loop) | |
ad5201d0 | 351 | { |
f9cce2dc | 352 | struct niter_desc *desc = get_simple_loop_desc (loop); |
c836de3f | 353 | rtx_insn *insn; |
a5414ff5 | 354 | |
c836de3f | 355 | /* We should never have conditional in latch block. */ |
356 | gcc_assert (desc->in_edge->dest != loop->header); | |
48e1416a | 357 | |
c836de3f | 358 | if (desc->in_edge->dest != loop->latch) |
359 | return false; | |
a9989fb4 | 360 | |
c836de3f | 361 | /* Check that the latch is empty. */ |
362 | FOR_BB_INSNS (loop->latch, insn) | |
363 | { | |
364 | if (INSN_P (insn) && active_insn_p (insn)) | |
365 | return false; | |
a5414ff5 | 366 | } |
ad5201d0 | 367 | |
c836de3f | 368 | return true; |
ad5201d0 | 369 | } |
370 | ||
450d042a | 371 | /* Decide whether to unroll LOOP iterating constant number of times |
372 | and how much. */ | |
f9cce2dc | 373 | |
ad5201d0 | 374 | static void |
0051c76a | 375 | decide_unroll_constant_iterations (struct loop *loop, int flags) |
ad5201d0 | 376 | { |
f9cce2dc | 377 | unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i; |
378 | struct niter_desc *desc; | |
5de9d3ed | 379 | widest_int iterations; |
ad5201d0 | 380 | |
381 | if (!(flags & UAP_UNROLL)) | |
382 | { | |
383 | /* We were not asked to, just return back silently. */ | |
384 | return; | |
385 | } | |
386 | ||
450d042a | 387 | if (dump_file) |
388 | fprintf (dump_file, | |
389 | "\n;; Considering unrolling loop with constant " | |
390 | "number of iterations\n"); | |
ad5201d0 | 391 | |
392 | /* nunroll = total number of copies of the original loop body in | |
393 | unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */ | |
394 | nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns; | |
450d042a | 395 | nunroll_by_av |
396 | = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns; | |
ad5201d0 | 397 | if (nunroll > nunroll_by_av) |
398 | nunroll = nunroll_by_av; | |
399 | if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES)) | |
400 | nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES); | |
401 | ||
e7f33cf8 | 402 | if (targetm.loop_unroll_adjust) |
403 | nunroll = targetm.loop_unroll_adjust (nunroll, loop); | |
404 | ||
ad5201d0 | 405 | /* Skip big loops. */ |
406 | if (nunroll <= 1) | |
407 | { | |
450d042a | 408 | if (dump_file) |
409 | fprintf (dump_file, ";; Not considering loop, is too big\n"); | |
ad5201d0 | 410 | return; |
411 | } | |
412 | ||
413 | /* Check for simple loops. */ | |
f9cce2dc | 414 | desc = get_simple_loop_desc (loop); |
ad5201d0 | 415 | |
416 | /* Check number of iterations. */ | |
f9cce2dc | 417 | if (!desc->simple_p || !desc->const_iter || desc->assumptions) |
ad5201d0 | 418 | { |
450d042a | 419 | if (dump_file) |
420 | fprintf (dump_file, | |
421 | ";; Unable to prove that the loop iterates constant times\n"); | |
ad5201d0 | 422 | return; |
423 | } | |
424 | ||
c8fbcf58 | 425 | /* Check whether the loop rolls enough to consider. |
426 | Consult also loop bounds and profile; in the case the loop has more | |
427 | than one exit it may well loop less than determined maximal number | |
428 | of iterations. */ | |
429 | if (desc->niter < 2 * nunroll | |
f86b328b | 430 | || ((get_estimated_loop_iterations (loop, &iterations) |
431 | || get_max_loop_iterations (loop, &iterations)) | |
796b6678 | 432 | && wi::ltu_p (iterations, 2 * nunroll))) |
ad5201d0 | 433 | { |
450d042a | 434 | if (dump_file) |
435 | fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n"); | |
ad5201d0 | 436 | return; |
437 | } | |
438 | ||
439 | /* Success; now compute number of iterations to unroll. We alter | |
440 | nunroll so that as few as possible copies of loop body are | |
d01481af | 441 | necessary, while still not decreasing the number of unrollings |
ad5201d0 | 442 | too much (at most by 1). */ |
443 | best_copies = 2 * nunroll + 10; | |
444 | ||
445 | i = 2 * nunroll + 2; | |
f9cce2dc | 446 | if (i - 1 >= desc->niter) |
447 | i = desc->niter - 2; | |
ad5201d0 | 448 | |
449 | for (; i >= nunroll - 1; i--) | |
450 | { | |
f9cce2dc | 451 | unsigned exit_mod = desc->niter % (i + 1); |
ad5201d0 | 452 | |
f9cce2dc | 453 | if (!loop_exit_at_end_p (loop)) |
ad5201d0 | 454 | n_copies = exit_mod + i + 1; |
f9cce2dc | 455 | else if (exit_mod != (unsigned) i |
456 | || desc->noloop_assumptions != NULL_RTX) | |
ad5201d0 | 457 | n_copies = exit_mod + i + 2; |
458 | else | |
459 | n_copies = i + 1; | |
460 | ||
461 | if (n_copies < best_copies) | |
462 | { | |
463 | best_copies = n_copies; | |
464 | best_unroll = i; | |
465 | } | |
466 | } | |
467 | ||
ad5201d0 | 468 | loop->lpt_decision.decision = LPT_UNROLL_CONSTANT; |
469 | loop->lpt_decision.times = best_unroll; | |
470 | } | |
471 | ||
1c18ed19 | 472 | /* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES times. |
473 | The transformation does this: | |
3ad4992f | 474 | |
ad5201d0 | 475 | for (i = 0; i < 102; i++) |
476 | body; | |
3ad4992f | 477 | |
1c18ed19 | 478 | ==> (LOOP->LPT_DECISION.TIMES == 3) |
3ad4992f | 479 | |
ad5201d0 | 480 | i = 0; |
481 | body; i++; | |
482 | body; i++; | |
483 | while (i < 102) | |
484 | { | |
485 | body; i++; | |
486 | body; i++; | |
487 | body; i++; | |
488 | body; i++; | |
489 | } | |
490 | */ | |
491 | static void | |
7194de72 | 492 | unroll_loop_constant_iterations (struct loop *loop) |
ad5201d0 | 493 | { |
494 | unsigned HOST_WIDE_INT niter; | |
495 | unsigned exit_mod; | |
496 | sbitmap wont_exit; | |
f3c40e6d | 497 | unsigned i; |
f3c40e6d | 498 | edge e; |
ad5201d0 | 499 | unsigned max_unroll = loop->lpt_decision.times; |
f9cce2dc | 500 | struct niter_desc *desc = get_simple_loop_desc (loop); |
501 | bool exit_at_end = loop_exit_at_end_p (loop); | |
375bb675 | 502 | struct opt_info *opt_info = NULL; |
a53ff4c1 | 503 | bool ok; |
48e1416a | 504 | |
ad5201d0 | 505 | niter = desc->niter; |
506 | ||
a9989fb4 | 507 | /* Should not get here (such loop should be peeled instead). */ |
508 | gcc_assert (niter > max_unroll + 1); | |
ad5201d0 | 509 | |
510 | exit_mod = niter % (max_unroll + 1); | |
511 | ||
512 | wont_exit = sbitmap_alloc (max_unroll + 1); | |
53c5d9d4 | 513 | bitmap_ones (wont_exit); |
ad5201d0 | 514 | |
c2078b80 | 515 | auto_vec<edge> remove_edges; |
48e1416a | 516 | if (flag_split_ivs_in_unroller |
375bb675 | 517 | || flag_variable_expansion_in_unroller) |
518 | opt_info = analyze_insns_in_loop (loop); | |
48e1416a | 519 | |
f9cce2dc | 520 | if (!exit_at_end) |
ad5201d0 | 521 | { |
f9cce2dc | 522 | /* The exit is not at the end of the loop; leave exit test |
ad5201d0 | 523 | in the first copy, so that the loops that start with test |
524 | of exit condition have continuous body after unrolling. */ | |
525 | ||
450d042a | 526 | if (dump_file) |
1c18ed19 | 527 | fprintf (dump_file, ";; Condition at beginning of loop.\n"); |
ad5201d0 | 528 | |
529 | /* Peel exit_mod iterations. */ | |
08b7917c | 530 | bitmap_clear_bit (wont_exit, 0); |
f9cce2dc | 531 | if (desc->noloop_assumptions) |
08b7917c | 532 | bitmap_clear_bit (wont_exit, 1); |
ad5201d0 | 533 | |
f9cce2dc | 534 | if (exit_mod) |
535 | { | |
375bb675 | 536 | opt_info_start_duplication (opt_info); |
a53ff4c1 | 537 | ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop), |
7194de72 | 538 | exit_mod, |
f9cce2dc | 539 | wont_exit, desc->out_edge, |
f3c40e6d | 540 | &remove_edges, |
75cc36a4 | 541 | DLTHE_FLAG_UPDATE_FREQ |
542 | | (opt_info && exit_mod > 1 | |
543 | ? DLTHE_RECORD_COPY_NUMBER | |
544 | : 0)); | |
a53ff4c1 | 545 | gcc_assert (ok); |
f9cce2dc | 546 | |
375bb675 | 547 | if (opt_info && exit_mod > 1) |
48e1416a | 548 | apply_opt_in_copies (opt_info, exit_mod, false, false); |
549 | ||
f9cce2dc | 550 | desc->noloop_assumptions = NULL_RTX; |
551 | desc->niter -= exit_mod; | |
e913b5cd | 552 | loop->nb_iterations_upper_bound -= exit_mod; |
a9ef9877 | 553 | if (loop->any_estimate |
796b6678 | 554 | && wi::leu_p (exit_mod, loop->nb_iterations_estimate)) |
e913b5cd | 555 | loop->nb_iterations_estimate -= exit_mod; |
a9ef9877 | 556 | else |
557 | loop->any_estimate = false; | |
f9cce2dc | 558 | } |
ad5201d0 | 559 | |
08b7917c | 560 | bitmap_set_bit (wont_exit, 1); |
ad5201d0 | 561 | } |
562 | else | |
563 | { | |
564 | /* Leave exit test in last copy, for the same reason as above if | |
565 | the loop tests the condition at the end of loop body. */ | |
566 | ||
450d042a | 567 | if (dump_file) |
1c18ed19 | 568 | fprintf (dump_file, ";; Condition at end of loop.\n"); |
ad5201d0 | 569 | |
570 | /* We know that niter >= max_unroll + 2; so we do not need to care of | |
571 | case when we would exit before reaching the loop. So just peel | |
f9cce2dc | 572 | exit_mod + 1 iterations. */ |
573 | if (exit_mod != max_unroll | |
574 | || desc->noloop_assumptions) | |
ad5201d0 | 575 | { |
08b7917c | 576 | bitmap_clear_bit (wont_exit, 0); |
f9cce2dc | 577 | if (desc->noloop_assumptions) |
08b7917c | 578 | bitmap_clear_bit (wont_exit, 1); |
48e1416a | 579 | |
375bb675 | 580 | opt_info_start_duplication (opt_info); |
a53ff4c1 | 581 | ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop), |
7194de72 | 582 | exit_mod + 1, |
a53ff4c1 | 583 | wont_exit, desc->out_edge, |
f3c40e6d | 584 | &remove_edges, |
75cc36a4 | 585 | DLTHE_FLAG_UPDATE_FREQ |
586 | | (opt_info && exit_mod > 0 | |
587 | ? DLTHE_RECORD_COPY_NUMBER | |
588 | : 0)); | |
a53ff4c1 | 589 | gcc_assert (ok); |
48e1416a | 590 | |
375bb675 | 591 | if (opt_info && exit_mod > 0) |
592 | apply_opt_in_copies (opt_info, exit_mod + 1, false, false); | |
a9989fb4 | 593 | |
f9cce2dc | 594 | desc->niter -= exit_mod + 1; |
e913b5cd | 595 | loop->nb_iterations_upper_bound -= exit_mod + 1; |
a9ef9877 | 596 | if (loop->any_estimate |
796b6678 | 597 | && wi::leu_p (exit_mod + 1, loop->nb_iterations_estimate)) |
e913b5cd | 598 | loop->nb_iterations_estimate -= exit_mod + 1; |
a9ef9877 | 599 | else |
600 | loop->any_estimate = false; | |
f9cce2dc | 601 | desc->noloop_assumptions = NULL_RTX; |
602 | ||
08b7917c | 603 | bitmap_set_bit (wont_exit, 0); |
604 | bitmap_set_bit (wont_exit, 1); | |
ad5201d0 | 605 | } |
606 | ||
08b7917c | 607 | bitmap_clear_bit (wont_exit, max_unroll); |
ad5201d0 | 608 | } |
609 | ||
610 | /* Now unroll the loop. */ | |
48e1416a | 611 | |
375bb675 | 612 | opt_info_start_duplication (opt_info); |
a53ff4c1 | 613 | ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop), |
7194de72 | 614 | max_unroll, |
a53ff4c1 | 615 | wont_exit, desc->out_edge, |
f3c40e6d | 616 | &remove_edges, |
75cc36a4 | 617 | DLTHE_FLAG_UPDATE_FREQ |
618 | | (opt_info | |
619 | ? DLTHE_RECORD_COPY_NUMBER | |
620 | : 0)); | |
a53ff4c1 | 621 | gcc_assert (ok); |
ad5201d0 | 622 | |
375bb675 | 623 | if (opt_info) |
a9989fb4 | 624 | { |
375bb675 | 625 | apply_opt_in_copies (opt_info, max_unroll, true, true); |
626 | free_opt_info (opt_info); | |
a9989fb4 | 627 | } |
628 | ||
ad5201d0 | 629 | free (wont_exit); |
630 | ||
f9cce2dc | 631 | if (exit_at_end) |
632 | { | |
01020a5f | 633 | basic_block exit_block = get_bb_copy (desc->in_edge->src); |
f9cce2dc | 634 | /* Find a new in and out edge; they are in the last copy we have made. */ |
48e1416a | 635 | |
cd665a06 | 636 | if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest) |
f9cce2dc | 637 | { |
cd665a06 | 638 | desc->out_edge = EDGE_SUCC (exit_block, 0); |
639 | desc->in_edge = EDGE_SUCC (exit_block, 1); | |
f9cce2dc | 640 | } |
641 | else | |
642 | { | |
cd665a06 | 643 | desc->out_edge = EDGE_SUCC (exit_block, 1); |
644 | desc->in_edge = EDGE_SUCC (exit_block, 0); | |
f9cce2dc | 645 | } |
646 | } | |
647 | ||
648 | desc->niter /= max_unroll + 1; | |
a9ef9877 | 649 | loop->nb_iterations_upper_bound |
796b6678 | 650 | = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1); |
a9ef9877 | 651 | if (loop->any_estimate) |
652 | loop->nb_iterations_estimate | |
796b6678 | 653 | = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1); |
f9cce2dc | 654 | desc->niter_expr = GEN_INT (desc->niter); |
655 | ||
ad5201d0 | 656 | /* Remove the edges. */ |
f1f41a6c | 657 | FOR_EACH_VEC_ELT (remove_edges, i, e) |
f3c40e6d | 658 | remove_path (e); |
ad5201d0 | 659 | |
450d042a | 660 | if (dump_file) |
661 | fprintf (dump_file, | |
662 | ";; Unrolled loop %d times, constant # of iterations %i insns\n", | |
663 | max_unroll, num_loop_insns (loop)); | |
ad5201d0 | 664 | } |
665 | ||
666 | /* Decide whether to unroll LOOP iterating runtime computable number of times | |
667 | and how much. */ | |
668 | static void | |
0051c76a | 669 | decide_unroll_runtime_iterations (struct loop *loop, int flags) |
ad5201d0 | 670 | { |
671 | unsigned nunroll, nunroll_by_av, i; | |
f9cce2dc | 672 | struct niter_desc *desc; |
5de9d3ed | 673 | widest_int iterations; |
ad5201d0 | 674 | |
675 | if (!(flags & UAP_UNROLL)) | |
676 | { | |
677 | /* We were not asked to, just return back silently. */ | |
678 | return; | |
679 | } | |
680 | ||
450d042a | 681 | if (dump_file) |
682 | fprintf (dump_file, | |
683 | "\n;; Considering unrolling loop with runtime " | |
684 | "computable number of iterations\n"); | |
ad5201d0 | 685 | |
686 | /* nunroll = total number of copies of the original loop body in | |
687 | unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */ | |
688 | nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns; | |
689 | nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns; | |
690 | if (nunroll > nunroll_by_av) | |
691 | nunroll = nunroll_by_av; | |
692 | if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES)) | |
693 | nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES); | |
694 | ||
9ccaa774 | 695 | if (targetm.loop_unroll_adjust) |
696 | nunroll = targetm.loop_unroll_adjust (nunroll, loop); | |
697 | ||
ad5201d0 | 698 | /* Skip big loops. */ |
699 | if (nunroll <= 1) | |
700 | { | |
450d042a | 701 | if (dump_file) |
702 | fprintf (dump_file, ";; Not considering loop, is too big\n"); | |
ad5201d0 | 703 | return; |
704 | } | |
705 | ||
706 | /* Check for simple loops. */ | |
f9cce2dc | 707 | desc = get_simple_loop_desc (loop); |
ad5201d0 | 708 | |
709 | /* Check simpleness. */ | |
f9cce2dc | 710 | if (!desc->simple_p || desc->assumptions) |
ad5201d0 | 711 | { |
450d042a | 712 | if (dump_file) |
713 | fprintf (dump_file, | |
714 | ";; Unable to prove that the number of iterations " | |
715 | "can be counted in runtime\n"); | |
ad5201d0 | 716 | return; |
717 | } | |
718 | ||
f9cce2dc | 719 | if (desc->const_iter) |
ad5201d0 | 720 | { |
450d042a | 721 | if (dump_file) |
722 | fprintf (dump_file, ";; Loop iterates constant times\n"); | |
ad5201d0 | 723 | return; |
724 | } | |
725 | ||
204a2e45 | 726 | /* Check whether the loop rolls. */ |
f86b328b | 727 | if ((get_estimated_loop_iterations (loop, &iterations) |
728 | || get_max_loop_iterations (loop, &iterations)) | |
796b6678 | 729 | && wi::ltu_p (iterations, 2 * nunroll)) |
ad5201d0 | 730 | { |
450d042a | 731 | if (dump_file) |
732 | fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n"); | |
ad5201d0 | 733 | return; |
734 | } | |
735 | ||
736 | /* Success; now force nunroll to be power of 2, as we are unable to | |
737 | cope with overflows in computation of number of iterations. */ | |
f9cce2dc | 738 | for (i = 1; 2 * i <= nunroll; i *= 2) |
739 | continue; | |
ad5201d0 | 740 | |
741 | loop->lpt_decision.decision = LPT_UNROLL_RUNTIME; | |
742 | loop->lpt_decision.times = i - 1; | |
743 | } | |
744 | ||
d6a1bbdb | 745 | /* Splits edge E and inserts the sequence of instructions INSNS on it, and |
746 | returns the newly created block. If INSNS is NULL_RTX, nothing is changed | |
747 | and NULL is returned instead. */ | |
88e6f696 | 748 | |
749 | basic_block | |
222dc1d3 | 750 | split_edge_and_insert (edge e, rtx_insn *insns) |
88e6f696 | 751 | { |
d6a1bbdb | 752 | basic_block bb; |
753 | ||
754 | if (!insns) | |
755 | return NULL; | |
48e1416a | 756 | bb = split_edge (e); |
88e6f696 | 757 | emit_insn_after (insns, BB_END (bb)); |
db1c50be | 758 | |
759 | /* ??? We used to assume that INSNS can contain control flow insns, and | |
760 | that we had to try to find sub basic blocks in BB to maintain a valid | |
761 | CFG. For this purpose we used to set the BB_SUPERBLOCK flag on BB | |
762 | and call break_superblocks when going out of cfglayout mode. But it | |
763 | turns out that this never happens; and that if it does ever happen, | |
8b88439e | 764 | the verify_flow_info at the end of the RTL loop passes would fail. |
db1c50be | 765 | |
766 | There are two reasons why we expected we could have control flow insns | |
767 | in INSNS. The first is when a comparison has to be done in parts, and | |
768 | the second is when the number of iterations is computed for loops with | |
769 | the number of iterations known at runtime. In both cases, test cases | |
770 | to get control flow in INSNS appear to be impossible to construct: | |
771 | ||
772 | * If do_compare_rtx_and_jump needs several branches to do comparison | |
773 | in a mode that needs comparison by parts, we cannot analyze the | |
774 | number of iterations of the loop, and we never get to unrolling it. | |
775 | ||
776 | * The code in expand_divmod that was suspected to cause creation of | |
777 | branching code seems to be only accessed for signed division. The | |
778 | divisions used by # of iterations analysis are always unsigned. | |
779 | Problems might arise on architectures that emits branching code | |
780 | for some operations that may appear in the unroller (especially | |
781 | for division), but we have no such architectures. | |
782 | ||
783 | Considering all this, it was decided that we should for now assume | |
784 | that INSNS can in theory contain control flow insns, but in practice | |
785 | it never does. So we don't handle the theoretical case, and should | |
786 | a real failure ever show up, we have a pretty good clue for how to | |
787 | fix it. */ | |
788 | ||
88e6f696 | 789 | return bb; |
790 | } | |
791 | ||
b99ab275 | 792 | /* Prepare a sequence comparing OP0 with OP1 using COMP and jumping to LABEL if |
793 | true, with probability PROB. If CINSN is not NULL, it is the insn to copy | |
794 | in order to create a jump. */ | |
795 | ||
222dc1d3 | 796 | static rtx_insn * |
b99ab275 | 797 | compare_and_jump_seq (rtx op0, rtx op1, enum rtx_code comp, rtx label, int prob, |
222dc1d3 | 798 | rtx_insn *cinsn) |
b99ab275 | 799 | { |
222dc1d3 | 800 | rtx_insn *seq, *jump; |
801 | rtx cond; | |
3754d046 | 802 | machine_mode mode; |
b99ab275 | 803 | |
804 | mode = GET_MODE (op0); | |
805 | if (mode == VOIDmode) | |
806 | mode = GET_MODE (op1); | |
807 | ||
808 | start_sequence (); | |
809 | if (GET_MODE_CLASS (mode) == MODE_CC) | |
810 | { | |
811 | /* A hack -- there seems to be no easy generic way how to make a | |
812 | conditional jump from a ccmode comparison. */ | |
813 | gcc_assert (cinsn); | |
814 | cond = XEXP (SET_SRC (pc_set (cinsn)), 0); | |
815 | gcc_assert (GET_CODE (cond) == comp); | |
816 | gcc_assert (rtx_equal_p (op0, XEXP (cond, 0))); | |
817 | gcc_assert (rtx_equal_p (op1, XEXP (cond, 1))); | |
818 | emit_jump_insn (copy_insn (PATTERN (cinsn))); | |
819 | jump = get_last_insn (); | |
820 | gcc_assert (JUMP_P (jump)); | |
821 | JUMP_LABEL (jump) = JUMP_LABEL (cinsn); | |
822 | LABEL_NUSES (JUMP_LABEL (jump))++; | |
823 | redirect_jump (jump, label, 0); | |
824 | } | |
825 | else | |
826 | { | |
827 | gcc_assert (!cinsn); | |
828 | ||
829 | op0 = force_operand (op0, NULL_RTX); | |
830 | op1 = force_operand (op1, NULL_RTX); | |
831 | do_compare_rtx_and_jump (op0, op1, comp, 0, | |
832 | mode, NULL_RTX, NULL_RTX, label, -1); | |
833 | jump = get_last_insn (); | |
834 | gcc_assert (JUMP_P (jump)); | |
835 | JUMP_LABEL (jump) = label; | |
836 | LABEL_NUSES (label)++; | |
837 | } | |
838 | add_int_reg_note (jump, REG_BR_PROB, prob); | |
839 | ||
840 | seq = get_insns (); | |
841 | end_sequence (); | |
842 | ||
843 | return seq; | |
844 | } | |
845 | ||
1c18ed19 | 846 | /* Unroll LOOP for which we are able to count number of iterations in runtime |
847 | LOOP->LPT_DECISION.TIMES times. The transformation does this (with some | |
ad5201d0 | 848 | extra care for case n < 0): |
3ad4992f | 849 | |
ad5201d0 | 850 | for (i = 0; i < n; i++) |
851 | body; | |
3ad4992f | 852 | |
1c18ed19 | 853 | ==> (LOOP->LPT_DECISION.TIMES == 3) |
3ad4992f | 854 | |
ad5201d0 | 855 | i = 0; |
856 | mod = n % 4; | |
3ad4992f | 857 | |
ad5201d0 | 858 | switch (mod) |
859 | { | |
860 | case 3: | |
861 | body; i++; | |
862 | case 2: | |
863 | body; i++; | |
864 | case 1: | |
865 | body; i++; | |
866 | case 0: ; | |
867 | } | |
3ad4992f | 868 | |
ad5201d0 | 869 | while (i < n) |
870 | { | |
871 | body; i++; | |
872 | body; i++; | |
873 | body; i++; | |
874 | body; i++; | |
875 | } | |
876 | */ | |
877 | static void | |
7194de72 | 878 | unroll_loop_runtime_iterations (struct loop *loop) |
ad5201d0 | 879 | { |
222dc1d3 | 880 | rtx old_niter, niter, tmp; |
881 | rtx_insn *init_code, *branch_code; | |
ad5201d0 | 882 | unsigned i, j, p; |
3f9439d7 | 883 | basic_block preheader, *body, swtch, ezc_swtch; |
ad5201d0 | 884 | sbitmap wont_exit; |
885 | int may_exit_copy; | |
f3c40e6d | 886 | unsigned n_peel; |
f3c40e6d | 887 | edge e; |
ad5201d0 | 888 | bool extra_zero_check, last_may_exit; |
889 | unsigned max_unroll = loop->lpt_decision.times; | |
f9cce2dc | 890 | struct niter_desc *desc = get_simple_loop_desc (loop); |
891 | bool exit_at_end = loop_exit_at_end_p (loop); | |
375bb675 | 892 | struct opt_info *opt_info = NULL; |
a53ff4c1 | 893 | bool ok; |
48e1416a | 894 | |
375bb675 | 895 | if (flag_split_ivs_in_unroller |
896 | || flag_variable_expansion_in_unroller) | |
897 | opt_info = analyze_insns_in_loop (loop); | |
48e1416a | 898 | |
ad5201d0 | 899 | /* Remember blocks whose dominators will have to be updated. */ |
c2078b80 | 900 | auto_vec<basic_block> dom_bbs; |
ad5201d0 | 901 | |
902 | body = get_loop_body (loop); | |
903 | for (i = 0; i < loop->num_nodes; i++) | |
904 | { | |
f1f41a6c | 905 | vec<basic_block> ldom; |
3f9439d7 | 906 | basic_block bb; |
ad5201d0 | 907 | |
3f9439d7 | 908 | ldom = get_dominated_by (CDI_DOMINATORS, body[i]); |
f1f41a6c | 909 | FOR_EACH_VEC_ELT (ldom, j, bb) |
3f9439d7 | 910 | if (!flow_bb_inside_loop_p (loop, bb)) |
f1f41a6c | 911 | dom_bbs.safe_push (bb); |
ad5201d0 | 912 | |
f1f41a6c | 913 | ldom.release (); |
ad5201d0 | 914 | } |
915 | free (body); | |
916 | ||
f9cce2dc | 917 | if (!exit_at_end) |
ad5201d0 | 918 | { |
919 | /* Leave exit in first copy (for explanation why see comment in | |
920 | unroll_loop_constant_iterations). */ | |
921 | may_exit_copy = 0; | |
922 | n_peel = max_unroll - 1; | |
923 | extra_zero_check = true; | |
924 | last_may_exit = false; | |
925 | } | |
926 | else | |
927 | { | |
928 | /* Leave exit in last copy (for explanation why see comment in | |
929 | unroll_loop_constant_iterations). */ | |
930 | may_exit_copy = max_unroll; | |
931 | n_peel = max_unroll; | |
932 | extra_zero_check = false; | |
933 | last_may_exit = true; | |
934 | } | |
935 | ||
936 | /* Get expression for number of iterations. */ | |
937 | start_sequence (); | |
f9cce2dc | 938 | old_niter = niter = gen_reg_rtx (desc->mode); |
939 | tmp = force_operand (copy_rtx (desc->niter_expr), niter); | |
940 | if (tmp != niter) | |
941 | emit_move_insn (niter, tmp); | |
ad5201d0 | 942 | |
943 | /* Count modulo by ANDing it with max_unroll; we use the fact that | |
944 | the number of unrollings is a power of two, and thus this is correct | |
945 | even if there is overflow in the computation. */ | |
f9cce2dc | 946 | niter = expand_simple_binop (desc->mode, AND, |
0359f9f5 | 947 | niter, gen_int_mode (max_unroll, desc->mode), |
ad5201d0 | 948 | NULL_RTX, 0, OPTAB_LIB_WIDEN); |
949 | ||
950 | init_code = get_insns (); | |
951 | end_sequence (); | |
ffbd194b | 952 | unshare_all_rtl_in_chain (init_code); |
ad5201d0 | 953 | |
954 | /* Precondition the loop. */ | |
88e6f696 | 955 | split_edge_and_insert (loop_preheader_edge (loop), init_code); |
ad5201d0 | 956 | |
c2078b80 | 957 | auto_vec<edge> remove_edges; |
ad5201d0 | 958 | |
959 | wont_exit = sbitmap_alloc (max_unroll + 2); | |
960 | ||
961 | /* Peel the first copy of loop body (almost always we must leave exit test | |
962 | here; the only exception is when we have extra zero check and the number | |
f9cce2dc | 963 | of iterations is reliable. Also record the place of (possible) extra |
964 | zero check. */ | |
53c5d9d4 | 965 | bitmap_clear (wont_exit); |
f9cce2dc | 966 | if (extra_zero_check |
967 | && !desc->noloop_assumptions) | |
08b7917c | 968 | bitmap_set_bit (wont_exit, 1); |
ad5201d0 | 969 | ezc_swtch = loop_preheader_edge (loop)->src; |
a53ff4c1 | 970 | ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop), |
7194de72 | 971 | 1, wont_exit, desc->out_edge, |
f3c40e6d | 972 | &remove_edges, |
a53ff4c1 | 973 | DLTHE_FLAG_UPDATE_FREQ); |
974 | gcc_assert (ok); | |
ad5201d0 | 975 | |
976 | /* Record the place where switch will be built for preconditioning. */ | |
88e6f696 | 977 | swtch = split_edge (loop_preheader_edge (loop)); |
ad5201d0 | 978 | |
979 | for (i = 0; i < n_peel; i++) | |
980 | { | |
981 | /* Peel the copy. */ | |
53c5d9d4 | 982 | bitmap_clear (wont_exit); |
ad5201d0 | 983 | if (i != n_peel - 1 || !last_may_exit) |
08b7917c | 984 | bitmap_set_bit (wont_exit, 1); |
a53ff4c1 | 985 | ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop), |
7194de72 | 986 | 1, wont_exit, desc->out_edge, |
f3c40e6d | 987 | &remove_edges, |
a53ff4c1 | 988 | DLTHE_FLAG_UPDATE_FREQ); |
989 | gcc_assert (ok); | |
ad5201d0 | 990 | |
c87a3eff | 991 | /* Create item for switch. */ |
992 | j = n_peel - i - (extra_zero_check ? 0 : 1); | |
993 | p = REG_BR_PROB_BASE / (i + 2); | |
994 | ||
88e6f696 | 995 | preheader = split_edge (loop_preheader_edge (loop)); |
f9cce2dc | 996 | branch_code = compare_and_jump_seq (copy_rtx (niter), GEN_INT (j), EQ, |
a53ff4c1 | 997 | block_label (preheader), p, |
222dc1d3 | 998 | NULL); |
c87a3eff | 999 | |
d6a1bbdb | 1000 | /* We rely on the fact that the compare and jump cannot be optimized out, |
1001 | and hence the cfg we create is correct. */ | |
1002 | gcc_assert (branch_code != NULL_RTX); | |
1003 | ||
88e6f696 | 1004 | swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code); |
0051c76a | 1005 | set_immediate_dominator (CDI_DOMINATORS, preheader, swtch); |
ea091dfd | 1006 | single_pred_edge (swtch)->probability = REG_BR_PROB_BASE - p; |
c87a3eff | 1007 | e = make_edge (swtch, preheader, |
ea091dfd | 1008 | single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP); |
a9ef9877 | 1009 | e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p); |
c87a3eff | 1010 | e->probability = p; |
ad5201d0 | 1011 | } |
1012 | ||
1013 | if (extra_zero_check) | |
1014 | { | |
1015 | /* Add branch for zero iterations. */ | |
1016 | p = REG_BR_PROB_BASE / (max_unroll + 1); | |
1017 | swtch = ezc_swtch; | |
88e6f696 | 1018 | preheader = split_edge (loop_preheader_edge (loop)); |
f9cce2dc | 1019 | branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ, |
a53ff4c1 | 1020 | block_label (preheader), p, |
222dc1d3 | 1021 | NULL); |
d6a1bbdb | 1022 | gcc_assert (branch_code != NULL_RTX); |
ad5201d0 | 1023 | |
88e6f696 | 1024 | swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code); |
0051c76a | 1025 | set_immediate_dominator (CDI_DOMINATORS, preheader, swtch); |
ea091dfd | 1026 | single_succ_edge (swtch)->probability = REG_BR_PROB_BASE - p; |
a964683a | 1027 | e = make_edge (swtch, preheader, |
ea091dfd | 1028 | single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP); |
a9ef9877 | 1029 | e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p); |
ad5201d0 | 1030 | e->probability = p; |
1031 | } | |
1032 | ||
1033 | /* Recount dominators for outer blocks. */ | |
3f9439d7 | 1034 | iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false); |
ad5201d0 | 1035 | |
1036 | /* And unroll loop. */ | |
1037 | ||
53c5d9d4 | 1038 | bitmap_ones (wont_exit); |
08b7917c | 1039 | bitmap_clear_bit (wont_exit, may_exit_copy); |
375bb675 | 1040 | opt_info_start_duplication (opt_info); |
48e1416a | 1041 | |
a53ff4c1 | 1042 | ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop), |
7194de72 | 1043 | max_unroll, |
a53ff4c1 | 1044 | wont_exit, desc->out_edge, |
f3c40e6d | 1045 | &remove_edges, |
75cc36a4 | 1046 | DLTHE_FLAG_UPDATE_FREQ |
1047 | | (opt_info | |
1048 | ? DLTHE_RECORD_COPY_NUMBER | |
1049 | : 0)); | |
a53ff4c1 | 1050 | gcc_assert (ok); |
48e1416a | 1051 | |
375bb675 | 1052 | if (opt_info) |
a9989fb4 | 1053 | { |
375bb675 | 1054 | apply_opt_in_copies (opt_info, max_unroll, true, true); |
1055 | free_opt_info (opt_info); | |
a9989fb4 | 1056 | } |
1057 | ||
ad5201d0 | 1058 | free (wont_exit); |
1059 | ||
f9cce2dc | 1060 | if (exit_at_end) |
1061 | { | |
01020a5f | 1062 | basic_block exit_block = get_bb_copy (desc->in_edge->src); |
a53ff4c1 | 1063 | /* Find a new in and out edge; they are in the last copy we have |
1064 | made. */ | |
48e1416a | 1065 | |
cd665a06 | 1066 | if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest) |
f9cce2dc | 1067 | { |
cd665a06 | 1068 | desc->out_edge = EDGE_SUCC (exit_block, 0); |
1069 | desc->in_edge = EDGE_SUCC (exit_block, 1); | |
f9cce2dc | 1070 | } |
1071 | else | |
1072 | { | |
cd665a06 | 1073 | desc->out_edge = EDGE_SUCC (exit_block, 1); |
1074 | desc->in_edge = EDGE_SUCC (exit_block, 0); | |
f9cce2dc | 1075 | } |
1076 | } | |
1077 | ||
ad5201d0 | 1078 | /* Remove the edges. */ |
f1f41a6c | 1079 | FOR_EACH_VEC_ELT (remove_edges, i, e) |
f3c40e6d | 1080 | remove_path (e); |
ad5201d0 | 1081 | |
f9cce2dc | 1082 | /* We must be careful when updating the number of iterations due to |
1083 | preconditioning and the fact that the value must be valid at entry | |
1084 | of the loop. After passing through the above code, we see that | |
1085 | the correct new number of iterations is this: */ | |
a9989fb4 | 1086 | gcc_assert (!desc->const_iter); |
f9cce2dc | 1087 | desc->niter_expr = |
a53ff4c1 | 1088 | simplify_gen_binary (UDIV, desc->mode, old_niter, |
5d5ee71f | 1089 | gen_int_mode (max_unroll + 1, desc->mode)); |
a9ef9877 | 1090 | loop->nb_iterations_upper_bound |
796b6678 | 1091 | = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1); |
a9ef9877 | 1092 | if (loop->any_estimate) |
1093 | loop->nb_iterations_estimate | |
796b6678 | 1094 | = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1); |
f9cce2dc | 1095 | if (exit_at_end) |
1096 | { | |
1097 | desc->niter_expr = | |
1098 | simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx); | |
1099 | desc->noloop_assumptions = NULL_RTX; | |
a9ef9877 | 1100 | --loop->nb_iterations_upper_bound; |
1101 | if (loop->any_estimate | |
e913b5cd | 1102 | && loop->nb_iterations_estimate != 0) |
a9ef9877 | 1103 | --loop->nb_iterations_estimate; |
1104 | else | |
1105 | loop->any_estimate = false; | |
f9cce2dc | 1106 | } |
1107 | ||
450d042a | 1108 | if (dump_file) |
1109 | fprintf (dump_file, | |
1110 | ";; Unrolled loop %d times, counting # of iterations " | |
1111 | "in runtime, %i insns\n", | |
ad5201d0 | 1112 | max_unroll, num_loop_insns (loop)); |
1113 | } | |
3ad4992f | 1114 | |
ad5201d0 | 1115 | /* Decide whether to unroll LOOP stupidly and how much. */ |
1116 | static void | |
0051c76a | 1117 | decide_unroll_stupid (struct loop *loop, int flags) |
ad5201d0 | 1118 | { |
1119 | unsigned nunroll, nunroll_by_av, i; | |
f9cce2dc | 1120 | struct niter_desc *desc; |
5de9d3ed | 1121 | widest_int iterations; |
ad5201d0 | 1122 | |
1123 | if (!(flags & UAP_UNROLL_ALL)) | |
1124 | { | |
1125 | /* We were not asked to, just return back silently. */ | |
1126 | return; | |
1127 | } | |
1128 | ||
450d042a | 1129 | if (dump_file) |
1130 | fprintf (dump_file, "\n;; Considering unrolling loop stupidly\n"); | |
ad5201d0 | 1131 | |
1132 | /* nunroll = total number of copies of the original loop body in | |
1133 | unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */ | |
1134 | nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns; | |
450d042a | 1135 | nunroll_by_av |
1136 | = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns; | |
ad5201d0 | 1137 | if (nunroll > nunroll_by_av) |
1138 | nunroll = nunroll_by_av; | |
1139 | if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES)) | |
1140 | nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES); | |
1141 | ||
9ccaa774 | 1142 | if (targetm.loop_unroll_adjust) |
1143 | nunroll = targetm.loop_unroll_adjust (nunroll, loop); | |
1144 | ||
ad5201d0 | 1145 | /* Skip big loops. */ |
1146 | if (nunroll <= 1) | |
1147 | { | |
450d042a | 1148 | if (dump_file) |
1149 | fprintf (dump_file, ";; Not considering loop, is too big\n"); | |
ad5201d0 | 1150 | return; |
1151 | } | |
1152 | ||
1153 | /* Check for simple loops. */ | |
f9cce2dc | 1154 | desc = get_simple_loop_desc (loop); |
ad5201d0 | 1155 | |
1156 | /* Check simpleness. */ | |
f9cce2dc | 1157 | if (desc->simple_p && !desc->assumptions) |
ad5201d0 | 1158 | { |
450d042a | 1159 | if (dump_file) |
1160 | fprintf (dump_file, ";; The loop is simple\n"); | |
ad5201d0 | 1161 | return; |
1162 | } | |
1163 | ||
1164 | /* Do not unroll loops with branches inside -- it increases number | |
d9459f6b | 1165 | of mispredicts. |
1166 | TODO: this heuristic needs tunning; call inside the loop body | |
1167 | is also relatively good reason to not unroll. */ | |
f9cce2dc | 1168 | if (num_loop_branches (loop) > 1) |
ad5201d0 | 1169 | { |
450d042a | 1170 | if (dump_file) |
1171 | fprintf (dump_file, ";; Not unrolling, contains branches\n"); | |
ad5201d0 | 1172 | return; |
1173 | } | |
1174 | ||
204a2e45 | 1175 | /* Check whether the loop rolls. */ |
f86b328b | 1176 | if ((get_estimated_loop_iterations (loop, &iterations) |
1177 | || get_max_loop_iterations (loop, &iterations)) | |
796b6678 | 1178 | && wi::ltu_p (iterations, 2 * nunroll)) |
ad5201d0 | 1179 | { |
450d042a | 1180 | if (dump_file) |
1181 | fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n"); | |
ad5201d0 | 1182 | return; |
1183 | } | |
1184 | ||
1185 | /* Success. Now force nunroll to be power of 2, as it seems that this | |
d01481af | 1186 | improves results (partially because of better alignments, partially |
ad5201d0 | 1187 | because of some dark magic). */ |
f9cce2dc | 1188 | for (i = 1; 2 * i <= nunroll; i *= 2) |
1189 | continue; | |
ad5201d0 | 1190 | |
1191 | loop->lpt_decision.decision = LPT_UNROLL_STUPID; | |
1192 | loop->lpt_decision.times = i - 1; | |
1193 | } | |
1194 | ||
1c18ed19 | 1195 | /* Unroll a LOOP LOOP->LPT_DECISION.TIMES times. The transformation does this: |
1196 | ||
ad5201d0 | 1197 | while (cond) |
1198 | body; | |
1199 | ||
1c18ed19 | 1200 | ==> (LOOP->LPT_DECISION.TIMES == 3) |
ad5201d0 | 1201 | |
1202 | while (cond) | |
1203 | { | |
1204 | body; | |
1205 | if (!cond) break; | |
1206 | body; | |
1207 | if (!cond) break; | |
1208 | body; | |
1209 | if (!cond) break; | |
1210 | body; | |
1211 | } | |
1212 | */ | |
1213 | static void | |
7194de72 | 1214 | unroll_loop_stupid (struct loop *loop) |
ad5201d0 | 1215 | { |
1216 | sbitmap wont_exit; | |
1217 | unsigned nunroll = loop->lpt_decision.times; | |
f9cce2dc | 1218 | struct niter_desc *desc = get_simple_loop_desc (loop); |
375bb675 | 1219 | struct opt_info *opt_info = NULL; |
a53ff4c1 | 1220 | bool ok; |
48e1416a | 1221 | |
375bb675 | 1222 | if (flag_split_ivs_in_unroller |
1223 | || flag_variable_expansion_in_unroller) | |
1224 | opt_info = analyze_insns_in_loop (loop); | |
48e1416a | 1225 | |
1226 | ||
ad5201d0 | 1227 | wont_exit = sbitmap_alloc (nunroll + 1); |
53c5d9d4 | 1228 | bitmap_clear (wont_exit); |
375bb675 | 1229 | opt_info_start_duplication (opt_info); |
48e1416a | 1230 | |
a53ff4c1 | 1231 | ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop), |
7194de72 | 1232 | nunroll, wont_exit, |
f3c40e6d | 1233 | NULL, NULL, |
75cc36a4 | 1234 | DLTHE_FLAG_UPDATE_FREQ |
1235 | | (opt_info | |
1236 | ? DLTHE_RECORD_COPY_NUMBER | |
1237 | : 0)); | |
a53ff4c1 | 1238 | gcc_assert (ok); |
48e1416a | 1239 | |
375bb675 | 1240 | if (opt_info) |
a9989fb4 | 1241 | { |
375bb675 | 1242 | apply_opt_in_copies (opt_info, nunroll, true, true); |
1243 | free_opt_info (opt_info); | |
a9989fb4 | 1244 | } |
1245 | ||
ad5201d0 | 1246 | free (wont_exit); |
3ad4992f | 1247 | |
f9cce2dc | 1248 | if (desc->simple_p) |
1249 | { | |
1250 | /* We indeed may get here provided that there are nontrivial assumptions | |
1251 | for a loop to be really simple. We could update the counts, but the | |
1252 | problem is that we are unable to decide which exit will be taken | |
1253 | (not really true in case the number of iterations is constant, | |
75de4aa2 | 1254 | but no one will do anything with this information, so we do not |
f9cce2dc | 1255 | worry about it). */ |
1256 | desc->simple_p = false; | |
1257 | } | |
1258 | ||
450d042a | 1259 | if (dump_file) |
1260 | fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n", | |
ad5201d0 | 1261 | nunroll, num_loop_insns (loop)); |
1262 | } | |
a9989fb4 | 1263 | |
3b3940d7 | 1264 | /* Returns true if REG is referenced in one nondebug insn in LOOP. |
1265 | Set *DEBUG_USES to the number of debug insns that reference the | |
1266 | variable. */ | |
375bb675 | 1267 | |
f7b51129 | 1268 | static bool |
3b3940d7 | 1269 | referenced_in_one_insn_in_loop_p (struct loop *loop, rtx reg, |
1270 | int *debug_uses) | |
375bb675 | 1271 | { |
1272 | basic_block *body, bb; | |
1273 | unsigned i; | |
1274 | int count_ref = 0; | |
222dc1d3 | 1275 | rtx_insn *insn; |
48e1416a | 1276 | |
1277 | body = get_loop_body (loop); | |
375bb675 | 1278 | for (i = 0; i < loop->num_nodes; i++) |
1279 | { | |
1280 | bb = body[i]; | |
48e1416a | 1281 | |
375bb675 | 1282 | FOR_BB_INSNS (bb, insn) |
3b3940d7 | 1283 | if (!rtx_referenced_p (reg, insn)) |
1284 | continue; | |
1285 | else if (DEBUG_INSN_P (insn)) | |
1286 | ++*debug_uses; | |
1287 | else if (++count_ref > 1) | |
1288 | break; | |
375bb675 | 1289 | } |
3b3940d7 | 1290 | free (body); |
375bb675 | 1291 | return (count_ref == 1); |
1292 | } | |
1293 | ||
3b3940d7 | 1294 | /* Reset the DEBUG_USES debug insns in LOOP that reference REG. */ |
1295 | ||
1296 | static void | |
1297 | reset_debug_uses_in_loop (struct loop *loop, rtx reg, int debug_uses) | |
1298 | { | |
1299 | basic_block *body, bb; | |
1300 | unsigned i; | |
222dc1d3 | 1301 | rtx_insn *insn; |
3b3940d7 | 1302 | |
1303 | body = get_loop_body (loop); | |
1304 | for (i = 0; debug_uses && i < loop->num_nodes; i++) | |
1305 | { | |
1306 | bb = body[i]; | |
1307 | ||
1308 | FOR_BB_INSNS (bb, insn) | |
1309 | if (!DEBUG_INSN_P (insn) || !rtx_referenced_p (reg, insn)) | |
1310 | continue; | |
1311 | else | |
1312 | { | |
1313 | validate_change (insn, &INSN_VAR_LOCATION_LOC (insn), | |
1314 | gen_rtx_UNKNOWN_VAR_LOC (), 0); | |
1315 | if (!--debug_uses) | |
1316 | break; | |
1317 | } | |
1318 | } | |
1319 | free (body); | |
1320 | } | |
1321 | ||
375bb675 | 1322 | /* Determine whether INSN contains an accumulator |
48e1416a | 1323 | which can be expanded into separate copies, |
375bb675 | 1324 | one for each copy of the LOOP body. |
48e1416a | 1325 | |
375bb675 | 1326 | for (i = 0 ; i < n; i++) |
1327 | sum += a[i]; | |
48e1416a | 1328 | |
375bb675 | 1329 | ==> |
48e1416a | 1330 | |
375bb675 | 1331 | sum += a[i] |
1332 | .... | |
1333 | i = i+1; | |
1334 | sum1 += a[i] | |
1335 | .... | |
1336 | i = i+1 | |
1337 | sum2 += a[i]; | |
1338 | .... | |
1339 | ||
48e1416a | 1340 | Return NULL if INSN contains no opportunity for expansion of accumulator. |
1341 | Otherwise, allocate a VAR_TO_EXPAND structure, fill it with the relevant | |
375bb675 | 1342 | information and return a pointer to it. |
1343 | */ | |
1344 | ||
1345 | static struct var_to_expand * | |
222dc1d3 | 1346 | analyze_insn_to_expand_var (struct loop *loop, rtx_insn *insn) |
375bb675 | 1347 | { |
7497b98c | 1348 | rtx set, dest, src; |
375bb675 | 1349 | struct var_to_expand *ves; |
0def560a | 1350 | unsigned accum_pos; |
7497b98c | 1351 | enum rtx_code code; |
3b3940d7 | 1352 | int debug_uses = 0; |
0def560a | 1353 | |
375bb675 | 1354 | set = single_set (insn); |
1355 | if (!set) | |
1356 | return NULL; | |
48e1416a | 1357 | |
375bb675 | 1358 | dest = SET_DEST (set); |
1359 | src = SET_SRC (set); | |
7497b98c | 1360 | code = GET_CODE (src); |
48e1416a | 1361 | |
7497b98c | 1362 | if (code != PLUS && code != MINUS && code != MULT && code != FMA) |
375bb675 | 1363 | return NULL; |
1c6304f6 | 1364 | |
7497b98c | 1365 | if (FLOAT_MODE_P (GET_MODE (dest))) |
1366 | { | |
1367 | if (!flag_associative_math) | |
1368 | return NULL; | |
1369 | /* In the case of FMA, we're also changing the rounding. */ | |
1370 | if (code == FMA && !flag_unsafe_math_optimizations) | |
1371 | return NULL; | |
1372 | } | |
1373 | ||
1c6304f6 | 1374 | /* Hmm, this is a bit paradoxical. We know that INSN is a valid insn |
1375 | in MD. But if there is no optab to generate the insn, we can not | |
1376 | perform the variable expansion. This can happen if an MD provides | |
1377 | an insn but not a named pattern to generate it, for example to avoid | |
1378 | producing code that needs additional mode switches like for x87/mmx. | |
1379 | ||
1380 | So we check have_insn_for which looks for an optab for the operation | |
1381 | in SRC. If it doesn't exist, we can't perform the expansion even | |
1382 | though INSN is valid. */ | |
7497b98c | 1383 | if (!have_insn_for (code, GET_MODE (src))) |
1c6304f6 | 1384 | return NULL; |
1385 | ||
375bb675 | 1386 | if (!REG_P (dest) |
1387 | && !(GET_CODE (dest) == SUBREG | |
1388 | && REG_P (SUBREG_REG (dest)))) | |
1389 | return NULL; | |
48e1416a | 1390 | |
7497b98c | 1391 | /* Find the accumulator use within the operation. */ |
1392 | if (code == FMA) | |
1393 | { | |
1394 | /* We only support accumulation via FMA in the ADD position. */ | |
1395 | if (!rtx_equal_p (dest, XEXP (src, 2))) | |
1396 | return NULL; | |
1397 | accum_pos = 2; | |
1398 | } | |
1399 | else if (rtx_equal_p (dest, XEXP (src, 0))) | |
0def560a | 1400 | accum_pos = 0; |
7497b98c | 1401 | else if (rtx_equal_p (dest, XEXP (src, 1))) |
1402 | { | |
1403 | /* The method of expansion that we are using; which includes the | |
1404 | initialization of the expansions with zero and the summation of | |
1405 | the expansions at the end of the computation will yield wrong | |
1406 | results for (x = something - x) thus avoid using it in that case. */ | |
1407 | if (code == MINUS) | |
1408 | return NULL; | |
1409 | accum_pos = 1; | |
1410 | } | |
0def560a | 1411 | else |
1412 | return NULL; | |
1413 | ||
7497b98c | 1414 | /* It must not otherwise be used. */ |
1415 | if (code == FMA) | |
1416 | { | |
1417 | if (rtx_referenced_p (dest, XEXP (src, 0)) | |
1418 | || rtx_referenced_p (dest, XEXP (src, 1))) | |
1419 | return NULL; | |
1420 | } | |
1421 | else if (rtx_referenced_p (dest, XEXP (src, 1 - accum_pos))) | |
375bb675 | 1422 | return NULL; |
48e1416a | 1423 | |
7497b98c | 1424 | /* It must be used in exactly one insn. */ |
3b3940d7 | 1425 | if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses)) |
375bb675 | 1426 | return NULL; |
48e1416a | 1427 | |
e24a805c | 1428 | if (dump_file) |
7497b98c | 1429 | { |
1430 | fprintf (dump_file, "\n;; Expanding Accumulator "); | |
1431 | print_rtl (dump_file, dest); | |
1432 | fprintf (dump_file, "\n"); | |
1433 | } | |
e24a805c | 1434 | |
3b3940d7 | 1435 | if (debug_uses) |
1436 | /* Instead of resetting the debug insns, we could replace each | |
1437 | debug use in the loop with the sum or product of all expanded | |
1438 | accummulators. Since we'll only know of all expansions at the | |
1439 | end, we'd have to keep track of which vars_to_expand a debug | |
1440 | insn in the loop references, take note of each copy of the | |
1441 | debug insn during unrolling, and when it's all done, compute | |
1442 | the sum or product of each variable and adjust the original | |
1443 | debug insn and each copy thereof. What a pain! */ | |
1444 | reset_debug_uses_in_loop (loop, dest, debug_uses); | |
1445 | ||
375bb675 | 1446 | /* Record the accumulator to expand. */ |
4c36ffe6 | 1447 | ves = XNEW (struct var_to_expand); |
375bb675 | 1448 | ves->insn = insn; |
375bb675 | 1449 | ves->reg = copy_rtx (dest); |
f1f41a6c | 1450 | ves->var_expansions.create (1); |
b635a1a3 | 1451 | ves->next = NULL; |
375bb675 | 1452 | ves->op = GET_CODE (src); |
1453 | ves->expansion_count = 0; | |
1454 | ves->reuse_expansion = 0; | |
48e1416a | 1455 | return ves; |
375bb675 | 1456 | } |
1457 | ||
a9989fb4 | 1458 | /* Determine whether there is an induction variable in INSN that |
48e1416a | 1459 | we would like to split during unrolling. |
375bb675 | 1460 | |
1461 | I.e. replace | |
1462 | ||
1463 | i = i + 1; | |
1464 | ... | |
1465 | i = i + 1; | |
1466 | ... | |
1467 | i = i + 1; | |
1468 | ... | |
1469 | ||
1470 | type chains by | |
1471 | ||
1472 | i0 = i + 1 | |
1473 | ... | |
1474 | i = i0 + 1 | |
1475 | ... | |
1476 | i = i0 + 2 | |
1477 | ... | |
1478 | ||
48e1416a | 1479 | Return NULL if INSN contains no interesting IVs. Otherwise, allocate |
375bb675 | 1480 | an IV_TO_SPLIT structure, fill it with the relevant information and return a |
a9989fb4 | 1481 | pointer to it. */ |
1482 | ||
1483 | static struct iv_to_split * | |
3eeb4f9a | 1484 | analyze_iv_to_split_insn (rtx_insn *insn) |
a9989fb4 | 1485 | { |
1486 | rtx set, dest; | |
1487 | struct rtx_iv iv; | |
1488 | struct iv_to_split *ivts; | |
a53ff4c1 | 1489 | bool ok; |
a9989fb4 | 1490 | |
1491 | /* For now we just split the basic induction variables. Later this may be | |
1492 | extended for example by selecting also addresses of memory references. */ | |
1493 | set = single_set (insn); | |
1494 | if (!set) | |
1495 | return NULL; | |
1496 | ||
1497 | dest = SET_DEST (set); | |
1498 | if (!REG_P (dest)) | |
1499 | return NULL; | |
1500 | ||
1501 | if (!biv_p (insn, dest)) | |
1502 | return NULL; | |
1503 | ||
3f37d465 | 1504 | ok = iv_analyze_result (insn, dest, &iv); |
e8820970 | 1505 | |
1506 | /* This used to be an assert under the assumption that if biv_p returns | |
1507 | true that iv_analyze_result must also return true. However, that | |
1508 | assumption is not strictly correct as evidenced by pr25569. | |
1509 | ||
1510 | Returning NULL when iv_analyze_result returns false is safe and | |
1511 | avoids the problems in pr25569 until the iv_analyze_* routines | |
1512 | can be fixed, which is apparently hard and time consuming | |
1513 | according to their author. */ | |
1514 | if (! ok) | |
1515 | return NULL; | |
a9989fb4 | 1516 | |
1517 | if (iv.step == const0_rtx | |
1518 | || iv.mode != iv.extend_mode) | |
1519 | return NULL; | |
1520 | ||
1521 | /* Record the insn to split. */ | |
4c36ffe6 | 1522 | ivts = XNEW (struct iv_to_split); |
a9989fb4 | 1523 | ivts->insn = insn; |
ef770610 | 1524 | ivts->orig_var = dest; |
a9989fb4 | 1525 | ivts->base_var = NULL_RTX; |
1526 | ivts->step = iv.step; | |
b635a1a3 | 1527 | ivts->next = NULL; |
48e1416a | 1528 | |
a9989fb4 | 1529 | return ivts; |
1530 | } | |
1531 | ||
375bb675 | 1532 | /* Determines which of insns in LOOP can be optimized. |
1533 | Return a OPT_INFO struct with the relevant hash tables filled | |
1534 | with all insns to be optimized. The FIRST_NEW_BLOCK field | |
a9989fb4 | 1535 | is undefined for the return value. */ |
1536 | ||
375bb675 | 1537 | static struct opt_info * |
1538 | analyze_insns_in_loop (struct loop *loop) | |
a9989fb4 | 1539 | { |
1540 | basic_block *body, bb; | |
749ea85f | 1541 | unsigned i; |
4c36ffe6 | 1542 | struct opt_info *opt_info = XCNEW (struct opt_info); |
3eeb4f9a | 1543 | rtx_insn *insn; |
375bb675 | 1544 | struct iv_to_split *ivts = NULL; |
1545 | struct var_to_expand *ves = NULL; | |
d9dd21a8 | 1546 | iv_to_split **slot1; |
1547 | var_to_expand **slot2; | |
f1f41a6c | 1548 | vec<edge> edges = get_loop_exit_edges (loop); |
749ea85f | 1549 | edge exit; |
375bb675 | 1550 | bool can_apply = false; |
48e1416a | 1551 | |
a9989fb4 | 1552 | iv_analysis_loop_init (loop); |
1553 | ||
1554 | body = get_loop_body (loop); | |
375bb675 | 1555 | |
1556 | if (flag_split_ivs_in_unroller) | |
b635a1a3 | 1557 | { |
c1f445d2 | 1558 | opt_info->insns_to_split |
1559 | = new hash_table<iv_split_hasher> (5 * loop->num_nodes); | |
b635a1a3 | 1560 | opt_info->iv_to_split_head = NULL; |
1561 | opt_info->iv_to_split_tail = &opt_info->iv_to_split_head; | |
1562 | } | |
48e1416a | 1563 | |
375bb675 | 1564 | /* Record the loop exit bb and loop preheader before the unrolling. */ |
88e6f696 | 1565 | opt_info->loop_preheader = loop_preheader_edge (loop)->src; |
48e1416a | 1566 | |
f1f41a6c | 1567 | if (edges.length () == 1) |
375bb675 | 1568 | { |
f1f41a6c | 1569 | exit = edges[0]; |
749ea85f | 1570 | if (!(exit->flags & EDGE_COMPLEX)) |
1571 | { | |
1572 | opt_info->loop_exit = split_edge (exit); | |
1573 | can_apply = true; | |
1574 | } | |
375bb675 | 1575 | } |
48e1416a | 1576 | |
375bb675 | 1577 | if (flag_variable_expansion_in_unroller |
1578 | && can_apply) | |
b635a1a3 | 1579 | { |
c1f445d2 | 1580 | opt_info->insns_with_var_to_expand |
1581 | = new hash_table<var_expand_hasher> (5 * loop->num_nodes); | |
b635a1a3 | 1582 | opt_info->var_to_expand_head = NULL; |
1583 | opt_info->var_to_expand_tail = &opt_info->var_to_expand_head; | |
1584 | } | |
48e1416a | 1585 | |
a9989fb4 | 1586 | for (i = 0; i < loop->num_nodes; i++) |
1587 | { | |
1588 | bb = body[i]; | |
1589 | if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb)) | |
1590 | continue; | |
1591 | ||
1592 | FOR_BB_INSNS (bb, insn) | |
375bb675 | 1593 | { |
1594 | if (!INSN_P (insn)) | |
1595 | continue; | |
48e1416a | 1596 | |
c1f445d2 | 1597 | if (opt_info->insns_to_split) |
375bb675 | 1598 | ivts = analyze_iv_to_split_insn (insn); |
48e1416a | 1599 | |
375bb675 | 1600 | if (ivts) |
1601 | { | |
c1f445d2 | 1602 | slot1 = opt_info->insns_to_split->find_slot (ivts, INSERT); |
b635a1a3 | 1603 | gcc_assert (*slot1 == NULL); |
375bb675 | 1604 | *slot1 = ivts; |
b635a1a3 | 1605 | *opt_info->iv_to_split_tail = ivts; |
1606 | opt_info->iv_to_split_tail = &ivts->next; | |
375bb675 | 1607 | continue; |
1608 | } | |
48e1416a | 1609 | |
c1f445d2 | 1610 | if (opt_info->insns_with_var_to_expand) |
375bb675 | 1611 | ves = analyze_insn_to_expand_var (loop, insn); |
48e1416a | 1612 | |
375bb675 | 1613 | if (ves) |
1614 | { | |
c1f445d2 | 1615 | slot2 = opt_info->insns_with_var_to_expand->find_slot (ves, INSERT); |
b635a1a3 | 1616 | gcc_assert (*slot2 == NULL); |
375bb675 | 1617 | *slot2 = ves; |
b635a1a3 | 1618 | *opt_info->var_to_expand_tail = ves; |
1619 | opt_info->var_to_expand_tail = &ves->next; | |
375bb675 | 1620 | } |
1621 | } | |
a9989fb4 | 1622 | } |
48e1416a | 1623 | |
f1f41a6c | 1624 | edges.release (); |
a9989fb4 | 1625 | free (body); |
375bb675 | 1626 | return opt_info; |
a9989fb4 | 1627 | } |
1628 | ||
1629 | /* Called just before loop duplication. Records start of duplicated area | |
375bb675 | 1630 | to OPT_INFO. */ |
a9989fb4 | 1631 | |
48e1416a | 1632 | static void |
375bb675 | 1633 | opt_info_start_duplication (struct opt_info *opt_info) |
a9989fb4 | 1634 | { |
375bb675 | 1635 | if (opt_info) |
fe672ac0 | 1636 | opt_info->first_new_block = last_basic_block_for_fn (cfun); |
a9989fb4 | 1637 | } |
1638 | ||
1639 | /* Determine the number of iterations between initialization of the base | |
1640 | variable and the current copy (N_COPY). N_COPIES is the total number | |
1641 | of newly created copies. UNROLLING is true if we are unrolling | |
1642 | (not peeling) the loop. */ | |
1643 | ||
1644 | static unsigned | |
1645 | determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling) | |
1646 | { | |
1647 | if (unrolling) | |
1648 | { | |
1649 | /* If we are unrolling, initialization is done in the original loop | |
1650 | body (number 0). */ | |
1651 | return n_copy; | |
1652 | } | |
1653 | else | |
1654 | { | |
1655 | /* If we are peeling, the copy in that the initialization occurs has | |
1656 | number 1. The original loop (number 0) is the last. */ | |
1657 | if (n_copy) | |
1658 | return n_copy - 1; | |
1659 | else | |
1660 | return n_copies; | |
1661 | } | |
1662 | } | |
1663 | ||
b635a1a3 | 1664 | /* Allocate basic variable for the induction variable chain. */ |
a9989fb4 | 1665 | |
b635a1a3 | 1666 | static void |
1667 | allocate_basic_variable (struct iv_to_split *ivts) | |
a9989fb4 | 1668 | { |
686fd44c | 1669 | rtx expr = SET_SRC (single_set (ivts->insn)); |
a9989fb4 | 1670 | |
1671 | ivts->base_var = gen_reg_rtx (GET_MODE (expr)); | |
a9989fb4 | 1672 | } |
1673 | ||
1674 | /* Insert initialization of basic variable of IVTS before INSN, taking | |
1675 | the initial value from INSN. */ | |
1676 | ||
1677 | static void | |
222dc1d3 | 1678 | insert_base_initialization (struct iv_to_split *ivts, rtx_insn *insn) |
a9989fb4 | 1679 | { |
686fd44c | 1680 | rtx expr = copy_rtx (SET_SRC (single_set (insn))); |
222dc1d3 | 1681 | rtx_insn *seq; |
a9989fb4 | 1682 | |
1683 | start_sequence (); | |
1684 | expr = force_operand (expr, ivts->base_var); | |
1685 | if (expr != ivts->base_var) | |
1686 | emit_move_insn (ivts->base_var, expr); | |
1687 | seq = get_insns (); | |
1688 | end_sequence (); | |
1689 | ||
1690 | emit_insn_before (seq, insn); | |
1691 | } | |
1692 | ||
1693 | /* Replace the use of induction variable described in IVTS in INSN | |
1694 | by base variable + DELTA * step. */ | |
1695 | ||
1696 | static void | |
222dc1d3 | 1697 | split_iv (struct iv_to_split *ivts, rtx_insn *insn, unsigned delta) |
a9989fb4 | 1698 | { |
222dc1d3 | 1699 | rtx expr, *loc, incr, var; |
1700 | rtx_insn *seq; | |
3754d046 | 1701 | machine_mode mode = GET_MODE (ivts->base_var); |
a9989fb4 | 1702 | rtx src, dest, set; |
1703 | ||
1704 | /* Construct base + DELTA * step. */ | |
1705 | if (!delta) | |
1706 | expr = ivts->base_var; | |
1707 | else | |
1708 | { | |
1709 | incr = simplify_gen_binary (MULT, mode, | |
1710 | ivts->step, gen_int_mode (delta, mode)); | |
1711 | expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var), | |
1712 | ivts->base_var, incr); | |
1713 | } | |
1714 | ||
1715 | /* Figure out where to do the replacement. */ | |
686fd44c | 1716 | loc = &SET_SRC (single_set (insn)); |
a9989fb4 | 1717 | |
1718 | /* If we can make the replacement right away, we're done. */ | |
1719 | if (validate_change (insn, loc, expr, 0)) | |
1720 | return; | |
1721 | ||
1722 | /* Otherwise, force EXPR into a register and try again. */ | |
1723 | start_sequence (); | |
1724 | var = gen_reg_rtx (mode); | |
1725 | expr = force_operand (expr, var); | |
1726 | if (expr != var) | |
1727 | emit_move_insn (var, expr); | |
1728 | seq = get_insns (); | |
1729 | end_sequence (); | |
1730 | emit_insn_before (seq, insn); | |
48e1416a | 1731 | |
a9989fb4 | 1732 | if (validate_change (insn, loc, var, 0)) |
1733 | return; | |
1734 | ||
1735 | /* The last chance. Try recreating the assignment in insn | |
1736 | completely from scratch. */ | |
1737 | set = single_set (insn); | |
1738 | gcc_assert (set); | |
1739 | ||
1740 | start_sequence (); | |
1741 | *loc = var; | |
1742 | src = copy_rtx (SET_SRC (set)); | |
1743 | dest = copy_rtx (SET_DEST (set)); | |
1744 | src = force_operand (src, dest); | |
1745 | if (src != dest) | |
1746 | emit_move_insn (dest, src); | |
1747 | seq = get_insns (); | |
1748 | end_sequence (); | |
48e1416a | 1749 | |
a9989fb4 | 1750 | emit_insn_before (seq, insn); |
1751 | delete_insn (insn); | |
1752 | } | |
1753 | ||
a9989fb4 | 1754 | |
6dce5fff | 1755 | /* Return one expansion of the accumulator recorded in struct VE. */ |
a9989fb4 | 1756 | |
375bb675 | 1757 | static rtx |
1758 | get_expansion (struct var_to_expand *ve) | |
1759 | { | |
1760 | rtx reg; | |
48e1416a | 1761 | |
375bb675 | 1762 | if (ve->reuse_expansion == 0) |
1763 | reg = ve->reg; | |
1764 | else | |
f1f41a6c | 1765 | reg = ve->var_expansions[ve->reuse_expansion - 1]; |
48e1416a | 1766 | |
f1f41a6c | 1767 | if (ve->var_expansions.length () == (unsigned) ve->reuse_expansion) |
375bb675 | 1768 | ve->reuse_expansion = 0; |
48e1416a | 1769 | else |
375bb675 | 1770 | ve->reuse_expansion++; |
48e1416a | 1771 | |
375bb675 | 1772 | return reg; |
1773 | } | |
a9989fb4 | 1774 | |
a9989fb4 | 1775 | |
48e1416a | 1776 | /* Given INSN replace the uses of the accumulator recorded in VE |
375bb675 | 1777 | with a new register. */ |
1778 | ||
1779 | static void | |
222dc1d3 | 1780 | expand_var_during_unrolling (struct var_to_expand *ve, rtx_insn *insn) |
375bb675 | 1781 | { |
1782 | rtx new_reg, set; | |
1783 | bool really_new_expansion = false; | |
48e1416a | 1784 | |
375bb675 | 1785 | set = single_set (insn); |
972e95af | 1786 | gcc_assert (set); |
48e1416a | 1787 | |
375bb675 | 1788 | /* Generate a new register only if the expansion limit has not been |
1789 | reached. Else reuse an already existing expansion. */ | |
1790 | if (PARAM_VALUE (PARAM_MAX_VARIABLE_EXPANSIONS) > ve->expansion_count) | |
1791 | { | |
1792 | really_new_expansion = true; | |
1793 | new_reg = gen_reg_rtx (GET_MODE (ve->reg)); | |
1794 | } | |
1795 | else | |
1796 | new_reg = get_expansion (ve); | |
1797 | ||
d022e236 | 1798 | validate_replace_rtx_group (SET_DEST (set), new_reg, insn); |
375bb675 | 1799 | if (apply_change_group ()) |
1800 | if (really_new_expansion) | |
1801 | { | |
f1f41a6c | 1802 | ve->var_expansions.safe_push (new_reg); |
375bb675 | 1803 | ve->expansion_count++; |
1804 | } | |
1805 | } | |
1806 | ||
b635a1a3 | 1807 | /* Initialize the variable expansions in loop preheader. PLACE is the |
1808 | loop-preheader basic block where the initialization of the | |
1809 | expansions should take place. The expansions are initialized with | |
1810 | (-0) when the operation is plus or minus to honor sign zero. This | |
1811 | way we can prevent cases where the sign of the final result is | |
1812 | effected by the sign of the expansion. Here is an example to | |
1813 | demonstrate this: | |
48e1416a | 1814 | |
428c97ad | 1815 | for (i = 0 ; i < n; i++) |
1816 | sum += something; | |
1817 | ||
1818 | ==> | |
1819 | ||
1820 | sum += something | |
1821 | .... | |
1822 | i = i+1; | |
1823 | sum1 += something | |
1824 | .... | |
1825 | i = i+1 | |
1826 | sum2 += something; | |
1827 | .... | |
48e1416a | 1828 | |
428c97ad | 1829 | When SUM is initialized with -zero and SOMETHING is also -zero; the |
1830 | final result of sum should be -zero thus the expansions sum1 and sum2 | |
1831 | should be initialized with -zero as well (otherwise we will get +zero | |
1832 | as the final result). */ | |
375bb675 | 1833 | |
b635a1a3 | 1834 | static void |
1835 | insert_var_expansion_initialization (struct var_to_expand *ve, | |
1836 | basic_block place) | |
375bb675 | 1837 | { |
222dc1d3 | 1838 | rtx_insn *seq; |
1839 | rtx var, zero_init; | |
375bb675 | 1840 | unsigned i; |
3754d046 | 1841 | machine_mode mode = GET_MODE (ve->reg); |
428c97ad | 1842 | bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode); |
1843 | ||
f1f41a6c | 1844 | if (ve->var_expansions.length () == 0) |
b635a1a3 | 1845 | return; |
48e1416a | 1846 | |
375bb675 | 1847 | start_sequence (); |
7497b98c | 1848 | switch (ve->op) |
1849 | { | |
1850 | case FMA: | |
1851 | /* Note that we only accumulate FMA via the ADD operand. */ | |
1852 | case PLUS: | |
1853 | case MINUS: | |
f1f41a6c | 1854 | FOR_EACH_VEC_ELT (ve->var_expansions, i, var) |
7497b98c | 1855 | { |
1856 | if (honor_signed_zero_p) | |
1857 | zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode); | |
1858 | else | |
1859 | zero_init = CONST0_RTX (mode); | |
1860 | emit_move_insn (var, zero_init); | |
1861 | } | |
1862 | break; | |
1863 | ||
1864 | case MULT: | |
f1f41a6c | 1865 | FOR_EACH_VEC_ELT (ve->var_expansions, i, var) |
7497b98c | 1866 | { |
1867 | zero_init = CONST1_RTX (GET_MODE (var)); | |
1868 | emit_move_insn (var, zero_init); | |
1869 | } | |
1870 | break; | |
1871 | ||
1872 | default: | |
1873 | gcc_unreachable (); | |
1874 | } | |
48e1416a | 1875 | |
375bb675 | 1876 | seq = get_insns (); |
1877 | end_sequence (); | |
48e1416a | 1878 | |
d022e236 | 1879 | emit_insn_after (seq, BB_END (place)); |
375bb675 | 1880 | } |
1881 | ||
b635a1a3 | 1882 | /* Combine the variable expansions at the loop exit. PLACE is the |
1883 | loop exit basic block where the summation of the expansions should | |
1884 | take place. */ | |
375bb675 | 1885 | |
b635a1a3 | 1886 | static void |
1887 | combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place) | |
375bb675 | 1888 | { |
375bb675 | 1889 | rtx sum = ve->reg; |
222dc1d3 | 1890 | rtx expr, var; |
1891 | rtx_insn *seq, *insn; | |
375bb675 | 1892 | unsigned i; |
1893 | ||
f1f41a6c | 1894 | if (ve->var_expansions.length () == 0) |
b635a1a3 | 1895 | return; |
48e1416a | 1896 | |
375bb675 | 1897 | start_sequence (); |
7497b98c | 1898 | switch (ve->op) |
1899 | { | |
1900 | case FMA: | |
1901 | /* Note that we only accumulate FMA via the ADD operand. */ | |
1902 | case PLUS: | |
1903 | case MINUS: | |
f1f41a6c | 1904 | FOR_EACH_VEC_ELT (ve->var_expansions, i, var) |
7497b98c | 1905 | sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), var, sum); |
1906 | break; | |
1907 | ||
1908 | case MULT: | |
f1f41a6c | 1909 | FOR_EACH_VEC_ELT (ve->var_expansions, i, var) |
7497b98c | 1910 | sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), var, sum); |
1911 | break; | |
1912 | ||
1913 | default: | |
1914 | gcc_unreachable (); | |
1915 | } | |
48e1416a | 1916 | |
375bb675 | 1917 | expr = force_operand (sum, ve->reg); |
1918 | if (expr != ve->reg) | |
1919 | emit_move_insn (ve->reg, expr); | |
1920 | seq = get_insns (); | |
1921 | end_sequence (); | |
48e1416a | 1922 | |
375bb675 | 1923 | insn = BB_HEAD (place); |
1924 | while (!NOTE_INSN_BASIC_BLOCK_P (insn)) | |
1925 | insn = NEXT_INSN (insn); | |
1926 | ||
1927 | emit_insn_after (seq, insn); | |
375bb675 | 1928 | } |
1929 | ||
ef770610 | 1930 | /* Strip away REG_EQUAL notes for IVs we're splitting. |
1931 | ||
1932 | Updating REG_EQUAL notes for IVs we split is tricky: We | |
1933 | cannot tell until after unrolling, DF-rescanning, and liveness | |
1934 | updating, whether an EQ_USE is reached by the split IV while | |
1935 | the IV reg is still live. See PR55006. | |
1936 | ||
1937 | ??? We cannot use remove_reg_equal_equiv_notes_for_regno, | |
1938 | because RTL loop-iv requires us to defer rescanning insns and | |
1939 | any notes attached to them. So resort to old techniques... */ | |
1940 | ||
1941 | static void | |
222dc1d3 | 1942 | maybe_strip_eq_note_for_split_iv (struct opt_info *opt_info, rtx_insn *insn) |
ef770610 | 1943 | { |
1944 | struct iv_to_split *ivts; | |
1945 | rtx note = find_reg_equal_equiv_note (insn); | |
1946 | if (! note) | |
1947 | return; | |
1948 | for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next) | |
1949 | if (reg_mentioned_p (ivts->orig_var, note)) | |
1950 | { | |
1951 | remove_note (insn, note); | |
1952 | return; | |
1953 | } | |
1954 | } | |
1955 | ||
48e1416a | 1956 | /* Apply loop optimizations in loop copies using the |
1957 | data which gathered during the unrolling. Structure | |
375bb675 | 1958 | OPT_INFO record that data. |
48e1416a | 1959 | |
a9989fb4 | 1960 | UNROLLING is true if we unrolled (not peeled) the loop. |
1961 | REWRITE_ORIGINAL_BODY is true if we should also rewrite the original body of | |
1962 | the loop (as it should happen in complete unrolling, but not in ordinary | |
1963 | peeling of the loop). */ | |
1964 | ||
1965 | static void | |
48e1416a | 1966 | apply_opt_in_copies (struct opt_info *opt_info, |
1967 | unsigned n_copies, bool unrolling, | |
375bb675 | 1968 | bool rewrite_original_loop) |
a9989fb4 | 1969 | { |
1970 | unsigned i, delta; | |
1971 | basic_block bb, orig_bb; | |
222dc1d3 | 1972 | rtx_insn *insn, *orig_insn, *next; |
a9989fb4 | 1973 | struct iv_to_split ivts_templ, *ivts; |
375bb675 | 1974 | struct var_to_expand ve_templ, *ves; |
48e1416a | 1975 | |
a9989fb4 | 1976 | /* Sanity check -- we need to put initialization in the original loop |
1977 | body. */ | |
1978 | gcc_assert (!unrolling || rewrite_original_loop); | |
48e1416a | 1979 | |
a9989fb4 | 1980 | /* Allocate the basic variables (i0). */ |
c1f445d2 | 1981 | if (opt_info->insns_to_split) |
b635a1a3 | 1982 | for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next) |
1983 | allocate_basic_variable (ivts); | |
48e1416a | 1984 | |
fe672ac0 | 1985 | for (i = opt_info->first_new_block; |
1986 | i < (unsigned) last_basic_block_for_fn (cfun); | |
1987 | i++) | |
a9989fb4 | 1988 | { |
f5a6b05f | 1989 | bb = BASIC_BLOCK_FOR_FN (cfun, i); |
01020a5f | 1990 | orig_bb = get_bb_original (bb); |
48e1416a | 1991 | |
01020a5f | 1992 | /* bb->aux holds position in copy sequence initialized by |
1993 | duplicate_loop_to_header_edge. */ | |
1994 | delta = determine_split_iv_delta ((size_t)bb->aux, n_copies, | |
a9989fb4 | 1995 | unrolling); |
75cc36a4 | 1996 | bb->aux = 0; |
a9989fb4 | 1997 | orig_insn = BB_HEAD (orig_bb); |
ef770610 | 1998 | FOR_BB_INSNS_SAFE (bb, insn, next) |
375bb675 | 1999 | { |
0e6893c5 | 2000 | if (!INSN_P (insn) |
2001 | || (DEBUG_INSN_P (insn) | |
2002 | && TREE_CODE (INSN_VAR_LOCATION_DECL (insn)) == LABEL_DECL)) | |
375bb675 | 2003 | continue; |
48e1416a | 2004 | |
0e6893c5 | 2005 | while (!INSN_P (orig_insn) |
2006 | || (DEBUG_INSN_P (orig_insn) | |
2007 | && (TREE_CODE (INSN_VAR_LOCATION_DECL (orig_insn)) | |
2008 | == LABEL_DECL))) | |
375bb675 | 2009 | orig_insn = NEXT_INSN (orig_insn); |
48e1416a | 2010 | |
375bb675 | 2011 | ivts_templ.insn = orig_insn; |
2012 | ve_templ.insn = orig_insn; | |
48e1416a | 2013 | |
375bb675 | 2014 | /* Apply splitting iv optimization. */ |
c1f445d2 | 2015 | if (opt_info->insns_to_split) |
375bb675 | 2016 | { |
ef770610 | 2017 | maybe_strip_eq_note_for_split_iv (opt_info, insn); |
2018 | ||
c1f445d2 | 2019 | ivts = opt_info->insns_to_split->find (&ivts_templ); |
48e1416a | 2020 | |
375bb675 | 2021 | if (ivts) |
2022 | { | |
19ef3705 | 2023 | gcc_assert (GET_CODE (PATTERN (insn)) |
2024 | == GET_CODE (PATTERN (orig_insn))); | |
48e1416a | 2025 | |
375bb675 | 2026 | if (!delta) |
2027 | insert_base_initialization (ivts, insn); | |
2028 | split_iv (ivts, insn, delta); | |
2029 | } | |
2030 | } | |
2031 | /* Apply variable expansion optimization. */ | |
c1f445d2 | 2032 | if (unrolling && opt_info->insns_with_var_to_expand) |
375bb675 | 2033 | { |
4077bf7a | 2034 | ves = (struct var_to_expand *) |
c1f445d2 | 2035 | opt_info->insns_with_var_to_expand->find (&ve_templ); |
375bb675 | 2036 | if (ves) |
48e1416a | 2037 | { |
19ef3705 | 2038 | gcc_assert (GET_CODE (PATTERN (insn)) |
2039 | == GET_CODE (PATTERN (orig_insn))); | |
375bb675 | 2040 | expand_var_during_unrolling (ves, insn); |
2041 | } | |
2042 | } | |
2043 | orig_insn = NEXT_INSN (orig_insn); | |
2044 | } | |
a9989fb4 | 2045 | } |
2046 | ||
2047 | if (!rewrite_original_loop) | |
2048 | return; | |
48e1416a | 2049 | |
375bb675 | 2050 | /* Initialize the variable expansions in the loop preheader |
48e1416a | 2051 | and take care of combining them at the loop exit. */ |
c1f445d2 | 2052 | if (opt_info->insns_with_var_to_expand) |
375bb675 | 2053 | { |
b635a1a3 | 2054 | for (ves = opt_info->var_to_expand_head; ves; ves = ves->next) |
2055 | insert_var_expansion_initialization (ves, opt_info->loop_preheader); | |
2056 | for (ves = opt_info->var_to_expand_head; ves; ves = ves->next) | |
2057 | combine_var_copies_in_loop_exit (ves, opt_info->loop_exit); | |
375bb675 | 2058 | } |
48e1416a | 2059 | |
a9989fb4 | 2060 | /* Rewrite also the original loop body. Find them as originals of the blocks |
2061 | in the last copied iteration, i.e. those that have | |
01020a5f | 2062 | get_bb_copy (get_bb_original (bb)) == bb. */ |
fe672ac0 | 2063 | for (i = opt_info->first_new_block; |
2064 | i < (unsigned) last_basic_block_for_fn (cfun); | |
2065 | i++) | |
a9989fb4 | 2066 | { |
f5a6b05f | 2067 | bb = BASIC_BLOCK_FOR_FN (cfun, i); |
01020a5f | 2068 | orig_bb = get_bb_original (bb); |
2069 | if (get_bb_copy (orig_bb) != bb) | |
a9989fb4 | 2070 | continue; |
48e1416a | 2071 | |
a9989fb4 | 2072 | delta = determine_split_iv_delta (0, n_copies, unrolling); |
2073 | for (orig_insn = BB_HEAD (orig_bb); | |
375bb675 | 2074 | orig_insn != NEXT_INSN (BB_END (bb)); |
2075 | orig_insn = next) | |
2076 | { | |
2077 | next = NEXT_INSN (orig_insn); | |
48e1416a | 2078 | |
375bb675 | 2079 | if (!INSN_P (orig_insn)) |
2080 | continue; | |
48e1416a | 2081 | |
375bb675 | 2082 | ivts_templ.insn = orig_insn; |
c1f445d2 | 2083 | if (opt_info->insns_to_split) |
375bb675 | 2084 | { |
ef770610 | 2085 | maybe_strip_eq_note_for_split_iv (opt_info, orig_insn); |
2086 | ||
4077bf7a | 2087 | ivts = (struct iv_to_split *) |
c1f445d2 | 2088 | opt_info->insns_to_split->find (&ivts_templ); |
375bb675 | 2089 | if (ivts) |
2090 | { | |
2091 | if (!delta) | |
2092 | insert_base_initialization (ivts, orig_insn); | |
2093 | split_iv (ivts, orig_insn, delta); | |
2094 | continue; | |
2095 | } | |
2096 | } | |
48e1416a | 2097 | |
375bb675 | 2098 | } |
2099 | } | |
2100 | } | |
a9989fb4 | 2101 | |
375bb675 | 2102 | /* Release OPT_INFO. */ |
a9989fb4 | 2103 | |
2104 | static void | |
375bb675 | 2105 | free_opt_info (struct opt_info *opt_info) |
a9989fb4 | 2106 | { |
c1f445d2 | 2107 | delete opt_info->insns_to_split; |
2108 | opt_info->insns_to_split = NULL; | |
2109 | if (opt_info->insns_with_var_to_expand) | |
375bb675 | 2110 | { |
b635a1a3 | 2111 | struct var_to_expand *ves; |
2112 | ||
2113 | for (ves = opt_info->var_to_expand_head; ves; ves = ves->next) | |
f1f41a6c | 2114 | ves->var_expansions.release (); |
c1f445d2 | 2115 | delete opt_info->insns_with_var_to_expand; |
2116 | opt_info->insns_with_var_to_expand = NULL; | |
375bb675 | 2117 | } |
2118 | free (opt_info); | |
a9989fb4 | 2119 | } |