]> git.ipfire.org Git - thirdparty/gcc.git/blame - gcc/loop-unroll.c
PR testsuite/92016 - Excess errors in Wstringop-overflow-17.c
[thirdparty/gcc.git] / gcc / loop-unroll.c
CommitLineData
c836de3f 1/* Loop unrolling.
fbd26352 2 Copyright (C) 2002-2019 Free Software Foundation, Inc.
ad5201d0 3
4This file is part of GCC.
5
6GCC is free software; you can redistribute it and/or modify it under
7the terms of the GNU General Public License as published by the Free
8c4c00c1 8Software Foundation; either version 3, or (at your option) any later
ad5201d0 9version.
10
11GCC is distributed in the hope that it will be useful, but WITHOUT ANY
12WARRANTY; without even the implied warranty of MERCHANTABILITY or
13FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14for more details.
15
16You should have received a copy of the GNU General Public License
8c4c00c1 17along with GCC; see the file COPYING3. If not see
18<http://www.gnu.org/licenses/>. */
ad5201d0 19
20#include "config.h"
21#include "system.h"
22#include "coretypes.h"
9ef16211 23#include "backend.h"
7c29e30e 24#include "target.h"
ad5201d0 25#include "rtl.h"
41a8aa41 26#include "tree.h"
7c29e30e 27#include "cfghooks.h"
ad7b10a2 28#include "memmodel.h"
7c29e30e 29#include "optabs.h"
30#include "emit-rtl.h"
31#include "recog.h"
886c1262 32#include "profile.h"
94ea8568 33#include "cfgrtl.h"
ad5201d0 34#include "cfgloop.h"
ad5201d0 35#include "params.h"
d53441c8 36#include "dojump.h"
ad5201d0 37#include "expr.h"
b9ed1410 38#include "dumpfile.h"
ad5201d0 39
c836de3f 40/* This pass performs loop unrolling. We only perform this
41 optimization on innermost loops (with single exception) because
ad5201d0 42 the impact on performance is greatest here, and we want to avoid
43 unnecessary code size growth. The gain is caused by greater sequentiality
917bbcab 44 of code, better code to optimize for further passes and in some cases
ad5201d0 45 by fewer testings of exit conditions. The main problem is code growth,
46 that impacts performance negatively due to effect of caches.
47
48 What we do:
49
ad5201d0 50 -- unrolling of loops that roll constant times; this is almost always
51 win, as we get rid of exit condition tests.
52 -- unrolling of loops that roll number of times that we can compute
53 in runtime; we also get rid of exit condition tests here, but there
54 is the extra expense for calculating the number of iterations
55 -- simple unrolling of remaining loops; this is performed only if we
56 are asked to, as the gain is questionable in this case and often
57 it may even slow down the code
58 For more detailed descriptions of each of those, see comments at
59 appropriate function below.
60
61 There is a lot of parameters (defined and described in params.def) that
c836de3f 62 control how much we unroll.
ad5201d0 63
64 ??? A great problem is that we don't have a good way how to determine
65 how many times we should unroll the loop; the experiments I have made
66 showed that this choice may affect performance in order of several %.
67 */
68
a9989fb4 69/* Information about induction variables to split. */
70
71struct iv_to_split
72{
222dc1d3 73 rtx_insn *insn; /* The insn in that the induction variable occurs. */
ef770610 74 rtx orig_var; /* The variable (register) for the IV before split. */
a9989fb4 75 rtx base_var; /* The variable on that the values in the further
76 iterations are based. */
77 rtx step; /* Step of the induction variable. */
b635a1a3 78 struct iv_to_split *next; /* Next entry in walking order. */
a9989fb4 79};
80
375bb675 81/* Information about accumulators to expand. */
82
83struct var_to_expand
84{
222dc1d3 85 rtx_insn *insn; /* The insn in that the variable expansion occurs. */
dac49aa5 86 rtx reg; /* The accumulator which is expanded. */
f1f41a6c 87 vec<rtx> var_expansions; /* The copies of the accumulator which is expanded. */
b635a1a3 88 struct var_to_expand *next; /* Next entry in walking order. */
48e1416a 89 enum rtx_code op; /* The type of the accumulation - addition, subtraction
375bb675 90 or multiplication. */
91 int expansion_count; /* Count the number of expansions generated so far. */
92 int reuse_expansion; /* The expansion we intend to reuse to expand
48e1416a 93 the accumulator. If REUSE_EXPANSION is 0 reuse
94 the original accumulator. Else use
375bb675 95 var_expansions[REUSE_EXPANSION - 1]. */
96};
97
d9dd21a8 98/* Hashtable helper for iv_to_split. */
99
298e7f9a 100struct iv_split_hasher : free_ptr_hash <iv_to_split>
d9dd21a8 101{
9969c043 102 static inline hashval_t hash (const iv_to_split *);
103 static inline bool equal (const iv_to_split *, const iv_to_split *);
d9dd21a8 104};
105
106
107/* A hash function for information about insns to split. */
108
109inline hashval_t
9969c043 110iv_split_hasher::hash (const iv_to_split *ivts)
d9dd21a8 111{
112 return (hashval_t) INSN_UID (ivts->insn);
113}
114
115/* An equality functions for information about insns to split. */
116
117inline bool
9969c043 118iv_split_hasher::equal (const iv_to_split *i1, const iv_to_split *i2)
d9dd21a8 119{
120 return i1->insn == i2->insn;
121}
122
123/* Hashtable helper for iv_to_split. */
124
298e7f9a 125struct var_expand_hasher : free_ptr_hash <var_to_expand>
d9dd21a8 126{
9969c043 127 static inline hashval_t hash (const var_to_expand *);
128 static inline bool equal (const var_to_expand *, const var_to_expand *);
d9dd21a8 129};
130
131/* Return a hash for VES. */
132
133inline hashval_t
9969c043 134var_expand_hasher::hash (const var_to_expand *ves)
d9dd21a8 135{
136 return (hashval_t) INSN_UID (ves->insn);
137}
138
139/* Return true if I1 and I2 refer to the same instruction. */
140
141inline bool
9969c043 142var_expand_hasher::equal (const var_to_expand *i1, const var_to_expand *i2)
d9dd21a8 143{
144 return i1->insn == i2->insn;
145}
146
375bb675 147/* Information about optimization applied in
148 the unrolled loop. */
149
150struct opt_info
a9989fb4 151{
c1f445d2 152 hash_table<iv_split_hasher> *insns_to_split; /* A hashtable of insns to
d9dd21a8 153 split. */
b635a1a3 154 struct iv_to_split *iv_to_split_head; /* The first iv to split. */
155 struct iv_to_split **iv_to_split_tail; /* Pointer to the tail of the list. */
c1f445d2 156 hash_table<var_expand_hasher> *insns_with_var_to_expand; /* A hashtable of
d9dd21a8 157 insns with accumulators to expand. */
b635a1a3 158 struct var_to_expand *var_to_expand_head; /* The first var to expand. */
159 struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the list. */
375bb675 160 unsigned first_new_block; /* The first basic block that was
161 duplicated. */
162 basic_block loop_exit; /* The loop exit basic block. */
163 basic_block loop_preheader; /* The loop preheader basic block. */
a9989fb4 164};
165
2e966e2a 166static void decide_unroll_stupid (class loop *, int);
167static void decide_unroll_constant_iterations (class loop *, int);
168static void decide_unroll_runtime_iterations (class loop *, int);
169static void unroll_loop_stupid (class loop *);
c836de3f 170static void decide_unrolling (int);
2e966e2a 171static void unroll_loop_constant_iterations (class loop *);
172static void unroll_loop_runtime_iterations (class loop *);
173static struct opt_info *analyze_insns_in_loop (class loop *);
375bb675 174static void opt_info_start_duplication (struct opt_info *);
175static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
176static void free_opt_info (struct opt_info *);
2e966e2a 177static struct var_to_expand *analyze_insn_to_expand_var (class loop*, rtx_insn *);
178static bool referenced_in_one_insn_in_loop_p (class loop *, rtx, int *);
3eeb4f9a 179static struct iv_to_split *analyze_iv_to_split_insn (rtx_insn *);
222dc1d3 180static void expand_var_during_unrolling (struct var_to_expand *, rtx_insn *);
b635a1a3 181static void insert_var_expansion_initialization (struct var_to_expand *,
182 basic_block);
183static void combine_var_copies_in_loop_exit (struct var_to_expand *,
184 basic_block);
375bb675 185static rtx get_expansion (struct var_to_expand *);
ad5201d0 186
c836de3f 187/* Emit a message summarizing the unroll that will be
f55775aa 188 performed for LOOP, along with the loop's location LOCUS, if
189 appropriate given the dump or -fopt-info settings. */
190
191static void
2e966e2a 192report_unroll (class loop *loop, dump_location_t locus)
f55775aa 193{
ffdaf8f1 194 dump_flags_t report_flags = MSG_OPTIMIZED_LOCATIONS | TDF_DETAILS;
f55775aa 195
830b796a 196 if (loop->lpt_decision.decision == LPT_NONE)
197 return;
198
f55775aa 199 if (!dump_enabled_p ())
200 return;
201
e43345c1 202 dump_metadata_t metadata (report_flags, locus.get_impl_location ());
203 dump_printf_loc (metadata, locus.get_user_location (),
c836de3f 204 "loop unrolled %d times",
205 loop->lpt_decision.times);
db9cef39 206 if (profile_info && loop->header->count.initialized_p ())
e43345c1 207 dump_printf (metadata,
c836de3f 208 " (header execution count %d)",
db9cef39 209 (int)loop->header->count.to_gcov_type ());
f55775aa 210
e43345c1 211 dump_printf (metadata, "\n");
f55775aa 212}
213
c836de3f 214/* Decide whether unroll loops and how much. */
ad5201d0 215static void
c836de3f 216decide_unrolling (int flags)
ad5201d0 217{
2e966e2a 218 class loop *loop;
ad5201d0 219
220 /* Scan the loops, inner ones first. */
f21d4d00 221 FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
ad5201d0 222 {
ad5201d0 223 loop->lpt_decision.decision = LPT_NONE;
c309657f 224 dump_user_location_t locus = get_loop_location (loop);
ad5201d0 225
f55775aa 226 if (dump_enabled_p ())
ffdaf8f1 227 dump_printf_loc (MSG_NOTE, locus,
2a09b28c 228 "considering unrolling loop %d at BB %d\n",
229 loop->num, loop->header->index);
230
231 if (loop->unroll == 1)
232 {
233 if (dump_file)
234 fprintf (dump_file,
235 ";; Not unrolling loop, user didn't want it unrolled\n");
236 continue;
237 }
ad5201d0 238
239 /* Do not peel cold areas. */
0bfd8d5c 240 if (optimize_loop_for_size_p (loop))
ad5201d0 241 {
450d042a 242 if (dump_file)
243 fprintf (dump_file, ";; Not considering loop, cold area\n");
ad5201d0 244 continue;
245 }
246
247 /* Can the loop be manipulated? */
248 if (!can_duplicate_loop_p (loop))
249 {
450d042a 250 if (dump_file)
251 fprintf (dump_file,
ad5201d0 252 ";; Not considering loop, cannot duplicate\n");
ad5201d0 253 continue;
254 }
255
256 /* Skip non-innermost loops. */
257 if (loop->inner)
258 {
450d042a 259 if (dump_file)
260 fprintf (dump_file, ";; Not considering loop, is not innermost\n");
ad5201d0 261 continue;
262 }
263
264 loop->ninsns = num_loop_insns (loop);
265 loop->av_ninsns = average_num_loop_insns (loop);
266
2a09b28c 267 /* Try transformations one by one in decreasing order of priority. */
0051c76a 268 decide_unroll_constant_iterations (loop, flags);
ad5201d0 269 if (loop->lpt_decision.decision == LPT_NONE)
0051c76a 270 decide_unroll_runtime_iterations (loop, flags);
ad5201d0 271 if (loop->lpt_decision.decision == LPT_NONE)
0051c76a 272 decide_unroll_stupid (loop, flags);
ad5201d0 273
c836de3f 274 report_unroll (loop, locus);
ad5201d0 275 }
ad5201d0 276}
277
c836de3f 278/* Unroll LOOPS. */
279void
280unroll_loops (int flags)
ad5201d0 281{
2e966e2a 282 class loop *loop;
c836de3f 283 bool changed = false;
ad5201d0 284
c836de3f 285 /* Now decide rest of unrolling. */
286 decide_unrolling (flags);
ad5201d0 287
c836de3f 288 /* Scan the loops, inner ones first. */
289 FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
ad5201d0 290 {
c836de3f 291 /* And perform the appropriate transformations. */
292 switch (loop->lpt_decision.decision)
3ad4992f 293 {
c836de3f 294 case LPT_UNROLL_CONSTANT:
295 unroll_loop_constant_iterations (loop);
296 changed = true;
297 break;
298 case LPT_UNROLL_RUNTIME:
299 unroll_loop_runtime_iterations (loop);
300 changed = true;
301 break;
302 case LPT_UNROLL_STUPID:
303 unroll_loop_stupid (loop);
304 changed = true;
305 break;
306 case LPT_NONE:
307 break;
308 default:
309 gcc_unreachable ();
ad5201d0 310 }
ad5201d0 311 }
312
c836de3f 313 if (changed)
314 {
315 calculate_dominance_info (CDI_DOMINATORS);
316 fix_loop_structure (NULL);
317 }
3ad4992f 318
c836de3f 319 iv_analysis_done ();
320}
ad5201d0 321
c836de3f 322/* Check whether exit of the LOOP is at the end of loop body. */
3ad4992f 323
c836de3f 324static bool
2e966e2a 325loop_exit_at_end_p (class loop *loop)
ad5201d0 326{
2e966e2a 327 class niter_desc *desc = get_simple_loop_desc (loop);
c836de3f 328 rtx_insn *insn;
a5414ff5 329
c836de3f 330 /* We should never have conditional in latch block. */
331 gcc_assert (desc->in_edge->dest != loop->header);
48e1416a 332
c836de3f 333 if (desc->in_edge->dest != loop->latch)
334 return false;
a9989fb4 335
c836de3f 336 /* Check that the latch is empty. */
337 FOR_BB_INSNS (loop->latch, insn)
338 {
339 if (INSN_P (insn) && active_insn_p (insn))
340 return false;
a5414ff5 341 }
ad5201d0 342
c836de3f 343 return true;
ad5201d0 344}
345
450d042a 346/* Decide whether to unroll LOOP iterating constant number of times
347 and how much. */
f9cce2dc 348
ad5201d0 349static void
2e966e2a 350decide_unroll_constant_iterations (class loop *loop, int flags)
ad5201d0 351{
f9cce2dc 352 unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
2e966e2a 353 class niter_desc *desc;
5de9d3ed 354 widest_int iterations;
ad5201d0 355
2a09b28c 356 /* If we were not asked to unroll this loop, just return back silently. */
357 if (!(flags & UAP_UNROLL) && !loop->unroll)
358 return;
ad5201d0 359
2a09b28c 360 if (dump_enabled_p ())
361 dump_printf (MSG_NOTE,
362 "considering unrolling loop with constant "
363 "number of iterations\n");
ad5201d0 364
365 /* nunroll = total number of copies of the original loop body in
2a09b28c 366 unrolled loop (i.e. if it is 2, we have to duplicate loop body once). */
ad5201d0 367 nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
450d042a 368 nunroll_by_av
369 = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
ad5201d0 370 if (nunroll > nunroll_by_av)
371 nunroll = nunroll_by_av;
372 if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
373 nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
374
e7f33cf8 375 if (targetm.loop_unroll_adjust)
376 nunroll = targetm.loop_unroll_adjust (nunroll, loop);
377
ad5201d0 378 /* Skip big loops. */
379 if (nunroll <= 1)
380 {
450d042a 381 if (dump_file)
382 fprintf (dump_file, ";; Not considering loop, is too big\n");
ad5201d0 383 return;
384 }
385
386 /* Check for simple loops. */
f9cce2dc 387 desc = get_simple_loop_desc (loop);
ad5201d0 388
389 /* Check number of iterations. */
f9cce2dc 390 if (!desc->simple_p || !desc->const_iter || desc->assumptions)
ad5201d0 391 {
450d042a 392 if (dump_file)
393 fprintf (dump_file,
394 ";; Unable to prove that the loop iterates constant times\n");
ad5201d0 395 return;
396 }
397
2a09b28c 398 /* Check for an explicit unrolling factor. */
ba31ec12 399 if (loop->unroll > 0 && loop->unroll < USHRT_MAX)
2a09b28c 400 {
401 /* However we cannot unroll completely at the RTL level a loop with
402 constant number of iterations; it should have been peeled instead. */
17b0b4ec 403 if (desc->niter == 0 || (unsigned) loop->unroll > desc->niter - 1)
2a09b28c 404 {
405 if (dump_file)
406 fprintf (dump_file, ";; Loop should have been peeled\n");
407 }
408 else
409 {
410 loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
411 loop->lpt_decision.times = loop->unroll - 1;
412 }
413 return;
414 }
415
c8fbcf58 416 /* Check whether the loop rolls enough to consider.
417 Consult also loop bounds and profile; in the case the loop has more
418 than one exit it may well loop less than determined maximal number
419 of iterations. */
420 if (desc->niter < 2 * nunroll
f86b328b 421 || ((get_estimated_loop_iterations (loop, &iterations)
53b01069 422 || get_likely_max_loop_iterations (loop, &iterations))
796b6678 423 && wi::ltu_p (iterations, 2 * nunroll)))
ad5201d0 424 {
450d042a 425 if (dump_file)
426 fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
ad5201d0 427 return;
428 }
429
430 /* Success; now compute number of iterations to unroll. We alter
431 nunroll so that as few as possible copies of loop body are
d01481af 432 necessary, while still not decreasing the number of unrollings
ad5201d0 433 too much (at most by 1). */
434 best_copies = 2 * nunroll + 10;
435
436 i = 2 * nunroll + 2;
2a09b28c 437 if (i > desc->niter - 2)
f9cce2dc 438 i = desc->niter - 2;
ad5201d0 439
440 for (; i >= nunroll - 1; i--)
441 {
f9cce2dc 442 unsigned exit_mod = desc->niter % (i + 1);
ad5201d0 443
f9cce2dc 444 if (!loop_exit_at_end_p (loop))
ad5201d0 445 n_copies = exit_mod + i + 1;
f9cce2dc 446 else if (exit_mod != (unsigned) i
447 || desc->noloop_assumptions != NULL_RTX)
ad5201d0 448 n_copies = exit_mod + i + 2;
449 else
450 n_copies = i + 1;
451
452 if (n_copies < best_copies)
453 {
454 best_copies = n_copies;
455 best_unroll = i;
456 }
457 }
458
ad5201d0 459 loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
460 loop->lpt_decision.times = best_unroll;
461}
462
1c18ed19 463/* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES times.
464 The transformation does this:
3ad4992f 465
ad5201d0 466 for (i = 0; i < 102; i++)
467 body;
3ad4992f 468
1c18ed19 469 ==> (LOOP->LPT_DECISION.TIMES == 3)
3ad4992f 470
ad5201d0 471 i = 0;
472 body; i++;
473 body; i++;
474 while (i < 102)
475 {
476 body; i++;
477 body; i++;
478 body; i++;
479 body; i++;
480 }
481 */
482static void
2e966e2a 483unroll_loop_constant_iterations (class loop *loop)
ad5201d0 484{
485 unsigned HOST_WIDE_INT niter;
486 unsigned exit_mod;
f3c40e6d 487 unsigned i;
f3c40e6d 488 edge e;
ad5201d0 489 unsigned max_unroll = loop->lpt_decision.times;
2e966e2a 490 class niter_desc *desc = get_simple_loop_desc (loop);
f9cce2dc 491 bool exit_at_end = loop_exit_at_end_p (loop);
375bb675 492 struct opt_info *opt_info = NULL;
a53ff4c1 493 bool ok;
48e1416a 494
ad5201d0 495 niter = desc->niter;
496
a9989fb4 497 /* Should not get here (such loop should be peeled instead). */
498 gcc_assert (niter > max_unroll + 1);
ad5201d0 499
500 exit_mod = niter % (max_unroll + 1);
501
e1592442 502 auto_sbitmap wont_exit (max_unroll + 2);
53c5d9d4 503 bitmap_ones (wont_exit);
ad5201d0 504
c2078b80 505 auto_vec<edge> remove_edges;
48e1416a 506 if (flag_split_ivs_in_unroller
375bb675 507 || flag_variable_expansion_in_unroller)
508 opt_info = analyze_insns_in_loop (loop);
48e1416a 509
f9cce2dc 510 if (!exit_at_end)
ad5201d0 511 {
f9cce2dc 512 /* The exit is not at the end of the loop; leave exit test
ad5201d0 513 in the first copy, so that the loops that start with test
514 of exit condition have continuous body after unrolling. */
515
450d042a 516 if (dump_file)
1c18ed19 517 fprintf (dump_file, ";; Condition at beginning of loop.\n");
ad5201d0 518
519 /* Peel exit_mod iterations. */
08b7917c 520 bitmap_clear_bit (wont_exit, 0);
f9cce2dc 521 if (desc->noloop_assumptions)
08b7917c 522 bitmap_clear_bit (wont_exit, 1);
ad5201d0 523
f9cce2dc 524 if (exit_mod)
525 {
375bb675 526 opt_info_start_duplication (opt_info);
a53ff4c1 527 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
7194de72 528 exit_mod,
f9cce2dc 529 wont_exit, desc->out_edge,
f3c40e6d 530 &remove_edges,
75cc36a4 531 DLTHE_FLAG_UPDATE_FREQ
532 | (opt_info && exit_mod > 1
533 ? DLTHE_RECORD_COPY_NUMBER
534 : 0));
a53ff4c1 535 gcc_assert (ok);
f9cce2dc 536
375bb675 537 if (opt_info && exit_mod > 1)
48e1416a 538 apply_opt_in_copies (opt_info, exit_mod, false, false);
539
f9cce2dc 540 desc->noloop_assumptions = NULL_RTX;
541 desc->niter -= exit_mod;
e913b5cd 542 loop->nb_iterations_upper_bound -= exit_mod;
a9ef9877 543 if (loop->any_estimate
796b6678 544 && wi::leu_p (exit_mod, loop->nb_iterations_estimate))
e913b5cd 545 loop->nb_iterations_estimate -= exit_mod;
a9ef9877 546 else
547 loop->any_estimate = false;
8e3ffe30 548 if (loop->any_likely_upper_bound
549 && wi::leu_p (exit_mod, loop->nb_iterations_likely_upper_bound))
550 loop->nb_iterations_likely_upper_bound -= exit_mod;
551 else
552 loop->any_likely_upper_bound = false;
f9cce2dc 553 }
ad5201d0 554
08b7917c 555 bitmap_set_bit (wont_exit, 1);
ad5201d0 556 }
557 else
558 {
559 /* Leave exit test in last copy, for the same reason as above if
560 the loop tests the condition at the end of loop body. */
561
450d042a 562 if (dump_file)
1c18ed19 563 fprintf (dump_file, ";; Condition at end of loop.\n");
ad5201d0 564
565 /* We know that niter >= max_unroll + 2; so we do not need to care of
566 case when we would exit before reaching the loop. So just peel
f9cce2dc 567 exit_mod + 1 iterations. */
568 if (exit_mod != max_unroll
569 || desc->noloop_assumptions)
ad5201d0 570 {
08b7917c 571 bitmap_clear_bit (wont_exit, 0);
f9cce2dc 572 if (desc->noloop_assumptions)
08b7917c 573 bitmap_clear_bit (wont_exit, 1);
48e1416a 574
375bb675 575 opt_info_start_duplication (opt_info);
a53ff4c1 576 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
7194de72 577 exit_mod + 1,
a53ff4c1 578 wont_exit, desc->out_edge,
f3c40e6d 579 &remove_edges,
75cc36a4 580 DLTHE_FLAG_UPDATE_FREQ
581 | (opt_info && exit_mod > 0
582 ? DLTHE_RECORD_COPY_NUMBER
583 : 0));
a53ff4c1 584 gcc_assert (ok);
48e1416a 585
375bb675 586 if (opt_info && exit_mod > 0)
587 apply_opt_in_copies (opt_info, exit_mod + 1, false, false);
a9989fb4 588
f9cce2dc 589 desc->niter -= exit_mod + 1;
e913b5cd 590 loop->nb_iterations_upper_bound -= exit_mod + 1;
a9ef9877 591 if (loop->any_estimate
796b6678 592 && wi::leu_p (exit_mod + 1, loop->nb_iterations_estimate))
e913b5cd 593 loop->nb_iterations_estimate -= exit_mod + 1;
a9ef9877 594 else
595 loop->any_estimate = false;
8e3ffe30 596 if (loop->any_likely_upper_bound
597 && wi::leu_p (exit_mod + 1, loop->nb_iterations_likely_upper_bound))
598 loop->nb_iterations_likely_upper_bound -= exit_mod + 1;
599 else
600 loop->any_likely_upper_bound = false;
f9cce2dc 601 desc->noloop_assumptions = NULL_RTX;
602
08b7917c 603 bitmap_set_bit (wont_exit, 0);
604 bitmap_set_bit (wont_exit, 1);
ad5201d0 605 }
606
08b7917c 607 bitmap_clear_bit (wont_exit, max_unroll);
ad5201d0 608 }
609
610 /* Now unroll the loop. */
48e1416a 611
375bb675 612 opt_info_start_duplication (opt_info);
a53ff4c1 613 ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
7194de72 614 max_unroll,
a53ff4c1 615 wont_exit, desc->out_edge,
f3c40e6d 616 &remove_edges,
75cc36a4 617 DLTHE_FLAG_UPDATE_FREQ
618 | (opt_info
619 ? DLTHE_RECORD_COPY_NUMBER
620 : 0));
a53ff4c1 621 gcc_assert (ok);
ad5201d0 622
375bb675 623 if (opt_info)
a9989fb4 624 {
375bb675 625 apply_opt_in_copies (opt_info, max_unroll, true, true);
626 free_opt_info (opt_info);
a9989fb4 627 }
628
f9cce2dc 629 if (exit_at_end)
630 {
01020a5f 631 basic_block exit_block = get_bb_copy (desc->in_edge->src);
f9cce2dc 632 /* Find a new in and out edge; they are in the last copy we have made. */
48e1416a 633
cd665a06 634 if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
f9cce2dc 635 {
cd665a06 636 desc->out_edge = EDGE_SUCC (exit_block, 0);
637 desc->in_edge = EDGE_SUCC (exit_block, 1);
f9cce2dc 638 }
639 else
640 {
cd665a06 641 desc->out_edge = EDGE_SUCC (exit_block, 1);
642 desc->in_edge = EDGE_SUCC (exit_block, 0);
f9cce2dc 643 }
644 }
645
646 desc->niter /= max_unroll + 1;
a9ef9877 647 loop->nb_iterations_upper_bound
796b6678 648 = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1);
a9ef9877 649 if (loop->any_estimate)
650 loop->nb_iterations_estimate
796b6678 651 = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1);
8e3ffe30 652 if (loop->any_likely_upper_bound)
653 loop->nb_iterations_likely_upper_bound
654 = wi::udiv_trunc (loop->nb_iterations_likely_upper_bound, max_unroll + 1);
567e4ed6 655 desc->niter_expr = gen_int_mode (desc->niter, desc->mode);
f9cce2dc 656
ad5201d0 657 /* Remove the edges. */
f1f41a6c 658 FOR_EACH_VEC_ELT (remove_edges, i, e)
f3c40e6d 659 remove_path (e);
ad5201d0 660
450d042a 661 if (dump_file)
662 fprintf (dump_file,
663 ";; Unrolled loop %d times, constant # of iterations %i insns\n",
664 max_unroll, num_loop_insns (loop));
ad5201d0 665}
666
667/* Decide whether to unroll LOOP iterating runtime computable number of times
668 and how much. */
669static void
2e966e2a 670decide_unroll_runtime_iterations (class loop *loop, int flags)
ad5201d0 671{
672 unsigned nunroll, nunroll_by_av, i;
2e966e2a 673 class niter_desc *desc;
5de9d3ed 674 widest_int iterations;
ad5201d0 675
2a09b28c 676 /* If we were not asked to unroll this loop, just return back silently. */
677 if (!(flags & UAP_UNROLL) && !loop->unroll)
678 return;
ad5201d0 679
2a09b28c 680 if (dump_enabled_p ())
681 dump_printf (MSG_NOTE,
682 "considering unrolling loop with runtime-"
683 "computable number of iterations\n");
ad5201d0 684
685 /* nunroll = total number of copies of the original loop body in
686 unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */
687 nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
688 nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
689 if (nunroll > nunroll_by_av)
690 nunroll = nunroll_by_av;
691 if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
692 nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
693
9ccaa774 694 if (targetm.loop_unroll_adjust)
695 nunroll = targetm.loop_unroll_adjust (nunroll, loop);
696
ba31ec12 697 if (loop->unroll > 0 && loop->unroll < USHRT_MAX)
2a09b28c 698 nunroll = loop->unroll;
699
ad5201d0 700 /* Skip big loops. */
701 if (nunroll <= 1)
702 {
450d042a 703 if (dump_file)
704 fprintf (dump_file, ";; Not considering loop, is too big\n");
ad5201d0 705 return;
706 }
707
708 /* Check for simple loops. */
f9cce2dc 709 desc = get_simple_loop_desc (loop);
ad5201d0 710
711 /* Check simpleness. */
f9cce2dc 712 if (!desc->simple_p || desc->assumptions)
ad5201d0 713 {
450d042a 714 if (dump_file)
715 fprintf (dump_file,
716 ";; Unable to prove that the number of iterations "
717 "can be counted in runtime\n");
ad5201d0 718 return;
719 }
720
f9cce2dc 721 if (desc->const_iter)
ad5201d0 722 {
450d042a 723 if (dump_file)
724 fprintf (dump_file, ";; Loop iterates constant times\n");
ad5201d0 725 return;
726 }
727
204a2e45 728 /* Check whether the loop rolls. */
f86b328b 729 if ((get_estimated_loop_iterations (loop, &iterations)
53b01069 730 || get_likely_max_loop_iterations (loop, &iterations))
796b6678 731 && wi::ltu_p (iterations, 2 * nunroll))
ad5201d0 732 {
450d042a 733 if (dump_file)
734 fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
ad5201d0 735 return;
736 }
737
2a09b28c 738 /* Success; now force nunroll to be power of 2, as code-gen
739 requires it, we are unable to cope with overflows in
740 computation of number of iterations. */
f9cce2dc 741 for (i = 1; 2 * i <= nunroll; i *= 2)
742 continue;
ad5201d0 743
744 loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
745 loop->lpt_decision.times = i - 1;
746}
747
d6a1bbdb 748/* Splits edge E and inserts the sequence of instructions INSNS on it, and
749 returns the newly created block. If INSNS is NULL_RTX, nothing is changed
750 and NULL is returned instead. */
88e6f696 751
752basic_block
222dc1d3 753split_edge_and_insert (edge e, rtx_insn *insns)
88e6f696 754{
d6a1bbdb 755 basic_block bb;
756
757 if (!insns)
758 return NULL;
48e1416a 759 bb = split_edge (e);
88e6f696 760 emit_insn_after (insns, BB_END (bb));
db1c50be 761
762 /* ??? We used to assume that INSNS can contain control flow insns, and
763 that we had to try to find sub basic blocks in BB to maintain a valid
764 CFG. For this purpose we used to set the BB_SUPERBLOCK flag on BB
765 and call break_superblocks when going out of cfglayout mode. But it
766 turns out that this never happens; and that if it does ever happen,
8b88439e 767 the verify_flow_info at the end of the RTL loop passes would fail.
db1c50be 768
769 There are two reasons why we expected we could have control flow insns
770 in INSNS. The first is when a comparison has to be done in parts, and
771 the second is when the number of iterations is computed for loops with
772 the number of iterations known at runtime. In both cases, test cases
773 to get control flow in INSNS appear to be impossible to construct:
774
775 * If do_compare_rtx_and_jump needs several branches to do comparison
776 in a mode that needs comparison by parts, we cannot analyze the
777 number of iterations of the loop, and we never get to unrolling it.
778
779 * The code in expand_divmod that was suspected to cause creation of
780 branching code seems to be only accessed for signed division. The
781 divisions used by # of iterations analysis are always unsigned.
782 Problems might arise on architectures that emits branching code
783 for some operations that may appear in the unroller (especially
784 for division), but we have no such architectures.
785
786 Considering all this, it was decided that we should for now assume
787 that INSNS can in theory contain control flow insns, but in practice
788 it never does. So we don't handle the theoretical case, and should
789 a real failure ever show up, we have a pretty good clue for how to
790 fix it. */
791
88e6f696 792 return bb;
793}
794
b99ab275 795/* Prepare a sequence comparing OP0 with OP1 using COMP and jumping to LABEL if
796 true, with probability PROB. If CINSN is not NULL, it is the insn to copy
797 in order to create a jump. */
798
222dc1d3 799static rtx_insn *
f9a00e9e 800compare_and_jump_seq (rtx op0, rtx op1, enum rtx_code comp,
720cfc43 801 rtx_code_label *label, profile_probability prob,
802 rtx_insn *cinsn)
b99ab275 803{
f9a00e9e 804 rtx_insn *seq;
805 rtx_jump_insn *jump;
222dc1d3 806 rtx cond;
3754d046 807 machine_mode mode;
b99ab275 808
809 mode = GET_MODE (op0);
810 if (mode == VOIDmode)
811 mode = GET_MODE (op1);
812
813 start_sequence ();
814 if (GET_MODE_CLASS (mode) == MODE_CC)
815 {
816 /* A hack -- there seems to be no easy generic way how to make a
817 conditional jump from a ccmode comparison. */
818 gcc_assert (cinsn);
819 cond = XEXP (SET_SRC (pc_set (cinsn)), 0);
820 gcc_assert (GET_CODE (cond) == comp);
821 gcc_assert (rtx_equal_p (op0, XEXP (cond, 0)));
822 gcc_assert (rtx_equal_p (op1, XEXP (cond, 1)));
823 emit_jump_insn (copy_insn (PATTERN (cinsn)));
f9a00e9e 824 jump = as_a <rtx_jump_insn *> (get_last_insn ());
b99ab275 825 JUMP_LABEL (jump) = JUMP_LABEL (cinsn);
826 LABEL_NUSES (JUMP_LABEL (jump))++;
827 redirect_jump (jump, label, 0);
828 }
829 else
830 {
831 gcc_assert (!cinsn);
832
833 op0 = force_operand (op0, NULL_RTX);
834 op1 = force_operand (op1, NULL_RTX);
835 do_compare_rtx_and_jump (op0, op1, comp, 0,
720cfc43 836 mode, NULL_RTX, NULL, label,
837 profile_probability::uninitialized ());
f9a00e9e 838 jump = as_a <rtx_jump_insn *> (get_last_insn ());
839 jump->set_jump_target (label);
b99ab275 840 LABEL_NUSES (label)++;
841 }
720cfc43 842 if (prob.initialized_p ())
61cb1816 843 add_reg_br_prob_note (jump, prob);
b99ab275 844
845 seq = get_insns ();
846 end_sequence ();
847
848 return seq;
849}
850
2a09b28c 851/* Unroll LOOP for which we are able to count number of iterations in
852 runtime LOOP->LPT_DECISION.TIMES times. The times value must be a
853 power of two. The transformation does this (with some extra care
854 for case n < 0):
3ad4992f 855
ad5201d0 856 for (i = 0; i < n; i++)
857 body;
3ad4992f 858
1c18ed19 859 ==> (LOOP->LPT_DECISION.TIMES == 3)
3ad4992f 860
ad5201d0 861 i = 0;
862 mod = n % 4;
3ad4992f 863
ad5201d0 864 switch (mod)
865 {
866 case 3:
867 body; i++;
868 case 2:
869 body; i++;
870 case 1:
871 body; i++;
872 case 0: ;
873 }
3ad4992f 874
ad5201d0 875 while (i < n)
876 {
877 body; i++;
878 body; i++;
879 body; i++;
880 body; i++;
881 }
882 */
883static void
2e966e2a 884unroll_loop_runtime_iterations (class loop *loop)
ad5201d0 885{
222dc1d3 886 rtx old_niter, niter, tmp;
887 rtx_insn *init_code, *branch_code;
720cfc43 888 unsigned i, j;
889 profile_probability p;
bf866763 890 basic_block preheader, *body, swtch, ezc_swtch = NULL;
205ce1aa 891 int may_exit_copy;
db9cef39 892 profile_count iter_count, new_count;
f3c40e6d 893 unsigned n_peel;
f3c40e6d 894 edge e;
ad5201d0 895 bool extra_zero_check, last_may_exit;
896 unsigned max_unroll = loop->lpt_decision.times;
2e966e2a 897 class niter_desc *desc = get_simple_loop_desc (loop);
f9cce2dc 898 bool exit_at_end = loop_exit_at_end_p (loop);
375bb675 899 struct opt_info *opt_info = NULL;
a53ff4c1 900 bool ok;
48e1416a 901
375bb675 902 if (flag_split_ivs_in_unroller
903 || flag_variable_expansion_in_unroller)
904 opt_info = analyze_insns_in_loop (loop);
48e1416a 905
ad5201d0 906 /* Remember blocks whose dominators will have to be updated. */
c2078b80 907 auto_vec<basic_block> dom_bbs;
ad5201d0 908
909 body = get_loop_body (loop);
910 for (i = 0; i < loop->num_nodes; i++)
911 {
f1f41a6c 912 vec<basic_block> ldom;
3f9439d7 913 basic_block bb;
ad5201d0 914
3f9439d7 915 ldom = get_dominated_by (CDI_DOMINATORS, body[i]);
f1f41a6c 916 FOR_EACH_VEC_ELT (ldom, j, bb)
3f9439d7 917 if (!flow_bb_inside_loop_p (loop, bb))
f1f41a6c 918 dom_bbs.safe_push (bb);
ad5201d0 919
f1f41a6c 920 ldom.release ();
ad5201d0 921 }
922 free (body);
923
f9cce2dc 924 if (!exit_at_end)
ad5201d0 925 {
926 /* Leave exit in first copy (for explanation why see comment in
927 unroll_loop_constant_iterations). */
928 may_exit_copy = 0;
929 n_peel = max_unroll - 1;
930 extra_zero_check = true;
931 last_may_exit = false;
932 }
933 else
934 {
935 /* Leave exit in last copy (for explanation why see comment in
936 unroll_loop_constant_iterations). */
937 may_exit_copy = max_unroll;
938 n_peel = max_unroll;
939 extra_zero_check = false;
940 last_may_exit = true;
941 }
942
943 /* Get expression for number of iterations. */
944 start_sequence ();
f9cce2dc 945 old_niter = niter = gen_reg_rtx (desc->mode);
946 tmp = force_operand (copy_rtx (desc->niter_expr), niter);
947 if (tmp != niter)
948 emit_move_insn (niter, tmp);
ad5201d0 949
b851b8f0 950 /* For loops that exit at end and whose number of iterations is reliable,
951 add one to niter to account for first pass through loop body before
952 reaching exit test. */
953 if (exit_at_end && !desc->noloop_assumptions)
bf866763 954 {
955 niter = expand_simple_binop (desc->mode, PLUS,
956 niter, const1_rtx,
957 NULL_RTX, 0, OPTAB_LIB_WIDEN);
958 old_niter = niter;
959 }
960
ad5201d0 961 /* Count modulo by ANDing it with max_unroll; we use the fact that
962 the number of unrollings is a power of two, and thus this is correct
963 even if there is overflow in the computation. */
f9cce2dc 964 niter = expand_simple_binop (desc->mode, AND,
0359f9f5 965 niter, gen_int_mode (max_unroll, desc->mode),
ad5201d0 966 NULL_RTX, 0, OPTAB_LIB_WIDEN);
967
968 init_code = get_insns ();
969 end_sequence ();
ffbd194b 970 unshare_all_rtl_in_chain (init_code);
ad5201d0 971
972 /* Precondition the loop. */
88e6f696 973 split_edge_and_insert (loop_preheader_edge (loop), init_code);
ad5201d0 974
c2078b80 975 auto_vec<edge> remove_edges;
ad5201d0 976
3c6549f8 977 auto_sbitmap wont_exit (max_unroll + 2);
ad5201d0 978
b851b8f0 979 if (extra_zero_check || desc->noloop_assumptions)
bf866763 980 {
981 /* Peel the first copy of loop body. Leave the exit test if the number
982 of iterations is not reliable. Also record the place of the extra zero
983 check. */
984 bitmap_clear (wont_exit);
985 if (!desc->noloop_assumptions)
986 bitmap_set_bit (wont_exit, 1);
987 ezc_swtch = loop_preheader_edge (loop)->src;
988 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
989 1, wont_exit, desc->out_edge,
990 &remove_edges,
991 DLTHE_FLAG_UPDATE_FREQ);
992 gcc_assert (ok);
993 }
ad5201d0 994
995 /* Record the place where switch will be built for preconditioning. */
88e6f696 996 swtch = split_edge (loop_preheader_edge (loop));
ad5201d0 997
205ce1aa 998 /* Compute count increments for each switch block and initialize
305db1a8 999 innermost switch block. Switch blocks and peeled loop copies are built
1000 from innermost outward. */
db9cef39 1001 iter_count = new_count = swtch->count.apply_scale (1, max_unroll + 1);
305db1a8 1002 swtch->count = new_count;
305db1a8 1003
ad5201d0 1004 for (i = 0; i < n_peel; i++)
1005 {
1006 /* Peel the copy. */
53c5d9d4 1007 bitmap_clear (wont_exit);
ad5201d0 1008 if (i != n_peel - 1 || !last_may_exit)
08b7917c 1009 bitmap_set_bit (wont_exit, 1);
a53ff4c1 1010 ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
7194de72 1011 1, wont_exit, desc->out_edge,
f3c40e6d 1012 &remove_edges,
a53ff4c1 1013 DLTHE_FLAG_UPDATE_FREQ);
1014 gcc_assert (ok);
ad5201d0 1015
c87a3eff 1016 /* Create item for switch. */
1017 j = n_peel - i - (extra_zero_check ? 0 : 1);
720cfc43 1018 p = profile_probability::always ().apply_scale (1, i + 2);
c87a3eff 1019
88e6f696 1020 preheader = split_edge (loop_preheader_edge (loop));
205ce1aa 1021 /* Add in count of edge from switch block. */
305db1a8 1022 preheader->count += iter_count;
567e4ed6 1023 branch_code = compare_and_jump_seq (copy_rtx (niter),
1024 gen_int_mode (j, desc->mode), EQ,
1025 block_label (preheader), p, NULL);
c87a3eff 1026
d6a1bbdb 1027 /* We rely on the fact that the compare and jump cannot be optimized out,
1028 and hence the cfg we create is correct. */
1029 gcc_assert (branch_code != NULL_RTX);
1030
88e6f696 1031 swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
0051c76a 1032 set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
720cfc43 1033 single_succ_edge (swtch)->probability = p.invert ();
305db1a8 1034 new_count += iter_count;
305db1a8 1035 swtch->count = new_count;
c87a3eff 1036 e = make_edge (swtch, preheader,
ea091dfd 1037 single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
c87a3eff 1038 e->probability = p;
ad5201d0 1039 }
1040
1041 if (extra_zero_check)
1042 {
1043 /* Add branch for zero iterations. */
720cfc43 1044 p = profile_probability::always ().apply_scale (1, max_unroll + 1);
ad5201d0 1045 swtch = ezc_swtch;
88e6f696 1046 preheader = split_edge (loop_preheader_edge (loop));
205ce1aa 1047 /* Recompute count adjustments since initial peel copy may
305db1a8 1048 have exited and reduced those values that were computed above. */
db9cef39 1049 iter_count = swtch->count.apply_scale (1, max_unroll + 1);
205ce1aa 1050 /* Add in count of edge from switch block. */
305db1a8 1051 preheader->count += iter_count;
f9cce2dc 1052 branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
a53ff4c1 1053 block_label (preheader), p,
222dc1d3 1054 NULL);
d6a1bbdb 1055 gcc_assert (branch_code != NULL_RTX);
ad5201d0 1056
88e6f696 1057 swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
0051c76a 1058 set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
720cfc43 1059 single_succ_edge (swtch)->probability = p.invert ();
a964683a 1060 e = make_edge (swtch, preheader,
ea091dfd 1061 single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
ad5201d0 1062 e->probability = p;
1063 }
1064
1065 /* Recount dominators for outer blocks. */
3f9439d7 1066 iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);
ad5201d0 1067
1068 /* And unroll loop. */
1069
53c5d9d4 1070 bitmap_ones (wont_exit);
08b7917c 1071 bitmap_clear_bit (wont_exit, may_exit_copy);
375bb675 1072 opt_info_start_duplication (opt_info);
48e1416a 1073
a53ff4c1 1074 ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
7194de72 1075 max_unroll,
a53ff4c1 1076 wont_exit, desc->out_edge,
f3c40e6d 1077 &remove_edges,
75cc36a4 1078 DLTHE_FLAG_UPDATE_FREQ
1079 | (opt_info
1080 ? DLTHE_RECORD_COPY_NUMBER
1081 : 0));
a53ff4c1 1082 gcc_assert (ok);
48e1416a 1083
375bb675 1084 if (opt_info)
a9989fb4 1085 {
375bb675 1086 apply_opt_in_copies (opt_info, max_unroll, true, true);
1087 free_opt_info (opt_info);
a9989fb4 1088 }
1089
f9cce2dc 1090 if (exit_at_end)
1091 {
01020a5f 1092 basic_block exit_block = get_bb_copy (desc->in_edge->src);
a53ff4c1 1093 /* Find a new in and out edge; they are in the last copy we have
1094 made. */
48e1416a 1095
cd665a06 1096 if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
f9cce2dc 1097 {
cd665a06 1098 desc->out_edge = EDGE_SUCC (exit_block, 0);
1099 desc->in_edge = EDGE_SUCC (exit_block, 1);
f9cce2dc 1100 }
1101 else
1102 {
cd665a06 1103 desc->out_edge = EDGE_SUCC (exit_block, 1);
1104 desc->in_edge = EDGE_SUCC (exit_block, 0);
f9cce2dc 1105 }
1106 }
1107
ad5201d0 1108 /* Remove the edges. */
f1f41a6c 1109 FOR_EACH_VEC_ELT (remove_edges, i, e)
f3c40e6d 1110 remove_path (e);
ad5201d0 1111
f9cce2dc 1112 /* We must be careful when updating the number of iterations due to
1113 preconditioning and the fact that the value must be valid at entry
1114 of the loop. After passing through the above code, we see that
1115 the correct new number of iterations is this: */
a9989fb4 1116 gcc_assert (!desc->const_iter);
f9cce2dc 1117 desc->niter_expr =
a53ff4c1 1118 simplify_gen_binary (UDIV, desc->mode, old_niter,
5d5ee71f 1119 gen_int_mode (max_unroll + 1, desc->mode));
a9ef9877 1120 loop->nb_iterations_upper_bound
796b6678 1121 = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1);
a9ef9877 1122 if (loop->any_estimate)
1123 loop->nb_iterations_estimate
796b6678 1124 = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1);
8e3ffe30 1125 if (loop->any_likely_upper_bound)
1126 loop->nb_iterations_likely_upper_bound
1127 = wi::udiv_trunc (loop->nb_iterations_likely_upper_bound, max_unroll + 1);
f9cce2dc 1128 if (exit_at_end)
1129 {
1130 desc->niter_expr =
1131 simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
1132 desc->noloop_assumptions = NULL_RTX;
a9ef9877 1133 --loop->nb_iterations_upper_bound;
1134 if (loop->any_estimate
e913b5cd 1135 && loop->nb_iterations_estimate != 0)
a9ef9877 1136 --loop->nb_iterations_estimate;
1137 else
1138 loop->any_estimate = false;
8e3ffe30 1139 if (loop->any_likely_upper_bound
1140 && loop->nb_iterations_likely_upper_bound != 0)
1141 --loop->nb_iterations_likely_upper_bound;
1142 else
1143 loop->any_likely_upper_bound = false;
f9cce2dc 1144 }
1145
450d042a 1146 if (dump_file)
1147 fprintf (dump_file,
1148 ";; Unrolled loop %d times, counting # of iterations "
1149 "in runtime, %i insns\n",
ad5201d0 1150 max_unroll, num_loop_insns (loop));
1151}
3ad4992f 1152
ad5201d0 1153/* Decide whether to unroll LOOP stupidly and how much. */
1154static void
2e966e2a 1155decide_unroll_stupid (class loop *loop, int flags)
ad5201d0 1156{
1157 unsigned nunroll, nunroll_by_av, i;
2e966e2a 1158 class niter_desc *desc;
5de9d3ed 1159 widest_int iterations;
ad5201d0 1160
2a09b28c 1161 /* If we were not asked to unroll this loop, just return back silently. */
1162 if (!(flags & UAP_UNROLL_ALL) && !loop->unroll)
1163 return;
ad5201d0 1164
2a09b28c 1165 if (dump_enabled_p ())
1166 dump_printf (MSG_NOTE, "considering unrolling loop stupidly\n");
ad5201d0 1167
1168 /* nunroll = total number of copies of the original loop body in
1169 unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */
1170 nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
450d042a 1171 nunroll_by_av
1172 = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
ad5201d0 1173 if (nunroll > nunroll_by_av)
1174 nunroll = nunroll_by_av;
1175 if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
1176 nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
1177
9ccaa774 1178 if (targetm.loop_unroll_adjust)
1179 nunroll = targetm.loop_unroll_adjust (nunroll, loop);
1180
ba31ec12 1181 if (loop->unroll > 0 && loop->unroll < USHRT_MAX)
2a09b28c 1182 nunroll = loop->unroll;
1183
ad5201d0 1184 /* Skip big loops. */
1185 if (nunroll <= 1)
1186 {
450d042a 1187 if (dump_file)
1188 fprintf (dump_file, ";; Not considering loop, is too big\n");
ad5201d0 1189 return;
1190 }
1191
1192 /* Check for simple loops. */
f9cce2dc 1193 desc = get_simple_loop_desc (loop);
ad5201d0 1194
1195 /* Check simpleness. */
f9cce2dc 1196 if (desc->simple_p && !desc->assumptions)
ad5201d0 1197 {
450d042a 1198 if (dump_file)
2a09b28c 1199 fprintf (dump_file, ";; Loop is simple\n");
ad5201d0 1200 return;
1201 }
1202
1203 /* Do not unroll loops with branches inside -- it increases number
d9459f6b 1204 of mispredicts.
1205 TODO: this heuristic needs tunning; call inside the loop body
1206 is also relatively good reason to not unroll. */
f9cce2dc 1207 if (num_loop_branches (loop) > 1)
ad5201d0 1208 {
450d042a 1209 if (dump_file)
1210 fprintf (dump_file, ";; Not unrolling, contains branches\n");
ad5201d0 1211 return;
1212 }
1213
204a2e45 1214 /* Check whether the loop rolls. */
f86b328b 1215 if ((get_estimated_loop_iterations (loop, &iterations)
53b01069 1216 || get_likely_max_loop_iterations (loop, &iterations))
796b6678 1217 && wi::ltu_p (iterations, 2 * nunroll))
ad5201d0 1218 {
450d042a 1219 if (dump_file)
1220 fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
ad5201d0 1221 return;
1222 }
1223
1224 /* Success. Now force nunroll to be power of 2, as it seems that this
d01481af 1225 improves results (partially because of better alignments, partially
ad5201d0 1226 because of some dark magic). */
f9cce2dc 1227 for (i = 1; 2 * i <= nunroll; i *= 2)
1228 continue;
ad5201d0 1229
1230 loop->lpt_decision.decision = LPT_UNROLL_STUPID;
1231 loop->lpt_decision.times = i - 1;
1232}
1233
1c18ed19 1234/* Unroll a LOOP LOOP->LPT_DECISION.TIMES times. The transformation does this:
1235
ad5201d0 1236 while (cond)
1237 body;
1238
1c18ed19 1239 ==> (LOOP->LPT_DECISION.TIMES == 3)
ad5201d0 1240
1241 while (cond)
1242 {
1243 body;
1244 if (!cond) break;
1245 body;
1246 if (!cond) break;
1247 body;
1248 if (!cond) break;
1249 body;
1250 }
1251 */
1252static void
2e966e2a 1253unroll_loop_stupid (class loop *loop)
ad5201d0 1254{
ad5201d0 1255 unsigned nunroll = loop->lpt_decision.times;
2e966e2a 1256 class niter_desc *desc = get_simple_loop_desc (loop);
375bb675 1257 struct opt_info *opt_info = NULL;
a53ff4c1 1258 bool ok;
48e1416a 1259
375bb675 1260 if (flag_split_ivs_in_unroller
1261 || flag_variable_expansion_in_unroller)
1262 opt_info = analyze_insns_in_loop (loop);
48e1416a 1263
3c6549f8 1264 auto_sbitmap wont_exit (nunroll + 1);
53c5d9d4 1265 bitmap_clear (wont_exit);
375bb675 1266 opt_info_start_duplication (opt_info);
48e1416a 1267
a53ff4c1 1268 ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
7194de72 1269 nunroll, wont_exit,
f3c40e6d 1270 NULL, NULL,
75cc36a4 1271 DLTHE_FLAG_UPDATE_FREQ
1272 | (opt_info
1273 ? DLTHE_RECORD_COPY_NUMBER
1274 : 0));
a53ff4c1 1275 gcc_assert (ok);
48e1416a 1276
375bb675 1277 if (opt_info)
a9989fb4 1278 {
375bb675 1279 apply_opt_in_copies (opt_info, nunroll, true, true);
1280 free_opt_info (opt_info);
a9989fb4 1281 }
1282
f9cce2dc 1283 if (desc->simple_p)
1284 {
1285 /* We indeed may get here provided that there are nontrivial assumptions
1286 for a loop to be really simple. We could update the counts, but the
1287 problem is that we are unable to decide which exit will be taken
1288 (not really true in case the number of iterations is constant,
75de4aa2 1289 but no one will do anything with this information, so we do not
f9cce2dc 1290 worry about it). */
1291 desc->simple_p = false;
1292 }
1293
450d042a 1294 if (dump_file)
1295 fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
ad5201d0 1296 nunroll, num_loop_insns (loop));
1297}
a9989fb4 1298
3b3940d7 1299/* Returns true if REG is referenced in one nondebug insn in LOOP.
1300 Set *DEBUG_USES to the number of debug insns that reference the
1301 variable. */
375bb675 1302
f7b51129 1303static bool
2e966e2a 1304referenced_in_one_insn_in_loop_p (class loop *loop, rtx reg,
3b3940d7 1305 int *debug_uses)
375bb675 1306{
1307 basic_block *body, bb;
1308 unsigned i;
1309 int count_ref = 0;
222dc1d3 1310 rtx_insn *insn;
48e1416a 1311
1312 body = get_loop_body (loop);
375bb675 1313 for (i = 0; i < loop->num_nodes; i++)
1314 {
1315 bb = body[i];
48e1416a 1316
375bb675 1317 FOR_BB_INSNS (bb, insn)
3b3940d7 1318 if (!rtx_referenced_p (reg, insn))
1319 continue;
1320 else if (DEBUG_INSN_P (insn))
1321 ++*debug_uses;
1322 else if (++count_ref > 1)
1323 break;
375bb675 1324 }
3b3940d7 1325 free (body);
375bb675 1326 return (count_ref == 1);
1327}
1328
3b3940d7 1329/* Reset the DEBUG_USES debug insns in LOOP that reference REG. */
1330
1331static void
2e966e2a 1332reset_debug_uses_in_loop (class loop *loop, rtx reg, int debug_uses)
3b3940d7 1333{
1334 basic_block *body, bb;
1335 unsigned i;
222dc1d3 1336 rtx_insn *insn;
3b3940d7 1337
1338 body = get_loop_body (loop);
1339 for (i = 0; debug_uses && i < loop->num_nodes; i++)
1340 {
1341 bb = body[i];
1342
1343 FOR_BB_INSNS (bb, insn)
1344 if (!DEBUG_INSN_P (insn) || !rtx_referenced_p (reg, insn))
1345 continue;
1346 else
1347 {
1348 validate_change (insn, &INSN_VAR_LOCATION_LOC (insn),
1349 gen_rtx_UNKNOWN_VAR_LOC (), 0);
1350 if (!--debug_uses)
1351 break;
1352 }
1353 }
1354 free (body);
1355}
1356
375bb675 1357/* Determine whether INSN contains an accumulator
48e1416a 1358 which can be expanded into separate copies,
375bb675 1359 one for each copy of the LOOP body.
48e1416a 1360
375bb675 1361 for (i = 0 ; i < n; i++)
1362 sum += a[i];
48e1416a 1363
375bb675 1364 ==>
48e1416a 1365
375bb675 1366 sum += a[i]
1367 ....
1368 i = i+1;
1369 sum1 += a[i]
1370 ....
1371 i = i+1
1372 sum2 += a[i];
1373 ....
1374
48e1416a 1375 Return NULL if INSN contains no opportunity for expansion of accumulator.
1376 Otherwise, allocate a VAR_TO_EXPAND structure, fill it with the relevant
375bb675 1377 information and return a pointer to it.
1378*/
1379
1380static struct var_to_expand *
2e966e2a 1381analyze_insn_to_expand_var (class loop *loop, rtx_insn *insn)
375bb675 1382{
7497b98c 1383 rtx set, dest, src;
375bb675 1384 struct var_to_expand *ves;
0def560a 1385 unsigned accum_pos;
7497b98c 1386 enum rtx_code code;
3b3940d7 1387 int debug_uses = 0;
0def560a 1388
375bb675 1389 set = single_set (insn);
1390 if (!set)
1391 return NULL;
48e1416a 1392
375bb675 1393 dest = SET_DEST (set);
1394 src = SET_SRC (set);
7497b98c 1395 code = GET_CODE (src);
48e1416a 1396
7497b98c 1397 if (code != PLUS && code != MINUS && code != MULT && code != FMA)
375bb675 1398 return NULL;
1c6304f6 1399
7497b98c 1400 if (FLOAT_MODE_P (GET_MODE (dest)))
1401 {
1402 if (!flag_associative_math)
1403 return NULL;
1404 /* In the case of FMA, we're also changing the rounding. */
1405 if (code == FMA && !flag_unsafe_math_optimizations)
1406 return NULL;
1407 }
1408
1c6304f6 1409 /* Hmm, this is a bit paradoxical. We know that INSN is a valid insn
f4d3c071 1410 in MD. But if there is no optab to generate the insn, we cannot
1c6304f6 1411 perform the variable expansion. This can happen if an MD provides
1412 an insn but not a named pattern to generate it, for example to avoid
1413 producing code that needs additional mode switches like for x87/mmx.
1414
1415 So we check have_insn_for which looks for an optab for the operation
1416 in SRC. If it doesn't exist, we can't perform the expansion even
1417 though INSN is valid. */
7497b98c 1418 if (!have_insn_for (code, GET_MODE (src)))
1c6304f6 1419 return NULL;
1420
375bb675 1421 if (!REG_P (dest)
1422 && !(GET_CODE (dest) == SUBREG
1423 && REG_P (SUBREG_REG (dest))))
1424 return NULL;
48e1416a 1425
7497b98c 1426 /* Find the accumulator use within the operation. */
1427 if (code == FMA)
1428 {
1429 /* We only support accumulation via FMA in the ADD position. */
1430 if (!rtx_equal_p (dest, XEXP (src, 2)))
1431 return NULL;
1432 accum_pos = 2;
1433 }
1434 else if (rtx_equal_p (dest, XEXP (src, 0)))
0def560a 1435 accum_pos = 0;
7497b98c 1436 else if (rtx_equal_p (dest, XEXP (src, 1)))
1437 {
1438 /* The method of expansion that we are using; which includes the
1439 initialization of the expansions with zero and the summation of
1440 the expansions at the end of the computation will yield wrong
1441 results for (x = something - x) thus avoid using it in that case. */
1442 if (code == MINUS)
1443 return NULL;
1444 accum_pos = 1;
1445 }
0def560a 1446 else
1447 return NULL;
1448
7497b98c 1449 /* It must not otherwise be used. */
1450 if (code == FMA)
1451 {
1452 if (rtx_referenced_p (dest, XEXP (src, 0))
1453 || rtx_referenced_p (dest, XEXP (src, 1)))
1454 return NULL;
1455 }
1456 else if (rtx_referenced_p (dest, XEXP (src, 1 - accum_pos)))
375bb675 1457 return NULL;
48e1416a 1458
7497b98c 1459 /* It must be used in exactly one insn. */
3b3940d7 1460 if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses))
375bb675 1461 return NULL;
48e1416a 1462
e24a805c 1463 if (dump_file)
7497b98c 1464 {
1465 fprintf (dump_file, "\n;; Expanding Accumulator ");
1466 print_rtl (dump_file, dest);
1467 fprintf (dump_file, "\n");
1468 }
e24a805c 1469
3b3940d7 1470 if (debug_uses)
1471 /* Instead of resetting the debug insns, we could replace each
1472 debug use in the loop with the sum or product of all expanded
2fbe7a32 1473 accumulators. Since we'll only know of all expansions at the
3b3940d7 1474 end, we'd have to keep track of which vars_to_expand a debug
1475 insn in the loop references, take note of each copy of the
1476 debug insn during unrolling, and when it's all done, compute
1477 the sum or product of each variable and adjust the original
1478 debug insn and each copy thereof. What a pain! */
1479 reset_debug_uses_in_loop (loop, dest, debug_uses);
1480
375bb675 1481 /* Record the accumulator to expand. */
4c36ffe6 1482 ves = XNEW (struct var_to_expand);
375bb675 1483 ves->insn = insn;
375bb675 1484 ves->reg = copy_rtx (dest);
f1f41a6c 1485 ves->var_expansions.create (1);
b635a1a3 1486 ves->next = NULL;
375bb675 1487 ves->op = GET_CODE (src);
1488 ves->expansion_count = 0;
1489 ves->reuse_expansion = 0;
48e1416a 1490 return ves;
375bb675 1491}
1492
a9989fb4 1493/* Determine whether there is an induction variable in INSN that
48e1416a 1494 we would like to split during unrolling.
375bb675 1495
1496 I.e. replace
1497
1498 i = i + 1;
1499 ...
1500 i = i + 1;
1501 ...
1502 i = i + 1;
1503 ...
1504
1505 type chains by
1506
1507 i0 = i + 1
1508 ...
1509 i = i0 + 1
1510 ...
1511 i = i0 + 2
1512 ...
1513
48e1416a 1514 Return NULL if INSN contains no interesting IVs. Otherwise, allocate
375bb675 1515 an IV_TO_SPLIT structure, fill it with the relevant information and return a
a9989fb4 1516 pointer to it. */
1517
1518static struct iv_to_split *
3eeb4f9a 1519analyze_iv_to_split_insn (rtx_insn *insn)
a9989fb4 1520{
1521 rtx set, dest;
2e966e2a 1522 class rtx_iv iv;
a9989fb4 1523 struct iv_to_split *ivts;
601e68ad 1524 scalar_int_mode mode;
a53ff4c1 1525 bool ok;
a9989fb4 1526
1527 /* For now we just split the basic induction variables. Later this may be
1528 extended for example by selecting also addresses of memory references. */
1529 set = single_set (insn);
1530 if (!set)
1531 return NULL;
1532
1533 dest = SET_DEST (set);
601e68ad 1534 if (!REG_P (dest) || !is_a <scalar_int_mode> (GET_MODE (dest), &mode))
a9989fb4 1535 return NULL;
1536
601e68ad 1537 if (!biv_p (insn, mode, dest))
a9989fb4 1538 return NULL;
1539
3f37d465 1540 ok = iv_analyze_result (insn, dest, &iv);
e8820970 1541
1542 /* This used to be an assert under the assumption that if biv_p returns
1543 true that iv_analyze_result must also return true. However, that
1544 assumption is not strictly correct as evidenced by pr25569.
1545
1546 Returning NULL when iv_analyze_result returns false is safe and
1547 avoids the problems in pr25569 until the iv_analyze_* routines
1548 can be fixed, which is apparently hard and time consuming
1549 according to their author. */
1550 if (! ok)
1551 return NULL;
a9989fb4 1552
1553 if (iv.step == const0_rtx
1554 || iv.mode != iv.extend_mode)
1555 return NULL;
1556
1557 /* Record the insn to split. */
4c36ffe6 1558 ivts = XNEW (struct iv_to_split);
a9989fb4 1559 ivts->insn = insn;
ef770610 1560 ivts->orig_var = dest;
a9989fb4 1561 ivts->base_var = NULL_RTX;
1562 ivts->step = iv.step;
b635a1a3 1563 ivts->next = NULL;
48e1416a 1564
a9989fb4 1565 return ivts;
1566}
1567
375bb675 1568/* Determines which of insns in LOOP can be optimized.
1569 Return a OPT_INFO struct with the relevant hash tables filled
1570 with all insns to be optimized. The FIRST_NEW_BLOCK field
a9989fb4 1571 is undefined for the return value. */
1572
375bb675 1573static struct opt_info *
2e966e2a 1574analyze_insns_in_loop (class loop *loop)
a9989fb4 1575{
1576 basic_block *body, bb;
749ea85f 1577 unsigned i;
4c36ffe6 1578 struct opt_info *opt_info = XCNEW (struct opt_info);
3eeb4f9a 1579 rtx_insn *insn;
375bb675 1580 struct iv_to_split *ivts = NULL;
1581 struct var_to_expand *ves = NULL;
d9dd21a8 1582 iv_to_split **slot1;
1583 var_to_expand **slot2;
f1f41a6c 1584 vec<edge> edges = get_loop_exit_edges (loop);
749ea85f 1585 edge exit;
375bb675 1586 bool can_apply = false;
48e1416a 1587
a9989fb4 1588 iv_analysis_loop_init (loop);
1589
1590 body = get_loop_body (loop);
375bb675 1591
1592 if (flag_split_ivs_in_unroller)
b635a1a3 1593 {
c1f445d2 1594 opt_info->insns_to_split
1595 = new hash_table<iv_split_hasher> (5 * loop->num_nodes);
b635a1a3 1596 opt_info->iv_to_split_head = NULL;
1597 opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
1598 }
48e1416a 1599
375bb675 1600 /* Record the loop exit bb and loop preheader before the unrolling. */
88e6f696 1601 opt_info->loop_preheader = loop_preheader_edge (loop)->src;
48e1416a 1602
f1f41a6c 1603 if (edges.length () == 1)
375bb675 1604 {
f1f41a6c 1605 exit = edges[0];
749ea85f 1606 if (!(exit->flags & EDGE_COMPLEX))
1607 {
1608 opt_info->loop_exit = split_edge (exit);
1609 can_apply = true;
1610 }
375bb675 1611 }
48e1416a 1612
375bb675 1613 if (flag_variable_expansion_in_unroller
1614 && can_apply)
b635a1a3 1615 {
c1f445d2 1616 opt_info->insns_with_var_to_expand
1617 = new hash_table<var_expand_hasher> (5 * loop->num_nodes);
b635a1a3 1618 opt_info->var_to_expand_head = NULL;
1619 opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
1620 }
48e1416a 1621
a9989fb4 1622 for (i = 0; i < loop->num_nodes; i++)
1623 {
1624 bb = body[i];
1625 if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
1626 continue;
1627
1628 FOR_BB_INSNS (bb, insn)
375bb675 1629 {
1630 if (!INSN_P (insn))
1631 continue;
48e1416a 1632
c1f445d2 1633 if (opt_info->insns_to_split)
375bb675 1634 ivts = analyze_iv_to_split_insn (insn);
48e1416a 1635
375bb675 1636 if (ivts)
1637 {
c1f445d2 1638 slot1 = opt_info->insns_to_split->find_slot (ivts, INSERT);
b635a1a3 1639 gcc_assert (*slot1 == NULL);
375bb675 1640 *slot1 = ivts;
b635a1a3 1641 *opt_info->iv_to_split_tail = ivts;
1642 opt_info->iv_to_split_tail = &ivts->next;
375bb675 1643 continue;
1644 }
48e1416a 1645
c1f445d2 1646 if (opt_info->insns_with_var_to_expand)
375bb675 1647 ves = analyze_insn_to_expand_var (loop, insn);
48e1416a 1648
375bb675 1649 if (ves)
1650 {
c1f445d2 1651 slot2 = opt_info->insns_with_var_to_expand->find_slot (ves, INSERT);
b635a1a3 1652 gcc_assert (*slot2 == NULL);
375bb675 1653 *slot2 = ves;
b635a1a3 1654 *opt_info->var_to_expand_tail = ves;
1655 opt_info->var_to_expand_tail = &ves->next;
375bb675 1656 }
1657 }
a9989fb4 1658 }
48e1416a 1659
f1f41a6c 1660 edges.release ();
a9989fb4 1661 free (body);
375bb675 1662 return opt_info;
a9989fb4 1663}
1664
1665/* Called just before loop duplication. Records start of duplicated area
375bb675 1666 to OPT_INFO. */
a9989fb4 1667
48e1416a 1668static void
375bb675 1669opt_info_start_duplication (struct opt_info *opt_info)
a9989fb4 1670{
375bb675 1671 if (opt_info)
fe672ac0 1672 opt_info->first_new_block = last_basic_block_for_fn (cfun);
a9989fb4 1673}
1674
1675/* Determine the number of iterations between initialization of the base
1676 variable and the current copy (N_COPY). N_COPIES is the total number
1677 of newly created copies. UNROLLING is true if we are unrolling
1678 (not peeling) the loop. */
1679
1680static unsigned
1681determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
1682{
1683 if (unrolling)
1684 {
1685 /* If we are unrolling, initialization is done in the original loop
1686 body (number 0). */
1687 return n_copy;
1688 }
1689 else
1690 {
1691 /* If we are peeling, the copy in that the initialization occurs has
1692 number 1. The original loop (number 0) is the last. */
1693 if (n_copy)
1694 return n_copy - 1;
1695 else
1696 return n_copies;
1697 }
1698}
1699
b635a1a3 1700/* Allocate basic variable for the induction variable chain. */
a9989fb4 1701
b635a1a3 1702static void
1703allocate_basic_variable (struct iv_to_split *ivts)
a9989fb4 1704{
686fd44c 1705 rtx expr = SET_SRC (single_set (ivts->insn));
a9989fb4 1706
1707 ivts->base_var = gen_reg_rtx (GET_MODE (expr));
a9989fb4 1708}
1709
1710/* Insert initialization of basic variable of IVTS before INSN, taking
1711 the initial value from INSN. */
1712
1713static void
222dc1d3 1714insert_base_initialization (struct iv_to_split *ivts, rtx_insn *insn)
a9989fb4 1715{
686fd44c 1716 rtx expr = copy_rtx (SET_SRC (single_set (insn)));
222dc1d3 1717 rtx_insn *seq;
a9989fb4 1718
1719 start_sequence ();
1720 expr = force_operand (expr, ivts->base_var);
1721 if (expr != ivts->base_var)
1722 emit_move_insn (ivts->base_var, expr);
1723 seq = get_insns ();
1724 end_sequence ();
1725
1726 emit_insn_before (seq, insn);
1727}
1728
1729/* Replace the use of induction variable described in IVTS in INSN
1730 by base variable + DELTA * step. */
1731
1732static void
222dc1d3 1733split_iv (struct iv_to_split *ivts, rtx_insn *insn, unsigned delta)
a9989fb4 1734{
222dc1d3 1735 rtx expr, *loc, incr, var;
1736 rtx_insn *seq;
3754d046 1737 machine_mode mode = GET_MODE (ivts->base_var);
a9989fb4 1738 rtx src, dest, set;
1739
1740 /* Construct base + DELTA * step. */
1741 if (!delta)
1742 expr = ivts->base_var;
1743 else
1744 {
1745 incr = simplify_gen_binary (MULT, mode,
60ca8f8e 1746 copy_rtx (ivts->step),
1747 gen_int_mode (delta, mode));
a9989fb4 1748 expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
1749 ivts->base_var, incr);
1750 }
1751
1752 /* Figure out where to do the replacement. */
686fd44c 1753 loc = &SET_SRC (single_set (insn));
a9989fb4 1754
1755 /* If we can make the replacement right away, we're done. */
1756 if (validate_change (insn, loc, expr, 0))
1757 return;
1758
1759 /* Otherwise, force EXPR into a register and try again. */
1760 start_sequence ();
1761 var = gen_reg_rtx (mode);
1762 expr = force_operand (expr, var);
1763 if (expr != var)
1764 emit_move_insn (var, expr);
1765 seq = get_insns ();
1766 end_sequence ();
1767 emit_insn_before (seq, insn);
48e1416a 1768
a9989fb4 1769 if (validate_change (insn, loc, var, 0))
1770 return;
1771
1772 /* The last chance. Try recreating the assignment in insn
1773 completely from scratch. */
1774 set = single_set (insn);
1775 gcc_assert (set);
1776
1777 start_sequence ();
1778 *loc = var;
1779 src = copy_rtx (SET_SRC (set));
1780 dest = copy_rtx (SET_DEST (set));
1781 src = force_operand (src, dest);
1782 if (src != dest)
1783 emit_move_insn (dest, src);
1784 seq = get_insns ();
1785 end_sequence ();
48e1416a 1786
a9989fb4 1787 emit_insn_before (seq, insn);
1788 delete_insn (insn);
1789}
1790
a9989fb4 1791
6dce5fff 1792/* Return one expansion of the accumulator recorded in struct VE. */
a9989fb4 1793
375bb675 1794static rtx
1795get_expansion (struct var_to_expand *ve)
1796{
1797 rtx reg;
48e1416a 1798
375bb675 1799 if (ve->reuse_expansion == 0)
1800 reg = ve->reg;
1801 else
f1f41a6c 1802 reg = ve->var_expansions[ve->reuse_expansion - 1];
48e1416a 1803
f1f41a6c 1804 if (ve->var_expansions.length () == (unsigned) ve->reuse_expansion)
375bb675 1805 ve->reuse_expansion = 0;
48e1416a 1806 else
375bb675 1807 ve->reuse_expansion++;
48e1416a 1808
375bb675 1809 return reg;
1810}
a9989fb4 1811
a9989fb4 1812
48e1416a 1813/* Given INSN replace the uses of the accumulator recorded in VE
375bb675 1814 with a new register. */
1815
1816static void
222dc1d3 1817expand_var_during_unrolling (struct var_to_expand *ve, rtx_insn *insn)
375bb675 1818{
1819 rtx new_reg, set;
1820 bool really_new_expansion = false;
48e1416a 1821
375bb675 1822 set = single_set (insn);
972e95af 1823 gcc_assert (set);
48e1416a 1824
375bb675 1825 /* Generate a new register only if the expansion limit has not been
1826 reached. Else reuse an already existing expansion. */
1827 if (PARAM_VALUE (PARAM_MAX_VARIABLE_EXPANSIONS) > ve->expansion_count)
1828 {
1829 really_new_expansion = true;
1830 new_reg = gen_reg_rtx (GET_MODE (ve->reg));
1831 }
1832 else
1833 new_reg = get_expansion (ve);
1834
d022e236 1835 validate_replace_rtx_group (SET_DEST (set), new_reg, insn);
375bb675 1836 if (apply_change_group ())
1837 if (really_new_expansion)
1838 {
f1f41a6c 1839 ve->var_expansions.safe_push (new_reg);
375bb675 1840 ve->expansion_count++;
1841 }
1842}
1843
b635a1a3 1844/* Initialize the variable expansions in loop preheader. PLACE is the
1845 loop-preheader basic block where the initialization of the
1846 expansions should take place. The expansions are initialized with
1847 (-0) when the operation is plus or minus to honor sign zero. This
1848 way we can prevent cases where the sign of the final result is
1849 effected by the sign of the expansion. Here is an example to
1850 demonstrate this:
48e1416a 1851
428c97ad 1852 for (i = 0 ; i < n; i++)
1853 sum += something;
1854
1855 ==>
1856
1857 sum += something
1858 ....
1859 i = i+1;
1860 sum1 += something
1861 ....
1862 i = i+1
1863 sum2 += something;
1864 ....
48e1416a 1865
428c97ad 1866 When SUM is initialized with -zero and SOMETHING is also -zero; the
1867 final result of sum should be -zero thus the expansions sum1 and sum2
1868 should be initialized with -zero as well (otherwise we will get +zero
1869 as the final result). */
375bb675 1870
b635a1a3 1871static void
1872insert_var_expansion_initialization (struct var_to_expand *ve,
1873 basic_block place)
375bb675 1874{
222dc1d3 1875 rtx_insn *seq;
1876 rtx var, zero_init;
375bb675 1877 unsigned i;
3754d046 1878 machine_mode mode = GET_MODE (ve->reg);
428c97ad 1879 bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);
1880
f1f41a6c 1881 if (ve->var_expansions.length () == 0)
b635a1a3 1882 return;
48e1416a 1883
375bb675 1884 start_sequence ();
7497b98c 1885 switch (ve->op)
1886 {
1887 case FMA:
1888 /* Note that we only accumulate FMA via the ADD operand. */
1889 case PLUS:
1890 case MINUS:
f1f41a6c 1891 FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
7497b98c 1892 {
1893 if (honor_signed_zero_p)
1894 zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode);
1895 else
1896 zero_init = CONST0_RTX (mode);
1897 emit_move_insn (var, zero_init);
1898 }
1899 break;
1900
1901 case MULT:
f1f41a6c 1902 FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
7497b98c 1903 {
1904 zero_init = CONST1_RTX (GET_MODE (var));
1905 emit_move_insn (var, zero_init);
1906 }
1907 break;
1908
1909 default:
1910 gcc_unreachable ();
1911 }
48e1416a 1912
375bb675 1913 seq = get_insns ();
1914 end_sequence ();
48e1416a 1915
d022e236 1916 emit_insn_after (seq, BB_END (place));
375bb675 1917}
1918
b635a1a3 1919/* Combine the variable expansions at the loop exit. PLACE is the
1920 loop exit basic block where the summation of the expansions should
1921 take place. */
375bb675 1922
b635a1a3 1923static void
1924combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
375bb675 1925{
375bb675 1926 rtx sum = ve->reg;
222dc1d3 1927 rtx expr, var;
1928 rtx_insn *seq, *insn;
375bb675 1929 unsigned i;
1930
f1f41a6c 1931 if (ve->var_expansions.length () == 0)
b635a1a3 1932 return;
48e1416a 1933
cbebc3ae 1934 /* ve->reg might be SUBREG or some other non-shareable RTL, and we use
1935 it both here and as the destination of the assignment. */
1936 sum = copy_rtx (sum);
375bb675 1937 start_sequence ();
7497b98c 1938 switch (ve->op)
1939 {
1940 case FMA:
1941 /* Note that we only accumulate FMA via the ADD operand. */
1942 case PLUS:
1943 case MINUS:
f1f41a6c 1944 FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
7497b98c 1945 sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), var, sum);
1946 break;
1947
1948 case MULT:
f1f41a6c 1949 FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
7497b98c 1950 sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), var, sum);
1951 break;
1952
1953 default:
1954 gcc_unreachable ();
1955 }
48e1416a 1956
375bb675 1957 expr = force_operand (sum, ve->reg);
1958 if (expr != ve->reg)
1959 emit_move_insn (ve->reg, expr);
1960 seq = get_insns ();
1961 end_sequence ();
48e1416a 1962
375bb675 1963 insn = BB_HEAD (place);
1964 while (!NOTE_INSN_BASIC_BLOCK_P (insn))
1965 insn = NEXT_INSN (insn);
1966
1967 emit_insn_after (seq, insn);
375bb675 1968}
1969
ef770610 1970/* Strip away REG_EQUAL notes for IVs we're splitting.
1971
1972 Updating REG_EQUAL notes for IVs we split is tricky: We
1973 cannot tell until after unrolling, DF-rescanning, and liveness
1974 updating, whether an EQ_USE is reached by the split IV while
1975 the IV reg is still live. See PR55006.
1976
1977 ??? We cannot use remove_reg_equal_equiv_notes_for_regno,
1978 because RTL loop-iv requires us to defer rescanning insns and
1979 any notes attached to them. So resort to old techniques... */
1980
1981static void
222dc1d3 1982maybe_strip_eq_note_for_split_iv (struct opt_info *opt_info, rtx_insn *insn)
ef770610 1983{
1984 struct iv_to_split *ivts;
1985 rtx note = find_reg_equal_equiv_note (insn);
1986 if (! note)
1987 return;
1988 for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
1989 if (reg_mentioned_p (ivts->orig_var, note))
1990 {
1991 remove_note (insn, note);
1992 return;
1993 }
1994}
1995
48e1416a 1996/* Apply loop optimizations in loop copies using the
1997 data which gathered during the unrolling. Structure
375bb675 1998 OPT_INFO record that data.
48e1416a 1999
a9989fb4 2000 UNROLLING is true if we unrolled (not peeled) the loop.
2001 REWRITE_ORIGINAL_BODY is true if we should also rewrite the original body of
2002 the loop (as it should happen in complete unrolling, but not in ordinary
2003 peeling of the loop). */
2004
2005static void
48e1416a 2006apply_opt_in_copies (struct opt_info *opt_info,
2007 unsigned n_copies, bool unrolling,
375bb675 2008 bool rewrite_original_loop)
a9989fb4 2009{
2010 unsigned i, delta;
2011 basic_block bb, orig_bb;
222dc1d3 2012 rtx_insn *insn, *orig_insn, *next;
a9989fb4 2013 struct iv_to_split ivts_templ, *ivts;
375bb675 2014 struct var_to_expand ve_templ, *ves;
48e1416a 2015
a9989fb4 2016 /* Sanity check -- we need to put initialization in the original loop
2017 body. */
2018 gcc_assert (!unrolling || rewrite_original_loop);
48e1416a 2019
a9989fb4 2020 /* Allocate the basic variables (i0). */
c1f445d2 2021 if (opt_info->insns_to_split)
b635a1a3 2022 for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
2023 allocate_basic_variable (ivts);
48e1416a 2024
fe672ac0 2025 for (i = opt_info->first_new_block;
2026 i < (unsigned) last_basic_block_for_fn (cfun);
2027 i++)
a9989fb4 2028 {
f5a6b05f 2029 bb = BASIC_BLOCK_FOR_FN (cfun, i);
01020a5f 2030 orig_bb = get_bb_original (bb);
48e1416a 2031
01020a5f 2032 /* bb->aux holds position in copy sequence initialized by
2033 duplicate_loop_to_header_edge. */
2034 delta = determine_split_iv_delta ((size_t)bb->aux, n_copies,
a9989fb4 2035 unrolling);
75cc36a4 2036 bb->aux = 0;
a9989fb4 2037 orig_insn = BB_HEAD (orig_bb);
ef770610 2038 FOR_BB_INSNS_SAFE (bb, insn, next)
375bb675 2039 {
0e6893c5 2040 if (!INSN_P (insn)
c64f38bf 2041 || (DEBUG_BIND_INSN_P (insn)
2042 && INSN_VAR_LOCATION_DECL (insn)
0e6893c5 2043 && TREE_CODE (INSN_VAR_LOCATION_DECL (insn)) == LABEL_DECL))
375bb675 2044 continue;
48e1416a 2045
0e6893c5 2046 while (!INSN_P (orig_insn)
c64f38bf 2047 || (DEBUG_BIND_INSN_P (orig_insn)
2048 && INSN_VAR_LOCATION_DECL (orig_insn)
0e6893c5 2049 && (TREE_CODE (INSN_VAR_LOCATION_DECL (orig_insn))
2050 == LABEL_DECL)))
375bb675 2051 orig_insn = NEXT_INSN (orig_insn);
48e1416a 2052
375bb675 2053 ivts_templ.insn = orig_insn;
2054 ve_templ.insn = orig_insn;
48e1416a 2055
375bb675 2056 /* Apply splitting iv optimization. */
c1f445d2 2057 if (opt_info->insns_to_split)
375bb675 2058 {
ef770610 2059 maybe_strip_eq_note_for_split_iv (opt_info, insn);
2060
c1f445d2 2061 ivts = opt_info->insns_to_split->find (&ivts_templ);
48e1416a 2062
375bb675 2063 if (ivts)
2064 {
19ef3705 2065 gcc_assert (GET_CODE (PATTERN (insn))
2066 == GET_CODE (PATTERN (orig_insn)));
48e1416a 2067
375bb675 2068 if (!delta)
2069 insert_base_initialization (ivts, insn);
2070 split_iv (ivts, insn, delta);
2071 }
2072 }
2073 /* Apply variable expansion optimization. */
c1f445d2 2074 if (unrolling && opt_info->insns_with_var_to_expand)
375bb675 2075 {
4077bf7a 2076 ves = (struct var_to_expand *)
c1f445d2 2077 opt_info->insns_with_var_to_expand->find (&ve_templ);
375bb675 2078 if (ves)
48e1416a 2079 {
19ef3705 2080 gcc_assert (GET_CODE (PATTERN (insn))
2081 == GET_CODE (PATTERN (orig_insn)));
375bb675 2082 expand_var_during_unrolling (ves, insn);
2083 }
2084 }
2085 orig_insn = NEXT_INSN (orig_insn);
2086 }
a9989fb4 2087 }
2088
2089 if (!rewrite_original_loop)
2090 return;
48e1416a 2091
375bb675 2092 /* Initialize the variable expansions in the loop preheader
48e1416a 2093 and take care of combining them at the loop exit. */
c1f445d2 2094 if (opt_info->insns_with_var_to_expand)
375bb675 2095 {
b635a1a3 2096 for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2097 insert_var_expansion_initialization (ves, opt_info->loop_preheader);
2098 for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2099 combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
375bb675 2100 }
48e1416a 2101
a9989fb4 2102 /* Rewrite also the original loop body. Find them as originals of the blocks
2103 in the last copied iteration, i.e. those that have
01020a5f 2104 get_bb_copy (get_bb_original (bb)) == bb. */
fe672ac0 2105 for (i = opt_info->first_new_block;
2106 i < (unsigned) last_basic_block_for_fn (cfun);
2107 i++)
a9989fb4 2108 {
f5a6b05f 2109 bb = BASIC_BLOCK_FOR_FN (cfun, i);
01020a5f 2110 orig_bb = get_bb_original (bb);
2111 if (get_bb_copy (orig_bb) != bb)
a9989fb4 2112 continue;
48e1416a 2113
a9989fb4 2114 delta = determine_split_iv_delta (0, n_copies, unrolling);
2115 for (orig_insn = BB_HEAD (orig_bb);
375bb675 2116 orig_insn != NEXT_INSN (BB_END (bb));
2117 orig_insn = next)
2118 {
2119 next = NEXT_INSN (orig_insn);
48e1416a 2120
375bb675 2121 if (!INSN_P (orig_insn))
2122 continue;
48e1416a 2123
375bb675 2124 ivts_templ.insn = orig_insn;
c1f445d2 2125 if (opt_info->insns_to_split)
375bb675 2126 {
ef770610 2127 maybe_strip_eq_note_for_split_iv (opt_info, orig_insn);
2128
4077bf7a 2129 ivts = (struct iv_to_split *)
c1f445d2 2130 opt_info->insns_to_split->find (&ivts_templ);
375bb675 2131 if (ivts)
2132 {
2133 if (!delta)
2134 insert_base_initialization (ivts, orig_insn);
2135 split_iv (ivts, orig_insn, delta);
2136 continue;
2137 }
2138 }
48e1416a 2139
375bb675 2140 }
2141 }
2142}
a9989fb4 2143
375bb675 2144/* Release OPT_INFO. */
a9989fb4 2145
2146static void
375bb675 2147free_opt_info (struct opt_info *opt_info)
a9989fb4 2148{
c1f445d2 2149 delete opt_info->insns_to_split;
2150 opt_info->insns_to_split = NULL;
2151 if (opt_info->insns_with_var_to_expand)
375bb675 2152 {
b635a1a3 2153 struct var_to_expand *ves;
2154
2155 for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
f1f41a6c 2156 ves->var_expansions.release ();
c1f445d2 2157 delete opt_info->insns_with_var_to_expand;
2158 opt_info->insns_with_var_to_expand = NULL;
375bb675 2159 }
2160 free (opt_info);
a9989fb4 2161}