/* Loop unrolling.
   Copyright (C) 2002-2015 Free Software Foundation, Inc.

This file is part of GCC.

GCC is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3, or (at your option) any later
version.

GCC is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
for more details.

You should have received a copy of the GNU General Public License
along with GCC; see the file COPYING3.  If not see
<http://www.gnu.org/licenses/>.  */
#include "config.h"
#include "system.h"
#include "coretypes.h"
/* This pass performs loop unrolling.  We only perform this
   optimization on innermost loops (with a single exception) because
   the impact on performance is greatest here, and we want to avoid
   unnecessary code size growth.  The gain comes from greater sequentiality
   of code, better opportunities for further optimization passes and,
   in some cases, fewer tests of exit conditions.  The main problem is
   code growth, which negatively impacts performance through cache effects.

   What we do:

   -- unrolling of loops that roll a constant number of times; this is
      almost always a win, as we get rid of the exit condition tests.
   -- unrolling of loops that roll a number of times that we can compute
      at runtime; we also get rid of exit condition tests here, but there
      is the extra expense of calculating the number of iterations
   -- simple unrolling of remaining loops; this is performed only if we
      are asked to, as the gain is questionable in this case and often
      it may even slow down the code

   For a more detailed description of each of those, see the comments at
   the appropriate functions below.

   There are many parameters (defined and described in params.def) that
   control how much we unroll.

   ??? A big problem is that we do not have a good way to determine how
   many times we should unroll a loop; the experiments I have made
   showed that this choice may affect performance on the order of
   several percent.  */
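/* For illustration (an example added here, not part of the original
   comments): a loop known to roll 8 times, unrolled 4x by the
   constant-iterations variant below, keeps a single exit test per pass
   over the 4 replicated bodies, so the 8 iterations execute only 2
   exit tests instead of 8.  */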
/* Information about induction variables to split.  */

struct iv_to_split
{
  rtx_insn *insn;	/* The insn in which the induction variable occurs.  */
  rtx orig_var;		/* The variable (register) for the IV before split.  */
  rtx base_var;		/* The variable on which the values in the further
			   iterations are based.  */
  rtx step;		/* Step of the induction variable.  */
  struct iv_to_split *next; /* Next entry in walking order.  */
};
/* Information about accumulators to expand.  */

struct var_to_expand
{
  rtx_insn *insn;	 /* The insn in which the variable expansion occurs.  */
  rtx reg;		 /* The accumulator which is expanded.  */
  vec<rtx> var_expansions; /* The copies of the accumulator which is expanded.  */
  struct var_to_expand *next; /* Next entry in walking order.  */
  enum rtx_code op;	 /* The type of the accumulation - addition, subtraction
			    or multiplication.  */
  int expansion_count;	 /* Count the number of expansions generated so far.  */
  int reuse_expansion;	 /* The expansion we intend to reuse to expand
			    the accumulator.  If REUSE_EXPANSION is 0 reuse
			    the original accumulator.  Else use
			    var_expansions[REUSE_EXPANSION - 1].  */
};
/* Hashtable helper for iv_to_split.  */

struct iv_split_hasher : free_ptr_hash <iv_to_split>
{
  static inline hashval_t hash (const iv_to_split *);
  static inline bool equal (const iv_to_split *, const iv_to_split *);
};
/* A hash function for information about insns to split.  */

inline hashval_t
iv_split_hasher::hash (const iv_to_split *ivts)
{
  return (hashval_t) INSN_UID (ivts->insn);
}
/* An equality function for information about insns to split.  */

inline bool
iv_split_hasher::equal (const iv_to_split *i1, const iv_to_split *i2)
{
  return i1->insn == i2->insn;
}
/* Hashtable helper for var_to_expand.  */

struct var_expand_hasher : free_ptr_hash <var_to_expand>
{
  static inline hashval_t hash (const var_to_expand *);
  static inline bool equal (const var_to_expand *, const var_to_expand *);
};
/* Return a hash for VES.  */

inline hashval_t
var_expand_hasher::hash (const var_to_expand *ves)
{
  return (hashval_t) INSN_UID (ves->insn);
}
/* Return true if I1 and I2 refer to the same instruction.  */

inline bool
var_expand_hasher::equal (const var_to_expand *i1, const var_to_expand *i2)
{
  return i1->insn == i2->insn;
}
/* Information about optimization applied in
   the unrolled loop.  */

struct opt_info
{
  hash_table<iv_split_hasher> *insns_to_split; /* A hashtable of insns to
						  split.  */
  struct iv_to_split *iv_to_split_head; /* The first iv to split.  */
  struct iv_to_split **iv_to_split_tail; /* Pointer to the tail of the list.  */
  hash_table<var_expand_hasher> *insns_with_var_to_expand; /* A hashtable of
					insns with accumulators to expand.  */
  struct var_to_expand *var_to_expand_head; /* The first var to expand.  */
  struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the list.  */
  unsigned first_new_block;	/* The first basic block that was
				   duplicated.  */
  basic_block loop_exit;	/* The loop exit basic block.  */
  basic_block loop_preheader;	/* The loop preheader basic block.  */
};
static void decide_unroll_stupid (struct loop *, int);
static void decide_unroll_constant_iterations (struct loop *, int);
static void decide_unroll_runtime_iterations (struct loop *, int);
static void unroll_loop_stupid (struct loop *);
static void decide_unrolling (int);
static void unroll_loop_constant_iterations (struct loop *);
static void unroll_loop_runtime_iterations (struct loop *);
static struct opt_info *analyze_insns_in_loop (struct loop *);
static void opt_info_start_duplication (struct opt_info *);
static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
static void free_opt_info (struct opt_info *);
static struct var_to_expand *analyze_insn_to_expand_var (struct loop *, rtx_insn *);
static bool referenced_in_one_insn_in_loop_p (struct loop *, rtx, int *);
static struct iv_to_split *analyze_iv_to_split_insn (rtx_insn *);
static void expand_var_during_unrolling (struct var_to_expand *, rtx_insn *);
static void insert_var_expansion_initialization (struct var_to_expand *,
						 basic_block);
static void combine_var_copies_in_loop_exit (struct var_to_expand *,
					     basic_block);
static rtx get_expansion (struct var_to_expand *);
/* Emit a message summarizing the unroll that will be
   performed for LOOP, along with the loop's location LOCUS, if
   appropriate given the dump or -fopt-info settings.  */

static void
report_unroll (struct loop *loop, location_t locus)
{
  int report_flags = MSG_OPTIMIZED_LOCATIONS | TDF_RTL | TDF_DETAILS;

  if (loop->lpt_decision.decision == LPT_NONE)
    return;

  if (!dump_enabled_p ())
    return;

  dump_printf_loc (report_flags, locus,
                   "loop unrolled %d times",
                   loop->lpt_decision.times);
  if (profile_info)
    dump_printf (report_flags,
                 " (header execution count %d)",
                 (int)loop->header->count);

  dump_printf (report_flags, "\n");
}
/* Decide whether to unroll loops and how much.  */
static void
decide_unrolling (int flags)
{
  struct loop *loop;

  /* Scan the loops, inner ones first.  */
  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
    {
      loop->lpt_decision.decision = LPT_NONE;
      location_t locus = get_loop_location (loop);

      if (dump_enabled_p ())
	dump_printf_loc (TDF_RTL, locus,
			 ";; *** Considering loop %d at BB %d for "
			 "unrolling ***\n",
			 loop->num, loop->header->index);

      /* Do not peel cold areas.  */
      if (optimize_loop_for_size_p (loop))
	{
	  if (dump_file)
	    fprintf (dump_file, ";; Not considering loop, cold area\n");
	  continue;
	}

      /* Can the loop be manipulated?  */
      if (!can_duplicate_loop_p (loop))
	{
	  if (dump_file)
	    fprintf (dump_file,
		     ";; Not considering loop, cannot duplicate\n");
	  continue;
	}

      /* Skip non-innermost loops.  */
      if (loop->inner)
	{
	  if (dump_file)
	    fprintf (dump_file, ";; Not considering loop, is not innermost\n");
	  continue;
	}

      loop->ninsns = num_loop_insns (loop);
      loop->av_ninsns = average_num_loop_insns (loop);

      /* Try transformations one by one in decreasing order of
	 priority.  */
      decide_unroll_constant_iterations (loop, flags);
      if (loop->lpt_decision.decision == LPT_NONE)
	decide_unroll_runtime_iterations (loop, flags);
      if (loop->lpt_decision.decision == LPT_NONE)
	decide_unroll_stupid (loop, flags);

      report_unroll (loop, locus);
    }
}
/* Unroll LOOPs.  */
void
unroll_loops (int flags)
{
  struct loop *loop;
  bool changed = false;

  /* Now decide rest of unrolling.  */
  decide_unrolling (flags);

  /* Scan the loops, inner ones first.  */
  FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
    {
      /* And perform the appropriate transformations.  */
      switch (loop->lpt_decision.decision)
	{
	case LPT_UNROLL_CONSTANT:
	  unroll_loop_constant_iterations (loop);
	  changed = true;
	  break;
	case LPT_UNROLL_RUNTIME:
	  unroll_loop_runtime_iterations (loop);
	  changed = true;
	  break;
	case LPT_UNROLL_STUPID:
	  unroll_loop_stupid (loop);
	  changed = true;
	  break;
	case LPT_NONE:
	  break;
	default:
	  gcc_unreachable ();
	}
    }

  if (changed)
    {
      calculate_dominance_info (CDI_DOMINATORS);
      fix_loop_structure (NULL);
    }
}
/* Check whether exit of the LOOP is at the end of loop body.  */

static bool
loop_exit_at_end_p (struct loop *loop)
{
  struct niter_desc *desc = get_simple_loop_desc (loop);
  rtx_insn *insn;

  /* We should never have a conditional in the latch block.  */
  gcc_assert (desc->in_edge->dest != loop->header);

  if (desc->in_edge->dest != loop->latch)
    return false;

  /* Check that the latch is empty.  */
  FOR_BB_INSNS (loop->latch, insn)
    if (INSN_P (insn) && active_insn_p (insn))
      return false;

  return true;
}
/* Decide whether to unroll LOOP iterating a constant number of times
   and how much.  */

static void
decide_unroll_constant_iterations (struct loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
  struct niter_desc *desc;
  widest_int iterations;

  if (!(flags & UAP_UNROLL))
    {
      /* We were not asked to, just return back silently.  */
      return;
    }

  if (dump_file)
    fprintf (dump_file,
	     "\n;; Considering unrolling loop with constant "
	     "number of iterations\n");

  /* nunroll = total number of copies of the original loop body in
     unrolled loop (i.e. if it is 2, we have to duplicate loop body once).  */
  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
  nunroll_by_av
    = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
    nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);

  if (targetm.loop_unroll_adjust)
    nunroll = targetm.loop_unroll_adjust (nunroll, loop);

  /* Skip big loops.  */
  if (nunroll <= 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check number of iterations.  */
  if (!desc->simple_p || !desc->const_iter || desc->assumptions)
    {
      if (dump_file)
	fprintf (dump_file,
		 ";; Unable to prove that the loop iterates constant times\n");
      return;
    }

  /* Check whether the loop rolls enough to consider.
     Consult also loop bounds and profile; in the case the loop has more
     than one exit it may well loop less than the determined maximal number
     of iterations.  */
  if (desc->niter < 2 * nunroll
      || ((get_estimated_loop_iterations (loop, &iterations)
	   || get_max_loop_iterations (loop, &iterations))
	  && wi::ltu_p (iterations, 2 * nunroll)))
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
      return;
    }

  /* Success; now compute the number of iterations to unroll.  We alter
     nunroll so that as few as possible copies of the loop body are
     necessary, while still not decreasing the number of unrollings
     too much (at most by 1).  */
  best_copies = 2 * nunroll + 10;

  i = 2 * nunroll + 2;
  if (i - 1 >= desc->niter)
    i = desc->niter - 2;

  for (; i >= nunroll - 1; i--)
    {
      unsigned exit_mod = desc->niter % (i + 1);

      if (!loop_exit_at_end_p (loop))
	n_copies = exit_mod + i + 1;
      else if (exit_mod != (unsigned) i
	       || desc->noloop_assumptions != NULL_RTX)
	n_copies = exit_mod + i + 2;
      else
	n_copies = i + 1;

      if (n_copies < best_copies)
	{
	  best_copies = n_copies;
	  best_unroll = i;
	}
    }

  loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
  loop->lpt_decision.times = best_unroll;
}
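/* Worked example (illustrative, not from the original comments): with
   nunroll == 4 and desc->niter == 102 the search above scans unroll
   factors i + 1 from 11 down to 4.  For i + 1 == 4, exit_mod is
   102 % 4 == 2, so with the exit test at the beginning of the body we
   need exit_mod + i + 1 == 6 body copies; a factor that divides the
   iteration count more evenly needs fewer peeled copies and is
   preferred via best_copies/best_unroll.  */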
/* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES times.
   The transformation does this:

   for (i = 0; i < 102; i++)
     body;

   ==>  (LOOP->LPT_DECISION.TIMES == 3)

   i = 0;
   body; i++;
   body; i++;
   while (i < 102)
     {
       body; i++;
       body; i++;
       body; i++;
       body; i++;
     }
  */
static void
unroll_loop_constant_iterations (struct loop *loop)
{
  unsigned HOST_WIDE_INT niter;
  unsigned exit_mod;
  sbitmap wont_exit;
  unsigned i;
  edge e;
  unsigned max_unroll = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  bool exit_at_end = loop_exit_at_end_p (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  niter = desc->niter;

  /* Should not get here (such loop should be peeled instead).  */
  gcc_assert (niter > max_unroll + 1);

  exit_mod = niter % (max_unroll + 1);

  wont_exit = sbitmap_alloc (max_unroll + 1);
  bitmap_ones (wont_exit);

  auto_vec<edge> remove_edges;
  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  if (!exit_at_end)
    {
      /* The exit is not at the end of the loop; leave the exit test
	 in the first copy, so that loops that start with a test
	 of the exit condition have a contiguous body after unrolling.  */

      if (dump_file)
	fprintf (dump_file, ";; Condition at beginning of loop.\n");

      /* Peel exit_mod iterations.  */
      bitmap_clear_bit (wont_exit, 0);
      if (desc->noloop_assumptions)
	bitmap_clear_bit (wont_exit, 1);

      if (exit_mod)
	{
	  opt_info_start_duplication (opt_info);
	  ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
					      exit_mod,
					      wont_exit, desc->out_edge,
					      &remove_edges,
					      DLTHE_FLAG_UPDATE_FREQ
					      | (opt_info && exit_mod > 1
						 ? DLTHE_RECORD_COPY_NUMBER
						 : 0));
	  gcc_assert (ok);

	  if (opt_info && exit_mod > 1)
	    apply_opt_in_copies (opt_info, exit_mod, false, false);

	  desc->noloop_assumptions = NULL_RTX;
	  desc->niter -= exit_mod;
	  loop->nb_iterations_upper_bound -= exit_mod;
	  if (loop->any_estimate
	      && wi::leu_p (exit_mod, loop->nb_iterations_estimate))
	    loop->nb_iterations_estimate -= exit_mod;
	  else
	    loop->any_estimate = false;
	}

      bitmap_set_bit (wont_exit, 1);
    }
  else
    {
      /* Leave the exit test in the last copy, for the same reason as above, if
	 the loop tests the condition at the end of the loop body.  */

      if (dump_file)
	fprintf (dump_file, ";; Condition at end of loop.\n");

      /* We know that niter >= max_unroll + 2; so we do not need to care about
	 the case when we would exit before reaching the loop.  So just peel
	 exit_mod + 1 iterations.  */
      if (exit_mod != max_unroll
	  || desc->noloop_assumptions)
	{
	  bitmap_clear_bit (wont_exit, 0);
	  if (desc->noloop_assumptions)
	    bitmap_clear_bit (wont_exit, 1);

	  opt_info_start_duplication (opt_info);
	  ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
					      exit_mod + 1,
					      wont_exit, desc->out_edge,
					      &remove_edges,
					      DLTHE_FLAG_UPDATE_FREQ
					      | (opt_info && exit_mod > 0
						 ? DLTHE_RECORD_COPY_NUMBER
						 : 0));
	  gcc_assert (ok);

	  if (opt_info && exit_mod > 0)
	    apply_opt_in_copies (opt_info, exit_mod + 1, false, false);

	  desc->niter -= exit_mod + 1;
	  loop->nb_iterations_upper_bound -= exit_mod + 1;
	  if (loop->any_estimate
	      && wi::leu_p (exit_mod + 1, loop->nb_iterations_estimate))
	    loop->nb_iterations_estimate -= exit_mod + 1;
	  else
	    loop->any_estimate = false;
	  desc->noloop_assumptions = NULL_RTX;

	  bitmap_set_bit (wont_exit, 0);
	  bitmap_set_bit (wont_exit, 1);
	}

      bitmap_clear_bit (wont_exit, max_unroll);
    }

  /* Now unroll the loop.  */

  opt_info_start_duplication (opt_info);
  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
				      max_unroll,
				      wont_exit, desc->out_edge,
				      &remove_edges,
				      DLTHE_FLAG_UPDATE_FREQ
				      | (opt_info
					 ? DLTHE_RECORD_COPY_NUMBER
					 : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, max_unroll, true, true);
      free_opt_info (opt_info);
    }

  free (wont_exit);

  if (exit_at_end)
    {
      basic_block exit_block = get_bb_copy (desc->in_edge->src);
      /* Find a new in and out edge; they are in the last copy we have made.  */

      if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 0);
	  desc->in_edge = EDGE_SUCC (exit_block, 1);
	}
      else
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 1);
	  desc->in_edge = EDGE_SUCC (exit_block, 0);
	}
    }

  desc->niter /= max_unroll + 1;
  loop->nb_iterations_upper_bound
    = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1);
  if (loop->any_estimate)
    loop->nb_iterations_estimate
      = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1);
  desc->niter_expr = GEN_INT (desc->niter);

  /* Remove the edges.  */
  FOR_EACH_VEC_ELT (remove_edges, i, e)
    remove_path (e);

  if (dump_file)
    fprintf (dump_file,
	     ";; Unrolled loop %d times, constant # of iterations %i insns\n",
	     max_unroll, num_loop_insns (loop));
}
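/* Numeric check (illustrative, not from the original comments): for
   desc->niter == 102 and max_unroll == 3 (factor 4), exit_mod == 2:
   two iterations are peeled off in front, desc->niter drops to 100,
   and after the 4x duplication the remaining loop rolls
   100 / 4 == 25 times, which is what desc->niter /= max_unroll + 1
   records.  */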
/* Decide whether to unroll LOOP iterating a runtime computable number of times
   and how much.  */
static void
decide_unroll_runtime_iterations (struct loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, i;
  struct niter_desc *desc;
  widest_int iterations;

  if (!(flags & UAP_UNROLL))
    {
      /* We were not asked to, just return back silently.  */
      return;
    }

  if (dump_file)
    fprintf (dump_file,
	     "\n;; Considering unrolling loop with runtime "
	     "computable number of iterations\n");

  /* nunroll = total number of copies of the original loop body in
     unrolled loop (i.e. if it is 2, we have to duplicate loop body once).  */
  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
  nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
    nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);

  if (targetm.loop_unroll_adjust)
    nunroll = targetm.loop_unroll_adjust (nunroll, loop);

  /* Skip big loops.  */
  if (nunroll <= 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check simpleness.  */
  if (!desc->simple_p || desc->assumptions)
    {
      if (dump_file)
	fprintf (dump_file,
		 ";; Unable to prove that the number of iterations "
		 "can be counted in runtime\n");
      return;
    }

  if (desc->const_iter)
    {
      if (dump_file)
	fprintf (dump_file, ";; Loop iterates constant times\n");
      return;
    }

  /* Check whether the loop rolls.  */
  if ((get_estimated_loop_iterations (loop, &iterations)
       || get_max_loop_iterations (loop, &iterations))
      && wi::ltu_p (iterations, 2 * nunroll))
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
      return;
    }

  /* Success; now force nunroll to be a power of 2, as we are unable to
     cope with overflows in the computation of the number of iterations.  */
  for (i = 1; 2 * i <= nunroll; i *= 2)
    continue;

  loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
  loop->lpt_decision.times = i - 1;
}
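/* For example (illustrative, not from the original comments): if the
   size limits yield nunroll == 7, the loop above stops with i == 4, so
   the unroll factor is rounded down to a power of two and
   lpt_decision.times becomes 3.  */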
/* Splits edge E and inserts the sequence of instructions INSNS on it, and
   returns the newly created block.  If INSNS is NULL_RTX, nothing is changed
   and NULL is returned instead.  */

basic_block
split_edge_and_insert (edge e, rtx_insn *insns)
{
  basic_block bb;

  if (!insns)
    return NULL;
  bb = split_edge (e);
  emit_insn_after (insns, BB_END (bb));

  /* ??? We used to assume that INSNS can contain control flow insns, and
     that we had to try to find sub basic blocks in BB to maintain a valid
     CFG.  For this purpose we used to set the BB_SUPERBLOCK flag on BB
     and call break_superblocks when going out of cfglayout mode.  But it
     turns out that this never happens; and that if it does ever happen,
     the verify_flow_info at the end of the RTL loop passes would fail.

     There are two reasons why we expected we could have control flow insns
     in INSNS.  The first is when a comparison has to be done in parts, and
     the second is when the number of iterations is computed for loops with
     the number of iterations known at runtime.  In both cases, test cases
     to get control flow in INSNS appear to be impossible to construct:

     * If do_compare_rtx_and_jump needs several branches to do a comparison
       in a mode that needs comparison by parts, we cannot analyze the
       number of iterations of the loop, and we never get to unrolling it.

     * The code in expand_divmod that was suspected to cause creation of
       branching code seems to be only accessed for signed division.  The
       divisions used by the # of iterations analysis are always unsigned.
       Problems might arise on architectures that emit branching code
       for some operations that may appear in the unroller (especially
       for division), but we have no such architectures.

     Considering all this, it was decided that we should for now assume
     that INSNS can in theory contain control flow insns, but in practice
     it never does.  So we don't handle the theoretical case, and should
     a real failure ever show up, we have a pretty good clue for how to
     fix it.  */

  return bb;
}
/* Prepare a sequence comparing OP0 with OP1 using COMP and jumping to LABEL if
   true, with probability PROB.  If CINSN is not NULL, it is the insn to copy
   in order to create a jump.  */

static rtx_insn *
compare_and_jump_seq (rtx op0, rtx op1, enum rtx_code comp,
		      rtx_code_label *label, int prob, rtx_insn *cinsn)
{
  rtx_insn *seq;
  rtx_jump_insn *jump;
  rtx cond;
  machine_mode mode;

  mode = GET_MODE (op0);
  if (mode == VOIDmode)
    mode = GET_MODE (op1);

  start_sequence ();
  if (GET_MODE_CLASS (mode) == MODE_CC)
    {
      /* A hack -- there seems to be no easy generic way to make a
	 conditional jump from a ccmode comparison.  */
      gcc_assert (cinsn);
      cond = XEXP (SET_SRC (pc_set (cinsn)), 0);
      gcc_assert (GET_CODE (cond) == comp);
      gcc_assert (rtx_equal_p (op0, XEXP (cond, 0)));
      gcc_assert (rtx_equal_p (op1, XEXP (cond, 1)));
      emit_jump_insn (copy_insn (PATTERN (cinsn)));
      jump = as_a <rtx_jump_insn *> (get_last_insn ());
      JUMP_LABEL (jump) = JUMP_LABEL (cinsn);
      LABEL_NUSES (JUMP_LABEL (jump))++;
      redirect_jump (jump, label, 0);
    }
  else
    {
      gcc_assert (!cinsn);

      op0 = force_operand (op0, NULL_RTX);
      op1 = force_operand (op1, NULL_RTX);
      do_compare_rtx_and_jump (op0, op1, comp, 0,
			       mode, NULL_RTX, NULL, label, -1);
      jump = as_a <rtx_jump_insn *> (get_last_insn ());
      jump->set_jump_target (label);
      LABEL_NUSES (label)++;
    }
  add_int_reg_note (jump, REG_BR_PROB, prob);

  seq = get_insns ();
  end_sequence ();

  return seq;
}
/* Unroll LOOP for which we are able to count the number of iterations at
   runtime LOOP->LPT_DECISION.TIMES times.  The transformation does this
   (with some extra care for the case n < 0):

   for (i = 0; i < n; i++)
     body;

   ==>  (LOOP->LPT_DECISION.TIMES == 3)

   i = 0;
   mod = n % 4;

   switch (mod)
     {
       case 3:
         body; i++;
       case 2:
         body; i++;
       case 1:
         body; i++;
       case 0: ;
     }

   while (i < n)
     {
       body; i++;
       body; i++;
       body; i++;
       body; i++;
     }
   */
static void
unroll_loop_runtime_iterations (struct loop *loop)
{
  rtx old_niter, niter, tmp;
  rtx_insn *init_code, *branch_code;
  unsigned i, j, p;
  basic_block preheader, *body, swtch, ezc_swtch;
  sbitmap wont_exit;
  int may_exit_copy;
  unsigned n_peel;
  edge e;
  bool extra_zero_check, last_may_exit;
  unsigned max_unroll = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  bool exit_at_end = loop_exit_at_end_p (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  /* Remember blocks whose dominators will have to be updated.  */
  auto_vec<basic_block> dom_bbs;

  body = get_loop_body (loop);
  for (i = 0; i < loop->num_nodes; i++)
    {
      vec<basic_block> ldom;
      basic_block bb;

      ldom = get_dominated_by (CDI_DOMINATORS, body[i]);
      FOR_EACH_VEC_ELT (ldom, j, bb)
	if (!flow_bb_inside_loop_p (loop, bb))
	  dom_bbs.safe_push (bb);

      ldom.release ();
    }
  free (body);

  if (!exit_at_end)
    {
      /* Leave exit in first copy (for explanation why see comment in
	 unroll_loop_constant_iterations).  */
      may_exit_copy = 0;
      n_peel = max_unroll - 1;
      extra_zero_check = true;
      last_may_exit = false;
    }
  else
    {
      /* Leave exit in last copy (for explanation why see comment in
	 unroll_loop_constant_iterations).  */
      may_exit_copy = max_unroll;
      n_peel = max_unroll;
      extra_zero_check = false;
      last_may_exit = true;
    }

  /* Get expression for the number of iterations.  */
  start_sequence ();
  old_niter = niter = gen_reg_rtx (desc->mode);
  tmp = force_operand (copy_rtx (desc->niter_expr), niter);
  if (tmp != niter)
    emit_move_insn (niter, tmp);

  /* Count modulo by ANDing it with max_unroll; we use the fact that
     the number of unrollings is a power of two, and thus this is correct
     even if there is overflow in the computation.  */
  niter = expand_simple_binop (desc->mode, AND,
			       niter, gen_int_mode (max_unroll, desc->mode),
			       NULL_RTX, 0, OPTAB_LIB_WIDEN);
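  /* For example (illustrative, not from the original comments): with
     max_unroll == 3 the unroll factor is 4, and "n % 4" is computed as
     "n & 3".  Because the factor is a power of two, the low bits are
     correct even if the computation of the iteration count wrapped
     around in DESC->mode.  */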
  init_code = get_insns ();
  end_sequence ();
  unshare_all_rtl_in_chain (init_code);

  /* Precondition the loop.  */
  split_edge_and_insert (loop_preheader_edge (loop), init_code);

  auto_vec<edge> remove_edges;

  wont_exit = sbitmap_alloc (max_unroll + 2);

  /* Peel the first copy of loop body (almost always we must leave exit test
     here; the only exception is when we have extra zero check and the number
     of iterations is reliable.  Also record the place of (possible) extra
     zero check.  */
  bitmap_clear (wont_exit);
  if (extra_zero_check
      && !desc->noloop_assumptions)
    bitmap_set_bit (wont_exit, 1);
  ezc_swtch = loop_preheader_edge (loop)->src;
  ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
				      1, wont_exit, desc->out_edge,
				      &remove_edges,
				      DLTHE_FLAG_UPDATE_FREQ);
  gcc_assert (ok);

  /* Record the place where switch will be built for preconditioning.  */
  swtch = split_edge (loop_preheader_edge (loop));

  for (i = 0; i < n_peel; i++)
    {
      /* Peel the copy.  */
      bitmap_clear (wont_exit);
      if (i != n_peel - 1 || !last_may_exit)
	bitmap_set_bit (wont_exit, 1);
      ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
					  1, wont_exit, desc->out_edge,
					  &remove_edges,
					  DLTHE_FLAG_UPDATE_FREQ);
      gcc_assert (ok);

      /* Create item for switch.  */
      j = n_peel - i - (extra_zero_check ? 0 : 1);
      p = REG_BR_PROB_BASE / (i + 2);

      preheader = split_edge (loop_preheader_edge (loop));
      branch_code = compare_and_jump_seq (copy_rtx (niter), GEN_INT (j), EQ,
					  block_label (preheader), p,
					  NULL);

      /* We rely on the fact that the compare and jump cannot be optimized out,
	 and hence the cfg we create is correct.  */
      gcc_assert (branch_code != NULL_RTX);

      swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
      set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
      single_pred_edge (swtch)->probability = REG_BR_PROB_BASE - p;
      e = make_edge (swtch, preheader,
		     single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
      e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
      e->probability = p;
    }

  if (extra_zero_check)
    {
      /* Add branch for zero iterations.  */
      p = REG_BR_PROB_BASE / (max_unroll + 1);
      swtch = ezc_swtch;
      preheader = split_edge (loop_preheader_edge (loop));
      branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
					  block_label (preheader), p,
					  NULL);
      gcc_assert (branch_code != NULL_RTX);

      swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
      set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
      single_succ_edge (swtch)->probability = REG_BR_PROB_BASE - p;
      e = make_edge (swtch, preheader,
		     single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
      e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
      e->probability = p;
    }

  /* Recount dominators for outer blocks.  */
  iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);

  /* And unroll loop.  */

  bitmap_ones (wont_exit);
  bitmap_clear_bit (wont_exit, may_exit_copy);
  opt_info_start_duplication (opt_info);

  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
				      max_unroll,
				      wont_exit, desc->out_edge,
				      &remove_edges,
				      DLTHE_FLAG_UPDATE_FREQ
				      | (opt_info
					 ? DLTHE_RECORD_COPY_NUMBER
					 : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, max_unroll, true, true);
      free_opt_info (opt_info);
    }

  free (wont_exit);

  if (exit_at_end)
    {
      basic_block exit_block = get_bb_copy (desc->in_edge->src);
      /* Find a new in and out edge; they are in the last copy we have
	 made.  */

      if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 0);
	  desc->in_edge = EDGE_SUCC (exit_block, 1);
	}
      else
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 1);
	  desc->in_edge = EDGE_SUCC (exit_block, 0);
	}
    }

  /* Remove the edges.  */
  FOR_EACH_VEC_ELT (remove_edges, i, e)
    remove_path (e);

  /* We must be careful when updating the number of iterations due to
     preconditioning and the fact that the value must be valid at entry
     of the loop.  After passing through the above code, we see that
     the correct new number of iterations is this:  */
  gcc_assert (!desc->const_iter);
  desc->niter_expr =
    simplify_gen_binary (UDIV, desc->mode, old_niter,
			 gen_int_mode (max_unroll + 1, desc->mode));
  loop->nb_iterations_upper_bound
    = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1);
  if (loop->any_estimate)
    loop->nb_iterations_estimate
      = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1);
  if (exit_at_end)
    {
      desc->niter_expr =
	simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
      desc->noloop_assumptions = NULL_RTX;
      --loop->nb_iterations_upper_bound;
      if (loop->any_estimate
	  && loop->nb_iterations_estimate != 0)
	--loop->nb_iterations_estimate;
      else
	loop->any_estimate = false;
    }

  if (dump_file)
    fprintf (dump_file,
	     ";; Unrolled loop %d times, counting # of iterations "
	     "in runtime, %i insns\n",
	     max_unroll, num_loop_insns (loop));
}
/* Decide whether to unroll LOOP stupidly and how much.  */
static void
decide_unroll_stupid (struct loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, i;
  struct niter_desc *desc;
  widest_int iterations;

  if (!(flags & UAP_UNROLL_ALL))
    {
      /* We were not asked to, just return back silently.  */
      return;
    }

  if (dump_file)
    fprintf (dump_file, "\n;; Considering unrolling loop stupidly\n");

  /* nunroll = total number of copies of the original loop body in
     unrolled loop (i.e. if it is 2, we have to duplicate loop body once).  */
  nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
  nunroll_by_av
    = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
    nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);

  if (targetm.loop_unroll_adjust)
    nunroll = targetm.loop_unroll_adjust (nunroll, loop);

  /* Skip big loops.  */
  if (nunroll <= 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check simpleness.  */
  if (desc->simple_p && !desc->assumptions)
    {
      if (dump_file)
	fprintf (dump_file, ";; The loop is simple\n");
      return;
    }

  /* Do not unroll loops with branches inside -- it increases the number
     of mispredicts.
     TODO: this heuristic needs tuning; a call inside the loop body
     is also a relatively good reason not to unroll.  */
  if (num_loop_branches (loop) > 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling, contains branches\n");
      return;
    }

  /* Check whether the loop rolls.  */
  if ((get_estimated_loop_iterations (loop, &iterations)
       || get_max_loop_iterations (loop, &iterations))
      && wi::ltu_p (iterations, 2 * nunroll))
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
      return;
    }

  /* Success.  Now force nunroll to be a power of 2, as it seems that this
     improves results (partially because of better alignments, partially
     because of some dark magic).  */
  for (i = 1; 2 * i <= nunroll; i *= 2)
    continue;

  loop->lpt_decision.decision = LPT_UNROLL_STUPID;
  loop->lpt_decision.times = i - 1;
}
/* Unroll a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:

   while (cond)
     body;

   ==>  (LOOP->LPT_DECISION.TIMES == 3)

   while (cond)
     {
       body;
       if (!cond) break;
       body;
       if (!cond) break;
       body;
       if (!cond) break;
       body;
     }
   */
static void
unroll_loop_stupid (struct loop *loop)
{
  sbitmap wont_exit;
  unsigned nunroll = loop->lpt_decision.times;
  struct niter_desc *desc = get_simple_loop_desc (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  wont_exit = sbitmap_alloc (nunroll + 1);
  bitmap_clear (wont_exit);
  opt_info_start_duplication (opt_info);

  ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
				      nunroll, wont_exit,
				      NULL, NULL,
				      DLTHE_FLAG_UPDATE_FREQ
				      | (opt_info
					 ? DLTHE_RECORD_COPY_NUMBER
					 : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, nunroll, true, true);
      free_opt_info (opt_info);
    }

  free (wont_exit);

  if (desc->simple_p)
    {
      /* We indeed may get here provided that there are nontrivial assumptions
	 for a loop to be really simple.  We could update the counts, but the
	 problem is that we are unable to decide which exit will be taken
	 (not really true in case the number of iterations is constant,
	 but no one will do anything with this information, so we do not
	 worry about it).  */
      desc->simple_p = false;
    }

  if (dump_file)
    fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
	     nunroll, num_loop_insns (loop));
}
/* Returns true if REG is referenced in one nondebug insn in LOOP.
   Set *DEBUG_USES to the number of debug insns that reference the
   variable.  */

static bool
referenced_in_one_insn_in_loop_p (struct loop *loop, rtx reg,
				  int *debug_uses)
{
  basic_block *body, bb;
  unsigned i;
  int count_ref = 0;
  rtx_insn *insn;

  body = get_loop_body (loop);
  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = body[i];

      FOR_BB_INSNS (bb, insn)
	if (!rtx_referenced_p (reg, insn))
	  continue;
	else if (DEBUG_INSN_P (insn))
	  (*debug_uses)++;
	else if (++count_ref > 1)
	  break;
    }
  free (body);
  return (count_ref == 1);
}
/* Reset the DEBUG_USES debug insns in LOOP that reference REG.  */

static void
reset_debug_uses_in_loop (struct loop *loop, rtx reg, int debug_uses)
{
  basic_block *body, bb;
  unsigned i;
  rtx_insn *insn;

  body = get_loop_body (loop);
  for (i = 0; debug_uses && i < loop->num_nodes; i++)
    {
      bb = body[i];

      FOR_BB_INSNS (bb, insn)
	if (!DEBUG_INSN_P (insn) || !rtx_referenced_p (reg, insn))
	  continue;
	else
	  {
	    validate_change (insn, &INSN_VAR_LOCATION_LOC (insn),
			     gen_rtx_UNKNOWN_VAR_LOC (), 0);
	    if (!--debug_uses)
	      break;
	  }
    }
  free (body);
}
/* Determine whether INSN contains an accumulator
   which can be expanded into separate copies,
   one for each copy of the LOOP body.

   for (i = 0 ; i < n; i++)
     sum += something;

   ==>

   sum += something
   ....
   i = i+1;
   sum1 += something
   ....
   i = i+1
   sum2 += something;
   ....

   Return NULL if INSN contains no opportunity for expansion of accumulator.
   Otherwise, allocate a VAR_TO_EXPAND structure, fill it with the relevant
   information and return a pointer to it.  */

static struct var_to_expand *
analyze_insn_to_expand_var (struct loop *loop, rtx_insn *insn)
{
  rtx set, dest, src;
  struct var_to_expand *ves;
  rtx_code code;
  unsigned accum_pos;
  int debug_uses = 0;

  set = single_set (insn);
  if (!set)
    return NULL;

  dest = SET_DEST (set);
  src = SET_SRC (set);
  code = GET_CODE (src);

  if (code != PLUS && code != MINUS && code != MULT && code != FMA)
    return NULL;

  if (FLOAT_MODE_P (GET_MODE (dest)))
    {
      if (!flag_associative_math)
	return NULL;
      /* In the case of FMA, we're also changing the rounding.  */
      if (code == FMA && !flag_unsafe_math_optimizations)
	return NULL;
    }

  /* Hmm, this is a bit paradoxical.  We know that INSN is a valid insn
     in MD.  But if there is no optab to generate the insn, we cannot
     perform the variable expansion.  This can happen if an MD provides
     an insn but not a named pattern to generate it, for example to avoid
     producing code that needs additional mode switches like for x87/mmx.

     So we check have_insn_for which looks for an optab for the operation
     in SRC.  If it doesn't exist, we cannot perform the expansion even
     though INSN is valid.  */
  if (!have_insn_for (code, GET_MODE (src)))
    return NULL;

  if (!REG_P (dest)
      && !(GET_CODE (dest) == SUBREG
	   && REG_P (SUBREG_REG (dest))))
    return NULL;

  /* Find the accumulator use within the operation.  */
  if (code == FMA)
    {
      /* We only support accumulation via FMA in the ADD position.  */
      if (!rtx_equal_p (dest, XEXP (src, 2)))
	return NULL;
      accum_pos = 2;
    }
  else if (rtx_equal_p (dest, XEXP (src, 0)))
    accum_pos = 0;
  else if (rtx_equal_p (dest, XEXP (src, 1)))
    {
      /* The method of expansion that we are using (which includes the
	 initialization of the expansions with zero and the summation of
	 the expansions at the end of the computation) will yield wrong
	 results for (x = something - x); thus avoid using it in that case.  */
      if (code == MINUS)
	return NULL;
      accum_pos = 1;
    }
  else
    return NULL;

  /* It must not otherwise be used.  */
  if (code == FMA)
    {
      if (rtx_referenced_p (dest, XEXP (src, 0))
	  || rtx_referenced_p (dest, XEXP (src, 1)))
	return NULL;
    }
  else if (rtx_referenced_p (dest, XEXP (src, 1 - accum_pos)))
    return NULL;

  /* It must be used in exactly one insn.  */
  if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses))
    return NULL;

  if (dump_file)
    {
      fprintf (dump_file, "\n;; Expanding Accumulator ");
      print_rtl (dump_file, dest);
      fprintf (dump_file, "\n");
    }

  if (debug_uses)
    /* Instead of resetting the debug insns, we could replace each
       debug use in the loop with the sum or product of all expanded
       accumulators.  Since we'll only know of all expansions at the
       end, we'd have to keep track of which vars_to_expand a debug
       insn in the loop references, take note of each copy of the
       debug insn during unrolling, and when it's all done, compute
       the sum or product of each variable and adjust the original
       debug insn and each copy thereof.  What a pain!  */
    reset_debug_uses_in_loop (loop, dest, debug_uses);

  /* Record the accumulator to expand.  */
  ves = XNEW (struct var_to_expand);
  ves->insn = insn;
  ves->reg = copy_rtx (dest);
  ves->var_expansions.create (1);
  ves->next = NULL;
  ves->op = GET_CODE (src);
  ves->expansion_count = 0;
  ves->reuse_expansion = 0;

  return ves;
}
/* Determine whether there is an induction variable in INSN that
   we would like to split during unrolling.

   I.e. replace

   i = i + 1;
   ...
   i = i + 1;
   ...
   i = i + 1;
   ...

   type chains by

   i0 = i + 1
   ...
   i = i0 + 1
   ...
   i = i0 + 2
   ...

   Return NULL if INSN contains no interesting IVs.  Otherwise, allocate
   an IV_TO_SPLIT structure, fill it with the relevant information and return a
   pointer to it.  */

static struct iv_to_split *
analyze_iv_to_split_insn (rtx_insn *insn)
{
  rtx set, dest;
  struct rtx_iv iv;
  struct iv_to_split *ivts;
  bool ok;

  /* For now we just split the basic induction variables.  Later this may be
     extended for example by selecting also addresses of memory references.  */
  set = single_set (insn);
  if (!set)
    return NULL;

  dest = SET_DEST (set);
  if (!REG_P (dest))
    return NULL;

  if (!biv_p (insn, dest))
    return NULL;

  ok = iv_analyze_result (insn, dest, &iv);

  /* This used to be an assert under the assumption that if biv_p returns
     true that iv_analyze_result must also return true.  However, that
     assumption is not strictly correct as evidenced by pr25569.

     Returning NULL when iv_analyze_result returns false is safe and
     avoids the problems in pr25569 until the iv_analyze_* routines
     can be fixed, which is apparently hard and time consuming
     according to their author.  */
  if (!ok)
    return NULL;

  if (iv.step == const0_rtx
      || iv.mode != iv.extend_mode)
    return NULL;

  /* Record the insn to split.  */
  ivts = XNEW (struct iv_to_split);
  ivts->insn = insn;
  ivts->orig_var = dest;
  ivts->base_var = NULL_RTX;
  ivts->step = iv.step;
  ivts->next = NULL;

  return ivts;
}
/* Determines which of the insns in LOOP can be optimized.
   Return an OPT_INFO struct with the relevant hash tables filled
   with all insns to be optimized.  The FIRST_NEW_BLOCK field
   is undefined for the return value.  */

static struct opt_info *
analyze_insns_in_loop (struct loop *loop)
{
  basic_block *body, bb;
  unsigned i;
  struct opt_info *opt_info = XCNEW (struct opt_info);
  rtx_insn *insn;
  struct iv_to_split *ivts = NULL;
  struct var_to_expand *ves = NULL;
  iv_to_split **slot1;
  var_to_expand **slot2;
  vec<edge> edges = get_loop_exit_edges (loop);
  edge exit;
  bool can_apply = false;

  iv_analysis_loop_init (loop);

  body = get_loop_body (loop);

  if (flag_split_ivs_in_unroller)
    {
      opt_info->insns_to_split
	= new hash_table<iv_split_hasher> (5 * loop->num_nodes);
      opt_info->iv_to_split_head = NULL;
      opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
    }

  /* Record the loop exit bb and loop preheader before the unrolling.  */
  opt_info->loop_preheader = loop_preheader_edge (loop)->src;

  if (edges.length () == 1)
    {
      exit = edges[0];
      if (!(exit->flags & EDGE_COMPLEX))
	{
	  opt_info->loop_exit = split_edge (exit);
	  can_apply = true;
	}
    }

  if (flag_variable_expansion_in_unroller
      && can_apply)
    {
      opt_info->insns_with_var_to_expand
	= new hash_table<var_expand_hasher> (5 * loop->num_nodes);
      opt_info->var_to_expand_head = NULL;
      opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
    }

  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = body[i];
      if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
	continue;

      FOR_BB_INSNS (bb, insn)
	{
	  if (!INSN_P (insn))
	    continue;

	  if (opt_info->insns_to_split)
	    ivts = analyze_iv_to_split_insn (insn);

	  if (ivts)
	    {
	      slot1 = opt_info->insns_to_split->find_slot (ivts, INSERT);
	      gcc_assert (*slot1 == NULL);
	      *slot1 = ivts;
	      *opt_info->iv_to_split_tail = ivts;
	      opt_info->iv_to_split_tail = &ivts->next;
	      continue;
	    }

	  if (opt_info->insns_with_var_to_expand)
	    ves = analyze_insn_to_expand_var (loop, insn);

	  if (ves)
	    {
	      slot2 = opt_info->insns_with_var_to_expand->find_slot (ves, INSERT);
	      gcc_assert (*slot2 == NULL);
	      *slot2 = ves;
	      *opt_info->var_to_expand_tail = ves;
	      opt_info->var_to_expand_tail = &ves->next;
	    }
	}
    }

  edges.release ();
  free (body);
  return opt_info;
}
/* Called just before loop duplication.  Records the start of the
   duplicated area.  */

static void
opt_info_start_duplication (struct opt_info *opt_info)
{
  if (opt_info)
    opt_info->first_new_block = last_basic_block_for_fn (cfun);
}
/* Determine the number of iterations between initialization of the base
   variable and the current copy (N_COPY).  N_COPIES is the total number
   of newly created copies.  UNROLLING is true if we are unrolling
   (not peeling) the loop.  */

static unsigned
determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
{
  if (unrolling)
    {
      /* If we are unrolling, initialization is done in the original loop
	 body (number 0).  */
      return n_copy;
    }
  else
    {
      /* If we are peeling, the copy in which the initialization occurs has
	 number 1.  The original loop (number 0) is the last.  */
      if (n_copy)
	return n_copy - 1;
      else
	return n_copies;
    }
}
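/* For example (illustrative, not from the original comments): when
   unrolling, the copy whose bb->aux records position 2 gets delta 2,
   so "i = i + 1" there is rewritten to "i = i0 + 2" once the base
   variable i0 has been initialized in the copy with delta 0.  */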
/* Allocate basic variable for the induction variable chain.  */

static void
allocate_basic_variable (struct iv_to_split *ivts)
{
  rtx expr = SET_SRC (single_set (ivts->insn));

  ivts->base_var = gen_reg_rtx (GET_MODE (expr));
}
/* Insert initialization of basic variable of IVTS before INSN, taking
   the initial value from INSN.  */

static void
insert_base_initialization (struct iv_to_split *ivts, rtx_insn *insn)
{
  rtx expr = copy_rtx (SET_SRC (single_set (insn)));
  rtx_insn *seq;

  start_sequence ();
  expr = force_operand (expr, ivts->base_var);
  if (expr != ivts->base_var)
    emit_move_insn (ivts->base_var, expr);
  seq = get_insns ();
  end_sequence ();

  emit_insn_before (seq, insn);
}
/* Replace the use of the induction variable described in IVTS in INSN
   by base variable + DELTA * step.  */

static void
split_iv (struct iv_to_split *ivts, rtx_insn *insn, unsigned delta)
{
  rtx expr, *loc, incr, var;
  rtx_insn *seq;
  machine_mode mode = GET_MODE (ivts->base_var);
  rtx src, dest, set;

  /* Construct base + DELTA * step.  */
  if (!delta)
    expr = ivts->base_var;
  else
    {
      incr = simplify_gen_binary (MULT, mode,
				  ivts->step, gen_int_mode (delta, mode));
      expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
				  ivts->base_var, incr);
    }

  /* Figure out where to do the replacement.  */
  loc = &SET_SRC (single_set (insn));

  /* If we can make the replacement right away, we're done.  */
  if (validate_change (insn, loc, expr, 0))
    return;

  /* Otherwise, force EXPR into a register and try again.  */
  start_sequence ();
  var = gen_reg_rtx (mode);
  expr = force_operand (expr, var);
  if (expr != var)
    emit_move_insn (var, expr);
  seq = get_insns ();
  end_sequence ();
  emit_insn_before (seq, insn);

  if (validate_change (insn, loc, var, 0))
    return;

  /* The last chance.  Try recreating the assignment in insn
     completely from scratch.  */
  set = single_set (insn);
  gcc_assert (set);

  start_sequence ();
  *loc = var;
  src = copy_rtx (SET_SRC (set));
  dest = copy_rtx (SET_DEST (set));
  src = force_operand (src, dest);
  if (src != dest)
    emit_move_insn (dest, src);
  seq = get_insns ();
  end_sequence ();

  emit_insn_before (seq, insn);
  delete_insn (insn);
}
/* Return one expansion of the accumulator recorded in struct VE.  */

static rtx
get_expansion (struct var_to_expand *ve)
{
  rtx reg;

  if (ve->reuse_expansion == 0)
    reg = ve->reg;
  else
    reg = ve->var_expansions[ve->reuse_expansion - 1];

  if (ve->var_expansions.length () == (unsigned) ve->reuse_expansion)
    ve->reuse_expansion = 0;
  else
    ve->reuse_expansion++;

  return reg;
}
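/* For example (illustrative, not from the original comments): with two
   recorded expansions, successive calls yield ve->reg,
   var_expansions[0], var_expansions[1], and then wrap back to ve->reg,
   distributing reuse round-robin over the accumulator copies.  */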
/* Given INSN replace the uses of the accumulator recorded in VE
   with a new register.  */

static void
expand_var_during_unrolling (struct var_to_expand *ve, rtx_insn *insn)
{
  rtx new_reg, set;
  bool really_new_expansion = false;

  set = single_set (insn);
  gcc_assert (set);

  /* Generate a new register only if the expansion limit has not been
     reached.  Else reuse an already existing expansion.  */
  if (PARAM_VALUE (PARAM_MAX_VARIABLE_EXPANSIONS) > ve->expansion_count)
    {
      really_new_expansion = true;
      new_reg = gen_reg_rtx (GET_MODE (ve->reg));
    }
  else
    new_reg = get_expansion (ve);

  validate_replace_rtx_group (SET_DEST (set), new_reg, insn);
  if (apply_change_group ())
    if (really_new_expansion)
      {
	ve->var_expansions.safe_push (new_reg);
	ve->expansion_count++;
      }
}
/* Initialize the variable expansions in the loop preheader.  PLACE is the
   loop-preheader basic block where the initialization of the
   expansions should take place.  The expansions are initialized with
   (-0) when the operation is plus or minus to honor sign zero.  This
   way we can prevent cases where the sign of the final result is
   affected by the sign of the expansion.  Here is an example to
   demonstrate this:

   for (i = 0 ; i < n; i++)
     sum += something;

   ==>

   sum += something
   ....
   i = i+1;
   sum1 += something
   ....
   i = i+1
   sum2 += something;
   ....

   When SUM is initialized with -zero and SOMETHING is also -zero; the
   final result of sum should be -zero thus the expansions sum1 and sum2
   should be initialized with -zero as well (otherwise we will get +zero
   as the final result).  */

static void
insert_var_expansion_initialization (struct var_to_expand *ve,
				     basic_block place)
{
  rtx_insn *seq;
  rtx var, zero_init;
  unsigned i;
  machine_mode mode = GET_MODE (ve->reg);
  bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);

  if (ve->var_expansions.length () == 0)
    return;

  start_sequence ();
  switch (ve->op)
    {
    case FMA:
      /* Note that we only accumulate FMA via the ADD operand.  */
    case PLUS:
    case MINUS:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	{
	  if (honor_signed_zero_p)
	    zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode);
	  else
	    zero_init = CONST0_RTX (mode);
	  emit_move_insn (var, zero_init);
	}
      break;

    case MULT:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	{
	  zero_init = CONST1_RTX (GET_MODE (var));
	  emit_move_insn (var, zero_init);
	}
      break;

    default:
      gcc_unreachable ();
    }

  seq = get_insns ();
  end_sequence ();

  emit_insn_after (seq, BB_END (place));
}
/* Combine the variable expansions at the loop exit.  PLACE is the
   loop exit basic block where the summation of the expansions should
   take place.  */

static void
combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
{
  rtx sum = ve->reg;
  rtx expr, var;
  rtx_insn *seq, *insn;
  unsigned i;

  if (ve->var_expansions.length () == 0)
    return;

  start_sequence ();
  switch (ve->op)
    {
    case FMA:
      /* Note that we only accumulate FMA via the ADD operand.  */
    case PLUS:
    case MINUS:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), var, sum);
      break;

    case MULT:
      FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
	sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), var, sum);
      break;

    default:
      gcc_unreachable ();
    }

  expr = force_operand (sum, ve->reg);
  if (expr != ve->reg)
    emit_move_insn (ve->reg, expr);
  seq = get_insns ();
  end_sequence ();

  insn = BB_HEAD (place);
  while (!NOTE_INSN_BASIC_BLOCK_P (insn))
    insn = NEXT_INSN (insn);

  emit_insn_after (seq, insn);
}
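/* For example (illustrative, not from the original comments): for a
   PLUS accumulator SUM with two expansions sum1 and sum2, the sequence
   built above computes sum = sum2 + (sum1 + sum) right after the
   NOTE_INSN_BASIC_BLOCK of the exit block, folding the partial sums
   back into the original register.  */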
/* Strip away REG_EQUAL notes for IVs we're splitting.

   Updating REG_EQUAL notes for IVs we split is tricky: We
   cannot tell until after unrolling, DF-rescanning, and liveness
   updating, whether an EQ_USE is reached by the split IV while
   the IV reg is still live.  See PR55006.

   ??? We cannot use remove_reg_equal_equiv_notes_for_regno,
   because RTL loop-iv requires us to defer rescanning insns and
   any notes attached to them.  So resort to old techniques...  */

static void
maybe_strip_eq_note_for_split_iv (struct opt_info *opt_info, rtx_insn *insn)
{
  struct iv_to_split *ivts;
  rtx note = find_reg_equal_equiv_note (insn);
  if (!note)
    return;
  for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
    if (reg_mentioned_p (ivts->orig_var, note))
      {
	remove_note (insn, note);
	return;
      }
}
/* Apply loop optimizations in loop copies using the
   data which was gathered during the unrolling.  Structure
   OPT_INFO records that data.

   UNROLLING is true if we unrolled (not peeled) the loop.
   REWRITE_ORIGINAL_BODY is true if we should also rewrite the original body of
   the loop (as it should happen in complete unrolling, but not in ordinary
   peeling of the loop).  */

static void
apply_opt_in_copies (struct opt_info *opt_info,
		     unsigned n_copies, bool unrolling,
		     bool rewrite_original_loop)
{
  unsigned i, delta;
  basic_block bb, orig_bb;
  rtx_insn *insn, *orig_insn, *next;
  struct iv_to_split ivts_templ, *ivts;
  struct var_to_expand ve_templ, *ves;

  /* Sanity check -- we need to put initialization in the original loop
     body.  */
  gcc_assert (!unrolling || rewrite_original_loop);

  /* Allocate the basic variables (i0).  */
  if (opt_info->insns_to_split)
    for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
      allocate_basic_variable (ivts);

  for (i = opt_info->first_new_block;
       i < (unsigned) last_basic_block_for_fn (cfun);
       i++)
    {
      bb = BASIC_BLOCK_FOR_FN (cfun, i);
      orig_bb = get_bb_original (bb);

      /* bb->aux holds position in copy sequence initialized by
	 duplicate_loop_to_header_edge.  */
      delta = determine_split_iv_delta ((size_t)bb->aux, n_copies,
					unrolling);
      bb->aux = 0;
      orig_insn = BB_HEAD (orig_bb);
      FOR_BB_INSNS_SAFE (bb, insn, next)
	{
	  if (!INSN_P (insn)
	      || (DEBUG_INSN_P (insn)
		  && TREE_CODE (INSN_VAR_LOCATION_DECL (insn)) == LABEL_DECL))
	    continue;

	  while (!INSN_P (orig_insn)
		 || (DEBUG_INSN_P (orig_insn)
		     && (TREE_CODE (INSN_VAR_LOCATION_DECL (orig_insn))
			 == LABEL_DECL)))
	    orig_insn = NEXT_INSN (orig_insn);

	  ivts_templ.insn = orig_insn;
	  ve_templ.insn = orig_insn;

	  /* Apply splitting iv optimization.  */
	  if (opt_info->insns_to_split)
	    {
	      maybe_strip_eq_note_for_split_iv (opt_info, insn);

	      ivts = opt_info->insns_to_split->find (&ivts_templ);

	      if (ivts)
		{
		  gcc_assert (GET_CODE (PATTERN (insn))
			      == GET_CODE (PATTERN (orig_insn)));

		  if (!delta)
		    insert_base_initialization (ivts, insn);
		  split_iv (ivts, insn, delta);
		}
	    }
	  /* Apply variable expansion optimization.  */
	  if (unrolling && opt_info->insns_with_var_to_expand)
	    {
	      ves = (struct var_to_expand *)
		opt_info->insns_with_var_to_expand->find (&ve_templ);
	      if (ves)
		{
		  gcc_assert (GET_CODE (PATTERN (insn))
			      == GET_CODE (PATTERN (orig_insn)));
		  expand_var_during_unrolling (ves, insn);
		}
	    }
	  orig_insn = NEXT_INSN (orig_insn);
	}
    }

  if (!rewrite_original_loop)
    return;

  /* Initialize the variable expansions in the loop preheader
     and take care of combining them at the loop exit.  */
  if (opt_info->insns_with_var_to_expand)
    {
      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
	insert_var_expansion_initialization (ves, opt_info->loop_preheader);
      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
	combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
    }

  /* Rewrite also the original loop body.  Find them as originals of the blocks
     in the last copied iteration, i.e. those that have
     get_bb_copy (get_bb_original (bb)) == bb.  */
  for (i = opt_info->first_new_block;
       i < (unsigned) last_basic_block_for_fn (cfun);
       i++)
    {
      bb = BASIC_BLOCK_FOR_FN (cfun, i);
      orig_bb = get_bb_original (bb);
      if (get_bb_copy (orig_bb) != bb)
	continue;

      delta = determine_split_iv_delta (0, n_copies, unrolling);
      for (orig_insn = BB_HEAD (orig_bb);
	   orig_insn != NEXT_INSN (BB_END (bb));
	   orig_insn = next)
	{
	  next = NEXT_INSN (orig_insn);

	  if (!INSN_P (orig_insn))
	    continue;

	  ivts_templ.insn = orig_insn;
	  if (opt_info->insns_to_split)
	    {
	      maybe_strip_eq_note_for_split_iv (opt_info, orig_insn);

	      ivts = (struct iv_to_split *)
		opt_info->insns_to_split->find (&ivts_templ);
	      if (ivts)
		{
		  if (!delta)
		    insert_base_initialization (ivts, orig_insn);
		  split_iv (ivts, orig_insn, delta);
		  continue;
		}
	    }
	}
    }
}
2079 free_opt_info (struct opt_info
*opt_info
)
2081 delete opt_info
->insns_to_split
;
2082 opt_info
->insns_to_split
= NULL
;
2083 if (opt_info
->insns_with_var_to_expand
)
2085 struct var_to_expand
*ves
;
2087 for (ves
= opt_info
->var_to_expand_head
; ves
; ves
= ves
->next
)
2088 ves
->var_expansions
.release ();
2089 delete opt_info
->insns_with_var_to_expand
;
2090 opt_info
->insns_with_var_to_expand
= NULL
;