gcc/loop-unroll.c

   1 /* Loop unrolling and peeling.
   2    Copyright (C) 2002-2013 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify it under
   7 the terms of the GNU General Public License as published by the Free
   8 Software Foundation; either version 3, or (at your option) any later
   9 version.
  10
  11 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  12 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GCC; see the file COPYING3.  If not see
  18 <http://www.gnu.org/licenses/>.  */
  19
  20 #include "config.h"
  21 #include "system.h"
  22 #include "coretypes.h"
  23 #include "tm.h"
  24 #include "rtl.h"
  25 #include "tree.h"
  26 #include "hard-reg-set.h"
  27 #include "obstack.h"
  28 #include "basic-block.h"
  29 #include "cfgloop.h"
  30 #include "params.h"
  31 #include "expr.h"
  32 #include "hash-table.h"
  33 #include "recog.h"
  34 #include "target.h"
  35 #include "dumpfile.h"
  36
  37 /* This pass performs loop unrolling and peeling.  We only perform these
  38    optimizations on innermost loops (with single exception) because
  39    the impact on performance is greatest here, and we want to avoid
  40    unnecessary code size growth.  The gain is caused by greater sequentiality
  41    of code, better code to optimize for further passes and in some cases
  42    by fewer testings of exit conditions.  The main problem is code growth,
  43    that impacts performance negatively due to effect of caches.
  44
  45    What we do:
  46
  47    -- complete peeling of once-rolling loops; this is the above mentioned
  48       exception, as this causes loop to be cancelled completely and
  49       does not cause code growth
  50    -- complete peeling of loops that roll (small) constant times.
  51    -- simple peeling of first iterations of loops that do not roll much
  52       (according to profile feedback)
  53    -- unrolling of loops that roll constant times; this is almost always
  54       win, as we get rid of exit condition tests.
  55    -- unrolling of loops that roll number of times that we can compute
  56       in runtime; we also get rid of exit condition tests here, but there
  57       is the extra expense for calculating the number of iterations
  58    -- simple unrolling of remaining loops; this is performed only if we
  59       are asked to, as the gain is questionable in this case and often
  60       it may even slow down the code
  61    For more detailed descriptions of each of those, see comments at
  62    appropriate function below.
  63
  64    There is a lot of parameters (defined and described in params.def) that
  65    control how much we unroll/peel.
  66
  67    ??? A great problem is that we don't have a good way how to determine
  68    how many times we should unroll the loop; the experiments I have made
  69    showed that this choice may affect performance in order of several %.
  70    */
  71
  72 /* Information about induction variables to split.  */
  73
  74 struct iv_to_split
  75 {
  76   rtx insn;             /* The insn in that the induction variable occurs.  */
  77   rtx orig_var;         /* The variable (register) for the IV before split.  */
  78   rtx base_var;         /* The variable on that the values in the further
  79                            iterations are based.  */
  80   rtx step;             /* Step of the induction variable.  */
  81   struct iv_to_split *next; /* Next entry in walking order.  */
  82   unsigned n_loc;
  83   unsigned loc[3];      /* Location where the definition of the induction
  84                            variable occurs in the insn.  For example if
  85                            N_LOC is 2, the expression is located at
  86                            XEXP (XEXP (single_set, loc[0]), loc[1]).  */
  87 };
  88
  89 /* Information about accumulators to expand.  */
  90
  91 struct var_to_expand
  92 {
  93   rtx insn;                        /* The insn in that the variable expansion occurs.  */
  94   rtx reg;                         /* The accumulator which is expanded.  */
  95   vec<rtx> var_expansions;   /* The copies of the accumulator which is expanded.  */
  96   struct var_to_expand *next;      /* Next entry in walking order.  */
  97   enum rtx_code op;                /* The type of the accumulation - addition, subtraction
  98                                       or multiplication.  */
  99   int expansion_count;             /* Count the number of expansions generated so far.  */
 100   int reuse_expansion;             /* The expansion we intend to reuse to expand
 101                                       the accumulator.  If REUSE_EXPANSION is 0 reuse
 102                                       the original accumulator.  Else use
 103                                       var_expansions[REUSE_EXPANSION - 1].  */
 104 };
 105
 106 /* Hashtable helper for iv_to_split.  */
 107
 108 struct iv_split_hasher : typed_free_remove <iv_to_split>
 109 {
 110   typedef iv_to_split value_type;
 111   typedef iv_to_split compare_type;
 112   static inline hashval_t hash (const value_type *);
 113   static inline bool equal (const value_type *, const compare_type *);
 114 };
 115
 116
 117 /* A hash function for information about insns to split.  */
 118
 119 inline hashval_t
 120 iv_split_hasher::hash (const value_type *ivts)
 121 {
 122   return (hashval_t) INSN_UID (ivts->insn);
 123 }
 124
 125 /* An equality functions for information about insns to split.  */
 126
 127 inline bool
 128 iv_split_hasher::equal (const value_type *i1, const compare_type *i2)
 129 {
 130   return i1->insn == i2->insn;
 131 }
 132
 133 /* Hashtable helper for iv_to_split.  */
 134
 135 struct var_expand_hasher : typed_free_remove <var_to_expand>
 136 {
 137   typedef var_to_expand value_type;
 138   typedef var_to_expand compare_type;
 139   static inline hashval_t hash (const value_type *);
 140   static inline bool equal (const value_type *, const compare_type *);
 141 };
 142
 143 /* Return a hash for VES.  */
 144
 145 inline hashval_t
 146 var_expand_hasher::hash (const value_type *ves)
 147 {
 148   return (hashval_t) INSN_UID (ves->insn);
 149 }
 150
 151 /* Return true if I1 and I2 refer to the same instruction.  */
 152
 153 inline bool
 154 var_expand_hasher::equal (const value_type *i1, const compare_type *i2)
 155 {
 156   return i1->insn == i2->insn;
 157 }
 158
 159 /* Information about optimization applied in
 160    the unrolled loop.  */
 161
 162 struct opt_info
 163 {
 164   hash_table <iv_split_hasher> insns_to_split; /* A hashtable of insns to
 165                                                   split.  */
 166   struct iv_to_split *iv_to_split_head; /* The first iv to split.  */
 167   struct iv_to_split **iv_to_split_tail; /* Pointer to the tail of the list.  */
 168   hash_table <var_expand_hasher> insns_with_var_to_expand; /* A hashtable of
 169                                         insns with accumulators to expand.  */
 170   struct var_to_expand *var_to_expand_head; /* The first var to expand.  */
 171   struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the list.  */
 172   unsigned first_new_block;        /* The first basic block that was
 173                                       duplicated.  */
 174   basic_block loop_exit;           /* The loop exit basic block.  */
 175   basic_block loop_preheader;      /* The loop preheader basic block.  */
 176 };
 177
 178 static void decide_unrolling_and_peeling (int);
 179 static void peel_loops_completely (int);
 180 static void decide_peel_simple (struct loop *, int);
 181 static void decide_peel_once_rolling (struct loop *, int);
 182 static void decide_peel_completely (struct loop *, int);
 183 static void decide_unroll_stupid (struct loop *, int);
 184 static void decide_unroll_constant_iterations (struct loop *, int);
 185 static void decide_unroll_runtime_iterations (struct loop *, int);
 186 static void peel_loop_simple (struct loop *);
 187 static void peel_loop_completely (struct loop *);
 188 static void unroll_loop_stupid (struct loop *);
 189 static void unroll_loop_constant_iterations (struct loop *);
 190 static void unroll_loop_runtime_iterations (struct loop *);
 191 static struct opt_info *analyze_insns_in_loop (struct loop *);
 192 static void opt_info_start_duplication (struct opt_info *);
 193 static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
 194 static void free_opt_info (struct opt_info *);
 195 static struct var_to_expand *analyze_insn_to_expand_var (struct loop*, rtx);
 196 static bool referenced_in_one_insn_in_loop_p (struct loop *, rtx, int *);
 197 static struct iv_to_split *analyze_iv_to_split_insn (rtx);
 198 static void expand_var_during_unrolling (struct var_to_expand *, rtx);
 199 static void insert_var_expansion_initialization (struct var_to_expand *,
 200                                                  basic_block);
 201 static void combine_var_copies_in_loop_exit (struct var_to_expand *,
 202                                              basic_block);
 203 static rtx get_expansion (struct var_to_expand *);
 204
 205 /* Emit a message summarizing the unroll or peel that will be
 206    performed for LOOP, along with the loop's location LOCUS, if
 207    appropriate given the dump or -fopt-info settings.  */
 208
 209 static void
 210 report_unroll_peel (struct loop *loop, location_t locus)
 211 {
 212   struct niter_desc *desc;
 213   int niters = 0;
 214   int report_flags = MSG_OPTIMIZED_LOCATIONS | TDF_RTL | TDF_DETAILS;
 215
 216   if (loop->lpt_decision.decision == LPT_NONE)
 217     return;
 218
 219   if (!dump_enabled_p ())
 220     return;
 221
 222   /* In the special case where the loop never iterated, emit
 223      a different message so that we don't report an unroll by 0.
 224      This matches the equivalent message emitted during tree unrolling.  */
 225   if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY
 226       && !loop->lpt_decision.times)
 227     {
 228       dump_printf_loc (report_flags, locus,
 229                        "loop turned into non-loop; it never loops.\n");
 230       return;
 231     }
 232
 233   desc = get_simple_loop_desc (loop);
 234
 235   if (desc->const_iter)
 236     niters = desc->niter;
 237   else if (loop->header->count)
 238     niters = expected_loop_iterations (loop);
 239
 240   if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
 241     dump_printf_loc (report_flags, locus,
 242                      "loop with %d iterations completely unrolled",
 243                      loop->lpt_decision.times + 1);
 244   else
 245     dump_printf_loc (report_flags, locus,
 246                      "loop %s %d times",
 247                      (loop->lpt_decision.decision == LPT_PEEL_SIMPLE
 248                        ? "peeled" : "unrolled"),
 249                      loop->lpt_decision.times);
 250   if (profile_info)
 251     dump_printf (report_flags,
 252                  " (header execution count %d",
 253                  (int)loop->header->count);
 254   if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
 255     dump_printf (report_flags,
 256                  "%s%s iterations %d)",
 257                  profile_info ? ", " : " (",
 258                  desc->const_iter ? "const" : "average",
 259                  niters);
 260   else if (profile_info)
 261     dump_printf (report_flags, ")");
 262
 263   dump_printf (report_flags, "\n");
 264 }
 265
 266 /* Unroll and/or peel (depending on FLAGS) LOOPS.  */
 267 void
 268 unroll_and_peel_loops (int flags)
 269 {
 270   struct loop *loop;
 271   bool changed = false;
 272
 273   /* First perform complete loop peeling (it is almost surely a win,
 274      and affects parameters for further decision a lot).  */
 275   peel_loops_completely (flags);
 276
 277   /* Now decide rest of unrolling and peeling.  */
 278   decide_unrolling_and_peeling (flags);
 279
 280   /* Scan the loops, inner ones first.  */
 281   FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
 282     {
 283       /* And perform the appropriate transformations.  */
 284       switch (loop->lpt_decision.decision)
 285         {
 286         case LPT_PEEL_COMPLETELY:
 287           /* Already done.  */
 288           gcc_unreachable ();
 289         case LPT_PEEL_SIMPLE:
 290           peel_loop_simple (loop);
 291           changed = true;
 292           break;
 293         case LPT_UNROLL_CONSTANT:
 294           unroll_loop_constant_iterations (loop);
 295           changed = true;
 296           break;
 297         case LPT_UNROLL_RUNTIME:
 298           unroll_loop_runtime_iterations (loop);
 299           changed = true;
 300           break;
 301         case LPT_UNROLL_STUPID:
 302           unroll_loop_stupid (loop);
 303           changed = true;
 304           break;
 305         case LPT_NONE:
 306           break;
 307         default:
 308           gcc_unreachable ();
 309         }
 310     }
 311
 312     if (changed)
 313       {
 314         calculate_dominance_info (CDI_DOMINATORS);
 315         fix_loop_structure (NULL);
 316       }
 317
 318   iv_analysis_done ();
 319 }
 320
 321 /* Check whether exit of the LOOP is at the end of loop body.  */
 322
 323 static bool
 324 loop_exit_at_end_p (struct loop *loop)
 325 {
 326   struct niter_desc *desc = get_simple_loop_desc (loop);
 327   rtx insn;
 328
 329   if (desc->in_edge->dest != loop->latch)
 330     return false;
 331
 332   /* Check that the latch is empty.  */
 333   FOR_BB_INSNS (loop->latch, insn)
 334     {
 335       if (NONDEBUG_INSN_P (insn))
 336         return false;
 337     }
 338
 339   return true;
 340 }
 341
 342 /* Depending on FLAGS, check whether to peel loops completely and do so.  */
 343 static void
 344 peel_loops_completely (int flags)
 345 {
 346   struct loop *loop;
 347   bool changed = false;
 348
 349   /* Scan the loops, the inner ones first.  */
 350   FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
 351     {
 352       loop->lpt_decision.decision = LPT_NONE;
 353       location_t locus = get_loop_location (loop);
 354
 355       if (dump_enabled_p ())
 356         dump_printf_loc (TDF_RTL, locus,
 357                          ";; *** Considering loop %d at BB %d for "
 358                          "complete peeling ***\n",
 359                          loop->num, loop->header->index);
 360
 361       loop->ninsns = num_loop_insns (loop);
 362
 363       decide_peel_once_rolling (loop, flags);
 364       if (loop->lpt_decision.decision == LPT_NONE)
 365         decide_peel_completely (loop, flags);
 366
 367       if (loop->lpt_decision.decision == LPT_PEEL_COMPLETELY)
 368         {
 369           report_unroll_peel (loop, locus);
 370           peel_loop_completely (loop);
 371           changed = true;
 372         }
 373     }
 374
 375     if (changed)
 376       {
 377         calculate_dominance_info (CDI_DOMINATORS);
 378         fix_loop_structure (NULL);
 379       }
 380 }
 381
 382 /* Decide whether unroll or peel loops (depending on FLAGS) and how much.  */
 383 static void
 384 decide_unrolling_and_peeling (int flags)
 385 {
 386   struct loop *loop;
 387
 388   /* Scan the loops, inner ones first.  */
 389   FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
 390     {
 391       loop->lpt_decision.decision = LPT_NONE;
 392       location_t locus = get_loop_location (loop);
 393
 394       if (dump_enabled_p ())
 395         dump_printf_loc (TDF_RTL, locus,
 396                          ";; *** Considering loop %d at BB %d for "
 397                          "unrolling and peeling ***\n",
 398                          loop->num, loop->header->index);
 399
 400       /* Do not peel cold areas.  */
 401       if (optimize_loop_for_size_p (loop))
 402         {
 403           if (dump_file)
 404             fprintf (dump_file, ";; Not considering loop, cold area\n");
 405           continue;
 406         }
 407
 408       /* Can the loop be manipulated?  */
 409       if (!can_duplicate_loop_p (loop))
 410         {
 411           if (dump_file)
 412             fprintf (dump_file,
 413                      ";; Not considering loop, cannot duplicate\n");
 414           continue;
 415         }
 416
 417       /* Skip non-innermost loops.  */
 418       if (loop->inner)
 419         {
 420           if (dump_file)
 421             fprintf (dump_file, ";; Not considering loop, is not innermost\n");
 422           continue;
 423         }
 424
 425       loop->ninsns = num_loop_insns (loop);
 426       loop->av_ninsns = average_num_loop_insns (loop);
 427
 428       /* Try transformations one by one in decreasing order of
 429          priority.  */
 430
 431       decide_unroll_constant_iterations (loop, flags);
 432       if (loop->lpt_decision.decision == LPT_NONE)
 433         decide_unroll_runtime_iterations (loop, flags);
 434       if (loop->lpt_decision.decision == LPT_NONE)
 435         decide_unroll_stupid (loop, flags);
 436       if (loop->lpt_decision.decision == LPT_NONE)
 437         decide_peel_simple (loop, flags);
 438
 439       report_unroll_peel (loop, locus);
 440     }
 441 }
 442
 443 /* Decide whether the LOOP is once rolling and suitable for complete
 444    peeling.  */
 445 static void
 446 decide_peel_once_rolling (struct loop *loop, int flags ATTRIBUTE_UNUSED)
 447 {
 448   struct niter_desc *desc;
 449
 450   if (dump_file)
 451     fprintf (dump_file, "\n;; Considering peeling once rolling loop\n");
 452
 453   /* Is the loop small enough?  */
 454   if ((unsigned) PARAM_VALUE (PARAM_MAX_ONCE_PEELED_INSNS) < loop->ninsns)
 455     {
 456       if (dump_file)
 457         fprintf (dump_file, ";; Not considering loop, is too big\n");
 458       return;
 459     }
 460
 461   /* Check for simple loops.  */
 462   desc = get_simple_loop_desc (loop);
 463
 464   /* Check number of iterations.  */
 465   if (!desc->simple_p
 466       || desc->assumptions
 467       || desc->infinite
 468       || !desc->const_iter
 469       || (desc->niter != 0
 470           && get_max_loop_iterations_int (loop) != 0))
 471     {
 472       if (dump_file)
 473         fprintf (dump_file,
 474                  ";; Unable to prove that the loop rolls exactly once\n");
 475       return;
 476     }
 477
 478   /* Success.  */
 479   loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
 480 }
 481
 482 /* Decide whether the LOOP is suitable for complete peeling.  */
 483 static void
 484 decide_peel_completely (struct loop *loop, int flags ATTRIBUTE_UNUSED)
 485 {
 486   unsigned npeel;
 487   struct niter_desc *desc;
 488
 489   if (dump_file)
 490     fprintf (dump_file, "\n;; Considering peeling completely\n");
 491
 492   /* Skip non-innermost loops.  */
 493   if (loop->inner)
 494     {
 495       if (dump_file)
 496         fprintf (dump_file, ";; Not considering loop, is not innermost\n");
 497       return;
 498     }
 499
 500   /* Do not peel cold areas.  */
 501   if (optimize_loop_for_size_p (loop))
 502     {
 503       if (dump_file)
 504         fprintf (dump_file, ";; Not considering loop, cold area\n");
 505       return;
 506     }
 507
 508   /* Can the loop be manipulated?  */
 509   if (!can_duplicate_loop_p (loop))
 510     {
 511       if (dump_file)
 512         fprintf (dump_file,
 513                  ";; Not considering loop, cannot duplicate\n");
 514       return;
 515     }
 516
 517   /* npeel = number of iterations to peel.  */
 518   npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEELED_INSNS) / loop->ninsns;
 519   if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES))
 520     npeel = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES);
 521
 522   /* Is the loop small enough?  */
 523   if (!npeel)
 524     {
 525       if (dump_file)
 526         fprintf (dump_file, ";; Not considering loop, is too big\n");
 527       return;
 528     }
 529
 530   /* Check for simple loops.  */
 531   desc = get_simple_loop_desc (loop);
 532
 533   /* Check number of iterations.  */
 534   if (!desc->simple_p
 535       || desc->assumptions
 536       || !desc->const_iter
 537       || desc->infinite)
 538     {
 539       if (dump_file)
 540         fprintf (dump_file,
 541                  ";; Unable to prove that the loop iterates constant times\n");
 542       return;
 543     }
 544
 545   if (desc->niter > npeel - 1)
 546     {
 547       if (dump_file)
 548         {
 549           fprintf (dump_file,
 550                    ";; Not peeling loop completely, rolls too much (");
 551           fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC, desc->niter);
 552           fprintf (dump_file, " iterations > %d [maximum peelings])\n", npeel);
 553         }
 554       return;
 555     }
 556
 557   /* Success.  */
 558   loop->lpt_decision.decision = LPT_PEEL_COMPLETELY;
 559 }
 560
 561 /* Peel all iterations of LOOP, remove exit edges and cancel the loop
 562    completely.  The transformation done:
 563
 564    for (i = 0; i < 4; i++)
 565      body;
 566
 567    ==>
 568
 569    i = 0;
 570    body; i++;
 571    body; i++;
 572    body; i++;
 573    body; i++;
 574    */
 575 static void
 576 peel_loop_completely (struct loop *loop)
 577 {
 578   sbitmap wont_exit;
 579   unsigned HOST_WIDE_INT npeel;
 580   unsigned i;
 581   edge ein;
 582   struct niter_desc *desc = get_simple_loop_desc (loop);
 583   struct opt_info *opt_info = NULL;
 584
 585   npeel = desc->niter;
 586
 587   if (npeel)
 588     {
 589       bool ok;
 590
 591       wont_exit = sbitmap_alloc (npeel + 1);
 592       bitmap_ones (wont_exit);
 593       bitmap_clear_bit (wont_exit, 0);
 594       if (desc->noloop_assumptions)
 595         bitmap_clear_bit (wont_exit, 1);
 596
 597       auto_vec<edge> remove_edges;
 598       if (flag_split_ivs_in_unroller)
 599         opt_info = analyze_insns_in_loop (loop);
 600
 601       opt_info_start_duplication (opt_info);
 602       ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 603                                           npeel,
 604                                           wont_exit, desc->out_edge,
 605                                           &remove_edges,
 606                                           DLTHE_FLAG_UPDATE_FREQ
 607                                           | DLTHE_FLAG_COMPLETTE_PEEL
 608                                           | (opt_info
 609                                              ? DLTHE_RECORD_COPY_NUMBER : 0));
 610       gcc_assert (ok);
 611
 612       free (wont_exit);
 613
 614       if (opt_info)
 615         {
 616           apply_opt_in_copies (opt_info, npeel, false, true);
 617           free_opt_info (opt_info);
 618         }
 619
 620       /* Remove the exit edges.  */
 621       FOR_EACH_VEC_ELT (remove_edges, i, ein)
 622         remove_path (ein);
 623     }
 624
 625   ein = desc->in_edge;
 626   free_simple_loop_desc (loop);
 627
 628   /* Now remove the unreachable part of the last iteration and cancel
 629      the loop.  */
 630   remove_path (ein);
 631
 632   if (dump_file)
 633     fprintf (dump_file, ";; Peeled loop completely, %d times\n", (int) npeel);
 634 }
 635
 636 /* Decide whether to unroll LOOP iterating constant number of times
 637    and how much.  */
 638
 639 static void
 640 decide_unroll_constant_iterations (struct loop *loop, int flags)
 641 {
 642   unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
 643   struct niter_desc *desc;
 644   double_int iterations;
 645
 646   if (!(flags & UAP_UNROLL))
 647     {
 648       /* We were not asked to, just return back silently.  */
 649       return;
 650     }
 651
 652   if (dump_file)
 653     fprintf (dump_file,
 654              "\n;; Considering unrolling loop with constant "
 655              "number of iterations\n");
 656
 657   /* nunroll = total number of copies of the original loop body in
 658      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
 659   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
 660   nunroll_by_av
 661     = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
 662   if (nunroll > nunroll_by_av)
 663     nunroll = nunroll_by_av;
 664   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
 665     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
 666
 667   if (targetm.loop_unroll_adjust)
 668     nunroll = targetm.loop_unroll_adjust (nunroll, loop);
 669
 670   /* Skip big loops.  */
 671   if (nunroll <= 1)
 672     {
 673       if (dump_file)
 674         fprintf (dump_file, ";; Not considering loop, is too big\n");
 675       return;
 676     }
 677
 678   /* Check for simple loops.  */
 679   desc = get_simple_loop_desc (loop);
 680
 681   /* Check number of iterations.  */
 682   if (!desc->simple_p || !desc->const_iter || desc->assumptions)
 683     {
 684       if (dump_file)
 685         fprintf (dump_file,
 686                  ";; Unable to prove that the loop iterates constant times\n");
 687       return;
 688     }
 689
 690   /* Check whether the loop rolls enough to consider.
 691      Consult also loop bounds and profile; in the case the loop has more
 692      than one exit it may well loop less than determined maximal number
 693      of iterations.  */
 694   if (desc->niter < 2 * nunroll
 695       || ((get_estimated_loop_iterations (loop, &iterations)
 696            || get_max_loop_iterations (loop, &iterations))
 697           && iterations.ult (double_int::from_shwi (2 * nunroll))))
 698     {
 699       if (dump_file)
 700         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
 701       return;
 702     }
 703
 704   /* Success; now compute number of iterations to unroll.  We alter
 705      nunroll so that as few as possible copies of loop body are
 706      necessary, while still not decreasing the number of unrollings
 707      too much (at most by 1).  */
 708   best_copies = 2 * nunroll + 10;
 709
 710   i = 2 * nunroll + 2;
 711   if (i - 1 >= desc->niter)
 712     i = desc->niter - 2;
 713
 714   for (; i >= nunroll - 1; i--)
 715     {
 716       unsigned exit_mod = desc->niter % (i + 1);
 717
 718       if (!loop_exit_at_end_p (loop))
 719         n_copies = exit_mod + i + 1;
 720       else if (exit_mod != (unsigned) i
 721                || desc->noloop_assumptions != NULL_RTX)
 722         n_copies = exit_mod + i + 2;
 723       else
 724         n_copies = i + 1;
 725
 726       if (n_copies < best_copies)
 727         {
 728           best_copies = n_copies;
 729           best_unroll = i;
 730         }
 731     }
 732
 733   loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
 734   loop->lpt_decision.times = best_unroll;
 735 }
 736
 737 /* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES times.
 738    The transformation does this:
 739
 740    for (i = 0; i < 102; i++)
 741      body;
 742
 743    ==>  (LOOP->LPT_DECISION.TIMES == 3)
 744
 745    i = 0;
 746    body; i++;
 747    body; i++;
 748    while (i < 102)
 749      {
 750        body; i++;
 751        body; i++;
 752        body; i++;
 753        body; i++;
 754      }
 755   */
 756 static void
 757 unroll_loop_constant_iterations (struct loop *loop)
 758 {
 759   unsigned HOST_WIDE_INT niter;
 760   unsigned exit_mod;
 761   sbitmap wont_exit;
 762   unsigned i;
 763   edge e;
 764   unsigned max_unroll = loop->lpt_decision.times;
 765   struct niter_desc *desc = get_simple_loop_desc (loop);
 766   bool exit_at_end = loop_exit_at_end_p (loop);
 767   struct opt_info *opt_info = NULL;
 768   bool ok;
 769
 770   niter = desc->niter;
 771
 772   /* Should not get here (such loop should be peeled instead).  */
 773   gcc_assert (niter > max_unroll + 1);
 774
 775   exit_mod = niter % (max_unroll + 1);
 776
 777   wont_exit = sbitmap_alloc (max_unroll + 1);
 778   bitmap_ones (wont_exit);
 779
 780   auto_vec<edge> remove_edges;
 781   if (flag_split_ivs_in_unroller
 782       || flag_variable_expansion_in_unroller)
 783     opt_info = analyze_insns_in_loop (loop);
 784
 785   if (!exit_at_end)
 786     {
 787       /* The exit is not at the end of the loop; leave exit test
 788          in the first copy, so that the loops that start with test
 789          of exit condition have continuous body after unrolling.  */
 790
 791       if (dump_file)
 792         fprintf (dump_file, ";; Condition at beginning of loop.\n");
 793
 794       /* Peel exit_mod iterations.  */
 795       bitmap_clear_bit (wont_exit, 0);
 796       if (desc->noloop_assumptions)
 797         bitmap_clear_bit (wont_exit, 1);
 798
 799       if (exit_mod)
 800         {
 801           opt_info_start_duplication (opt_info);
 802           ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 803                                               exit_mod,
 804                                               wont_exit, desc->out_edge,
 805                                               &remove_edges,
 806                                               DLTHE_FLAG_UPDATE_FREQ
 807                                               | (opt_info && exit_mod > 1
 808                                                  ? DLTHE_RECORD_COPY_NUMBER
 809                                                    : 0));
 810           gcc_assert (ok);
 811
 812           if (opt_info && exit_mod > 1)
 813             apply_opt_in_copies (opt_info, exit_mod, false, false);
 814
 815           desc->noloop_assumptions = NULL_RTX;
 816           desc->niter -= exit_mod;
 817           loop->nb_iterations_upper_bound -= double_int::from_uhwi (exit_mod);
 818           if (loop->any_estimate
 819               && double_int::from_uhwi (exit_mod).ule
 820                    (loop->nb_iterations_estimate))
 821             loop->nb_iterations_estimate -= double_int::from_uhwi (exit_mod);
 822           else
 823             loop->any_estimate = false;
 824         }
 825
 826       bitmap_set_bit (wont_exit, 1);
 827     }
 828   else
 829     {
 830       /* Leave exit test in last copy, for the same reason as above if
 831          the loop tests the condition at the end of loop body.  */
 832
 833       if (dump_file)
 834         fprintf (dump_file, ";; Condition at end of loop.\n");
 835
 836       /* We know that niter >= max_unroll + 2; so we do not need to care of
 837          case when we would exit before reaching the loop.  So just peel
 838          exit_mod + 1 iterations.  */
 839       if (exit_mod != max_unroll
 840           || desc->noloop_assumptions)
 841         {
 842           bitmap_clear_bit (wont_exit, 0);
 843           if (desc->noloop_assumptions)
 844             bitmap_clear_bit (wont_exit, 1);
 845
 846           opt_info_start_duplication (opt_info);
 847           ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
 848                                               exit_mod + 1,
 849                                               wont_exit, desc->out_edge,
 850                                               &remove_edges,
 851                                               DLTHE_FLAG_UPDATE_FREQ
 852                                               | (opt_info && exit_mod > 0
 853                                                  ? DLTHE_RECORD_COPY_NUMBER
 854                                                    : 0));
 855           gcc_assert (ok);
 856
 857           if (opt_info && exit_mod > 0)
 858             apply_opt_in_copies (opt_info, exit_mod + 1, false, false);
 859
 860           desc->niter -= exit_mod + 1;
 861           loop->nb_iterations_upper_bound -= double_int::from_uhwi (exit_mod + 1);
 862           if (loop->any_estimate
 863               && double_int::from_uhwi (exit_mod + 1).ule
 864                    (loop->nb_iterations_estimate))
 865             loop->nb_iterations_estimate -= double_int::from_uhwi (exit_mod + 1);
 866           else
 867             loop->any_estimate = false;
 868           desc->noloop_assumptions = NULL_RTX;
 869
 870           bitmap_set_bit (wont_exit, 0);
 871           bitmap_set_bit (wont_exit, 1);
 872         }
 873
 874       bitmap_clear_bit (wont_exit, max_unroll);
 875     }
 876
 877   /* Now unroll the loop.  */
 878
 879   opt_info_start_duplication (opt_info);
 880   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
 881                                       max_unroll,
 882                                       wont_exit, desc->out_edge,
 883                                       &remove_edges,
 884                                       DLTHE_FLAG_UPDATE_FREQ
 885                                       | (opt_info
 886                                          ? DLTHE_RECORD_COPY_NUMBER
 887                                            : 0));
 888   gcc_assert (ok);
 889
 890   if (opt_info)
 891     {
 892       apply_opt_in_copies (opt_info, max_unroll, true, true);
 893       free_opt_info (opt_info);
 894     }
 895
 896   free (wont_exit);
 897
 898   if (exit_at_end)
 899     {
 900       basic_block exit_block = get_bb_copy (desc->in_edge->src);
 901       /* Find a new in and out edge; they are in the last copy we have made.  */
 902
 903       if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
 904         {
 905           desc->out_edge = EDGE_SUCC (exit_block, 0);
 906           desc->in_edge = EDGE_SUCC (exit_block, 1);
 907         }
 908       else
 909         {
 910           desc->out_edge = EDGE_SUCC (exit_block, 1);
 911           desc->in_edge = EDGE_SUCC (exit_block, 0);
 912         }
 913     }
 914
 915   desc->niter /= max_unroll + 1;
 916   loop->nb_iterations_upper_bound
 917     = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (max_unroll
 918                                                                    + 1),
 919                                             TRUNC_DIV_EXPR);
 920   if (loop->any_estimate)
 921     loop->nb_iterations_estimate
 922       = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (max_unroll
 923                                                                   + 1),
 924                                            TRUNC_DIV_EXPR);
 925   desc->niter_expr = GEN_INT (desc->niter);
 926
 927   /* Remove the edges.  */
 928   FOR_EACH_VEC_ELT (remove_edges, i, e)
 929     remove_path (e);
 930
 931   if (dump_file)
 932     fprintf (dump_file,
 933              ";; Unrolled loop %d times, constant # of iterations %i insns\n",
 934              max_unroll, num_loop_insns (loop));
 935 }
 936
 937 /* Decide whether to unroll LOOP iterating runtime computable number of times
 938    and how much.  */
 939 static void
 940 decide_unroll_runtime_iterations (struct loop *loop, int flags)
 941 {
 942   unsigned nunroll, nunroll_by_av, i;
 943   struct niter_desc *desc;
 944   double_int iterations;
 945
 946   if (!(flags & UAP_UNROLL))
 947     {
 948       /* We were not asked to, just return back silently.  */
 949       return;
 950     }
 951
 952   if (dump_file)
 953     fprintf (dump_file,
 954              "\n;; Considering unrolling loop with runtime "
 955              "computable number of iterations\n");
 956
 957   /* nunroll = total number of copies of the original loop body in
 958      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
 959   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
 960   nunroll_by_av = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
 961   if (nunroll > nunroll_by_av)
 962     nunroll = nunroll_by_av;
 963   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
 964     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
 965
 966   if (targetm.loop_unroll_adjust)
 967     nunroll = targetm.loop_unroll_adjust (nunroll, loop);
 968
 969   /* Skip big loops.  */
 970   if (nunroll <= 1)
 971     {
 972       if (dump_file)
 973         fprintf (dump_file, ";; Not considering loop, is too big\n");
 974       return;
 975     }
 976
 977   /* Check for simple loops.  */
 978   desc = get_simple_loop_desc (loop);
 979
 980   /* Check simpleness.  */
 981   if (!desc->simple_p || desc->assumptions)
 982     {
 983       if (dump_file)
 984         fprintf (dump_file,
 985                  ";; Unable to prove that the number of iterations "
 986                  "can be counted in runtime\n");
 987       return;
 988     }
 989
 990   if (desc->const_iter)
 991     {
 992       if (dump_file)
 993         fprintf (dump_file, ";; Loop iterates constant times\n");
 994       return;
 995     }
 996
 997   /* Check whether the loop rolls.  */
 998   if ((get_estimated_loop_iterations (loop, &iterations)
 999        || get_max_loop_iterations (loop, &iterations))
1000       && iterations.ult (double_int::from_shwi (2 * nunroll)))
1001     {
1002       if (dump_file)
1003         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
1004       return;
1005     }
1006
1007   /* Success; now force nunroll to be power of 2, as we are unable to
1008      cope with overflows in computation of number of iterations.  */
1009   for (i = 1; 2 * i <= nunroll; i *= 2)
1010     continue;
1011
1012   loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
1013   loop->lpt_decision.times = i - 1;
1014 }
1015
1016 /* Splits edge E and inserts the sequence of instructions INSNS on it, and
1017    returns the newly created block.  If INSNS is NULL_RTX, nothing is changed
1018    and NULL is returned instead.  */
1019
1020 basic_block
1021 split_edge_and_insert (edge e, rtx insns)
1022 {
1023   basic_block bb;
1024
1025   if (!insns)
1026     return NULL;
1027   bb = split_edge (e);
1028   emit_insn_after (insns, BB_END (bb));
1029
1030   /* ??? We used to assume that INSNS can contain control flow insns, and
1031      that we had to try to find sub basic blocks in BB to maintain a valid
1032      CFG.  For this purpose we used to set the BB_SUPERBLOCK flag on BB
1033      and call break_superblocks when going out of cfglayout mode.  But it
1034      turns out that this never happens; and that if it does ever happen,
1035      the TODO_verify_flow at the end of the RTL loop passes would fail.
1036
1037      There are two reasons why we expected we could have control flow insns
1038      in INSNS.  The first is when a comparison has to be done in parts, and
1039      the second is when the number of iterations is computed for loops with
1040      the number of iterations known at runtime.  In both cases, test cases
1041      to get control flow in INSNS appear to be impossible to construct:
1042
1043       * If do_compare_rtx_and_jump needs several branches to do comparison
1044         in a mode that needs comparison by parts, we cannot analyze the
1045         number of iterations of the loop, and we never get to unrolling it.
1046
1047       * The code in expand_divmod that was suspected to cause creation of
1048         branching code seems to be only accessed for signed division.  The
1049         divisions used by # of iterations analysis are always unsigned.
1050         Problems might arise on architectures that emits branching code
1051         for some operations that may appear in the unroller (especially
1052         for division), but we have no such architectures.
1053
1054      Considering all this, it was decided that we should for now assume
1055      that INSNS can in theory contain control flow insns, but in practice
1056      it never does.  So we don't handle the theoretical case, and should
1057      a real failure ever show up, we have a pretty good clue for how to
1058      fix it.  */
1059
1060   return bb;
1061 }
1062
1063 /* Unroll LOOP for which we are able to count number of iterations in runtime
1064    LOOP->LPT_DECISION.TIMES times.  The transformation does this (with some
1065    extra care for case n < 0):
1066
1067    for (i = 0; i < n; i++)
1068      body;
1069
1070    ==>  (LOOP->LPT_DECISION.TIMES == 3)
1071
1072    i = 0;
1073    mod = n % 4;
1074
1075    switch (mod)
1076      {
1077        case 3:
1078          body; i++;
1079        case 2:
1080          body; i++;
1081        case 1:
1082          body; i++;
1083        case 0: ;
1084      }
1085
1086    while (i < n)
1087      {
1088        body; i++;
1089        body; i++;
1090        body; i++;
1091        body; i++;
1092      }
1093    */
1094 static void
1095 unroll_loop_runtime_iterations (struct loop *loop)
1096 {
1097   rtx old_niter, niter, init_code, branch_code, tmp;
1098   unsigned i, j, p;
1099   basic_block preheader, *body, swtch, ezc_swtch;
1100   sbitmap wont_exit;
1101   int may_exit_copy;
1102   unsigned n_peel;
1103   edge e;
1104   bool extra_zero_check, last_may_exit;
1105   unsigned max_unroll = loop->lpt_decision.times;
1106   struct niter_desc *desc = get_simple_loop_desc (loop);
1107   bool exit_at_end = loop_exit_at_end_p (loop);
1108   struct opt_info *opt_info = NULL;
1109   bool ok;
1110
1111   if (flag_split_ivs_in_unroller
1112       || flag_variable_expansion_in_unroller)
1113     opt_info = analyze_insns_in_loop (loop);
1114
1115   /* Remember blocks whose dominators will have to be updated.  */
1116   auto_vec<basic_block> dom_bbs;
1117
1118   body = get_loop_body (loop);
1119   for (i = 0; i < loop->num_nodes; i++)
1120     {
1121       vec<basic_block> ldom;
1122       basic_block bb;
1123
1124       ldom = get_dominated_by (CDI_DOMINATORS, body[i]);
1125       FOR_EACH_VEC_ELT (ldom, j, bb)
1126         if (!flow_bb_inside_loop_p (loop, bb))
1127           dom_bbs.safe_push (bb);
1128
1129       ldom.release ();
1130     }
1131   free (body);
1132
1133   if (!exit_at_end)
1134     {
1135       /* Leave exit in first copy (for explanation why see comment in
1136          unroll_loop_constant_iterations).  */
1137       may_exit_copy = 0;
1138       n_peel = max_unroll - 1;
1139       extra_zero_check = true;
1140       last_may_exit = false;
1141     }
1142   else
1143     {
1144       /* Leave exit in last copy (for explanation why see comment in
1145          unroll_loop_constant_iterations).  */
1146       may_exit_copy = max_unroll;
1147       n_peel = max_unroll;
1148       extra_zero_check = false;
1149       last_may_exit = true;
1150     }
1151
1152   /* Get expression for number of iterations.  */
1153   start_sequence ();
1154   old_niter = niter = gen_reg_rtx (desc->mode);
1155   tmp = force_operand (copy_rtx (desc->niter_expr), niter);
1156   if (tmp != niter)
1157     emit_move_insn (niter, tmp);
1158
1159   /* Count modulo by ANDing it with max_unroll; we use the fact that
1160      the number of unrollings is a power of two, and thus this is correct
1161      even if there is overflow in the computation.  */
1162   niter = expand_simple_binop (desc->mode, AND,
1163                                niter, gen_int_mode (max_unroll, desc->mode),
1164                                NULL_RTX, 0, OPTAB_LIB_WIDEN);
1165
1166   init_code = get_insns ();
1167   end_sequence ();
1168   unshare_all_rtl_in_chain (init_code);
1169
1170   /* Precondition the loop.  */
1171   split_edge_and_insert (loop_preheader_edge (loop), init_code);
1172
1173   auto_vec<edge> remove_edges;
1174
1175   wont_exit = sbitmap_alloc (max_unroll + 2);
1176
1177   /* Peel the first copy of loop body (almost always we must leave exit test
1178      here; the only exception is when we have extra zero check and the number
1179      of iterations is reliable.  Also record the place of (possible) extra
1180      zero check.  */
1181   bitmap_clear (wont_exit);
1182   if (extra_zero_check
1183       && !desc->noloop_assumptions)
1184     bitmap_set_bit (wont_exit, 1);
1185   ezc_swtch = loop_preheader_edge (loop)->src;
1186   ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1187                                       1, wont_exit, desc->out_edge,
1188                                       &remove_edges,
1189                                       DLTHE_FLAG_UPDATE_FREQ);
1190   gcc_assert (ok);
1191
1192   /* Record the place where switch will be built for preconditioning.  */
1193   swtch = split_edge (loop_preheader_edge (loop));
1194
1195   for (i = 0; i < n_peel; i++)
1196     {
1197       /* Peel the copy.  */
1198       bitmap_clear (wont_exit);
1199       if (i != n_peel - 1 || !last_may_exit)
1200         bitmap_set_bit (wont_exit, 1);
1201       ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1202                                           1, wont_exit, desc->out_edge,
1203                                           &remove_edges,
1204                                           DLTHE_FLAG_UPDATE_FREQ);
1205       gcc_assert (ok);
1206
1207       /* Create item for switch.  */
1208       j = n_peel - i - (extra_zero_check ? 0 : 1);
1209       p = REG_BR_PROB_BASE / (i + 2);
1210
1211       preheader = split_edge (loop_preheader_edge (loop));
1212       branch_code = compare_and_jump_seq (copy_rtx (niter), GEN_INT (j), EQ,
1213                                           block_label (preheader), p,
1214                                           NULL_RTX);
1215
1216       /* We rely on the fact that the compare and jump cannot be optimized out,
1217          and hence the cfg we create is correct.  */
1218       gcc_assert (branch_code != NULL_RTX);
1219
1220       swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
1221       set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1222       single_pred_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1223       e = make_edge (swtch, preheader,
1224                      single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
1225       e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
1226       e->probability = p;
1227     }
1228
1229   if (extra_zero_check)
1230     {
1231       /* Add branch for zero iterations.  */
1232       p = REG_BR_PROB_BASE / (max_unroll + 1);
1233       swtch = ezc_swtch;
1234       preheader = split_edge (loop_preheader_edge (loop));
1235       branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
1236                                           block_label (preheader), p,
1237                                           NULL_RTX);
1238       gcc_assert (branch_code != NULL_RTX);
1239
1240       swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
1241       set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
1242       single_succ_edge (swtch)->probability = REG_BR_PROB_BASE - p;
1243       e = make_edge (swtch, preheader,
1244                      single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
1245       e->count = RDIV (preheader->count * REG_BR_PROB_BASE, p);
1246       e->probability = p;
1247     }
1248
1249   /* Recount dominators for outer blocks.  */
1250   iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);
1251
1252   /* And unroll loop.  */
1253
1254   bitmap_ones (wont_exit);
1255   bitmap_clear_bit (wont_exit, may_exit_copy);
1256   opt_info_start_duplication (opt_info);
1257
1258   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1259                                       max_unroll,
1260                                       wont_exit, desc->out_edge,
1261                                       &remove_edges,
1262                                       DLTHE_FLAG_UPDATE_FREQ
1263                                       | (opt_info
1264                                          ? DLTHE_RECORD_COPY_NUMBER
1265                                            : 0));
1266   gcc_assert (ok);
1267
1268   if (opt_info)
1269     {
1270       apply_opt_in_copies (opt_info, max_unroll, true, true);
1271       free_opt_info (opt_info);
1272     }
1273
1274   free (wont_exit);
1275
1276   if (exit_at_end)
1277     {
1278       basic_block exit_block = get_bb_copy (desc->in_edge->src);
1279       /* Find a new in and out edge; they are in the last copy we have
1280          made.  */
1281
1282       if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
1283         {
1284           desc->out_edge = EDGE_SUCC (exit_block, 0);
1285           desc->in_edge = EDGE_SUCC (exit_block, 1);
1286         }
1287       else
1288         {
1289           desc->out_edge = EDGE_SUCC (exit_block, 1);
1290           desc->in_edge = EDGE_SUCC (exit_block, 0);
1291         }
1292     }
1293
1294   /* Remove the edges.  */
1295   FOR_EACH_VEC_ELT (remove_edges, i, e)
1296     remove_path (e);
1297
1298   /* We must be careful when updating the number of iterations due to
1299      preconditioning and the fact that the value must be valid at entry
1300      of the loop.  After passing through the above code, we see that
1301      the correct new number of iterations is this:  */
1302   gcc_assert (!desc->const_iter);
1303   desc->niter_expr =
1304     simplify_gen_binary (UDIV, desc->mode, old_niter,
1305                          gen_int_mode (max_unroll + 1, desc->mode));
1306   loop->nb_iterations_upper_bound
1307     = loop->nb_iterations_upper_bound.udiv (double_int::from_uhwi (max_unroll
1308                                                                    + 1),
1309                                             TRUNC_DIV_EXPR);
1310   if (loop->any_estimate)
1311     loop->nb_iterations_estimate
1312       = loop->nb_iterations_estimate.udiv (double_int::from_uhwi (max_unroll
1313                                                                   + 1),
1314                                            TRUNC_DIV_EXPR);
1315   if (exit_at_end)
1316     {
1317       desc->niter_expr =
1318         simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
1319       desc->noloop_assumptions = NULL_RTX;
1320       --loop->nb_iterations_upper_bound;
1321       if (loop->any_estimate
1322           && loop->nb_iterations_estimate != double_int_zero)
1323         --loop->nb_iterations_estimate;
1324       else
1325         loop->any_estimate = false;
1326     }
1327
1328   if (dump_file)
1329     fprintf (dump_file,
1330              ";; Unrolled loop %d times, counting # of iterations "
1331              "in runtime, %i insns\n",
1332              max_unroll, num_loop_insns (loop));
1333 }
1334
1335 /* Decide whether to simply peel LOOP and how much.  */
1336 static void
1337 decide_peel_simple (struct loop *loop, int flags)
1338 {
1339   unsigned npeel;
1340   double_int iterations;
1341
1342   if (!(flags & UAP_PEEL))
1343     {
1344       /* We were not asked to, just return back silently.  */
1345       return;
1346     }
1347
1348   if (dump_file)
1349     fprintf (dump_file, "\n;; Considering simply peeling loop\n");
1350
1351   /* npeel = number of iterations to peel.  */
1352   npeel = PARAM_VALUE (PARAM_MAX_PEELED_INSNS) / loop->ninsns;
1353   if (npeel > (unsigned) PARAM_VALUE (PARAM_MAX_PEEL_TIMES))
1354     npeel = PARAM_VALUE (PARAM_MAX_PEEL_TIMES);
1355
1356   /* Skip big loops.  */
1357   if (!npeel)
1358     {
1359       if (dump_file)
1360         fprintf (dump_file, ";; Not considering loop, is too big\n");
1361       return;
1362     }
1363
1364   /* Do not simply peel loops with branches inside -- it increases number
1365      of mispredicts.
1366      Exception is when we do have profile and we however have good chance
1367      to peel proper number of iterations loop will iterate in practice.
1368      TODO: this heuristic needs tunning; while for complette unrolling
1369      the branch inside loop mostly eliminates any improvements, for
1370      peeling it is not the case.  Also a function call inside loop is
1371      also branch from branch prediction POV (and probably better reason
1372      to not unroll/peel).  */
1373   if (num_loop_branches (loop) > 1
1374       && profile_status != PROFILE_READ)
1375     {
1376       if (dump_file)
1377         fprintf (dump_file, ";; Not peeling, contains branches\n");
1378       return;
1379     }
1380
1381   /* If we have realistic estimate on number of iterations, use it.  */
1382   if (get_estimated_loop_iterations (loop, &iterations))
1383     {
1384       if (double_int::from_shwi (npeel).ule (iterations))
1385         {
1386           if (dump_file)
1387             {
1388               fprintf (dump_file, ";; Not peeling loop, rolls too much (");
1389               fprintf (dump_file, HOST_WIDEST_INT_PRINT_DEC,
1390                        (HOST_WIDEST_INT) (iterations.to_shwi () + 1));
1391               fprintf (dump_file, " iterations > %d [maximum peelings])\n",
1392                        npeel);
1393             }
1394           return;
1395         }
1396       npeel = iterations.to_shwi () + 1;
1397     }
1398   /* If we have small enough bound on iterations, we can still peel (completely
1399      unroll).  */
1400   else if (get_max_loop_iterations (loop, &iterations)
1401            && iterations.ult (double_int::from_shwi (npeel)))
1402     npeel = iterations.to_shwi () + 1;
1403   else
1404     {
1405       /* For now we have no good heuristics to decide whether loop peeling
1406          will be effective, so disable it.  */
1407       if (dump_file)
1408         fprintf (dump_file,
1409                  ";; Not peeling loop, no evidence it will be profitable\n");
1410       return;
1411     }
1412
1413   /* Success.  */
1414   loop->lpt_decision.decision = LPT_PEEL_SIMPLE;
1415   loop->lpt_decision.times = npeel;
1416 }
1417
1418 /* Peel a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:
1419
1420    while (cond)
1421      body;
1422
1423    ==>  (LOOP->LPT_DECISION.TIMES == 3)
1424
1425    if (!cond) goto end;
1426    body;
1427    if (!cond) goto end;
1428    body;
1429    if (!cond) goto end;
1430    body;
1431    while (cond)
1432      body;
1433    end: ;
1434    */
1435 static void
1436 peel_loop_simple (struct loop *loop)
1437 {
1438   sbitmap wont_exit;
1439   unsigned npeel = loop->lpt_decision.times;
1440   struct niter_desc *desc = get_simple_loop_desc (loop);
1441   struct opt_info *opt_info = NULL;
1442   bool ok;
1443
1444   if (flag_split_ivs_in_unroller && npeel > 1)
1445     opt_info = analyze_insns_in_loop (loop);
1446
1447   wont_exit = sbitmap_alloc (npeel + 1);
1448   bitmap_clear (wont_exit);
1449
1450   opt_info_start_duplication (opt_info);
1451
1452   ok = duplicate_loop_to_header_edge (loop, loop_preheader_edge (loop),
1453                                       npeel, wont_exit, NULL,
1454                                       NULL, DLTHE_FLAG_UPDATE_FREQ
1455                                       | (opt_info
1456                                          ? DLTHE_RECORD_COPY_NUMBER
1457                                            : 0));
1458   gcc_assert (ok);
1459
1460   free (wont_exit);
1461
1462   if (opt_info)
1463     {
1464       apply_opt_in_copies (opt_info, npeel, false, false);
1465       free_opt_info (opt_info);
1466     }
1467
1468   if (desc->simple_p)
1469     {
1470       if (desc->const_iter)
1471         {
1472           desc->niter -= npeel;
1473           desc->niter_expr = GEN_INT (desc->niter);
1474           desc->noloop_assumptions = NULL_RTX;
1475         }
1476       else
1477         {
1478           /* We cannot just update niter_expr, as its value might be clobbered
1479              inside loop.  We could handle this by counting the number into
1480              temporary just like we do in runtime unrolling, but it does not
1481              seem worthwhile.  */
1482           free_simple_loop_desc (loop);
1483         }
1484     }
1485   if (dump_file)
1486     fprintf (dump_file, ";; Peeling loop %d times\n", npeel);
1487 }
1488
1489 /* Decide whether to unroll LOOP stupidly and how much.  */
1490 static void
1491 decide_unroll_stupid (struct loop *loop, int flags)
1492 {
1493   unsigned nunroll, nunroll_by_av, i;
1494   struct niter_desc *desc;
1495   double_int iterations;
1496
1497   if (!(flags & UAP_UNROLL_ALL))
1498     {
1499       /* We were not asked to, just return back silently.  */
1500       return;
1501     }
1502
1503   if (dump_file)
1504     fprintf (dump_file, "\n;; Considering unrolling loop stupidly\n");
1505
1506   /* nunroll = total number of copies of the original loop body in
1507      unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
1508   nunroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS) / loop->ninsns;
1509   nunroll_by_av
1510     = PARAM_VALUE (PARAM_MAX_AVERAGE_UNROLLED_INSNS) / loop->av_ninsns;
1511   if (nunroll > nunroll_by_av)
1512     nunroll = nunroll_by_av;
1513   if (nunroll > (unsigned) PARAM_VALUE (PARAM_MAX_UNROLL_TIMES))
1514     nunroll = PARAM_VALUE (PARAM_MAX_UNROLL_TIMES);
1515
1516   if (targetm.loop_unroll_adjust)
1517     nunroll = targetm.loop_unroll_adjust (nunroll, loop);
1518
1519   /* Skip big loops.  */
1520   if (nunroll <= 1)
1521     {
1522       if (dump_file)
1523         fprintf (dump_file, ";; Not considering loop, is too big\n");
1524       return;
1525     }
1526
1527   /* Check for simple loops.  */
1528   desc = get_simple_loop_desc (loop);
1529
1530   /* Check simpleness.  */
1531   if (desc->simple_p && !desc->assumptions)
1532     {
1533       if (dump_file)
1534         fprintf (dump_file, ";; The loop is simple\n");
1535       return;
1536     }
1537
1538   /* Do not unroll loops with branches inside -- it increases number
1539      of mispredicts.
1540      TODO: this heuristic needs tunning; call inside the loop body
1541      is also relatively good reason to not unroll.  */
1542   if (num_loop_branches (loop) > 1)
1543     {
1544       if (dump_file)
1545         fprintf (dump_file, ";; Not unrolling, contains branches\n");
1546       return;
1547     }
1548
1549   /* Check whether the loop rolls.  */
1550   if ((get_estimated_loop_iterations (loop, &iterations)
1551        || get_max_loop_iterations (loop, &iterations))
1552       && iterations.ult (double_int::from_shwi (2 * nunroll)))
1553     {
1554       if (dump_file)
1555         fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
1556       return;
1557     }
1558
1559   /* Success.  Now force nunroll to be power of 2, as it seems that this
1560      improves results (partially because of better alignments, partially
1561      because of some dark magic).  */
1562   for (i = 1; 2 * i <= nunroll; i *= 2)
1563     continue;
1564
1565   loop->lpt_decision.decision = LPT_UNROLL_STUPID;
1566   loop->lpt_decision.times = i - 1;
1567 }
1568
1569 /* Unroll a LOOP LOOP->LPT_DECISION.TIMES times.  The transformation does this:
1570
1571    while (cond)
1572      body;
1573
1574    ==>  (LOOP->LPT_DECISION.TIMES == 3)
1575
1576    while (cond)
1577      {
1578        body;
1579        if (!cond) break;
1580        body;
1581        if (!cond) break;
1582        body;
1583        if (!cond) break;
1584        body;
1585      }
1586    */
1587 static void
1588 unroll_loop_stupid (struct loop *loop)
1589 {
1590   sbitmap wont_exit;
1591   unsigned nunroll = loop->lpt_decision.times;
1592   struct niter_desc *desc = get_simple_loop_desc (loop);
1593   struct opt_info *opt_info = NULL;
1594   bool ok;
1595
1596   if (flag_split_ivs_in_unroller
1597       || flag_variable_expansion_in_unroller)
1598     opt_info = analyze_insns_in_loop (loop);
1599
1600
1601   wont_exit = sbitmap_alloc (nunroll + 1);
1602   bitmap_clear (wont_exit);
1603   opt_info_start_duplication (opt_info);
1604
1605   ok = duplicate_loop_to_header_edge (loop, loop_latch_edge (loop),
1606                                       nunroll, wont_exit,
1607                                       NULL, NULL,
1608                                       DLTHE_FLAG_UPDATE_FREQ
1609                                       | (opt_info
1610                                          ? DLTHE_RECORD_COPY_NUMBER
1611                                            : 0));
1612   gcc_assert (ok);
1613
1614   if (opt_info)
1615     {
1616       apply_opt_in_copies (opt_info, nunroll, true, true);
1617       free_opt_info (opt_info);
1618     }
1619
1620   free (wont_exit);
1621
1622   if (desc->simple_p)
1623     {
1624       /* We indeed may get here provided that there are nontrivial assumptions
1625          for a loop to be really simple.  We could update the counts, but the
1626          problem is that we are unable to decide which exit will be taken
1627          (not really true in case the number of iterations is constant,
1628          but no one will do anything with this information, so we do not
1629          worry about it).  */
1630       desc->simple_p = false;
1631     }
1632
1633   if (dump_file)
1634     fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
1635              nunroll, num_loop_insns (loop));
1636 }
1637
1638 /* Returns true if REG is referenced in one nondebug insn in LOOP.
1639    Set *DEBUG_USES to the number of debug insns that reference the
1640    variable.  */
1641
1642 bool
1643 referenced_in_one_insn_in_loop_p (struct loop *loop, rtx reg,
1644                                   int *debug_uses)
1645 {
1646   basic_block *body, bb;
1647   unsigned i;
1648   int count_ref = 0;
1649   rtx insn;
1650
1651   body = get_loop_body (loop);
1652   for (i = 0; i < loop->num_nodes; i++)
1653     {
1654       bb = body[i];
1655
1656       FOR_BB_INSNS (bb, insn)
1657         if (!rtx_referenced_p (reg, insn))
1658           continue;
1659         else if (DEBUG_INSN_P (insn))
1660           ++*debug_uses;
1661         else if (++count_ref > 1)
1662           break;
1663     }
1664   free (body);
1665   return (count_ref  == 1);
1666 }
1667
1668 /* Reset the DEBUG_USES debug insns in LOOP that reference REG.  */
1669
1670 static void
1671 reset_debug_uses_in_loop (struct loop *loop, rtx reg, int debug_uses)
1672 {
1673   basic_block *body, bb;
1674   unsigned i;
1675   rtx insn;
1676
1677   body = get_loop_body (loop);
1678   for (i = 0; debug_uses && i < loop->num_nodes; i++)
1679     {
1680       bb = body[i];
1681
1682       FOR_BB_INSNS (bb, insn)
1683         if (!DEBUG_INSN_P (insn) || !rtx_referenced_p (reg, insn))
1684           continue;
1685         else
1686           {
1687             validate_change (insn, &INSN_VAR_LOCATION_LOC (insn),
1688                              gen_rtx_UNKNOWN_VAR_LOC (), 0);
1689             if (!--debug_uses)
1690               break;
1691           }
1692     }
1693   free (body);
1694 }
1695
1696 /* Determine whether INSN contains an accumulator
1697    which can be expanded into separate copies,
1698    one for each copy of the LOOP body.
1699
1700    for (i = 0 ; i < n; i++)
1701      sum += a[i];
1702
1703    ==>
1704
1705    sum += a[i]
1706    ....
1707    i = i+1;
1708    sum1 += a[i]
1709    ....
1710    i = i+1
1711    sum2 += a[i];
1712    ....
1713
1714    Return NULL if INSN contains no opportunity for expansion of accumulator.
1715    Otherwise, allocate a VAR_TO_EXPAND structure, fill it with the relevant
1716    information and return a pointer to it.
1717 */
1718
1719 static struct var_to_expand *
1720 analyze_insn_to_expand_var (struct loop *loop, rtx insn)
1721 {
1722   rtx set, dest, src;
1723   struct var_to_expand *ves;
1724   unsigned accum_pos;
1725   enum rtx_code code;
1726   int debug_uses = 0;
1727
1728   set = single_set (insn);
1729   if (!set)
1730     return NULL;
1731
1732   dest = SET_DEST (set);
1733   src = SET_SRC (set);
1734   code = GET_CODE (src);
1735
1736   if (code != PLUS && code != MINUS && code != MULT && code != FMA)
1737     return NULL;
1738
1739   if (FLOAT_MODE_P (GET_MODE (dest)))
1740     {
1741       if (!flag_associative_math)
1742         return NULL;
1743       /* In the case of FMA, we're also changing the rounding.  */
1744       if (code == FMA && !flag_unsafe_math_optimizations)
1745         return NULL;
1746     }
1747
1748   /* Hmm, this is a bit paradoxical.  We know that INSN is a valid insn
1749      in MD.  But if there is no optab to generate the insn, we can not
1750      perform the variable expansion.  This can happen if an MD provides
1751      an insn but not a named pattern to generate it, for example to avoid
1752      producing code that needs additional mode switches like for x87/mmx.
1753
1754      So we check have_insn_for which looks for an optab for the operation
1755      in SRC.  If it doesn't exist, we can't perform the expansion even
1756      though INSN is valid.  */
1757   if (!have_insn_for (code, GET_MODE (src)))
1758     return NULL;
1759
1760   if (!REG_P (dest)
1761       && !(GET_CODE (dest) == SUBREG
1762            && REG_P (SUBREG_REG (dest))))
1763     return NULL;
1764
1765   /* Find the accumulator use within the operation.  */
1766   if (code == FMA)
1767     {
1768       /* We only support accumulation via FMA in the ADD position.  */
1769       if (!rtx_equal_p  (dest, XEXP (src, 2)))
1770         return NULL;
1771       accum_pos = 2;
1772     }
1773   else if (rtx_equal_p (dest, XEXP (src, 0)))
1774     accum_pos = 0;
1775   else if (rtx_equal_p (dest, XEXP (src, 1)))
1776     {
1777       /* The method of expansion that we are using; which includes the
1778          initialization of the expansions with zero and the summation of
1779          the expansions at the end of the computation will yield wrong
1780          results for (x = something - x) thus avoid using it in that case.  */
1781       if (code == MINUS)
1782         return NULL;
1783       accum_pos = 1;
1784     }
1785   else
1786     return NULL;
1787
1788   /* It must not otherwise be used.  */
1789   if (code == FMA)
1790     {
1791       if (rtx_referenced_p (dest, XEXP (src, 0))
1792           || rtx_referenced_p (dest, XEXP (src, 1)))
1793         return NULL;
1794     }
1795   else if (rtx_referenced_p (dest, XEXP (src, 1 - accum_pos)))
1796     return NULL;
1797
1798   /* It must be used in exactly one insn.  */
1799   if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses))
1800     return NULL;
1801
1802   if (dump_file)
1803     {
1804       fprintf (dump_file, "\n;; Expanding Accumulator ");
1805       print_rtl (dump_file, dest);
1806       fprintf (dump_file, "\n");
1807     }
1808
1809   if (debug_uses)
1810     /* Instead of resetting the debug insns, we could replace each
1811        debug use in the loop with the sum or product of all expanded
1812        accummulators.  Since we'll only know of all expansions at the
1813        end, we'd have to keep track of which vars_to_expand a debug
1814        insn in the loop references, take note of each copy of the
1815        debug insn during unrolling, and when it's all done, compute
1816        the sum or product of each variable and adjust the original
1817        debug insn and each copy thereof.  What a pain!  */
1818     reset_debug_uses_in_loop (loop, dest, debug_uses);
1819
1820   /* Record the accumulator to expand.  */
1821   ves = XNEW (struct var_to_expand);
1822   ves->insn = insn;
1823   ves->reg = copy_rtx (dest);
1824   ves->var_expansions.create (1);
1825   ves->next = NULL;
1826   ves->op = GET_CODE (src);
1827   ves->expansion_count = 0;
1828   ves->reuse_expansion = 0;
1829   return ves;
1830 }
1831
1832 /* Determine whether there is an induction variable in INSN that
1833    we would like to split during unrolling.
1834
1835    I.e. replace
1836
1837    i = i + 1;
1838    ...
1839    i = i + 1;
1840    ...
1841    i = i + 1;
1842    ...
1843
1844    type chains by
1845
1846    i0 = i + 1
1847    ...
1848    i = i0 + 1
1849    ...
1850    i = i0 + 2
1851    ...
1852
1853    Return NULL if INSN contains no interesting IVs.  Otherwise, allocate
1854    an IV_TO_SPLIT structure, fill it with the relevant information and return a
1855    pointer to it.  */
1856
1857 static struct iv_to_split *
1858 analyze_iv_to_split_insn (rtx insn)
1859 {
1860   rtx set, dest;
1861   struct rtx_iv iv;
1862   struct iv_to_split *ivts;
1863   bool ok;
1864
1865   /* For now we just split the basic induction variables.  Later this may be
1866      extended for example by selecting also addresses of memory references.  */
1867   set = single_set (insn);
1868   if (!set)
1869     return NULL;
1870
1871   dest = SET_DEST (set);
1872   if (!REG_P (dest))
1873     return NULL;
1874
1875   if (!biv_p (insn, dest))
1876     return NULL;
1877
1878   ok = iv_analyze_result (insn, dest, &iv);
1879
1880   /* This used to be an assert under the assumption that if biv_p returns
1881      true that iv_analyze_result must also return true.  However, that
1882      assumption is not strictly correct as evidenced by pr25569.
1883
1884      Returning NULL when iv_analyze_result returns false is safe and
1885      avoids the problems in pr25569 until the iv_analyze_* routines
1886      can be fixed, which is apparently hard and time consuming
1887      according to their author.  */
1888   if (! ok)
1889     return NULL;
1890
1891   if (iv.step == const0_rtx
1892       || iv.mode != iv.extend_mode)
1893     return NULL;
1894
1895   /* Record the insn to split.  */
1896   ivts = XNEW (struct iv_to_split);
1897   ivts->insn = insn;
1898   ivts->orig_var = dest;
1899   ivts->base_var = NULL_RTX;
1900   ivts->step = iv.step;
1901   ivts->next = NULL;
1902   ivts->n_loc = 1;
1903   ivts->loc[0] = 1;
1904
1905   return ivts;
1906 }
1907
1908 /* Determines which of insns in LOOP can be optimized.
1909    Return a OPT_INFO struct with the relevant hash tables filled
1910    with all insns to be optimized.  The FIRST_NEW_BLOCK field
1911    is undefined for the return value.  */
1912
1913 static struct opt_info *
1914 analyze_insns_in_loop (struct loop *loop)
1915 {
1916   basic_block *body, bb;
1917   unsigned i;
1918   struct opt_info *opt_info = XCNEW (struct opt_info);
1919   rtx insn;
1920   struct iv_to_split *ivts = NULL;
1921   struct var_to_expand *ves = NULL;
1922   iv_to_split **slot1;
1923   var_to_expand **slot2;
1924   vec<edge> edges = get_loop_exit_edges (loop);
1925   edge exit;
1926   bool can_apply = false;
1927
1928   iv_analysis_loop_init (loop);
1929
1930   body = get_loop_body (loop);
1931
1932   if (flag_split_ivs_in_unroller)
1933     {
1934       opt_info->insns_to_split.create (5 * loop->num_nodes);
1935       opt_info->iv_to_split_head = NULL;
1936       opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
1937     }
1938
1939   /* Record the loop exit bb and loop preheader before the unrolling.  */
1940   opt_info->loop_preheader = loop_preheader_edge (loop)->src;
1941
1942   if (edges.length () == 1)
1943     {
1944       exit = edges[0];
1945       if (!(exit->flags & EDGE_COMPLEX))
1946         {
1947           opt_info->loop_exit = split_edge (exit);
1948           can_apply = true;
1949         }
1950     }
1951
1952   if (flag_variable_expansion_in_unroller
1953       && can_apply)
1954     {
1955       opt_info->insns_with_var_to_expand.create (5 * loop->num_nodes);
1956       opt_info->var_to_expand_head = NULL;
1957       opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
1958     }
1959
1960   for (i = 0; i < loop->num_nodes; i++)
1961     {
1962       bb = body[i];
1963       if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
1964         continue;
1965
1966       FOR_BB_INSNS (bb, insn)
1967       {
1968         if (!INSN_P (insn))
1969           continue;
1970
1971         if (opt_info->insns_to_split.is_created ())
1972           ivts = analyze_iv_to_split_insn (insn);
1973
1974         if (ivts)
1975           {
1976             slot1 = opt_info->insns_to_split.find_slot (ivts, INSERT);
1977             gcc_assert (*slot1 == NULL);
1978             *slot1 = ivts;
1979             *opt_info->iv_to_split_tail = ivts;
1980             opt_info->iv_to_split_tail = &ivts->next;
1981             continue;
1982           }
1983
1984         if (opt_info->insns_with_var_to_expand.is_created ())
1985           ves = analyze_insn_to_expand_var (loop, insn);
1986
1987         if (ves)
1988           {
1989             slot2 = opt_info->insns_with_var_to_expand.find_slot (ves, INSERT);
1990             gcc_assert (*slot2 == NULL);
1991             *slot2 = ves;
1992             *opt_info->var_to_expand_tail = ves;
1993             opt_info->var_to_expand_tail = &ves->next;
1994           }
1995       }
1996     }
1997
1998   edges.release ();
1999   free (body);
2000   return opt_info;
2001 }
2002
2003 /* Called just before loop duplication.  Records start of duplicated area
2004    to OPT_INFO.  */
2005
2006 static void
2007 opt_info_start_duplication (struct opt_info *opt_info)
2008 {
2009   if (opt_info)
2010     opt_info->first_new_block = last_basic_block;
2011 }
2012
2013 /* Determine the number of iterations between initialization of the base
2014    variable and the current copy (N_COPY).  N_COPIES is the total number
2015    of newly created copies.  UNROLLING is true if we are unrolling
2016    (not peeling) the loop.  */
2017
2018 static unsigned
2019 determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
2020 {
2021   if (unrolling)
2022     {
2023       /* If we are unrolling, initialization is done in the original loop
2024          body (number 0).  */
2025       return n_copy;
2026     }
2027   else
2028     {
2029       /* If we are peeling, the copy in that the initialization occurs has
2030          number 1.  The original loop (number 0) is the last.  */
2031       if (n_copy)
2032         return n_copy - 1;
2033       else
2034         return n_copies;
2035     }
2036 }
2037
2038 /* Locate in EXPR the expression corresponding to the location recorded
2039    in IVTS, and return a pointer to the RTX for this location.  */
2040
2041 static rtx *
2042 get_ivts_expr (rtx expr, struct iv_to_split *ivts)
2043 {
2044   unsigned i;
2045   rtx *ret = &expr;
2046
2047   for (i = 0; i < ivts->n_loc; i++)
2048     ret = &XEXP (*ret, ivts->loc[i]);
2049
2050   return ret;
2051 }
2052
2053 /* Allocate basic variable for the induction variable chain.  */
2054
2055 static void
2056 allocate_basic_variable (struct iv_to_split *ivts)
2057 {
2058   rtx expr = *get_ivts_expr (single_set (ivts->insn), ivts);
2059
2060   ivts->base_var = gen_reg_rtx (GET_MODE (expr));
2061 }
2062
2063 /* Insert initialization of basic variable of IVTS before INSN, taking
2064    the initial value from INSN.  */
2065
2066 static void
2067 insert_base_initialization (struct iv_to_split *ivts, rtx insn)
2068 {
2069   rtx expr = copy_rtx (*get_ivts_expr (single_set (insn), ivts));
2070   rtx seq;
2071
2072   start_sequence ();
2073   expr = force_operand (expr, ivts->base_var);
2074   if (expr != ivts->base_var)
2075     emit_move_insn (ivts->base_var, expr);
2076   seq = get_insns ();
2077   end_sequence ();
2078
2079   emit_insn_before (seq, insn);
2080 }
2081
2082 /* Replace the use of induction variable described in IVTS in INSN
2083    by base variable + DELTA * step.  */
2084
2085 static void
2086 split_iv (struct iv_to_split *ivts, rtx insn, unsigned delta)
2087 {
2088   rtx expr, *loc, seq, incr, var;
2089   enum machine_mode mode = GET_MODE (ivts->base_var);
2090   rtx src, dest, set;
2091
2092   /* Construct base + DELTA * step.  */
2093   if (!delta)
2094     expr = ivts->base_var;
2095   else
2096     {
2097       incr = simplify_gen_binary (MULT, mode,
2098                                   ivts->step, gen_int_mode (delta, mode));
2099       expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
2100                                   ivts->base_var, incr);
2101     }
2102
2103   /* Figure out where to do the replacement.  */
2104   loc = get_ivts_expr (single_set (insn), ivts);
2105
2106   /* If we can make the replacement right away, we're done.  */
2107   if (validate_change (insn, loc, expr, 0))
2108     return;
2109
2110   /* Otherwise, force EXPR into a register and try again.  */
2111   start_sequence ();
2112   var = gen_reg_rtx (mode);
2113   expr = force_operand (expr, var);
2114   if (expr != var)
2115     emit_move_insn (var, expr);
2116   seq = get_insns ();
2117   end_sequence ();
2118   emit_insn_before (seq, insn);
2119
2120   if (validate_change (insn, loc, var, 0))
2121     return;
2122
2123   /* The last chance.  Try recreating the assignment in insn
2124      completely from scratch.  */
2125   set = single_set (insn);
2126   gcc_assert (set);
2127
2128   start_sequence ();
2129   *loc = var;
2130   src = copy_rtx (SET_SRC (set));
2131   dest = copy_rtx (SET_DEST (set));
2132   src = force_operand (src, dest);
2133   if (src != dest)
2134     emit_move_insn (dest, src);
2135   seq = get_insns ();
2136   end_sequence ();
2137
2138   emit_insn_before (seq, insn);
2139   delete_insn (insn);
2140 }
2141
2142
2143 /* Return one expansion of the accumulator recorded in struct VE.  */
2144
2145 static rtx
2146 get_expansion (struct var_to_expand *ve)
2147 {
2148   rtx reg;
2149
2150   if (ve->reuse_expansion == 0)
2151     reg = ve->reg;
2152   else
2153     reg = ve->var_expansions[ve->reuse_expansion - 1];
2154
2155   if (ve->var_expansions.length () == (unsigned) ve->reuse_expansion)
2156     ve->reuse_expansion = 0;
2157   else
2158     ve->reuse_expansion++;
2159
2160   return reg;
2161 }
2162
2163
2164 /* Given INSN replace the uses of the accumulator recorded in VE
2165    with a new register.  */
2166
2167 static void
2168 expand_var_during_unrolling (struct var_to_expand *ve, rtx insn)
2169 {
2170   rtx new_reg, set;
2171   bool really_new_expansion = false;
2172
2173   set = single_set (insn);
2174   gcc_assert (set);
2175
2176   /* Generate a new register only if the expansion limit has not been
2177      reached.  Else reuse an already existing expansion.  */
2178   if (PARAM_VALUE (PARAM_MAX_VARIABLE_EXPANSIONS) > ve->expansion_count)
2179     {
2180       really_new_expansion = true;
2181       new_reg = gen_reg_rtx (GET_MODE (ve->reg));
2182     }
2183   else
2184     new_reg = get_expansion (ve);
2185
2186   validate_replace_rtx_group (SET_DEST (set), new_reg, insn);
2187   if (apply_change_group ())
2188     if (really_new_expansion)
2189       {
2190         ve->var_expansions.safe_push (new_reg);
2191         ve->expansion_count++;
2192       }
2193 }
2194
2195 /* Initialize the variable expansions in loop preheader.  PLACE is the
2196    loop-preheader basic block where the initialization of the
2197    expansions should take place.  The expansions are initialized with
2198    (-0) when the operation is plus or minus to honor sign zero.  This
2199    way we can prevent cases where the sign of the final result is
2200    effected by the sign of the expansion.  Here is an example to
2201    demonstrate this:
2202
2203    for (i = 0 ; i < n; i++)
2204      sum += something;
2205
2206    ==>
2207
2208    sum += something
2209    ....
2210    i = i+1;
2211    sum1 += something
2212    ....
2213    i = i+1
2214    sum2 += something;
2215    ....
2216
2217    When SUM is initialized with -zero and SOMETHING is also -zero; the
2218    final result of sum should be -zero thus the expansions sum1 and sum2
2219    should be initialized with -zero as well (otherwise we will get +zero
2220    as the final result).  */
2221
2222 static void
2223 insert_var_expansion_initialization (struct var_to_expand *ve,
2224                                      basic_block place)
2225 {
2226   rtx seq, var, zero_init;
2227   unsigned i;
2228   enum machine_mode mode = GET_MODE (ve->reg);
2229   bool honor_signed_zero_p = HONOR_SIGNED_ZEROS (mode);
2230
2231   if (ve->var_expansions.length () == 0)
2232     return;
2233
2234   start_sequence ();
2235   switch (ve->op)
2236     {
2237     case FMA:
2238       /* Note that we only accumulate FMA via the ADD operand.  */
2239     case PLUS:
2240     case MINUS:
2241       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2242         {
2243           if (honor_signed_zero_p)
2244             zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode);
2245           else
2246             zero_init = CONST0_RTX (mode);
2247           emit_move_insn (var, zero_init);
2248         }
2249       break;
2250
2251     case MULT:
2252       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2253         {
2254           zero_init = CONST1_RTX (GET_MODE (var));
2255           emit_move_insn (var, zero_init);
2256         }
2257       break;
2258
2259     default:
2260       gcc_unreachable ();
2261     }
2262
2263   seq = get_insns ();
2264   end_sequence ();
2265
2266   emit_insn_after (seq, BB_END (place));
2267 }
2268
2269 /* Combine the variable expansions at the loop exit.  PLACE is the
2270    loop exit basic block where the summation of the expansions should
2271    take place.  */
2272
2273 static void
2274 combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
2275 {
2276   rtx sum = ve->reg;
2277   rtx expr, seq, var, insn;
2278   unsigned i;
2279
2280   if (ve->var_expansions.length () == 0)
2281     return;
2282
2283   start_sequence ();
2284   switch (ve->op)
2285     {
2286     case FMA:
2287       /* Note that we only accumulate FMA via the ADD operand.  */
2288     case PLUS:
2289     case MINUS:
2290       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2291         sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), var, sum);
2292       break;
2293
2294     case MULT:
2295       FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
2296         sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), var, sum);
2297       break;
2298
2299     default:
2300       gcc_unreachable ();
2301     }
2302
2303   expr = force_operand (sum, ve->reg);
2304   if (expr != ve->reg)
2305     emit_move_insn (ve->reg, expr);
2306   seq = get_insns ();
2307   end_sequence ();
2308
2309   insn = BB_HEAD (place);
2310   while (!NOTE_INSN_BASIC_BLOCK_P (insn))
2311     insn = NEXT_INSN (insn);
2312
2313   emit_insn_after (seq, insn);
2314 }
2315
2316 /* Strip away REG_EQUAL notes for IVs we're splitting.
2317
2318    Updating REG_EQUAL notes for IVs we split is tricky: We
2319    cannot tell until after unrolling, DF-rescanning, and liveness
2320    updating, whether an EQ_USE is reached by the split IV while
2321    the IV reg is still live.  See PR55006.
2322
2323    ??? We cannot use remove_reg_equal_equiv_notes_for_regno,
2324    because RTL loop-iv requires us to defer rescanning insns and
2325    any notes attached to them.  So resort to old techniques...  */
2326
2327 static void
2328 maybe_strip_eq_note_for_split_iv (struct opt_info *opt_info, rtx insn)
2329 {
2330   struct iv_to_split *ivts;
2331   rtx note = find_reg_equal_equiv_note (insn);
2332   if (! note)
2333     return;
2334   for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
2335     if (reg_mentioned_p (ivts->orig_var, note))
2336       {
2337         remove_note (insn, note);
2338         return;
2339       }
2340 }
2341
2342 /* Apply loop optimizations in loop copies using the
2343    data which gathered during the unrolling.  Structure
2344    OPT_INFO record that data.
2345
2346    UNROLLING is true if we unrolled (not peeled) the loop.
2347    REWRITE_ORIGINAL_BODY is true if we should also rewrite the original body of
2348    the loop (as it should happen in complete unrolling, but not in ordinary
2349    peeling of the loop).  */
2350
2351 static void
2352 apply_opt_in_copies (struct opt_info *opt_info,
2353                      unsigned n_copies, bool unrolling,
2354                      bool rewrite_original_loop)
2355 {
2356   unsigned i, delta;
2357   basic_block bb, orig_bb;
2358   rtx insn, orig_insn, next;
2359   struct iv_to_split ivts_templ, *ivts;
2360   struct var_to_expand ve_templ, *ves;
2361
2362   /* Sanity check -- we need to put initialization in the original loop
2363      body.  */
2364   gcc_assert (!unrolling || rewrite_original_loop);
2365
2366   /* Allocate the basic variables (i0).  */
2367   if (opt_info->insns_to_split.is_created ())
2368     for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
2369       allocate_basic_variable (ivts);
2370
2371   for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
2372     {
2373       bb = BASIC_BLOCK_FOR_FN (cfun, i);
2374       orig_bb = get_bb_original (bb);
2375
2376       /* bb->aux holds position in copy sequence initialized by
2377          duplicate_loop_to_header_edge.  */
2378       delta = determine_split_iv_delta ((size_t)bb->aux, n_copies,
2379                                         unrolling);
2380       bb->aux = 0;
2381       orig_insn = BB_HEAD (orig_bb);
2382       FOR_BB_INSNS_SAFE (bb, insn, next)
2383         {
2384           if (!INSN_P (insn)
2385               || (DEBUG_INSN_P (insn)
2386                   && TREE_CODE (INSN_VAR_LOCATION_DECL (insn)) == LABEL_DECL))
2387             continue;
2388
2389           while (!INSN_P (orig_insn)
2390                  || (DEBUG_INSN_P (orig_insn)
2391                      && (TREE_CODE (INSN_VAR_LOCATION_DECL (orig_insn))
2392                          == LABEL_DECL)))
2393             orig_insn = NEXT_INSN (orig_insn);
2394
2395           ivts_templ.insn = orig_insn;
2396           ve_templ.insn = orig_insn;
2397
2398           /* Apply splitting iv optimization.  */
2399           if (opt_info->insns_to_split.is_created ())
2400             {
2401               maybe_strip_eq_note_for_split_iv (opt_info, insn);
2402
2403               ivts = opt_info->insns_to_split.find (&ivts_templ);
2404
2405               if (ivts)
2406                 {
2407                   gcc_assert (GET_CODE (PATTERN (insn))
2408                               == GET_CODE (PATTERN (orig_insn)));
2409
2410                   if (!delta)
2411                     insert_base_initialization (ivts, insn);
2412                   split_iv (ivts, insn, delta);
2413                 }
2414             }
2415           /* Apply variable expansion optimization.  */
2416           if (unrolling && opt_info->insns_with_var_to_expand.is_created ())
2417             {
2418               ves = (struct var_to_expand *)
2419                 opt_info->insns_with_var_to_expand.find (&ve_templ);
2420               if (ves)
2421                 {
2422                   gcc_assert (GET_CODE (PATTERN (insn))
2423                               == GET_CODE (PATTERN (orig_insn)));
2424                   expand_var_during_unrolling (ves, insn);
2425                 }
2426             }
2427           orig_insn = NEXT_INSN (orig_insn);
2428         }
2429     }
2430
2431   if (!rewrite_original_loop)
2432     return;
2433
2434   /* Initialize the variable expansions in the loop preheader
2435      and take care of combining them at the loop exit.  */
2436   if (opt_info->insns_with_var_to_expand.is_created ())
2437     {
2438       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2439         insert_var_expansion_initialization (ves, opt_info->loop_preheader);
2440       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2441         combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
2442     }
2443
2444   /* Rewrite also the original loop body.  Find them as originals of the blocks
2445      in the last copied iteration, i.e. those that have
2446      get_bb_copy (get_bb_original (bb)) == bb.  */
2447   for (i = opt_info->first_new_block; i < (unsigned) last_basic_block; i++)
2448     {
2449       bb = BASIC_BLOCK_FOR_FN (cfun, i);
2450       orig_bb = get_bb_original (bb);
2451       if (get_bb_copy (orig_bb) != bb)
2452         continue;
2453
2454       delta = determine_split_iv_delta (0, n_copies, unrolling);
2455       for (orig_insn = BB_HEAD (orig_bb);
2456            orig_insn != NEXT_INSN (BB_END (bb));
2457            orig_insn = next)
2458         {
2459           next = NEXT_INSN (orig_insn);
2460
2461           if (!INSN_P (orig_insn))
2462             continue;
2463
2464           ivts_templ.insn = orig_insn;
2465           if (opt_info->insns_to_split.is_created ())
2466             {
2467               maybe_strip_eq_note_for_split_iv (opt_info, orig_insn);
2468
2469               ivts = (struct iv_to_split *)
2470                 opt_info->insns_to_split.find (&ivts_templ);
2471               if (ivts)
2472                 {
2473                   if (!delta)
2474                     insert_base_initialization (ivts, orig_insn);
2475                   split_iv (ivts, orig_insn, delta);
2476                   continue;
2477                 }
2478             }
2479
2480         }
2481     }
2482 }
2483
2484 /* Release OPT_INFO.  */
2485
2486 static void
2487 free_opt_info (struct opt_info *opt_info)
2488 {
2489   if (opt_info->insns_to_split.is_created ())
2490     opt_info->insns_to_split.dispose ();
2491   if (opt_info->insns_with_var_to_expand.is_created ())
2492     {
2493       struct var_to_expand *ves;
2494
2495       for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2496         ves->var_expansions.release ();
2497       opt_info->insns_with_var_to_expand.dispose ();
2498     }
2499   free (opt_info);
2500 }