gcc/gimple-loop-versioning.cc

   1 /* Loop versioning pass.
   2    Copyright (C) 2018-2019 Free Software Foundation, Inc.
   3
   4 This file is part of GCC.
   5
   6 GCC is free software; you can redistribute it and/or modify it
   7 under the terms of the GNU General Public License as published by the
   8 Free Software Foundation; either version 3, or (at your option) any
   9 later version.
  10
  11 GCC is distributed in the hope that it will be useful, but WITHOUT
  12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  13 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  14 for more details.
  15
  16 You should have received a copy of the GNU General Public License
  17 along with GCC; see the file COPYING3.  If not see
  18 <http://www.gnu.org/licenses/>.  */
  19
  20 #include "config.h"
  21 #include "system.h"
  22 #include "coretypes.h"
  23 #include "backend.h"
  24 #include "tree.h"
  25 #include "gimple.h"
  26 #include "gimple-iterator.h"
  27 #include "tree-pass.h"
  28 #include "gimplify-me.h"
  29 #include "cfgloop.h"
  30 #include "tree-ssa-loop.h"
  31 #include "ssa.h"
  32 #include "tree-scalar-evolution.h"
  33 #include "tree-chrec.h"
  34 #include "tree-ssa-loop-ivopts.h"
  35 #include "fold-const.h"
  36 #include "tree-ssa-propagate.h"
  37 #include "tree-inline.h"
  38 #include "domwalk.h"
  39 #include "alloc-pool.h"
  40 #include "vr-values.h"
  41 #include "gimple-ssa-evrp-analyze.h"
  42 #include "tree-vectorizer.h"
  43 #include "omp-general.h"
  44 #include "predict.h"
  45 #include "tree-into-ssa.h"
  46 #include "params.h"
  47
  48 namespace {
  49
  50 /* This pass looks for loops that could be simplified if certain loop
  51    invariant conditions were true.  It is effectively a form of loop
  52    splitting in which the pass produces the split conditions itself,
  53    instead of using ones that are already present in the IL.
  54
  55    Versioning for when strides are 1
  56    ---------------------------------
  57
  58    At the moment the only thing the pass looks for are memory references
  59    like:
  60
  61      for (auto i : ...)
  62        ...x[i * stride]...
  63
  64    It considers changing such loops to:
  65
  66      if (stride == 1)
  67        for (auto i : ...)    [A]
  68          ...x[i]...
  69      else
  70        for (auto i : ...)    [B]
  71          ...x[i * stride]...
  72
  73    This can have several benefits:
  74
  75    (1) [A] is often easier or cheaper to vectorize than [B].
  76
  77    (2) The scalar code in [A] is simpler than the scalar code in [B]
  78        (if the loops cannot be vectorized or need an epilogue loop).
  79
  80    (3) We might recognize [A] as a pattern, such as a memcpy or memset.
  81
  82    (4) [A] has simpler address evolutions, which can help other passes
  83        like loop interchange.
  84
  85    The optimization is particularly useful for assumed-shape arrays in
  86    Fortran, where the stride of the innermost dimension depends on the
  87    array descriptor but is often equal to 1 in practice.  For example:
  88
  89      subroutine f1(x)
  90        real :: x(:)
  91        x(:) = 100
  92      end subroutine f1
  93
  94    generates the equivalent of:
  95
  96      raw_stride = *x.dim[0].stride;
  97      stride = raw_stride != 0 ? raw_stride : 1;
  98      x_base = *x.data;
  99      ...
 100      tmp1 = stride * S;
 101      tmp2 = tmp1 - stride;
 102      *x_base[tmp2] = 1.0e+2;
 103
 104    but in the common case that stride == 1, the last three statements
 105    simplify to:
 106
 107      tmp3 = S + -1;
 108      *x_base[tmp3] = 1.0e+2;
 109
 110    The optimization is in principle very simple.  The difficult parts are:
 111
 112    (a) deciding which parts of a general address calculation correspond
 113        to the inner dimension of an array, since this usually isn't explicit
 114        in the IL, and for C often isn't even explicit in the source code
 115
 116    (b) estimating when the transformation is worthwhile
 117
 118    Structure
 119    ---------
 120
 121    The pass has four phases:
 122
 123    (1) Walk through the statements looking for and recording potential
 124        versioning opportunities.  Stop if there are none.
 125
 126    (2) Use context-sensitive range information to see whether any versioning
 127        conditions are impossible in practice.  Remove them if so, and stop
 128        if no opportunities remain.
 129
 130        (We do this only after (1) to keep compile time down when no
 131        versioning opportunities exist.)
 132
 133    (3) Apply the cost model.  Decide which versioning opportunities are
 134        worthwhile and at which nesting level they should be applied.
 135
 136    (4) Attempt to version all the loops selected by (3), so that:
 137
 138          for (...)
 139            ...
 140
 141        becomes:
 142
 143          if (!cond)
 144            for (...) // Original loop
 145              ...
 146          else
 147            for (...) // New loop
 148              ...
 149
 150        Use the version condition COND to simplify the new loop.  */
 151
 152 /* Enumerates the likelihood that a particular value indexes the inner
 153    dimension of an array.  */
 154 enum inner_likelihood {
 155   INNER_UNLIKELY,
 156   INNER_DONT_KNOW,
 157   INNER_LIKELY
 158 };
 159
 160 /* Information about one term of an address_info.  */
 161 struct address_term_info
 162 {
 163   /* The value of the term is EXPR * MULTIPLIER.  */
 164   tree expr;
 165   unsigned HOST_WIDE_INT multiplier;
 166
 167   /* The stride applied by EXPR in each iteration of some unrecorded loop,
 168      or null if no stride has been identified.  */
 169   tree stride;
 170
 171   /* Enumerates the likelihood that EXPR indexes the inner dimension
 172      of an array.  */
 173   enum inner_likelihood inner_likelihood;
 174
 175   /* True if STRIDE == 1 is a versioning opportunity when considered
 176      in isolation.  */
 177   bool versioning_opportunity_p;
 178 };
 179
 180 /* Information about an address calculation, and the range of constant
 181    offsets applied to it.  */
 182 struct address_info
 183 {
 184   static const unsigned int MAX_TERMS = 8;
 185
 186   /* One statement that calculates the address.  If multiple statements
 187      share the same address, we only record the first.  */
 188   gimple *stmt;
 189
 190   /* The loop containing STMT (cached for convenience).  If multiple
 191      statements share the same address, they all belong to this loop.  */
 192   struct loop *loop;
 193
 194   /* A decomposition of the calculation into a sum of terms plus an
 195      optional base.  When BASE is provided, it is never an SSA name.
 196      Once initialization is complete, all members of TERMs are SSA names.  */
 197   tree base;
 198   auto_vec<address_term_info, MAX_TERMS> terms;
 199
 200   /* All bytes accessed from the address fall in the offset range
 201      [MIN_OFFSET, MAX_OFFSET).  */
 202   HOST_WIDE_INT min_offset, max_offset;
 203 };
 204
 205 /* Stores addresses based on their base and terms (ignoring the offsets).  */
 206 struct address_info_hasher : nofree_ptr_hash <address_info>
 207 {
 208   static hashval_t hash (const address_info *);
 209   static bool equal (const address_info *, const address_info *);
 210 };
 211
 212 /* Information about the versioning we'd like to apply to a loop.  */
 213 struct loop_info
 214 {
 215   bool worth_versioning_p () const;
 216
 217   /* True if we've decided not to version this loop.  The remaining
 218      fields are meaningless if so.  */
 219   bool rejected_p;
 220
 221   /* True if at least one subloop of this loop benefits from versioning.  */
 222   bool subloops_benefit_p;
 223
 224   /* An estimate of the total number of instructions in the loop,
 225      excluding those in subloops that benefit from versioning.  */
 226   unsigned int num_insns;
 227
 228   /* The outermost loop that can handle all the version checks
 229      described below.  */
 230   struct loop *outermost;
 231
 232   /* The first entry in the list of blocks that belong to this loop
 233      (and not to subloops).  m_next_block_in_loop provides the chain
 234      pointers for the list.  */
 235   basic_block block_list;
 236
 237   /* We'd like to version the loop for the case in which these SSA names
 238      (keyed off their SSA_NAME_VERSION) are all equal to 1 at runtime.  */
 239   bitmap_head unity_names;
 240
 241   /* If versioning succeeds, this points the version of the loop that
 242      assumes the version conditions holds.  */
 243   struct loop *optimized_loop;
 244 };
 245
 246 /* The main pass structure.  */
 247 class loop_versioning
 248 {
 249 public:
 250   loop_versioning (function *);
 251   ~loop_versioning ();
 252   unsigned int run ();
 253
 254 private:
 255   /* Used to walk the dominator tree to find loop versioning conditions
 256      that are always false.  */
 257   class lv_dom_walker : public dom_walker
 258   {
 259   public:
 260     lv_dom_walker (loop_versioning &);
 261
 262     edge before_dom_children (basic_block) FINAL OVERRIDE;
 263     void after_dom_children (basic_block) FINAL OVERRIDE;
 264
 265   private:
 266     /* The parent pass.  */
 267     loop_versioning &m_lv;
 268
 269     /* Used to build context-dependent range information.  */
 270     evrp_range_analyzer m_range_analyzer;
 271   };
 272
 273   /* Used to simplify statements based on conditions that are established
 274      by the version checks.  */
 275   class name_prop : public substitute_and_fold_engine
 276   {
 277   public:
 278     name_prop (loop_info &li) : m_li (li) {}
 279     tree get_value (tree) FINAL OVERRIDE;
 280
 281   private:
 282     /* Information about the versioning we've performed on the loop.  */
 283     loop_info &m_li;
 284   };
 285
 286   loop_info &get_loop_info (struct loop *loop) { return m_loops[loop->num]; }
 287
 288   unsigned int max_insns_for_loop (struct loop *);
 289   bool expensive_stmt_p (gimple *);
 290
 291   void version_for_unity (gimple *, tree);
 292   bool acceptable_multiplier_p (tree, unsigned HOST_WIDE_INT,
 293                                 unsigned HOST_WIDE_INT * = 0);
 294   bool acceptable_type_p (tree, unsigned HOST_WIDE_INT *);
 295   bool multiply_term_by (address_term_info &, tree);
 296   inner_likelihood get_inner_likelihood (tree, unsigned HOST_WIDE_INT);
 297   void dump_inner_likelihood (address_info &, address_term_info &);
 298   void analyze_stride (address_info &, address_term_info &,
 299                        tree, struct loop *);
 300   bool find_per_loop_multiplication (address_info &, address_term_info &);
 301   bool analyze_term_using_scevs (address_info &, address_term_info &);
 302   void analyze_arbitrary_term (address_info &, address_term_info &);
 303   void analyze_address_fragment (address_info &);
 304   void record_address_fragment (gimple *, unsigned HOST_WIDE_INT,
 305                                 tree, unsigned HOST_WIDE_INT, HOST_WIDE_INT);
 306   void analyze_expr (gimple *, tree);
 307   bool analyze_block (basic_block);
 308   bool analyze_blocks ();
 309
 310   void prune_loop_conditions (struct loop *, vr_values *);
 311   bool prune_conditions ();
 312
 313   void merge_loop_info (struct loop *, struct loop *);
 314   void add_loop_to_queue (struct loop *);
 315   bool decide_whether_loop_is_versionable (struct loop *);
 316   bool make_versioning_decisions ();
 317
 318   bool version_loop (struct loop *);
 319   void implement_versioning_decisions ();
 320
 321   /* The function we're optimizing.  */
 322   function *m_fn;
 323
 324   /* The obstack to use for all pass-specific bitmaps.  */
 325   bitmap_obstack m_bitmap_obstack;
 326
 327   /* An obstack to use for general allocation.  */
 328   obstack m_obstack;
 329
 330   /* The number of loops in the function.  */
 331   unsigned int m_nloops;
 332
 333   /* The total number of loop version conditions we've found.  */
 334   unsigned int m_num_conditions;
 335
 336   /* Assume that an address fragment of the form i * stride * scale
 337      (for variable stride and constant scale) will not benefit from
 338      versioning for stride == 1 when scale is greater than this value.  */
 339   unsigned HOST_WIDE_INT m_maximum_scale;
 340
 341   /* Information about each loop.  */
 342   auto_vec<loop_info> m_loops;
 343
 344   /* Used to form a linked list of blocks that belong to a loop,
 345      started by loop_info::block_list.  */
 346   auto_vec<basic_block> m_next_block_in_loop;
 347
 348   /* The list of loops that we've decided to version.  */
 349   auto_vec<struct loop *> m_loops_to_version;
 350
 351   /* A table of addresses in the current loop, keyed off their values
 352      but not their offsets.  */
 353   hash_table <address_info_hasher> m_address_table;
 354
 355   /* A list of all addresses in M_ADDRESS_TABLE, in a predictable order.  */
 356   auto_vec <address_info *, 32> m_address_list;
 357 };
 358
 359 /* If EXPR is an SSA name and not a default definition, return the
 360    defining statement, otherwise return null.  */
 361
 362 static gimple *
 363 maybe_get_stmt (tree expr)
 364 {
 365   if (TREE_CODE (expr) == SSA_NAME && !SSA_NAME_IS_DEFAULT_DEF (expr))
 366     return SSA_NAME_DEF_STMT (expr);
 367   return NULL;
 368 }
 369
 370 /* Like maybe_get_stmt, but also return null if the defining
 371    statement isn't an assignment.  */
 372
 373 static gassign *
 374 maybe_get_assign (tree expr)
 375 {
 376   return safe_dyn_cast <gassign *> (maybe_get_stmt (expr));
 377 }
 378
 379 /* Return true if this pass should look through a cast of expression FROM
 380    to type TYPE when analyzing pieces of an address.  */
 381
 382 static bool
 383 look_through_cast_p (tree type, tree from)
 384 {
 385   return (INTEGRAL_TYPE_P (TREE_TYPE (from)) == INTEGRAL_TYPE_P (type)
 386           && POINTER_TYPE_P (TREE_TYPE (from)) == POINTER_TYPE_P (type));
 387 }
 388
 389 /* Strip all conversions of integers or pointers from EXPR, regardless
 390    of whether the conversions are nops.  This is useful in the context
 391    of this pass because we're not trying to fold or simulate the
 392    expression; we just want to see how it's structured.  */
 393
 394 static tree
 395 strip_casts (tree expr)
 396 {
 397   const unsigned int MAX_NITERS = 4;
 398
 399   tree type = TREE_TYPE (expr);
 400   while (CONVERT_EXPR_P (expr)
 401          && look_through_cast_p (type, TREE_OPERAND (expr, 0)))
 402     expr = TREE_OPERAND (expr, 0);
 403
 404   for (unsigned int niters = 0; niters < MAX_NITERS; ++niters)
 405     {
 406       gassign *assign = maybe_get_assign (expr);
 407       if (assign
 408           && CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign))
 409           && look_through_cast_p (type, gimple_assign_rhs1 (assign)))
 410         expr = gimple_assign_rhs1 (assign);
 411       else
 412         break;
 413     }
 414   return expr;
 415 }
 416
 417 /* Compare two address_term_infos in the same address_info.  */
 418
 419 static int
 420 compare_address_terms (const void *a_uncast, const void *b_uncast)
 421 {
 422   const address_term_info *a = (const address_term_info *) a_uncast;
 423   const address_term_info *b = (const address_term_info *) b_uncast;
 424
 425   if (a->expr != b->expr)
 426     return SSA_NAME_VERSION (a->expr) < SSA_NAME_VERSION (b->expr) ? -1 : 1;
 427
 428   if (a->multiplier != b->multiplier)
 429     return a->multiplier < b->multiplier ? -1 : 1;
 430
 431   return 0;
 432 }
 433
 434 /* Dump ADDRESS using flags FLAGS.  */
 435
 436 static void
 437 dump_address_info (dump_flags_t flags, address_info &address)
 438 {
 439   if (address.base)
 440     dump_printf (flags, "%T + ", address.base);
 441   for (unsigned int i = 0; i < address.terms.length (); ++i)
 442     {
 443       if (i != 0)
 444         dump_printf (flags, " + ");
 445       dump_printf (flags, "%T", address.terms[i].expr);
 446       if (address.terms[i].multiplier != 1)
 447         dump_printf (flags, " * %wd", address.terms[i].multiplier);
 448     }
 449   dump_printf (flags, " + [%wd, %wd]",
 450                address.min_offset, address.max_offset - 1);
 451 }
 452
 453 /* Hash an address_info based on its base and terms.  */
 454
 455 hashval_t
 456 address_info_hasher::hash (const address_info *info)
 457 {
 458   inchash::hash hash;
 459   hash.add_int (info->base ? TREE_CODE (info->base) : 0);
 460   hash.add_int (info->terms.length ());
 461   for (unsigned int i = 0; i < info->terms.length (); ++i)
 462     {
 463       hash.add_int (SSA_NAME_VERSION (info->terms[i].expr));
 464       hash.add_hwi (info->terms[i].multiplier);
 465     }
 466   return hash.end ();
 467 }
 468
 469 /* Return true if two address_infos have equal bases and terms.  Other
 470    properties might be different (such as the statement or constant
 471    offset range).  */
 472
 473 bool
 474 address_info_hasher::equal (const address_info *a, const address_info *b)
 475 {
 476   if (a->base != b->base
 477       && (!a->base || !b->base || !operand_equal_p (a->base, b->base, 0)))
 478     return false;
 479
 480   if (a->terms.length () != b->terms.length ())
 481     return false;
 482
 483   for (unsigned int i = 0; i < a->terms.length (); ++i)
 484     if (a->terms[i].expr != b->terms[i].expr
 485         || a->terms[i].multiplier != b->terms[i].multiplier)
 486       return false;
 487
 488   return true;
 489 }
 490
 491 /* Return true if we want to version the loop, i.e. if we have a
 492    specific reason for doing so and no specific reason not to.  */
 493
 494 bool
 495 loop_info::worth_versioning_p () const
 496 {
 497   return (!rejected_p
 498           && (!bitmap_empty_p (&unity_names) || subloops_benefit_p));
 499 }
 500
 501 loop_versioning::lv_dom_walker::lv_dom_walker (loop_versioning &lv)
 502   : dom_walker (CDI_DOMINATORS), m_lv (lv), m_range_analyzer (false)
 503 {
 504 }
 505
 506 /* Process BB before processing the blocks it dominates.  */
 507
 508 edge
 509 loop_versioning::lv_dom_walker::before_dom_children (basic_block bb)
 510 {
 511   m_range_analyzer.enter (bb);
 512
 513   if (bb == bb->loop_father->header)
 514     m_lv.prune_loop_conditions (bb->loop_father,
 515                                 m_range_analyzer.get_vr_values ());
 516
 517   for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
 518        gsi_next (&si))
 519     m_range_analyzer.record_ranges_from_stmt (gsi_stmt (si), false);
 520
 521   return NULL;
 522 }
 523
 524 /* Process BB after processing the blocks it dominates.  */
 525
 526 void
 527 loop_versioning::lv_dom_walker::after_dom_children (basic_block bb)
 528 {
 529   m_range_analyzer.leave (bb);
 530 }
 531
 532 /* Decide whether to replace VAL with a new value in a versioned loop.
 533    Return the new value if so, otherwise return null.  */
 534
 535 tree
 536 loop_versioning::name_prop::get_value (tree val)
 537 {
 538   if (TREE_CODE (val) == SSA_NAME
 539       && bitmap_bit_p (&m_li.unity_names, SSA_NAME_VERSION (val)))
 540     return build_one_cst (TREE_TYPE (val));
 541   return NULL_TREE;
 542 }
 543
 544 /* Initialize the structure to optimize FN.  */
 545
 546 loop_versioning::loop_versioning (function *fn)
 547   : m_fn (fn),
 548     m_nloops (number_of_loops (fn)),
 549     m_num_conditions (0),
 550     m_address_table (31)
 551 {
 552   bitmap_obstack_initialize (&m_bitmap_obstack);
 553   gcc_obstack_init (&m_obstack);
 554
 555   /* Initialize the loop information.  */
 556   m_loops.safe_grow_cleared (m_nloops);
 557   for (unsigned int i = 0; i < m_nloops; ++i)
 558     {
 559       m_loops[i].outermost = get_loop (m_fn, 0);
 560       bitmap_initialize (&m_loops[i].unity_names, &m_bitmap_obstack);
 561     }
 562
 563   /* Initialize the list of blocks that belong to each loop.  */
 564   unsigned int nbbs = last_basic_block_for_fn (fn);
 565   m_next_block_in_loop.safe_grow (nbbs);
 566   basic_block bb;
 567   FOR_EACH_BB_FN (bb, fn)
 568     {
 569       loop_info &li = get_loop_info (bb->loop_father);
 570       m_next_block_in_loop[bb->index] = li.block_list;
 571       li.block_list = bb;
 572     }
 573
 574   /* MAX_FIXED_MODE_SIZE should be a reasonable maximum scale for
 575      unvectorizable code, since it is the largest size that can be
 576      handled efficiently by scalar code.  omp_max_vf calculates the
 577      maximum number of bytes in a vector, when such a value is relevant
 578      to loop optimization.  */
 579   m_maximum_scale = estimated_poly_value (omp_max_vf ());
 580   m_maximum_scale = MAX (m_maximum_scale, MAX_FIXED_MODE_SIZE);
 581 }
 582
 583 loop_versioning::~loop_versioning ()
 584 {
 585   bitmap_obstack_release (&m_bitmap_obstack);
 586   obstack_free (&m_obstack, NULL);
 587 }
 588
 589 /* Return the maximum number of instructions allowed in LOOP before
 590    it becomes too big for versioning.
 591
 592    There are separate limits for inner and outer loops.  The limit for
 593    inner loops applies only to loops that benefit directly from versioning.
 594    The limit for outer loops applies to all code in the outer loop and
 595    its subloops that *doesn't* benefit directly from versioning; such code
 596    would be "taken along for the ride".  The idea is that if the cost of
 597    the latter is small, it is better to version outer loops rather than
 598    inner loops, both to reduce the number of repeated checks and to enable
 599    more of the loop nest to be optimized as a natural nest (e.g. by loop
 600    interchange or outer-loop vectorization).  */
 601
 602 unsigned int
 603 loop_versioning::max_insns_for_loop (struct loop *loop)
 604 {
 605   return (loop->inner
 606           ? PARAM_VALUE (PARAM_LOOP_VERSIONING_MAX_OUTER_INSNS)
 607           : PARAM_VALUE (PARAM_LOOP_VERSIONING_MAX_INNER_INSNS));
 608 }
 609
 610 /* Return true if for cost reasons we should avoid versioning any loop
 611    that contains STMT.
 612
 613    Note that we don't need to check whether versioning is invalid for
 614    correctness reasons, since the versioning process does that for us.
 615    The conditions involved are too rare to be worth duplicating here.  */
 616
 617 bool
 618 loop_versioning::expensive_stmt_p (gimple *stmt)
 619 {
 620   if (gcall *call = dyn_cast <gcall *> (stmt))
 621     /* Assume for now that the time spent in an "expensive" call would
 622        overwhelm any saving from versioning.  */
 623     return !gimple_inexpensive_call_p (call);
 624   return false;
 625 }
 626
 627 /* Record that we want to version the loop that contains STMT for the
 628    case in which SSA name NAME is equal to 1.  We already know that NAME
 629    is invariant in the loop.  */
 630
 631 void
 632 loop_versioning::version_for_unity (gimple *stmt, tree name)
 633 {
 634   struct loop *loop = loop_containing_stmt (stmt);
 635   loop_info &li = get_loop_info (loop);
 636
 637   if (bitmap_set_bit (&li.unity_names, SSA_NAME_VERSION (name)))
 638     {
 639       /* This is the first time we've wanted to version LOOP for NAME.
 640          Keep track of the outermost loop that can handle all versioning
 641          checks in LI.  */
 642       struct loop *outermost
 643         = outermost_invariant_loop_for_expr (loop, name);
 644       if (loop_depth (li.outermost) < loop_depth (outermost))
 645         li.outermost = outermost;
 646
 647       if (dump_enabled_p ())
 648         {
 649           dump_printf_loc (MSG_NOTE, stmt, "want to version containing loop"
 650                            " for when %T == 1", name);
 651           if (outermost == loop)
 652             dump_printf (MSG_NOTE, "; cannot hoist check further");
 653           else
 654             {
 655               dump_printf (MSG_NOTE, "; could implement the check at loop"
 656                            " depth %d", loop_depth (outermost));
 657               if (loop_depth (li.outermost) > loop_depth (outermost))
 658                 dump_printf (MSG_NOTE, ", but other checks only allow"
 659                              " a depth of %d", loop_depth (li.outermost));
 660             }
 661           dump_printf (MSG_NOTE, "\n");
 662         }
 663
 664       m_num_conditions += 1;
 665     }
 666   else
 667     {
 668       /* This is a duplicate request.  */
 669       if (dump_enabled_p ())
 670         dump_printf_loc (MSG_NOTE, stmt, "already asked to version containing"
 671                          " loop for when %T == 1\n", name);
 672     }
 673 }
 674
 675 /* Return true if OP1_TREE is constant and if in principle it is worth
 676    versioning an address fragment of the form:
 677
 678      i * OP1_TREE * OP2 * stride
 679
 680    for the case in which stride == 1.  This in practice means testing
 681    whether:
 682
 683      OP1_TREE * OP2 <= M_MAXIMUM_SCALE.
 684
 685    If RESULT is nonnull, store OP1_TREE * OP2 there when returning true.  */
 686
 687 bool
 688 loop_versioning::acceptable_multiplier_p (tree op1_tree,
 689                                           unsigned HOST_WIDE_INT op2,
 690                                           unsigned HOST_WIDE_INT *result)
 691 {
 692   if (tree_fits_uhwi_p (op1_tree))
 693     {
 694       unsigned HOST_WIDE_INT op1 = tree_to_uhwi (op1_tree);
 695       /* The first part checks for overflow.  */
 696       if (op1 * op2 >= op2 && op1 * op2 <= m_maximum_scale)
 697         {
 698           if (result)
 699             *result = op1 * op2;
 700           return true;
 701         }
 702     }
 703   return false;
 704 }
 705
 706 /* Return true if it is worth using loop versioning on a memory access
 707    of type TYPE.  Store the size of the access in *SIZE if so.  */
 708
 709 bool
 710 loop_versioning::acceptable_type_p (tree type, unsigned HOST_WIDE_INT *size)
 711 {
 712   return (TYPE_SIZE_UNIT (type)
 713           && acceptable_multiplier_p (TYPE_SIZE_UNIT (type), 1, size));
 714 }
 715
 716 /* See whether OP is constant and whether we can multiply TERM by that
 717    constant without exceeding M_MAXIMUM_SCALE.  Return true and update
 718    TERM if so.  */
 719
 720 bool
 721 loop_versioning::multiply_term_by (address_term_info &term, tree op)
 722 {
 723   return acceptable_multiplier_p (op, term.multiplier, &term.multiplier);
 724 }
 725
 726 /* Decide whether an address fragment of the form STRIDE * MULTIPLIER
 727    is likely to be indexing an innermost dimension, returning the result
 728    as an INNER_* probability.  */
 729
 730 inner_likelihood
 731 loop_versioning::get_inner_likelihood (tree stride,
 732                                        unsigned HOST_WIDE_INT multiplier)
 733 {
 734   const unsigned int MAX_NITERS = 8;
 735
 736   /* Iterate over possible values of STRIDE.  Return INNER_LIKELY if at
 737      least one of those values is likely to be for the innermost dimension.
 738      Record in UNLIKELY_P if at least one of those values is unlikely to be
 739      for the innermost dimension.
 740
 741      E.g. for:
 742
 743        stride = cond ? a * b : 1
 744
 745      we should treat STRIDE as being a likely inner dimension, since
 746      we know that it is 1 under at least some circumstances.  (See the
 747      Fortran example below.)  However:
 748
 749        stride = a * b
 750
 751      on its own is unlikely to be for the innermost dimension, since
 752      that would require both a and b to be 1 at runtime.  */
 753   bool unlikely_p = false;
 754   tree worklist[MAX_NITERS];
 755   unsigned int length = 0;
 756   worklist[length++] = stride;
 757   for (unsigned int i = 0; i < length; ++i)
 758     {
 759       tree expr = worklist[i];
 760
 761       if (CONSTANT_CLASS_P (expr))
 762         {
 763           /* See if EXPR * MULTIPLIER would be consistent with an individual
 764              access or a small grouped access.  */
 765           if (acceptable_multiplier_p (expr, multiplier))
 766             return INNER_LIKELY;
 767           else
 768             unlikely_p = true;
 769         }
 770       else if (gimple *stmt = maybe_get_stmt (expr))
 771         {
 772           /* If EXPR is set by a PHI node, queue its arguments in case
 773              we find one that is consistent with an inner dimension.
 774
 775              An important instance of this is the Fortran handling of array
 776              descriptors, which calculates the stride of the inner dimension
 777              using a PHI equivalent of:
 778
 779                 raw_stride = a.dim[0].stride;
 780                 stride = raw_stride != 0 ? raw_stride : 1;
 781
 782              (Strides for outer dimensions do not treat 0 specially.)  */
 783           if (gphi *phi = dyn_cast <gphi *> (stmt))
 784             {
 785               unsigned int nargs = gimple_phi_num_args (phi);
 786               for (unsigned int j = 0; j < nargs && length < MAX_NITERS; ++j)
 787                 worklist[length++] = strip_casts (gimple_phi_arg_def (phi, j));
 788             }
 789           /* If the value is set by an assignment, expect it to be read
 790              from memory (such as an array descriptor) rather than be
 791              calculated.  */
 792           else if (gassign *assign = dyn_cast <gassign *> (stmt))
 793             {
 794               if (!gimple_assign_load_p (assign))
 795                 unlikely_p = true;
 796             }
 797           /* Things like calls don't really tell us anything.  */
 798         }
 799     }
 800
 801   /* We didn't find any possible values of STRIDE that were likely to be
 802      for the innermost dimension.  If we found one that was actively
 803      unlikely to be for the innermost dimension, assume that that applies
 804      to STRIDE too.  */
 805   return unlikely_p ? INNER_UNLIKELY : INNER_DONT_KNOW;
 806 }
 807
 808 /* Dump the likelihood that TERM's stride is for the innermost dimension.
 809    ADDRESS is the address that contains TERM.  */
 810
 811 void
 812 loop_versioning::dump_inner_likelihood (address_info &address,
 813                                         address_term_info &term)
 814 {
 815   if (term.inner_likelihood == INNER_LIKELY)
 816     dump_printf_loc (MSG_NOTE, address.stmt, "%T is likely to be the"
 817                      " innermost dimension\n", term.stride);
 818   else if (term.inner_likelihood == INNER_UNLIKELY)
 819     dump_printf_loc (MSG_NOTE, address.stmt, "%T is probably not the"
 820                      " innermost dimension\n", term.stride);
 821   else
 822     dump_printf_loc (MSG_NOTE, address.stmt, "cannot tell whether %T"
 823                      " is the innermost dimension\n", term.stride);
 824 }
 825
 826 /* The caller has identified that STRIDE is the stride of interest
 827    in TERM, and that the stride is applied in OP_LOOP.  Record this
 828    information in TERM, deciding whether STRIDE is likely to be for
 829    the innermost dimension of an array and whether it represents a
 830    versioning opportunity.  ADDRESS is the address that contains TERM.  */
 831
 832 void
 833 loop_versioning::analyze_stride (address_info &address,
 834                                  address_term_info &term,
 835                                  tree stride, struct loop *op_loop)
 836 {
 837   term.stride = stride;
 838
 839   term.inner_likelihood = get_inner_likelihood (stride, term.multiplier);
 840   if (dump_enabled_p ())
 841     dump_inner_likelihood (address, term);
 842
 843   /* To be a versioning opportunity we require:
 844
 845      - The multiplier applied by TERM is equal to the access size,
 846        so that when STRIDE is 1, the accesses in successive loop
 847        iterations are consecutive.
 848
 849        This is deliberately conservative.  We could relax it to handle
 850        other cases (such as those with gaps between iterations) if we
 851        find any real testcases for which it's useful.
 852
 853      - the stride is applied in the same loop as STMT rather than
 854        in an outer loop.  Although versioning for strides applied in
 855        outer loops could help in some cases -- such as enabling
 856        more loop interchange -- the savings are much lower than for
 857        inner loops.
 858
 859      - the stride is an SSA name that is invariant in STMT's loop,
 860        since otherwise versioning isn't possible.  */
 861   unsigned HOST_WIDE_INT access_size = address.max_offset - address.min_offset;
 862   if (term.multiplier == access_size
 863       && address.loop == op_loop
 864       && TREE_CODE (stride) == SSA_NAME
 865       && expr_invariant_in_loop_p (address.loop, stride))
 866     {
 867       term.versioning_opportunity_p = true;
 868       if (dump_enabled_p ())
 869         dump_printf_loc (MSG_NOTE, address.stmt, "%T == 1 is a versioning"
 870                          " opportunity\n", stride);
 871     }
 872 }
 873
 874 /* See whether address term TERM (which belongs to ADDRESS) is the result
 875    of multiplying a varying SSA name by a loop-invariant SSA name.
 876    Return true and update TERM if so.
 877
 878    This handles both cases that SCEV might handle, such as:
 879
 880      for (int i = 0; i < n; ++i)
 881        res += a[i * stride];
 882
 883    and ones in which the term varies arbitrarily between iterations, such as:
 884
 885      for (int i = 0; i < n; ++i)
 886        res += a[index[i] * stride];  */
 887
 888 bool
 889 loop_versioning::find_per_loop_multiplication (address_info &address,
 890                                                address_term_info &term)
 891 {
 892   gassign *mult = maybe_get_assign (term.expr);
 893   if (!mult || gimple_assign_rhs_code (mult) != MULT_EXPR)
 894     return false;
 895
 896   struct loop *mult_loop = loop_containing_stmt (mult);
 897   if (!loop_outer (mult_loop))
 898     return false;
 899
 900   tree op1 = strip_casts (gimple_assign_rhs1 (mult));
 901   tree op2 = strip_casts (gimple_assign_rhs2 (mult));
 902   if (TREE_CODE (op1) != SSA_NAME || TREE_CODE (op2) != SSA_NAME)
 903     return false;
 904
 905   bool invariant1_p = expr_invariant_in_loop_p (mult_loop, op1);
 906   bool invariant2_p = expr_invariant_in_loop_p (mult_loop, op2);
 907   if (invariant1_p == invariant2_p)
 908     return false;
 909
 910   /* Make sure that the loop invariant is OP2 rather than OP1.  */
 911   if (invariant1_p)
 912     std::swap (op1, op2);
 913
 914   if (dump_enabled_p ())
 915     dump_printf_loc (MSG_NOTE, address.stmt, "address term %T = varying %T"
 916                      " * loop-invariant %T\n", term.expr, op1, op2);
 917   analyze_stride (address, term, op2, mult_loop);
 918   return true;
 919 }
 920
 921 /* Try to use scalar evolutions to find an address stride for TERM,
 922    which belongs to ADDRESS.  Return true and update TERM if so.
 923
 924    Here we are interested in any evolution information we can find,
 925    not just evolutions wrt ADDRESS->LOOP.  For example, if we find that
 926    an outer loop obviously iterates over the inner dimension of an array,
 927    that information can help us eliminate worthless versioning opportunities
 928    in inner loops.  */
 929
 930 bool
 931 loop_versioning::analyze_term_using_scevs (address_info &address,
 932                                            address_term_info &term)
 933 {
 934   gimple *setter = maybe_get_stmt (term.expr);
 935   if (!setter)
 936     return false;
 937
 938   struct loop *wrt_loop = loop_containing_stmt (setter);
 939   if (!loop_outer (wrt_loop))
 940     return false;
 941
 942   tree chrec = strip_casts (analyze_scalar_evolution (wrt_loop, term.expr));
 943   if (TREE_CODE (chrec) == POLYNOMIAL_CHREC)
 944     {
 945       if (dump_enabled_p ())
 946         dump_printf_loc (MSG_NOTE, address.stmt,
 947                          "address term %T = %T\n", term.expr, chrec);
 948
 949       /* Peel casts and accumulate constant multiplications, up to the
 950          limit allowed by M_MAXIMUM_SCALE.  */
 951       tree stride = strip_casts (CHREC_RIGHT (chrec));
 952       while (TREE_CODE (stride) == MULT_EXPR
 953              && multiply_term_by (term, TREE_OPERAND (stride, 1)))
 954         stride = strip_casts (TREE_OPERAND (stride, 0));
 955
 956       gassign *assign;
 957       while ((assign = maybe_get_assign (stride))
 958              && gimple_assign_rhs_code (assign) == MULT_EXPR
 959              && multiply_term_by (term, gimple_assign_rhs2 (assign)))
 960         {
 961           if (dump_enabled_p ())
 962             dump_printf_loc (MSG_NOTE, address.stmt,
 963                              "looking through %G", assign);
 964           stride = strip_casts (gimple_assign_rhs1 (assign));
 965         }
 966
 967       analyze_stride (address, term, stride, get_chrec_loop (chrec));
 968       return true;
 969     }
 970
 971   return false;
 972 }
 973
 974 /* Address term TERM is an arbitrary term that provides no versioning
 975    opportunities.  Analyze it to see whether it contains any likely
 976    inner strides, so that we don't mistakenly version for other
 977    (less likely) candidates.
 978
 979    This copes with invariant innermost indices such as:
 980
 981      x(i, :) = 100
 982
 983    where the "i" component of the address is invariant in the loop
 984    but provides the real inner stride.
 985
 986    ADDRESS is the address that contains TERM.  */
 987
 988 void
 989 loop_versioning::analyze_arbitrary_term (address_info &address,
 990                                          address_term_info &term)
 991
 992 {
 993   /* A multiplication offers two potential strides.  Pick the one that
 994      is most likely to be an innermost stride.  */
 995   tree expr = term.expr, alt = NULL_TREE;
 996   gassign *mult = maybe_get_assign (expr);
 997   if (mult && gimple_assign_rhs_code (mult) == MULT_EXPR)
 998     {
 999       expr = strip_casts (gimple_assign_rhs1 (mult));
1000       alt = strip_casts (gimple_assign_rhs2 (mult));
1001     }
1002   term.stride = expr;
1003   term.inner_likelihood = get_inner_likelihood (expr, term.multiplier);
1004   if (alt)
1005     {
1006       inner_likelihood alt_l = get_inner_likelihood (alt, term.multiplier);
1007       if (alt_l > term.inner_likelihood)
1008         {
1009           term.stride = alt;
1010           term.inner_likelihood = alt_l;
1011         }
1012     }
1013   if (dump_enabled_p ())
1014     dump_inner_likelihood (address, term);
1015 }
1016
1017 /* Try to identify loop strides in ADDRESS and try to choose realistic
1018    versioning opportunities based on these strides.
1019
1020    The main difficulty here isn't finding strides that could be used
1021    in a version check (that's pretty easy).  The problem instead is to
1022    avoid versioning for some stride S that is unlikely ever to be 1 at
1023    runtime.  Versioning for S == 1 on its own would lead to unnecessary
1024    code bloat, while adding S == 1 to more realistic version conditions
1025    would lose the optimisation opportunity offered by those other conditions.
1026
1027    For example, versioning for a stride of 1 in the Fortran code:
1028
1029      integer :: a(:,:)
1030      a(1,:) = 1
1031
1032    is not usually a good idea, since the assignment is iterating over
1033    an outer dimension and is relatively unlikely to have a stride of 1.
1034    (It isn't impossible, since the inner dimension might be 1, or the
1035    array might be transposed.)  Similarly, in:
1036
1037      integer :: a(:,:), b(:,:)
1038      b(:,1) = a(1,:)
1039
1040    b(:,1) is relatively likely to have a stride of 1 while a(1,:) isn't.
1041    Versioning for when both strides are 1 would lose most of the benefit
1042    of versioning for b's access.
1043
1044    The approach we take is as follows:
1045
1046    - Analyze each term to see whether it has an identifiable stride,
1047      regardless of which loop applies the stride.
1048
1049    - Evaluate the likelihood that each such stride is for the innermost
1050      dimension of an array, on the scale "likely", "don't know" or "unlikely".
1051
1052    - If there is a single "likely" innermost stride, and that stride is
1053      applied in the loop that contains STMT, version the loop for when the
1054      stride is 1.  This deals with the cases in which we're fairly
1055      confident of doing the right thing, such as the b(:,1) reference above.
1056
1057    - If there are no "likely" innermost strides, and the loop that contains
1058      STMT uses a stride that we rated as "don't know", version for when
1059      that stride is 1.  This is principally used for C code such as:
1060
1061        for (int i = 0; i < n; ++i)
1062          a[i * x] = ...;
1063
1064      and:
1065
1066        for (int j = 0; j < n; ++j)
1067          for (int i = 0; i < n; ++i)
1068            a[i * x + j * y] = ...;
1069
1070      where nothing in the way "x" and "y" are set gives a hint as to
1071      whether "i" iterates over the innermost dimension of the array.
1072      In these situations it seems reasonable to assume the the
1073      programmer has nested the loops appropriately (although of course
1074      there are examples like GEMM in which this assumption doesn't hold
1075      for all accesses in the loop).
1076
1077      This case is also useful for the Fortran equivalent of the
1078      above C code.  */
1079
1080 void
1081 loop_versioning::analyze_address_fragment (address_info &address)
1082 {
1083   if (dump_enabled_p ())
1084     {
1085       dump_printf_loc (MSG_NOTE, address.stmt, "analyzing address fragment ");
1086       dump_address_info (MSG_NOTE, address);
1087       dump_printf (MSG_NOTE, "\n");
1088     }
1089
1090   /* Analyze each component of the sum to see whether it involves an
1091      apparent stride.
1092
1093      There is an overlap between the addresses that
1094      find_per_loop_multiplication and analyze_term_using_scevs can handle,
1095      but the former is much cheaper than SCEV analysis, so try it first.  */
1096   for (unsigned int i = 0; i < address.terms.length (); ++i)
1097     if (!find_per_loop_multiplication (address, address.terms[i])
1098         && !analyze_term_using_scevs (address, address.terms[i])
1099         && !POINTER_TYPE_P (TREE_TYPE (address.terms[i].expr)))
1100       analyze_arbitrary_term (address, address.terms[i]);
1101
1102   /* Check for strides that are likely to be for the innermost dimension.
1103
1104      1. If there is a single likely inner stride, if it is an SSA name,
1105         and if it is worth versioning the loop for when the SSA name
1106         equals 1, record that we want to do so.
1107
1108      2. Otherwise, if there any likely inner strides, bail out.  This means
1109         one of:
1110
1111         (a) There are multiple likely inner strides.  This suggests we're
1112             confused and be can't be confident of doing the right thing.
1113
1114         (b) There is a single likely inner stride and it is a constant
1115             rather than an SSA name.  This can mean either that the access
1116             is a natural one without any variable strides, such as:
1117
1118               for (int i = 0; i < n; ++i)
1119                 a[i] += 1;
1120
1121             or that a variable stride is applied to an outer dimension,
1122             such as:
1123
1124               for (int i = 0; i < n; ++i)
1125                 for (int j = 0; j < n; ++j)
1126                   a[j * stride][i] += 1;
1127
1128         (c) There is a single likely inner stride, and it is an SSA name,
1129             but it isn't a worthwhile versioning opportunity.  This usually
1130             means that the variable stride is applied by an outer loop,
1131             such as:
1132
1133               for (int i = 0; i < n; ++i)
1134                 for (int j = 0; j < n; ++j)
1135                   a[j][i * stride] += 1;
1136
1137             or (using an example with a more natural loop nesting):
1138
1139               for (int i = 0; i < n; ++i)
1140                 for (int j = 0; j < n; ++j)
1141                   a[i][j] += b[i * stride];
1142
1143             in cases where b[i * stride] cannot (yet) be hoisted for
1144             aliasing reasons.
1145
1146      3. If there are no likely inner strides, fall through to the next
1147         set of checks.
1148
1149      Pointer equality is enough to check for uniqueness in (1), since we
1150      only care about SSA names.  */
1151   tree chosen_stride = NULL_TREE;
1152   tree version_stride = NULL_TREE;
1153   for (unsigned int i = 0; i < address.terms.length (); ++i)
1154     if (chosen_stride != address.terms[i].stride
1155         && address.terms[i].inner_likelihood == INNER_LIKELY)
1156       {
1157         if (chosen_stride)
1158           return;
1159         chosen_stride = address.terms[i].stride;
1160         if (address.terms[i].versioning_opportunity_p)
1161           version_stride = chosen_stride;
1162       }
1163
1164   /* If there are no likely inner strides, see if there is a single
1165      versioning opportunity for a stride that was rated as INNER_DONT_KNOW.
1166      See the comment above the function for the cases that this code
1167      handles.  */
1168   if (!chosen_stride)
1169     for (unsigned int i = 0; i < address.terms.length (); ++i)
1170       if (version_stride != address.terms[i].stride
1171           && address.terms[i].inner_likelihood == INNER_DONT_KNOW
1172           && address.terms[i].versioning_opportunity_p)
1173         {
1174           if (version_stride)
1175             return;
1176           version_stride = address.terms[i].stride;
1177         }
1178
1179   if (version_stride)
1180     version_for_unity (address.stmt, version_stride);
1181 }
1182
1183 /* Treat EXPR * MULTIPLIER + OFFSET as a fragment of an address that addresses
1184    TYPE_SIZE bytes and record this address fragment for later processing.
1185    STMT is the statement that contains the address.  */
1186
1187 void
1188 loop_versioning::record_address_fragment (gimple *stmt,
1189                                           unsigned HOST_WIDE_INT type_size,
1190                                           tree expr,
1191                                           unsigned HOST_WIDE_INT multiplier,
1192                                           HOST_WIDE_INT offset)
1193 {
1194   /* We're only interested in computed values.  */
1195   if (TREE_CODE (expr) != SSA_NAME)
1196     return;
1197
1198   /* Quick exit if no part of the address is calculated in STMT's loop,
1199      since such addresses have no versioning opportunities.  */
1200   struct loop *loop = loop_containing_stmt (stmt);
1201   if (expr_invariant_in_loop_p (loop, expr))
1202     return;
1203
1204   /* Set up an address_info for EXPR * MULTIPLIER.  */
1205   address_info *address = XOBNEW (&m_obstack, address_info);
1206   new (address) address_info;
1207   address->stmt = stmt;
1208   address->loop = loop;
1209   address->base = NULL_TREE;
1210   address->terms.quick_grow (1);
1211   address->terms[0].expr = expr;
1212   address->terms[0].multiplier = multiplier;
1213   address->terms[0].stride = NULL_TREE;
1214   address->terms[0].inner_likelihood = INNER_UNLIKELY;
1215   address->terms[0].versioning_opportunity_p = false;
1216   address->min_offset = offset;
1217
1218   /* Peel apart the expression into a sum of address_terms, where each
1219      term is multiplied by a constant.  Treat a + b and a - b the same,
1220      since it doesn't matter for our purposes whether an address is
1221      increasing or decreasing.  Distribute (a + b) * constant into
1222      a * constant + b * constant.
1223
1224      We don't care which loop each term belongs to, since we want to
1225      examine as many candidate strides as possible when determining
1226      which is likely to be for the innermost dimension.  We therefore
1227      don't limit the search to statements in STMT's loop.  */
1228   for (unsigned int i = 0; i < address->terms.length (); )
1229     {
1230       if (gassign *assign = maybe_get_assign (address->terms[i].expr))
1231         {
1232           tree_code code = gimple_assign_rhs_code (assign);
1233           if (code == PLUS_EXPR
1234               || code == POINTER_PLUS_EXPR
1235               || code == MINUS_EXPR)
1236             {
1237               tree op1 = gimple_assign_rhs1 (assign);
1238               tree op2 = gimple_assign_rhs2 (assign);
1239               if (TREE_CODE (op2) == INTEGER_CST)
1240                 {
1241                   address->terms[i].expr = strip_casts (op1);
1242                   /* This is heuristic only, so don't worry about truncation
1243                      or overflow.  */
1244                   address->min_offset += (TREE_INT_CST_LOW (op2)
1245                                           * address->terms[i].multiplier);
1246                   continue;
1247                 }
1248               else if (address->terms.length () < address_info::MAX_TERMS)
1249                 {
1250                   unsigned int j = address->terms.length ();
1251                   address->terms.quick_push (address->terms[i]);
1252                   address->terms[i].expr = strip_casts (op1);
1253                   address->terms[j].expr = strip_casts (op2);
1254                   continue;
1255                 }
1256             }
1257           if (code == MULT_EXPR)
1258             {
1259               tree op1 = gimple_assign_rhs1 (assign);
1260               tree op2 = gimple_assign_rhs2 (assign);
1261               if (multiply_term_by (address->terms[i], op2))
1262                 {
1263                   address->terms[i].expr = strip_casts (op1);
1264                   continue;
1265                 }
1266             }
1267         }
1268       i += 1;
1269     }
1270
1271   /* Peel off any symbolic pointer.  */
1272   if (TREE_CODE (address->terms[0].expr) != SSA_NAME
1273       && address->terms[0].multiplier == 1)
1274     {
1275       if (address->terms.length () == 1)
1276         {
1277           obstack_free (&m_obstack, address);
1278           return;
1279         }
1280       address->base = address->terms[0].expr;
1281       address->terms.ordered_remove (0);
1282     }
1283
1284   /* Require all remaining terms to be SSA names.  (This could be false
1285      for unfolded statements, but they aren't worth dealing with.)  */
1286   for (unsigned int i = 0; i < address->terms.length (); ++i)
1287     if (TREE_CODE (address->terms[i].expr) != SSA_NAME)
1288       {
1289         obstack_free (&m_obstack, address);
1290         return;
1291       }
1292
1293   /* The loop above set MIN_OFFSET based on the first byte of the
1294      referenced data.  Calculate the end + 1.  */
1295   address->max_offset = address->min_offset + type_size;
1296
1297   /* Put the terms into a canonical order for the hash table lookup below.  */
1298   address->terms.qsort (compare_address_terms);
1299
1300   if (dump_enabled_p ())
1301     {
1302       dump_printf_loc (MSG_NOTE, stmt, "recording address fragment %T", expr);
1303       if (multiplier != 1)
1304         dump_printf (MSG_NOTE, " * %wd", multiplier);
1305       dump_printf (MSG_NOTE, " = ");
1306       dump_address_info (MSG_NOTE, *address);
1307       dump_printf (MSG_NOTE, "\n");
1308     }
1309
1310   /* Pool address information with the same terms (but potentially
1311      different offsets).  */
1312   address_info **slot = m_address_table.find_slot (address, INSERT);
1313   if (address_info *old_address = *slot)
1314     {
1315       /* We've already seen an address with the same terms.  Extend the
1316          offset range to account for this access.  Doing this can paper
1317          over gaps, such as in:
1318
1319            a[i * stride * 4] + a[i * stride * 4 + 3];
1320
1321          where nothing references "+ 1" or "+ 2".  However, the vectorizer
1322          handles such gapped accesses without problems, so it's not worth
1323          trying to exclude them.  */
1324       if (old_address->min_offset > address->min_offset)
1325         old_address->min_offset = address->min_offset;
1326       if (old_address->max_offset < address->max_offset)
1327         old_address->max_offset = address->max_offset;
1328       obstack_free (&m_obstack, address);
1329     }
1330   else
1331     {
1332       /* This is the first time we've seen an address with these terms.  */
1333       *slot = address;
1334       m_address_list.safe_push (address);
1335     }
1336 }
1337
1338 /* Analyze expression EXPR, which occurs in STMT.  */
1339
1340 void
1341 loop_versioning::analyze_expr (gimple *stmt, tree expr)
1342 {
1343   unsigned HOST_WIDE_INT type_size;
1344
1345   while (handled_component_p (expr))
1346     {
1347       /* See whether we can use versioning to avoid a multiplication
1348          in an array index.  */
1349       if (TREE_CODE (expr) == ARRAY_REF
1350           && acceptable_type_p (TREE_TYPE (expr), &type_size))
1351         record_address_fragment (stmt, type_size,
1352                                  TREE_OPERAND (expr, 1), type_size, 0);
1353       expr = TREE_OPERAND (expr, 0);
1354     }
1355
1356   /* See whether we can use versioning to avoid a multiplication
1357      in the pointer calculation of a MEM_REF.  */
1358   if (TREE_CODE (expr) == MEM_REF
1359       && acceptable_type_p (TREE_TYPE (expr), &type_size))
1360     record_address_fragment (stmt, type_size, TREE_OPERAND (expr, 0), 1,
1361                              /* This is heuristic only, so don't worry
1362                                 about truncation or overflow.  */
1363                              TREE_INT_CST_LOW (TREE_OPERAND (expr, 1)));
1364
1365   /* These would be easy to handle if they existed at this stage.  */
1366   gcc_checking_assert (TREE_CODE (expr) != TARGET_MEM_REF);
1367 }
1368
1369 /* Analyze all the statements in BB looking for useful version checks.
1370    Return true on success, false if something prevents the block from
1371    being versioned.  */
1372
1373 bool
1374 loop_versioning::analyze_block (basic_block bb)
1375 {
1376   struct loop *loop = bb->loop_father;
1377   loop_info &li = get_loop_info (loop);
1378   for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
1379        gsi_next (&gsi))
1380     {
1381       gimple *stmt = gsi_stmt (gsi);
1382       if (is_gimple_debug (stmt))
1383         continue;
1384
1385       if (expensive_stmt_p (stmt))
1386         {
1387           if (dump_enabled_p ())
1388             dump_printf_loc (MSG_NOTE, stmt, "expensive statement"
1389                              " prevents versioning: %G", stmt);
1390           return false;
1391         }
1392
1393       /* Only look for direct versioning opportunities in inner loops
1394          since the benefit tends to be much smaller for outer loops.  */
1395       if (!loop->inner)
1396         {
1397           unsigned int nops = gimple_num_ops (stmt);
1398           for (unsigned int i = 0; i < nops; ++i)
1399             if (tree op = gimple_op (stmt, i))
1400               analyze_expr (stmt, op);
1401         }
1402
1403       /* The point of the instruction limit is to prevent excessive
1404          code growth, so this is a size-based estimate even though
1405          the optimization is aimed at speed.  */
1406       li.num_insns += estimate_num_insns (stmt, &eni_size_weights);
1407     }
1408
1409   return true;
1410 }
1411
1412 /* Analyze all the blocks in the function, looking for useful version checks.
1413    Return true if we found one.  */
1414
1415 bool
1416 loop_versioning::analyze_blocks ()
1417 {
1418   AUTO_DUMP_SCOPE ("analyze_blocks",
1419                    dump_user_location_t::from_function_decl (m_fn->decl));
1420
1421   /* For now we don't try to version the whole function, although
1422      versioning at that level could be useful in some cases.  */
1423   get_loop_info (get_loop (m_fn, 0)).rejected_p = true;
1424
1425   struct loop *loop;
1426   FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
1427     {
1428       loop_info &linfo = get_loop_info (loop);
1429
1430       /* Ignore cold loops.  */
1431       if (!optimize_loop_for_speed_p (loop))
1432         linfo.rejected_p = true;
1433
1434       /* See whether an inner loop prevents versioning of this loop.  */
1435       if (!linfo.rejected_p)
1436         for (struct loop *inner = loop->inner; inner; inner = inner->next)
1437           if (get_loop_info (inner).rejected_p)
1438             {
1439               linfo.rejected_p = true;
1440               break;
1441             }
1442
1443       /* If versioning the loop is still a possibility, examine the
1444          statements in the loop to look for versioning opportunities.  */
1445       if (!linfo.rejected_p)
1446         {
1447           void *start_point = obstack_alloc (&m_obstack, 0);
1448
1449           for (basic_block bb = linfo.block_list; bb;
1450                bb = m_next_block_in_loop[bb->index])
1451             if (!analyze_block (bb))
1452               {
1453                 linfo.rejected_p = true;
1454                 break;
1455             }
1456
1457           if (!linfo.rejected_p)
1458             {
1459               /* Process any queued address fragments, now that we have
1460                  complete grouping information.  */
1461               address_info *address;
1462               unsigned int i;
1463               FOR_EACH_VEC_ELT (m_address_list, i, address)
1464                 analyze_address_fragment (*address);
1465             }
1466
1467           m_address_table.empty ();
1468           m_address_list.truncate (0);
1469           obstack_free (&m_obstack, start_point);
1470         }
1471     }
1472
1473   return m_num_conditions != 0;
1474 }
1475
1476 /* Use the ranges in VRS to remove impossible versioning conditions from
1477    LOOP.  */
1478
1479 void
1480 loop_versioning::prune_loop_conditions (struct loop *loop, vr_values *vrs)
1481 {
1482   loop_info &li = get_loop_info (loop);
1483
1484   int to_remove = -1;
1485   bitmap_iterator bi;
1486   unsigned int i;
1487   EXECUTE_IF_SET_IN_BITMAP (&li.unity_names, 0, i, bi)
1488     {
1489       tree name = ssa_name (i);
1490       value_range *vr = vrs->get_value_range (name);
1491       if (vr && !range_includes_p (vr, 1))
1492         {
1493           if (dump_enabled_p ())
1494             dump_printf_loc (MSG_NOTE, find_loop_location (loop),
1495                              "%T can never be 1 in this loop\n", name);
1496
1497           if (to_remove >= 0)
1498             bitmap_clear_bit (&li.unity_names, to_remove);
1499           to_remove = i;
1500           m_num_conditions -= 1;
1501         }
1502     }
1503   if (to_remove >= 0)
1504     bitmap_clear_bit (&li.unity_names, to_remove);
1505 }
1506
1507 /* Remove any scheduled loop version conditions that will never be true.
1508    Return true if any remain.  */
1509
1510 bool
1511 loop_versioning::prune_conditions ()
1512 {
1513   AUTO_DUMP_SCOPE ("prune_loop_conditions",
1514                    dump_user_location_t::from_function_decl (m_fn->decl));
1515
1516   calculate_dominance_info (CDI_DOMINATORS);
1517   lv_dom_walker dom_walker (*this);
1518   dom_walker.walk (ENTRY_BLOCK_PTR_FOR_FN (m_fn));
1519   return m_num_conditions != 0;
1520 }
1521
1522 /* Merge the version checks for INNER into immediately-enclosing loop
1523    OUTER.  */
1524
1525 void
1526 loop_versioning::merge_loop_info (struct loop *outer, struct loop *inner)
1527 {
1528   loop_info &inner_li = get_loop_info (inner);
1529   loop_info &outer_li = get_loop_info (outer);
1530
1531   if (dump_enabled_p ())
1532     {
1533       bitmap_iterator bi;
1534       unsigned int i;
1535       EXECUTE_IF_SET_IN_BITMAP (&inner_li.unity_names, 0, i, bi)
1536         if (!bitmap_bit_p (&outer_li.unity_names, i))
1537           dump_printf_loc (MSG_NOTE, find_loop_location (inner),
1538                            "hoisting check that %T == 1 to outer loop\n",
1539                            ssa_name (i));
1540     }
1541
1542   bitmap_ior_into (&outer_li.unity_names, &inner_li.unity_names);
1543   if (loop_depth (outer_li.outermost) < loop_depth (inner_li.outermost))
1544     outer_li.outermost = inner_li.outermost;
1545 }
1546
1547 /* Add LOOP to the queue of loops to version.  */
1548
1549 void
1550 loop_versioning::add_loop_to_queue (struct loop *loop)
1551 {
1552   loop_info &li = get_loop_info (loop);
1553
1554   if (dump_enabled_p ())
1555     dump_printf_loc (MSG_NOTE, find_loop_location (loop),
1556                      "queuing this loop for versioning\n");
1557   m_loops_to_version.safe_push (loop);
1558
1559   /* Don't try to version superloops.  */
1560   li.rejected_p = true;
1561 }
1562
1563 /* Decide whether the cost model would allow us to version LOOP,
1564    either directly or as part of a parent loop, and return true if so.
1565    This does not imply that the loop is actually worth versioning in its
1566    own right, just that it would be valid to version it if something
1567    benefited.
1568
1569    We have already made this decision for all inner loops of LOOP.  */
1570
1571 bool
1572 loop_versioning::decide_whether_loop_is_versionable (struct loop *loop)
1573 {
1574   loop_info &li = get_loop_info (loop);
1575
1576   if (li.rejected_p)
1577     return false;
1578
1579   /* Examine the decisions made for inner loops.  */
1580   for (struct loop *inner = loop->inner; inner; inner = inner->next)
1581     {
1582       loop_info &inner_li = get_loop_info (inner);
1583       if (inner_li.rejected_p)
1584         {
1585           if (dump_enabled_p ())
1586             dump_printf_loc (MSG_NOTE, find_loop_location (loop),
1587                              "not versioning this loop because one of its"
1588                              " inner loops should not be versioned\n");
1589           return false;
1590         }
1591
1592       if (inner_li.worth_versioning_p ())
1593         li.subloops_benefit_p = true;
1594
1595       /* Accumulate the number of instructions from subloops that are not
1596          the innermost, or that don't benefit from versioning.  Only the
1597          instructions from innermost loops that benefit from versioning
1598          should be weighed against loop-versioning-max-inner-insns;
1599          everything else should be weighed against
1600          loop-versioning-max-outer-insns.  */
1601       if (!inner_li.worth_versioning_p () || inner->inner)
1602         {
1603           if (dump_enabled_p ())
1604             dump_printf_loc (MSG_NOTE, find_loop_location (loop),
1605                              "counting %d instructions from this loop"
1606                              " against its parent loop\n", inner_li.num_insns);
1607           li.num_insns += inner_li.num_insns;
1608         }
1609     }
1610
1611   /* Enforce the size limits.  */
1612   if (li.worth_versioning_p ())
1613     {
1614       unsigned int max_num_insns = max_insns_for_loop (loop);
1615       if (dump_enabled_p ())
1616         dump_printf_loc (MSG_NOTE, find_loop_location (loop),
1617                          "this loop has %d instructions, against"
1618                          " a versioning limit of %d\n",
1619                          li.num_insns, max_num_insns);
1620       if (li.num_insns > max_num_insns)
1621         {
1622           if (dump_enabled_p ())
1623             dump_printf_loc (MSG_MISSED_OPTIMIZATION
1624                              | MSG_PRIORITY_USER_FACING,
1625                              find_loop_location (loop),
1626                              "this loop is too big to version");
1627           return false;
1628         }
1629     }
1630
1631   /* Hoist all version checks from subloops to this loop.  */
1632   for (struct loop *subloop = loop->inner; subloop; subloop = subloop->next)
1633     merge_loop_info (loop, subloop);
1634
1635   return true;
1636 }
1637
1638 /* Decide which loops to version and add them to the versioning queue.
1639    Return true if there are any loops to version.  */
1640
1641 bool
1642 loop_versioning::make_versioning_decisions ()
1643 {
1644   AUTO_DUMP_SCOPE ("make_versioning_decisions",
1645                    dump_user_location_t::from_function_decl (m_fn->decl));
1646
1647   struct loop *loop;
1648   FOR_EACH_LOOP (loop, LI_FROM_INNERMOST)
1649     {
1650       loop_info &linfo = get_loop_info (loop);
1651       if (decide_whether_loop_is_versionable (loop))
1652         {
1653           /* Commit to versioning LOOP directly if we can't hoist the
1654              version checks any further.  */
1655           if (linfo.worth_versioning_p ()
1656               && (loop_depth (loop) == 1 || linfo.outermost == loop))
1657             add_loop_to_queue (loop);
1658         }
1659       else
1660         {
1661           /* We can't version this loop, so individually version any
1662              subloops that would benefit and haven't been versioned yet.  */
1663           linfo.rejected_p = true;
1664           for (struct loop *subloop = loop->inner; subloop;
1665                subloop = subloop->next)
1666             if (get_loop_info (subloop).worth_versioning_p ())
1667               add_loop_to_queue (subloop);
1668         }
1669     }
1670
1671   return !m_loops_to_version.is_empty ();
1672 }
1673
1674 /* Attempt to implement loop versioning for LOOP, using the information
1675    cached in the associated loop_info.  Return true on success.  */
1676
1677 bool
1678 loop_versioning::version_loop (struct loop *loop)
1679 {
1680   loop_info &li = get_loop_info (loop);
1681
1682   /* Build up a condition that selects the original loop instead of
1683      the simplified loop.  */
1684   tree cond = boolean_false_node;
1685   bitmap_iterator bi;
1686   unsigned int i;
1687   EXECUTE_IF_SET_IN_BITMAP (&li.unity_names, 0, i, bi)
1688     {
1689       tree name = ssa_name (i);
1690       tree ne_one = fold_build2 (NE_EXPR, boolean_type_node, name,
1691                                  build_one_cst (TREE_TYPE (name)));
1692       cond = fold_build2 (TRUTH_OR_EXPR, boolean_type_node, cond, ne_one);
1693     }
1694
1695   /* Convert the condition into a suitable gcond.  */
1696   gimple_seq stmts = NULL;
1697   cond = force_gimple_operand_1 (cond, &stmts, is_gimple_condexpr, NULL_TREE);
1698
1699   /* Version the loop.  */
1700   initialize_original_copy_tables ();
1701   basic_block cond_bb;
1702   li.optimized_loop = loop_version (loop, cond, &cond_bb,
1703                                     profile_probability::unlikely (),
1704                                     profile_probability::likely (),
1705                                     profile_probability::unlikely (),
1706                                     profile_probability::likely (), true);
1707   free_original_copy_tables ();
1708   if (!li.optimized_loop)
1709     {
1710       if (dump_enabled_p ())
1711         dump_printf_loc (MSG_MISSED_OPTIMIZATION, find_loop_location (loop),
1712                          "tried but failed to version this loop for when"
1713                          " certain strides are 1\n");
1714       return false;
1715     }
1716
1717   if (dump_enabled_p ())
1718     dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, find_loop_location (loop),
1719                      "versioned this loop for when certain strides are 1\n");
1720
1721   /* Insert the statements that feed COND.  */
1722   if (stmts)
1723     {
1724       gimple_stmt_iterator gsi = gsi_last_bb (cond_bb);
1725       gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1726     }
1727
1728   return true;
1729 }
1730
1731 /* Attempt to version all loops in the versioning queue.  */
1732
1733 void
1734 loop_versioning::implement_versioning_decisions ()
1735 {
1736   /* No AUTO_DUMP_SCOPE here since all messages are top-level and
1737      user-facing at this point.  */
1738
1739   bool any_succeeded_p = false;
1740   struct loop *loop;
1741   unsigned int i;
1742   FOR_EACH_VEC_ELT (m_loops_to_version, i, loop)
1743     if (version_loop (loop))
1744       any_succeeded_p = true;
1745   if (!any_succeeded_p)
1746     return;
1747
1748   update_ssa (TODO_update_ssa);
1749
1750   /* Simplify the new loop, which is used when COND is false.  */
1751   FOR_EACH_VEC_ELT (m_loops_to_version, i, loop)
1752     {
1753       loop_info &linfo = get_loop_info (loop);
1754       if (linfo.optimized_loop)
1755         name_prop (linfo).substitute_and_fold (linfo.optimized_loop->header);
1756     }
1757 }
1758
1759 /* Run the pass and return a set of TODO_* flags.  */
1760
1761 unsigned int
1762 loop_versioning::run ()
1763 {
1764   gcc_assert (scev_initialized_p ());
1765
1766   if (analyze_blocks ()
1767       && prune_conditions ()
1768       && make_versioning_decisions ())
1769     implement_versioning_decisions ();
1770
1771   return 0;
1772 }
1773
1774 /* Loop versioning pass.  */
1775
1776 const pass_data pass_data_loop_versioning =
1777 {
1778   GIMPLE_PASS, /* type */
1779   "lversion", /* name */
1780   OPTGROUP_LOOP, /* optinfo_flags */
1781   TV_LOOP_VERSIONING, /* tv_id */
1782   PROP_cfg, /* properties_required */
1783   0, /* properties_provided */
1784   0, /* properties_destroyed */
1785   0, /* todo_flags_start */
1786   0, /* todo_flags_finish */
1787 };
1788
1789 class pass_loop_versioning : public gimple_opt_pass
1790 {
1791 public:
1792   pass_loop_versioning (gcc::context *ctxt)
1793     : gimple_opt_pass (pass_data_loop_versioning, ctxt)
1794   {}
1795
1796   /* opt_pass methods: */
1797   virtual bool gate (function *) { return flag_version_loops_for_strides; }
1798   virtual unsigned int execute (function *);
1799 };
1800
1801 unsigned int
1802 pass_loop_versioning::execute (function *fn)
1803 {
1804   if (number_of_loops (fn) <= 1)
1805     return 0;
1806
1807   return loop_versioning (fn).run ();
1808 }
1809
1810 } // anon namespace
1811
1812 gimple_opt_pass *
1813 make_pass_loop_versioning (gcc::context *ctxt)
1814 {
1815   return new pass_loop_versioning (ctxt);
1816 }