1 /* Vectorizer Specific Loop Manipulations
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "tree.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "fold-const.h"
32 #include "cfganal.h"
33 #include "gimplify.h"
34 #include "gimple-iterator.h"
35 #include "gimplify-me.h"
36 #include "tree-cfg.h"
37 #include "tree-ssa-loop-manip.h"
38 #include "tree-into-ssa.h"
39 #include "tree-ssa.h"
40 #include "cfgloop.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "gimple-fold.h"
45 #include "tree-ssa-loop-niter.h"
46 #include "internal-fn.h"
47 #include "stor-layout.h"
48 #include "optabs-query.h"
49 #include "vec-perm-indices.h"
50
51 /*************************************************************************
52 Simple Loop Peeling Utilities
53
54 Utilities to support loop peeling for vectorization purposes.
55 *************************************************************************/
56
57
58 /* Renames the use *OP_P. */
59
60 static void
61 rename_use_op (use_operand_p op_p)
62 {
63 tree new_name;
64
65 if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
66 return;
67
68 new_name = get_current_def (USE_FROM_PTR (op_p));
69
70 /* Something defined outside of the loop. */
71 if (!new_name)
72 return;
73
74 /* An ordinary ssa name defined in the loop. */
75
76 SET_USE (op_p, new_name);
77 }
78
79
80 /* Renames the variables in basic block BB. Allow renaming of PHI arguments
81 on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
82 true. */
83
84 static void
85 rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
86 {
87 gimple *stmt;
88 use_operand_p use_p;
89 ssa_op_iter iter;
90 edge e;
91 edge_iterator ei;
92 struct loop *loop = bb->loop_father;
93 struct loop *outer_loop = NULL;
94
95 if (rename_from_outer_loop)
96 {
97 gcc_assert (loop);
98 outer_loop = loop_outer (loop);
99 }
100
101 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
102 gsi_next (&gsi))
103 {
104 stmt = gsi_stmt (gsi);
105 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
106 rename_use_op (use_p);
107 }
108
109 FOR_EACH_EDGE (e, ei, bb->preds)
110 {
111 if (!flow_bb_inside_loop_p (loop, e->src))
112 {
113 if (!rename_from_outer_loop)
114 continue;
115 if (e->src != outer_loop->header)
116 {
117 if (outer_loop->inner->next)
118 {
119 /* If outer_loop has 2 inner loops, allow there to
120 be an extra basic block which decides which of the
121 two loops to use, based on LOOP_VECTORIZED. */
122 if (!single_pred_p (e->src)
123 || single_pred (e->src) != outer_loop->header)
124 continue;
125 }
126 }
127 }
128 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
129 gsi_next (&gsi))
130 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
131 }
132 }
133
134
135 struct adjust_info
136 {
137 tree from, to;
138 basic_block bb;
139 };
140
141 /* A stack of values to be adjusted in debug stmts. We have to
142 process them LIFO, so that the closest substitution applies. If we
143 processed them FIFO, without the stack, we might substitute uses
144 with a PHI DEF that would soon become non-dominant, and when we got
145 to the suitable one, it wouldn't have anything to substitute any
146 more. */
147 static vec<adjust_info, va_heap> adjust_vec;
148
149 /* Adjust any debug stmts that referenced AI->from values to use the
150 loop-closed AI->to, if the references are dominated by AI->bb and
151 not by the definition of AI->from. */
152
153 static void
154 adjust_debug_stmts_now (adjust_info *ai)
155 {
156 basic_block bbphi = ai->bb;
157 tree orig_def = ai->from;
158 tree new_def = ai->to;
159 imm_use_iterator imm_iter;
160 gimple *stmt;
161 basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
162
163 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
164
165 /* Adjust any debug stmts that held onto non-loop-closed
166 references. */
167 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
168 {
169 use_operand_p use_p;
170 basic_block bbuse;
171
172 if (!is_gimple_debug (stmt))
173 continue;
174
175 gcc_assert (gimple_debug_bind_p (stmt));
176
177 bbuse = gimple_bb (stmt);
178
179 if ((bbuse == bbphi
180 || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
181 && !(bbuse == bbdef
182 || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
183 {
184 if (new_def)
185 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
186 SET_USE (use_p, new_def);
187 else
188 {
189 gimple_debug_bind_reset_value (stmt);
190 update_stmt (stmt);
191 }
192 }
193 }
194 }
195
196 /* Adjust debug stmts as scheduled before. */
197
198 static void
199 adjust_vec_debug_stmts (void)
200 {
201 if (!MAY_HAVE_DEBUG_BIND_STMTS)
202 return;
203
204 gcc_assert (adjust_vec.exists ());
205
206 while (!adjust_vec.is_empty ())
207 {
208 adjust_debug_stmts_now (&adjust_vec.last ());
209 adjust_vec.pop ();
210 }
211 }
212
213 /* Adjust any debug stmts that referenced FROM values to use the
214 loop-closed TO, if the references are dominated by BB and not by
215 the definition of FROM. If adjust_vec is non-NULL, adjustments
216 will be postponed until adjust_vec_debug_stmts is called. */
217
218 static void
219 adjust_debug_stmts (tree from, tree to, basic_block bb)
220 {
221 adjust_info ai;
222
223 if (MAY_HAVE_DEBUG_BIND_STMTS
224 && TREE_CODE (from) == SSA_NAME
225 && ! SSA_NAME_IS_DEFAULT_DEF (from)
226 && ! virtual_operand_p (from))
227 {
228 ai.from = from;
229 ai.to = to;
230 ai.bb = bb;
231
232 if (adjust_vec.exists ())
233 adjust_vec.safe_push (ai);
234 else
235 adjust_debug_stmts_now (&ai);
236 }
237 }
238
239 /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
240 to adjust any debug stmts that referenced the old phi arg,
241 presumably non-loop-closed references left over from other
242 transformations. */
243
244 static void
245 adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
246 {
247 tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
248
249 SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
250
251 if (MAY_HAVE_DEBUG_BIND_STMTS)
252 adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
253 gimple_bb (update_phi));
254 }
255
256 /* Define one loop mask MASK from loop LOOP. INIT_MASK is the value that
257 the mask should have during the first iteration and NEXT_MASK is the
258 value that it should have on subsequent iterations. */
259
260 static void
261 vect_set_loop_mask (struct loop *loop, tree mask, tree init_mask,
262 tree next_mask)
263 {
264 gphi *phi = create_phi_node (mask, loop->header);
265 add_phi_arg (phi, init_mask, loop_preheader_edge (loop), UNKNOWN_LOCATION);
266 add_phi_arg (phi, next_mask, loop_latch_edge (loop), UNKNOWN_LOCATION);
267 }
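/* As an illustration (SSA names purely hypothetical), for a mask type such
   as vector(8) <signed-boolean:1> the PHI created above has the form:

       loop_mask_5 = PHI <init_mask_3(preheader), next_mask_8(latch)>

   so the first vector iteration uses INIT_MASK and every subsequent
   iteration uses the NEXT_MASK computed at the end of the previous one.  */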
268
269 /* Add SEQ to the end of LOOP's preheader block. */
270
271 static void
272 add_preheader_seq (struct loop *loop, gimple_seq seq)
273 {
274 if (seq)
275 {
276 edge pe = loop_preheader_edge (loop);
277 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
278 gcc_assert (!new_bb);
279 }
280 }
281
282 /* Add SEQ to the beginning of LOOP's header block. */
283
284 static void
285 add_header_seq (struct loop *loop, gimple_seq seq)
286 {
287 if (seq)
288 {
289 gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
290 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
291 }
292 }
293
294 /* Return true if the target can interleave elements of two vectors.
295 OFFSET is 0 if the first half of the vectors should be interleaved
296 or 1 if the second half should. When returning true, store the
297 associated permutation in INDICES. */
298
299 static bool
300 interleave_supported_p (vec_perm_indices *indices, tree vectype,
301 unsigned int offset)
302 {
303 poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
304 poly_uint64 base = exact_div (nelts, 2) * offset;
305 vec_perm_builder sel (nelts, 2, 3);
306 for (unsigned int i = 0; i < 3; ++i)
307 {
308 sel.quick_push (base + i);
309 sel.quick_push (base + i + nelts);
310 }
311 indices->new_vector (sel, 2, nelts);
312 return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
313 }
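/* For example, assuming an 8-element vector type and OFFSET 0, the stepped
   selector built above expands to { 0, 8, 1, 9, 2, 10, 3, 11 }, which
   interleaves the low halves of the two inputs; with OFFSET 1 it expands
   to { 4, 12, 5, 13, 6, 14, 7, 15 }, interleaving the high halves.  */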
314
315 /* Try to use permutes to define the masks in DEST_RGM using the masks
316 in SRC_RGM, given that the former has twice as many masks as the
317 latter. Return true on success, adding any new statements to SEQ. */
318
319 static bool
320 vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm,
321 rgroup_masks *src_rgm)
322 {
323 tree src_masktype = src_rgm->mask_type;
324 tree dest_masktype = dest_rgm->mask_type;
325 machine_mode src_mode = TYPE_MODE (src_masktype);
326 if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
327 && optab_handler (vec_unpacku_hi_optab, src_mode) != CODE_FOR_nothing
328 && optab_handler (vec_unpacku_lo_optab, src_mode) != CODE_FOR_nothing)
329 {
330 /* Unpacking the source masks gives at least as many mask bits as
331 we need. We can then VIEW_CONVERT any excess bits away. */
332 tree unpack_masktype = vect_halve_mask_nunits (src_masktype);
333 for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
334 {
335 tree src = src_rgm->masks[i / 2];
336 tree dest = dest_rgm->masks[i];
337 tree_code code = (i & 1 ? VEC_UNPACK_HI_EXPR
338 : VEC_UNPACK_LO_EXPR);
339 gassign *stmt;
340 if (dest_masktype == unpack_masktype)
341 stmt = gimple_build_assign (dest, code, src);
342 else
343 {
344 tree temp = make_ssa_name (unpack_masktype);
345 stmt = gimple_build_assign (temp, code, src);
346 gimple_seq_add_stmt (seq, stmt);
347 stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
348 build1 (VIEW_CONVERT_EXPR,
349 dest_masktype, temp));
350 }
351 gimple_seq_add_stmt (seq, stmt);
352 }
353 return true;
354 }
355 vec_perm_indices indices[2];
356 if (dest_masktype == src_masktype
357 && interleave_supported_p (&indices[0], src_masktype, 0)
358 && interleave_supported_p (&indices[1], src_masktype, 1))
359 {
360 /* The destination requires twice as many mask bits as the source, so
361 we can use interleaving permutes to double up the number of bits. */
362 tree masks[2];
363 for (unsigned int i = 0; i < 2; ++i)
364 masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
365 for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
366 {
367 tree src = src_rgm->masks[i / 2];
368 tree dest = dest_rgm->masks[i];
369 gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
370 src, src, masks[i & 1]);
371 gimple_seq_add_stmt (seq, stmt);
372 }
373 return true;
374 }
375 return false;
376 }
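/* A small sketch of the two strategies above, assuming DEST_RGM has masks
   { d0, d1 } and SRC_RGM has the single mask { s0 }:

     - with the unpack optabs:
         d0 = VEC_UNPACK_LO_EXPR <s0>;
         d1 = VEC_UNPACK_HI_EXPR <s0>;
       (going through a temporary and a VIEW_CONVERT_EXPR if the unpacked
       type is not DEST_RGM's mask type), or

     - with interleaving permutes:
         d0 = VEC_PERM_EXPR <s0, s0, interleave-lo-selector>;
         d1 = VEC_PERM_EXPR <s0, s0, interleave-hi-selector>;

   Either way each destination mask I is derived from source mask I / 2.  */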
377
378 /* Helper for vect_set_loop_condition_masked. Generate definitions for
379 all the masks in RGM and return a mask that is nonzero when the loop
380 needs to iterate. Add any new preheader statements to PREHEADER_SEQ.
381 Use LOOP_COND_GSI to insert code before the exit gcond.
382
383 RGM belongs to loop LOOP. The loop originally iterated NITERS
384 times and has been vectorized according to LOOP_VINFO. Each iteration
385 of the vectorized loop handles VF iterations of the scalar loop.
386
387 It is known that:
388
389 NITERS * RGM->max_nscalars_per_iter
390
391 does not overflow. However, MIGHT_WRAP_P says whether an induction
392 variable that starts at 0 and has step:
393
394 VF * RGM->max_nscalars_per_iter
395
396 might overflow before hitting a value above:
397
398 NITERS * RGM->max_nscalars_per_iter
399
400 This means that we cannot guarantee that such an induction variable
401 would ever hit a value that produces a set of all-false masks for RGM. */
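/* For example, suppose the comparison type has 16 bits,
   VF * RGM->max_nscalars_per_iter is 16 and
   NITERS * RGM->max_nscalars_per_iter can be as high as 65530.  A 16-bit IV
   that starts at 0 and steps by 16 takes the values 0, 16, ..., 65520 and
   then wraps back to 0, so it never reaches a value in [65521, 65535]; an
   exit test against 65530 would therefore never trigger.  This is the
   situation MIGHT_WRAP_P describes, and it is handled below by lowering the
   limit by one step and testing the IV before incrementing it.  */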
402
403 static tree
404 vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
405 gimple_seq *preheader_seq,
406 gimple_stmt_iterator loop_cond_gsi,
407 rgroup_masks *rgm, tree vf,
408 tree niters, bool might_wrap_p)
409 {
410 tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
411 tree mask_type = rgm->mask_type;
412 unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
413 poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
414
415 /* Calculate the maximum number of scalar values that the rgroup
416 handles in total and the number that it handles for each iteration
417 of the vector loop. */
418 tree nscalars_total = niters;
419 tree nscalars_step = vf;
420 if (nscalars_per_iter != 1)
421 {
422 /* We checked before choosing to use a fully-masked loop that these
423 multiplications don't overflow. */
424 tree factor = build_int_cst (compare_type, nscalars_per_iter);
425 nscalars_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
426 nscalars_total, factor);
427 nscalars_step = gimple_build (preheader_seq, MULT_EXPR, compare_type,
428 nscalars_step, factor);
429 }
430
431 /* Create an induction variable that counts the number of scalars
432 processed. */
433 tree index_before_incr, index_after_incr;
434 gimple_stmt_iterator incr_gsi;
435 bool insert_after;
436 tree zero_index = build_int_cst (compare_type, 0);
437 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
438 create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi,
439 insert_after, &index_before_incr, &index_after_incr);
440
441 tree test_index, test_limit;
442 gimple_stmt_iterator *test_gsi;
443 if (might_wrap_p)
444 {
445 /* In principle the loop should stop iterating once the incremented
446 IV reaches a value greater than or equal to NSCALARS_TOTAL.
447 However, there's no guarantee that the IV hits a value above
448 this value before wrapping around. We therefore adjust the
449 limit down by one IV step:
450
451 NSCALARS_TOTAL -[infinite-prec] NSCALARS_STEP
452
453 and compare the IV against this limit _before_ incrementing it.
454 Since the comparison type is unsigned, we actually want the
455 subtraction to saturate at zero:
456
457 NSCALARS_TOTAL -[sat] NSCALARS_STEP. */
458 test_index = index_before_incr;
459 test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
460 nscalars_total, nscalars_step);
461 test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
462 test_limit, nscalars_step);
463 test_gsi = &incr_gsi;
464 }
465 else
466 {
467 /* Test the incremented IV, which will always hit a value above
468 the bound before wrapping. */
469 test_index = index_after_incr;
470 test_limit = nscalars_total;
471 test_gsi = &loop_cond_gsi;
472 }
473
474 /* Provide a definition of each mask in the group. */
475 tree next_mask = NULL_TREE;
476 tree mask;
477 unsigned int i;
478 FOR_EACH_VEC_ELT_REVERSE (rgm->masks, i, mask)
479 {
480 /* Previous masks will cover BIAS scalars. This mask covers the
481 next batch. */
482 poly_uint64 bias = nscalars_per_mask * i;
483 tree bias_tree = build_int_cst (compare_type, bias);
484 gimple *tmp_stmt;
485
486 /* See whether the first iteration of the vector loop is known
487 to have a full mask. */
488 poly_uint64 const_limit;
489 bool first_iteration_full
490 = (poly_int_tree_p (nscalars_total, &const_limit)
491 && known_ge (const_limit, (i + 1) * nscalars_per_mask));
492
493 /* Rather than have a new IV that starts at BIAS and goes up to
494 TEST_LIMIT, prefer to use the same 0-based IV for each mask
495 and adjust the bound down by BIAS. */
496 tree this_test_limit = test_limit;
497 if (i != 0)
498 {
499 this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
500 compare_type, this_test_limit,
501 bias_tree);
502 this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
503 compare_type, this_test_limit,
504 bias_tree);
505 }
506
507 /* Create the initial mask. */
508 tree init_mask = NULL_TREE;
509 if (!first_iteration_full)
510 {
511 tree start, end;
512 if (nscalars_total == test_limit)
513 {
514 /* Use a natural test between zero (the initial IV value)
515 and the loop limit. The "else" block would be valid too,
516 but this choice can avoid the need to load BIAS_TREE into
517 a register. */
518 start = zero_index;
519 end = this_test_limit;
520 }
521 else
522 {
523 start = bias_tree;
524 end = nscalars_total;
525 }
526
527 init_mask = make_temp_ssa_name (mask_type, NULL, "max_mask");
528 tmp_stmt = vect_gen_while (init_mask, start, end);
529 gimple_seq_add_stmt (preheader_seq, tmp_stmt);
530 }
531
532 if (!init_mask)
533 /* First iteration is full. */
534 init_mask = build_minus_one_cst (mask_type);
535
536 /* Get the mask value for the next iteration of the loop. */
537 next_mask = make_temp_ssa_name (mask_type, NULL, "next_mask");
538 gcall *call = vect_gen_while (next_mask, test_index, this_test_limit);
539 gsi_insert_before (test_gsi, call, GSI_SAME_STMT);
540
541 vect_set_loop_mask (loop, mask, init_mask, next_mask);
542 }
543 return next_mask;
544 }
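/* A gimple sketch of the code built above for a single-mask rgroup when
   MIGHT_WRAP_P is false (SSA names purely illustrative):

     preheader:
       max_mask_6 = .WHILE_ULT (0, nscalars_total_4);   <-- INIT_MASK, unless
                                                            the first iteration
                                                            is known to be full
     loop:
       loop_mask_7 = PHI <max_mask_6(preheader), next_mask_9(latch)>;
       ivtmp_1 = PHI <0(preheader), ivtmp_2(latch)>;
       ...
       ivtmp_2 = ivtmp_1 + nscalars_step_5;
       next_mask_9 = .WHILE_ULT (ivtmp_2, nscalars_total_4);

   The caller then compares the returned mask (NEXT_MASK) against an
   all-false mask to decide whether to branch back to the loop header.  */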
545
546 /* Make LOOP iterate NITERS times using masking and WHILE_ULT calls.
547 LOOP_VINFO describes the vectorization of LOOP. NITERS is the
548 number of iterations of the original scalar loop. NITERS_MAYBE_ZERO
549 and FINAL_IV are as for vect_set_loop_condition.
550
551 Insert the branch-back condition before LOOP_COND_GSI and return the
552 final gcond. */
553
554 static gcond *
555 vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
556 tree niters, tree final_iv,
557 bool niters_maybe_zero,
558 gimple_stmt_iterator loop_cond_gsi)
559 {
560 gimple_seq preheader_seq = NULL;
561 gimple_seq header_seq = NULL;
562
563 tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
564 unsigned int compare_precision = TYPE_PRECISION (compare_type);
565 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
566 tree orig_niters = niters;
567
568 /* Type of the initial value of NITERS. */
569 tree ni_actual_type = TREE_TYPE (niters);
570 unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
571
572 /* Convert NITERS to the same size as the compare. */
573 if (compare_precision > ni_actual_precision
574 && niters_maybe_zero)
575 {
576 /* We know that there is always at least one iteration, so if the
577 count is zero then it must have wrapped. Cope with this by
578 subtracting 1 before the conversion and adding 1 to the result. */
579 gcc_assert (TYPE_UNSIGNED (ni_actual_type));
580 niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
581 niters, build_minus_one_cst (ni_actual_type));
582 niters = gimple_convert (&preheader_seq, compare_type, niters);
583 niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
584 niters, build_one_cst (compare_type));
585 }
586 else
587 niters = gimple_convert (&preheader_seq, compare_type, niters);
588
589 /* Now calculate the value that the induction variable must be able
590 to hit in order to ensure that we end the loop with an all-false mask.
591 This involves adding the maximum number of inactive trailing scalar
592 iterations. */
593 widest_int iv_limit;
594 bool known_max_iters = max_loop_iterations (loop, &iv_limit);
595 if (known_max_iters)
596 {
597 /* IV_LIMIT is the maximum number of latch iterations, which is also
598 the maximum in-range IV value. Round this value down to the previous
599 vector alignment boundary and then add an extra full iteration. */
600 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
601 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
602 }
603
604 /* Get the vectorization factor in tree form. */
605 tree vf = build_int_cst (compare_type,
606 LOOP_VINFO_VECT_FACTOR (loop_vinfo));
607
608 /* Iterate over all the rgroups and fill in their masks. We could use
609 the first mask from any rgroup for the loop condition; here we
610 arbitrarily pick the last. */
611 tree test_mask = NULL_TREE;
612 rgroup_masks *rgm;
613 unsigned int i;
614 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
615 FOR_EACH_VEC_ELT (*masks, i, rgm)
616 if (!rgm->masks.is_empty ())
617 {
618 /* First try using permutes. This adds a single vector
619 instruction to the loop for each mask, but needs no extra
620 loop invariants or IVs. */
621 unsigned int nmasks = i + 1;
622 if ((nmasks & 1) == 0)
623 {
624 rgroup_masks *half_rgm = &(*masks)[nmasks / 2 - 1];
625 if (!half_rgm->masks.is_empty ()
626 && vect_maybe_permute_loop_masks (&header_seq, rgm, half_rgm))
627 continue;
628 }
629
630 /* See whether zero-based IV would ever generate all-false masks
631 before wrapping around. */
632 bool might_wrap_p
633 = (!known_max_iters
634 || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter,
635 UNSIGNED)
636 > compare_precision));
637
638 /* Set up all masks for this group. */
639 test_mask = vect_set_loop_masks_directly (loop, loop_vinfo,
640 &preheader_seq,
641 loop_cond_gsi, rgm, vf,
642 niters, might_wrap_p);
643 }
644
645 /* Emit all accumulated statements. */
646 add_preheader_seq (loop, preheader_seq);
647 add_header_seq (loop, header_seq);
648
649 /* Get a boolean result that tells us whether to iterate. */
650 edge exit_edge = single_exit (loop);
651 tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
652 tree zero_mask = build_zero_cst (TREE_TYPE (test_mask));
653 gcond *cond_stmt = gimple_build_cond (code, test_mask, zero_mask,
654 NULL_TREE, NULL_TREE);
655 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
656
657 /* The loop iterates (NITERS - 1) / VF + 1 times.
658 Subtract one from this to get the latch count. */
659 tree step = build_int_cst (compare_type,
660 LOOP_VINFO_VECT_FACTOR (loop_vinfo));
661 tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
662 build_minus_one_cst (compare_type));
663 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
664 niters_minus_one, step);
665
666 if (final_iv)
667 {
668 gassign *assign = gimple_build_assign (final_iv, orig_niters);
669 gsi_insert_on_edge_immediate (single_exit (loop), assign);
670 }
671
672 return cond_stmt;
673 }
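/* Worked example of the latch count set above: with NITERS = 10 and a
   vectorization factor of 4, the masked loop runs (10 - 1) / 4 + 1 = 3
   vector iterations and loop->nb_iterations is (10 - 1) / 4 = 2 latch
   iterations.  */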
674
675 /* Like vect_set_loop_condition, but handle the case in which there
676 are no loop masks. */
677
678 static gcond *
679 vect_set_loop_condition_unmasked (struct loop *loop, tree niters,
680 tree step, tree final_iv,
681 bool niters_maybe_zero,
682 gimple_stmt_iterator loop_cond_gsi)
683 {
684 tree indx_before_incr, indx_after_incr;
685 gcond *cond_stmt;
686 gcond *orig_cond;
687 edge pe = loop_preheader_edge (loop);
688 edge exit_edge = single_exit (loop);
689 gimple_stmt_iterator incr_gsi;
690 bool insert_after;
691 enum tree_code code;
692 tree niters_type = TREE_TYPE (niters);
693
694 orig_cond = get_loop_exit_condition (loop);
695 gcc_assert (orig_cond);
696 loop_cond_gsi = gsi_for_stmt (orig_cond);
697
698 tree init, limit;
699 if (!niters_maybe_zero && integer_onep (step))
700 {
701 /* In this case we can use a simple 0-based IV:
702
703 A:
704 x = 0;
705 do
706 {
707 ...
708 x += 1;
709 }
710 while (x < NITERS); */
711 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
712 init = build_zero_cst (niters_type);
713 limit = niters;
714 }
715 else
716 {
717 /* The following works for all values of NITERS except 0:
718
719 B:
720 x = 0;
721 do
722 {
723 ...
724 x += STEP;
725 }
726 while (x <= NITERS - STEP);
727
728 so that the loop continues to iterate if x + STEP - 1 < NITERS
729 but stops if x + STEP - 1 >= NITERS.
730
731 However, if NITERS is zero, x never hits a value above NITERS - STEP
732 before wrapping around. There are two obvious ways of dealing with
733 this:
734
735 - start at STEP - 1 and compare x before incrementing it
736 - start at -1 and compare x after incrementing it
737
738 The latter is simpler and is what we use. The loop in this case
739 looks like:
740
741 C:
742 x = -1;
743 do
744 {
745 ...
746 x += STEP;
747 }
748 while (x < NITERS - STEP);
749
750 In both cases the loop limit is NITERS - STEP. */
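      /* As a concrete illustration of case C: assume an 8-bit unsigned
         NITERS that has wrapped to 0 (i.e. 256 scalar iterations) and
         STEP = 4.  The limit is NITERS - STEP = 252, x starts at -1 (255),
         and after each increment x takes the values 3, 7, ..., 251, 255,
         giving 256 / 4 = 64 iterations before "x < 252" first fails.  */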
751 gimple_seq seq = NULL;
752 limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
753 limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
754 if (seq)
755 {
756 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
757 gcc_assert (!new_bb);
758 }
759 if (niters_maybe_zero)
760 {
761 /* Case C. */
762 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
763 init = build_all_ones_cst (niters_type);
764 }
765 else
766 {
767 /* Case B. */
768 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
769 init = build_zero_cst (niters_type);
770 }
771 }
772
773 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
774 create_iv (init, step, NULL_TREE, loop,
775 &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
776 indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
777 true, NULL_TREE, true,
778 GSI_SAME_STMT);
779 limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
780 true, GSI_SAME_STMT);
781
782 cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
783 NULL_TREE);
784
785 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
786
787 /* Record the number of latch iterations. */
788 if (limit == niters)
789 /* Case A: the loop iterates NITERS times. Subtract one to get the
790 latch count. */
791 loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
792 build_int_cst (niters_type, 1));
793 else
794 /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
795 Subtract one from this to get the latch count. */
796 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
797 limit, step);
798
799 if (final_iv)
800 {
801 gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
802 indx_after_incr, init);
803 gsi_insert_on_edge_immediate (single_exit (loop), assign);
804 }
805
806 return cond_stmt;
807 }
808
809 /* If we're using fully-masked loops, make LOOP iterate:
810
811 N == (NITERS - 1) / STEP + 1
812
813 times. When NITERS is zero, this is equivalent to making the loop
814 execute (1 << M) / STEP times, where M is the precision of NITERS.
815 NITERS_MAYBE_ZERO is true if this last case might occur.
816
817 If we're not using fully-masked loops, make LOOP iterate:
818
819 N == (NITERS - STEP) / STEP + 1
820
821 times, where NITERS is known to be outside the range [1, STEP - 1].
822 This is equivalent to making the loop execute NITERS / STEP times
823 when NITERS is nonzero and (1 << M) / STEP times otherwise.
824 NITERS_MAYBE_ZERO again indicates whether this last case might occur.
825
826 If FINAL_IV is nonnull, it is an SSA name that should be set to
827 N * STEP on exit from the loop.
828
829 Assumption: the exit-condition of LOOP is the last stmt in the loop. */
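/* For example, with NITERS = 10 and STEP = 4: a fully-masked loop iterates
   (10 - 1) / 4 + 1 = 3 times, handling the 2-iteration tail with a partial
   mask, whereas a loop that is not fully masked iterates 10 / 4 = 2 times
   and leaves the remaining scalar iterations to a peeled epilogue.  */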
830
831 void
832 vect_set_loop_condition (struct loop *loop, loop_vec_info loop_vinfo,
833 tree niters, tree step, tree final_iv,
834 bool niters_maybe_zero)
835 {
836 gcond *cond_stmt;
837 gcond *orig_cond = get_loop_exit_condition (loop);
838 gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
839
840 if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
841 cond_stmt = vect_set_loop_condition_masked (loop, loop_vinfo, niters,
842 final_iv, niters_maybe_zero,
843 loop_cond_gsi);
844 else
845 cond_stmt = vect_set_loop_condition_unmasked (loop, niters, step,
846 final_iv, niters_maybe_zero,
847 loop_cond_gsi);
848
849 /* Remove old loop exit test. */
850 gsi_remove (&loop_cond_gsi, true);
851 free_stmt_vec_info (orig_cond);
852
853 if (dump_enabled_p ())
854 {
855 dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: ");
856 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, cond_stmt, 0);
857 }
858 }
859
860 /* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
861 For all PHI arguments in FROM->dest and TO->dest coming from those
862 edges, ensure that each TO->dest PHI argument has its current_def
863 set to the current_def of the corresponding FROM->dest argument. */
864
865 static void
866 slpeel_duplicate_current_defs_from_edges (edge from, edge to)
867 {
868 gimple_stmt_iterator gsi_from, gsi_to;
869
870 for (gsi_from = gsi_start_phis (from->dest),
871 gsi_to = gsi_start_phis (to->dest);
872 !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
873 {
874 gimple *from_phi = gsi_stmt (gsi_from);
875 gimple *to_phi = gsi_stmt (gsi_to);
876 tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
877 tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
878 if (virtual_operand_p (from_arg))
879 {
880 gsi_next (&gsi_from);
881 continue;
882 }
883 if (virtual_operand_p (to_arg))
884 {
885 gsi_next (&gsi_to);
886 continue;
887 }
888 if (TREE_CODE (from_arg) != SSA_NAME)
889 gcc_assert (operand_equal_p (from_arg, to_arg, 0));
890 else
891 {
892 if (get_current_def (to_arg) == NULL_TREE)
893 set_current_def (to_arg, get_current_def (from_arg));
894 }
895 gsi_next (&gsi_from);
896 gsi_next (&gsi_to);
897 }
898
899 gphi *from_phi = get_virtual_phi (from->dest);
900 gphi *to_phi = get_virtual_phi (to->dest);
901 if (from_phi)
902 set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
903 get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
904 }
905
906
907 /* Given LOOP this function generates a new copy of it and puts it
908 on E which is either the entry or exit of LOOP. If SCALAR_LOOP is
909 non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
910 basic blocks from SCALAR_LOOP instead of LOOP, but to either the
911 entry or exit of LOOP. */
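/* A sketch of the two supported cases: peeling an epilogue passes
   E = single_exit (LOOP), giving

     preheader -> LOOP -> new preheader -> copy of LOOP -> old exit block

   while peeling a prologue passes E = loop_preheader_edge (LOOP), giving

     preheader -> copy of LOOP -> new preheader -> LOOP -> exit block.  */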
912
913 struct loop *
914 slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop,
915 struct loop *scalar_loop, edge e)
916 {
917 struct loop *new_loop;
918 basic_block *new_bbs, *bbs, *pbbs;
919 bool at_exit;
920 bool was_imm_dom;
921 basic_block exit_dest;
922 edge exit, new_exit;
923 bool duplicate_outer_loop = false;
924
925 exit = single_exit (loop);
926 at_exit = (e == exit);
927 if (!at_exit && e != loop_preheader_edge (loop))
928 return NULL;
929
930 if (scalar_loop == NULL)
931 scalar_loop = loop;
932
933 bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
934 pbbs = bbs + 1;
935 get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
936 /* Allow duplication of outer loops. */
937 if (scalar_loop->inner)
938 duplicate_outer_loop = true;
939 /* Check whether duplication is possible. */
940 if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
941 {
942 free (bbs);
943 return NULL;
944 }
945
946 /* Generate new loop structure. */
947 new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
948 duplicate_subloops (scalar_loop, new_loop);
949
950 exit_dest = exit->dest;
951 was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
952 exit_dest) == loop->header ?
953 true : false);
954
955 /* Also copy the pre-header, this avoids jumping through hoops to
956 duplicate the loop entry PHI arguments. Create an empty
957 pre-header unconditionally for this. */
958 basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
959 edge entry_e = single_pred_edge (preheader);
960 bbs[0] = preheader;
961 new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
962
963 exit = single_exit (scalar_loop);
964 copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
965 &exit, 1, &new_exit, NULL,
966 at_exit ? loop->latch : e->src, true);
967 exit = single_exit (loop);
968 basic_block new_preheader = new_bbs[0];
969
970 add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
971
972 if (scalar_loop != loop)
973 {
974 /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
975 SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
976 but LOOP will not. slpeel_update_phi_nodes_for_guard{1,2} expects
977 the LOOP SSA_NAMEs (on the exit edge and edge from latch to
978 header) to have current_def set, so copy them over. */
979 slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
980 exit);
981 slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
982 0),
983 EDGE_SUCC (loop->latch, 0));
984 }
985
986 if (at_exit) /* Add the loop copy at exit. */
987 {
988 if (scalar_loop != loop)
989 {
990 gphi_iterator gsi;
991 new_exit = redirect_edge_and_branch (new_exit, exit_dest);
992
993 for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
994 gsi_next (&gsi))
995 {
996 gphi *phi = gsi.phi ();
997 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
998 location_t orig_locus
999 = gimple_phi_arg_location_from_edge (phi, e);
1000
1001 add_phi_arg (phi, orig_arg, new_exit, orig_locus);
1002 }
1003 }
1004 redirect_edge_and_branch_force (e, new_preheader);
1005 flush_pending_stmts (e);
1006 set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
1007 if (was_imm_dom || duplicate_outer_loop)
1008 set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1009
1010 /* And remove the non-necessary forwarder again. Keep the other
1011 one so we have a proper pre-header for the loop at the exit edge. */
1012 redirect_edge_pred (single_succ_edge (preheader),
1013 single_pred (preheader));
1014 delete_basic_block (preheader);
1015 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1016 loop_preheader_edge (scalar_loop)->src);
1017 }
1018 else /* Add the copy at entry. */
1019 {
1020 if (scalar_loop != loop)
1021 {
1022 /* Remove the non-necessary forwarder of scalar_loop again. */
1023 redirect_edge_pred (single_succ_edge (preheader),
1024 single_pred (preheader));
1025 delete_basic_block (preheader);
1026 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1027 loop_preheader_edge (scalar_loop)->src);
1028 preheader = split_edge (loop_preheader_edge (loop));
1029 entry_e = single_pred_edge (preheader);
1030 }
1031
1032 redirect_edge_and_branch_force (entry_e, new_preheader);
1033 flush_pending_stmts (entry_e);
1034 set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1035
1036 redirect_edge_and_branch_force (new_exit, preheader);
1037 flush_pending_stmts (new_exit);
1038 set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1039
1040 /* And remove the non-necessary forwarder again. Keep the other
1041 one so we have a proper pre-header for the loop at the exit edge. */
1042 redirect_edge_pred (single_succ_edge (new_preheader),
1043 single_pred (new_preheader));
1044 delete_basic_block (new_preheader);
1045 set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1046 loop_preheader_edge (new_loop)->src);
1047 }
1048
1049 /* Skip new preheader since it's deleted if copy loop is added at entry. */
1050 for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1051 rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1052
1053 if (scalar_loop != loop)
1054 {
1055 /* Update new_loop->header PHIs, so that on the preheader
1056 edge they are the ones from loop rather than scalar_loop. */
1057 gphi_iterator gsi_orig, gsi_new;
1058 edge orig_e = loop_preheader_edge (loop);
1059 edge new_e = loop_preheader_edge (new_loop);
1060
1061 for (gsi_orig = gsi_start_phis (loop->header),
1062 gsi_new = gsi_start_phis (new_loop->header);
1063 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
1064 gsi_next (&gsi_orig), gsi_next (&gsi_new))
1065 {
1066 gphi *orig_phi = gsi_orig.phi ();
1067 gphi *new_phi = gsi_new.phi ();
1068 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
1069 location_t orig_locus
1070 = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
1071
1072 add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
1073 }
1074 }
1075
1076 free (new_bbs);
1077 free (bbs);
1078
1079 checking_verify_dominators (CDI_DOMINATORS);
1080
1081 return new_loop;
1082 }
1083
1084
1085 /* Given the condition expression COND, put it as the last statement of
1086 GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1087 DOM_BB; return the skip edge. GUARD_TO is the target basic block to
1088 skip the loop. PROBABILITY is the skip edge's probability. Mark the
1089 new edge as irreducible if IRREDUCIBLE_P is true. */
1090
1091 static edge
1092 slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1093 basic_block guard_to, basic_block dom_bb,
1094 profile_probability probability, bool irreducible_p)
1095 {
1096 gimple_stmt_iterator gsi;
1097 edge new_e, enter_e;
1098 gcond *cond_stmt;
1099 gimple_seq gimplify_stmt_list = NULL;
1100
1101 enter_e = EDGE_SUCC (guard_bb, 0);
1102 enter_e->flags &= ~EDGE_FALLTHRU;
1103 enter_e->flags |= EDGE_FALSE_VALUE;
1104 gsi = gsi_last_bb (guard_bb);
1105
1106 cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr,
1107 NULL_TREE);
1108 if (gimplify_stmt_list)
1109 gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1110
1111 cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1112 gsi = gsi_last_bb (guard_bb);
1113 gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1114
1115 /* Add new edge to connect guard block to the merge/loop-exit block. */
1116 new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
1117
1118 new_e->probability = probability;
1119 if (irreducible_p)
1120 new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1121
1122 enter_e->probability = probability.invert ();
1123 set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
1124
1125 /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS. */
1126 if (enter_e->dest->loop_father->header == enter_e->dest)
1127 split_edge (enter_e);
1128
1129 return new_e;
1130 }
1131
1132
1133 /* This function verifies that the following restrictions apply to LOOP:
1134 (1) it consists of exactly 2 basic blocks (header and an empty latch)
1135 for an innermost loop, and of 5 basic blocks for an outer loop.
1136 (2) it is single entry, single exit
1137 (3) its exit condition is the last stmt in the header
1138 (4) E is the entry/exit edge of LOOP.
1139 */
1140
1141 bool
1142 slpeel_can_duplicate_loop_p (const struct loop *loop, const_edge e)
1143 {
1144 edge exit_e = single_exit (loop);
1145 edge entry_e = loop_preheader_edge (loop);
1146 gcond *orig_cond = get_loop_exit_condition (loop);
1147 gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
1148 unsigned int num_bb = loop->inner? 5 : 2;
1149
1150 /* All loops have an outer scope; the only case in which loop->outer is
1151 NULL is the root loop that represents the function itself. */
1152 if (!loop_outer (loop)
1153 || loop->num_nodes != num_bb
1154 || !empty_block_p (loop->latch)
1155 || !single_exit (loop)
1156 /* Verify that new loop exit condition can be trivially modified. */
1157 || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1158 || (e != exit_e && e != entry_e))
1159 return false;
1160
1161 return true;
1162 }
1163
1164 /* If the loop has a virtual PHI, but exit bb doesn't, create a virtual PHI
1165 in the exit bb and rename all the uses after the loop. This simplifies
1166 the *guard[12] routines, which assume loop closed SSA form for all PHIs
1167 (but normally loop closed SSA form doesn't require virtual PHIs to be
1168 in the same form). Doing this early simplifies checking which
1169 uses should be renamed. */
1170
1171 static void
1172 create_lcssa_for_virtual_phi (struct loop *loop)
1173 {
1174 gphi_iterator gsi;
1175 edge exit_e = single_exit (loop);
1176
1177 for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
1178 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1179 {
1180 gphi *phi = gsi.phi ();
1181 for (gsi = gsi_start_phis (exit_e->dest);
1182 !gsi_end_p (gsi); gsi_next (&gsi))
1183 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1184 break;
1185 if (gsi_end_p (gsi))
1186 {
1187 tree new_vop = copy_ssa_name (PHI_RESULT (phi));
1188 gphi *new_phi = create_phi_node (new_vop, exit_e->dest);
1189 tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
1190 imm_use_iterator imm_iter;
1191 gimple *stmt;
1192 use_operand_p use_p;
1193
1194 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
1195 = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
1196 add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
1197 gimple_phi_set_result (new_phi, new_vop);
1198 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
1199 if (stmt != new_phi
1200 && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
1201 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
1202 SET_USE (use_p, new_vop);
1203 }
1204 break;
1205 }
1206
1207 }
1208
1209 /* Function vect_get_loop_location.
1210
1211 Extract the location of the loop in the source code.
1212 If the loop is not well formed for vectorization, an estimated
1213 location is calculated.
1214 Return the loop location if it can be determined and UNKNOWN_LOCATION otherwise. */
1215
1216 source_location
1217 find_loop_location (struct loop *loop)
1218 {
1219 gimple *stmt = NULL;
1220 basic_block bb;
1221 gimple_stmt_iterator si;
1222
1223 if (!loop)
1224 return UNKNOWN_LOCATION;
1225
1226 stmt = get_loop_exit_condition (loop);
1227
1228 if (stmt
1229 && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1230 return gimple_location (stmt);
1231
1232 /* If we got here the loop is probably not "well formed",
1233 try to estimate the loop location. */
1234
1235 if (!loop->header)
1236 return UNKNOWN_LOCATION;
1237
1238 bb = loop->header;
1239
1240 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1241 {
1242 stmt = gsi_stmt (si);
1243 if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1244 return gimple_location (stmt);
1245 }
1246
1247 return UNKNOWN_LOCATION;
1248 }
1249
1250 /* Return true if PHI defines an IV of the loop to be vectorized. */
1251
1252 static bool
1253 iv_phi_p (gphi *phi)
1254 {
1255 if (virtual_operand_p (PHI_RESULT (phi)))
1256 return false;
1257
1258 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
1259 gcc_assert (stmt_info != NULL);
1260 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1261 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
1262 return false;
1263
1264 return true;
1265 }
1266
1267 /* Function vect_can_advance_ivs_p
1268
1269 In case the number of iterations that LOOP iterates is unknown at compile
1270 time, an epilog loop will be generated, and the loop induction variables
1271 (IVs) will be "advanced" to the value they are supposed to take just before
1272 the epilog loop. Here we check that the access function of the loop IVs
1273 and the expression that represents the loop bound are simple enough.
1274 These restrictions will be relaxed in the future. */
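/* For example, an IV whose phi evolution part is a loop-invariant step such
   as 4 or an invariant SSA name is accepted by the checks below, whereas an
   IV whose step itself varies from iteration to iteration (a polynomial of
   degree two or more, e.g. x += i where i is another IV, so the evolution
   part is again a CHREC) is rejected.  */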
1275
1276 bool
1277 vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
1278 {
1279 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1280 basic_block bb = loop->header;
1281 gphi_iterator gsi;
1282
1283 /* Analyze phi functions of the loop header. */
1284
1285 if (dump_enabled_p ())
1286 dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
1287 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1288 {
1289 tree evolution_part;
1290
1291 gphi *phi = gsi.phi ();
1292 if (dump_enabled_p ())
1293 {
1294 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
1295 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1296 }
1297
1298 /* Skip virtual phi's. The data dependences that are associated with
1299 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
1300
1301 Skip reduction phis. */
1302 if (!iv_phi_p (phi))
1303 {
1304 if (dump_enabled_p ())
1305 dump_printf_loc (MSG_NOTE, vect_location,
1306 "reduc or virtual phi. skip.\n");
1307 continue;
1308 }
1309
1310 /* Analyze the evolution function. */
1311
1312 evolution_part
1313 = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (vinfo_for_stmt (phi));
1314 if (evolution_part == NULL_TREE)
1315 {
1316 if (dump_enabled_p ())
1317 dump_printf (MSG_MISSED_OPTIMIZATION,
1318 "No access function or evolution.\n");
1319 return false;
1320 }
1321
1322 /* FORNOW: We do not transform initial conditions of IVs
1323 which evolution functions are not invariants in the loop. */
1324
1325 if (!expr_invariant_in_loop_p (loop, evolution_part))
1326 {
1327 if (dump_enabled_p ())
1328 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1329 "evolution not invariant in loop.\n");
1330 return false;
1331 }
1332
1333 /* FORNOW: We do not transform initial conditions of IVs
1334 which evolution functions are a polynomial of degree >= 2. */
1335
1336 if (tree_is_chrec (evolution_part))
1337 {
1338 if (dump_enabled_p ())
1339 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1340 "evolution is chrec.\n");
1341 return false;
1342 }
1343 }
1344
1345 return true;
1346 }
1347
1348
1349 /* Function vect_update_ivs_after_vectorizer.
1350
1351 "Advance" the induction variables of LOOP to the value they should take
1352 after the execution of LOOP. This is currently necessary because the
1353 vectorizer does not handle induction variables that are used after the
1354 loop. Such a situation occurs when the last iterations of LOOP are
1355 peeled, because:
1356 1. We introduced new uses after LOOP for IVs that were not originally used
1357 after LOOP: the IVs of LOOP are now used by an epilog loop.
1358 2. LOOP is going to be vectorized; this means that it will iterate N/VF
1359 times, whereas the loop IVs should be bumped N times.
1360
1361 Input:
1362 - LOOP - a loop that is going to be vectorized. The last few iterations
1363 of LOOP were peeled.
1364 - NITERS - the number of iterations that LOOP executes (before it is
1365 vectorized), i.e., the number of times the ivs should be bumped.
1366 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
1367 coming out from LOOP on which there are uses of the LOOP ivs
1368 (this is the path from LOOP->exit to epilog_loop->preheader).
1369
1370 The new definitions of the ivs are placed in LOOP->exit.
1371 The phi args associated with the edge UPDATE_E in the bb
1372 UPDATE_E->dest are updated accordingly.
1373
1374 Assumption 1: Like the rest of the vectorizer, this function assumes
1375 a single loop exit that has a single predecessor.
1376
1377 Assumption 2: The phi nodes in the LOOP header and in update_bb are
1378 organized in the same order.
1379
1380 Assumption 3: The access function of the ivs is simple enough (see
1381 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
1382
1383 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
1384 coming out of LOOP on which the ivs of LOOP are used (this is the path
1385 that leads to the epilog loop; other paths skip the epilog loop). This
1386 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
1387 needs to have its phis updated.
1388 */
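/* For example (values purely illustrative), for an IV with initial value
   init_7 and step 4, and NITERS = niters_9 scalar iterations, the code
   below inserts

     ni = init_7 + niters_9 * 4

   (or an equivalent POINTER_PLUS_EXPR for pointer IVs) at the end of the
   loop's exit block and makes the UPDATE_E argument of the matching phi
   in UPDATE_BB use that value.  */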
1389
1390 static void
1391 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
1392 tree niters, edge update_e)
1393 {
1394 gphi_iterator gsi, gsi1;
1395 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1396 basic_block update_bb = update_e->dest;
1397 basic_block exit_bb = single_exit (loop)->dest;
1398
1399 /* Make sure there exists a single-predecessor exit bb: */
1400 gcc_assert (single_pred_p (exit_bb));
1401 gcc_assert (single_succ_edge (exit_bb) == update_e);
1402
1403 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
1404 !gsi_end_p (gsi) && !gsi_end_p (gsi1);
1405 gsi_next (&gsi), gsi_next (&gsi1))
1406 {
1407 tree init_expr;
1408 tree step_expr, off;
1409 tree type;
1410 tree var, ni, ni_name;
1411 gimple_stmt_iterator last_gsi;
1412
1413 gphi *phi = gsi.phi ();
1414 gphi *phi1 = gsi1.phi ();
1415 if (dump_enabled_p ())
1416 {
1417 dump_printf_loc (MSG_NOTE, vect_location,
1418 "vect_update_ivs_after_vectorizer: phi: ");
1419 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1420 }
1421
1422 /* Skip reduction and virtual phis. */
1423 if (!iv_phi_p (phi))
1424 {
1425 if (dump_enabled_p ())
1426 dump_printf_loc (MSG_NOTE, vect_location,
1427 "reduc or virtual phi. skip.\n");
1428 continue;
1429 }
1430
1431 type = TREE_TYPE (gimple_phi_result (phi));
1432 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (vinfo_for_stmt (phi));
1433 step_expr = unshare_expr (step_expr);
1434
1435 /* FORNOW: We do not support IVs whose evolution function is a polynomial
1436 of degree >= 2 or exponential. */
1437 gcc_assert (!tree_is_chrec (step_expr));
1438
1439 init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1440
1441 off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
1442 fold_convert (TREE_TYPE (step_expr), niters),
1443 step_expr);
1444 if (POINTER_TYPE_P (type))
1445 ni = fold_build_pointer_plus (init_expr, off);
1446 else
1447 ni = fold_build2 (PLUS_EXPR, type,
1448 init_expr, fold_convert (type, off));
1449
1450 var = create_tmp_var (type, "tmp");
1451
1452 last_gsi = gsi_last_bb (exit_bb);
1453 gimple_seq new_stmts = NULL;
1454 ni_name = force_gimple_operand (ni, &new_stmts, false, var);
1455 /* Exit_bb shouldn't be empty. */
1456 if (!gsi_end_p (last_gsi))
1457 gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
1458 else
1459 gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
1460
1461 /* Fix phi expressions in the successor bb. */
1462 adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
1463 }
1464 }
1465
1466 /* Function vect_gen_prolog_loop_niters
1467
1468 Generate the number of iterations which should be peeled as prolog for the
1469 loop represented by LOOP_VINFO. It is calculated as the misalignment of
1470 DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
1471 As a result, after the execution of this loop, the data reference DR will
1472 refer to an aligned location. The following computation is generated:
1473
1474 If the misalignment of DR is known at compile time:
1475 addr_mis = int mis = DR_MISALIGNMENT (dr);
1476 Else, compute address misalignment in bytes:
1477 addr_mis = addr & (vectype_align - 1)
1478
1479 prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
1480
1481 (elem_size = element type size; an element is the scalar element whose type
1482 is the inner type of the vectype)
1483
1484 The computations will be emitted at the end of BB. We also compute and
1485 store upper bound (included) of the result in BOUND.
1486
1487 When the step of the data-ref in the loop is not 1 (as in interleaved data
1488 and SLP), the number of iterations of the prolog must be divided by the step
1489 (which is equal to the size of interleaved group).
1490
1491 The above formulas assume that VF == number of elements in the vector. This
1492 may not hold when there are multiple-types in the loop.
1493 In this case, for some data-references in the loop the VF does not represent
1494 the number of elements that fit in the vector. Therefore, instead of VF we
1495 use TYPE_VECTOR_SUBPARTS. */
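/* A worked example of the formula above, assuming a 4-element vector of
   4-byte ints with a 16-byte target alignment and an access whose address
   is misaligned by 8 bytes: addr_mis / elem_size = 2, so

     prolog_niters = ((4 - 2) & (4 - 1)) / 1 = 2

   scalar iterations are peeled, after which the access is 16-byte
   aligned.  */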
1496
1497 static tree
1498 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
1499 basic_block bb, int *bound)
1500 {
1501 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1502 tree var;
1503 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
1504 gimple_seq stmts = NULL, new_stmts = NULL;
1505 tree iters, iters_name;
1506 gimple *dr_stmt = DR_STMT (dr);
1507 stmt_vec_info stmt_info = vinfo_for_stmt (dr_stmt);
1508 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1509 unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1510
1511 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1512 {
1513 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1514
1515 if (dump_enabled_p ())
1516 dump_printf_loc (MSG_NOTE, vect_location,
1517 "known peeling = %d.\n", npeel);
1518
1519 iters = build_int_cst (niters_type, npeel);
1520 *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1521 }
1522 else
1523 {
1524 bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1525 tree offset = negative
1526 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;
1527 tree start_addr = vect_create_addr_base_for_vector_ref (dr_stmt,
1528 &stmts, offset);
1529 tree type = unsigned_type_for (TREE_TYPE (start_addr));
1530 tree target_align_minus_1 = build_int_cst (type, target_align - 1);
1531 HOST_WIDE_INT elem_size
1532 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1533 tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1534 HOST_WIDE_INT align_in_elems = target_align / elem_size;
1535 tree align_in_elems_minus_1 = build_int_cst (type, align_in_elems - 1);
1536 tree align_in_elems_tree = build_int_cst (type, align_in_elems);
1537 tree misalign_in_bytes;
1538 tree misalign_in_elems;
1539
1540 /* Create: misalign_in_bytes = addr & (target_align - 1). */
1541 misalign_in_bytes
1542 = fold_build2 (BIT_AND_EXPR, type, fold_convert (type, start_addr),
1543 target_align_minus_1);
1544
1545 /* Create: misalign_in_elems = misalign_in_bytes / element_size. */
1546 misalign_in_elems
1547 = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes, elem_size_log);
1548
1549 /* Create: (niters_type) ((align_in_elems - misalign_in_elems)
1550 & (align_in_elems - 1)). */
1551 if (negative)
1552 iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1553 align_in_elems_tree);
1554 else
1555 iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1556 misalign_in_elems);
1557 iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
1558 iters = fold_convert (niters_type, iters);
1559 *bound = align_in_elems - 1;
1560 }
1561
1562 if (dump_enabled_p ())
1563 {
1564 dump_printf_loc (MSG_NOTE, vect_location,
1565 "niters for prolog loop: ");
1566 dump_generic_expr (MSG_NOTE, TDF_SLIM, iters);
1567 dump_printf (MSG_NOTE, "\n");
1568 }
1569
1570 var = create_tmp_var (niters_type, "prolog_loop_niters");
1571 iters_name = force_gimple_operand (iters, &new_stmts, false, var);
1572
1573 if (new_stmts)
1574 gimple_seq_add_seq (&stmts, new_stmts);
1575 if (stmts)
1576 {
1577 gcc_assert (single_succ_p (bb));
1578 gimple_stmt_iterator gsi = gsi_last_bb (bb);
1579 if (gsi_end_p (gsi))
1580 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1581 else
1582 gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
1583 }
1584 return iters_name;
1585 }
1586
1587
1588 /* Function vect_update_init_of_dr
1589
1590 NITERS iterations were peeled from LOOP. DR represents a data reference
1591 in LOOP. This function updates the information recorded in DR to
1592 account for the fact that the first NITERS iterations had already been
1593 executed. Specifically, it updates the OFFSET field of DR. */
1594
1595 static void
1596 vect_update_init_of_dr (struct data_reference *dr, tree niters)
1597 {
1598 tree offset = DR_OFFSET (dr);
1599
1600 niters = fold_build2 (MULT_EXPR, sizetype,
1601 fold_convert (sizetype, niters),
1602 fold_convert (sizetype, DR_STEP (dr)));
1603 offset = fold_build2 (PLUS_EXPR, sizetype,
1604 fold_convert (sizetype, offset), niters);
1605 DR_OFFSET (dr) = offset;
1606 }
1607
1608
1609 /* Function vect_update_inits_of_drs
1610
1611 NITERS iterations were peeled from the loop represented by LOOP_VINFO.
1612 This function updates the information recorded for the data references in
1613 the loop to account for the fact that the first NITERS iterations had
1614 already been executed. Specifically, it updates the initial_condition of
1615 the access_function of all the data_references in the loop. */
1616
1617 static void
1618 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters)
1619 {
1620 unsigned int i;
1621 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1622 struct data_reference *dr;
1623
1624 if (dump_enabled_p ())
1625 dump_printf_loc (MSG_NOTE, vect_location,
1626 "=== vect_update_inits_of_dr ===\n");
1627
1628 /* Adjust niters to sizetype and insert stmts on loop preheader edge. */
1629 if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
1630 {
1631 gimple_seq seq;
1632 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1633 tree var = create_tmp_var (sizetype, "prolog_loop_adjusted_niters");
1634
1635 niters = fold_convert (sizetype, niters);
1636 niters = force_gimple_operand (niters, &seq, false, var);
1637 if (seq)
1638 {
1639 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
1640 gcc_assert (!new_bb);
1641 }
1642 }
1643
1644 FOR_EACH_VEC_ELT (datarefs, i, dr)
1645 vect_update_init_of_dr (dr, niters);
1646 }
1647
1648
1649 /* This function builds ni_name = number of iterations. Statements
1650 are emitted on the loop preheader edge. If NEW_VAR_P is not NULL, set
1651 *NEW_VAR_P to true if a new ssa_var is generated. */
1652
1653 tree
1654 vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
1655 {
1656 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1657 if (TREE_CODE (ni) == INTEGER_CST)
1658 return ni;
1659 else
1660 {
1661 tree ni_name, var;
1662 gimple_seq stmts = NULL;
1663 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1664
1665 var = create_tmp_var (TREE_TYPE (ni), "niters");
1666 ni_name = force_gimple_operand (ni, &stmts, false, var);
1667 if (stmts)
1668 {
1669 gsi_insert_seq_on_edge_immediate (pe, stmts);
1670 if (new_var_p != NULL)
1671 *new_var_p = true;
1672 }
1673
1674 return ni_name;
1675 }
1676 }
1677
1678 /* Calculate the number of iterations above which the vectorized loop will
1679 be preferred over the scalar loop.  NITERS_PROLOG is the number of
1680 iterations of the prolog loop.  If it is an integer constant, its value
1681 is also passed in INT_NITERS_PROLOG.  BOUND_PROLOG is the (inclusive)
1682 upper bound on the number of iterations of the prolog loop.  VFM1 is the
1683 vectorization factor minus one.  If CHECK_PROFITABILITY is true, TH is
1684 the threshold below which the scalar (rather than vectorized) loop will
1685 be executed.  This function stores the (inclusive) upper bound of the result in BOUND_SCALAR.  */
1686
1687 static tree
1688 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
1689 int bound_prolog, poly_int64 vfm1, int th,
1690 poly_uint64 *bound_scalar,
1691 bool check_profitability)
1692 {
1693 tree type = TREE_TYPE (niters_prolog);
1694 tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
1695 build_int_cst (type, vfm1));
1696
1697 *bound_scalar = vfm1 + bound_prolog;
1698 if (check_profitability)
1699 {
1700 /* TH indicates the minimum niters of the vectorized loop, while we
1701 compute the maximum niters of the scalar loop.  */
1702 th--;
1703 /* Peeling a constant number of times.  */
1704 if (int_niters_prolog >= 0)
1705 {
1706 *bound_scalar = upper_bound (int_niters_prolog + vfm1, th);
1707 return build_int_cst (type, *bound_scalar);
1708 }
1709 /* Peeling an unknown number of times.  Note BOUND_PROLOG is the upper
1710 bound (inclusive) on the niters of the prolog loop.  */
1711 if (known_ge (th, vfm1 + bound_prolog))
1712 {
1713 *bound_scalar = th;
1714 return build_int_cst (type, th);
1715 }
1716 /* Need to do runtime comparison. */
1717 else if (maybe_gt (th, vfm1))
1718 {
1719 *bound_scalar = upper_bound (*bound_scalar, th);
1720 return fold_build2 (MAX_EXPR, type,
1721 build_int_cst (type, th), niters);
1722 }
1723 }
1724 return niters;
1725 }
1726
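/* Illustrative sketch only (not part of GCC): roughly the bound computed by
   vect_gen_scalar_loop_niters above, for the simple case in which the prolog
   iteration count is a known constant.  The scalar loop is taken when the
   iteration count is at most this bound: either too few iterations remain
   for a single vector iteration (NITERS_PROLOG + VF - 1), or the count is
   below the profitability threshold TH.  Names are hypothetical.  */

static unsigned int
example_scalar_loop_bound (unsigned int niters_prolog, unsigned int vf,
                           unsigned int th, int check_profitability)
{
  unsigned int bound = niters_prolog + (vf - 1);

  /* TH is the minimum profitable niters for the vectorized loop, so the
     scalar loop covers at most TH - 1 iterations.  */
  if (check_profitability && th > bound + 1)
    bound = th - 1;
  return bound;
}
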
1727 /* NITERS is the number of times that the original scalar loop executes
1728 after peeling. Work out the maximum number of iterations N that can
1729 be handled by the vectorized form of the loop and then either:
1730
1731 a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
1732
1733 niters_vector = N
1734
1735 b) set *STEP_VECTOR_PTR to one and generate:
1736
1737 niters_vector = N / vf
1738
1739 In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
1740 any new statements on the loop preheader edge. NITERS_NO_OVERFLOW
1741 is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero). */
1742
1743 void
1744 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
1745 tree *niters_vector_ptr, tree *step_vector_ptr,
1746 bool niters_no_overflow)
1747 {
1748 tree ni_minus_gap, var;
1749 tree niters_vector, step_vector, type = TREE_TYPE (niters);
1750 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1751 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1752 tree log_vf = NULL_TREE;
1753
1754 /* If an epilogue loop is required because of data accesses with gaps, we
1755 subtract one iteration from the total number of iterations here for
1756 correct calculation of the number of vector iterations.  */
1757 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1758 {
1759 ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
1760 build_one_cst (type));
1761 if (!is_gimple_val (ni_minus_gap))
1762 {
1763 var = create_tmp_var (type, "ni_gap");
1764 gimple *stmts = NULL;
1765 ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
1766 true, var);
1767 gsi_insert_seq_on_edge_immediate (pe, stmts);
1768 }
1769 }
1770 else
1771 ni_minus_gap = niters;
1772
1773 unsigned HOST_WIDE_INT const_vf;
1774 if (vf.is_constant (&const_vf)
1775 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1776 {
1777 /* Create: niters >> log2(vf) */
1778 /* If it's known that niters == number of latch executions + 1 doesn't
1779 overflow, we can generate niters >> log2(vf); otherwise we generate
1780 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
1781 will be at least one. */
1782 log_vf = build_int_cst (type, exact_log2 (const_vf));
1783 if (niters_no_overflow)
1784 niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
1785 else
1786 niters_vector
1787 = fold_build2 (PLUS_EXPR, type,
1788 fold_build2 (RSHIFT_EXPR, type,
1789 fold_build2 (MINUS_EXPR, type,
1790 ni_minus_gap,
1791 build_int_cst (type, vf)),
1792 log_vf),
1793 build_int_cst (type, 1));
1794 step_vector = build_one_cst (type);
1795 }
1796 else
1797 {
1798 niters_vector = ni_minus_gap;
1799 step_vector = build_int_cst (type, vf);
1800 }
1801
1802 if (!is_gimple_val (niters_vector))
1803 {
1804 var = create_tmp_var (type, "bnd");
1805 gimple_seq stmts = NULL;
1806 niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
1807 gsi_insert_seq_on_edge_immediate (pe, stmts);
1808 /* The peeling algorithm guarantees that the vector loop bound is at least
1809 ONE; we set range information to make the niters analyzer's life easier.  */
1810 if (stmts != NULL && log_vf)
1811 set_range_info (niters_vector, VR_RANGE,
1812 wi::to_wide (build_int_cst (type, 1)),
1813 wi::to_wide (fold_build2 (RSHIFT_EXPR, type,
1814 TYPE_MAX_VALUE (type),
1815 log_vf)));
1816 }
1817 *niters_vector_ptr = niters_vector;
1818 *step_vector_ptr = step_vector;
1819
1820 return;
1821 }
1822
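/* Illustrative sketch only (not part of GCC): the two formulas used by
   vect_gen_vector_loop_niters above when the vectorization factor VF is a
   known power of two and the loop is not fully masked.  If NITERS cannot
   wrap around zero we can simply shift; otherwise we use the fact that at
   least one vector iteration will run and add it back explicitly.  Names
   are hypothetical.  */

static unsigned int
example_vector_loop_niters (unsigned int niters, unsigned int log_vf,
                            int niters_no_overflow)
{
  unsigned int vf = 1u << log_vf;

  if (niters_no_overflow)
    return niters >> log_vf;                    /* niters / vf */
  /* Equal to the above when no wrap occurs, and still correct in unsigned
     arithmetic if NITERS wrapped to zero.  */
  return ((niters - vf) >> log_vf) + 1;
}
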
1823 /* Given NITERS_VECTOR which is the number of iterations for vectorized
1824 loop specified by LOOP_VINFO after vectorization, compute the number
1825 of iterations before vectorization (niters_vector * vf) and store it
1826 to NITERS_VECTOR_MULT_VF_PTR. */
1827
1828 static void
1829 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
1830 tree niters_vector,
1831 tree *niters_vector_mult_vf_ptr)
1832 {
1833 /* We should be using a step_vector of VF if VF is variable. */
1834 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
1835 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1836 tree type = TREE_TYPE (niters_vector);
1837 tree log_vf = build_int_cst (type, exact_log2 (vf));
1838 basic_block exit_bb = single_exit (loop)->dest;
1839
1840 gcc_assert (niters_vector_mult_vf_ptr != NULL);
1841 tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
1842 niters_vector, log_vf);
1843 if (!is_gimple_val (niters_vector_mult_vf))
1844 {
1845 tree var = create_tmp_var (type, "niters_vector_mult_vf");
1846 gimple_seq stmts = NULL;
1847 niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
1848 &stmts, true, var);
1849 gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
1850 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1851 }
1852 *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
1853 }
1854
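/* Illustrative sketch only (not part of GCC): the value installed by
   vect_gen_vector_loop_niters_mult_vf above, i.e. the number of scalar
   iterations covered by the vector loop, recovered from the vector
   iteration count by shifting by log2 (VF).  Names are hypothetical.  */

static unsigned int
example_niters_vector_mult_vf (unsigned int niters_vector, unsigned int log_vf)
{
  return niters_vector << log_vf;               /* niters_vector * vf */
}
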
1855 /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplicates FIRST/SECOND
1856 from SECOND/FIRST and puts it at the original loop's preheader/exit
1857 edge; the two loops are arranged as below:
1858
1859 preheader_a:
1860 first_loop:
1861 header_a:
1862 i_1 = PHI<i_0, i_2>;
1863 ...
1864 i_2 = i_1 + 1;
1865 if (cond_a)
1866 goto latch_a;
1867 else
1868 goto between_bb;
1869 latch_a:
1870 goto header_a;
1871
1872 between_bb:
1873 ;; i_x = PHI<i_2>; ;; LCSSA phi node to be created for FIRST,
1874
1875 second_loop:
1876 header_b:
1877 i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
1878 or with i_2 if no LCSSA phi is created
1879 under condition of CREATE_LCSSA_FOR_IV_PHIS.
1880 ...
1881 i_4 = i_3 + 1;
1882 if (cond_b)
1883 goto latch_b;
1884 else
1885 goto exit_bb;
1886 latch_b:
1887 goto header_b;
1888
1889 exit_bb:
1890
1891 This function creates loop-closed SSA for the first loop and updates the
1892 second loop's PHI nodes by replacing the argument on the incoming edge
1893 with the result of the newly created lcssa PHI nodes.  If
1894 CREATE_LCSSA_FOR_IV_PHIS is false, loop-closed ssa phis will only be
1895 created for non-iv phis of the first loop.
1896 
1897 This function assumes the exit bb of the first loop is the preheader bb
1898 of the second loop, i.e., between_bb in the example code.  With the PHIs
1899 updated, the second loop will execute the remaining iterations of the first.  */
1900
1901 static void
1902 slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
1903 struct loop *first, struct loop *second,
1904 bool create_lcssa_for_iv_phis)
1905 {
1906 gphi_iterator gsi_update, gsi_orig;
1907 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1908
1909 edge first_latch_e = EDGE_SUCC (first->latch, 0);
1910 edge second_preheader_e = loop_preheader_edge (second);
1911 basic_block between_bb = single_exit (first)->dest;
1912
1913 gcc_assert (between_bb == second_preheader_e->src);
1914 gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
1915 /* Either the first loop or the second is the loop to be vectorized. */
1916 gcc_assert (loop == first || loop == second);
1917
1918 for (gsi_orig = gsi_start_phis (first->header),
1919 gsi_update = gsi_start_phis (second->header);
1920 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
1921 gsi_next (&gsi_orig), gsi_next (&gsi_update))
1922 {
1923 gphi *orig_phi = gsi_orig.phi ();
1924 gphi *update_phi = gsi_update.phi ();
1925
1926 tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
1927 /* Generate lcssa PHI node for the first loop. */
1928 gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
1929 if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi))
1930 {
1931 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
1932 gphi *lcssa_phi = create_phi_node (new_res, between_bb);
1933 add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
1934 arg = new_res;
1935 }
1936
1937 /* Update PHI node in the second loop by replacing arg on the loop's
1938 incoming edge. */
1939 adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
1940 }
1941 }
1942
1943 /* Function slpeel_add_loop_guard adds a guard skipping from the beginning
1944 of SKIP_LOOP to the beginning of UPDATE_LOOP.  GUARD_EDGE and MERGE_EDGE
1945 are the two pred edges of the merge point before UPDATE_LOOP.  The two loops
1946 appear like below:
1947
1948 guard_bb:
1949 if (cond)
1950 goto merge_bb;
1951 else
1952 goto skip_loop;
1953
1954 skip_loop:
1955 header_a:
1956 i_1 = PHI<i_0, i_2>;
1957 ...
1958 i_2 = i_1 + 1;
1959 if (cond_a)
1960 goto latch_a;
1961 else
1962 goto exit_a;
1963 latch_a:
1964 goto header_a;
1965
1966 exit_a:
1967 i_5 = PHI<i_2>;
1968
1969 merge_bb:
1970 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
1971
1972 update_loop:
1973 header_b:
1974 i_3 = PHI<i_5, i_4>; ;; Use of i_5 to be replaced with i_x.
1975 ...
1976 i_4 = i_3 + 1;
1977 if (cond_b)
1978 goto latch_b;
1979 else
1980 goto exit_bb;
1981 latch_b:
1982 goto header_b;
1983
1984 exit_bb:
1985
1986 This function creates PHI nodes at merge_bb and replaces the use of i_5
1987 in the update_loop's PHI node with the result of the new PHI node.  */
1988
1989 static void
1990 slpeel_update_phi_nodes_for_guard1 (struct loop *skip_loop,
1991 struct loop *update_loop,
1992 edge guard_edge, edge merge_edge)
1993 {
1994 source_location merge_loc, guard_loc;
1995 edge orig_e = loop_preheader_edge (skip_loop);
1996 edge update_e = loop_preheader_edge (update_loop);
1997 gphi_iterator gsi_orig, gsi_update;
1998
1999 for ((gsi_orig = gsi_start_phis (skip_loop->header),
2000 gsi_update = gsi_start_phis (update_loop->header));
2001 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2002 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2003 {
2004 gphi *orig_phi = gsi_orig.phi ();
2005 gphi *update_phi = gsi_update.phi ();
2006
2007 /* Generate new phi node at merge bb of the guard. */
2008 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2009 gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2010
2011 /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set the
2012 args in NEW_PHI for these edges. */
2013 tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2014 tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2015 merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2016 guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2017 add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2018 add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2019
2020 /* Update phi in UPDATE_PHI. */
2021 adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2022 }
2023 }
2024
2025 /* LCSSA_PHI is an lcssa phi of the EPILOG loop, which is copied from LOOP.
2026 This function searches for the corresponding lcssa phi node in the exit
2027 bb of LOOP.  If it is found, return the phi result; otherwise return
2028 NULL. */
2029
2030 static tree
2031 find_guard_arg (struct loop *loop, struct loop *epilog ATTRIBUTE_UNUSED,
2032 gphi *lcssa_phi)
2033 {
2034 gphi_iterator gsi;
2035 edge e = single_exit (loop);
2036
2037 gcc_assert (single_pred_p (e->dest));
2038 for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2039 {
2040 gphi *phi = gsi.phi ();
2041 if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2042 PHI_ARG_DEF (lcssa_phi, 0), 0))
2043 return PHI_RESULT (phi);
2044 }
2045 return NULL_TREE;
2046 }
2047
2048 /* LOOP and EPILOG are two consecutive loops in the CFG and EPILOG is copied
2049 from LOOP.  Function slpeel_add_loop_guard adds a guard skipping from a
2050 point between the two loops to the end of EPILOG. Edges GUARD_EDGE
2051 and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2052 The CFG looks like:
2053
2054 loop:
2055 header_a:
2056 i_1 = PHI<i_0, i_2>;
2057 ...
2058 i_2 = i_1 + 1;
2059 if (cond_a)
2060 goto latch_a;
2061 else
2062 goto exit_a;
2063 latch_a:
2064 goto header_a;
2065
2066 exit_a:
2067
2068 guard_bb:
2069 if (cond)
2070 goto merge_bb;
2071 else
2072 goto epilog_loop;
2073
2074 ;; fall_through_bb
2075
2076 epilog_loop:
2077 header_b:
2078 i_3 = PHI<i_2, i_4>;
2079 ...
2080 i_4 = i_3 + 1;
2081 if (cond_b)
2082 goto latch_b;
2083 else
2084 goto merge_bb;
2085 latch_b:
2086 goto header_b;
2087
2088 merge_bb:
2089 ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2090
2091 exit_bb:
2092 i_x = PHI<i_4>; ;Use of i_4 to be replaced with i_y in merge_bb.
2093
2094 For each name used outside EPILOG (i.e. for each name that has an lcssa
2095 phi in exit_bb) we create a new PHI in merge_bb.  The new PHI has two
2096 args corresponding to GUARD_EDGE and MERGE_EDGE.  The arg for MERGE_EDGE
2097 is the arg of the original PHI in exit_bb, while the arg for GUARD_EDGE
2098 is defined by LOOP and is found in the exit bb of LOOP.  The arg of the
2099 original PHI in exit_bb will also be updated.  */
2100
2101 static void
2102 slpeel_update_phi_nodes_for_guard2 (struct loop *loop, struct loop *epilog,
2103 edge guard_edge, edge merge_edge)
2104 {
2105 gphi_iterator gsi;
2106 basic_block merge_bb = guard_edge->dest;
2107
2108 gcc_assert (single_succ_p (merge_bb));
2109 edge e = single_succ_edge (merge_bb);
2110 basic_block exit_bb = e->dest;
2111 gcc_assert (single_pred_p (exit_bb));
2112 gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2113
2114 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2115 {
2116 gphi *update_phi = gsi.phi ();
2117 tree old_arg = PHI_ARG_DEF (update_phi, 0);
2118 /* This loop-closed-phi actually doesn't represent a use out of the
2119 loop - the phi arg is a constant. */
2120 if (TREE_CODE (old_arg) != SSA_NAME)
2121 continue;
2122
2123 tree merge_arg = get_current_def (old_arg);
2124 if (!merge_arg)
2125 merge_arg = old_arg;
2126
2127 tree guard_arg = find_guard_arg (loop, epilog, update_phi);
2128 /* If the var is live after loop but not a reduction, we simply
2129 use the old arg. */
2130 if (!guard_arg)
2131 guard_arg = old_arg;
2132
2133 /* Create new phi node in MERGE_BB: */
2134 tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2135 gphi *merge_phi = create_phi_node (new_res, merge_bb);
2136
2137 /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE.  Set
2138 the two PHI args in merge_phi for these edges.  */
2139 add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2140 add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2141
2142 /* Update the original phi in exit_bb. */
2143 adjust_phi_and_debug_stmts (update_phi, e, new_res);
2144 }
2145 }
2146
2147 /* The EPILOG loop is duplicated from the original loop for vectorizing;
2148 the arg of its loop-closed ssa PHI needs to be updated.  */
2149
2150 static void
2151 slpeel_update_phi_nodes_for_lcssa (struct loop *epilog)
2152 {
2153 gphi_iterator gsi;
2154 basic_block exit_bb = single_exit (epilog)->dest;
2155
2156 gcc_assert (single_pred_p (exit_bb));
2157 edge e = EDGE_PRED (exit_bb, 0);
2158 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2159 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
2160 }
2161
2162 /* Function vect_do_peeling.
2163
2164 Input:
2165 - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2166
2167 preheader:
2168 LOOP:
2169 header_bb:
2170 loop_body
2171 if (exit_loop_cond) goto exit_bb
2172 else goto header_bb
2173 exit_bb:
2174
2175 - NITERS: The number of iterations of the loop.
2176 - NITERSM1: The number of iterations of the loop's latch.
2177 - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2178 - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2179 CHECK_PROFITABILITY is true.
2180 Output:
2181 - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2182 iterate after vectorization; see vect_set_loop_condition for details.
2183 - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2184 should be set to the number of scalar iterations handled by the
2185 vector loop. The SSA name is only used on exit from the loop.
2186
2187 This function peels prolog and epilog from the loop, adds guards skipping
2188 PROLOG and EPILOG for various conditions. As a result, the changed CFG
2189 would look like:
2190
2191 guard_bb_1:
2192 if (prefer_scalar_loop) goto merge_bb_1
2193 else goto guard_bb_2
2194
2195 guard_bb_2:
2196 if (skip_prolog) goto merge_bb_2
2197 else goto prolog_preheader
2198
2199 prolog_preheader:
2200 PROLOG:
2201 prolog_header_bb:
2202 prolog_body
2203 if (exit_prolog_cond) goto prolog_exit_bb
2204 else goto prolog_header_bb
2205 prolog_exit_bb:
2206
2207 merge_bb_2:
2208
2209 vector_preheader:
2210 VECTOR LOOP:
2211 vector_header_bb:
2212 vector_body
2213 if (exit_vector_cond) goto vector_exit_bb
2214 else goto vector_header_bb
2215 vector_exit_bb:
2216
2217 guard_bb_3:
2218 if (skip_epilog) goto merge_bb_3
2219 else goto epilog_preheader
2220
2221 merge_bb_1:
2222
2223 epilog_preheader:
2224 EPILOG:
2225 epilog_header_bb:
2226 epilog_body
2227 if (exit_epilog_cond) goto merge_bb_3
2228 else goto epilog_header_bb
2229
2230 merge_bb_3:
2231
2232 Note this function peels the prolog and epilog, and adds the guards,
2233 only when necessary.
2234 Returns the created epilogue or NULL.
2235
2236 TODO: Guard for prefer_scalar_loop should be emitted along with
2237 versioning conditions if loop versioning is needed. */
2238
2239
2240 struct loop *
2241 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
2242 tree *niters_vector, tree *step_vector,
2243 tree *niters_vector_mult_vf_var, int th,
2244 bool check_profitability, bool niters_no_overflow)
2245 {
2246 edge e, guard_e;
2247 tree type = TREE_TYPE (niters), guard_cond;
2248 basic_block guard_bb, guard_to;
2249 profile_probability prob_prolog, prob_vector, prob_epilog;
2250 int bound_prolog = 0;
2251 poly_uint64 bound_scalar = 0;
2252 int estimated_vf;
2253 int prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2254 bool epilog_peeling = (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2255 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
2256
2257 if (!prolog_peeling && !epilog_peeling)
2258 return NULL;
2259
2260 prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
2261 estimated_vf = vect_vf_for_cost (loop_vinfo);
2262 if (estimated_vf == 2)
2263 estimated_vf = 3;
2264 prob_prolog = prob_epilog = profile_probability::guessed_always ()
2265 .apply_scale (estimated_vf - 1, estimated_vf);
2266 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2267
2268 struct loop *prolog, *epilog = NULL, *loop = LOOP_VINFO_LOOP (loop_vinfo);
2269 struct loop *first_loop = loop;
2270 bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
2271 create_lcssa_for_virtual_phi (loop);
2272 update_ssa (TODO_update_ssa_only_virtuals);
2273
2274 if (MAY_HAVE_DEBUG_BIND_STMTS)
2275 {
2276 gcc_assert (!adjust_vec.exists ());
2277 adjust_vec.create (32);
2278 }
2279 initialize_original_copy_tables ();
2280
2281 /* Prolog loop may be skipped. */
2282 bool skip_prolog = (prolog_peeling != 0);
2283 /* Skip to epilog if scalar loop may be preferred. It's only needed
2284 when we peel for epilog loop and when it hasn't been checked with
2285 loop versioning. */
2286 bool skip_vector = ((!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2287 && !LOOP_REQUIRES_VERSIONING (loop_vinfo))
2288 || !vf.is_constant ());
2289 /* Epilog loop must be executed if the number of iterations for epilog
2290 loop is known at compile time, otherwise we need to add a check at
2291 the end of vector loop and skip to the end of epilog loop. */
2292 bool skip_epilog = (prolog_peeling < 0
2293 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2294 || !vf.is_constant ());
2295 /* PEELING_FOR_GAPS is special because epilog loop must be executed. */
2296 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2297 skip_epilog = false;
2298
2299 /* Record the anchor bb at which guard should be placed if scalar loop
2300 may be preferred. */
2301 basic_block anchor = loop_preheader_edge (loop)->src;
2302 if (skip_vector)
2303 {
2304 split_edge (loop_preheader_edge (loop));
2305
2306 /* Due to the order in which we peel prolog and epilog, we first
2307 propagate probability to the whole loop. The purpose is to
2308 avoid adjusting probabilities of both prolog and vector loops
2309 separately. Note in this case, the probability of epilog loop
2310 needs to be scaled back later. */
2311 basic_block bb_before_loop = loop_preheader_edge (loop)->src;
2312 if (prob_vector.initialized_p ())
2313 {
2314 scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2315 scale_loop_profile (loop, prob_vector, 0);
2316 }
2317 }
2318
2319 tree niters_prolog = build_int_cst (type, 0);
2320 source_location loop_loc = find_loop_location (loop);
2321 struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2322 if (prolog_peeling)
2323 {
2324 e = loop_preheader_edge (loop);
2325 if (!slpeel_can_duplicate_loop_p (loop, e))
2326 {
2327 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2328 "loop can't be duplicated to preheader edge.\n");
2329 gcc_unreachable ();
2330 }
2331 /* Peel prolog and put it on preheader edge of loop. */
2332 prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2333 if (!prolog)
2334 {
2335 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2336 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2337 gcc_unreachable ();
2338 }
2339 slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
2340 first_loop = prolog;
2341 reset_original_copy_tables ();
2342
2343 /* Generate and update the number of iterations for prolog loop. */
2344 niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
2345 &bound_prolog);
2346 tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
2347 vect_set_loop_condition (prolog, NULL, niters_prolog,
2348 step_prolog, NULL_TREE, false);
2349
2350 /* Skip the prolog loop. */
2351 if (skip_prolog)
2352 {
2353 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2354 niters_prolog, build_int_cst (type, 0));
2355 guard_bb = loop_preheader_edge (prolog)->src;
2356 basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
2357 guard_to = split_edge (loop_preheader_edge (loop));
2358 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2359 guard_to, guard_bb,
2360 prob_prolog.invert (),
2361 irred_flag);
2362 e = EDGE_PRED (guard_to, 0);
2363 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2364 slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
2365
2366 scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
2367 scale_loop_profile (prolog, prob_prolog, bound_prolog);
2368 }
2369 /* Update init address of DRs. */
2370 vect_update_inits_of_drs (loop_vinfo, niters_prolog);
2371 /* Update niters for vector loop. */
2372 LOOP_VINFO_NITERS (loop_vinfo)
2373 = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
2374 LOOP_VINFO_NITERSM1 (loop_vinfo)
2375 = fold_build2 (MINUS_EXPR, type,
2376 LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
2377 bool new_var_p = false;
2378 niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2379 /* It's guaranteed that vector loop bound before vectorization is at
2380 least VF, so set range information for newly generated var. */
2381 if (new_var_p)
2382 set_range_info (niters, VR_RANGE,
2383 wi::to_wide (build_int_cst (type, vf)),
2384 wi::to_wide (TYPE_MAX_VALUE (type)));
2385
2386 /* Prolog iterates at most bound_prolog times, latch iterates at
2387 most bound_prolog - 1 times. */
2388 record_niter_bound (prolog, bound_prolog - 1, false, true);
2389 delete_update_ssa ();
2390 adjust_vec_debug_stmts ();
2391 scev_reset ();
2392 }
2393
2394 if (epilog_peeling)
2395 {
2396 e = single_exit (loop);
2397 if (!slpeel_can_duplicate_loop_p (loop, e))
2398 {
2399 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2400 "loop can't be duplicated to exit edge.\n");
2401 gcc_unreachable ();
2402 }
2403 /* Peel epilog and put it on exit edge of loop. */
2404 epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2405 if (!epilog)
2406 {
2407 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2408 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2409 gcc_unreachable ();
2410 }
2411 slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
2412
2413 /* The scalar version of the loop may be preferred.  In this case, add
2414 a guard and skip to the epilog.  Note this only happens when the number
2415 of iterations of the loop is unknown at compile time; otherwise this
2416 won't be vectorized.  */
2417 if (skip_vector)
2418 {
2419 /* Additional epilogue iteration is peeled if gap exists. */
2420 bool peel_for_gaps = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2421 tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
2422 bound_prolog,
2423 peel_for_gaps ? vf : vf - 1,
2424 th, &bound_scalar,
2425 check_profitability);
2426 /* Build guard against NITERSM1 since NITERS may overflow. */
2427 guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
2428 guard_bb = anchor;
2429 guard_to = split_edge (loop_preheader_edge (epilog));
2430 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2431 guard_to, guard_bb,
2432 prob_vector.invert (),
2433 irred_flag);
2434 e = EDGE_PRED (guard_to, 0);
2435 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2436 slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
2437
2438 /* Simply propagate profile info from guard_bb to guard_to which is
2439 a merge point of control flow. */
2440 guard_to->count = guard_bb->count;
2441
2442 /* Scale probability of epilog loop back.
2443 FIXME: We should avoid scaling down and back up. Profile may
2444 get lost if we scale down to 0. */
2445 basic_block *bbs = get_loop_body (epilog);
2446 for (unsigned int i = 0; i < epilog->num_nodes; i++)
2447 bbs[i]->count = bbs[i]->count.apply_scale
2448 (bbs[i]->count,
2449 bbs[i]->count.apply_probability
2450 (prob_vector));
2451 free (bbs);
2452 }
2453
2454 basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
2455 tree niters_vector_mult_vf;
2456 /* If the loop is peeled a non-zero constant number of times, niters now
2457 refers to orig_niters - prolog_peeling; it won't overflow even if
2458 orig_niters overflows.  */
2459 niters_no_overflow |= (prolog_peeling > 0);
2460 vect_gen_vector_loop_niters (loop_vinfo, niters,
2461 niters_vector, step_vector,
2462 niters_no_overflow);
2463 if (!integer_onep (*step_vector))
2464 {
2465 /* On exit from the loop we will have an easy way of calculating
2466 NITERS_VECTOR / STEP * STEP.  Install a dummy definition
2467 until then.  */
2468 niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
2469 SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
2470 *niters_vector_mult_vf_var = niters_vector_mult_vf;
2471 }
2472 else
2473 vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
2474 &niters_vector_mult_vf);
2475 /* Update IVs of original loop as if they were advanced by
2476 niters_vector_mult_vf steps. */
2477 gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
2478 edge update_e = skip_vector ? e : loop_preheader_edge (epilog);
2479 vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
2480 update_e);
2481
2482 if (skip_epilog)
2483 {
2484 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2485 niters, niters_vector_mult_vf);
2486 guard_bb = single_exit (loop)->dest;
2487 guard_to = split_edge (single_exit (epilog));
2488 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
2489 skip_vector ? anchor : guard_bb,
2490 prob_epilog.invert (),
2491 irred_flag);
2492 slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
2493 single_exit (epilog));
2494 /* Only need to handle basic block before epilog loop if it's not
2495 the guard_bb, which is the case when skip_vector is true. */
2496 if (guard_bb != bb_before_epilog)
2497 {
2498 prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
2499
2500 scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
2501 }
2502 scale_loop_profile (epilog, prob_epilog, 0);
2503 }
2504 else
2505 slpeel_update_phi_nodes_for_lcssa (epilog);
2506
2507 unsigned HOST_WIDE_INT bound1, bound2;
2508 if (vf.is_constant (&bound1) && bound_scalar.is_constant (&bound2))
2509 {
2510 bound1 -= LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 2;
2511 if (bound2)
2512 /* We share epilog loop with scalar version loop. */
2513 bound1 = MAX (bound1, bound2 - 1);
2514 record_niter_bound (epilog, bound1, false, true);
2515 }
2516
2517 delete_update_ssa ();
2518 adjust_vec_debug_stmts ();
2519 scev_reset ();
2520 }
2521 adjust_vec.release ();
2522 free_original_copy_tables ();
2523
2524 return epilog;
2525 }
2526
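/* Illustrative sketch only (not part of GCC): roughly the run-time shape
   that vect_do_peeling above gives a simple loop such as
   "for (i = 0; i < n; i++) a[i] += 1", assuming VF == 4 and that both a
   prolog (for alignment) and an epilog (for leftover iterations) are
   peeled.  The guards and probabilities from the CFG sketch above are
   omitted, and the "vector" body is shown as four scalar updates; all
   names are hypothetical.  */

static void
example_peeled_loop_shape (int *a, unsigned int n, unsigned int prolog_niters)
{
  unsigned int i = 0;
  unsigned int vec_end;

  /* PROLOG: a few scalar iterations so the vector loop starts aligned.  */
  for (; i < prolog_niters && i < n; i++)
    a[i] += 1;

  /* VECTOR LOOP: handles 4 elements per iteration.  */
  vec_end = i + ((n - i) / 4) * 4;
  for (; i < vec_end; i += 4)
    {
      a[i] += 1;
      a[i + 1] += 1;
      a[i + 2] += 1;
      a[i + 3] += 1;
    }

  /* EPILOG: scalar loop for the remaining iterations.  */
  for (; i < n; i++)
    a[i] += 1;
}
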
2527 /* Function vect_create_cond_for_niters_checks.
2528
2529 Create a conditional expression that represents the run-time checks for
2530 the loop's niter.  The loop is guaranteed to terminate if the run-time
2531 checks hold.
2532
2533 Input:
2534 COND_EXPR - input conditional expression.  New conditions will be chained
2535 with logical AND operation.
2536 LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the
2537 assumptions on the number of iterations that need to be
2538 checked at run time.
2539
2540 Output:
2541 COND_EXPR - conditional expression.
2542
2543 The returned COND_EXPR is the conditional expression to be used in the
2544 if statement that controls which version of the loop gets executed at
2545 runtime. */
2546
2547 static void
2548 vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
2549 {
2550 tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
2551
2552 if (*cond_expr)
2553 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2554 *cond_expr, part_cond_expr);
2555 else
2556 *cond_expr = part_cond_expr;
2557 }
2558
2559 /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
2560 and PART_COND_EXPR are true. Treat a null *COND_EXPR as "true". */
2561
2562 static void
2563 chain_cond_expr (tree *cond_expr, tree part_cond_expr)
2564 {
2565 if (*cond_expr)
2566 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2567 *cond_expr, part_cond_expr);
2568 else
2569 *cond_expr = part_cond_expr;
2570 }
2571
2572 /* Function vect_create_cond_for_align_checks.
2573
2574 Create a conditional expression that represents the alignment checks for
2575 all of the data references (array element references) whose alignment
2576 must be checked at runtime.
2577
2578 Input:
2579 COND_EXPR - input conditional expression. New conditions will be chained
2580 with logical AND operation.
2581 LOOP_VINFO - two fields of the loop information are used.
2582 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
2583 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
2584
2585 Output:
2586 COND_EXPR_STMT_LIST - statements needed to construct the conditional
2587 expression.
2588 The returned value is the conditional expression to be used in the if
2589 statement that controls which version of the loop gets executed at runtime.
2590
2591 The algorithm makes two assumptions:
2592 1) The number of bytes "n" in a vector is a power of 2.
2593 2) An address "a" is aligned if a%n is zero, and this
2594 test can be done as a&(n-1) == 0.  For example, for 16
2595 byte vectors the test is a&0xf == 0.  */
2596
2597 static void
2598 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
2599 tree *cond_expr,
2600 gimple_seq *cond_expr_stmt_list)
2601 {
2602 vec<gimple *> may_misalign_stmts
2603 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2604 gimple *ref_stmt;
2605 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
2606 tree mask_cst;
2607 unsigned int i;
2608 tree int_ptrsize_type;
2609 char tmp_name[20];
2610 tree or_tmp_name = NULL_TREE;
2611 tree and_tmp_name;
2612 gimple *and_stmt;
2613 tree ptrsize_zero;
2614 tree part_cond_expr;
2615
2616 /* Check that mask is one less than a power of 2, i.e., mask is
2617 all zeros followed by all ones. */
2618 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
2619
2620 int_ptrsize_type = signed_type_for (ptr_type_node);
2621
2622 /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
2623 of the first vector of the i'th data reference.  */
2624
2625 FOR_EACH_VEC_ELT (may_misalign_stmts, i, ref_stmt)
2626 {
2627 gimple_seq new_stmt_list = NULL;
2628 tree addr_base;
2629 tree addr_tmp_name;
2630 tree new_or_tmp_name;
2631 gimple *addr_stmt, *or_stmt;
2632 stmt_vec_info stmt_vinfo = vinfo_for_stmt (ref_stmt);
2633 tree vectype = STMT_VINFO_VECTYPE (stmt_vinfo);
2634 bool negative = tree_int_cst_compare
2635 (DR_STEP (STMT_VINFO_DATA_REF (stmt_vinfo)), size_zero_node) < 0;
2636 tree offset = negative
2637 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;
2638
2639 /* create: addr_tmp = (int)(address_of_first_vector) */
2640 addr_base =
2641 vect_create_addr_base_for_vector_ref (ref_stmt, &new_stmt_list,
2642 offset);
2643 if (new_stmt_list != NULL)
2644 gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
2645
2646 sprintf (tmp_name, "addr2int%d", i);
2647 addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
2648 addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
2649 gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
2650
2651 /* The addresses are ORed together.  */
2652
2653 if (or_tmp_name != NULL_TREE)
2654 {
2655 /* create: or_tmp = or_tmp | addr_tmp */
2656 sprintf (tmp_name, "orptrs%d", i);
2657 new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
2658 or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
2659 or_tmp_name, addr_tmp_name);
2660 gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
2661 or_tmp_name = new_or_tmp_name;
2662 }
2663 else
2664 or_tmp_name = addr_tmp_name;
2665
2666 } /* end for i */
2667
2668 mask_cst = build_int_cst (int_ptrsize_type, mask);
2669
2670 /* create: and_tmp = or_tmp & mask */
2671 and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
2672
2673 and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
2674 or_tmp_name, mask_cst);
2675 gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
2676
2677 /* Make and_tmp the left operand of the conditional test against zero.
2678 If and_tmp has a nonzero bit then some address is unaligned.  */
2679 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
2680 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
2681 and_tmp_name, ptrsize_zero);
2682 chain_cond_expr (cond_expr, part_cond_expr);
2683 }
2684
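/* Illustrative sketch only (not part of GCC): the check built by
   vect_create_cond_for_align_checks above, in plain arithmetic.  The
   addresses of all data references that might be misaligned are ORed
   together and masked; the result is zero only if every address is
   aligned to N bytes, where N (and hence MASK == N - 1) is a power of
   two.  Addresses are passed here as integers and all names are
   hypothetical.  */

static int
example_all_addresses_aligned (const unsigned long long *addrs,
                               unsigned int count, unsigned long long mask)
{
  unsigned long long or_tmp = 0;
  unsigned int i;

  for (i = 0; i < count; i++)
    or_tmp |= addrs[i];

  return (or_tmp & mask) == 0;
}
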
2685 /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
2686 create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
2687 Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
2688 and this new condition are true. Treat a null *COND_EXPR as "true". */
2689
2690 static void
2691 vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
2692 {
2693 vec<vec_object_pair> pairs = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
2694 unsigned int i;
2695 vec_object_pair *pair;
2696 FOR_EACH_VEC_ELT (pairs, i, pair)
2697 {
2698 tree addr1 = build_fold_addr_expr (pair->first);
2699 tree addr2 = build_fold_addr_expr (pair->second);
2700 tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
2701 addr1, addr2);
2702 chain_cond_expr (cond_expr, part_cond_expr);
2703 }
2704 }
2705
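/* Illustrative sketch only (not part of GCC): the condition built by
   vect_create_cond_for_unequal_addrs above, written for flat arrays of
   address pairs passed as integers.  The check succeeds only if every
   recorded pair refers to distinct addresses, i.e. &A_i != &B_i for all
   i.  All names are hypothetical.  */

static int
example_all_addresses_unequal (const unsigned long long *first_addrs,
                               const unsigned long long *second_addrs,
                               unsigned int count)
{
  unsigned int i;

  for (i = 0; i < count; i++)
    if (first_addrs[i] == second_addrs[i])
      return 0;
  return 1;
}
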
2706 /* Function vect_create_cond_for_alias_checks.
2707
2708 Create a conditional expression that represents the run-time checks for
2709 overlapping of address ranges represented by a list of data reference
2710 relations passed as input.
2711
2712 Input:
2713 COND_EXPR - input conditional expression. New conditions will be chained
2714 with logical AND operation. If it is NULL, then the function
2715 is used to return the number of alias checks.
2716 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
2717 to be checked.
2718
2719 Output:
2720 COND_EXPR - conditional expression.
2721
2722 The returned COND_EXPR is the conditional expression to be used in the if
2723 statement that controls which version of the loop gets executed at runtime.
2724 */
2725
2726 void
2727 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
2728 {
2729 vec<dr_with_seg_len_pair_t> comp_alias_ddrs =
2730 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2731
2732 if (comp_alias_ddrs.is_empty ())
2733 return;
2734
2735 create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
2736 &comp_alias_ddrs, cond_expr);
2737 if (dump_enabled_p ())
2738 dump_printf_loc (MSG_NOTE, vect_location,
2739 "created %u versioning for alias checks.\n",
2740 comp_alias_ddrs.length ());
2741 }
2742
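/* Illustrative sketch only (not part of GCC): one common form of the
   run-time alias test described above.  Two accessed segments
   [A, A + LEN_A) and [B, B + LEN_B) are independent if one ends before
   the other begins.  The checks actually emitted by
   create_runtime_alias_checks may take a different form; the addresses
   here are passed as integers and all names are hypothetical.  */

static int
example_segments_do_not_overlap (unsigned long long a, unsigned long long len_a,
                                 unsigned long long b, unsigned long long len_b)
{
  return a + len_a <= b || b + len_b <= a;
}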
2743
2744 /* Function vect_loop_versioning.
2745
2746 If the loop has data references that may or may not be aligned and/or
2747 has data reference relations whose independence was not proven, then
2748 two versions of the loop need to be generated, one which is vectorized
2749 and one which isn't.  A test is then generated to control which of the
2750 loops is executed.  The test checks for the alignment of all of the
2751 data references that may or may not be aligned.  An additional
2752 sequence of runtime tests is generated for each pair of DDRs whose
2753 independence was not proven.  The vectorized version of the loop is
2754 executed only if both the alias and alignment tests pass.
2755
2756 The test generated to check which version of loop is executed
2757 is modified to also check for profitability as indicated by the
2758 cost model threshold TH.
2759
2760 The versioning precondition(s) are placed in *COND_EXPR and
2761 *COND_EXPR_STMT_LIST. */
2762
2763 void
2764 vect_loop_versioning (loop_vec_info loop_vinfo,
2765 unsigned int th, bool check_profitability,
2766 poly_uint64 versioning_threshold)
2767 {
2768 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
2769 struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2770 basic_block condition_bb;
2771 gphi_iterator gsi;
2772 gimple_stmt_iterator cond_exp_gsi;
2773 basic_block merge_bb;
2774 basic_block new_exit_bb;
2775 edge new_exit_e, e;
2776 gphi *orig_phi, *new_phi;
2777 tree cond_expr = NULL_TREE;
2778 gimple_seq cond_expr_stmt_list = NULL;
2779 tree arg;
2780 profile_probability prob = profile_probability::likely ();
2781 gimple_seq gimplify_stmt_list = NULL;
2782 tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2783 bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
2784 bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
2785 bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
2786
2787 if (check_profitability)
2788 cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
2789 build_int_cst (TREE_TYPE (scalar_loop_iters),
2790 th - 1));
2791 if (maybe_ne (versioning_threshold, 0U))
2792 {
2793 tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
2794 build_int_cst (TREE_TYPE (scalar_loop_iters),
2795 versioning_threshold - 1));
2796 if (cond_expr)
2797 cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
2798 expr, cond_expr);
2799 else
2800 cond_expr = expr;
2801 }
2802
2803 if (version_niter)
2804 vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
2805
2806 if (cond_expr)
2807 cond_expr = force_gimple_operand_1 (cond_expr, &cond_expr_stmt_list,
2808 is_gimple_condexpr, NULL_TREE);
2809
2810 if (version_align)
2811 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
2812 &cond_expr_stmt_list);
2813
2814 if (version_alias)
2815 {
2816 vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
2817 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
2818 }
2819
2820 cond_expr = force_gimple_operand_1 (cond_expr, &gimplify_stmt_list,
2821 is_gimple_condexpr, NULL_TREE);
2822 gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
2823
2824 initialize_original_copy_tables ();
2825 if (scalar_loop)
2826 {
2827 edge scalar_e;
2828 basic_block preheader, scalar_preheader;
2829
2830 /* We don't want to scale SCALAR_LOOP's frequencies; we need to
2831 scale LOOP's frequencies instead.  */
2832 nloop = loop_version (scalar_loop, cond_expr, &condition_bb,
2833 prob, prob.invert (), prob, prob.invert (), true);
2834 scale_loop_frequencies (loop, prob);
2835 /* CONDITION_BB was created above SCALAR_LOOP's preheader,
2836 while we need to move it above LOOP's preheader. */
2837 e = loop_preheader_edge (loop);
2838 scalar_e = loop_preheader_edge (scalar_loop);
2839 gcc_assert (empty_block_p (e->src)
2840 && single_pred_p (e->src));
2841 gcc_assert (empty_block_p (scalar_e->src)
2842 && single_pred_p (scalar_e->src));
2843 gcc_assert (single_pred_p (condition_bb));
2844 preheader = e->src;
2845 scalar_preheader = scalar_e->src;
2846 scalar_e = find_edge (condition_bb, scalar_preheader);
2847 e = single_pred_edge (preheader);
2848 redirect_edge_and_branch_force (single_pred_edge (condition_bb),
2849 scalar_preheader);
2850 redirect_edge_and_branch_force (scalar_e, preheader);
2851 redirect_edge_and_branch_force (e, condition_bb);
2852 set_immediate_dominator (CDI_DOMINATORS, condition_bb,
2853 single_pred (condition_bb));
2854 set_immediate_dominator (CDI_DOMINATORS, scalar_preheader,
2855 single_pred (scalar_preheader));
2856 set_immediate_dominator (CDI_DOMINATORS, preheader,
2857 condition_bb);
2858 }
2859 else
2860 nloop = loop_version (loop, cond_expr, &condition_bb,
2861 prob, prob.invert (), prob, prob.invert (), true);
2862
2863 if (version_niter)
2864 {
2865 /* The versioned loop could be infinite; we need to clear the existing
2866 niter information, which is copied from the original loop.  */
2867 gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
2868 vect_free_loop_info_assumptions (nloop);
2869 /* And set constraint LOOP_C_INFINITE for niter analyzer. */
2870 loop_constraint_set (loop, LOOP_C_INFINITE);
2871 }
2872
2873 if (LOCATION_LOCUS (vect_location) != UNKNOWN_LOCATION
2874 && dump_enabled_p ())
2875 {
2876 if (version_alias)
2877 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
2878 "loop versioned for vectorization because of "
2879 "possible aliasing\n");
2880 if (version_align)
2881 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
2882 "loop versioned for vectorization to enhance "
2883 "alignment\n");
2884
2885 }
2886 free_original_copy_tables ();
2887
2888 /* Loop versioning violates an assumption we try to maintain during
2889 vectorization - that the loop exit block has a single predecessor.
2890 After versioning, the exit block of both loop versions is the same
2891 basic block (i.e. it has two predecessors).  In order to simplify the
2892 following transformations in the vectorizer, we fix this situation
2893 here by adding a new (empty) block on the exit-edge of the loop,
2894 with the proper loop-exit phis to maintain loop-closed-form.
2895 If loop versioning wasn't done from loop, but from scalar_loop instead,
2896 merge_bb will already have just a single predecessor.  */
2897
2898 merge_bb = single_exit (loop)->dest;
2899 if (scalar_loop == NULL || EDGE_COUNT (merge_bb->preds) >= 2)
2900 {
2901 gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
2902 new_exit_bb = split_edge (single_exit (loop));
2903 new_exit_e = single_exit (loop);
2904 e = EDGE_SUCC (new_exit_bb, 0);
2905
2906 for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2907 {
2908 tree new_res;
2909 orig_phi = gsi.phi ();
2910 new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2911 new_phi = create_phi_node (new_res, new_exit_bb);
2912 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
2913 add_phi_arg (new_phi, arg, new_exit_e,
2914 gimple_phi_arg_location_from_edge (orig_phi, e));
2915 adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
2916 }
2917 }
2918
2919 /* End loop-exit-fixes after versioning. */
2920
2921 if (cond_expr_stmt_list)
2922 {
2923 cond_exp_gsi = gsi_last_bb (condition_bb);
2924 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
2925 GSI_SAME_STMT);
2926 }
2927 update_ssa (TODO_update_ssa);
2928 }
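
/* Illustrative sketch only (not part of GCC): roughly the run-time shape
   produced by vect_loop_versioning above for a simple copy loop.  The
   single CHECKS_HOLD flag stands for the chained profitability, niter,
   alignment and alias conditions built in COND_EXPR; the "vectorized"
   body is shown as a scalar loop for brevity.  All names are
   hypothetical.  */

static void
example_versioned_loop_shape (int *dst, const int *src, unsigned int n,
                              int checks_hold)
{
  unsigned int i;

  if (checks_hold)
    {
      /* Version selected when all run-time checks pass; this is the copy
         that the vectorizer transforms.  */
      for (i = 0; i < n; i++)
        dst[i] = src[i];
    }
  else
    {
      /* Fallback scalar version, left untouched.  */
      for (i = 0; i < n; i++)
        dst[i] = src[i];
    }
}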