]> git.ipfire.org Git - thirdparty/gcc.git/blob - gcc/tree-vect-loop-manip.c
2019-06-13 Richard Biener <rguenther@suse.de>
[thirdparty/gcc.git] / gcc / tree-vect-loop-manip.c
1 /* Vectorizer Specific Loop Manipulations
2 Copyright (C) 2003-2019 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com>
4 and Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "tree.h"
27 #include "gimple.h"
28 #include "cfghooks.h"
29 #include "tree-pass.h"
30 #include "ssa.h"
31 #include "fold-const.h"
32 #include "cfganal.h"
33 #include "gimplify.h"
34 #include "gimple-iterator.h"
35 #include "gimplify-me.h"
36 #include "tree-cfg.h"
37 #include "tree-ssa-loop-manip.h"
38 #include "tree-into-ssa.h"
39 #include "tree-ssa.h"
40 #include "cfgloop.h"
41 #include "tree-scalar-evolution.h"
42 #include "tree-vectorizer.h"
43 #include "tree-ssa-loop-ivopts.h"
44 #include "gimple-fold.h"
45 #include "tree-ssa-loop-niter.h"
46 #include "internal-fn.h"
47 #include "stor-layout.h"
48 #include "optabs-query.h"
49 #include "vec-perm-indices.h"
50
51 /*************************************************************************
52 Simple Loop Peeling Utilities
53
54 Utilities to support loop peeling for vectorization purposes.
55 *************************************************************************/
56
57
58 /* Renames the use *OP_P. */
59
60 static void
61 rename_use_op (use_operand_p op_p)
62 {
63 tree new_name;
64
65 if (TREE_CODE (USE_FROM_PTR (op_p)) != SSA_NAME)
66 return;
67
68 new_name = get_current_def (USE_FROM_PTR (op_p));
69
70 /* Something defined outside of the loop. */
71 if (!new_name)
72 return;
73
74 /* An ordinary ssa name defined in the loop. */
75
76 SET_USE (op_p, new_name);
77 }
78
79
80 /* Renames the variables in basic block BB. Allow renaming of PHI arguments
81 on edges incoming from outer-block header if RENAME_FROM_OUTER_LOOP is
82 true. */
83
84 static void
85 rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
86 {
87 gimple *stmt;
88 use_operand_p use_p;
89 ssa_op_iter iter;
90 edge e;
91 edge_iterator ei;
92 struct loop *loop = bb->loop_father;
93 struct loop *outer_loop = NULL;
94
95 if (rename_from_outer_loop)
96 {
97 gcc_assert (loop);
98 outer_loop = loop_outer (loop);
99 }
100
101 for (gimple_stmt_iterator gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
102 gsi_next (&gsi))
103 {
104 stmt = gsi_stmt (gsi);
105 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_ALL_USES)
106 rename_use_op (use_p);
107 }
108
109 FOR_EACH_EDGE (e, ei, bb->preds)
110 {
111 if (!flow_bb_inside_loop_p (loop, e->src))
112 {
113 if (!rename_from_outer_loop)
114 continue;
115 if (e->src != outer_loop->header)
116 {
117 if (outer_loop->inner->next)
118 {
119 /* If outer_loop has 2 inner loops, allow there to
120 be an extra basic block which decides which of the
121 two loops to use using LOOP_VECTORIZED. */
122 if (!single_pred_p (e->src)
123 || single_pred (e->src) != outer_loop->header)
124 continue;
125 }
126 }
127 }
128 for (gphi_iterator gsi = gsi_start_phis (bb); !gsi_end_p (gsi);
129 gsi_next (&gsi))
130 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
131 }
132 }
133
134
135 struct adjust_info
136 {
137 tree from, to;
138 basic_block bb;
139 };
140
141 /* A stack of values to be adjusted in debug stmts. We have to
142 process them LIFO, so that the closest substitution applies. If we
143 processed them FIFO, without the stack, we might substitute uses
144 with a PHI DEF that would soon become non-dominant, and when we got
145 to the suitable one, it wouldn't have anything to substitute any
146 more. */
147 static vec<adjust_info, va_heap> adjust_vec;
148
149 /* Adjust any debug stmts that referenced AI->from values to use the
150 loop-closed AI->to, if the references are dominated by AI->bb and
151 not by the definition of AI->from. */
152
153 static void
154 adjust_debug_stmts_now (adjust_info *ai)
155 {
156 basic_block bbphi = ai->bb;
157 tree orig_def = ai->from;
158 tree new_def = ai->to;
159 imm_use_iterator imm_iter;
160 gimple *stmt;
161 basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
162
163 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
164
165 /* Adjust any debug stmts that held onto non-loop-closed
166 references. */
167 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
168 {
169 use_operand_p use_p;
170 basic_block bbuse;
171
172 if (!is_gimple_debug (stmt))
173 continue;
174
175 gcc_assert (gimple_debug_bind_p (stmt));
176
177 bbuse = gimple_bb (stmt);
178
179 if ((bbuse == bbphi
180 || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
181 && !(bbuse == bbdef
182 || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
183 {
184 if (new_def)
185 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
186 SET_USE (use_p, new_def);
187 else
188 {
189 gimple_debug_bind_reset_value (stmt);
190 update_stmt (stmt);
191 }
192 }
193 }
194 }
195
196 /* Adjust debug stmts as scheduled before. */
197
198 static void
199 adjust_vec_debug_stmts (void)
200 {
201 if (!MAY_HAVE_DEBUG_BIND_STMTS)
202 return;
203
204 gcc_assert (adjust_vec.exists ());
205
206 while (!adjust_vec.is_empty ())
207 {
208 adjust_debug_stmts_now (&adjust_vec.last ());
209 adjust_vec.pop ();
210 }
211 }
212
213 /* Adjust any debug stmts that referenced FROM values to use the
214 loop-closed TO, if the references are dominated by BB and not by
215 the definition of FROM. If adjust_vec is non-NULL, adjustments
216 will be postponed until adjust_vec_debug_stmts is called. */
217
218 static void
219 adjust_debug_stmts (tree from, tree to, basic_block bb)
220 {
221 adjust_info ai;
222
223 if (MAY_HAVE_DEBUG_BIND_STMTS
224 && TREE_CODE (from) == SSA_NAME
225 && ! SSA_NAME_IS_DEFAULT_DEF (from)
226 && ! virtual_operand_p (from))
227 {
228 ai.from = from;
229 ai.to = to;
230 ai.bb = bb;
231
232 if (adjust_vec.exists ())
233 adjust_vec.safe_push (ai);
234 else
235 adjust_debug_stmts_now (&ai);
236 }
237 }
238
239 /* Change E's phi arg in UPDATE_PHI to NEW_DEF, and record information
240 to adjust any debug stmts that referenced the old phi arg,
241 presumably non-loop-closed references left over from other
242 transformations. */
243
244 static void
245 adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
246 {
247 tree orig_def = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
248
249 SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
250
251 if (MAY_HAVE_DEBUG_BIND_STMTS)
252 adjust_debug_stmts (orig_def, PHI_RESULT (update_phi),
253 gimple_bb (update_phi));
254 }
255
256 /* Define one loop mask MASK from loop LOOP. INIT_MASK is the value that
257 the mask should have during the first iteration and NEXT_MASK is the
258 value that it should have on subsequent iterations. */
259
260 static void
261 vect_set_loop_mask (struct loop *loop, tree mask, tree init_mask,
262 tree next_mask)
263 {
264 gphi *phi = create_phi_node (mask, loop->header);
265 add_phi_arg (phi, init_mask, loop_preheader_edge (loop), UNKNOWN_LOCATION);
266 add_phi_arg (phi, next_mask, loop_latch_edge (loop), UNKNOWN_LOCATION);
267 }
268
269 /* Add SEQ to the end of LOOP's preheader block. */
270
271 static void
272 add_preheader_seq (struct loop *loop, gimple_seq seq)
273 {
274 if (seq)
275 {
276 edge pe = loop_preheader_edge (loop);
277 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
278 gcc_assert (!new_bb);
279 }
280 }
281
282 /* Add SEQ to the beginning of LOOP's header block. */
283
284 static void
285 add_header_seq (struct loop *loop, gimple_seq seq)
286 {
287 if (seq)
288 {
289 gimple_stmt_iterator gsi = gsi_after_labels (loop->header);
290 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
291 }
292 }
293
294 /* Return true if the target can interleave elements of two vectors.
295 OFFSET is 0 if the first half of the vectors should be interleaved
296 or 1 if the second half should. When returning true, store the
297 associated permutation in INDICES. */
298
299 static bool
300 interleave_supported_p (vec_perm_indices *indices, tree vectype,
301 unsigned int offset)
302 {
303 poly_uint64 nelts = TYPE_VECTOR_SUBPARTS (vectype);
304 poly_uint64 base = exact_div (nelts, 2) * offset;
305 vec_perm_builder sel (nelts, 2, 3);
306 for (unsigned int i = 0; i < 3; ++i)
307 {
308 sel.quick_push (base + i);
309 sel.quick_push (base + i + nelts);
310 }
311 indices->new_vector (sel, 2, nelts);
312 return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
313 }
314
315 /* Try to use permutes to define the masks in DEST_RGM using the masks
316 in SRC_RGM, given that the former has twice as many masks as the
317 latter. Return true on success, adding any new statements to SEQ. */
318
319 static bool
320 vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm,
321 rgroup_masks *src_rgm)
322 {
323 tree src_masktype = src_rgm->mask_type;
324 tree dest_masktype = dest_rgm->mask_type;
325 machine_mode src_mode = TYPE_MODE (src_masktype);
326 if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
327 && optab_handler (vec_unpacku_hi_optab, src_mode) != CODE_FOR_nothing
328 && optab_handler (vec_unpacku_lo_optab, src_mode) != CODE_FOR_nothing)
329 {
330 /* Unpacking the source masks gives at least as many mask bits as
331 we need. We can then VIEW_CONVERT any excess bits away. */
332 tree unpack_masktype = vect_halve_mask_nunits (src_masktype);
333 for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
334 {
335 tree src = src_rgm->masks[i / 2];
336 tree dest = dest_rgm->masks[i];
337 tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
338 ? VEC_UNPACK_HI_EXPR
339 : VEC_UNPACK_LO_EXPR);
340 gassign *stmt;
341 if (dest_masktype == unpack_masktype)
342 stmt = gimple_build_assign (dest, code, src);
343 else
344 {
345 tree temp = make_ssa_name (unpack_masktype);
346 stmt = gimple_build_assign (temp, code, src);
347 gimple_seq_add_stmt (seq, stmt);
348 stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
349 build1 (VIEW_CONVERT_EXPR,
350 dest_masktype, temp));
351 }
352 gimple_seq_add_stmt (seq, stmt);
353 }
354 return true;
355 }
356 vec_perm_indices indices[2];
357 if (dest_masktype == src_masktype
358 && interleave_supported_p (&indices[0], src_masktype, 0)
359 && interleave_supported_p (&indices[1], src_masktype, 1))
360 {
361 /* The destination requires twice as many mask bits as the source, so
362 we can use interleaving permutes to double up the number of bits. */
363 tree masks[2];
364 for (unsigned int i = 0; i < 2; ++i)
365 masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
366 for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
367 {
368 tree src = src_rgm->masks[i / 2];
369 tree dest = dest_rgm->masks[i];
370 gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
371 src, src, masks[i & 1]);
372 gimple_seq_add_stmt (seq, stmt);
373 }
374 return true;
375 }
376 return false;
377 }
378
379 /* Helper for vect_set_loop_condition_masked. Generate definitions for
380 all the masks in RGM and return a mask that is nonzero when the loop
381 needs to iterate. Add any new preheader statements to PREHEADER_SEQ.
382 Use LOOP_COND_GSI to insert code before the exit gcond.
383
384 RGM belongs to loop LOOP. The loop originally iterated NITERS
385 times and has been vectorized according to LOOP_VINFO. Each iteration
386 of the vectorized loop handles VF iterations of the scalar loop.
387
388 If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
389 starts with NITERS_SKIP dummy iterations of the scalar loop before
390 the real work starts. The mask elements for these dummy iterations
391 must be 0, to ensure that the extra iterations do not have an effect.
392
393 It is known that:
394
395 NITERS * RGM->max_nscalars_per_iter
396
397 does not overflow. However, MIGHT_WRAP_P says whether an induction
398 variable that starts at 0 and has step:
399
400 VF * RGM->max_nscalars_per_iter
401
402 might overflow before hitting a value above:
403
404 (NITERS + NITERS_SKIP) * RGM->max_nscalars_per_iter
405
406 This means that we cannot guarantee that such an induction variable
407 would ever hit a value that produces a set of all-false masks for RGM. */
408
409 static tree
410 vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
411 gimple_seq *preheader_seq,
412 gimple_stmt_iterator loop_cond_gsi,
413 rgroup_masks *rgm, tree vf,
414 tree niters, tree niters_skip,
415 bool might_wrap_p)
416 {
417 tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
418 tree iv_type = LOOP_VINFO_MASK_IV_TYPE (loop_vinfo);
419 tree mask_type = rgm->mask_type;
420 unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
421 poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
422
423 /* Calculate the maximum number of scalar values that the rgroup
424 handles in total, the number that it handles for each iteration
425 of the vector loop, and the number that it should skip during the
426 first iteration of the vector loop. */
427 tree nscalars_total = niters;
428 tree nscalars_step = vf;
429 tree nscalars_skip = niters_skip;
430 if (nscalars_per_iter != 1)
431 {
432 /* We checked before choosing to use a fully-masked loop that these
433 multiplications don't overflow. */
434 tree factor = build_int_cst (compare_type, nscalars_per_iter);
435 nscalars_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
436 nscalars_total, factor);
437 nscalars_step = gimple_build (preheader_seq, MULT_EXPR, compare_type,
438 nscalars_step, factor);
439 if (nscalars_skip)
440 nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
441 nscalars_skip, factor);
442 }
443
444 /* Create an induction variable that counts the number of scalars
445 processed. */
446 tree index_before_incr, index_after_incr;
447 gimple_stmt_iterator incr_gsi;
448 bool insert_after;
449 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
450
451 tree zero_index = build_int_cst (iv_type, 0);
452 tree step = build_int_cst (iv_type,
453 LOOP_VINFO_VECT_FACTOR (loop_vinfo));
454 /* Create IV of iv_type. */
455 create_iv (zero_index, step, NULL_TREE, loop, &incr_gsi,
456 insert_after, &index_before_incr, &index_after_incr);
457
458 zero_index = build_int_cst (compare_type, 0);
459 tree test_index, test_limit, first_limit;
460 gimple_stmt_iterator *test_gsi;
461 if (might_wrap_p)
462 {
463 /* In principle the loop should stop iterating once the incremented
464 IV reaches a value greater than or equal to:
465
466 NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP
467
468 However, there's no guarantee that this addition doesn't overflow
469 the comparison type, or that the IV hits a value above it before
470 wrapping around. We therefore adjust the limit down by one
471 IV step:
472
473 (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
474 -[infinite-prec] NSCALARS_STEP
475
476 and compare the IV against this limit _before_ incrementing it.
477 Since the comparison type is unsigned, we actually want the
478 subtraction to saturate at zero:
479
480 (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
481 -[sat] NSCALARS_STEP
482
483 And since NSCALARS_SKIP < NSCALARS_STEP, we can reassociate this as:
484
485 NSCALARS_TOTAL -[sat] (NSCALARS_STEP - NSCALARS_SKIP)
486
487 where the rightmost subtraction can be done directly in
488 COMPARE_TYPE. */
489 test_index = index_before_incr;
490 tree adjust = nscalars_step;
491 if (nscalars_skip)
492 adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
493 adjust, nscalars_skip);
494 test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
495 nscalars_total, adjust);
496 test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
497 test_limit, adjust);
498 test_gsi = &incr_gsi;
499
500 /* Get a safe limit for the first iteration. */
501 if (nscalars_skip)
502 {
503 /* The first vector iteration can handle at most NSCALARS_STEP
504 scalars. NSCALARS_STEP <= CONST_LIMIT, and adding
505 NSCALARS_SKIP to that cannot overflow. */
506 tree const_limit = build_int_cst (compare_type,
507 LOOP_VINFO_VECT_FACTOR (loop_vinfo)
508 * nscalars_per_iter);
509 first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
510 nscalars_total, const_limit);
511 first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
512 first_limit, nscalars_skip);
513 }
514 else
515 /* For the first iteration it doesn't matter whether the IV hits
516 a value above NSCALARS_TOTAL. That only matters for the latch
517 condition. */
518 first_limit = nscalars_total;
519 }
520 else
521 {
522 /* Test the incremented IV, which will always hit a value above
523 the bound before wrapping. */
524 test_index = index_after_incr;
525 test_limit = nscalars_total;
526 if (nscalars_skip)
527 test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
528 test_limit, nscalars_skip);
529 test_gsi = &loop_cond_gsi;
530
531 first_limit = test_limit;
532 }
533
534 /* Provide a definition of each mask in the group. */
535 tree next_mask = NULL_TREE;
536 tree mask;
537 unsigned int i;
538 gimple_seq test_seq = NULL;
539 test_index = gimple_convert (&test_seq, compare_type, test_index);
540 gsi_insert_seq_before (test_gsi, test_seq, GSI_SAME_STMT);
541
542 FOR_EACH_VEC_ELT_REVERSE (rgm->masks, i, mask)
543 {
544 /* Previous masks will cover BIAS scalars. This mask covers the
545 next batch. */
546 poly_uint64 bias = nscalars_per_mask * i;
547 tree bias_tree = build_int_cst (compare_type, bias);
548 gimple *tmp_stmt;
549
550 /* See whether the first iteration of the vector loop is known
551 to have a full mask. */
552 poly_uint64 const_limit;
553 bool first_iteration_full
554 = (poly_int_tree_p (first_limit, &const_limit)
555 && known_ge (const_limit, (i + 1) * nscalars_per_mask));
556
557 /* Rather than have a new IV that starts at BIAS and goes up to
558 TEST_LIMIT, prefer to use the same 0-based IV for each mask
559 and adjust the bound down by BIAS. */
560 tree this_test_limit = test_limit;
561 if (i != 0)
562 {
563 this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
564 compare_type, this_test_limit,
565 bias_tree);
566 this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
567 compare_type, this_test_limit,
568 bias_tree);
569 }
570
571 /* Create the initial mask. First include all scalars that
572 are within the loop limit. */
573 tree init_mask = NULL_TREE;
574 if (!first_iteration_full)
575 {
576 tree start, end;
577 if (first_limit == test_limit)
578 {
579 /* Use a natural test between zero (the initial IV value)
580 and the loop limit. The "else" block would be valid too,
581 but this choice can avoid the need to load BIAS_TREE into
582 a register. */
583 start = zero_index;
584 end = this_test_limit;
585 }
586 else
587 {
588 /* FIRST_LIMIT is the maximum number of scalars handled by the
589 first iteration of the vector loop. Test the portion
590 associated with this mask. */
591 start = bias_tree;
592 end = first_limit;
593 }
594
595 init_mask = make_temp_ssa_name (mask_type, NULL, "max_mask");
596 tmp_stmt = vect_gen_while (init_mask, start, end);
597 gimple_seq_add_stmt (preheader_seq, tmp_stmt);
598 }
599
600 /* Now AND out the bits that are within the number of skipped
601 scalars. */
602 poly_uint64 const_skip;
603 if (nscalars_skip
604 && !(poly_int_tree_p (nscalars_skip, &const_skip)
605 && known_le (const_skip, bias)))
606 {
607 tree unskipped_mask = vect_gen_while_not (preheader_seq, mask_type,
608 bias_tree, nscalars_skip);
609 if (init_mask)
610 init_mask = gimple_build (preheader_seq, BIT_AND_EXPR, mask_type,
611 init_mask, unskipped_mask);
612 else
613 init_mask = unskipped_mask;
614 }
615
616 if (!init_mask)
617 /* First iteration is full. */
618 init_mask = build_minus_one_cst (mask_type);
619
620 /* Get the mask value for the next iteration of the loop. */
621 next_mask = make_temp_ssa_name (mask_type, NULL, "next_mask");
622 gcall *call = vect_gen_while (next_mask, test_index, this_test_limit);
623 gsi_insert_before (test_gsi, call, GSI_SAME_STMT);
624
625 vect_set_loop_mask (loop, mask, init_mask, next_mask);
626 }
627 return next_mask;
628 }
629
630 /* Make LOOP iterate NITERS times using masking and WHILE_ULT calls.
631 LOOP_VINFO describes the vectorization of LOOP. NITERS is the
632 number of iterations of the original scalar loop that should be
633 handled by the vector loop. NITERS_MAYBE_ZERO and FINAL_IV are
634 as for vect_set_loop_condition.
635
636 Insert the branch-back condition before LOOP_COND_GSI and return the
637 final gcond. */
638
639 static gcond *
640 vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
641 tree niters, tree final_iv,
642 bool niters_maybe_zero,
643 gimple_stmt_iterator loop_cond_gsi)
644 {
645 gimple_seq preheader_seq = NULL;
646 gimple_seq header_seq = NULL;
647
648 tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
649 unsigned int compare_precision = TYPE_PRECISION (compare_type);
650 tree orig_niters = niters;
651
652 /* Type of the initial value of NITERS. */
653 tree ni_actual_type = TREE_TYPE (niters);
654 unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
655 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
656
657 /* Convert NITERS to the same size as the compare. */
658 if (compare_precision > ni_actual_precision
659 && niters_maybe_zero)
660 {
661 /* We know that there is always at least one iteration, so if the
662 count is zero then it must have wrapped. Cope with this by
663 subtracting 1 before the conversion and adding 1 to the result. */
664 gcc_assert (TYPE_UNSIGNED (ni_actual_type));
665 niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
666 niters, build_minus_one_cst (ni_actual_type));
667 niters = gimple_convert (&preheader_seq, compare_type, niters);
668 niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
669 niters, build_one_cst (compare_type));
670 }
671 else
672 niters = gimple_convert (&preheader_seq, compare_type, niters);
673
674 widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
675 /* Get the vectorization factor in tree form. */
676 tree vf = build_int_cst (compare_type,
677 LOOP_VINFO_VECT_FACTOR (loop_vinfo));
678
679 /* Iterate over all the rgroups and fill in their masks. We could use
680 the first mask from any rgroup for the loop condition; here we
681 arbitrarily pick the last. */
682 tree test_mask = NULL_TREE;
683 rgroup_masks *rgm;
684 unsigned int i;
685 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
686 FOR_EACH_VEC_ELT (*masks, i, rgm)
687 if (!rgm->masks.is_empty ())
688 {
689 /* First try using permutes. This adds a single vector
690 instruction to the loop for each mask, but needs no extra
691 loop invariants or IVs. */
692 unsigned int nmasks = i + 1;
693 if ((nmasks & 1) == 0)
694 {
695 rgroup_masks *half_rgm = &(*masks)[nmasks / 2 - 1];
696 if (!half_rgm->masks.is_empty ()
697 && vect_maybe_permute_loop_masks (&header_seq, rgm, half_rgm))
698 continue;
699 }
700
701 /* See whether zero-based IV would ever generate all-false masks
702 before wrapping around. */
703 bool might_wrap_p
704 = (iv_limit == -1
705 || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter,
706 UNSIGNED)
707 > compare_precision));
708
709 /* Set up all masks for this group. */
710 test_mask = vect_set_loop_masks_directly (loop, loop_vinfo,
711 &preheader_seq,
712 loop_cond_gsi, rgm, vf,
713 niters, niters_skip,
714 might_wrap_p);
715 }
716
717 /* Emit all accumulated statements. */
718 add_preheader_seq (loop, preheader_seq);
719 add_header_seq (loop, header_seq);
720
721 /* Get a boolean result that tells us whether to iterate. */
722 edge exit_edge = single_exit (loop);
723 tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
724 tree zero_mask = build_zero_cst (TREE_TYPE (test_mask));
725 gcond *cond_stmt = gimple_build_cond (code, test_mask, zero_mask,
726 NULL_TREE, NULL_TREE);
727 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
728
729 /* The loop iterates (NITERS - 1) / VF + 1 times.
730 Subtract one from this to get the latch count. */
731 tree step = build_int_cst (compare_type,
732 LOOP_VINFO_VECT_FACTOR (loop_vinfo));
733 tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
734 build_minus_one_cst (compare_type));
735 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
736 niters_minus_one, step);
737
738 if (final_iv)
739 {
740 gassign *assign = gimple_build_assign (final_iv, orig_niters);
741 gsi_insert_on_edge_immediate (single_exit (loop), assign);
742 }
743
744 return cond_stmt;
745 }
746
747 /* Like vect_set_loop_condition, but handle the case in which there
748 are no loop masks. */
749
750 static gcond *
751 vect_set_loop_condition_unmasked (struct loop *loop, tree niters,
752 tree step, tree final_iv,
753 bool niters_maybe_zero,
754 gimple_stmt_iterator loop_cond_gsi)
755 {
756 tree indx_before_incr, indx_after_incr;
757 gcond *cond_stmt;
758 gcond *orig_cond;
759 edge pe = loop_preheader_edge (loop);
760 edge exit_edge = single_exit (loop);
761 gimple_stmt_iterator incr_gsi;
762 bool insert_after;
763 enum tree_code code;
764 tree niters_type = TREE_TYPE (niters);
765
766 orig_cond = get_loop_exit_condition (loop);
767 gcc_assert (orig_cond);
768 loop_cond_gsi = gsi_for_stmt (orig_cond);
769
770 tree init, limit;
771 if (!niters_maybe_zero && integer_onep (step))
772 {
773 /* In this case we can use a simple 0-based IV:
774
775 A:
776 x = 0;
777 do
778 {
779 ...
780 x += 1;
781 }
782 while (x < NITERS); */
783 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
784 init = build_zero_cst (niters_type);
785 limit = niters;
786 }
787 else
788 {
789 /* The following works for all values of NITERS except 0:
790
791 B:
792 x = 0;
793 do
794 {
795 ...
796 x += STEP;
797 }
798 while (x <= NITERS - STEP);
799
800 so that the loop continues to iterate if x + STEP - 1 < NITERS
801 but stops if x + STEP - 1 >= NITERS.
802
803 However, if NITERS is zero, x never hits a value above NITERS - STEP
804 before wrapping around. There are two obvious ways of dealing with
805 this:
806
807 - start at STEP - 1 and compare x before incrementing it
808 - start at -1 and compare x after incrementing it
809
810 The latter is simpler and is what we use. The loop in this case
811 looks like:
812
813 C:
814 x = -1;
815 do
816 {
817 ...
818 x += STEP;
819 }
820 while (x < NITERS - STEP);
821
822 In both cases the loop limit is NITERS - STEP. */
823 gimple_seq seq = NULL;
824 limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
825 limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
826 if (seq)
827 {
828 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
829 gcc_assert (!new_bb);
830 }
831 if (niters_maybe_zero)
832 {
833 /* Case C. */
834 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
835 init = build_all_ones_cst (niters_type);
836 }
837 else
838 {
839 /* Case B. */
840 code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
841 init = build_zero_cst (niters_type);
842 }
843 }
844
845 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
846 create_iv (init, step, NULL_TREE, loop,
847 &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
848 indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
849 true, NULL_TREE, true,
850 GSI_SAME_STMT);
851 limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
852 true, GSI_SAME_STMT);
853
854 cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
855 NULL_TREE);
856
857 gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
858
859 /* Record the number of latch iterations. */
860 if (limit == niters)
861 /* Case A: the loop iterates NITERS times. Subtract one to get the
862 latch count. */
863 loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
864 build_int_cst (niters_type, 1));
865 else
866 /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
867 Subtract one from this to get the latch count. */
868 loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
869 limit, step);
870
871 if (final_iv)
872 {
873 gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
874 indx_after_incr, init);
875 gsi_insert_on_edge_immediate (single_exit (loop), assign);
876 }
877
878 return cond_stmt;
879 }
880
881 /* If we're using fully-masked loops, make LOOP iterate:
882
883 N == (NITERS - 1) / STEP + 1
884
885 times. When NITERS is zero, this is equivalent to making the loop
886 execute (1 << M) / STEP times, where M is the precision of NITERS.
887 NITERS_MAYBE_ZERO is true if this last case might occur.
888
889 If we're not using fully-masked loops, make LOOP iterate:
890
891 N == (NITERS - STEP) / STEP + 1
892
893 times, where NITERS is known to be outside the range [1, STEP - 1].
894 This is equivalent to making the loop execute NITERS / STEP times
895 when NITERS is nonzero and (1 << M) / STEP times otherwise.
896 NITERS_MAYBE_ZERO again indicates whether this last case might occur.
897
898 If FINAL_IV is nonnull, it is an SSA name that should be set to
899 N * STEP on exit from the loop.
900
901 Assumption: the exit-condition of LOOP is the last stmt in the loop. */
902
903 void
904 vect_set_loop_condition (struct loop *loop, loop_vec_info loop_vinfo,
905 tree niters, tree step, tree final_iv,
906 bool niters_maybe_zero)
907 {
908 gcond *cond_stmt;
909 gcond *orig_cond = get_loop_exit_condition (loop);
910 gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
911
912 if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
913 cond_stmt = vect_set_loop_condition_masked (loop, loop_vinfo, niters,
914 final_iv, niters_maybe_zero,
915 loop_cond_gsi);
916 else
917 cond_stmt = vect_set_loop_condition_unmasked (loop, niters, step,
918 final_iv, niters_maybe_zero,
919 loop_cond_gsi);
920
921 /* Remove old loop exit test. */
922 stmt_vec_info orig_cond_info;
923 if (loop_vinfo
924 && (orig_cond_info = loop_vinfo->lookup_stmt (orig_cond)))
925 loop_vinfo->remove_stmt (orig_cond_info);
926 else
927 gsi_remove (&loop_cond_gsi, true);
928
929 if (dump_enabled_p ())
930 dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: %G",
931 cond_stmt);
932 }
933
934 /* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
935 For all PHI arguments in FROM->dest and TO->dest from those
936 edges ensure that TO->dest PHI arguments have current_def
937 to that in from. */
938
939 static void
940 slpeel_duplicate_current_defs_from_edges (edge from, edge to)
941 {
942 gimple_stmt_iterator gsi_from, gsi_to;
943
944 for (gsi_from = gsi_start_phis (from->dest),
945 gsi_to = gsi_start_phis (to->dest);
946 !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
947 {
948 gimple *from_phi = gsi_stmt (gsi_from);
949 gimple *to_phi = gsi_stmt (gsi_to);
950 tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
951 tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
952 if (virtual_operand_p (from_arg))
953 {
954 gsi_next (&gsi_from);
955 continue;
956 }
957 if (virtual_operand_p (to_arg))
958 {
959 gsi_next (&gsi_to);
960 continue;
961 }
962 if (TREE_CODE (from_arg) != SSA_NAME)
963 gcc_assert (operand_equal_p (from_arg, to_arg, 0));
964 else if (TREE_CODE (to_arg) == SSA_NAME
965 && from_arg != to_arg)
966 {
967 if (get_current_def (to_arg) == NULL_TREE)
968 {
969 gcc_assert (types_compatible_p (TREE_TYPE (to_arg),
970 TREE_TYPE (get_current_def
971 (from_arg))));
972 set_current_def (to_arg, get_current_def (from_arg));
973 }
974 }
975 gsi_next (&gsi_from);
976 gsi_next (&gsi_to);
977 }
978
979 gphi *from_phi = get_virtual_phi (from->dest);
980 gphi *to_phi = get_virtual_phi (to->dest);
981 if (from_phi)
982 set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
983 get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
984 }
985
986
987 /* Given LOOP this function generates a new copy of it and puts it
988 on E which is either the entry or exit of LOOP. If SCALAR_LOOP is
989 non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
990 basic blocks from SCALAR_LOOP instead of LOOP, but to either the
991 entry or exit of LOOP. */
992
993 struct loop *
994 slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop,
995 struct loop *scalar_loop, edge e)
996 {
997 struct loop *new_loop;
998 basic_block *new_bbs, *bbs, *pbbs;
999 bool at_exit;
1000 bool was_imm_dom;
1001 basic_block exit_dest;
1002 edge exit, new_exit;
1003 bool duplicate_outer_loop = false;
1004
1005 exit = single_exit (loop);
1006 at_exit = (e == exit);
1007 if (!at_exit && e != loop_preheader_edge (loop))
1008 return NULL;
1009
1010 if (scalar_loop == NULL)
1011 scalar_loop = loop;
1012
1013 bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1014 pbbs = bbs + 1;
1015 get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1016 /* Allow duplication of outer loops. */
1017 if (scalar_loop->inner)
1018 duplicate_outer_loop = true;
1019 /* Check whether duplication is possible. */
1020 if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
1021 {
1022 free (bbs);
1023 return NULL;
1024 }
1025
1026 /* Generate new loop structure. */
1027 new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1028 duplicate_subloops (scalar_loop, new_loop);
1029
1030 exit_dest = exit->dest;
1031 was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1032 exit_dest) == loop->header ?
1033 true : false);
1034
1035 /* Also copy the pre-header, this avoids jumping through hoops to
1036 duplicate the loop entry PHI arguments. Create an empty
1037 pre-header unconditionally for this. */
1038 basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1039 edge entry_e = single_pred_edge (preheader);
1040 bbs[0] = preheader;
1041 new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1042
1043 exit = single_exit (scalar_loop);
1044 copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1045 &exit, 1, &new_exit, NULL,
1046 at_exit ? loop->latch : e->src, true);
1047 exit = single_exit (loop);
1048 basic_block new_preheader = new_bbs[0];
1049
1050 add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1051
1052 if (scalar_loop != loop)
1053 {
1054 /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
1055 SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
1056 but LOOP will not. slpeel_update_phi_nodes_for_guard{1,2} expects
1057 the LOOP SSA_NAMEs (on the exit edge and edge from latch to
1058 header) to have current_def set, so copy them over. */
1059 slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
1060 exit);
1061 slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
1062 0),
1063 EDGE_SUCC (loop->latch, 0));
1064 }
1065
1066 if (at_exit) /* Add the loop copy at exit. */
1067 {
1068 if (scalar_loop != loop)
1069 {
1070 gphi_iterator gsi;
1071 new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1072
1073 for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
1074 gsi_next (&gsi))
1075 {
1076 gphi *phi = gsi.phi ();
1077 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
1078 location_t orig_locus
1079 = gimple_phi_arg_location_from_edge (phi, e);
1080
1081 add_phi_arg (phi, orig_arg, new_exit, orig_locus);
1082 }
1083 }
1084 redirect_edge_and_branch_force (e, new_preheader);
1085 flush_pending_stmts (e);
1086 set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
1087 if (was_imm_dom || duplicate_outer_loop)
1088 set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1089
1090 /* And remove the non-necessary forwarder again. Keep the other
1091 one so we have a proper pre-header for the loop at the exit edge. */
1092 redirect_edge_pred (single_succ_edge (preheader),
1093 single_pred (preheader));
1094 delete_basic_block (preheader);
1095 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1096 loop_preheader_edge (scalar_loop)->src);
1097 }
1098 else /* Add the copy at entry. */
1099 {
1100 if (scalar_loop != loop)
1101 {
1102 /* Remove the non-necessary forwarder of scalar_loop again. */
1103 redirect_edge_pred (single_succ_edge (preheader),
1104 single_pred (preheader));
1105 delete_basic_block (preheader);
1106 set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1107 loop_preheader_edge (scalar_loop)->src);
1108 preheader = split_edge (loop_preheader_edge (loop));
1109 entry_e = single_pred_edge (preheader);
1110 }
1111
1112 redirect_edge_and_branch_force (entry_e, new_preheader);
1113 flush_pending_stmts (entry_e);
1114 set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1115
1116 redirect_edge_and_branch_force (new_exit, preheader);
1117 flush_pending_stmts (new_exit);
1118 set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1119
1120 /* And remove the non-necessary forwarder again. Keep the other
1121 one so we have a proper pre-header for the loop at the exit edge. */
1122 redirect_edge_pred (single_succ_edge (new_preheader),
1123 single_pred (new_preheader));
1124 delete_basic_block (new_preheader);
1125 set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1126 loop_preheader_edge (new_loop)->src);
1127 }
1128
1129 /* Skip new preheader since it's deleted if copy loop is added at entry. */
1130 for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1131 rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1132
1133 if (scalar_loop != loop)
1134 {
1135 /* Update new_loop->header PHIs, so that on the preheader
1136 edge they are the ones from loop rather than scalar_loop. */
1137 gphi_iterator gsi_orig, gsi_new;
1138 edge orig_e = loop_preheader_edge (loop);
1139 edge new_e = loop_preheader_edge (new_loop);
1140
1141 for (gsi_orig = gsi_start_phis (loop->header),
1142 gsi_new = gsi_start_phis (new_loop->header);
1143 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
1144 gsi_next (&gsi_orig), gsi_next (&gsi_new))
1145 {
1146 gphi *orig_phi = gsi_orig.phi ();
1147 gphi *new_phi = gsi_new.phi ();
1148 tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
1149 location_t orig_locus
1150 = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
1151
1152 add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
1153 }
1154 }
1155
1156 free (new_bbs);
1157 free (bbs);
1158
1159 checking_verify_dominators (CDI_DOMINATORS);
1160
1161 return new_loop;
1162 }
1163
1164
1165 /* Given the condition expression COND, put it as the last statement of
1166 GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1167 DOM_BB; return the skip edge. GUARD_TO is the target basic block to
1168 skip the loop. PROBABILITY is the skip edge's probability. Mark the
1169 new edge as irreducible if IRREDUCIBLE_P is true. */
1170
1171 static edge
1172 slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1173 basic_block guard_to, basic_block dom_bb,
1174 profile_probability probability, bool irreducible_p)
1175 {
1176 gimple_stmt_iterator gsi;
1177 edge new_e, enter_e;
1178 gcond *cond_stmt;
1179 gimple_seq gimplify_stmt_list = NULL;
1180
1181 enter_e = EDGE_SUCC (guard_bb, 0);
1182 enter_e->flags &= ~EDGE_FALLTHRU;
1183 enter_e->flags |= EDGE_FALSE_VALUE;
1184 gsi = gsi_last_bb (guard_bb);
1185
1186 cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr,
1187 NULL_TREE);
1188 if (gimplify_stmt_list)
1189 gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1190
1191 cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1192 gsi = gsi_last_bb (guard_bb);
1193 gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1194
1195 /* Add new edge to connect guard block to the merge/loop-exit block. */
1196 new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
1197
1198 new_e->probability = probability;
1199 if (irreducible_p)
1200 new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1201
1202 enter_e->probability = probability.invert ();
1203 set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
1204
1205 /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS. */
1206 if (enter_e->dest->loop_father->header == enter_e->dest)
1207 split_edge (enter_e);
1208
1209 return new_e;
1210 }
1211
1212
1213 /* This function verifies that the following restrictions apply to LOOP:
1214 (1) it consists of exactly 2 basic blocks - header, and an empty latch
1215 for innermost loop and 5 basic blocks for outer-loop.
1216 (2) it is single entry, single exit
1217 (3) its exit condition is the last stmt in the header
1218 (4) E is the entry/exit edge of LOOP.
1219 */
1220
1221 bool
1222 slpeel_can_duplicate_loop_p (const struct loop *loop, const_edge e)
1223 {
1224 edge exit_e = single_exit (loop);
1225 edge entry_e = loop_preheader_edge (loop);
1226 gcond *orig_cond = get_loop_exit_condition (loop);
1227 gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
1228 unsigned int num_bb = loop->inner? 5 : 2;
1229
1230 /* All loops have an outer scope; the only case loop->outer is NULL is for
1231 the function itself. */
1232 if (!loop_outer (loop)
1233 || loop->num_nodes != num_bb
1234 || !empty_block_p (loop->latch)
1235 || !single_exit (loop)
1236 /* Verify that new loop exit condition can be trivially modified. */
1237 || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1238 || (e != exit_e && e != entry_e))
1239 return false;
1240
1241 return true;
1242 }
1243
1244 /* If the loop has a virtual PHI, but exit bb doesn't, create a virtual PHI
1245 in the exit bb and rename all the uses after the loop. This simplifies
1246 the *guard[12] routines, which assume loop closed SSA form for all PHIs
1247 (but normally loop closed SSA form doesn't require virtual PHIs to be
1248 in the same form). Doing this early simplifies the checking what
1249 uses should be renamed. */
1250
1251 static void
1252 create_lcssa_for_virtual_phi (struct loop *loop)
1253 {
1254 gphi_iterator gsi;
1255 edge exit_e = single_exit (loop);
1256
1257 for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
1258 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1259 {
1260 gphi *phi = gsi.phi ();
1261 for (gsi = gsi_start_phis (exit_e->dest);
1262 !gsi_end_p (gsi); gsi_next (&gsi))
1263 if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1264 break;
1265 if (gsi_end_p (gsi))
1266 {
1267 tree new_vop = copy_ssa_name (PHI_RESULT (phi));
1268 gphi *new_phi = create_phi_node (new_vop, exit_e->dest);
1269 tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
1270 imm_use_iterator imm_iter;
1271 gimple *stmt;
1272 use_operand_p use_p;
1273
1274 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
1275 = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
1276 add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
1277 gimple_phi_set_result (new_phi, new_vop);
1278 FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
1279 if (stmt != new_phi
1280 && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
1281 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
1282 SET_USE (use_p, new_vop);
1283 }
1284 break;
1285 }
1286
1287 }
1288
1289 /* Function vect_get_loop_location.
1290
1291 Extract the location of the loop in the source code.
1292 If the loop is not well formed for vectorization, an estimated
1293 location is calculated.
1294 Return the loop location if succeed and NULL if not. */
1295
1296 dump_user_location_t
1297 find_loop_location (struct loop *loop)
1298 {
1299 gimple *stmt = NULL;
1300 basic_block bb;
1301 gimple_stmt_iterator si;
1302
1303 if (!loop)
1304 return dump_user_location_t ();
1305
1306 stmt = get_loop_exit_condition (loop);
1307
1308 if (stmt
1309 && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1310 return stmt;
1311
1312 /* If we got here the loop is probably not "well formed",
1313 try to estimate the loop location */
1314
1315 if (!loop->header)
1316 return dump_user_location_t ();
1317
1318 bb = loop->header;
1319
1320 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1321 {
1322 stmt = gsi_stmt (si);
1323 if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1324 return stmt;
1325 }
1326
1327 return dump_user_location_t ();
1328 }
1329
1330 /* Return true if the phi described by STMT_INFO defines an IV of the
1331 loop to be vectorized. */
1332
1333 static bool
1334 iv_phi_p (stmt_vec_info stmt_info)
1335 {
1336 gphi *phi = as_a <gphi *> (stmt_info->stmt);
1337 if (virtual_operand_p (PHI_RESULT (phi)))
1338 return false;
1339
1340 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1341 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
1342 return false;
1343
1344 return true;
1345 }
1346
1347 /* Function vect_can_advance_ivs_p
1348
1349 In case the number of iterations that LOOP iterates is unknown at compile
1350 time, an epilog loop will be generated, and the loop induction variables
1351 (IVs) will be "advanced" to the value they are supposed to take just before
1352 the epilog loop. Here we check that the access function of the loop IVs
1353 and the expression that represents the loop bound are simple enough.
1354 These restrictions will be relaxed in the future. */
1355
1356 bool
1357 vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
1358 {
1359 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1360 basic_block bb = loop->header;
1361 gphi_iterator gsi;
1362
1363 /* Analyze phi functions of the loop header. */
1364
1365 if (dump_enabled_p ())
1366 dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
1367 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1368 {
1369 tree evolution_part;
1370
1371 gphi *phi = gsi.phi ();
1372 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1373 if (dump_enabled_p ())
1374 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
1375 phi_info->stmt);
1376
1377 /* Skip virtual phi's. The data dependences that are associated with
1378 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
1379
1380 Skip reduction phis. */
1381 if (!iv_phi_p (phi_info))
1382 {
1383 if (dump_enabled_p ())
1384 dump_printf_loc (MSG_NOTE, vect_location,
1385 "reduc or virtual phi. skip.\n");
1386 continue;
1387 }
1388
1389 /* Analyze the evolution function. */
1390
1391 evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1392 if (evolution_part == NULL_TREE)
1393 {
1394 if (dump_enabled_p ())
1395 dump_printf (MSG_MISSED_OPTIMIZATION,
1396 "No access function or evolution.\n");
1397 return false;
1398 }
1399
1400 /* FORNOW: We do not transform initial conditions of IVs
1401 which evolution functions are not invariants in the loop. */
1402
1403 if (!expr_invariant_in_loop_p (loop, evolution_part))
1404 {
1405 if (dump_enabled_p ())
1406 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1407 "evolution not invariant in loop.\n");
1408 return false;
1409 }
1410
1411 /* FORNOW: We do not transform initial conditions of IVs
1412 which evolution functions are a polynomial of degree >= 2. */
1413
1414 if (tree_is_chrec (evolution_part))
1415 {
1416 if (dump_enabled_p ())
1417 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1418 "evolution is chrec.\n");
1419 return false;
1420 }
1421 }
1422
1423 return true;
1424 }
1425
1426
1427 /* Function vect_update_ivs_after_vectorizer.
1428
1429 "Advance" the induction variables of LOOP to the value they should take
1430 after the execution of LOOP. This is currently necessary because the
1431 vectorizer does not handle induction variables that are used after the
1432 loop. Such a situation occurs when the last iterations of LOOP are
1433 peeled, because:
1434 1. We introduced new uses after LOOP for IVs that were not originally used
1435 after LOOP: the IVs of LOOP are now used by an epilog loop.
1436 2. LOOP is going to be vectorized; this means that it will iterate N/VF
1437 times, whereas the loop IVs should be bumped N times.
1438
1439 Input:
1440 - LOOP - a loop that is going to be vectorized. The last few iterations
1441 of LOOP were peeled.
1442 - NITERS - the number of iterations that LOOP executes (before it is
1443 vectorized). i.e, the number of times the ivs should be bumped.
1444 - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
1445 coming out from LOOP on which there are uses of the LOOP ivs
1446 (this is the path from LOOP->exit to epilog_loop->preheader).
1447
1448 The new definitions of the ivs are placed in LOOP->exit.
1449 The phi args associated with the edge UPDATE_E in the bb
1450 UPDATE_E->dest are updated accordingly.
1451
1452 Assumption 1: Like the rest of the vectorizer, this function assumes
1453 a single loop exit that has a single predecessor.
1454
1455 Assumption 2: The phi nodes in the LOOP header and in update_bb are
1456 organized in the same order.
1457
1458 Assumption 3: The access function of the ivs is simple enough (see
1459 vect_can_advance_ivs_p). This assumption will be relaxed in the future.
1460
1461 Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
1462 coming out of LOOP on which the ivs of LOOP are used (this is the path
1463 that leads to the epilog loop; other paths skip the epilog loop). This
1464 path starts with the edge UPDATE_E, and its destination (denoted update_bb)
1465 needs to have its phis updated.
1466 */
1467
1468 static void
1469 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
1470 tree niters, edge update_e)
1471 {
1472 gphi_iterator gsi, gsi1;
1473 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1474 basic_block update_bb = update_e->dest;
1475 basic_block exit_bb = single_exit (loop)->dest;
1476
1477 /* Make sure there exists a single-predecessor exit bb: */
1478 gcc_assert (single_pred_p (exit_bb));
1479 gcc_assert (single_succ_edge (exit_bb) == update_e);
1480
1481 for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
1482 !gsi_end_p (gsi) && !gsi_end_p (gsi1);
1483 gsi_next (&gsi), gsi_next (&gsi1))
1484 {
1485 tree init_expr;
1486 tree step_expr, off;
1487 tree type;
1488 tree var, ni, ni_name;
1489 gimple_stmt_iterator last_gsi;
1490
1491 gphi *phi = gsi.phi ();
1492 gphi *phi1 = gsi1.phi ();
1493 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1494 if (dump_enabled_p ())
1495 dump_printf_loc (MSG_NOTE, vect_location,
1496 "vect_update_ivs_after_vectorizer: phi: %G", phi);
1497
1498 /* Skip reduction and virtual phis. */
1499 if (!iv_phi_p (phi_info))
1500 {
1501 if (dump_enabled_p ())
1502 dump_printf_loc (MSG_NOTE, vect_location,
1503 "reduc or virtual phi. skip.\n");
1504 continue;
1505 }
1506
1507 type = TREE_TYPE (gimple_phi_result (phi));
1508 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1509 step_expr = unshare_expr (step_expr);
1510
1511 /* FORNOW: We do not support IVs whose evolution function is a polynomial
1512 of degree >= 2 or exponential. */
1513 gcc_assert (!tree_is_chrec (step_expr));
1514
1515 init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1516
1517 off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
1518 fold_convert (TREE_TYPE (step_expr), niters),
1519 step_expr);
1520 if (POINTER_TYPE_P (type))
1521 ni = fold_build_pointer_plus (init_expr, off);
1522 else
1523 ni = fold_build2 (PLUS_EXPR, type,
1524 init_expr, fold_convert (type, off));
1525
1526 var = create_tmp_var (type, "tmp");
1527
1528 last_gsi = gsi_last_bb (exit_bb);
1529 gimple_seq new_stmts = NULL;
1530 ni_name = force_gimple_operand (ni, &new_stmts, false, var);
1531 /* Exit_bb shouldn't be empty. */
1532 if (!gsi_end_p (last_gsi))
1533 gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
1534 else
1535 gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
1536
1537 /* Fix phi expressions in the successor bb. */
1538 adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
1539 }
1540 }
1541
1542 /* Return a gimple value containing the misalignment (measured in vector
1543 elements) for the loop described by LOOP_VINFO, i.e. how many elements
1544 it is away from a perfectly aligned address. Add any new statements
1545 to SEQ. */
1546
1547 static tree
1548 get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1549 {
1550 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1551 stmt_vec_info stmt_info = dr_info->stmt;
1552 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1553
1554 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1555 unsigned HOST_WIDE_INT target_align_c;
1556 tree target_align_minus_1;
1557
1558 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1559 size_zero_node) < 0;
1560 tree offset = (negative
1561 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1562 : size_zero_node);
1563 tree start_addr = vect_create_addr_base_for_vector_ref (stmt_info, seq,
1564 offset);
1565 tree type = unsigned_type_for (TREE_TYPE (start_addr));
1566 if (target_align.is_constant (&target_align_c))
1567 target_align_minus_1 = build_int_cst (type, target_align_c - 1);
1568 else
1569 {
1570 tree vla = build_int_cst (type, target_align);
1571 tree vla_align = fold_build2 (BIT_AND_EXPR, type, vla,
1572 fold_build2 (MINUS_EXPR, type,
1573 build_int_cst (type, 0), vla));
1574 target_align_minus_1 = fold_build2 (MINUS_EXPR, type, vla_align,
1575 build_int_cst (type, 1));
1576 }
1577
1578 HOST_WIDE_INT elem_size
1579 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1580 tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1581
1582 /* Create: misalign_in_bytes = addr & (target_align - 1). */
1583 tree int_start_addr = fold_convert (type, start_addr);
1584 tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1585 target_align_minus_1);
1586
1587 /* Create: misalign_in_elems = misalign_in_bytes / element_size. */
1588 tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1589 elem_size_log);
1590
1591 return misalign_in_elems;
1592 }
1593
1594 /* Function vect_gen_prolog_loop_niters
1595
1596 Generate the number of iterations which should be peeled as prolog for the
1597 loop represented by LOOP_VINFO. It is calculated as the misalignment of
1598 DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
1599 As a result, after the execution of this loop, the data reference DR will
1600 refer to an aligned location. The following computation is generated:
1601
1602 If the misalignment of DR is known at compile time:
1603 addr_mis = int mis = DR_MISALIGNMENT (dr);
1604 Else, compute address misalignment in bytes:
1605 addr_mis = addr & (target_align - 1)
1606
1607 prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
1608
1609 (elem_size = element type size; an element is the scalar element whose type
1610 is the inner type of the vectype)
1611
1612 The computations will be emitted at the end of BB. We also compute and
1613 store upper bound (included) of the result in BOUND.
1614
1615 When the step of the data-ref in the loop is not 1 (as in interleaved data
1616 and SLP), the number of iterations of the prolog must be divided by the step
1617 (which is equal to the size of interleaved group).
1618
1619 The above formulas assume that VF == number of elements in the vector. This
1620 may not hold when there are multiple-types in the loop.
1621 In this case, for some data-references in the loop the VF does not represent
1622 the number of elements that fit in the vector. Therefore, instead of VF we
1623 use TYPE_VECTOR_SUBPARTS. */
1624
1625 static tree
1626 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
1627 basic_block bb, int *bound)
1628 {
1629 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1630 tree var;
1631 tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
1632 gimple_seq stmts = NULL, new_stmts = NULL;
1633 tree iters, iters_name;
1634 stmt_vec_info stmt_info = dr_info->stmt;
1635 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1636 poly_uint64 target_align = DR_TARGET_ALIGNMENT (dr_info);
1637
1638 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1639 {
1640 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1641
1642 if (dump_enabled_p ())
1643 dump_printf_loc (MSG_NOTE, vect_location,
1644 "known peeling = %d.\n", npeel);
1645
1646 iters = build_int_cst (niters_type, npeel);
1647 *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1648 }
1649 else
1650 {
1651 tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
1652 tree type = TREE_TYPE (misalign_in_elems);
1653 HOST_WIDE_INT elem_size
1654 = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1655 /* We only do prolog peeling if the target alignment is known at compile
1656 time. */
1657 poly_uint64 align_in_elems =
1658 exact_div (target_align, elem_size);
1659 tree align_in_elems_minus_1 =
1660 build_int_cst (type, align_in_elems - 1);
1661 tree align_in_elems_tree = build_int_cst (type, align_in_elems);
1662
1663 /* Create: (niters_type) ((align_in_elems - misalign_in_elems)
1664 & (align_in_elems - 1)). */
1665 bool negative = tree_int_cst_compare (DR_STEP (dr_info->dr),
1666 size_zero_node) < 0;
1667 if (negative)
1668 iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1669 align_in_elems_tree);
1670 else
1671 iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1672 misalign_in_elems);
1673 iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
1674 iters = fold_convert (niters_type, iters);
1675 unsigned HOST_WIDE_INT align_in_elems_c;
1676 if (align_in_elems.is_constant (&align_in_elems_c))
1677 *bound = align_in_elems_c - 1;
1678 else
1679 *bound = -1;
1680 }
1681
1682 if (dump_enabled_p ())
1683 dump_printf_loc (MSG_NOTE, vect_location,
1684 "niters for prolog loop: %T\n", iters);
1685
1686 var = create_tmp_var (niters_type, "prolog_loop_niters");
1687 iters_name = force_gimple_operand (iters, &new_stmts, false, var);
1688
1689 if (new_stmts)
1690 gimple_seq_add_seq (&stmts, new_stmts);
1691 if (stmts)
1692 {
1693 gcc_assert (single_succ_p (bb));
1694 gimple_stmt_iterator gsi = gsi_last_bb (bb);
1695 if (gsi_end_p (gsi))
1696 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1697 else
1698 gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
1699 }
1700 return iters_name;
1701 }
1702
1703
1704 /* Function vect_update_init_of_dr
1705
1706 If CODE is PLUS, the vector loop starts NITERS iterations after the
1707 scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1708 iterations before the scalar one (using masking to skip inactive
1709 elements). This function updates the information recorded in DR to
1710 account for the difference. Specifically, it updates the OFFSET
1711 field of DR. */
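/* For instance (purely as an illustration): with CODE == PLUS_EXPR,
   NITERS == 3 and DR_STEP == 4 bytes, DR_OFFSET is increased by
   3 * 4 == 12 bytes, so that DR describes the first access made by the
   vector loop.  */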
1712
1713 static void
1714 vect_update_init_of_dr (struct data_reference *dr, tree niters, tree_code code)
1715 {
1716 tree offset = DR_OFFSET (dr);
1717
1718 niters = fold_build2 (MULT_EXPR, sizetype,
1719 fold_convert (sizetype, niters),
1720 fold_convert (sizetype, DR_STEP (dr)));
1721 offset = fold_build2 (code, sizetype,
1722 fold_convert (sizetype, offset), niters);
1723 DR_OFFSET (dr) = offset;
1724 }
1725
1726
1727 /* Function vect_update_inits_of_drs
1728
1729 Apply vect_update_init_of_dr to all accesses in LOOP_VINFO.
1730 CODE and NITERS are as for vect_update_init_of_dr.  */
1731
1732 static void
1733 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1734 tree_code code)
1735 {
1736 unsigned int i;
1737 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1738 struct data_reference *dr;
1739
1740 DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
1741
1742 /* Adjust niters to sizetype and insert stmts on loop preheader edge. */
1743 if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
1744 {
1745 gimple_seq seq;
1746 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1747 tree var = create_tmp_var (sizetype, "prolog_loop_adjusted_niters");
1748
1749 niters = fold_convert (sizetype, niters);
1750 niters = force_gimple_operand (niters, &seq, false, var);
1751 if (seq)
1752 {
1753 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
1754 gcc_assert (!new_bb);
1755 }
1756 }
1757
1758 FOR_EACH_VEC_ELT (datarefs, i, dr)
1759 {
1760 dr_vec_info *dr_info = loop_vinfo->lookup_dr (dr);
1761 if (!STMT_VINFO_GATHER_SCATTER_P (dr_info->stmt))
1762 vect_update_init_of_dr (dr, niters, code);
1763 }
1764 }
1765
1766 /* For the information recorded in LOOP_VINFO prepare the loop for peeling
1767 by masking. This involves calculating the number of iterations to
1768 be peeled and then aligning all memory references appropriately. */
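/* As an illustration: with VF == 8 and a known peeling amount of 3, the
   misalignment is 8 - 3 == 5 elements; the data references are moved back
   by 5 elements and the first 5 lanes of the first iteration are skipped
   via the loop mask.  */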
1769
1770 void
1771 vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
1772 {
1773 tree misalign_in_elems;
1774 tree type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
1775
1776 gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
1777
1778 /* From the information recorded in LOOP_VINFO get the number of iterations
1779 that need to be skipped via masking. */
1780 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1781 {
1782 poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1783 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
1784 misalign_in_elems = build_int_cst (type, misalign);
1785 }
1786 else
1787 {
1788 gimple_seq seq1 = NULL, seq2 = NULL;
1789 misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
1790 misalign_in_elems = fold_convert (type, misalign_in_elems);
1791 misalign_in_elems = force_gimple_operand (misalign_in_elems,
1792 &seq2, true, NULL_TREE);
1793 gimple_seq_add_seq (&seq1, seq2);
1794 if (seq1)
1795 {
1796 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1797 basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
1798 gcc_assert (!new_bb);
1799 }
1800 }
1801
1802 if (dump_enabled_p ())
1803 dump_printf_loc (MSG_NOTE, vect_location,
1804 "misalignment for fully-masked loop: %T\n",
1805 misalign_in_elems);
1806
1807 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
1808
1809 vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
1810 }
1811
1812 /* This function builds ni_name = number of iterations.  Statements
1813 are emitted on the loop preheader edge.  If NEW_VAR_P is not NULL, it is
1814 set to true when a new SSA name is generated.  */
1815
1816 tree
1817 vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
1818 {
1819 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1820 if (TREE_CODE (ni) == INTEGER_CST)
1821 return ni;
1822 else
1823 {
1824 tree ni_name, var;
1825 gimple_seq stmts = NULL;
1826 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1827
1828 var = create_tmp_var (TREE_TYPE (ni), "niters");
1829 ni_name = force_gimple_operand (ni, &stmts, false, var);
1830 if (stmts)
1831 {
1832 gsi_insert_seq_on_edge_immediate (pe, stmts);
1833 if (new_var_p != NULL)
1834 *new_var_p = true;
1835 }
1836
1837 return ni_name;
1838 }
1839 }
1840
1841 /* Calculate the number of iterations above which the vectorized loop will be
1842 preferred over the scalar loop.  NITERS_PROLOG is the number of iterations
1843 of the prolog loop.  If it is an integer constant, the value is also passed
1844 in INT_NITERS_PROLOG.  BOUND_PROLOG is the upper bound (inclusive) of the
1845 number of iterations of the prolog loop. BOUND_EPILOG is the corresponding
1846 value for the epilog loop. If CHECK_PROFITABILITY is true, TH is the
1847 threshold below which the scalar (rather than vectorized) loop will be
1848 executed. This function stores the upper bound (inclusive) of the result
1849 in BOUND_SCALAR. */
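/* For instance (purely as an illustration): with BOUND_PROLOG == 7,
   BOUND_EPILOG == 4 and no profitability check, *BOUND_SCALAR becomes 11
   and the returned tree is niters_prolog + 4.  */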
1850
1851 static tree
1852 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
1853 int bound_prolog, poly_int64 bound_epilog, int th,
1854 poly_uint64 *bound_scalar,
1855 bool check_profitability)
1856 {
1857 tree type = TREE_TYPE (niters_prolog);
1858 tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
1859 build_int_cst (type, bound_epilog));
1860
1861 *bound_scalar = bound_prolog + bound_epilog;
1862 if (check_profitability)
1863 {
1864 /* TH indicates the minimum niters of vectorized loop, while we
1865 compute the maximum niters of scalar loop. */
1866 th--;
1867 /* Peeling a constant number of times.  */
1868 if (int_niters_prolog >= 0)
1869 {
1870 *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
1871 return build_int_cst (type, *bound_scalar);
1872 }
1873 /* Peeling an unknown number of times. Note that both BOUND_PROLOG
1874 and BOUND_EPILOG are inclusive upper bounds. */
1875 if (known_ge (th, bound_prolog + bound_epilog))
1876 {
1877 *bound_scalar = th;
1878 return build_int_cst (type, th);
1879 }
1880 /* Need to do runtime comparison. */
1881 else if (maybe_gt (th, bound_epilog))
1882 {
1883 *bound_scalar = upper_bound (*bound_scalar, th);
1884 return fold_build2 (MAX_EXPR, type,
1885 build_int_cst (type, th), niters);
1886 }
1887 }
1888 return niters;
1889 }
1890
1891 /* NITERS is the number of times that the original scalar loop executes
1892 after peeling. Work out the maximum number of iterations N that can
1893 be handled by the vectorized form of the loop and then either:
1894
1895 a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
1896
1897 niters_vector = N
1898
1899 b) set *STEP_VECTOR_PTR to one and generate:
1900
1901 niters_vector = N / vf
1902
1903 In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
1904 any new statements on the loop preheader edge. NITERS_NO_OVERFLOW
1905 is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero). */
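/* For instance (purely as an illustration): with NITERS == 17, a constant
   VF of 4, no peeling for gaps and no possible overflow, case (b) applies:
   niters_vector == 17 >> 2 == 4 and *STEP_VECTOR_PTR == 1, leaving one
   scalar iteration for the epilogue.  */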
1906
1907 void
1908 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
1909 tree *niters_vector_ptr, tree *step_vector_ptr,
1910 bool niters_no_overflow)
1911 {
1912 tree ni_minus_gap, var;
1913 tree niters_vector, step_vector, type = TREE_TYPE (niters);
1914 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1915 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1916 tree log_vf = NULL_TREE;
1917
1918 /* If epilogue loop is required because of data accesses with gaps, we
1919 subtract one iteration from the total number of iterations here for
1920 correct calculation of RATIO (the number of vector iterations).  */
1921 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1922 {
1923 ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
1924 build_one_cst (type));
1925 if (!is_gimple_val (ni_minus_gap))
1926 {
1927 var = create_tmp_var (type, "ni_gap");
1928 gimple *stmts = NULL;
1929 ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
1930 true, var);
1931 gsi_insert_seq_on_edge_immediate (pe, stmts);
1932 }
1933 }
1934 else
1935 ni_minus_gap = niters;
1936
1937 unsigned HOST_WIDE_INT const_vf;
1938 if (vf.is_constant (&const_vf)
1939 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1940 {
1941 /* Create: niters >> log2(vf) */
1942 /* If it's known that niters == number of latch executions + 1 doesn't
1943 overflow, we can generate niters >> log2(vf); otherwise we generate
1944 (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
1945 will be at least one. */
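/* Illustration (with a deliberately tiny 8-bit niters type and vf == 4):
   if the true iteration count is 256, NITERS wraps to 0 and
   ((0 - 4) >> 2) + 1 == (252 >> 2) + 1 == 64, which is the correct
   number of vector iterations.  */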
1946 log_vf = build_int_cst (type, exact_log2 (const_vf));
1947 if (niters_no_overflow)
1948 niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
1949 else
1950 niters_vector
1951 = fold_build2 (PLUS_EXPR, type,
1952 fold_build2 (RSHIFT_EXPR, type,
1953 fold_build2 (MINUS_EXPR, type,
1954 ni_minus_gap,
1955 build_int_cst (type, vf)),
1956 log_vf),
1957 build_int_cst (type, 1));
1958 step_vector = build_one_cst (type);
1959 }
1960 else
1961 {
1962 niters_vector = ni_minus_gap;
1963 step_vector = build_int_cst (type, vf);
1964 }
1965
1966 if (!is_gimple_val (niters_vector))
1967 {
1968 var = create_tmp_var (type, "bnd");
1969 gimple_seq stmts = NULL;
1970 niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
1971 gsi_insert_seq_on_edge_immediate (pe, stmts);
1972 /* The peeling algorithm guarantees that the vector loop bound is at least
1973 ONE, so we set range information to make the niters analyzer's life easier.  */
1974 if (stmts != NULL && log_vf)
1975 set_range_info (niters_vector, VR_RANGE,
1976 wi::to_wide (build_int_cst (type, 1)),
1977 wi::to_wide (fold_build2 (RSHIFT_EXPR, type,
1978 TYPE_MAX_VALUE (type),
1979 log_vf)));
1980 }
1981 *niters_vector_ptr = niters_vector;
1982 *step_vector_ptr = step_vector;
1983
1984 return;
1985 }
1986
1987 /* Given NITERS_VECTOR which is the number of iterations for vectorized
1988 loop specified by LOOP_VINFO after vectorization, compute the number
1989 of iterations before vectorization (niters_vector * vf) and store it
1990 to NITERS_VECTOR_MULT_VF_PTR. */
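/* E.g. (purely as an illustration): with NITERS_VECTOR == 4 and a constant
   VF of 8, the result is 4 << 3 == 32 scalar iterations.  */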
1991
1992 static void
1993 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
1994 tree niters_vector,
1995 tree *niters_vector_mult_vf_ptr)
1996 {
1997 /* We should be using a step_vector of VF if VF is variable. */
1998 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
1999 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2000 tree type = TREE_TYPE (niters_vector);
2001 tree log_vf = build_int_cst (type, exact_log2 (vf));
2002 basic_block exit_bb = single_exit (loop)->dest;
2003
2004 gcc_assert (niters_vector_mult_vf_ptr != NULL);
2005 tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2006 niters_vector, log_vf);
2007 if (!is_gimple_val (niters_vector_mult_vf))
2008 {
2009 tree var = create_tmp_var (type, "niters_vector_mult_vf");
2010 gimple_seq stmts = NULL;
2011 niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2012 &stmts, true, var);
2013 gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2014 gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2015 }
2016 *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2017 }
2018
2019 /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplicates FIRST/SECOND
2020 from SECOND/FIRST and puts it at the original loop's preheader/exit
2021 edge; the two loops are arranged as below:
2022
2023 preheader_a:
2024 first_loop:
2025 header_a:
2026 i_1 = PHI<i_0, i_2>;
2027 ...
2028 i_2 = i_1 + 1;
2029 if (cond_a)
2030 goto latch_a;
2031 else
2032 goto between_bb;
2033 latch_a:
2034 goto header_a;
2035
2036 between_bb:
2037 ;; i_x = PHI<i_2>; ;; LCSSA phi node to be created for FIRST,
2038
2039 second_loop:
2040 header_b:
2041 i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
2042 or with i_2 if no LCSSA phi is created
2043 under condition of CREATE_LCSSA_FOR_IV_PHIS.
2044 ...
2045 i_4 = i_3 + 1;
2046 if (cond_b)
2047 goto latch_b;
2048 else
2049 goto exit_bb;
2050 latch_b:
2051 goto header_b;
2052
2053 exit_bb:
2054
2055 This function creates loop-closed SSA for the first loop and updates the
2056 second loop's PHI nodes by replacing the argument on the incoming edge with
2057 the result of the newly created lcssa PHI nodes.  If CREATE_LCSSA_FOR_IV_PHIS
2058 is false, loop-closed SSA PHIs will only be created for non-iv PHIs of
2059 the first loop.
2060
2061 This function assumes the exit bb of the first loop is the preheader bb of
2062 the second loop, i.e., between_bb in the example code.  With the PHIs
2063 updated, the second loop will execute the remaining iterations of the first.  */
2064
2065 static void
2066 slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
2067 struct loop *first, struct loop *second,
2068 bool create_lcssa_for_iv_phis)
2069 {
2070 gphi_iterator gsi_update, gsi_orig;
2071 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2072
2073 edge first_latch_e = EDGE_SUCC (first->latch, 0);
2074 edge second_preheader_e = loop_preheader_edge (second);
2075 basic_block between_bb = single_exit (first)->dest;
2076
2077 gcc_assert (between_bb == second_preheader_e->src);
2078 gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
2079 /* Either the first loop or the second is the loop to be vectorized. */
2080 gcc_assert (loop == first || loop == second);
2081
2082 for (gsi_orig = gsi_start_phis (first->header),
2083 gsi_update = gsi_start_phis (second->header);
2084 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2085 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2086 {
2087 gphi *orig_phi = gsi_orig.phi ();
2088 gphi *update_phi = gsi_update.phi ();
2089
2090 tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
2091 /* Generate lcssa PHI node for the first loop. */
2092 gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
2093 stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
2094 if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
2095 {
2096 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2097 gphi *lcssa_phi = create_phi_node (new_res, between_bb);
2098 add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
2099 arg = new_res;
2100 }
2101
2102 /* Update PHI node in the second loop by replacing arg on the loop's
2103 incoming edge. */
2104 adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
2105 }
2106 }
2107
2108 /* Function slpeel_add_loop_guard adds guard skipping from the beginning
2109 of SKIP_LOOP to the beginning of UPDATE_LOOP. GUARD_EDGE and MERGE_EDGE
2110 are two pred edges of the merge point before UPDATE_LOOP. The two loops
2111 appear like below:
2112
2113 guard_bb:
2114 if (cond)
2115 goto merge_bb;
2116 else
2117 goto skip_loop;
2118
2119 skip_loop:
2120 header_a:
2121 i_1 = PHI<i_0, i_2>;
2122 ...
2123 i_2 = i_1 + 1;
2124 if (cond_a)
2125 goto latch_a;
2126 else
2127 goto exit_a;
2128 latch_a:
2129 goto header_a;
2130
2131 exit_a:
2132 i_5 = PHI<i_2>;
2133
2134 merge_bb:
2135 ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2136
2137 update_loop:
2138 header_b:
2139 i_3 = PHI<i_5, i_4>; ;; Use of i_5 to be replaced with i_x.
2140 ...
2141 i_4 = i_3 + 1;
2142 if (cond_b)
2143 goto latch_b;
2144 else
2145 goto exit_bb;
2146 latch_b:
2147 goto header_b;
2148
2149 exit_bb:
2150
2151 This function creates PHI nodes at merge_bb and replaces the use of i_5
2152 in the update_loop's PHI node with the result of the new PHI node.  */
2153
2154 static void
2155 slpeel_update_phi_nodes_for_guard1 (struct loop *skip_loop,
2156 struct loop *update_loop,
2157 edge guard_edge, edge merge_edge)
2158 {
2159 location_t merge_loc, guard_loc;
2160 edge orig_e = loop_preheader_edge (skip_loop);
2161 edge update_e = loop_preheader_edge (update_loop);
2162 gphi_iterator gsi_orig, gsi_update;
2163
2164 for ((gsi_orig = gsi_start_phis (skip_loop->header),
2165 gsi_update = gsi_start_phis (update_loop->header));
2166 !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2167 gsi_next (&gsi_orig), gsi_next (&gsi_update))
2168 {
2169 gphi *orig_phi = gsi_orig.phi ();
2170 gphi *update_phi = gsi_update.phi ();
2171
2172 /* Generate new phi node at merge bb of the guard. */
2173 tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2174 gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2175
2176 /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE. Set the
2177 args in NEW_PHI for these edges. */
2178 tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2179 tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2180 merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2181 guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2182 add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2183 add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2184
2185 /* Update phi in UPDATE_PHI. */
2186 adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2187 }
2188 }
2189
2190 /* LCSSA_PHI is an lcssa phi of the EPILOG loop which is copied from LOOP;
2191 this function searches for the corresponding lcssa phi node in the exit
2192 bb of LOOP. If it is found, return the phi result; otherwise return
2193 NULL. */
2194
2195 static tree
2196 find_guard_arg (struct loop *loop, struct loop *epilog ATTRIBUTE_UNUSED,
2197 gphi *lcssa_phi)
2198 {
2199 gphi_iterator gsi;
2200 edge e = single_exit (loop);
2201
2202 gcc_assert (single_pred_p (e->dest));
2203 for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2204 {
2205 gphi *phi = gsi.phi ();
2206 if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2207 PHI_ARG_DEF (lcssa_phi, 0), 0))
2208 return PHI_RESULT (phi);
2209 }
2210 return NULL_TREE;
2211 }
2212
2213 /* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
2214 from LOOP. Function slpeel_add_loop_guard adds guard skipping from a
2215 point between the two loops to the end of EPILOG. Edges GUARD_EDGE
2216 and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2217 The CFG looks like:
2218
2219 loop:
2220 header_a:
2221 i_1 = PHI<i_0, i_2>;
2222 ...
2223 i_2 = i_1 + 1;
2224 if (cond_a)
2225 goto latch_a;
2226 else
2227 goto exit_a;
2228 latch_a:
2229 goto header_a;
2230
2231 exit_a:
2232
2233 guard_bb:
2234 if (cond)
2235 goto merge_bb;
2236 else
2237 goto epilog_loop;
2238
2239 ;; fall_through_bb
2240
2241 epilog_loop:
2242 header_b:
2243 i_3 = PHI<i_2, i_4>;
2244 ...
2245 i_4 = i_3 + 1;
2246 if (cond_b)
2247 goto latch_b;
2248 else
2249 goto merge_bb;
2250 latch_b:
2251 goto header_b;
2252
2253 merge_bb:
2254 ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2255
2256 exit_bb:
2257 i_x = PHI<i_4>; ;Use of i_4 to be replaced with i_y in merge_bb.
2258
2259 For each name used outside EPILOG (i.e. for each name that has an lcssa
2260 phi in exit_bb) we create a new PHI in merge_bb.  The new PHI has two
2261 args corresponding to GUARD_EDGE and MERGE_EDGE. Arg for MERGE_EDGE is
2262 the arg of the original PHI in exit_bb, arg for GUARD_EDGE is defined
2263 by LOOP and is found in the exit bb of LOOP. Arg of the original PHI
2264 in exit_bb will also be updated. */
2265
2266 static void
2267 slpeel_update_phi_nodes_for_guard2 (struct loop *loop, struct loop *epilog,
2268 edge guard_edge, edge merge_edge)
2269 {
2270 gphi_iterator gsi;
2271 basic_block merge_bb = guard_edge->dest;
2272
2273 gcc_assert (single_succ_p (merge_bb));
2274 edge e = single_succ_edge (merge_bb);
2275 basic_block exit_bb = e->dest;
2276 gcc_assert (single_pred_p (exit_bb));
2277 gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2278
2279 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2280 {
2281 gphi *update_phi = gsi.phi ();
2282 tree old_arg = PHI_ARG_DEF (update_phi, 0);
2283 /* This loop-closed-phi actually doesn't represent a use out of the
2284 loop - the phi arg is a constant. */
2285 if (TREE_CODE (old_arg) != SSA_NAME)
2286 continue;
2287
2288 tree merge_arg = get_current_def (old_arg);
2289 if (!merge_arg)
2290 merge_arg = old_arg;
2291
2292 tree guard_arg = find_guard_arg (loop, epilog, update_phi);
2293 /* If the var is live after loop but not a reduction, we simply
2294 use the old arg. */
2295 if (!guard_arg)
2296 guard_arg = old_arg;
2297
2298 /* Create new phi node in MERGE_BB: */
2299 tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2300 gphi *merge_phi = create_phi_node (new_res, merge_bb);
2301
2302 /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE.  Set
2303 the two PHI args in merge_phi for these edges.  */
2304 add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2305 add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2306
2307 /* Update the original phi in exit_bb. */
2308 adjust_phi_and_debug_stmts (update_phi, e, new_res);
2309 }
2310 }
2311
2312 /* The EPILOG loop is duplicated from the original loop for vectorizing;
2313 the args of its loop-closed SSA PHIs need to be updated.  */
2314
2315 static void
2316 slpeel_update_phi_nodes_for_lcssa (struct loop *epilog)
2317 {
2318 gphi_iterator gsi;
2319 basic_block exit_bb = single_exit (epilog)->dest;
2320
2321 gcc_assert (single_pred_p (exit_bb));
2322 edge e = EDGE_PRED (exit_bb, 0);
2323 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2324 rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
2325 }
2326
2327 /* Function vect_do_peeling.
2328
2329 Input:
2330 - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2331
2332 preheader:
2333 LOOP:
2334 header_bb:
2335 loop_body
2336 if (exit_loop_cond) goto exit_bb
2337 else goto header_bb
2338 exit_bb:
2339
2340 - NITERS: The number of iterations of the loop.
2341 - NITERSM1: The number of iterations of the loop's latch.
2342 - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2343 - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2344 CHECK_PROFITABILITY is true.
2345 Output:
2346 - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2347 iterate after vectorization; see vect_set_loop_condition for details.
2348 - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2349 should be set to the number of scalar iterations handled by the
2350 vector loop. The SSA name is only used on exit from the loop.
2351
2352 This function peels prolog and epilog from the loop, adds guards skipping
2353 PROLOG and EPILOG for various conditions. As a result, the changed CFG
2354 would look like:
2355
2356 guard_bb_1:
2357 if (prefer_scalar_loop) goto merge_bb_1
2358 else goto guard_bb_2
2359
2360 guard_bb_2:
2361 if (skip_prolog) goto merge_bb_2
2362 else goto prolog_preheader
2363
2364 prolog_preheader:
2365 PROLOG:
2366 prolog_header_bb:
2367 prolog_body
2368 if (exit_prolog_cond) goto prolog_exit_bb
2369 else goto prolog_header_bb
2370 prolog_exit_bb:
2371
2372 merge_bb_2:
2373
2374 vector_preheader:
2375 VECTOR LOOP:
2376 vector_header_bb:
2377 vector_body
2378 if (exit_vector_cond) goto vector_exit_bb
2379 else goto vector_header_bb
2380 vector_exit_bb:
2381
2382 guard_bb_3:
2383 if (skip_epilog) goto merge_bb_3
2384 else goto epilog_preheader
2385
2386 merge_bb_1:
2387
2388 epilog_preheader:
2389 EPILOG:
2390 epilog_header_bb:
2391 epilog_body
2392 if (exit_epilog_cond) goto merge_bb_3
2393 else goto epilog_header_bb
2394
2395 merge_bb_3:
2396
2397 Note this function peels the prolog and epilog, and adds the corresponding
2398 guards, only when necessary.
2399 Returns created epilogue or NULL.
2400
2401 TODO: Guard for prefer_scalar_loop should be emitted along with
2402 versioning conditions if loop versioning is needed. */
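/* A purely illustrative example: with VF == 8, a prolog of 3 iterations and
   NITERS == 100, the prolog handles 3 scalar iterations, the vector loop
   handles 96 of the remaining 97 iterations (12 vector iterations), and the
   epilog handles the final iteration.  */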
2403
2404
2405 struct loop *
2406 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
2407 tree *niters_vector, tree *step_vector,
2408 tree *niters_vector_mult_vf_var, int th,
2409 bool check_profitability, bool niters_no_overflow)
2410 {
2411 edge e, guard_e;
2412 tree type = TREE_TYPE (niters), guard_cond;
2413 basic_block guard_bb, guard_to;
2414 profile_probability prob_prolog, prob_vector, prob_epilog;
2415 int estimated_vf;
2416 int prolog_peeling = 0;
2417 /* We currently do not support prolog peeling if the target alignment is not
2418 known at compile time. 'vect_gen_prolog_loop_niters' depends on the
2419 target alignment being constant. */
2420 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2421 if (dr_info && !DR_TARGET_ALIGNMENT (dr_info).is_constant ())
2422 return NULL;
2423
2424 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2425 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2426
2427 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2428 poly_uint64 bound_epilog = 0;
2429 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2430 && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2431 bound_epilog += vf - 1;
2432 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2433 bound_epilog += 1;
2434 bool epilog_peeling = maybe_ne (bound_epilog, 0U);
2435 poly_uint64 bound_scalar = bound_epilog;
2436
2437 if (!prolog_peeling && !epilog_peeling)
2438 return NULL;
2439
2440 prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
2441 estimated_vf = vect_vf_for_cost (loop_vinfo);
2442 if (estimated_vf == 2)
2443 estimated_vf = 3;
2444 prob_prolog = prob_epilog = profile_probability::guessed_always ()
2445 .apply_scale (estimated_vf - 1, estimated_vf);
2446
2447 struct loop *prolog, *epilog = NULL, *loop = LOOP_VINFO_LOOP (loop_vinfo);
2448 struct loop *first_loop = loop;
2449 bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
2450 create_lcssa_for_virtual_phi (loop);
2451 update_ssa (TODO_update_ssa_only_virtuals);
2452
2453 if (MAY_HAVE_DEBUG_BIND_STMTS)
2454 {
2455 gcc_assert (!adjust_vec.exists ());
2456 adjust_vec.create (32);
2457 }
2458 initialize_original_copy_tables ();
2459
2460 /* Record the anchor bb at which the guard should be placed if the scalar
2461 loop might be preferred. */
2462 basic_block anchor = loop_preheader_edge (loop)->src;
2463
2464 /* Generate the number of iterations for the prolog loop. We do this here
2465 so that we can also get the upper bound on the number of iterations. */
2466 tree niters_prolog;
2467 int bound_prolog = 0;
2468 if (prolog_peeling)
2469 niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
2470 &bound_prolog);
2471 else
2472 niters_prolog = build_int_cst (type, 0);
2473
2474 /* Prolog loop may be skipped. */
2475 bool skip_prolog = (prolog_peeling != 0);
2476 /* Skip to epilog if scalar loop may be preferred. It's only needed
2477 when we peel for epilog loop and when it hasn't been checked with
2478 loop versioning. */
2479 bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2480 ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2481 bound_prolog + bound_epilog)
2482 : !LOOP_REQUIRES_VERSIONING (loop_vinfo));
2483 /* Epilog loop must be executed if the number of iterations for epilog
2484 loop is known at compile time, otherwise we need to add a check at
2485 the end of vector loop and skip to the end of epilog loop. */
2486 bool skip_epilog = (prolog_peeling < 0
2487 || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2488 || !vf.is_constant ());
2489 /* PEELING_FOR_GAPS is special because epilog loop must be executed. */
2490 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2491 skip_epilog = false;
2492
2493 if (skip_vector)
2494 {
2495 split_edge (loop_preheader_edge (loop));
2496
2497 /* Due to the order in which we peel prolog and epilog, we first
2498 propagate probability to the whole loop. The purpose is to
2499 avoid adjusting probabilities of both prolog and vector loops
2500 separately. Note in this case, the probability of epilog loop
2501 needs to be scaled back later. */
2502 basic_block bb_before_loop = loop_preheader_edge (loop)->src;
2503 if (prob_vector.initialized_p ())
2504 {
2505 scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2506 scale_loop_profile (loop, prob_vector, 0);
2507 }
2508 }
2509
2510 dump_user_location_t loop_loc = find_loop_location (loop);
2511 struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2512 if (prolog_peeling)
2513 {
2514 e = loop_preheader_edge (loop);
2515 if (!slpeel_can_duplicate_loop_p (loop, e))
2516 {
2517 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2518 "loop can't be duplicated to preheader edge.\n");
2519 gcc_unreachable ();
2520 }
2521 /* Peel prolog and put it on preheader edge of loop. */
2522 prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2523 if (!prolog)
2524 {
2525 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2526 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2527 gcc_unreachable ();
2528 }
2529 prolog->force_vectorize = false;
2530 slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
2531 first_loop = prolog;
2532 reset_original_copy_tables ();
2533
2534 /* Update the number of iterations for prolog loop. */
2535 tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
2536 vect_set_loop_condition (prolog, NULL, niters_prolog,
2537 step_prolog, NULL_TREE, false);
2538
2539 /* Skip the prolog loop. */
2540 if (skip_prolog)
2541 {
2542 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2543 niters_prolog, build_int_cst (type, 0));
2544 guard_bb = loop_preheader_edge (prolog)->src;
2545 basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
2546 guard_to = split_edge (loop_preheader_edge (loop));
2547 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2548 guard_to, guard_bb,
2549 prob_prolog.invert (),
2550 irred_flag);
2551 e = EDGE_PRED (guard_to, 0);
2552 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2553 slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
2554
2555 scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
2556 scale_loop_profile (prolog, prob_prolog, bound_prolog);
2557 }
2558 /* Update init address of DRs. */
2559 vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
2560 /* Update niters for vector loop. */
2561 LOOP_VINFO_NITERS (loop_vinfo)
2562 = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
2563 LOOP_VINFO_NITERSM1 (loop_vinfo)
2564 = fold_build2 (MINUS_EXPR, type,
2565 LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
2566 bool new_var_p = false;
2567 niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2568 /* It's guaranteed that vector loop bound before vectorization is at
2569 least VF, so set range information for newly generated var. */
2570 if (new_var_p)
2571 set_range_info (niters, VR_RANGE,
2572 wi::to_wide (build_int_cst (type, vf)),
2573 wi::to_wide (TYPE_MAX_VALUE (type)));
2574
2575 /* Prolog iterates at most bound_prolog times, latch iterates at
2576 most bound_prolog - 1 times. */
2577 record_niter_bound (prolog, bound_prolog - 1, false, true);
2578 delete_update_ssa ();
2579 adjust_vec_debug_stmts ();
2580 scev_reset ();
2581 }
2582
2583 if (epilog_peeling)
2584 {
2585 e = single_exit (loop);
2586 if (!slpeel_can_duplicate_loop_p (loop, e))
2587 {
2588 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2589 "loop can't be duplicated to exit edge.\n");
2590 gcc_unreachable ();
2591 }
2592 /* Peel epilog and put it on exit edge of loop. */
2593 epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2594 if (!epilog)
2595 {
2596 dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2597 "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2598 gcc_unreachable ();
2599 }
2600 epilog->force_vectorize = false;
2601 slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
2602
2603 /* The scalar version of the loop may be preferred.  In this case, add a
2604 guard and skip to the epilog.  Note this only happens when the number of
2605 iterations of the loop is unknown at compile time, otherwise this
2606 won't be vectorized.  */
2607 if (skip_vector)
2608 {
2609 /* Additional epilogue iteration is peeled if gap exists. */
2610 tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
2611 bound_prolog, bound_epilog,
2612 th, &bound_scalar,
2613 check_profitability);
2614 /* Build guard against NITERSM1 since NITERS may overflow. */
2615 guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
2616 guard_bb = anchor;
2617 guard_to = split_edge (loop_preheader_edge (epilog));
2618 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2619 guard_to, guard_bb,
2620 prob_vector.invert (),
2621 irred_flag);
2622 e = EDGE_PRED (guard_to, 0);
2623 e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2624 slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
2625
2626 /* Simply propagate profile info from guard_bb to guard_to which is
2627 a merge point of control flow. */
2628 guard_to->count = guard_bb->count;
2629
2630 /* Scale probability of epilog loop back.
2631 FIXME: We should avoid scaling down and back up. Profile may
2632 get lost if we scale down to 0. */
2633 basic_block *bbs = get_loop_body (epilog);
2634 for (unsigned int i = 0; i < epilog->num_nodes; i++)
2635 bbs[i]->count = bbs[i]->count.apply_scale
2636 (bbs[i]->count,
2637 bbs[i]->count.apply_probability
2638 (prob_vector));
2639 free (bbs);
2640 }
2641
2642 basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
2643 tree niters_vector_mult_vf;
2644 /* If the loop is peeled a non-zero constant number of times, niters now
2645 refers to orig_niters - prolog_peeling; it won't overflow even if
2646 orig_niters overflows.  */
2647 niters_no_overflow |= (prolog_peeling > 0);
2648 vect_gen_vector_loop_niters (loop_vinfo, niters,
2649 niters_vector, step_vector,
2650 niters_no_overflow);
2651 if (!integer_onep (*step_vector))
2652 {
2653 /* On exit from the loop we will have an easy way of calculating
2654 NITERS_VECTOR / STEP * STEP. Install a dummy definition
2655 until then. */
2656 niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
2657 SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
2658 *niters_vector_mult_vf_var = niters_vector_mult_vf;
2659 }
2660 else
2661 vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
2662 &niters_vector_mult_vf);
2663 /* Update IVs of original loop as if they were advanced by
2664 niters_vector_mult_vf steps. */
2665 gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
2666 edge update_e = skip_vector ? e : loop_preheader_edge (epilog);
2667 vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
2668 update_e);
2669
2670 if (skip_epilog)
2671 {
2672 guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2673 niters, niters_vector_mult_vf);
2674 guard_bb = single_exit (loop)->dest;
2675 guard_to = split_edge (single_exit (epilog));
2676 guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
2677 skip_vector ? anchor : guard_bb,
2678 prob_epilog.invert (),
2679 irred_flag);
2680 slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
2681 single_exit (epilog));
2682 /* Only need to handle basic block before epilog loop if it's not
2683 the guard_bb, which is the case when skip_vector is true. */
2684 if (guard_bb != bb_before_epilog)
2685 {
2686 prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
2687
2688 scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
2689 }
2690 scale_loop_profile (epilog, prob_epilog, 0);
2691 }
2692 else
2693 slpeel_update_phi_nodes_for_lcssa (epilog);
2694
2695 unsigned HOST_WIDE_INT bound;
2696 if (bound_scalar.is_constant (&bound))
2697 {
2698 gcc_assert (bound != 0);
2699 /* -1 to convert loop iterations to latch iterations. */
2700 record_niter_bound (epilog, bound - 1, false, true);
2701 }
2702
2703 delete_update_ssa ();
2704 adjust_vec_debug_stmts ();
2705 scev_reset ();
2706 }
2707 adjust_vec.release ();
2708 free_original_copy_tables ();
2709
2710 return epilog;
2711 }
2712
2713 /* Function vect_create_cond_for_niters_checks.
2714
2715 Create a conditional expression that represents the run-time checks for
2716 loop's niter. The loop is guaranteed to terminate if the run-time
2717 checks hold.
2718
2719 Input:
2720 COND_EXPR - input conditional expression.  New conditions will be chained
2721 with logical AND operation.  If it is NULL, the new condition
2722 is stored in it directly.
2723 LOOP_VINFO - field LOOP_VINFO_NITERS_ASSUMPTIONS contains the niter
2724 assumptions to be checked.
2725
2726 Output:
2727 COND_EXPR - conditional expression.
2728
2729 The returned COND_EXPR is the conditional expression to be used in the
2730 if statement that controls which version of the loop gets executed at
2731 runtime. */
2732
2733 static void
2734 vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
2735 {
2736 tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
2737
2738 if (*cond_expr)
2739 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2740 *cond_expr, part_cond_expr);
2741 else
2742 *cond_expr = part_cond_expr;
2743 }
2744
2745 /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
2746 and PART_COND_EXPR are true. Treat a null *COND_EXPR as "true". */
2747
2748 static void
2749 chain_cond_expr (tree *cond_expr, tree part_cond_expr)
2750 {
2751 if (*cond_expr)
2752 *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2753 *cond_expr, part_cond_expr);
2754 else
2755 *cond_expr = part_cond_expr;
2756 }
2757
2758 /* Function vect_create_cond_for_align_checks.
2759
2760 Create a conditional expression that represents the alignment checks for
2761 all of data references (array element references) whose alignment must be
2762 checked at runtime.
2763
2764 Input:
2765 COND_EXPR - input conditional expression. New conditions will be chained
2766 with logical AND operation.
2767 LOOP_VINFO - two fields of the loop information are used.
2768 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
2769 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
2770
2771 Output:
2772 COND_EXPR_STMT_LIST - statements needed to construct the conditional
2773 expression.
2774 The returned value is the conditional expression to be used in the if
2775 statement that controls which version of the loop gets executed at runtime.
2776
2777 The algorithm makes two assumptions:
2778 1) The number of bytes "n" in a vector is a power of 2.
2779 2) An address "a" is aligned if a%n is zero and that this
2780 test can be done as a&(n-1) == 0. For example, for 16
2781 byte vectors the test is a&0xf == 0. */
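/* For instance (purely as an illustration): for 16-byte vectors and two data
   references whose first-vector addresses are 0x1000 and 0x2008, the check
   computes (0x1000 | 0x2008) & 0xf == 0x8, which is nonzero, so the scalar
   version of the loop is executed.  */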
2782
2783 static void
2784 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
2785 tree *cond_expr,
2786 gimple_seq *cond_expr_stmt_list)
2787 {
2788 vec<stmt_vec_info> may_misalign_stmts
2789 = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2790 stmt_vec_info stmt_info;
2791 int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
2792 tree mask_cst;
2793 unsigned int i;
2794 tree int_ptrsize_type;
2795 char tmp_name[20];
2796 tree or_tmp_name = NULL_TREE;
2797 tree and_tmp_name;
2798 gimple *and_stmt;
2799 tree ptrsize_zero;
2800 tree part_cond_expr;
2801
2802 /* Check that mask is one less than a power of 2, i.e., mask is
2803 all zeros followed by all ones. */
2804 gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
2805
2806 int_ptrsize_type = signed_type_for (ptr_type_node);
2807
2808 /* Create expression (mask & (dr_1 | ... | dr_n)) where dr_i is the address
2809 of the first vector of the i'th data reference. */
2810
2811 FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2812 {
2813 gimple_seq new_stmt_list = NULL;
2814 tree addr_base;
2815 tree addr_tmp_name;
2816 tree new_or_tmp_name;
2817 gimple *addr_stmt, *or_stmt;
2818 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2819 bool negative = tree_int_cst_compare
2820 (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
2821 tree offset = negative
2822 ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;
2823
2824 /* create: addr_tmp = (int)(address_of_first_vector) */
2825 addr_base =
2826 vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
2827 offset);
2828 if (new_stmt_list != NULL)
2829 gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
2830
2831 sprintf (tmp_name, "addr2int%d", i);
2832 addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
2833 addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
2834 gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
2835
2836 /* The addresses are ORed together.  */
2837
2838 if (or_tmp_name != NULL_TREE)
2839 {
2840 /* create: or_tmp = or_tmp | addr_tmp */
2841 sprintf (tmp_name, "orptrs%d", i);
2842 new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
2843 or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
2844 or_tmp_name, addr_tmp_name);
2845 gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
2846 or_tmp_name = new_or_tmp_name;
2847 }
2848 else
2849 or_tmp_name = addr_tmp_name;
2850
2851 } /* end for i */
2852
2853 mask_cst = build_int_cst (int_ptrsize_type, mask);
2854
2855 /* create: and_tmp = or_tmp & mask */
2856 and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
2857
2858 and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
2859 or_tmp_name, mask_cst);
2860 gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
2861
2862 /* Make and_tmp the left operand of the conditional test against zero.
2863 If and_tmp has a nonzero bit then some address is unaligned.  */
2864 ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
2865 part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
2866 and_tmp_name, ptrsize_zero);
2867 chain_cond_expr (cond_expr, part_cond_expr);
2868 }
2869
2870 /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
2871 create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
2872 Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
2873 and this new condition are true. Treat a null *COND_EXPR as "true". */
2874
2875 static void
2876 vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
2877 {
2878 vec<vec_object_pair> pairs = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
2879 unsigned int i;
2880 vec_object_pair *pair;
2881 FOR_EACH_VEC_ELT (pairs, i, pair)
2882 {
2883 tree addr1 = build_fold_addr_expr (pair->first);
2884 tree addr2 = build_fold_addr_expr (pair->second);
2885 tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
2886 addr1, addr2);
2887 chain_cond_expr (cond_expr, part_cond_expr);
2888 }
2889 }
2890
2891 /* Create an expression that is true when all lower-bound conditions for
2892 the vectorized loop are met. Chain this condition with *COND_EXPR. */
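/* For a signed EXPR the comparison is done in the unsigned type; e.g.
   (purely as an illustration) with min_value == 4 the generated test is
   (unsigned) EXPR + 3 >= 7, which, in the absence of wrap-around, is
   equivalent to EXPR >= 4.  */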
2893
2894 static void
2895 vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
2896 {
2897 vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
2898 for (unsigned int i = 0; i < lower_bounds.length (); ++i)
2899 {
2900 tree expr = lower_bounds[i].expr;
2901 tree type = unsigned_type_for (TREE_TYPE (expr));
2902 expr = fold_convert (type, expr);
2903 poly_uint64 bound = lower_bounds[i].min_value;
2904 if (!lower_bounds[i].unsigned_p)
2905 {
2906 expr = fold_build2 (PLUS_EXPR, type, expr,
2907 build_int_cstu (type, bound - 1));
2908 bound += bound - 1;
2909 }
2910 tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
2911 build_int_cstu (type, bound));
2912 chain_cond_expr (cond_expr, part_cond_expr);
2913 }
2914 }
2915
2916 /* Function vect_create_cond_for_alias_checks.
2917
2918 Create a conditional expression that represents the run-time checks for
2919 overlapping of address ranges represented by a list of data references
2920 relations passed as input.
2921
2922 Input:
2923 COND_EXPR - input conditional expression. New conditions will be chained
2924 with logical AND operation.  If it is NULL, the new condition
2925 is stored in it directly.
2926 LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
2927 to be checked.
2928
2929 Output:
2930 COND_EXPR - conditional expression.
2931
2932 The returned COND_EXPR is the conditional expression to be used in the if
2933 statement that controls which version of the loop gets executed at runtime.
2934 */
2935
2936 void
2937 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
2938 {
2939 vec<dr_with_seg_len_pair_t> comp_alias_ddrs =
2940 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2941
2942 if (comp_alias_ddrs.is_empty ())
2943 return;
2944
2945 create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
2946 &comp_alias_ddrs, cond_expr);
2947 if (dump_enabled_p ())
2948 dump_printf_loc (MSG_NOTE, vect_location,
2949 "created %u versioning for alias checks.\n",
2950 comp_alias_ddrs.length ());
2951 }
2952
2953
2954 /* Function vect_loop_versioning.
2955
2956 If the loop has data references that may or may not be aligned and/or
2957 has data reference relations whose independence was not proven, then
2958 two versions of the loop need to be generated, one which is vectorized
2959 and one which isn't.  A test is then generated to control which of the
2960 loops is executed.  The test checks for the alignment of all of the
2961 data references that may or may not be aligned.  An additional
2962 sequence of runtime tests is generated for each pair of DDRs whose
2963 independence was not proven.  The vectorized version of the loop is
2964 executed only if both the alias and alignment tests pass.
2965
2966 The test generated to check which version of loop is executed
2967 is modified to also check for profitability as indicated by the
2968 cost model threshold TH.
2969
2970 The versioning precondition(s) are placed in *COND_EXPR and
2971 *COND_EXPR_STMT_LIST. */
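/* Schematically (purely as an illustration), the generated precondition may
   look like:

     if (niters >= th
         && ((addr_a | addr_b) & align_mask) == 0
         && &x != &y)
       <vectorized loop>
     else
       <scalar loop>

   where each conjunct is present only when the corresponding kind of
   versioning is required.  */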
2972
2973 struct loop *
2974 vect_loop_versioning (loop_vec_info loop_vinfo,
2975 unsigned int th, bool check_profitability,
2976 poly_uint64 versioning_threshold)
2977 {
2978 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
2979 struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2980 basic_block condition_bb;
2981 gphi_iterator gsi;
2982 gimple_stmt_iterator cond_exp_gsi;
2983 basic_block merge_bb;
2984 basic_block new_exit_bb;
2985 edge new_exit_e, e;
2986 gphi *orig_phi, *new_phi;
2987 tree cond_expr = NULL_TREE;
2988 gimple_seq cond_expr_stmt_list = NULL;
2989 tree arg;
2990 profile_probability prob = profile_probability::likely ();
2991 gimple_seq gimplify_stmt_list = NULL;
2992 tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2993 bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
2994 bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
2995 bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
2996 tree version_simd_if_cond
2997 = LOOP_REQUIRES_VERSIONING_FOR_SIMD_IF_COND (loop_vinfo);
2998
2999 if (check_profitability)
3000 cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3001 build_int_cst (TREE_TYPE (scalar_loop_iters),
3002 th - 1));
3003 if (maybe_ne (versioning_threshold, 0U))
3004 {
3005 tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
3006 build_int_cst (TREE_TYPE (scalar_loop_iters),
3007 versioning_threshold - 1));
3008 if (cond_expr)
3009 cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
3010 expr, cond_expr);
3011 else
3012 cond_expr = expr;
3013 }
3014
3015 if (version_niter)
3016 vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
3017
3018 if (cond_expr)
3019 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3020 &cond_expr_stmt_list,
3021 is_gimple_condexpr, NULL_TREE);
3022
3023 if (version_align)
3024 vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
3025 &cond_expr_stmt_list);
3026
3027 if (version_alias)
3028 {
3029 vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
3030 vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
3031 vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
3032 }
3033
3034 if (version_simd_if_cond)
3035 {
3036 gcc_assert (dom_info_available_p (CDI_DOMINATORS));
3037 if (flag_checking)
3038 if (basic_block bb
3039 = gimple_bb (SSA_NAME_DEF_STMT (version_simd_if_cond)))
3040 gcc_assert (bb != loop->header
3041 && dominated_by_p (CDI_DOMINATORS, loop->header, bb)
3042 && (scalar_loop == NULL
3043 || (bb != scalar_loop->header
3044 && dominated_by_p (CDI_DOMINATORS,
3045 scalar_loop->header, bb))));
3046 tree zero = build_zero_cst (TREE_TYPE (version_simd_if_cond));
3047 tree c = fold_build2 (NE_EXPR, boolean_type_node,
3048 version_simd_if_cond, zero);
3049 if (cond_expr)
3050 cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
3051 c, cond_expr);
3052 else
3053 cond_expr = c;
3054 if (dump_enabled_p ())
3055 dump_printf_loc (MSG_NOTE, vect_location,
3056 "created versioning for simd if condition check.\n");
3057 }
3058
3059 cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3060 &gimplify_stmt_list,
3061 is_gimple_condexpr, NULL_TREE);
3062 gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
3063
3064 /* Compute the outermost loop cond_expr and cond_expr_stmt_list are
3065 invariant in. */
3066 struct loop *outermost = outermost_invariant_loop_for_expr (loop, cond_expr);
3067 for (gimple_stmt_iterator gsi = gsi_start (cond_expr_stmt_list);
3068 !gsi_end_p (gsi); gsi_next (&gsi))
3069 {
3070 gimple *stmt = gsi_stmt (gsi);
3071 update_stmt (stmt);
3072 ssa_op_iter iter;
3073 use_operand_p use_p;
3074 basic_block def_bb;
3075 FOR_EACH_SSA_USE_OPERAND (use_p, stmt, iter, SSA_OP_USE)
3076 if ((def_bb = gimple_bb (SSA_NAME_DEF_STMT (USE_FROM_PTR (use_p))))
3077 && flow_bb_inside_loop_p (outermost, def_bb))
3078 outermost = superloop_at_depth (loop, bb_loop_depth (def_bb) + 1);
3079 }
3080
3081 /* Search for the outermost loop we can version. Avoid versioning of
3082 non-perfect nests but allow if-conversion versioned loops inside. */
3083 struct loop *loop_to_version = loop;
3084 if (flow_loop_nested_p (outermost, loop))
3085 {
3086 if (dump_enabled_p ())
3087 dump_printf_loc (MSG_NOTE, vect_location,
3088 "trying to apply versioning to outer loop %d\n",
3089 outermost->num);
3090 if (outermost->num == 0)
3091 outermost = superloop_at_depth (loop, 1);
3092 /* And avoid applying versioning on non-perfect nests. */
3093 while (loop_to_version != outermost
3094 && single_exit (loop_outer (loop_to_version))
3095 && (!loop_outer (loop_to_version)->inner->next
3096 || vect_loop_vectorized_call (loop_to_version))
3097 && (!loop_outer (loop_to_version)->inner->next
3098 || !loop_outer (loop_to_version)->inner->next->next))
3099 loop_to_version = loop_outer (loop_to_version);
3100 }
3101
3102 /* Apply versioning. If there is already a scalar version created by
3103 if-conversion re-use that. */
3104 gcond *cond;
3105 if (gimple *call = vect_loop_vectorized_call (loop_to_version, &cond))
3106 {
3107 gcc_assert (scalar_loop);
3108 condition_bb = gimple_bb (cond);
3109 gimple_cond_set_condition_from_tree (cond, cond_expr);
3110 update_stmt (cond);
3111
3112 if (cond_expr_stmt_list)
3113 {
3114 cond_exp_gsi = gsi_for_stmt (call);
3115 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3116 GSI_SAME_STMT);
3117 }
3118
3119 /* ??? if-conversion uses profile_probability::always () but
3120 prob below is profile_probability::likely (). */
3121 nloop = scalar_loop;
3122 if (dump_enabled_p ())
3123 dump_printf_loc (MSG_NOTE, vect_location,
3124 "reusing %sloop version created by if conversion\n",
3125 loop_to_version != loop ? "outer " : "");
3126 }
3127 else
3128 {
3129 if (loop_to_version != loop
3130 && dump_enabled_p ())
3131 dump_printf_loc (MSG_NOTE, vect_location,
3132 "applying loop versioning to outer loop %d\n",
3133 loop_to_version->num);
3134
3135 initialize_original_copy_tables ();
3136 nloop = loop_version (loop_to_version, cond_expr, &condition_bb,
3137 prob, prob.invert (), prob, prob.invert (), true);
3138 gcc_assert (nloop);
3139 nloop = get_loop_copy (loop);
3140
3141 /* Kill off IFN_LOOP_VECTORIZED_CALL in the copy; nobody will
3142 reap those otherwise; they also refer to the original
3143 loops. */
3144 struct loop *l = loop;
3145 while (gimple *call = vect_loop_vectorized_call (l))
3146 {
3147 call = SSA_NAME_DEF_STMT (get_current_def (gimple_call_lhs (call)));
3148 fold_loop_internal_call (call, boolean_false_node);
3149 l = loop_outer (l);
3150 }
3151 free_original_copy_tables ();
3152
3153 if (cond_expr_stmt_list)
3154 {
3155 cond_exp_gsi = gsi_last_bb (condition_bb);
3156 gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3157 GSI_SAME_STMT);
3158 }
3159
3160 /* Loop versioning violates an assumption we try to maintain during
3161 vectorization - that the loop exit block has a single predecessor.
3162 After versioning, the exit block of both loop versions is the same
3163 basic block (i.e. it has two predecessors). Just in order to simplify
3164 following transformations in the vectorizer, we fix this situation
3165 here by adding a new (empty) block on the exit-edge of the loop,
3166 with the proper loop-exit phis to maintain loop-closed-form.
3167 If loop versioning wasn't done from loop, but scalar_loop instead,
3168 merge_bb will already have just a single predecessor.  */
3169
3170 merge_bb = single_exit (loop_to_version)->dest;
3171 if (EDGE_COUNT (merge_bb->preds) >= 2)
3172 {
3173 gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
3174 new_exit_bb = split_edge (single_exit (loop_to_version));
3175 new_exit_e = single_exit (loop_to_version);
3176 e = EDGE_SUCC (new_exit_bb, 0);
3177
3178 for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi);
3179 gsi_next (&gsi))
3180 {
3181 tree new_res;
3182 orig_phi = gsi.phi ();
3183 new_res = copy_ssa_name (PHI_RESULT (orig_phi));
3184 new_phi = create_phi_node (new_res, new_exit_bb);
3185 arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
3186 add_phi_arg (new_phi, arg, new_exit_e,
3187 gimple_phi_arg_location_from_edge (orig_phi, e));
3188 adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
3189 }
3190 }
3191
3192 update_ssa (TODO_update_ssa);
3193 }
3194
3195 if (version_niter)
3196 {
3197 /* The versioned loop could be infinite, so we need to clear existing
3198 niter information, which is copied from the original loop.  */
3199 gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
3200 vect_free_loop_info_assumptions (nloop);
3201 /* And set constraint LOOP_C_INFINITE for niter analyzer. */
3202 loop_constraint_set (loop, LOOP_C_INFINITE);
3203 }
3204
3205 if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
3206 && dump_enabled_p ())
3207 {
3208 if (version_alias)
3209 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3210 vect_location,
3211 "loop versioned for vectorization because of "
3212 "possible aliasing\n");
3213 if (version_align)
3214 dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | MSG_PRIORITY_USER_FACING,
3215 vect_location,
3216 "loop versioned for vectorization to enhance "
3217 "alignment\n");
3218
3219 }
3220
3221 return nloop;
3222 }