gcc/tree-vect-loop-manip.c

   1 /* Vectorizer Specific Loop Manipulations
   2    Copyright (C) 2003-2018 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com>
   4    and Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "tree.h"
  27 #include "gimple.h"
  28 #include "cfghooks.h"
  29 #include "tree-pass.h"
  30 #include "ssa.h"
  31 #include "fold-const.h"
  32 #include "cfganal.h"
  33 #include "gimplify.h"
  34 #include "gimple-iterator.h"
  35 #include "gimplify-me.h"
  36 #include "tree-cfg.h"
  37 #include "tree-ssa-loop-manip.h"
  38 #include "tree-into-ssa.h"
  39 #include "tree-ssa.h"
  40 #include "cfgloop.h"
  41 #include "tree-scalar-evolution.h"
  42 #include "tree-vectorizer.h"
  43 #include "tree-ssa-loop-ivopts.h"
  44 #include "gimple-fold.h"
  45 #include "tree-ssa-loop-niter.h"
  46 #include "internal-fn.h"
  47 #include "stor-layout.h"
  48 #include "optabs-query.h"
  49 #include "vec-perm-indices.h"
  50
  51 /*************************************************************************
  52   Simple Loop Peeling Utilities
  53
  54   Utilities to support loop peeling for vectorization purposes.
  55  *************************************************************************/
  56
  57
  58 /* Redirect the use *OP_P to the current definition of its SSA name.  */
  59
  60 static void
  61 rename_use_op (use_operand_p op_p)
  62 {
  63   tree old_name = USE_FROM_PTR (op_p);
  64
  65   /* There is nothing to rename unless the operand is an SSA name.  */
  66   if (TREE_CODE (old_name) != SSA_NAME)
  67     return;
  68
  69   /* A name without a current definition is something defined outside
  70      of the loop; leave such uses untouched.  */
  71   tree replacement = get_current_def (old_name);
  72   if (!replacement)
  73     return;
  74
  75   /* An ordinary SSA name defined in the loop.  */
  76   SET_USE (op_p, replacement);
  77 }
  78
  79
  80 /* Rename the variables in basic block BB.  Allow renaming of PHI arguments
  81    on edges incoming from the outer-loop header if RENAME_FROM_OUTER_LOOP
  82    is true.  */
  83
  84 static void
  85 rename_variables_in_bb (basic_block bb, bool rename_from_outer_loop)
  86 {
  87   struct loop *bb_loop = bb->loop_father;
  88   struct loop *parent_loop = NULL;
  89
  90   if (rename_from_outer_loop)
  91     {
  92       gcc_assert (bb_loop);
  93       parent_loop = loop_outer (bb_loop);
  94     }
  95
  96   /* Rename every SSA use operand of the statements in BB.  */
  97   for (gimple_stmt_iterator stmt_gsi = gsi_start_bb (bb);
  98        !gsi_end_p (stmt_gsi); gsi_next (&stmt_gsi))
  99     {
 100       gimple *cur_stmt = gsi_stmt (stmt_gsi);
 101       use_operand_p operand;
 102       ssa_op_iter op_iter;
 103       FOR_EACH_SSA_USE_OPERAND (operand, cur_stmt, op_iter, SSA_OP_ALL_USES)
 104         rename_use_op (operand);
 105     }
 106
 107   /* Rename the PHI arguments on each incoming edge that qualifies.  */
 108   edge pred;
 109   edge_iterator pred_iter;
 110   FOR_EACH_EDGE (pred, pred_iter, bb->preds)
 111     {
 112       basic_block src = pred->src;
 113       if (!flow_bb_inside_loop_p (bb_loop, src))
 114         {
 115           /* Edges from outside the loop need RENAME_FROM_OUTER_LOOP.  */
 116           if (!rename_from_outer_loop)
 117             continue;
 118           /* If the outer loop has 2 inner loops, also allow an extra block,
 119              reached only from the outer-loop header, which decides which
 120              of the two loops to use using LOOP_VECTORIZED.  */
 121           if (src != parent_loop->header
 122               && parent_loop->inner->next
 123               && (!single_pred_p (src)
 124                   || single_pred (src) != parent_loop->header))
 125             continue;
 126         }
 127
 128       for (gphi_iterator phi_gsi = gsi_start_phis (bb);
 129            !gsi_end_p (phi_gsi); gsi_next (&phi_gsi))
 130         rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (phi_gsi.phi (), pred));
 131     }
 132 }
 133
 134 /* One debug stmt adjustment, as consumed by adjust_debug_stmts_now.  */
 135 struct adjust_info
 136 {
        /* Debug stmts that reference FROM are changed to reference TO.  */
 137   tree from, to;
        /* Only references in or dominated by BB are candidates.  */
 138   basic_block bb;
 139 };
 140
 141 /* A stack of values to be adjusted in debug stmts.  We have to
 142    process them LIFO, so that the closest substitution applies.  If we
 143    processed them FIFO, without the stack, we might substitute uses
 144    with a PHI DEF that would soon become non-dominant, and when we got
 145    to the suitable one, it wouldn't have anything to substitute any
 146    more.

        Adjustments are only queued here while the vector exists; when it
        does not, adjust_debug_stmts applies each adjustment immediately.  */
 147 static vec<adjust_info, va_heap> adjust_vec;
 148
 149 /* Adjust any debug stmts that referenced AI->from values to use the
 150    loop-closed AI->to, if the references are dominated by AI->bb and
 151    not by the definition of AI->from.  If AI->to is null, reset the
        value of the affected debug binds instead.  Dominator information
        must be available when this is called.  */
 152
 153 static void
 154 adjust_debug_stmts_now (adjust_info *ai)
 155 {
 156   basic_block bbphi = ai->bb;
 157   tree orig_def = ai->from;
 158   tree new_def = ai->to;
 159   imm_use_iterator imm_iter;
 160   gimple *stmt;
        /* The block that contains the statement defining AI->from.  */
 161   basic_block bbdef = gimple_bb (SSA_NAME_DEF_STMT (orig_def));
 162
        /* The dominated_by_p queries below need dominator information.  */
 163   gcc_assert (dom_info_available_p (CDI_DOMINATORS));
 164
 165   /* Adjust any debug stmts that held onto non-loop-closed
 166      references.  */
 167   FOR_EACH_IMM_USE_STMT (stmt, imm_iter, orig_def)
 168     {
 169       use_operand_p use_p;
 170       basic_block bbuse;
 171
            /* Only debug stmts are adjusted here.  */
 172       if (!is_gimple_debug (stmt))
 173         continue;
 174
            /* Any debug stmt that uses ORIG_DEF is expected to be a bind.  */
 175       gcc_assert (gimple_debug_bind_p (stmt));
 176
 177       bbuse = gimple_bb (stmt);
 178
            /* Act only if the use is in or dominated by BBPHI, but is not in
               or dominated by the block that defines ORIG_DEF.  */
 179       if ((bbuse == bbphi
 180            || dominated_by_p (CDI_DOMINATORS, bbuse, bbphi))
 181           && !(bbuse == bbdef
 182                || dominated_by_p (CDI_DOMINATORS, bbuse, bbdef)))
 183         {
 184           if (new_def)
 185             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
 186               SET_USE (use_p, new_def);
 187           else
 188             {
                    /* There is no replacement value, so drop the bound
                       value rather than keep the old reference.  */
 189               gimple_debug_bind_reset_value (stmt);
 190               update_stmt (stmt);
 191             }
 192         }
 193     }
 194 }
 195
 196 /* Apply every debug stmt adjustment that was queued in adjust_vec,
 197    most recent first.  */
 198
 199 static void
 200 adjust_vec_debug_stmts (void)
 201 {
 202   if (MAY_HAVE_DEBUG_BIND_STMTS)
 203     {
 204       gcc_assert (adjust_vec.exists ());
 205
 206       /* Drain the stack from the top so that the closest substitution
 207          is the one that applies.  */
 208       for (; !adjust_vec.is_empty (); adjust_vec.pop ())
 209         adjust_debug_stmts_now (&adjust_vec.last ());
 210     }
 211 }
 212
 213 /* Make debug stmts that refer to FROM use the loop-closed TO instead,
 214    provided that they are dominated by BB and not by the definition of
 215    FROM.  If adjust_vec exists, queue the adjustment there until
 216    adjust_vec_debug_stmts is called; otherwise apply it immediately.  */
 217
 218 static void
 219 adjust_debug_stmts (tree from, tree to, basic_block bb)
 220 {
 221   /* Only non-default, non-virtual SSA names need adjusting.  */
 222   if (!MAY_HAVE_DEBUG_BIND_STMTS
 223       || TREE_CODE (from) != SSA_NAME
 224       || SSA_NAME_IS_DEFAULT_DEF (from)
 225       || virtual_operand_p (from))
 226     return;
 227
 228   adjust_info pending;
 229   pending.from = from;
 230   pending.to = to;
 231   pending.bb = bb;
 232
 233   if (!adjust_vec.exists ())
 234     adjust_debug_stmts_now (&pending);
 235   else
 236     adjust_vec.safe_push (pending);
 237 }
 238
 239 /* Replace the argument of UPDATE_PHI on edge E with NEW_DEF.  Debug stmts
 240    that referenced the old argument, presumably non-loop-closed references
 241    left over from other transformations, are adjusted (or queued for
 242    adjustment) to use the PHI result instead.  */
 243
 244 static void
 245 adjust_phi_and_debug_stmts (gimple *update_phi, edge e, tree new_def)
 246 {
 247   tree old_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, e);
 248   SET_PHI_ARG_DEF (update_phi, e->dest_idx, new_def);
 249
 250   if (!MAY_HAVE_DEBUG_BIND_STMTS)
 251     return;
 252   tree phi_res = PHI_RESULT (update_phi);
 253   adjust_debug_stmts (old_arg, phi_res, gimple_bb (update_phi));
 254 }
 255
 256 /* Define loop mask MASK as a PHI in LOOP's header: INIT_MASK is its value
 257    during the first iteration and NEXT_MASK its value on later ones.  */
 258
 259 static void
 260 vect_set_loop_mask (struct loop *loop, tree mask, tree init_mask,
 261                     tree next_mask)
 262 {
 263   gphi *mask_phi = create_phi_node (mask, loop->header);
 264   edge entry_edge = loop_preheader_edge (loop);
 265   add_phi_arg (mask_phi, init_mask, entry_edge, UNKNOWN_LOCATION);
 266   add_phi_arg (mask_phi, next_mask, loop_latch_edge (loop), UNKNOWN_LOCATION);
 267 }
 268
 269 /* Add SEQ, if there is one, to the end of LOOP's preheader block.  */
 270
 271 static void
 272 add_preheader_seq (struct loop *loop, gimple_seq seq)
 273 {
 274   if (!seq)
 275     return;
 276
 277   edge entry_edge = loop_preheader_edge (loop);
 278   basic_block split_bb = gsi_insert_seq_on_edge_immediate (entry_edge, seq);
 279   gcc_assert (!split_bb);
 280 }
 281
 282 /* Add SEQ, if there is one, to the beginning of LOOP's header block.  */
 283
 284 static void
 285 add_header_seq (struct loop *loop, gimple_seq seq)
 286 {
 287   if (!seq)
 288     return;
 289
 290   gimple_stmt_iterator insert_gsi = gsi_after_labels (loop->header);
 291   gsi_insert_seq_before (&insert_gsi, seq, GSI_SAME_STMT);
 292 }
 293
 294 /* Return true if the target can interleave the elements of two VECTYPE
 295    vectors: their first halves if OFFSET is 0, their second halves if
 296    OFFSET is 1.  The associated permutation is stored in INDICES.  */
 297
 298 static bool
 299 interleave_supported_p (vec_perm_indices *indices, tree vectype,
 300                         unsigned int offset)
 301 {
 302   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
 303   poly_uint64 first = exact_div (nunits, 2) * offset;
 304   /* Pair each selected element with the same element plus NUNITS.  */
 305   vec_perm_builder builder (nunits, 2, 3);
 306   for (unsigned int lane = 0; lane < 3; ++lane)
 307     {
 308       builder.quick_push (first + lane);
 309       builder.quick_push (first + lane + nunits);
 310     }
 311   indices->new_vector (builder, 2, nunits);
 312   return can_vec_perm_const_p (TYPE_MODE (vectype), *indices);
 313 }
 314
 315 /* Try to use permutes to define the masks in DEST_RGM using the masks
 316    in SRC_RGM, given that the former has twice as many masks as the
 317    latter.  Return true on success, adding any new statements to SEQ.
        Return false, without adding anything to SEQ, if neither unpacking
        nor interleaving the source masks can be used.  */
 318
 319 static bool
 320 vect_maybe_permute_loop_masks (gimple_seq *seq, rgroup_masks *dest_rgm,
 321                                rgroup_masks *src_rgm)
 322 {
 323   tree src_masktype = src_rgm->mask_type;
 324   tree dest_masktype = dest_rgm->mask_type;
 325   machine_mode src_mode = TYPE_MODE (src_masktype);
 326   if (dest_rgm->max_nscalars_per_iter <= src_rgm->max_nscalars_per_iter
 327       && optab_handler (vec_unpacku_hi_optab, src_mode) != CODE_FOR_nothing
 328       && optab_handler (vec_unpacku_lo_optab, src_mode) != CODE_FOR_nothing)
 329     {
 330       /* Unpacking the source masks gives at least as many mask bits as
 331          we need.  We can then VIEW_CONVERT any excess bits away.  */
 332       tree unpack_masktype = vect_halve_mask_nunits (src_masktype);
 333       for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
 334         {
                /* Source mask I / 2 provides destination masks I and I + 1.  */
 335           tree src = src_rgm->masks[i / 2];
 336           tree dest = dest_rgm->masks[i];
                /* Even-numbered destination masks take the low half of the
                   source on little-endian targets and the high half on
                   big-endian targets; odd-numbered ones take the other half.  */
 337           tree_code code = ((i & 1) == (BYTES_BIG_ENDIAN ? 0 : 1)
 338                             ? VEC_UNPACK_HI_EXPR
 339                             : VEC_UNPACK_LO_EXPR);
 340           gassign *stmt;
 341           if (dest_masktype == unpack_masktype)
 342             stmt = gimple_build_assign (dest, code, src);
 343           else
 344             {
                    /* Unpack into a temporary of the unpacked type and then
                       reinterpret it as the destination mask type.  */
 345               tree temp = make_ssa_name (unpack_masktype);
 346               stmt = gimple_build_assign (temp, code, src);
 347               gimple_seq_add_stmt (seq, stmt);
 348               stmt = gimple_build_assign (dest, VIEW_CONVERT_EXPR,
 349                                           build1 (VIEW_CONVERT_EXPR,
 350                                                   dest_masktype, temp));
 351             }
 352           gimple_seq_add_stmt (seq, stmt);
 353         }
 354       return true;
 355     }
        /* INDICES[0] and INDICES[1] receive the permutations that interleave
           the first and second halves respectively.  */
 356   vec_perm_indices indices[2];
 357   if (dest_masktype == src_masktype
 358       && interleave_supported_p (&indices[0], src_masktype, 0)
 359       && interleave_supported_p (&indices[1], src_masktype, 1))
 360     {
 361       /* The destination requires twice as many mask bits as the source, so
 362          we can use interleaving permutes to double up the number of bits.  */
 363       tree masks[2];
 364       for (unsigned int i = 0; i < 2; ++i)
 365         masks[i] = vect_gen_perm_mask_checked (src_masktype, indices[i]);
 366       for (unsigned int i = 0; i < dest_rgm->masks.length (); ++i)
 367         {
 368           tree src = src_rgm->masks[i / 2];
 369           tree dest = dest_rgm->masks[i];
                /* Interleave SRC with itself, using the first-half selector
                   for even I and the second-half selector for odd I.  */
 370           gimple *stmt = gimple_build_assign (dest, VEC_PERM_EXPR,
 371                                               src, src, masks[i & 1]);
 372           gimple_seq_add_stmt (seq, stmt);
 373         }
 374       return true;
 375     }
 376   return false;
 377 }
 378
 379 /* Helper for vect_set_loop_condition_masked.  Generate definitions for
 380    all the masks in RGM and return a mask that is nonzero when the loop
 381    needs to iterate.  Add any new preheader statements to PREHEADER_SEQ.
 382    Use LOOP_COND_GSI to insert code before the exit gcond.
 383
 384    RGM belongs to loop LOOP.  The loop originally iterated NITERS
 385    times and has been vectorized according to LOOP_VINFO.  Each iteration
 386    of the vectorized loop handles VF iterations of the scalar loop.
 387
 388    If NITERS_SKIP is nonnull, the first iteration of the vectorized loop
 389    starts with NITERS_SKIP dummy iterations of the scalar loop before
 390    the real work starts.  The mask elements for these dummy iterations
 391    must be 0, to ensure that the extra iterations do not have an effect.
 392
 393    It is known that:
 394
 395      NITERS * RGM->max_nscalars_per_iter
 396
 397    does not overflow.  However, MIGHT_WRAP_P says whether an induction
 398    variable that starts at 0 and has step:
 399
 400      VF * RGM->max_nscalars_per_iter
 401
 402    might overflow before hitting a value above:
 403
 404      (NITERS + NITERS_SKIP) * RGM->max_nscalars_per_iter
 405
 406    This means that we cannot guarantee that such an induction variable
 407    would ever hit a value that produces a set of all-false masks for RGM.

        Each mask in RGM is defined by a PHI in LOOP's header whose initial
        value is computed in PREHEADER_SEQ and whose value for the following
        iteration is computed by a call inserted into the loop body.  */
 408
 409 static tree
 410 vect_set_loop_masks_directly (struct loop *loop, loop_vec_info loop_vinfo,
 411                               gimple_seq *preheader_seq,
 412                               gimple_stmt_iterator loop_cond_gsi,
 413                               rgroup_masks *rgm, tree vf,
 414                               tree niters, tree niters_skip,
 415                               bool might_wrap_p)
 416 {
 417   tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
 418   tree mask_type = rgm->mask_type;
 419   unsigned int nscalars_per_iter = rgm->max_nscalars_per_iter;
 420   poly_uint64 nscalars_per_mask = TYPE_VECTOR_SUBPARTS (mask_type);
 421
 422   /* Calculate the maximum number of scalar values that the rgroup
 423      handles in total, the number that it handles for each iteration
 424      of the vector loop, and the number that it should skip during the
 425      first iteration of the vector loop.  */
 426   tree nscalars_total = niters;
 427   tree nscalars_step = vf;
 428   tree nscalars_skip = niters_skip;
 429   if (nscalars_per_iter != 1)
 430     {
 431       /* We checked before choosing to use a fully-masked loop that these
 432          multiplications don't overflow.  */
 433       tree factor = build_int_cst (compare_type, nscalars_per_iter);
 434       nscalars_total = gimple_build (preheader_seq, MULT_EXPR, compare_type,
 435                                      nscalars_total, factor);
 436       nscalars_step = gimple_build (preheader_seq, MULT_EXPR, compare_type,
 437                                     nscalars_step, factor);
 438       if (nscalars_skip)
 439         nscalars_skip = gimple_build (preheader_seq, MULT_EXPR, compare_type,
 440                                       nscalars_skip, factor);
 441     }
 442
 443   /* Create an induction variable that counts the number of scalars
 444      processed.  */
 445   tree index_before_incr, index_after_incr;
 446   gimple_stmt_iterator incr_gsi;
 447   bool insert_after;
 448   tree zero_index = build_int_cst (compare_type, 0);
 449   standard_iv_increment_position (loop, &incr_gsi, &insert_after);
 450   create_iv (zero_index, nscalars_step, NULL_TREE, loop, &incr_gsi,
 451              insert_after, &index_before_incr, &index_after_incr);
 452
        /* TEST_INDEX is the IV value that is compared against TEST_LIMIT to
           compute the masks for the next iteration, and TEST_GSI is where that
           computation is inserted.  FIRST_LIMIT is the limit used when
           computing the masks for the first iteration.  */
 453   tree test_index, test_limit, first_limit;
 454   gimple_stmt_iterator *test_gsi;
 455   if (might_wrap_p)
 456     {
 457       /* In principle the loop should stop iterating once the incremented
 458          IV reaches a value greater than or equal to:
 459
 460            NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP
 461
 462          However, there's no guarantee that this addition doesn't overflow
 463          the comparison type, or that the IV hits a value above it before
 464          wrapping around.  We therefore adjust the limit down by one
 465          IV step:
 466
 467            (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
 468            -[infinite-prec] NSCALARS_STEP
 469
 470          and compare the IV against this limit _before_ incrementing it.
 471          Since the comparison type is unsigned, we actually want the
 472          subtraction to saturate at zero:
 473
 474            (NSCALARS_TOTAL +[infinite-prec] NSCALARS_SKIP)
 475            -[sat] NSCALARS_STEP
 476
 477          And since NSCALARS_SKIP < NSCALARS_STEP, we can reassociate this as:
 478
 479            NSCALARS_TOTAL -[sat] (NSCALARS_STEP - NSCALARS_SKIP)
 480
 481          where the rightmost subtraction can be done directly in
 482          COMPARE_TYPE.  */
 483       test_index = index_before_incr;
 484       tree adjust = nscalars_step;
 485       if (nscalars_skip)
 486         adjust = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
 487                                adjust, nscalars_skip);
            /* MAX followed by MINUS implements the saturating subtraction
               NSCALARS_TOTAL -[sat] ADJUST.  */
 488       test_limit = gimple_build (preheader_seq, MAX_EXPR, compare_type,
 489                                  nscalars_total, adjust);
 490       test_limit = gimple_build (preheader_seq, MINUS_EXPR, compare_type,
 491                                  test_limit, adjust);
 492       test_gsi = &incr_gsi;
 493
 494       /* Get a safe limit for the first iteration.  */
 495       if (nscalars_skip)
 496         {
 497           /* The first vector iteration can handle at most NSCALARS_STEP
 498              scalars.  NSCALARS_STEP <= CONST_LIMIT, and adding
 499              NSCALARS_SKIP to that cannot overflow.  */
 500           tree const_limit = build_int_cst (compare_type,
 501                                             LOOP_VINFO_VECT_FACTOR (loop_vinfo)
 502                                             * nscalars_per_iter);
 503           first_limit = gimple_build (preheader_seq, MIN_EXPR, compare_type,
 504                                       nscalars_total, const_limit);
 505           first_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
 506                                       first_limit, nscalars_skip);
 507         }
 508       else
 509         /* For the first iteration it doesn't matter whether the IV hits
 510            a value above NSCALARS_TOTAL.  That only matters for the latch
 511            condition.  */
 512         first_limit = nscalars_total;
 513     }
 514   else
 515     {
 516       /* Test the incremented IV, which will always hit a value above
 517          the bound before wrapping.  */
 518       test_index = index_after_incr;
 519       test_limit = nscalars_total;
 520       if (nscalars_skip)
 521         test_limit = gimple_build (preheader_seq, PLUS_EXPR, compare_type,
 522                                    test_limit, nscalars_skip);
 523       test_gsi = &loop_cond_gsi;
 524
            /* The same limit serves for the first iteration in this case.  */
 525       first_limit = test_limit;
 526     }
 527
 528   /* Provide a definition of each mask in the group.  */
 529   tree next_mask = NULL_TREE;
 530   tree mask;
 531   unsigned int i;
 532   FOR_EACH_VEC_ELT_REVERSE (rgm->masks, i, mask)
 533     {
 534       /* Previous masks will cover BIAS scalars.  This mask covers the
 535          next batch.  */
 536       poly_uint64 bias = nscalars_per_mask * i;
 537       tree bias_tree = build_int_cst (compare_type, bias);
 538       gimple *tmp_stmt;
 539
 540       /* See whether the first iteration of the vector loop is known
 541          to have a full mask.  */
 542       poly_uint64 const_limit;
 543       bool first_iteration_full
 544         = (poly_int_tree_p (first_limit, &const_limit)
 545            && known_ge (const_limit, (i + 1) * nscalars_per_mask));
 546
 547       /* Rather than have a new IV that starts at BIAS and goes up to
 548          TEST_LIMIT, prefer to use the same 0-based IV for each mask
 549          and adjust the bound down by BIAS.  */
 550       tree this_test_limit = test_limit;
 551       if (i != 0)
 552         {
                /* As above, MAX followed by MINUS is a subtraction of BIAS
                   that saturates at zero.  */
 553           this_test_limit = gimple_build (preheader_seq, MAX_EXPR,
 554                                           compare_type, this_test_limit,
 555                                           bias_tree);
 556           this_test_limit = gimple_build (preheader_seq, MINUS_EXPR,
 557                                           compare_type, this_test_limit,
 558                                           bias_tree);
 559         }
 560
 561       /* Create the initial mask.  First include all scalars that
 562          are within the loop limit.  */
 563       tree init_mask = NULL_TREE;
 564       if (!first_iteration_full)
 565         {
 566           tree start, end;
 567           if (first_limit == test_limit)
 568             {
 569               /* Use a natural test between zero (the initial IV value)
 570                  and the loop limit.  The "else" block would be valid too,
 571                  but this choice can avoid the need to load BIAS_TREE into
 572                  a register.  */
 573               start = zero_index;
 574               end = this_test_limit;
 575             }
 576           else
 577             {
 578               /* FIRST_LIMIT is the maximum number of scalars handled by the
 579                  first iteration of the vector loop.  Test the portion
 580                  associated with this mask.  */
 581               start = bias_tree;
 582               end = first_limit;
 583             }
 584
 585           init_mask = make_temp_ssa_name (mask_type, NULL, "max_mask");
 586           tmp_stmt = vect_gen_while (init_mask, start, end);
 587           gimple_seq_add_stmt (preheader_seq, tmp_stmt);
 588         }
 589
 590       /* Now AND out the bits that are within the number of skipped
 591          scalars.  */
 592       poly_uint64 const_skip;
            /* No masking is needed if the skip count is a known constant
               that does not reach beyond the start of this mask.  */
 593       if (nscalars_skip
 594           && !(poly_int_tree_p (nscalars_skip, &const_skip)
 595                && known_le (const_skip, bias)))
 596         {
 597           tree unskipped_mask = vect_gen_while_not (preheader_seq, mask_type,
 598                                                     bias_tree, nscalars_skip);
 599           if (init_mask)
 600             init_mask = gimple_build (preheader_seq, BIT_AND_EXPR, mask_type,
 601                                       init_mask, unskipped_mask);
 602           else
 603             init_mask = unskipped_mask;
 604         }
 605
 606       if (!init_mask)
 607         /* First iteration is full.  */
 608         init_mask = build_minus_one_cst (mask_type);
 609
 610       /* Get the mask value for the next iteration of the loop.  */
 611       next_mask = make_temp_ssa_name (mask_type, NULL, "next_mask");
 612       gcall *call = vect_gen_while (next_mask, test_index, this_test_limit);
            /* TEST_GSI is the IV increment position when the pre-increment
               IV is tested and the loop exit condition otherwise.  */
 613       gsi_insert_before (test_gsi, call, GSI_SAME_STMT);
 614
            /* MASK becomes a header PHI of INIT_MASK and NEXT_MASK.  */
 615       vect_set_loop_mask (loop, mask, init_mask, next_mask);
 616     }
        /* NEXT_MASK is the next-iteration value of the mask that the loop
           above processed last, or null if RGM has no masks.  */
 617   return next_mask;
 618 }
 619
 620 /* Make LOOP iterate NITERS times using masking and WHILE_ULT calls.
 621    LOOP_VINFO describes the vectorization of LOOP.  NITERS is the
 622    number of iterations of the original scalar loop that should be
 623    handled by the vector loop.  NITERS_MAYBE_ZERO and FINAL_IV are
 624    as for vect_set_loop_condition.
 625
 626    Insert the branch-back condition before LOOP_COND_GSI and return the
 627    final gcond.

        As side effects, set LOOP->nb_iterations to the latch count and, if
        FINAL_IV is nonnull, assign the original NITERS to it on the loop's
        exit edge.  */
 628
 629 static gcond *
 630 vect_set_loop_condition_masked (struct loop *loop, loop_vec_info loop_vinfo,
 631                                 tree niters, tree final_iv,
 632                                 bool niters_maybe_zero,
 633                                 gimple_stmt_iterator loop_cond_gsi)
 634 {
        /* Statements to add to the preheader and to the start of the header
           respectively; both are emitted after all rgroups are processed.  */
 635   gimple_seq preheader_seq = NULL;
 636   gimple_seq header_seq = NULL;
 637
 638   tree compare_type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
 639   unsigned int compare_precision = TYPE_PRECISION (compare_type);
 640   unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
        /* Keep the unconverted value for the FINAL_IV assignment below.  */
 641   tree orig_niters = niters;
 642
 643   /* Type of the initial value of NITERS.  */
 644   tree ni_actual_type = TREE_TYPE (niters);
 645   unsigned int ni_actual_precision = TYPE_PRECISION (ni_actual_type);
 646
 647   /* Convert NITERS to the same size as the compare.  */
 648   if (compare_precision > ni_actual_precision
 649       && niters_maybe_zero)
 650     {
 651       /* We know that there is always at least one iteration, so if the
 652          count is zero then it must have wrapped.  Cope with this by
 653          subtracting 1 before the conversion and adding 1 to the result.  */
 654       gcc_assert (TYPE_UNSIGNED (ni_actual_type));
 655       niters = gimple_build (&preheader_seq, PLUS_EXPR, ni_actual_type,
 656                              niters, build_minus_one_cst (ni_actual_type));
 657       niters = gimple_convert (&preheader_seq, compare_type, niters);
 658       niters = gimple_build (&preheader_seq, PLUS_EXPR, compare_type,
 659                              niters, build_one_cst (compare_type));
 660     }
 661   else
 662     niters = gimple_convert (&preheader_seq, compare_type, niters);
 663
 664   /* Get the number of scalar iterations to skip, or null if none.  */
 665   tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
 666
 667   /* Now calculate the value that the induction variable must be able
 668      to hit in order to ensure that we end the loop with an all-false mask.
 669      This involves adding the maximum number of inactive trailing scalar
 670      iterations.  */
 671   widest_int iv_limit;
 672   bool known_max_iters = max_loop_iterations (loop, &iv_limit);
 673   if (known_max_iters)
 674     {
 675       if (niters_skip)
 676         {
 677           /* Add the maximum number of skipped iterations to the
 678              maximum iteration count.  */
 679           if (TREE_CODE (niters_skip) == INTEGER_CST)
 680             iv_limit += wi::to_widest (niters_skip);
 681           else
                /* The skip count is not a known constant; use MAX_VF - 1 as
                   the bound.  */
 682             iv_limit += max_vf - 1;
 683         }
 684       /* IV_LIMIT is the maximum number of latch iterations, which is also
 685          the maximum in-range IV value.  Round this value down to the previous
 686          vector alignment boundary and then add an extra full iteration.  */
 687       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
 688       iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
 689     }
 690
 691   /* Get the vectorization factor in tree form.  */
 692   tree vf = build_int_cst (compare_type,
 693                            LOOP_VINFO_VECT_FACTOR (loop_vinfo));
 694
 695   /* Iterate over all the rgroups and fill in their masks.  We could use
 696      the first mask from any rgroup for the loop condition; here we
 697      arbitrarily pick the last.  */
 698   tree test_mask = NULL_TREE;
 699   rgroup_masks *rgm;
 700   unsigned int i;
 701   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
 702   FOR_EACH_VEC_ELT (*masks, i, rgm)
 703     if (!rgm->masks.is_empty ())
 704       {
 705         /* First try using permutes.  This adds a single vector
 706            instruction to the loop for each mask, but needs no extra
 707            loop invariants or IVs.  */
              /* The rgroup at index I is treated as having I + 1 masks, so
                 when that count is even, the rgroup with half as many masks
                 is at index NMASKS / 2 - 1.  */
 708         unsigned int nmasks = i + 1;
 709         if ((nmasks & 1) == 0)
 710           {
 711             rgroup_masks *half_rgm = &(*masks)[nmasks / 2 - 1];
 712             if (!half_rgm->masks.is_empty ()
 713                 && vect_maybe_permute_loop_masks (&header_seq, rgm, half_rgm))
 714               continue;
 715           }
 716
 717         /* See whether zero-based IV would ever generate all-false masks
 718            before wrapping around.  */
 719         bool might_wrap_p
 720           = (!known_max_iters
 721              || (wi::min_precision (iv_limit * rgm->max_nscalars_per_iter,
 722                                     UNSIGNED)
 723                  > compare_precision));
 724
 725         /* Set up all masks for this group.  */
 726         test_mask = vect_set_loop_masks_directly (loop, loop_vinfo,
 727                                                   &preheader_seq,
 728                                                   loop_cond_gsi, rgm, vf,
 729                                                   niters, niters_skip,
 730                                                   might_wrap_p);
 731       }
 732
 733   /* Emit all accumulated statements.  */
 734   add_preheader_seq (loop, preheader_seq);
 735   add_header_seq (loop, header_seq);
 736
 737   /* Get a boolean result that tells us whether to iterate.  */
 738   edge exit_edge = single_exit (loop);
        /* The loop exits when TEST_MASK is zero, so which comparison to use
           depends on whether the exit edge is the true edge of the gcond.  */
 739   tree_code code = (exit_edge->flags & EDGE_TRUE_VALUE) ? EQ_EXPR : NE_EXPR;
 740   tree zero_mask = build_zero_cst (TREE_TYPE (test_mask));
 741   gcond *cond_stmt = gimple_build_cond (code, test_mask, zero_mask,
 742                                         NULL_TREE, NULL_TREE);
 743   gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
 744
 745   /* The loop iterates (NITERS - 1) / VF + 1 times.
 746      Subtract one from this to get the latch count.  */
 747   tree step = build_int_cst (compare_type,
 748                              LOOP_VINFO_VECT_FACTOR (loop_vinfo));
 749   tree niters_minus_one = fold_build2 (PLUS_EXPR, compare_type, niters,
 750                                        build_minus_one_cst (compare_type));
 751   loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, compare_type,
 752                                      niters_minus_one, step);
 753
 754   if (final_iv)
 755     {
            /* Use the NITERS value in its original type, not the converted
               one.  */
 756       gassign *assign = gimple_build_assign (final_iv, orig_niters);
 757       gsi_insert_on_edge_immediate (single_exit (loop), assign);
 758     }
 759
 760   return cond_stmt;
 761 }
 762
 763 /* Like vect_set_loop_condition, but handle the case in which there
 764    are no loop masks.  */
 765
 766 static gcond *
 767 vect_set_loop_condition_unmasked (struct loop *loop, tree niters,
 768                                   tree step, tree final_iv,
 769                                   bool niters_maybe_zero,
 770                                   gimple_stmt_iterator loop_cond_gsi)
 771 {
 772   tree indx_before_incr, indx_after_incr;
 773   gcond *cond_stmt;
 774   gcond *orig_cond;
 775   edge pe = loop_preheader_edge (loop);
 776   edge exit_edge = single_exit (loop);
 777   gimple_stmt_iterator incr_gsi;
 778   bool insert_after;
 779   enum tree_code code;
 780   tree niters_type = TREE_TYPE (niters);
 781
 782   orig_cond = get_loop_exit_condition (loop);
 783   gcc_assert (orig_cond);
 784   loop_cond_gsi = gsi_for_stmt (orig_cond);
 785
 786   tree init, limit;
 787   if (!niters_maybe_zero && integer_onep (step))
 788     {
 789       /* In this case we can use a simple 0-based IV:
 790
 791          A:
 792            x = 0;
 793            do
 794              {
 795                ...
 796                x += 1;
 797              }
 798            while (x < NITERS);  */
 799       code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
 800       init = build_zero_cst (niters_type);
 801       limit = niters;
 802     }
 803   else
 804     {
 805       /* The following works for all values of NITERS except 0:
 806
 807          B:
 808            x = 0;
 809            do
 810              {
 811                ...
 812                x += STEP;
 813              }
 814            while (x <= NITERS - STEP);
 815
 816          so that the loop continues to iterate if x + STEP - 1 < NITERS
 817          but stops if x + STEP - 1 >= NITERS.
 818
 819          However, if NITERS is zero, x never hits a value above NITERS - STEP
 820          before wrapping around.  There are two obvious ways of dealing with
 821          this:
 822
 823          - start at STEP - 1 and compare x before incrementing it
 824          - start at -1 and compare x after incrementing it
 825
 826          The latter is simpler and is what we use.  The loop in this case
 827          looks like:
 828
 829          C:
 830            x = -1;
 831            do
 832              {
 833                ...
 834                x += STEP;
 835              }
 836            while (x < NITERS - STEP);
 837
 838          In both cases the loop limit is NITERS - STEP.  */
 839       gimple_seq seq = NULL;
 840       limit = force_gimple_operand (niters, &seq, true, NULL_TREE);
 841       limit = gimple_build (&seq, MINUS_EXPR, TREE_TYPE (limit), limit, step);
 842       if (seq)
 843         {
 844           basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
 845           gcc_assert (!new_bb);
 846         }
 847       if (niters_maybe_zero)
 848         {
 849           /* Case C.  */
 850           code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GE_EXPR : LT_EXPR;
 851           init = build_all_ones_cst (niters_type);
 852         }
 853       else
 854         {
 855           /* Case B.  */
 856           code = (exit_edge->flags & EDGE_TRUE_VALUE) ? GT_EXPR : LE_EXPR;
 857           init = build_zero_cst (niters_type);
 858         }
 859     }
 860
 861   standard_iv_increment_position (loop, &incr_gsi, &insert_after);
 862   create_iv (init, step, NULL_TREE, loop,
 863              &incr_gsi, insert_after, &indx_before_incr, &indx_after_incr);
 864   indx_after_incr = force_gimple_operand_gsi (&loop_cond_gsi, indx_after_incr,
 865                                               true, NULL_TREE, true,
 866                                               GSI_SAME_STMT);
 867   limit = force_gimple_operand_gsi (&loop_cond_gsi, limit, true, NULL_TREE,
 868                                      true, GSI_SAME_STMT);
 869
 870   cond_stmt = gimple_build_cond (code, indx_after_incr, limit, NULL_TREE,
 871                                  NULL_TREE);
 872
 873   gsi_insert_before (&loop_cond_gsi, cond_stmt, GSI_SAME_STMT);
 874
 875   /* Record the number of latch iterations.  */
 876   if (limit == niters)
 877     /* Case A: the loop iterates NITERS times.  Subtract one to get the
 878        latch count.  */
 879     loop->nb_iterations = fold_build2 (MINUS_EXPR, niters_type, niters,
 880                                        build_int_cst (niters_type, 1));
 881   else
 882     /* Case B or C: the loop iterates (NITERS - STEP) / STEP + 1 times.
 883        Subtract one from this to get the latch count.  */
 884     loop->nb_iterations = fold_build2 (TRUNC_DIV_EXPR, niters_type,
 885                                        limit, step);
 886
 887   if (final_iv)
 888     {
 889       gassign *assign = gimple_build_assign (final_iv, MINUS_EXPR,
 890                                              indx_after_incr, init);
 891       gsi_insert_on_edge_immediate (single_exit (loop), assign);
 892     }
 893
 894   return cond_stmt;
 895 }
 896
 897 /* If we're using fully-masked loops, make LOOP iterate:
 898
 899       N == (NITERS - 1) / STEP + 1
 900
 901    times.  When NITERS is zero, this is equivalent to making the loop
 902    execute (1 << M) / STEP times, where M is the precision of NITERS.
 903    NITERS_MAYBE_ZERO is true if this last case might occur.
 904
 905    If we're not using fully-masked loops, make LOOP iterate:
 906
 907       N == (NITERS - STEP) / STEP + 1
 908
 909    times, where NITERS is known to be outside the range [1, STEP - 1].
 910    This is equivalent to making the loop execute NITERS / STEP times
 911    when NITERS is nonzero and (1 << M) / STEP times otherwise.
 912    NITERS_MAYBE_ZERO again indicates whether this last case might occur.
 913
 914    If FINAL_IV is nonnull, it is an SSA name that should be set to
 915    N * STEP on exit from the loop.
 916
 917    Assumption: the exit-condition of LOOP is the last stmt in the loop.  */
 918
 919 void
 920 vect_set_loop_condition (struct loop *loop, loop_vec_info loop_vinfo,
 921                          tree niters, tree step, tree final_iv,
 922                          bool niters_maybe_zero)
 923 {
 924   gcond *cond_stmt;
 925   gcond *orig_cond = get_loop_exit_condition (loop);
 926   gimple_stmt_iterator loop_cond_gsi = gsi_for_stmt (orig_cond);
 927
 928   if (loop_vinfo && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
 929     cond_stmt = vect_set_loop_condition_masked (loop, loop_vinfo, niters,
 930                                                 final_iv, niters_maybe_zero,
 931                                                 loop_cond_gsi);
 932   else
 933     cond_stmt = vect_set_loop_condition_unmasked (loop, niters, step,
 934                                                   final_iv, niters_maybe_zero,
 935                                                   loop_cond_gsi);
 936
 937   /* Remove old loop exit test.  */
 938   gsi_remove (&loop_cond_gsi, true);
 939   free_stmt_vec_info (orig_cond);
 940
 941   if (dump_enabled_p ())
 942     {
 943       dump_printf_loc (MSG_NOTE, vect_location, "New loop exit condition: ");
 944       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, cond_stmt, 0);
 945     }
 946 }
 947
 948 /* Helper routine of slpeel_tree_duplicate_loop_to_edge_cfg.
 949    For all PHI arguments in FROM->dest and TO->dest from those
 950    edges ensure that TO->dest PHI arguments have current_def
 951    to that in from.  */
 952
 953 static void
 954 slpeel_duplicate_current_defs_from_edges (edge from, edge to)
 955 {
 956   gimple_stmt_iterator gsi_from, gsi_to;
 957
 958   for (gsi_from = gsi_start_phis (from->dest),
 959        gsi_to = gsi_start_phis (to->dest);
 960        !gsi_end_p (gsi_from) && !gsi_end_p (gsi_to);)
 961     {
 962       gimple *from_phi = gsi_stmt (gsi_from);
 963       gimple *to_phi = gsi_stmt (gsi_to);
 964       tree from_arg = PHI_ARG_DEF_FROM_EDGE (from_phi, from);
 965       tree to_arg = PHI_ARG_DEF_FROM_EDGE (to_phi, to);
 966       if (virtual_operand_p (from_arg))
 967         {
 968           gsi_next (&gsi_from);
 969           continue;
 970         }
 971       if (virtual_operand_p (to_arg))
 972         {
 973           gsi_next (&gsi_to);
 974           continue;
 975         }
 976       if (TREE_CODE (from_arg) != SSA_NAME)
 977         gcc_assert (operand_equal_p (from_arg, to_arg, 0));
 978       else
 979         {
 980           if (get_current_def (to_arg) == NULL_TREE)
 981             set_current_def (to_arg, get_current_def (from_arg));
 982         }
 983       gsi_next (&gsi_from);
 984       gsi_next (&gsi_to);
 985     }
 986
 987   gphi *from_phi = get_virtual_phi (from->dest);
 988   gphi *to_phi = get_virtual_phi (to->dest);
 989   if (from_phi)
 990     set_current_def (PHI_ARG_DEF_FROM_EDGE (to_phi, to),
 991                      get_current_def (PHI_ARG_DEF_FROM_EDGE (from_phi, from)));
 992 }
 993
 994
 995 /* Given LOOP this function generates a new copy of it and puts it
 996    on E which is either the entry or exit of LOOP.  If SCALAR_LOOP is
 997    non-NULL, assume LOOP and SCALAR_LOOP are equivalent and copy the
 998    basic blocks from SCALAR_LOOP instead of LOOP, but to either the
 999    entry or exit of LOOP.  */
1000
1001 struct loop *
1002 slpeel_tree_duplicate_loop_to_edge_cfg (struct loop *loop,
1003                                         struct loop *scalar_loop, edge e)
1004 {
1005   struct loop *new_loop;
1006   basic_block *new_bbs, *bbs, *pbbs;
1007   bool at_exit;
1008   bool was_imm_dom;
1009   basic_block exit_dest;
1010   edge exit, new_exit;
1011   bool duplicate_outer_loop = false;
1012
1013   exit = single_exit (loop);
1014   at_exit = (e == exit);
1015   if (!at_exit && e != loop_preheader_edge (loop))
1016     return NULL;
1017
1018   if (scalar_loop == NULL)
1019     scalar_loop = loop;
1020
1021   bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1022   pbbs = bbs + 1;
1023   get_loop_body_with_size (scalar_loop, pbbs, scalar_loop->num_nodes);
1024   /* Allow duplication of outer loops.  */
1025   if (scalar_loop->inner)
1026     duplicate_outer_loop = true;
1027   /* Check whether duplication is possible.  */
1028   if (!can_copy_bbs_p (pbbs, scalar_loop->num_nodes))
1029     {
1030       free (bbs);
1031       return NULL;
1032     }
1033
1034   /* Generate new loop structure.  */
1035   new_loop = duplicate_loop (scalar_loop, loop_outer (scalar_loop));
1036   duplicate_subloops (scalar_loop, new_loop);
1037
1038   exit_dest = exit->dest;
1039   was_imm_dom = (get_immediate_dominator (CDI_DOMINATORS,
1040                                           exit_dest) == loop->header ?
1041                  true : false);
1042
1043   /* Also copy the pre-header, this avoids jumping through hoops to
1044      duplicate the loop entry PHI arguments.  Create an empty
1045      pre-header unconditionally for this.  */
1046   basic_block preheader = split_edge (loop_preheader_edge (scalar_loop));
1047   edge entry_e = single_pred_edge (preheader);
1048   bbs[0] = preheader;
1049   new_bbs = XNEWVEC (basic_block, scalar_loop->num_nodes + 1);
1050
1051   exit = single_exit (scalar_loop);
1052   copy_bbs (bbs, scalar_loop->num_nodes + 1, new_bbs,
1053             &exit, 1, &new_exit, NULL,
1054             at_exit ? loop->latch : e->src, true);
1055   exit = single_exit (loop);
1056   basic_block new_preheader = new_bbs[0];
1057
1058   add_phi_args_after_copy (new_bbs, scalar_loop->num_nodes + 1, NULL);
1059
1060   if (scalar_loop != loop)
1061     {
1062       /* If we copied from SCALAR_LOOP rather than LOOP, SSA_NAMEs from
1063          SCALAR_LOOP will have current_def set to SSA_NAMEs in the new_loop,
1064          but LOOP will not.  slpeel_update_phi_nodes_for_guard{1,2} expects
1065          the LOOP SSA_NAMEs (on the exit edge and edge from latch to
1066          header) to have current_def set, so copy them over.  */
1067       slpeel_duplicate_current_defs_from_edges (single_exit (scalar_loop),
1068                                                 exit);
1069       slpeel_duplicate_current_defs_from_edges (EDGE_SUCC (scalar_loop->latch,
1070                                                            0),
1071                                                 EDGE_SUCC (loop->latch, 0));
1072     }
1073
1074   if (at_exit) /* Add the loop copy at exit.  */
1075     {
1076       if (scalar_loop != loop)
1077         {
1078           gphi_iterator gsi;
1079           new_exit = redirect_edge_and_branch (new_exit, exit_dest);
1080
1081           for (gsi = gsi_start_phis (exit_dest); !gsi_end_p (gsi);
1082                gsi_next (&gsi))
1083             {
1084               gphi *phi = gsi.phi ();
1085               tree orig_arg = PHI_ARG_DEF_FROM_EDGE (phi, e);
1086               location_t orig_locus
1087                 = gimple_phi_arg_location_from_edge (phi, e);
1088
1089               add_phi_arg (phi, orig_arg, new_exit, orig_locus);
1090             }
1091         }
1092       redirect_edge_and_branch_force (e, new_preheader);
1093       flush_pending_stmts (e);
1094       set_immediate_dominator (CDI_DOMINATORS, new_preheader, e->src);
1095       if (was_imm_dom || duplicate_outer_loop)
1096         set_immediate_dominator (CDI_DOMINATORS, exit_dest, new_exit->src);
1097
1098       /* And remove the non-necessary forwarder again.  Keep the other
1099          one so we have a proper pre-header for the loop at the exit edge.  */
1100       redirect_edge_pred (single_succ_edge (preheader),
1101                           single_pred (preheader));
1102       delete_basic_block (preheader);
1103       set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1104                                loop_preheader_edge (scalar_loop)->src);
1105     }
1106   else /* Add the copy at entry.  */
1107     {
1108       if (scalar_loop != loop)
1109         {
1110           /* Remove the non-necessary forwarder of scalar_loop again.  */
1111           redirect_edge_pred (single_succ_edge (preheader),
1112                               single_pred (preheader));
1113           delete_basic_block (preheader);
1114           set_immediate_dominator (CDI_DOMINATORS, scalar_loop->header,
1115                                    loop_preheader_edge (scalar_loop)->src);
1116           preheader = split_edge (loop_preheader_edge (loop));
1117           entry_e = single_pred_edge (preheader);
1118         }
1119
1120       redirect_edge_and_branch_force (entry_e, new_preheader);
1121       flush_pending_stmts (entry_e);
1122       set_immediate_dominator (CDI_DOMINATORS, new_preheader, entry_e->src);
1123
1124       redirect_edge_and_branch_force (new_exit, preheader);
1125       flush_pending_stmts (new_exit);
1126       set_immediate_dominator (CDI_DOMINATORS, preheader, new_exit->src);
1127
1128       /* And remove the non-necessary forwarder again.  Keep the other
1129          one so we have a proper pre-header for the loop at the exit edge.  */
1130       redirect_edge_pred (single_succ_edge (new_preheader),
1131                           single_pred (new_preheader));
1132       delete_basic_block (new_preheader);
1133       set_immediate_dominator (CDI_DOMINATORS, new_loop->header,
1134                                loop_preheader_edge (new_loop)->src);
1135     }
1136
1137   /* Skip new preheader since it's deleted if copy loop is added at entry.  */
1138   for (unsigned i = (at_exit ? 0 : 1); i < scalar_loop->num_nodes + 1; i++)
1139     rename_variables_in_bb (new_bbs[i], duplicate_outer_loop);
1140
1141   if (scalar_loop != loop)
1142     {
1143       /* Update new_loop->header PHIs, so that on the preheader
1144          edge they are the ones from loop rather than scalar_loop.  */
1145       gphi_iterator gsi_orig, gsi_new;
1146       edge orig_e = loop_preheader_edge (loop);
1147       edge new_e = loop_preheader_edge (new_loop);
1148
1149       for (gsi_orig = gsi_start_phis (loop->header),
1150            gsi_new = gsi_start_phis (new_loop->header);
1151            !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_new);
1152            gsi_next (&gsi_orig), gsi_next (&gsi_new))
1153         {
1154           gphi *orig_phi = gsi_orig.phi ();
1155           gphi *new_phi = gsi_new.phi ();
1156           tree orig_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
1157           location_t orig_locus
1158             = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
1159
1160           add_phi_arg (new_phi, orig_arg, new_e, orig_locus);
1161         }
1162     }
1163
1164   free (new_bbs);
1165   free (bbs);
1166
1167   checking_verify_dominators (CDI_DOMINATORS);
1168
1169   return new_loop;
1170 }
1171
1172
1173 /* Given the condition expression COND, put it as the last statement of
1174    GUARD_BB; set both edges' probability; set dominator of GUARD_TO to
1175    DOM_BB; return the skip edge.  GUARD_TO is the target basic block to
1176    skip the loop.  PROBABILITY is the skip edge's probability.  Mark the
1177    new edge as irreducible if IRREDUCIBLE_P is true.  */
1178
1179 static edge
1180 slpeel_add_loop_guard (basic_block guard_bb, tree cond,
1181                        basic_block guard_to, basic_block dom_bb,
1182                        profile_probability probability, bool irreducible_p)
1183 {
1184   gimple_stmt_iterator gsi;
1185   edge new_e, enter_e;
1186   gcond *cond_stmt;
1187   gimple_seq gimplify_stmt_list = NULL;
1188
1189   enter_e = EDGE_SUCC (guard_bb, 0);
1190   enter_e->flags &= ~EDGE_FALLTHRU;
1191   enter_e->flags |= EDGE_FALSE_VALUE;
1192   gsi = gsi_last_bb (guard_bb);
1193
1194   cond = force_gimple_operand_1 (cond, &gimplify_stmt_list, is_gimple_condexpr,
1195                                  NULL_TREE);
1196   if (gimplify_stmt_list)
1197     gsi_insert_seq_after (&gsi, gimplify_stmt_list, GSI_NEW_STMT);
1198
1199   cond_stmt = gimple_build_cond_from_tree (cond, NULL_TREE, NULL_TREE);
1200   gsi = gsi_last_bb (guard_bb);
1201   gsi_insert_after (&gsi, cond_stmt, GSI_NEW_STMT);
1202
1203   /* Add new edge to connect guard block to the merge/loop-exit block.  */
1204   new_e = make_edge (guard_bb, guard_to, EDGE_TRUE_VALUE);
1205
1206   new_e->probability = probability;
1207   if (irreducible_p)
1208     new_e->flags |= EDGE_IRREDUCIBLE_LOOP;
1209
1210   enter_e->probability = probability.invert ();
1211   set_immediate_dominator (CDI_DOMINATORS, guard_to, dom_bb);
1212
1213   /* Split enter_e to preserve LOOPS_HAVE_PREHEADERS.  */
1214   if (enter_e->dest->loop_father->header == enter_e->dest)
1215     split_edge (enter_e);
1216
1217   return new_e;
1218 }
1219
1220
1221 /* This function verifies that the following restrictions apply to LOOP:
1222    (1) it consists of exactly 2 basic blocks - header, and an empty latch
1223        for innermost loop and 5 basic blocks for outer-loop.
1224    (2) it is single entry, single exit
1225    (3) its exit condition is the last stmt in the header
1226    (4) E is the entry/exit edge of LOOP.
1227  */
1228
1229 bool
1230 slpeel_can_duplicate_loop_p (const struct loop *loop, const_edge e)
1231 {
1232   edge exit_e = single_exit (loop);
1233   edge entry_e = loop_preheader_edge (loop);
1234   gcond *orig_cond = get_loop_exit_condition (loop);
1235   gimple_stmt_iterator loop_exit_gsi = gsi_last_bb (exit_e->src);
1236   unsigned int num_bb = loop->inner? 5 : 2;
1237
1238   /* All loops have an outer scope; the only case loop->outer is NULL is for
1239      the function itself.  */
1240   if (!loop_outer (loop)
1241       || loop->num_nodes != num_bb
1242       || !empty_block_p (loop->latch)
1243       || !single_exit (loop)
1244       /* Verify that new loop exit condition can be trivially modified.  */
1245       || (!orig_cond || orig_cond != gsi_stmt (loop_exit_gsi))
1246       || (e != exit_e && e != entry_e))
1247     return false;
1248
1249   return true;
1250 }
1251
1252 /* If the loop has a virtual PHI, but exit bb doesn't, create a virtual PHI
1253    in the exit bb and rename all the uses after the loop.  This simplifies
1254    the *guard[12] routines, which assume loop closed SSA form for all PHIs
1255    (but normally loop closed SSA form doesn't require virtual PHIs to be
1256    in the same form).  Doing this early simplifies the checking what
1257    uses should be renamed.  */
1258
1259 static void
1260 create_lcssa_for_virtual_phi (struct loop *loop)
1261 {
1262   gphi_iterator gsi;
1263   edge exit_e = single_exit (loop);
1264
1265   for (gsi = gsi_start_phis (loop->header); !gsi_end_p (gsi); gsi_next (&gsi))
1266     if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1267       {
1268         gphi *phi = gsi.phi ();
1269         for (gsi = gsi_start_phis (exit_e->dest);
1270              !gsi_end_p (gsi); gsi_next (&gsi))
1271           if (virtual_operand_p (gimple_phi_result (gsi_stmt (gsi))))
1272             break;
1273         if (gsi_end_p (gsi))
1274           {
1275             tree new_vop = copy_ssa_name (PHI_RESULT (phi));
1276             gphi *new_phi = create_phi_node (new_vop, exit_e->dest);
1277             tree vop = PHI_ARG_DEF_FROM_EDGE (phi, EDGE_SUCC (loop->latch, 0));
1278             imm_use_iterator imm_iter;
1279             gimple *stmt;
1280             use_operand_p use_p;
1281
1282             SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_vop)
1283               = SSA_NAME_OCCURS_IN_ABNORMAL_PHI (vop);
1284             add_phi_arg (new_phi, vop, exit_e, UNKNOWN_LOCATION);
1285             gimple_phi_set_result (new_phi, new_vop);
1286             FOR_EACH_IMM_USE_STMT (stmt, imm_iter, vop)
1287               if (stmt != new_phi
1288                   && !flow_bb_inside_loop_p (loop, gimple_bb (stmt)))
1289                 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
1290                   SET_USE (use_p, new_vop);
1291           }
1292         break;
1293       }
1294
1295 }
1296
1297 /* Function vect_get_loop_location.
1298
1299    Extract the location of the loop in the source code.
1300    If the loop is not well formed for vectorization, an estimated
1301    location is calculated.
1302    Return the loop location if succeed and NULL if not.  */
1303
1304 dump_user_location_t
1305 find_loop_location (struct loop *loop)
1306 {
1307   gimple *stmt = NULL;
1308   basic_block bb;
1309   gimple_stmt_iterator si;
1310
1311   if (!loop)
1312     return dump_user_location_t ();
1313
1314   stmt = get_loop_exit_condition (loop);
1315
1316   if (stmt
1317       && LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1318     return stmt;
1319
1320   /* If we got here the loop is probably not "well formed",
1321      try to estimate the loop location */
1322
1323   if (!loop->header)
1324     return dump_user_location_t ();
1325
1326   bb = loop->header;
1327
1328   for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1329     {
1330       stmt = gsi_stmt (si);
1331       if (LOCATION_LOCUS (gimple_location (stmt)) > BUILTINS_LOCATION)
1332         return stmt;
1333     }
1334
1335   return dump_user_location_t ();
1336 }
1337
1338 /* Return true if the phi described by STMT_INFO defines an IV of the
1339    loop to be vectorized.  */
1340
1341 static bool
1342 iv_phi_p (stmt_vec_info stmt_info)
1343 {
1344   gphi *phi = as_a <gphi *> (stmt_info->stmt);
1345   if (virtual_operand_p (PHI_RESULT (phi)))
1346     return false;
1347
1348   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1349       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
1350     return false;
1351
1352   return true;
1353 }
1354
1355 /* Function vect_can_advance_ivs_p
1356
1357    In case the number of iterations that LOOP iterates is unknown at compile
1358    time, an epilog loop will be generated, and the loop induction variables
1359    (IVs) will be "advanced" to the value they are supposed to take just before
1360    the epilog loop.  Here we check that the access function of the loop IVs
1361    and the expression that represents the loop bound are simple enough.
1362    These restrictions will be relaxed in the future.  */
1363
1364 bool
1365 vect_can_advance_ivs_p (loop_vec_info loop_vinfo)
1366 {
1367   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1368   basic_block bb = loop->header;
1369   gphi_iterator gsi;
1370
1371   /* Analyze phi functions of the loop header.  */
1372
1373   if (dump_enabled_p ())
1374     dump_printf_loc (MSG_NOTE, vect_location, "vect_can_advance_ivs_p:\n");
1375   for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
1376     {
1377       tree evolution_part;
1378
1379       gphi *phi = gsi.phi ();
1380       stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1381       if (dump_enabled_p ())
1382         {
1383           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
1384           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi_info->stmt, 0);
1385         }
1386
1387       /* Skip virtual phi's. The data dependences that are associated with
1388          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.
1389
1390          Skip reduction phis.  */
1391       if (!iv_phi_p (phi_info))
1392         {
1393           if (dump_enabled_p ())
1394             dump_printf_loc (MSG_NOTE, vect_location,
1395                              "reduc or virtual phi. skip.\n");
1396           continue;
1397         }
1398
1399       /* Analyze the evolution function.  */
1400
1401       evolution_part = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1402       if (evolution_part == NULL_TREE)
1403         {
1404           if (dump_enabled_p ())
1405             dump_printf (MSG_MISSED_OPTIMIZATION,
1406                          "No access function or evolution.\n");
1407           return false;
1408         }
1409
1410       /* FORNOW: We do not transform initial conditions of IVs
1411          which evolution functions are not invariants in the loop.  */
1412
1413       if (!expr_invariant_in_loop_p (loop, evolution_part))
1414         {
1415           if (dump_enabled_p ())
1416             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1417                              "evolution not invariant in loop.\n");
1418           return false;
1419         }
1420
1421       /* FORNOW: We do not transform initial conditions of IVs
1422          which evolution functions are a polynomial of degree >= 2.  */
1423
1424       if (tree_is_chrec (evolution_part))
1425         {
1426           if (dump_enabled_p ())
1427             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1428                              "evolution is chrec.\n");
1429           return false;
1430         }
1431     }
1432
1433   return true;
1434 }
1435
1436
1437 /*   Function vect_update_ivs_after_vectorizer.
1438
1439      "Advance" the induction variables of LOOP to the value they should take
1440      after the execution of LOOP.  This is currently necessary because the
1441      vectorizer does not handle induction variables that are used after the
1442      loop.  Such a situation occurs when the last iterations of LOOP are
1443      peeled, because:
1444      1. We introduced new uses after LOOP for IVs that were not originally used
1445         after LOOP: the IVs of LOOP are now used by an epilog loop.
1446      2. LOOP is going to be vectorized; this means that it will iterate N/VF
1447         times, whereas the loop IVs should be bumped N times.
1448
1449      Input:
1450      - LOOP - a loop that is going to be vectorized. The last few iterations
1451               of LOOP were peeled.
1452      - NITERS - the number of iterations that LOOP executes (before it is
1453                 vectorized). i.e, the number of times the ivs should be bumped.
1454      - UPDATE_E - a successor edge of LOOP->exit that is on the (only) path
1455                   coming out from LOOP on which there are uses of the LOOP ivs
1456                   (this is the path from LOOP->exit to epilog_loop->preheader).
1457
1458                   The new definitions of the ivs are placed in LOOP->exit.
1459                   The phi args associated with the edge UPDATE_E in the bb
1460                   UPDATE_E->dest are updated accordingly.
1461
1462      Assumption 1: Like the rest of the vectorizer, this function assumes
1463      a single loop exit that has a single predecessor.
1464
1465      Assumption 2: The phi nodes in the LOOP header and in update_bb are
1466      organized in the same order.
1467
1468      Assumption 3: The access function of the ivs is simple enough (see
1469      vect_can_advance_ivs_p).  This assumption will be relaxed in the future.
1470
1471      Assumption 4: Exactly one of the successors of LOOP exit-bb is on a path
1472      coming out of LOOP on which the ivs of LOOP are used (this is the path
1473      that leads to the epilog loop; other paths skip the epilog loop).  This
1474      path starts with the edge UPDATE_E, and its destination (denoted update_bb)
1475      needs to have its phis updated.
1476  */
1477
1478 static void
1479 vect_update_ivs_after_vectorizer (loop_vec_info loop_vinfo,
1480                                   tree niters, edge update_e)
1481 {
1482   gphi_iterator gsi, gsi1;
1483   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1484   basic_block update_bb = update_e->dest;
1485   basic_block exit_bb = single_exit (loop)->dest;
1486
1487   /* Make sure there exists a single-predecessor exit bb:  */
1488   gcc_assert (single_pred_p (exit_bb));
1489   gcc_assert (single_succ_edge (exit_bb) == update_e);
1490
1491   for (gsi = gsi_start_phis (loop->header), gsi1 = gsi_start_phis (update_bb);
1492        !gsi_end_p (gsi) && !gsi_end_p (gsi1);
1493        gsi_next (&gsi), gsi_next (&gsi1))
1494     {
1495       tree init_expr;
1496       tree step_expr, off;
1497       tree type;
1498       tree var, ni, ni_name;
1499       gimple_stmt_iterator last_gsi;
1500
1501       gphi *phi = gsi.phi ();
1502       gphi *phi1 = gsi1.phi ();
1503       stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
1504       if (dump_enabled_p ())
1505         {
1506           dump_printf_loc (MSG_NOTE, vect_location,
1507                            "vect_update_ivs_after_vectorizer: phi: ");
1508           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1509         }
1510
1511       /* Skip reduction and virtual phis.  */
1512       if (!iv_phi_p (phi_info))
1513         {
1514           if (dump_enabled_p ())
1515             dump_printf_loc (MSG_NOTE, vect_location,
1516                              "reduc or virtual phi. skip.\n");
1517           continue;
1518         }
1519
1520       type = TREE_TYPE (gimple_phi_result (phi));
1521       step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
1522       step_expr = unshare_expr (step_expr);
1523
1524       /* FORNOW: We do not support IVs whose evolution function is a polynomial
1525          of degree >= 2 or exponential.  */
1526       gcc_assert (!tree_is_chrec (step_expr));
1527
1528       init_expr = PHI_ARG_DEF_FROM_EDGE (phi, loop_preheader_edge (loop));
1529
1530       off = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
1531                          fold_convert (TREE_TYPE (step_expr), niters),
1532                          step_expr);
1533       if (POINTER_TYPE_P (type))
1534         ni = fold_build_pointer_plus (init_expr, off);
1535       else
1536         ni = fold_build2 (PLUS_EXPR, type,
1537                           init_expr, fold_convert (type, off));
1538
1539       var = create_tmp_var (type, "tmp");
1540
1541       last_gsi = gsi_last_bb (exit_bb);
1542       gimple_seq new_stmts = NULL;
1543       ni_name = force_gimple_operand (ni, &new_stmts, false, var);
1544       /* Exit_bb shouldn't be empty.  */
1545       if (!gsi_end_p (last_gsi))
1546         gsi_insert_seq_after (&last_gsi, new_stmts, GSI_SAME_STMT);
1547       else
1548         gsi_insert_seq_before (&last_gsi, new_stmts, GSI_SAME_STMT);
1549
1550       /* Fix phi expressions in the successor bb.  */
1551       adjust_phi_and_debug_stmts (phi1, update_e, ni_name);
1552     }
1553 }
1554
1555 /* Return a gimple value containing the misalignment (measured in vector
1556    elements) for the loop described by LOOP_VINFO, i.e. how many elements
1557    it is away from a perfectly aligned address.  Add any new statements
1558    to SEQ.  */
1559
1560 static tree
1561 get_misalign_in_elems (gimple **seq, loop_vec_info loop_vinfo)
1562 {
1563   struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1564   stmt_vec_info stmt_info = vect_dr_stmt (dr);
1565   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1566
1567   unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1568   gcc_assert (target_align != 0);
1569
1570   bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1571   tree offset = (negative
1572                  ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1)
1573                  : size_zero_node);
1574   tree start_addr = vect_create_addr_base_for_vector_ref (stmt_info, seq,
1575                                                           offset);
1576   tree type = unsigned_type_for (TREE_TYPE (start_addr));
1577   tree target_align_minus_1 = build_int_cst (type, target_align - 1);
1578   HOST_WIDE_INT elem_size
1579     = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1580   tree elem_size_log = build_int_cst (type, exact_log2 (elem_size));
1581
1582   /* Create:  misalign_in_bytes = addr & (target_align - 1).  */
1583   tree int_start_addr = fold_convert (type, start_addr);
1584   tree misalign_in_bytes = fold_build2 (BIT_AND_EXPR, type, int_start_addr,
1585                                         target_align_minus_1);
1586
1587   /* Create:  misalign_in_elems = misalign_in_bytes / element_size.  */
1588   tree misalign_in_elems = fold_build2 (RSHIFT_EXPR, type, misalign_in_bytes,
1589                                         elem_size_log);
1590
1591   return misalign_in_elems;
1592 }
1593
1594 /* Function vect_gen_prolog_loop_niters
1595
1596    Generate the number of iterations which should be peeled as prolog for the
1597    loop represented by LOOP_VINFO.  It is calculated as the misalignment of
1598    DR - the data reference recorded in LOOP_VINFO_UNALIGNED_DR (LOOP_VINFO).
1599    As a result, after the execution of this loop, the data reference DR will
1600    refer to an aligned location.  The following computation is generated:
1601
1602    If the misalignment of DR is known at compile time:
1603      addr_mis = int mis = DR_MISALIGNMENT (dr);
1604    Else, compute address misalignment in bytes:
1605      addr_mis = addr & (target_align - 1)
1606
1607    prolog_niters = ((VF - addr_mis/elem_size)&(VF-1))/step
1608
1609    (elem_size = element type size; an element is the scalar element whose type
1610    is the inner type of the vectype)
1611
1612    The computations will be emitted at the end of BB.  We also compute and
1613    store upper bound (included) of the result in BOUND.
1614
1615    When the step of the data-ref in the loop is not 1 (as in interleaved data
1616    and SLP), the number of iterations of the prolog must be divided by the step
1617    (which is equal to the size of interleaved group).
1618
1619    The above formulas assume that VF == number of elements in the vector. This
1620    may not hold when there are multiple-types in the loop.
1621    In this case, for some data-references in the loop the VF does not represent
1622    the number of elements that fit in the vector.  Therefore, instead of VF we
1623    use TYPE_VECTOR_SUBPARTS.  */
1624
1625 static tree
1626 vect_gen_prolog_loop_niters (loop_vec_info loop_vinfo,
1627                              basic_block bb, int *bound)
1628 {
1629   struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
1630   tree var;
1631   tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
1632   gimple_seq stmts = NULL, new_stmts = NULL;
1633   tree iters, iters_name;
1634   stmt_vec_info stmt_info = vect_dr_stmt (dr);
1635   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
1636   unsigned int target_align = DR_TARGET_ALIGNMENT (dr);
1637
1638   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1639     {
1640       int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1641
1642       if (dump_enabled_p ())
1643         dump_printf_loc (MSG_NOTE, vect_location,
1644                          "known peeling = %d.\n", npeel);
1645
1646       iters = build_int_cst (niters_type, npeel);
1647       *bound = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1648     }
1649   else
1650     {
1651       tree misalign_in_elems = get_misalign_in_elems (&stmts, loop_vinfo);
1652       tree type = TREE_TYPE (misalign_in_elems);
1653       HOST_WIDE_INT elem_size
1654         = int_cst_value (TYPE_SIZE_UNIT (TREE_TYPE (vectype)));
1655       HOST_WIDE_INT align_in_elems = target_align / elem_size;
1656       tree align_in_elems_minus_1 = build_int_cst (type, align_in_elems - 1);
1657       tree align_in_elems_tree = build_int_cst (type, align_in_elems);
1658
1659       /* Create:  (niters_type) ((align_in_elems - misalign_in_elems)
1660                                  & (align_in_elems - 1)).  */
1661       bool negative = tree_int_cst_compare (DR_STEP (dr), size_zero_node) < 0;
1662       if (negative)
1663         iters = fold_build2 (MINUS_EXPR, type, misalign_in_elems,
1664                              align_in_elems_tree);
1665       else
1666         iters = fold_build2 (MINUS_EXPR, type, align_in_elems_tree,
1667                              misalign_in_elems);
1668       iters = fold_build2 (BIT_AND_EXPR, type, iters, align_in_elems_minus_1);
1669       iters = fold_convert (niters_type, iters);
1670       *bound = align_in_elems - 1;
1671     }
1672
1673   if (dump_enabled_p ())
1674     {
1675       dump_printf_loc (MSG_NOTE, vect_location,
1676                        "niters for prolog loop: ");
1677       dump_generic_expr (MSG_NOTE, TDF_SLIM, iters);
1678       dump_printf (MSG_NOTE, "\n");
1679     }
1680
1681   var = create_tmp_var (niters_type, "prolog_loop_niters");
1682   iters_name = force_gimple_operand (iters, &new_stmts, false, var);
1683
1684   if (new_stmts)
1685     gimple_seq_add_seq (&stmts, new_stmts);
1686   if (stmts)
1687     {
1688       gcc_assert (single_succ_p (bb));
1689       gimple_stmt_iterator gsi = gsi_last_bb (bb);
1690       if (gsi_end_p (gsi))
1691         gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
1692       else
1693         gsi_insert_seq_after (&gsi, stmts, GSI_SAME_STMT);
1694     }
1695   return iters_name;
1696 }
1697
1698
1699 /* Function vect_update_init_of_dr
1700
1701    If CODE is PLUS, the vector loop starts NITERS iterations after the
1702    scalar one, otherwise CODE is MINUS and the vector loop starts NITERS
1703    iterations before the scalar one (using masking to skip inactive
1704    elements).  This function updates the information recorded in DR to
1705    account for the difference.  Specifically, it updates the OFFSET
1706    field of DR.  */
1707
1708 static void
1709 vect_update_init_of_dr (struct data_reference *dr, tree niters, tree_code code)
1710 {
1711   tree offset = DR_OFFSET (dr);
1712
1713   niters = fold_build2 (MULT_EXPR, sizetype,
1714                         fold_convert (sizetype, niters),
1715                         fold_convert (sizetype, DR_STEP (dr)));
1716   offset = fold_build2 (code, sizetype,
1717                         fold_convert (sizetype, offset), niters);
1718   DR_OFFSET (dr) = offset;
1719 }
1720
1721
1722 /* Function vect_update_inits_of_drs
1723
1724    Apply vect_update_inits_of_dr to all accesses in LOOP_VINFO.
1725    CODE and NITERS are as for vect_update_inits_of_dr.  */
1726
1727 static void
1728 vect_update_inits_of_drs (loop_vec_info loop_vinfo, tree niters,
1729                           tree_code code)
1730 {
1731   unsigned int i;
1732   vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
1733   struct data_reference *dr;
1734
1735   DUMP_VECT_SCOPE ("vect_update_inits_of_dr");
1736
1737   /* Adjust niters to sizetype and insert stmts on loop preheader edge.  */
1738   if (!types_compatible_p (sizetype, TREE_TYPE (niters)))
1739     {
1740       gimple_seq seq;
1741       edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1742       tree var = create_tmp_var (sizetype, "prolog_loop_adjusted_niters");
1743
1744       niters = fold_convert (sizetype, niters);
1745       niters = force_gimple_operand (niters, &seq, false, var);
1746       if (seq)
1747         {
1748           basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
1749           gcc_assert (!new_bb);
1750         }
1751     }
1752
1753   FOR_EACH_VEC_ELT (datarefs, i, dr)
1754     {
1755       gimple *stmt = DR_STMT (dr);
1756       if (!STMT_VINFO_GATHER_SCATTER_P (vinfo_for_stmt (stmt)))
1757         vect_update_init_of_dr (dr, niters, code);
1758     }
1759 }
1760
1761 /* For the information recorded in LOOP_VINFO prepare the loop for peeling
1762    by masking.  This involves calculating the number of iterations to
1763    be peeled and then aligning all memory references appropriately.  */
1764
1765 void
1766 vect_prepare_for_masked_peels (loop_vec_info loop_vinfo)
1767 {
1768   tree misalign_in_elems;
1769   tree type = LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo);
1770
1771   gcc_assert (vect_use_loop_mask_for_alignment_p (loop_vinfo));
1772
1773   /* From the information recorded in LOOP_VINFO get the number of iterations
1774      that need to be skipped via masking.  */
1775   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1776     {
1777       poly_int64 misalign = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
1778                              - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
1779       misalign_in_elems = build_int_cst (type, misalign);
1780     }
1781   else
1782     {
1783       gimple_seq seq1 = NULL, seq2 = NULL;
1784       misalign_in_elems = get_misalign_in_elems (&seq1, loop_vinfo);
1785       misalign_in_elems = fold_convert (type, misalign_in_elems);
1786       misalign_in_elems = force_gimple_operand (misalign_in_elems,
1787                                                 &seq2, true, NULL_TREE);
1788       gimple_seq_add_seq (&seq1, seq2);
1789       if (seq1)
1790         {
1791           edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1792           basic_block new_bb = gsi_insert_seq_on_edge_immediate (pe, seq1);
1793           gcc_assert (!new_bb);
1794         }
1795     }
1796
1797   if (dump_enabled_p ())
1798     {
1799       dump_printf_loc (MSG_NOTE, vect_location,
1800                        "misalignment for fully-masked loop: ");
1801       dump_generic_expr (MSG_NOTE, TDF_SLIM, misalign_in_elems);
1802       dump_printf (MSG_NOTE, "\n");
1803     }
1804
1805   LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo) = misalign_in_elems;
1806
1807   vect_update_inits_of_drs (loop_vinfo, misalign_in_elems, MINUS_EXPR);
1808 }
1809
1810 /* This function builds ni_name = number of iterations.  Statements
1811    are emitted on the loop preheader edge.  If NEW_VAR_P is not NULL, set
1812    it to TRUE if new ssa_var is generated.  */
1813
1814 tree
1815 vect_build_loop_niters (loop_vec_info loop_vinfo, bool *new_var_p)
1816 {
1817   tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
1818   if (TREE_CODE (ni) == INTEGER_CST)
1819     return ni;
1820   else
1821     {
1822       tree ni_name, var;
1823       gimple_seq stmts = NULL;
1824       edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1825
1826       var = create_tmp_var (TREE_TYPE (ni), "niters");
1827       ni_name = force_gimple_operand (ni, &stmts, false, var);
1828       if (stmts)
1829         {
1830           gsi_insert_seq_on_edge_immediate (pe, stmts);
1831           if (new_var_p != NULL)
1832             *new_var_p = true;
1833         }
1834
1835       return ni_name;
1836     }
1837 }
1838
1839 /* Calculate the number of iterations above which vectorized loop will be
1840    preferred than scalar loop.  NITERS_PROLOG is the number of iterations
1841    of prolog loop.  If it's integer const, the integer number is also passed
1842    in INT_NITERS_PROLOG.  BOUND_PROLOG is the upper bound (inclusive) of the
1843    number of iterations of the prolog loop.  BOUND_EPILOG is the corresponding
1844    value for the epilog loop.  If CHECK_PROFITABILITY is true, TH is the
1845    threshold below which the scalar (rather than vectorized) loop will be
1846    executed.  This function stores the upper bound (inclusive) of the result
1847    in BOUND_SCALAR.  */
1848
1849 static tree
1850 vect_gen_scalar_loop_niters (tree niters_prolog, int int_niters_prolog,
1851                              int bound_prolog, poly_int64 bound_epilog, int th,
1852                              poly_uint64 *bound_scalar,
1853                              bool check_profitability)
1854 {
1855   tree type = TREE_TYPE (niters_prolog);
1856   tree niters = fold_build2 (PLUS_EXPR, type, niters_prolog,
1857                              build_int_cst (type, bound_epilog));
1858
1859   *bound_scalar = bound_prolog + bound_epilog;
1860   if (check_profitability)
1861     {
1862       /* TH indicates the minimum niters of vectorized loop, while we
1863          compute the maximum niters of scalar loop.  */
1864       th--;
1865       /* Peeling for constant times.  */
1866       if (int_niters_prolog >= 0)
1867         {
1868           *bound_scalar = upper_bound (int_niters_prolog + bound_epilog, th);
1869           return build_int_cst (type, *bound_scalar);
1870         }
1871       /* Peeling an unknown number of times.  Note that both BOUND_PROLOG
1872          and BOUND_EPILOG are inclusive upper bounds.  */
1873       if (known_ge (th, bound_prolog + bound_epilog))
1874         {
1875           *bound_scalar = th;
1876           return build_int_cst (type, th);
1877         }
1878       /* Need to do runtime comparison.  */
1879       else if (maybe_gt (th, bound_epilog))
1880         {
1881           *bound_scalar = upper_bound (*bound_scalar, th);
1882           return fold_build2 (MAX_EXPR, type,
1883                               build_int_cst (type, th), niters);
1884         }
1885     }
1886   return niters;
1887 }
1888
1889 /* NITERS is the number of times that the original scalar loop executes
1890    after peeling.  Work out the maximum number of iterations N that can
1891    be handled by the vectorized form of the loop and then either:
1892
1893    a) set *STEP_VECTOR_PTR to the vectorization factor and generate:
1894
1895         niters_vector = N
1896
1897    b) set *STEP_VECTOR_PTR to one and generate:
1898
1899         niters_vector = N / vf
1900
1901    In both cases, store niters_vector in *NITERS_VECTOR_PTR and add
1902    any new statements on the loop preheader edge.  NITERS_NO_OVERFLOW
1903    is true if NITERS doesn't overflow (i.e. if NITERS is always nonzero).  */
1904
1905 void
1906 vect_gen_vector_loop_niters (loop_vec_info loop_vinfo, tree niters,
1907                              tree *niters_vector_ptr, tree *step_vector_ptr,
1908                              bool niters_no_overflow)
1909 {
1910   tree ni_minus_gap, var;
1911   tree niters_vector, step_vector, type = TREE_TYPE (niters);
1912   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1913   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1914   tree log_vf = NULL_TREE;
1915
1916   /* If epilogue loop is required because of data accesses with gaps, we
1917      subtract one iteration from the total number of iterations here for
1918      correct calculation of RATIO.  */
1919   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1920     {
1921       ni_minus_gap = fold_build2 (MINUS_EXPR, type, niters,
1922                                   build_one_cst (type));
1923       if (!is_gimple_val (ni_minus_gap))
1924         {
1925           var = create_tmp_var (type, "ni_gap");
1926           gimple *stmts = NULL;
1927           ni_minus_gap = force_gimple_operand (ni_minus_gap, &stmts,
1928                                                true, var);
1929           gsi_insert_seq_on_edge_immediate (pe, stmts);
1930         }
1931     }
1932   else
1933     ni_minus_gap = niters;
1934
1935   unsigned HOST_WIDE_INT const_vf;
1936   if (vf.is_constant (&const_vf)
1937       && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1938     {
1939       /* Create: niters >> log2(vf) */
1940       /* If it's known that niters == number of latch executions + 1 doesn't
1941          overflow, we can generate niters >> log2(vf); otherwise we generate
1942          (niters - vf) >> log2(vf) + 1 by using the fact that we know ratio
1943          will be at least one.  */
1944       log_vf = build_int_cst (type, exact_log2 (const_vf));
1945       if (niters_no_overflow)
1946         niters_vector = fold_build2 (RSHIFT_EXPR, type, ni_minus_gap, log_vf);
1947       else
1948         niters_vector
1949           = fold_build2 (PLUS_EXPR, type,
1950                          fold_build2 (RSHIFT_EXPR, type,
1951                                       fold_build2 (MINUS_EXPR, type,
1952                                                    ni_minus_gap,
1953                                                    build_int_cst (type, vf)),
1954                                       log_vf),
1955                          build_int_cst (type, 1));
1956       step_vector = build_one_cst (type);
1957     }
1958   else
1959     {
1960       niters_vector = ni_minus_gap;
1961       step_vector = build_int_cst (type, vf);
1962     }
1963
1964   if (!is_gimple_val (niters_vector))
1965     {
1966       var = create_tmp_var (type, "bnd");
1967       gimple_seq stmts = NULL;
1968       niters_vector = force_gimple_operand (niters_vector, &stmts, true, var);
1969       gsi_insert_seq_on_edge_immediate (pe, stmts);
1970       /* Peeling algorithm guarantees that vector loop bound is at least ONE,
1971          we set range information to make niters analyzer's life easier.  */
1972       if (stmts != NULL && log_vf)
1973         set_range_info (niters_vector, VR_RANGE,
1974                         wi::to_wide (build_int_cst (type, 1)),
1975                         wi::to_wide (fold_build2 (RSHIFT_EXPR, type,
1976                                                   TYPE_MAX_VALUE (type),
1977                                                   log_vf)));
1978     }
1979   *niters_vector_ptr = niters_vector;
1980   *step_vector_ptr = step_vector;
1981
1982   return;
1983 }
1984
1985 /* Given NITERS_VECTOR which is the number of iterations for vectorized
1986    loop specified by LOOP_VINFO after vectorization, compute the number
1987    of iterations before vectorization (niters_vector * vf) and store it
1988    to NITERS_VECTOR_MULT_VF_PTR.  */
1989
1990 static void
1991 vect_gen_vector_loop_niters_mult_vf (loop_vec_info loop_vinfo,
1992                                      tree niters_vector,
1993                                      tree *niters_vector_mult_vf_ptr)
1994 {
1995   /* We should be using a step_vector of VF if VF is variable.  */
1996   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo).to_constant ();
1997   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1998   tree type = TREE_TYPE (niters_vector);
1999   tree log_vf = build_int_cst (type, exact_log2 (vf));
2000   basic_block exit_bb = single_exit (loop)->dest;
2001
2002   gcc_assert (niters_vector_mult_vf_ptr != NULL);
2003   tree niters_vector_mult_vf = fold_build2 (LSHIFT_EXPR, type,
2004                                             niters_vector, log_vf);
2005   if (!is_gimple_val (niters_vector_mult_vf))
2006     {
2007       tree var = create_tmp_var (type, "niters_vector_mult_vf");
2008       gimple_seq stmts = NULL;
2009       niters_vector_mult_vf = force_gimple_operand (niters_vector_mult_vf,
2010                                                     &stmts, true, var);
2011       gimple_stmt_iterator gsi = gsi_start_bb (exit_bb);
2012       gsi_insert_seq_before (&gsi, stmts, GSI_SAME_STMT);
2013     }
2014   *niters_vector_mult_vf_ptr = niters_vector_mult_vf;
2015 }
2016
2017 /* Function slpeel_tree_duplicate_loop_to_edge_cfg duplciates FIRST/SECOND
2018    from SECOND/FIRST and puts it at the original loop's preheader/exit
2019    edge, the two loops are arranged as below:
2020
2021        preheader_a:
2022      first_loop:
2023        header_a:
2024          i_1 = PHI<i_0, i_2>;
2025          ...
2026          i_2 = i_1 + 1;
2027          if (cond_a)
2028            goto latch_a;
2029          else
2030            goto between_bb;
2031        latch_a:
2032          goto header_a;
2033
2034        between_bb:
2035          ;; i_x = PHI<i_2>;   ;; LCSSA phi node to be created for FIRST,
2036
2037      second_loop:
2038        header_b:
2039          i_3 = PHI<i_0, i_4>; ;; Use of i_0 to be replaced with i_x,
2040                                  or with i_2 if no LCSSA phi is created
2041                                  under condition of CREATE_LCSSA_FOR_IV_PHIS.
2042          ...
2043          i_4 = i_3 + 1;
2044          if (cond_b)
2045            goto latch_b;
2046          else
2047            goto exit_bb;
2048        latch_b:
2049          goto header_b;
2050
2051        exit_bb:
2052
2053    This function creates loop closed SSA for the first loop; update the
2054    second loop's PHI nodes by replacing argument on incoming edge with the
2055    result of newly created lcssa PHI nodes.  IF CREATE_LCSSA_FOR_IV_PHIS
2056    is false, Loop closed ssa phis will only be created for non-iv phis for
2057    the first loop.
2058
2059    This function assumes exit bb of the first loop is preheader bb of the
2060    second loop, i.e, between_bb in the example code.  With PHIs updated,
2061    the second loop will execute rest iterations of the first.  */
2062
2063 static void
2064 slpeel_update_phi_nodes_for_loops (loop_vec_info loop_vinfo,
2065                                    struct loop *first, struct loop *second,
2066                                    bool create_lcssa_for_iv_phis)
2067 {
2068   gphi_iterator gsi_update, gsi_orig;
2069   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2070
2071   edge first_latch_e = EDGE_SUCC (first->latch, 0);
2072   edge second_preheader_e = loop_preheader_edge (second);
2073   basic_block between_bb = single_exit (first)->dest;
2074
2075   gcc_assert (between_bb == second_preheader_e->src);
2076   gcc_assert (single_pred_p (between_bb) && single_succ_p (between_bb));
2077   /* Either the first loop or the second is the loop to be vectorized.  */
2078   gcc_assert (loop == first || loop == second);
2079
2080   for (gsi_orig = gsi_start_phis (first->header),
2081        gsi_update = gsi_start_phis (second->header);
2082        !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2083        gsi_next (&gsi_orig), gsi_next (&gsi_update))
2084     {
2085       gphi *orig_phi = gsi_orig.phi ();
2086       gphi *update_phi = gsi_update.phi ();
2087
2088       tree arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, first_latch_e);
2089       /* Generate lcssa PHI node for the first loop.  */
2090       gphi *vect_phi = (loop == first) ? orig_phi : update_phi;
2091       stmt_vec_info vect_phi_info = loop_vinfo->lookup_stmt (vect_phi);
2092       if (create_lcssa_for_iv_phis || !iv_phi_p (vect_phi_info))
2093         {
2094           tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2095           gphi *lcssa_phi = create_phi_node (new_res, between_bb);
2096           add_phi_arg (lcssa_phi, arg, single_exit (first), UNKNOWN_LOCATION);
2097           arg = new_res;
2098         }
2099
2100       /* Update PHI node in the second loop by replacing arg on the loop's
2101          incoming edge.  */
2102       adjust_phi_and_debug_stmts (update_phi, second_preheader_e, arg);
2103     }
2104 }
2105
2106 /* Function slpeel_add_loop_guard adds guard skipping from the beginning
2107    of SKIP_LOOP to the beginning of UPDATE_LOOP.  GUARD_EDGE and MERGE_EDGE
2108    are two pred edges of the merge point before UPDATE_LOOP.  The two loops
2109    appear like below:
2110
2111        guard_bb:
2112          if (cond)
2113            goto merge_bb;
2114          else
2115            goto skip_loop;
2116
2117      skip_loop:
2118        header_a:
2119          i_1 = PHI<i_0, i_2>;
2120          ...
2121          i_2 = i_1 + 1;
2122          if (cond_a)
2123            goto latch_a;
2124          else
2125            goto exit_a;
2126        latch_a:
2127          goto header_a;
2128
2129        exit_a:
2130          i_5 = PHI<i_2>;
2131
2132        merge_bb:
2133          ;; PHI (i_x = PHI<i_0, i_5>) to be created at merge point.
2134
2135      update_loop:
2136        header_b:
2137          i_3 = PHI<i_5, i_4>;  ;; Use of i_5 to be replaced with i_x.
2138          ...
2139          i_4 = i_3 + 1;
2140          if (cond_b)
2141            goto latch_b;
2142          else
2143            goto exit_bb;
2144        latch_b:
2145          goto header_b;
2146
2147        exit_bb:
2148
2149    This function creates PHI nodes at merge_bb and replaces the use of i_5
2150    in the update_loop's PHI node with the result of new PHI result.  */
2151
2152 static void
2153 slpeel_update_phi_nodes_for_guard1 (struct loop *skip_loop,
2154                                     struct loop *update_loop,
2155                                     edge guard_edge, edge merge_edge)
2156 {
2157   source_location merge_loc, guard_loc;
2158   edge orig_e = loop_preheader_edge (skip_loop);
2159   edge update_e = loop_preheader_edge (update_loop);
2160   gphi_iterator gsi_orig, gsi_update;
2161
2162   for ((gsi_orig = gsi_start_phis (skip_loop->header),
2163         gsi_update = gsi_start_phis (update_loop->header));
2164        !gsi_end_p (gsi_orig) && !gsi_end_p (gsi_update);
2165        gsi_next (&gsi_orig), gsi_next (&gsi_update))
2166     {
2167       gphi *orig_phi = gsi_orig.phi ();
2168       gphi *update_phi = gsi_update.phi ();
2169
2170       /* Generate new phi node at merge bb of the guard.  */
2171       tree new_res = copy_ssa_name (PHI_RESULT (orig_phi));
2172       gphi *new_phi = create_phi_node (new_res, guard_edge->dest);
2173
2174       /* Merge bb has two incoming edges: GUARD_EDGE and MERGE_EDGE.  Set the
2175          args in NEW_PHI for these edges.  */
2176       tree merge_arg = PHI_ARG_DEF_FROM_EDGE (update_phi, update_e);
2177       tree guard_arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, orig_e);
2178       merge_loc = gimple_phi_arg_location_from_edge (update_phi, update_e);
2179       guard_loc = gimple_phi_arg_location_from_edge (orig_phi, orig_e);
2180       add_phi_arg (new_phi, merge_arg, merge_edge, merge_loc);
2181       add_phi_arg (new_phi, guard_arg, guard_edge, guard_loc);
2182
2183       /* Update phi in UPDATE_PHI.  */
2184       adjust_phi_and_debug_stmts (update_phi, update_e, new_res);
2185     }
2186 }
2187
2188 /* LCSSA_PHI is a lcssa phi of EPILOG loop which is copied from LOOP,
2189    this function searches for the corresponding lcssa phi node in exit
2190    bb of LOOP.  If it is found, return the phi result; otherwise return
2191    NULL.  */
2192
2193 static tree
2194 find_guard_arg (struct loop *loop, struct loop *epilog ATTRIBUTE_UNUSED,
2195                 gphi *lcssa_phi)
2196 {
2197   gphi_iterator gsi;
2198   edge e = single_exit (loop);
2199
2200   gcc_assert (single_pred_p (e->dest));
2201   for (gsi = gsi_start_phis (e->dest); !gsi_end_p (gsi); gsi_next (&gsi))
2202     {
2203       gphi *phi = gsi.phi ();
2204       if (operand_equal_p (PHI_ARG_DEF (phi, 0),
2205                            PHI_ARG_DEF (lcssa_phi, 0), 0))
2206         return PHI_RESULT (phi);
2207     }
2208   return NULL_TREE;
2209 }
2210
2211 /* LOOP and EPILOG are two consecutive loops in CFG and EPILOG is copied
2212    from LOOP.  Function slpeel_add_loop_guard adds guard skipping from a
2213    point between the two loops to the end of EPILOG.  Edges GUARD_EDGE
2214    and MERGE_EDGE are the two pred edges of merge_bb at the end of EPILOG.
2215    The CFG looks like:
2216
2217      loop:
2218        header_a:
2219          i_1 = PHI<i_0, i_2>;
2220          ...
2221          i_2 = i_1 + 1;
2222          if (cond_a)
2223            goto latch_a;
2224          else
2225            goto exit_a;
2226        latch_a:
2227          goto header_a;
2228
2229        exit_a:
2230
2231        guard_bb:
2232          if (cond)
2233            goto merge_bb;
2234          else
2235            goto epilog_loop;
2236
2237        ;; fall_through_bb
2238
2239      epilog_loop:
2240        header_b:
2241          i_3 = PHI<i_2, i_4>;
2242          ...
2243          i_4 = i_3 + 1;
2244          if (cond_b)
2245            goto latch_b;
2246          else
2247            goto merge_bb;
2248        latch_b:
2249          goto header_b;
2250
2251        merge_bb:
2252          ; PHI node (i_y = PHI<i_2, i_4>) to be created at merge point.
2253
2254        exit_bb:
2255          i_x = PHI<i_4>;  ;Use of i_4 to be replaced with i_y in merge_bb.
2256
2257    For each name used out side EPILOG (i.e - for each name that has a lcssa
2258    phi in exit_bb) we create a new PHI in merge_bb.  The new PHI has two
2259    args corresponding to GUARD_EDGE and MERGE_EDGE.  Arg for MERGE_EDGE is
2260    the arg of the original PHI in exit_bb, arg for GUARD_EDGE is defined
2261    by LOOP and is found in the exit bb of LOOP.  Arg of the original PHI
2262    in exit_bb will also be updated.  */
2263
2264 static void
2265 slpeel_update_phi_nodes_for_guard2 (struct loop *loop, struct loop *epilog,
2266                                     edge guard_edge, edge merge_edge)
2267 {
2268   gphi_iterator gsi;
2269   basic_block merge_bb = guard_edge->dest;
2270
2271   gcc_assert (single_succ_p (merge_bb));
2272   edge e = single_succ_edge (merge_bb);
2273   basic_block exit_bb = e->dest;
2274   gcc_assert (single_pred_p (exit_bb));
2275   gcc_assert (single_pred (exit_bb) == single_exit (epilog)->dest);
2276
2277   for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2278     {
2279       gphi *update_phi = gsi.phi ();
2280       tree old_arg = PHI_ARG_DEF (update_phi, 0);
2281       /* This loop-closed-phi actually doesn't represent a use out of the
2282          loop - the phi arg is a constant.  */
2283       if (TREE_CODE (old_arg) != SSA_NAME)
2284         continue;
2285
2286       tree merge_arg = get_current_def (old_arg);
2287       if (!merge_arg)
2288         merge_arg = old_arg;
2289
2290       tree guard_arg = find_guard_arg (loop, epilog, update_phi);
2291       /* If the var is live after loop but not a reduction, we simply
2292          use the old arg.  */
2293       if (!guard_arg)
2294         guard_arg = old_arg;
2295
2296       /* Create new phi node in MERGE_BB:  */
2297       tree new_res = copy_ssa_name (PHI_RESULT (update_phi));
2298       gphi *merge_phi = create_phi_node (new_res, merge_bb);
2299
2300       /* MERGE_BB has two incoming edges: GUARD_EDGE and MERGE_EDGE, Set
2301          the two PHI args in merge_phi for these edges.  */
2302       add_phi_arg (merge_phi, merge_arg, merge_edge, UNKNOWN_LOCATION);
2303       add_phi_arg (merge_phi, guard_arg, guard_edge, UNKNOWN_LOCATION);
2304
2305       /* Update the original phi in exit_bb.  */
2306       adjust_phi_and_debug_stmts (update_phi, e, new_res);
2307     }
2308 }
2309
2310 /* EPILOG loop is duplicated from the original loop for vectorizing,
2311    the arg of its loop closed ssa PHI needs to be updated.  */
2312
2313 static void
2314 slpeel_update_phi_nodes_for_lcssa (struct loop *epilog)
2315 {
2316   gphi_iterator gsi;
2317   basic_block exit_bb = single_exit (epilog)->dest;
2318
2319   gcc_assert (single_pred_p (exit_bb));
2320   edge e = EDGE_PRED (exit_bb, 0);
2321   for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
2322     rename_use_op (PHI_ARG_DEF_PTR_FROM_EDGE (gsi.phi (), e));
2323 }
2324
2325 /* Function vect_do_peeling.
2326
2327    Input:
2328    - LOOP_VINFO: Represent a loop to be vectorized, which looks like:
2329
2330        preheader:
2331      LOOP:
2332        header_bb:
2333          loop_body
2334          if (exit_loop_cond) goto exit_bb
2335          else                goto header_bb
2336        exit_bb:
2337
2338    - NITERS: The number of iterations of the loop.
2339    - NITERSM1: The number of iterations of the loop's latch.
2340    - NITERS_NO_OVERFLOW: No overflow in computing NITERS.
2341    - TH, CHECK_PROFITABILITY: Threshold of niters to vectorize loop if
2342                               CHECK_PROFITABILITY is true.
2343    Output:
2344    - *NITERS_VECTOR and *STEP_VECTOR describe how the main loop should
2345      iterate after vectorization; see vect_set_loop_condition for details.
2346    - *NITERS_VECTOR_MULT_VF_VAR is either null or an SSA name that
2347      should be set to the number of scalar iterations handled by the
2348      vector loop.  The SSA name is only used on exit from the loop.
2349
2350    This function peels prolog and epilog from the loop, adds guards skipping
2351    PROLOG and EPILOG for various conditions.  As a result, the changed CFG
2352    would look like:
2353
2354        guard_bb_1:
2355          if (prefer_scalar_loop) goto merge_bb_1
2356          else                    goto guard_bb_2
2357
2358        guard_bb_2:
2359          if (skip_prolog) goto merge_bb_2
2360          else             goto prolog_preheader
2361
2362        prolog_preheader:
2363      PROLOG:
2364        prolog_header_bb:
2365          prolog_body
2366          if (exit_prolog_cond) goto prolog_exit_bb
2367          else                  goto prolog_header_bb
2368        prolog_exit_bb:
2369
2370        merge_bb_2:
2371
2372        vector_preheader:
2373      VECTOR LOOP:
2374        vector_header_bb:
2375          vector_body
2376          if (exit_vector_cond) goto vector_exit_bb
2377          else                  goto vector_header_bb
2378        vector_exit_bb:
2379
2380        guard_bb_3:
2381          if (skip_epilog) goto merge_bb_3
2382          else             goto epilog_preheader
2383
2384        merge_bb_1:
2385
2386        epilog_preheader:
2387      EPILOG:
2388        epilog_header_bb:
2389          epilog_body
2390          if (exit_epilog_cond) goto merge_bb_3
2391          else                  goto epilog_header_bb
2392
2393        merge_bb_3:
2394
2395    Note this function peels prolog and epilog only if it's necessary,
2396    as well as guards.
2397    Returns created epilogue or NULL.
2398
2399    TODO: Guard for prefer_scalar_loop should be emitted along with
2400    versioning conditions if loop versioning is needed.  */
2401
2402
2403 struct loop *
2404 vect_do_peeling (loop_vec_info loop_vinfo, tree niters, tree nitersm1,
2405                  tree *niters_vector, tree *step_vector,
2406                  tree *niters_vector_mult_vf_var, int th,
2407                  bool check_profitability, bool niters_no_overflow)
2408 {
2409   edge e, guard_e;
2410   tree type = TREE_TYPE (niters), guard_cond;
2411   basic_block guard_bb, guard_to;
2412   profile_probability prob_prolog, prob_vector, prob_epilog;
2413   int estimated_vf;
2414   int prolog_peeling = 0;
2415   if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2416     prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2417
2418   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2419   poly_uint64 bound_epilog = 0;
2420   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2421       && LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2422     bound_epilog += vf - 1;
2423   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2424     bound_epilog += 1;
2425   bool epilog_peeling = maybe_ne (bound_epilog, 0U);
2426   poly_uint64 bound_scalar = bound_epilog;
2427
2428   if (!prolog_peeling && !epilog_peeling)
2429     return NULL;
2430
2431   prob_vector = profile_probability::guessed_always ().apply_scale (9, 10);
2432   estimated_vf = vect_vf_for_cost (loop_vinfo);
2433   if (estimated_vf == 2)
2434     estimated_vf = 3;
2435   prob_prolog = prob_epilog = profile_probability::guessed_always ()
2436                         .apply_scale (estimated_vf - 1, estimated_vf);
2437
2438   struct loop *prolog, *epilog = NULL, *loop = LOOP_VINFO_LOOP (loop_vinfo);
2439   struct loop *first_loop = loop;
2440   bool irred_flag = loop_preheader_edge (loop)->flags & EDGE_IRREDUCIBLE_LOOP;
2441   create_lcssa_for_virtual_phi (loop);
2442   update_ssa (TODO_update_ssa_only_virtuals);
2443
2444   if (MAY_HAVE_DEBUG_BIND_STMTS)
2445     {
2446       gcc_assert (!adjust_vec.exists ());
2447       adjust_vec.create (32);
2448     }
2449   initialize_original_copy_tables ();
2450
2451   /* Record the anchor bb at which the guard should be placed if the scalar
2452      loop might be preferred.  */
2453   basic_block anchor = loop_preheader_edge (loop)->src;
2454
2455   /* Generate the number of iterations for the prolog loop.  We do this here
2456      so that we can also get the upper bound on the number of iterations.  */
2457   tree niters_prolog;
2458   int bound_prolog = 0;
2459   if (prolog_peeling)
2460     niters_prolog = vect_gen_prolog_loop_niters (loop_vinfo, anchor,
2461                                                  &bound_prolog);
2462   else
2463     niters_prolog = build_int_cst (type, 0);
2464
2465   /* Prolog loop may be skipped.  */
2466   bool skip_prolog = (prolog_peeling != 0);
2467   /* Skip to epilog if scalar loop may be preferred.  It's only needed
2468      when we peel for epilog loop and when it hasn't been checked with
2469      loop versioning.  */
2470   bool skip_vector = (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2471                       ? maybe_lt (LOOP_VINFO_INT_NITERS (loop_vinfo),
2472                                   bound_prolog + bound_epilog)
2473                       : !LOOP_REQUIRES_VERSIONING (loop_vinfo));
2474   /* Epilog loop must be executed if the number of iterations for epilog
2475      loop is known at compile time, otherwise we need to add a check at
2476      the end of vector loop and skip to the end of epilog loop.  */
2477   bool skip_epilog = (prolog_peeling < 0
2478                       || !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2479                       || !vf.is_constant ());
2480   /* PEELING_FOR_GAPS is special because epilog loop must be executed.  */
2481   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2482     skip_epilog = false;
2483
2484   if (skip_vector)
2485     {
2486       split_edge (loop_preheader_edge (loop));
2487
2488       /* Due to the order in which we peel prolog and epilog, we first
2489          propagate probability to the whole loop.  The purpose is to
2490          avoid adjusting probabilities of both prolog and vector loops
2491          separately.  Note in this case, the probability of epilog loop
2492          needs to be scaled back later.  */
2493       basic_block bb_before_loop = loop_preheader_edge (loop)->src;
2494       if (prob_vector.initialized_p ())
2495         {
2496           scale_bbs_frequencies (&bb_before_loop, 1, prob_vector);
2497           scale_loop_profile (loop, prob_vector, 0);
2498         }
2499     }
2500
2501   dump_user_location_t loop_loc = find_loop_location (loop);
2502   struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2503   if (prolog_peeling)
2504     {
2505       e = loop_preheader_edge (loop);
2506       if (!slpeel_can_duplicate_loop_p (loop, e))
2507         {
2508           dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2509                            "loop can't be duplicated to preheader edge.\n");
2510           gcc_unreachable ();
2511         }
2512       /* Peel prolog and put it on preheader edge of loop.  */
2513       prolog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2514       if (!prolog)
2515         {
2516           dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2517                            "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2518           gcc_unreachable ();
2519         }
2520       slpeel_update_phi_nodes_for_loops (loop_vinfo, prolog, loop, true);
2521       first_loop = prolog;
2522       reset_original_copy_tables ();
2523
2524       /* Update the number of iterations for prolog loop.  */
2525       tree step_prolog = build_one_cst (TREE_TYPE (niters_prolog));
2526       vect_set_loop_condition (prolog, NULL, niters_prolog,
2527                                step_prolog, NULL_TREE, false);
2528
2529       /* Skip the prolog loop.  */
2530       if (skip_prolog)
2531         {
2532           guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2533                                     niters_prolog, build_int_cst (type, 0));
2534           guard_bb = loop_preheader_edge (prolog)->src;
2535           basic_block bb_after_prolog = loop_preheader_edge (loop)->src;
2536           guard_to = split_edge (loop_preheader_edge (loop));
2537           guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2538                                            guard_to, guard_bb,
2539                                            prob_prolog.invert (),
2540                                            irred_flag);
2541           e = EDGE_PRED (guard_to, 0);
2542           e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2543           slpeel_update_phi_nodes_for_guard1 (prolog, loop, guard_e, e);
2544
2545           scale_bbs_frequencies (&bb_after_prolog, 1, prob_prolog);
2546           scale_loop_profile (prolog, prob_prolog, bound_prolog);
2547         }
2548       /* Update init address of DRs.  */
2549       vect_update_inits_of_drs (loop_vinfo, niters_prolog, PLUS_EXPR);
2550       /* Update niters for vector loop.  */
2551       LOOP_VINFO_NITERS (loop_vinfo)
2552         = fold_build2 (MINUS_EXPR, type, niters, niters_prolog);
2553       LOOP_VINFO_NITERSM1 (loop_vinfo)
2554         = fold_build2 (MINUS_EXPR, type,
2555                        LOOP_VINFO_NITERSM1 (loop_vinfo), niters_prolog);
2556       bool new_var_p = false;
2557       niters = vect_build_loop_niters (loop_vinfo, &new_var_p);
2558       /* It's guaranteed that vector loop bound before vectorization is at
2559          least VF, so set range information for newly generated var.  */
2560       if (new_var_p)
2561         set_range_info (niters, VR_RANGE,
2562                         wi::to_wide (build_int_cst (type, vf)),
2563                         wi::to_wide (TYPE_MAX_VALUE (type)));
2564
2565       /* Prolog iterates at most bound_prolog times, latch iterates at
2566          most bound_prolog - 1 times.  */
2567       record_niter_bound (prolog, bound_prolog - 1, false, true);
2568       delete_update_ssa ();
2569       adjust_vec_debug_stmts ();
2570       scev_reset ();
2571     }
2572
2573   if (epilog_peeling)
2574     {
2575       e = single_exit (loop);
2576       if (!slpeel_can_duplicate_loop_p (loop, e))
2577         {
2578           dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2579                            "loop can't be duplicated to exit edge.\n");
2580           gcc_unreachable ();
2581         }
2582       /* Peel epilog and put it on exit edge of loop.  */
2583       epilog = slpeel_tree_duplicate_loop_to_edge_cfg (loop, scalar_loop, e);
2584       if (!epilog)
2585         {
2586           dump_printf_loc (MSG_MISSED_OPTIMIZATION, loop_loc,
2587                            "slpeel_tree_duplicate_loop_to_edge_cfg failed.\n");
2588           gcc_unreachable ();
2589         }
2590       slpeel_update_phi_nodes_for_loops (loop_vinfo, loop, epilog, false);
2591
2592       /* Scalar version loop may be preferred.  In this case, add guard
2593          and skip to epilog.  Note this only happens when the number of
2594          iterations of loop is unknown at compile time, otherwise this
2595          won't be vectorized.  */
2596       if (skip_vector)
2597         {
2598           /* Additional epilogue iteration is peeled if gap exists.  */
2599           tree t = vect_gen_scalar_loop_niters (niters_prolog, prolog_peeling,
2600                                                 bound_prolog, bound_epilog,
2601                                                 th, &bound_scalar,
2602                                                 check_profitability);
2603           /* Build guard against NITERSM1 since NITERS may overflow.  */
2604           guard_cond = fold_build2 (LT_EXPR, boolean_type_node, nitersm1, t);
2605           guard_bb = anchor;
2606           guard_to = split_edge (loop_preheader_edge (epilog));
2607           guard_e = slpeel_add_loop_guard (guard_bb, guard_cond,
2608                                            guard_to, guard_bb,
2609                                            prob_vector.invert (),
2610                                            irred_flag);
2611           e = EDGE_PRED (guard_to, 0);
2612           e = (e != guard_e ? e : EDGE_PRED (guard_to, 1));
2613           slpeel_update_phi_nodes_for_guard1 (first_loop, epilog, guard_e, e);
2614
2615           /* Simply propagate profile info from guard_bb to guard_to which is
2616              a merge point of control flow.  */
2617           guard_to->count = guard_bb->count;
2618
2619           /* Scale probability of epilog loop back.
2620              FIXME: We should avoid scaling down and back up.  Profile may
2621              get lost if we scale down to 0.  */
2622           basic_block *bbs = get_loop_body (epilog);
2623           for (unsigned int i = 0; i < epilog->num_nodes; i++)
2624             bbs[i]->count = bbs[i]->count.apply_scale
2625                                  (bbs[i]->count,
2626                                   bbs[i]->count.apply_probability
2627                                     (prob_vector));
2628           free (bbs);
2629         }
2630
2631       basic_block bb_before_epilog = loop_preheader_edge (epilog)->src;
2632       tree niters_vector_mult_vf;
2633       /* If loop is peeled for non-zero constant times, now niters refers to
2634          orig_niters - prolog_peeling, it won't overflow even the orig_niters
2635          overflows.  */
2636       niters_no_overflow |= (prolog_peeling > 0);
2637       vect_gen_vector_loop_niters (loop_vinfo, niters,
2638                                    niters_vector, step_vector,
2639                                    niters_no_overflow);
2640       if (!integer_onep (*step_vector))
2641         {
2642           /* On exit from the loop we will have an easy way of calcalating
2643              NITERS_VECTOR / STEP * STEP.  Install a dummy definition
2644              until then.  */
2645           niters_vector_mult_vf = make_ssa_name (TREE_TYPE (*niters_vector));
2646           SSA_NAME_DEF_STMT (niters_vector_mult_vf) = gimple_build_nop ();
2647           *niters_vector_mult_vf_var = niters_vector_mult_vf;
2648         }
2649       else
2650         vect_gen_vector_loop_niters_mult_vf (loop_vinfo, *niters_vector,
2651                                              &niters_vector_mult_vf);
2652       /* Update IVs of original loop as if they were advanced by
2653          niters_vector_mult_vf steps.  */
2654       gcc_checking_assert (vect_can_advance_ivs_p (loop_vinfo));
2655       edge update_e = skip_vector ? e : loop_preheader_edge (epilog);
2656       vect_update_ivs_after_vectorizer (loop_vinfo, niters_vector_mult_vf,
2657                                         update_e);
2658
2659       if (skip_epilog)
2660         {
2661           guard_cond = fold_build2 (EQ_EXPR, boolean_type_node,
2662                                     niters, niters_vector_mult_vf);
2663           guard_bb = single_exit (loop)->dest;
2664           guard_to = split_edge (single_exit (epilog));
2665           guard_e = slpeel_add_loop_guard (guard_bb, guard_cond, guard_to,
2666                                            skip_vector ? anchor : guard_bb,
2667                                            prob_epilog.invert (),
2668                                            irred_flag);
2669           slpeel_update_phi_nodes_for_guard2 (loop, epilog, guard_e,
2670                                               single_exit (epilog));
2671           /* Only need to handle basic block before epilog loop if it's not
2672              the guard_bb, which is the case when skip_vector is true.  */
2673           if (guard_bb != bb_before_epilog)
2674             {
2675               prob_epilog = prob_vector * prob_epilog + prob_vector.invert ();
2676
2677               scale_bbs_frequencies (&bb_before_epilog, 1, prob_epilog);
2678             }
2679           scale_loop_profile (epilog, prob_epilog, 0);
2680         }
2681       else
2682         slpeel_update_phi_nodes_for_lcssa (epilog);
2683
2684       unsigned HOST_WIDE_INT bound;
2685       if (bound_scalar.is_constant (&bound))
2686         {
2687           gcc_assert (bound != 0);
2688           /* -1 to convert loop iterations to latch iterations.  */
2689           record_niter_bound (epilog, bound - 1, false, true);
2690         }
2691
2692       delete_update_ssa ();
2693       adjust_vec_debug_stmts ();
2694       scev_reset ();
2695     }
2696   adjust_vec.release ();
2697   free_original_copy_tables ();
2698
2699   return epilog;
2700 }
2701
2702 /* Function vect_create_cond_for_niters_checks.
2703
2704    Create a conditional expression that represents the run-time checks for
2705    loop's niter.  The loop is guaranteed to terminate if the run-time
2706    checks hold.
2707
2708    Input:
2709    COND_EXPR  - input conditional expression.  New conditions will be chained
2710                 with logical AND operation.  If it is NULL, then the function
2711                 is used to return the number of alias checks.
2712    LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
2713                 to be checked.
2714
2715    Output:
2716    COND_EXPR - conditional expression.
2717
2718    The returned COND_EXPR is the conditional expression to be used in the
2719    if statement that controls which version of the loop gets executed at
2720    runtime.  */
2721
2722 static void
2723 vect_create_cond_for_niters_checks (loop_vec_info loop_vinfo, tree *cond_expr)
2724 {
2725   tree part_cond_expr = LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo);
2726
2727   if (*cond_expr)
2728     *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2729                               *cond_expr, part_cond_expr);
2730   else
2731     *cond_expr = part_cond_expr;
2732 }
2733
2734 /* Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
2735    and PART_COND_EXPR are true.  Treat a null *COND_EXPR as "true".  */
2736
2737 static void
2738 chain_cond_expr (tree *cond_expr, tree part_cond_expr)
2739 {
2740   if (*cond_expr)
2741     *cond_expr = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
2742                               *cond_expr, part_cond_expr);
2743   else
2744     *cond_expr = part_cond_expr;
2745 }
2746
2747 /* Function vect_create_cond_for_align_checks.
2748
2749    Create a conditional expression that represents the alignment checks for
2750    all of data references (array element references) whose alignment must be
2751    checked at runtime.
2752
2753    Input:
2754    COND_EXPR  - input conditional expression.  New conditions will be chained
2755                 with logical AND operation.
2756    LOOP_VINFO - two fields of the loop information are used.
2757                 LOOP_VINFO_PTR_MASK is the mask used to check the alignment.
2758                 LOOP_VINFO_MAY_MISALIGN_STMTS contains the refs to be checked.
2759
2760    Output:
2761    COND_EXPR_STMT_LIST - statements needed to construct the conditional
2762                          expression.
2763    The returned value is the conditional expression to be used in the if
2764    statement that controls which version of the loop gets executed at runtime.
2765
2766    The algorithm makes two assumptions:
2767      1) The number of bytes "n" in a vector is a power of 2.
2768      2) An address "a" is aligned if a%n is zero and that this
2769         test can be done as a&(n-1) == 0.  For example, for 16
2770         byte vectors the test is a&0xf == 0.  */
2771
2772 static void
2773 vect_create_cond_for_align_checks (loop_vec_info loop_vinfo,
2774                                    tree *cond_expr,
2775                                    gimple_seq *cond_expr_stmt_list)
2776 {
2777   vec<stmt_vec_info> may_misalign_stmts
2778     = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo);
2779   stmt_vec_info stmt_info;
2780   int mask = LOOP_VINFO_PTR_MASK (loop_vinfo);
2781   tree mask_cst;
2782   unsigned int i;
2783   tree int_ptrsize_type;
2784   char tmp_name[20];
2785   tree or_tmp_name = NULL_TREE;
2786   tree and_tmp_name;
2787   gimple *and_stmt;
2788   tree ptrsize_zero;
2789   tree part_cond_expr;
2790
2791   /* Check that mask is one less than a power of 2, i.e., mask is
2792      all zeros followed by all ones.  */
2793   gcc_assert ((mask != 0) && ((mask & (mask+1)) == 0));
2794
2795   int_ptrsize_type = signed_type_for (ptr_type_node);
2796
2797   /* Create expression (mask & (dr_1 || ... || dr_n)) where dr_i is the address
2798      of the first vector of the i'th data reference. */
2799
2800   FOR_EACH_VEC_ELT (may_misalign_stmts, i, stmt_info)
2801     {
2802       gimple_seq new_stmt_list = NULL;
2803       tree addr_base;
2804       tree addr_tmp_name;
2805       tree new_or_tmp_name;
2806       gimple *addr_stmt, *or_stmt;
2807       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
2808       bool negative = tree_int_cst_compare
2809         (DR_STEP (STMT_VINFO_DATA_REF (stmt_info)), size_zero_node) < 0;
2810       tree offset = negative
2811         ? size_int (-TYPE_VECTOR_SUBPARTS (vectype) + 1) : size_zero_node;
2812
2813       /* create: addr_tmp = (int)(address_of_first_vector) */
2814       addr_base =
2815         vect_create_addr_base_for_vector_ref (stmt_info, &new_stmt_list,
2816                                               offset);
2817       if (new_stmt_list != NULL)
2818         gimple_seq_add_seq (cond_expr_stmt_list, new_stmt_list);
2819
2820       sprintf (tmp_name, "addr2int%d", i);
2821       addr_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
2822       addr_stmt = gimple_build_assign (addr_tmp_name, NOP_EXPR, addr_base);
2823       gimple_seq_add_stmt (cond_expr_stmt_list, addr_stmt);
2824
2825       /* The addresses are OR together.  */
2826
2827       if (or_tmp_name != NULL_TREE)
2828         {
2829           /* create: or_tmp = or_tmp | addr_tmp */
2830           sprintf (tmp_name, "orptrs%d", i);
2831           new_or_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, tmp_name);
2832           or_stmt = gimple_build_assign (new_or_tmp_name, BIT_IOR_EXPR,
2833                                          or_tmp_name, addr_tmp_name);
2834           gimple_seq_add_stmt (cond_expr_stmt_list, or_stmt);
2835           or_tmp_name = new_or_tmp_name;
2836         }
2837       else
2838         or_tmp_name = addr_tmp_name;
2839
2840     } /* end for i */
2841
2842   mask_cst = build_int_cst (int_ptrsize_type, mask);
2843
2844   /* create: and_tmp = or_tmp & mask  */
2845   and_tmp_name = make_temp_ssa_name (int_ptrsize_type, NULL, "andmask");
2846
2847   and_stmt = gimple_build_assign (and_tmp_name, BIT_AND_EXPR,
2848                                   or_tmp_name, mask_cst);
2849   gimple_seq_add_stmt (cond_expr_stmt_list, and_stmt);
2850
2851   /* Make and_tmp the left operand of the conditional test against zero.
2852      if and_tmp has a nonzero bit then some address is unaligned.  */
2853   ptrsize_zero = build_int_cst (int_ptrsize_type, 0);
2854   part_cond_expr = fold_build2 (EQ_EXPR, boolean_type_node,
2855                                 and_tmp_name, ptrsize_zero);
2856   chain_cond_expr (cond_expr, part_cond_expr);
2857 }
2858
2859 /* If LOOP_VINFO_CHECK_UNEQUAL_ADDRS contains <A1, B1>, ..., <An, Bn>,
2860    create a tree representation of: (&A1 != &B1) && ... && (&An != &Bn).
2861    Set *COND_EXPR to a tree that is true when both the original *COND_EXPR
2862    and this new condition are true.  Treat a null *COND_EXPR as "true".  */
2863
2864 static void
2865 vect_create_cond_for_unequal_addrs (loop_vec_info loop_vinfo, tree *cond_expr)
2866 {
2867   vec<vec_object_pair> pairs = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo);
2868   unsigned int i;
2869   vec_object_pair *pair;
2870   FOR_EACH_VEC_ELT (pairs, i, pair)
2871     {
2872       tree addr1 = build_fold_addr_expr (pair->first);
2873       tree addr2 = build_fold_addr_expr (pair->second);
2874       tree part_cond_expr = fold_build2 (NE_EXPR, boolean_type_node,
2875                                          addr1, addr2);
2876       chain_cond_expr (cond_expr, part_cond_expr);
2877     }
2878 }
2879
2880 /* Create an expression that is true when all lower-bound conditions for
2881    the vectorized loop are met.  Chain this condition with *COND_EXPR.  */
2882
2883 static void
2884 vect_create_cond_for_lower_bounds (loop_vec_info loop_vinfo, tree *cond_expr)
2885 {
2886   vec<vec_lower_bound> lower_bounds = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo);
2887   for (unsigned int i = 0; i < lower_bounds.length (); ++i)
2888     {
2889       tree expr = lower_bounds[i].expr;
2890       tree type = unsigned_type_for (TREE_TYPE (expr));
2891       expr = fold_convert (type, expr);
2892       poly_uint64 bound = lower_bounds[i].min_value;
2893       if (!lower_bounds[i].unsigned_p)
2894         {
2895           expr = fold_build2 (PLUS_EXPR, type, expr,
2896                               build_int_cstu (type, bound - 1));
2897           bound += bound - 1;
2898         }
2899       tree part_cond_expr = fold_build2 (GE_EXPR, boolean_type_node, expr,
2900                                          build_int_cstu (type, bound));
2901       chain_cond_expr (cond_expr, part_cond_expr);
2902     }
2903 }
2904
2905 /* Function vect_create_cond_for_alias_checks.
2906
2907    Create a conditional expression that represents the run-time checks for
2908    overlapping of address ranges represented by a list of data references
2909    relations passed as input.
2910
2911    Input:
2912    COND_EXPR  - input conditional expression.  New conditions will be chained
2913                 with logical AND operation.  If it is NULL, then the function
2914                 is used to return the number of alias checks.
2915    LOOP_VINFO - field LOOP_VINFO_MAY_ALIAS_STMTS contains the list of ddrs
2916                 to be checked.
2917
2918    Output:
2919    COND_EXPR - conditional expression.
2920
2921    The returned COND_EXPR is the conditional expression to be used in the if
2922    statement that controls which version of the loop gets executed at runtime.
2923 */
2924
2925 void
2926 vect_create_cond_for_alias_checks (loop_vec_info loop_vinfo, tree * cond_expr)
2927 {
2928   vec<dr_with_seg_len_pair_t> comp_alias_ddrs =
2929     LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo);
2930
2931   if (comp_alias_ddrs.is_empty ())
2932     return;
2933
2934   create_runtime_alias_checks (LOOP_VINFO_LOOP (loop_vinfo),
2935                                &comp_alias_ddrs, cond_expr);
2936   if (dump_enabled_p ())
2937     dump_printf_loc (MSG_NOTE, vect_location,
2938                      "created %u versioning for alias checks.\n",
2939                      comp_alias_ddrs.length ());
2940 }
2941
2942
2943 /* Function vect_loop_versioning.
2944
2945    If the loop has data references that may or may not be aligned or/and
2946    has data reference relations whose independence was not proven then
2947    two versions of the loop need to be generated, one which is vectorized
2948    and one which isn't.  A test is then generated to control which of the
2949    loops is executed.  The test checks for the alignment of all of the
2950    data references that may or may not be aligned.  An additional
2951    sequence of runtime tests is generated for each pairs of DDRs whose
2952    independence was not proven.  The vectorized version of loop is
2953    executed only if both alias and alignment tests are passed.
2954
2955    The test generated to check which version of loop is executed
2956    is modified to also check for profitability as indicated by the
2957    cost model threshold TH.
2958
2959    The versioning precondition(s) are placed in *COND_EXPR and
2960    *COND_EXPR_STMT_LIST.  */
2961
2962 void
2963 vect_loop_versioning (loop_vec_info loop_vinfo,
2964                       unsigned int th, bool check_profitability,
2965                       poly_uint64 versioning_threshold)
2966 {
2967   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *nloop;
2968   struct loop *scalar_loop = LOOP_VINFO_SCALAR_LOOP (loop_vinfo);
2969   basic_block condition_bb;
2970   gphi_iterator gsi;
2971   gimple_stmt_iterator cond_exp_gsi;
2972   basic_block merge_bb;
2973   basic_block new_exit_bb;
2974   edge new_exit_e, e;
2975   gphi *orig_phi, *new_phi;
2976   tree cond_expr = NULL_TREE;
2977   gimple_seq cond_expr_stmt_list = NULL;
2978   tree arg;
2979   profile_probability prob = profile_probability::likely ();
2980   gimple_seq gimplify_stmt_list = NULL;
2981   tree scalar_loop_iters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2982   bool version_align = LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo);
2983   bool version_alias = LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo);
2984   bool version_niter = LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo);
2985
2986   if (check_profitability)
2987     cond_expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
2988                              build_int_cst (TREE_TYPE (scalar_loop_iters),
2989                                             th - 1));
2990   if (maybe_ne (versioning_threshold, 0U))
2991     {
2992       tree expr = fold_build2 (GE_EXPR, boolean_type_node, scalar_loop_iters,
2993                                build_int_cst (TREE_TYPE (scalar_loop_iters),
2994                                               versioning_threshold - 1));
2995       if (cond_expr)
2996         cond_expr = fold_build2 (BIT_AND_EXPR, boolean_type_node,
2997                                  expr, cond_expr);
2998       else
2999         cond_expr = expr;
3000     }
3001
3002   if (version_niter)
3003     vect_create_cond_for_niters_checks (loop_vinfo, &cond_expr);
3004
3005   if (cond_expr)
3006     cond_expr = force_gimple_operand_1 (cond_expr, &cond_expr_stmt_list,
3007                                         is_gimple_condexpr, NULL_TREE);
3008
3009   if (version_align)
3010     vect_create_cond_for_align_checks (loop_vinfo, &cond_expr,
3011                                        &cond_expr_stmt_list);
3012
3013   if (version_alias)
3014     {
3015       vect_create_cond_for_unequal_addrs (loop_vinfo, &cond_expr);
3016       vect_create_cond_for_lower_bounds (loop_vinfo, &cond_expr);
3017       vect_create_cond_for_alias_checks (loop_vinfo, &cond_expr);
3018     }
3019
3020   cond_expr = force_gimple_operand_1 (unshare_expr (cond_expr),
3021                                       &gimplify_stmt_list,
3022                                       is_gimple_condexpr, NULL_TREE);
3023   gimple_seq_add_seq (&cond_expr_stmt_list, gimplify_stmt_list);
3024
3025   initialize_original_copy_tables ();
3026   if (scalar_loop)
3027     {
3028       edge scalar_e;
3029       basic_block preheader, scalar_preheader;
3030
3031       /* We don't want to scale SCALAR_LOOP's frequencies, we need to
3032          scale LOOP's frequencies instead.  */
3033       nloop = loop_version (scalar_loop, cond_expr, &condition_bb,
3034                             prob, prob.invert (), prob, prob.invert (), true);
3035       scale_loop_frequencies (loop, prob);
3036       /* CONDITION_BB was created above SCALAR_LOOP's preheader,
3037          while we need to move it above LOOP's preheader.  */
3038       e = loop_preheader_edge (loop);
3039       scalar_e = loop_preheader_edge (scalar_loop);
3040       /* The vector loop preheader might not be empty, since new
3041          invariants could have been created while analyzing the loop.  */
3042       gcc_assert (single_pred_p (e->src));
3043       gcc_assert (empty_block_p (scalar_e->src)
3044                   && single_pred_p (scalar_e->src));
3045       gcc_assert (single_pred_p (condition_bb));
3046       preheader = e->src;
3047       scalar_preheader = scalar_e->src;
3048       scalar_e = find_edge (condition_bb, scalar_preheader);
3049       e = single_pred_edge (preheader);
3050       redirect_edge_and_branch_force (single_pred_edge (condition_bb),
3051                                       scalar_preheader);
3052       redirect_edge_and_branch_force (scalar_e, preheader);
3053       redirect_edge_and_branch_force (e, condition_bb);
3054       set_immediate_dominator (CDI_DOMINATORS, condition_bb,
3055                                single_pred (condition_bb));
3056       set_immediate_dominator (CDI_DOMINATORS, scalar_preheader,
3057                                single_pred (scalar_preheader));
3058       set_immediate_dominator (CDI_DOMINATORS, preheader,
3059                                condition_bb);
3060     }
3061   else
3062     nloop = loop_version (loop, cond_expr, &condition_bb,
3063                           prob, prob.invert (), prob, prob.invert (), true);
3064
3065   if (version_niter)
3066     {
3067       /* The versioned loop could be infinite, we need to clear existing
3068          niter information which is copied from the original loop.  */
3069       gcc_assert (loop_constraint_set_p (loop, LOOP_C_FINITE));
3070       vect_free_loop_info_assumptions (nloop);
3071       /* And set constraint LOOP_C_INFINITE for niter analyzer.  */
3072       loop_constraint_set (loop, LOOP_C_INFINITE);
3073     }
3074
3075   if (LOCATION_LOCUS (vect_location.get_location_t ()) != UNKNOWN_LOCATION
3076       && dump_enabled_p ())
3077     {
3078       if (version_alias)
3079         dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3080                          "loop versioned for vectorization because of "
3081                          "possible aliasing\n");
3082       if (version_align)
3083         dump_printf_loc (MSG_OPTIMIZED_LOCATIONS, vect_location,
3084                          "loop versioned for vectorization to enhance "
3085                          "alignment\n");
3086
3087     }
3088   free_original_copy_tables ();
3089
3090   /* Loop versioning violates an assumption we try to maintain during
3091      vectorization - that the loop exit block has a single predecessor.
3092      After versioning, the exit block of both loop versions is the same
3093      basic block (i.e. it has two predecessors). Just in order to simplify
3094      following transformations in the vectorizer, we fix this situation
3095      here by adding a new (empty) block on the exit-edge of the loop,
3096      with the proper loop-exit phis to maintain loop-closed-form.
3097      If loop versioning wasn't done from loop, but scalar_loop instead,
3098      merge_bb will already have just a single predecessor.  */
3099
3100   merge_bb = single_exit (loop)->dest;
3101   if (scalar_loop == NULL || EDGE_COUNT (merge_bb->preds) >= 2)
3102     {
3103       gcc_assert (EDGE_COUNT (merge_bb->preds) >= 2);
3104       new_exit_bb = split_edge (single_exit (loop));
3105       new_exit_e = single_exit (loop);
3106       e = EDGE_SUCC (new_exit_bb, 0);
3107
3108       for (gsi = gsi_start_phis (merge_bb); !gsi_end_p (gsi); gsi_next (&gsi))
3109         {
3110           tree new_res;
3111           orig_phi = gsi.phi ();
3112           new_res = copy_ssa_name (PHI_RESULT (orig_phi));
3113           new_phi = create_phi_node (new_res, new_exit_bb);
3114           arg = PHI_ARG_DEF_FROM_EDGE (orig_phi, e);
3115           add_phi_arg (new_phi, arg, new_exit_e,
3116                        gimple_phi_arg_location_from_edge (orig_phi, e));
3117           adjust_phi_and_debug_stmts (orig_phi, e, PHI_RESULT (new_phi));
3118         }
3119     }
3120
3121   /* End loop-exit-fixes after versioning.  */
3122
3123   if (cond_expr_stmt_list)
3124     {
3125       cond_exp_gsi = gsi_last_bb (condition_bb);
3126       gsi_insert_seq_before (&cond_exp_gsi, cond_expr_stmt_list,
3127                              GSI_SAME_STMT);
3128     }
3129   update_ssa (TODO_update_ssa);
3130 }