gcc/tree-vect-loop.cc
1 /* Loop Vectorization
2 Copyright (C) 2003-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #define INCLUDE_ALGORITHM
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "diagnostic-core.h"
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "cfganal.h"
39 #include "gimplify.h"
40 #include "gimple-iterator.h"
41 #include "gimplify-me.h"
42 #include "tree-ssa-loop-ivopts.h"
43 #include "tree-ssa-loop-manip.h"
44 #include "tree-ssa-loop-niter.h"
45 #include "tree-ssa-loop.h"
46 #include "cfgloop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57 #include "case-cfn-macros.h"
58 #include "langhooks.h"
59
60 /* Loop Vectorization Pass.
61
62 This pass tries to vectorize loops.
63
64 For example, the vectorizer transforms the following simple loop:
65
66 short a[N]; short b[N]; short c[N]; int i;
67
68 for (i=0; i<N; i++){
69 a[i] = b[i] + c[i];
70 }
71
72 as if it was manually vectorized by rewriting the source code into:
73
74 typedef int __attribute__((mode(V8HI))) v8hi;
75 short a[N]; short b[N]; short c[N]; int i;
76 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
77 v8hi va, vb, vc;
78
79 for (i=0; i<N/8; i++){
80 vb = pb[i];
81 vc = pc[i];
82 va = vb + vc;
83 pa[i] = va;
84 }
85
86 The main entry to this pass is vectorize_loops(), in which
87 the vectorizer applies a set of analyses on a given set of loops,
88 followed by the actual vectorization transformation for the loops that
89 had successfully passed the analysis phase.
90 Throughout this pass we make a distinction between two types of
91 data: scalars (which are represented by SSA_NAMES), and memory references
92 ("data-refs"). These two types of data require different handling both
93 during analysis and transformation. The types of data-refs that the
    94 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
95 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
96 accesses are required to have a simple (consecutive) access pattern.
97
98 Analysis phase:
99 ===============
100 The driver for the analysis phase is vect_analyze_loop().
101 It applies a set of analyses, some of which rely on the scalar evolution
102 analyzer (scev) developed by Sebastian Pop.
103
104 During the analysis phase the vectorizer records some information
105 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
106 loop, as well as general information about the loop as a whole, which is
107 recorded in a "loop_vec_info" struct attached to each loop.
108
109 Transformation phase:
110 =====================
111 The loop transformation phase scans all the stmts in the loop, and
112 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
113 the loop that needs to be vectorized. It inserts the vector code sequence
114 just before the scalar stmt S, and records a pointer to the vector code
115 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
116 attached to S). This pointer will be used for the vectorization of following
117 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
118 otherwise, we rely on dead code elimination for removing it.
119
120 For example, say stmt S1 was vectorized into stmt VS1:
121
122 VS1: vb = px[i];
123 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
124 S2: a = b;
125
126 To vectorize stmt S2, the vectorizer first finds the stmt that defines
127 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
128 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
129 resulting sequence would be:
130
131 VS1: vb = px[i];
132 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
133 VS2: va = vb;
134 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
135
   136         Operands that are not SSA_NAMEs are data-refs that appear in
137 load/store operations (like 'x[i]' in S1), and are handled differently.
138
139 Target modeling:
140 =================
141 Currently the only target specific information that is used is the
142 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
   143    Targets that can support different sizes of vectors will, for now, need
144 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
145 flexibility will be added in the future.
146
   147         Since we only vectorize operations whose vector form can be
148 expressed using existing tree codes, to verify that an operation is
149 supported, the vectorizer checks the relevant optab at the relevant
150 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
151 the value found is CODE_FOR_nothing, then there's no target support, and
152 we can't vectorize the stmt.
153
154 For additional information on this project see:
155 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
156 */
157
158 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
159 unsigned *);
160 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
161 bool *, bool *, bool);
162
163 /* Subroutine of vect_determine_vf_for_stmt that handles only one
164 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
165 may already be set for general statements (not just data refs). */
166
167 static opt_result
168 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
169 bool vectype_maybe_set_p,
170 poly_uint64 *vf)
171 {
172 gimple *stmt = stmt_info->stmt;
173
174 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
175 && !STMT_VINFO_LIVE_P (stmt_info))
176 || gimple_clobber_p (stmt))
177 {
178 if (dump_enabled_p ())
179 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
180 return opt_result::success ();
181 }
182
183 tree stmt_vectype, nunits_vectype;
184 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
185 &stmt_vectype,
186 &nunits_vectype);
187 if (!res)
188 return res;
189
190 if (stmt_vectype)
191 {
192 if (STMT_VINFO_VECTYPE (stmt_info))
193 /* The only case when a vectype had been already set is for stmts
194 that contain a data ref, or for "pattern-stmts" (stmts generated
195 by the vectorizer to represent/replace a certain idiom). */
196 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
197 || vectype_maybe_set_p)
198 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
199 else
200 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
201 }
202
203 if (nunits_vectype)
204 vect_update_max_nunits (vf, nunits_vectype);
205
206 return opt_result::success ();
207 }
208
209 /* Subroutine of vect_determine_vectorization_factor. Set the vector
210 types of STMT_INFO and all attached pattern statements and update
211 the vectorization factor VF accordingly. Return true on success
212 or false if something prevented vectorization. */
213
214 static opt_result
215 vect_determine_vf_for_stmt (vec_info *vinfo,
216 stmt_vec_info stmt_info, poly_uint64 *vf)
217 {
218 if (dump_enabled_p ())
219 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
220 stmt_info->stmt);
221 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
222 if (!res)
223 return res;
224
225 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
226 && STMT_VINFO_RELATED_STMT (stmt_info))
227 {
228 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
229 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
230
231 /* If a pattern statement has def stmts, analyze them too. */
232 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
233 !gsi_end_p (si); gsi_next (&si))
234 {
235 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
236 if (dump_enabled_p ())
237 dump_printf_loc (MSG_NOTE, vect_location,
238 "==> examining pattern def stmt: %G",
239 def_stmt_info->stmt);
240 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
241 if (!res)
242 return res;
243 }
244
245 if (dump_enabled_p ())
246 dump_printf_loc (MSG_NOTE, vect_location,
247 "==> examining pattern statement: %G",
248 stmt_info->stmt);
249 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
250 if (!res)
251 return res;
252 }
253
254 return opt_result::success ();
255 }
256
257 /* Function vect_determine_vectorization_factor
258
259 Determine the vectorization factor (VF). VF is the number of data elements
260 that are operated upon in parallel in a single iteration of the vectorized
261 loop. For example, when vectorizing a loop that operates on 4byte elements,
262 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
263 elements can fit in a single vector register.
264
265 We currently support vectorization of loops in which all types operated upon
266 are of the same size. Therefore this function currently sets VF according to
267 the size of the types operated upon, and fails if there are multiple sizes
268 in the loop.
269
270 VF is also the factor by which the loop iterations are strip-mined, e.g.:
271 original loop:
272 for (i=0; i<N; i++){
273 a[i] = b[i] + c[i];
274 }
275
276 vectorized loop:
277 for (i=0; i<N; i+=VF){
278 a[i:VF] = b[i:VF] + c[i:VF];
279 }
280 */
281
282 static opt_result
283 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
284 {
285 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
286 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
287 unsigned nbbs = loop->num_nodes;
288 poly_uint64 vectorization_factor = 1;
289 tree scalar_type = NULL_TREE;
290 gphi *phi;
291 tree vectype;
292 stmt_vec_info stmt_info;
293 unsigned i;
294
295 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
296
297 for (i = 0; i < nbbs; i++)
298 {
299 basic_block bb = bbs[i];
300
301 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
302 gsi_next (&si))
303 {
304 phi = si.phi ();
305 stmt_info = loop_vinfo->lookup_stmt (phi);
306 if (dump_enabled_p ())
307 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
308 (gimple *) phi);
309
310 gcc_assert (stmt_info);
311
312 if (STMT_VINFO_RELEVANT_P (stmt_info)
313 || STMT_VINFO_LIVE_P (stmt_info))
314 {
315 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
316 scalar_type = TREE_TYPE (PHI_RESULT (phi));
317
318 if (dump_enabled_p ())
319 dump_printf_loc (MSG_NOTE, vect_location,
320 "get vectype for scalar type: %T\n",
321 scalar_type);
322
323 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
324 if (!vectype)
325 return opt_result::failure_at (phi,
326 "not vectorized: unsupported "
327 "data-type %T\n",
328 scalar_type);
329 STMT_VINFO_VECTYPE (stmt_info) = vectype;
330
331 if (dump_enabled_p ())
332 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
333 vectype);
334
335 if (dump_enabled_p ())
336 {
337 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
338 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
339 dump_printf (MSG_NOTE, "\n");
340 }
341
342 vect_update_max_nunits (&vectorization_factor, vectype);
343 }
344 }
345
346 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
347 gsi_next (&si))
348 {
349 if (is_gimple_debug (gsi_stmt (si)))
350 continue;
351 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
352 opt_result res
353 = vect_determine_vf_for_stmt (loop_vinfo,
354 stmt_info, &vectorization_factor);
355 if (!res)
356 return res;
357 }
358 }
359
360 /* TODO: Analyze cost. Decide if worth while to vectorize. */
361 if (dump_enabled_p ())
362 {
363 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
364 dump_dec (MSG_NOTE, vectorization_factor);
365 dump_printf (MSG_NOTE, "\n");
366 }
367
368 if (known_le (vectorization_factor, 1U))
369 return opt_result::failure_at (vect_location,
370 "not vectorized: unsupported data-type\n");
371 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
372 return opt_result::success ();
373 }
374
375
376 /* Function vect_is_simple_iv_evolution.
377
   378    FORNOW: A simple evolution of an induction variable in the loop is
379 considered a polynomial evolution. */
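/* For example (illustrative only), for a counter such as

     for (i = 0; i < n; i += 4)

   starting at zero, the scalar evolution of i is the chrec {0, +, 4},
   so *INIT is 0 and *STEP is 4.  */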
380
381 static bool
382 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
383 tree * step)
384 {
385 tree init_expr;
386 tree step_expr;
387 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
388 basic_block bb;
389
390 /* When there is no evolution in this loop, the evolution function
391 is not "simple". */
392 if (evolution_part == NULL_TREE)
393 return false;
394
395 /* When the evolution is a polynomial of degree >= 2
396 the evolution function is not "simple". */
397 if (tree_is_chrec (evolution_part))
398 return false;
399
400 step_expr = evolution_part;
401 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
402
403 if (dump_enabled_p ())
404 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
405 step_expr, init_expr);
406
407 *init = init_expr;
408 *step = step_expr;
409
410 if (TREE_CODE (step_expr) != INTEGER_CST
411 && (TREE_CODE (step_expr) != SSA_NAME
412 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
413 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
414 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
415 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
416 || !flag_associative_math)))
417 && (TREE_CODE (step_expr) != REAL_CST
418 || !flag_associative_math))
419 {
420 if (dump_enabled_p ())
421 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
422 "step unknown.\n");
423 return false;
424 }
425
426 return true;
427 }
428
429 /* Function vect_is_nonlinear_iv_evolution
430
431 Only support nonlinear induction for integer type
432 1. neg
433 2. mul by constant
434 3. lshift/rshift by constant.
435
436 For neg induction, return a fake step as integer -1. */
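/* As an illustration (not an exhaustive list), in a loop like

     for (i = 0; i < n; i++)
       {
	 a[i] = x;
	 x = x * 3;
       }

   x is a nonlinear induction: *INIT is the value of x on loop entry,
   *STEP is 3, and the evolution type is vect_step_op_mul.  Likewise
   x = -x would give vect_step_op_neg with the fake step -1, and
   x = x << 2 would give vect_step_op_shl with step 2.  */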
437 static bool
438 vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
439 gphi* loop_phi_node, tree *init, tree *step)
440 {
441 tree init_expr, ev_expr, result, op1, op2;
442 gimple* def;
443
444 if (gimple_phi_num_args (loop_phi_node) != 2)
445 return false;
446
447 init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
448 ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
449
450 /* Support nonlinear induction only for integer type. */
451 if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
452 return false;
453
454 *init = init_expr;
455 result = PHI_RESULT (loop_phi_node);
456
457 if (TREE_CODE (ev_expr) != SSA_NAME
458 || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
459 || !is_gimple_assign (def))
460 return false;
461
462 enum tree_code t_code = gimple_assign_rhs_code (def);
463 switch (t_code)
464 {
465 case NEGATE_EXPR:
466 if (gimple_assign_rhs1 (def) != result)
467 return false;
468 *step = build_int_cst (TREE_TYPE (init_expr), -1);
469 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
470 break;
471
472 case RSHIFT_EXPR:
473 case LSHIFT_EXPR:
474 case MULT_EXPR:
475 op1 = gimple_assign_rhs1 (def);
476 op2 = gimple_assign_rhs2 (def);
477 if (TREE_CODE (op2) != INTEGER_CST
478 || op1 != result)
479 return false;
480 *step = op2;
481 if (t_code == LSHIFT_EXPR)
482 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
483 else if (t_code == RSHIFT_EXPR)
484 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
485 /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
486 else
487 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
488 break;
489
490 default:
491 return false;
492 }
493
494 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
495 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
496
497 return true;
498 }
499
500 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
501 what we are assuming is a double reduction. For example, given
502 a structure like this:
503
504 outer1:
505 x_1 = PHI <x_4(outer2), ...>;
506 ...
507
508 inner:
509 x_2 = PHI <x_1(outer1), ...>;
510 ...
511 x_3 = ...;
512 ...
513
514 outer2:
515 x_4 = PHI <x_3(inner)>;
516 ...
517
518 outer loop analysis would treat x_1 as a double reduction phi and
519 this function would then return true for x_2. */
520
521 static bool
522 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
523 {
524 use_operand_p use_p;
525 ssa_op_iter op_iter;
526 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
527 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
528 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
529 return true;
530 return false;
531 }
532
533 /* Returns true if Phi is a first-order recurrence. A first-order
534 recurrence is a non-reduction recurrence relation in which the value of
535 the recurrence in the current loop iteration equals a value defined in
536 the previous iteration. */
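/* For example (illustrative only), in

     for (i = 0; i < n; i++)
       {
	 b[i] = a[i] - t;
	 t = a[i];
       }

   the loop-header PHI for t is a first-order recurrence: each iteration
   consumes the value of a[i] loaded in the previous iteration.  */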
537
538 static bool
539 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
540 gphi *phi)
541 {
542 /* A nested cycle isn't vectorizable as first order recurrence. */
543 if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
544 return false;
545
546 /* Ensure the loop latch definition is from within the loop. */
547 edge latch = loop_latch_edge (loop);
548 tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
549 if (TREE_CODE (ldef) != SSA_NAME
550 || SSA_NAME_IS_DEFAULT_DEF (ldef)
551 || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
552 || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
553 return false;
554
555 tree def = gimple_phi_result (phi);
556
557 /* Ensure every use_stmt of the phi node is dominated by the latch
558 definition. */
559 imm_use_iterator imm_iter;
560 use_operand_p use_p;
561 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
562 if (!is_gimple_debug (USE_STMT (use_p))
563 && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
564 || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
565 USE_STMT (use_p))))
566 return false;
567
568 /* First-order recurrence autovectorization needs shuffle vector. */
569 tree scalar_type = TREE_TYPE (def);
570 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
571 if (!vectype)
572 return false;
573
574 return true;
575 }
576
577 /* Function vect_analyze_scalar_cycles_1.
578
579 Examine the cross iteration def-use cycles of scalar variables
580 in LOOP. LOOP_VINFO represents the loop that is now being
581 considered for vectorization (can be LOOP, or an outer-loop
   582    considered for vectorization (can be LOOP, or an outer-loop
   583    enclosing LOOP).  SLP indicates whether there will be subsequent
          SLP analyses.  */
584
585 static void
586 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
587 bool slp)
588 {
589 basic_block bb = loop->header;
590 tree init, step;
591 auto_vec<stmt_vec_info, 64> worklist;
592 gphi_iterator gsi;
593 bool double_reduc, reduc_chain;
594
595 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
596
597 /* First - identify all inductions. Reduction detection assumes that all the
598 inductions have been identified, therefore, this order must not be
599 changed. */
600 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
601 {
602 gphi *phi = gsi.phi ();
603 tree access_fn = NULL;
604 tree def = PHI_RESULT (phi);
605 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
606
607 if (dump_enabled_p ())
608 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
609 (gimple *) phi);
610
611 /* Skip virtual phi's. The data dependences that are associated with
612 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
613 if (virtual_operand_p (def))
614 continue;
615
616 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
617
618 /* Analyze the evolution function. */
619 access_fn = analyze_scalar_evolution (loop, def);
620 if (access_fn)
621 {
622 STRIP_NOPS (access_fn);
623 if (dump_enabled_p ())
624 dump_printf_loc (MSG_NOTE, vect_location,
625 "Access function of PHI: %T\n", access_fn);
626 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
627 = initial_condition_in_loop_num (access_fn, loop->num);
628 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
629 = evolution_part_in_loop_num (access_fn, loop->num);
630 }
631
632 if ((!access_fn
633 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
634 || !vect_is_simple_iv_evolution (loop->num, access_fn,
635 &init, &step)
636 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
637 && TREE_CODE (step) != INTEGER_CST))
638 /* Only handle nonlinear iv for same loop. */
639 && (LOOP_VINFO_LOOP (loop_vinfo) != loop
640 || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
641 phi, &init, &step)))
642 {
643 worklist.safe_push (stmt_vinfo);
644 continue;
645 }
646
647 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
648 != NULL_TREE);
649 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
650
651 if (dump_enabled_p ())
652 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
653 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
654 }
655
656
657 /* Second - identify all reductions and nested cycles. */
658 while (worklist.length () > 0)
659 {
660 stmt_vec_info stmt_vinfo = worklist.pop ();
661 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
662 tree def = PHI_RESULT (phi);
663
664 if (dump_enabled_p ())
665 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
666 (gimple *) phi);
667
668 gcc_assert (!virtual_operand_p (def)
669 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
670
671 stmt_vec_info reduc_stmt_info
672 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
673 &reduc_chain, slp);
674 if (reduc_stmt_info)
675 {
676 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
677 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
678 if (double_reduc)
679 {
680 if (dump_enabled_p ())
681 dump_printf_loc (MSG_NOTE, vect_location,
682 "Detected double reduction.\n");
683
684 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
685 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
686 }
687 else
688 {
689 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
690 {
691 if (dump_enabled_p ())
692 dump_printf_loc (MSG_NOTE, vect_location,
693 "Detected vectorizable nested cycle.\n");
694
695 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
696 }
697 else
698 {
699 if (dump_enabled_p ())
700 dump_printf_loc (MSG_NOTE, vect_location,
701 "Detected reduction.\n");
702
703 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
704 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
705 /* Store the reduction cycles for possible vectorization in
706 loop-aware SLP if it was not detected as reduction
707 chain. */
708 if (! reduc_chain)
709 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
710 (reduc_stmt_info);
711 }
712 }
713 }
714 else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
715 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
716 else
717 if (dump_enabled_p ())
718 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
719 "Unknown def-use cycle pattern.\n");
720 }
721 }
722
723
724 /* Function vect_analyze_scalar_cycles.
725
726 Examine the cross iteration def-use cycles of scalar variables, by
727 analyzing the loop-header PHIs of scalar variables. Classify each
728 cycle as one of the following: invariant, induction, reduction, unknown.
   729    We do that for the loop represented by LOOP_VINFO, and also for its
   730    inner-loop, if it exists.
731 Examples for scalar cycles:
732
733 Example1: reduction:
734
735 loop1:
736 for (i=0; i<N; i++)
737 sum += a[i];
738
739 Example2: induction:
740
741 loop2:
742 for (i=0; i<N; i++)
743 a[i] = i; */
744
745 static void
746 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
747 {
748 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
749
750 vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
751
752 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
753 Reductions in such inner-loop therefore have different properties than
754 the reductions in the nest that gets vectorized:
755 1. When vectorized, they are executed in the same order as in the original
756 scalar loop, so we can't change the order of computation when
757 vectorizing them.
758 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
759 current checks are too strict. */
760
761 if (loop->inner)
762 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
763 }
764
765 /* Transfer group and reduction information from STMT_INFO to its
766 pattern stmt. */
767
768 static void
769 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
770 {
771 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
772 stmt_vec_info stmtp;
773 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
774 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
775 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
776 do
777 {
778 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
779 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
780 == STMT_VINFO_DEF_TYPE (stmt_info));
781 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
782 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
783 if (stmt_info)
784 REDUC_GROUP_NEXT_ELEMENT (stmtp)
785 = STMT_VINFO_RELATED_STMT (stmt_info);
786 }
787 while (stmt_info);
788 }
789
790 /* Fixup scalar cycles that now have their stmts detected as patterns. */
791
792 static void
793 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
794 {
795 stmt_vec_info first;
796 unsigned i;
797
798 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
799 {
800 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
801 while (next)
802 {
803 if ((STMT_VINFO_IN_PATTERN_P (next)
804 != STMT_VINFO_IN_PATTERN_P (first))
805 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
806 break;
807 next = REDUC_GROUP_NEXT_ELEMENT (next);
808 }
809 /* If all reduction chain members are well-formed patterns adjust
810 the group to group the pattern stmts instead. */
811 if (! next
812 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
813 {
814 if (STMT_VINFO_IN_PATTERN_P (first))
815 {
816 vect_fixup_reduc_chain (first);
817 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
818 = STMT_VINFO_RELATED_STMT (first);
819 }
820 }
   821       /* If not all stmts in the chain are patterns or if we failed
   822          to update STMT_VINFO_REDUC_IDX, dissolve the chain and handle
   823          it as a regular reduction instead.  */
824 else
825 {
826 stmt_vec_info vinfo = first;
827 stmt_vec_info last = NULL;
828 while (vinfo)
829 {
830 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
831 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
832 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
833 last = vinfo;
834 vinfo = next;
835 }
836 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
837 = vect_internal_def;
838 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
839 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
840 --i;
841 }
842 }
843 }
844
845 /* Function vect_get_loop_niters.
846
   847    Determine how many iterations the loop executes and place it
848 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
849 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
850 niter information holds in ASSUMPTIONS.
851
852 Return the loop exit condition. */
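/* For example (illustrative only), for

     for (i = 0; i < n; i++)
       ...

   with n > 0, NUMBER_OF_ITERATIONS is n (the number of header
   executions) and NUMBER_OF_ITERATIONSM1 is n - 1 (the number of
   latch executions).  */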
853
854
855 static gcond *
856 vect_get_loop_niters (class loop *loop, tree *assumptions,
857 tree *number_of_iterations, tree *number_of_iterationsm1)
858 {
859 edge exit = single_exit (loop);
860 class tree_niter_desc niter_desc;
861 tree niter_assumptions, niter, may_be_zero;
862 gcond *cond = get_loop_exit_condition (loop);
863
864 *assumptions = boolean_true_node;
865 *number_of_iterationsm1 = chrec_dont_know;
866 *number_of_iterations = chrec_dont_know;
867 DUMP_VECT_SCOPE ("get_loop_niters");
868
869 if (!exit)
870 return cond;
871
872 may_be_zero = NULL_TREE;
873 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
874 || chrec_contains_undetermined (niter_desc.niter))
875 return cond;
876
877 niter_assumptions = niter_desc.assumptions;
878 may_be_zero = niter_desc.may_be_zero;
879 niter = niter_desc.niter;
880
881 if (may_be_zero && integer_zerop (may_be_zero))
882 may_be_zero = NULL_TREE;
883
884 if (may_be_zero)
885 {
886 if (COMPARISON_CLASS_P (may_be_zero))
887 {
   888           /* Try to combine may_be_zero with assumptions; this can simplify
889 computation of niter expression. */
890 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
891 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
892 niter_assumptions,
893 fold_build1 (TRUTH_NOT_EXPR,
894 boolean_type_node,
895 may_be_zero));
896 else
897 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
898 build_int_cst (TREE_TYPE (niter), 0),
899 rewrite_to_non_trapping_overflow (niter));
900
901 may_be_zero = NULL_TREE;
902 }
903 else if (integer_nonzerop (may_be_zero))
904 {
905 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
906 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
907 return cond;
908 }
909 else
910 return cond;
911 }
912
913 *assumptions = niter_assumptions;
914 *number_of_iterationsm1 = niter;
915
916 /* We want the number of loop header executions which is the number
917 of latch executions plus one.
918 ??? For UINT_MAX latch executions this number overflows to zero
919 for loops like do { n++; } while (n != 0); */
920 if (niter && !chrec_contains_undetermined (niter))
921 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
922 build_int_cst (TREE_TYPE (niter), 1));
923 *number_of_iterations = niter;
924
925 return cond;
926 }
927
928 /* Function bb_in_loop_p
929
930 Used as predicate for dfs order traversal of the loop bbs. */
931
932 static bool
933 bb_in_loop_p (const_basic_block bb, const void *data)
934 {
935 const class loop *const loop = (const class loop *)data;
936 if (flow_bb_inside_loop_p (loop, bb))
937 return true;
938 return false;
939 }
940
941
942 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
943 stmt_vec_info structs for all the stmts in LOOP_IN. */
944
945 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
946 : vec_info (vec_info::loop, shared),
947 loop (loop_in),
948 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
949 num_itersm1 (NULL_TREE),
950 num_iters (NULL_TREE),
951 num_iters_unchanged (NULL_TREE),
952 num_iters_assumptions (NULL_TREE),
953 vector_costs (nullptr),
954 scalar_costs (nullptr),
955 th (0),
956 versioning_threshold (0),
957 vectorization_factor (0),
958 main_loop_edge (nullptr),
959 skip_main_loop_edge (nullptr),
960 skip_this_loop_edge (nullptr),
961 reusable_accumulators (),
962 suggested_unroll_factor (1),
963 max_vectorization_factor (0),
964 mask_skip_niters (NULL_TREE),
965 rgroup_compare_type (NULL_TREE),
966 simd_if_cond (NULL_TREE),
967 partial_vector_style (vect_partial_vectors_none),
968 unaligned_dr (NULL),
969 peeling_for_alignment (0),
970 ptr_mask (0),
971 ivexpr_map (NULL),
972 scan_map (NULL),
973 slp_unrolling_factor (1),
974 inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
975 vectorizable (false),
976 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
977 using_partial_vectors_p (false),
978 using_decrementing_iv_p (false),
979 using_select_vl_p (false),
980 epil_using_partial_vectors_p (false),
981 partial_load_store_bias (0),
982 peeling_for_gaps (false),
983 peeling_for_niter (false),
984 no_data_dependencies (false),
985 has_mask_store (false),
986 scalar_loop_scaling (profile_probability::uninitialized ()),
987 scalar_loop (NULL),
988 orig_loop_info (NULL)
989 {
990 /* CHECKME: We want to visit all BBs before their successors (except for
991 latch blocks, for which this assertion wouldn't hold). In the simple
   992      case of the loop forms we allow, a dfs order of the BBs would be the same
993 as reversed postorder traversal, so we are safe. */
994
995 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
996 bbs, loop->num_nodes, loop);
997 gcc_assert (nbbs == loop->num_nodes);
998
999 for (unsigned int i = 0; i < nbbs; i++)
1000 {
1001 basic_block bb = bbs[i];
1002 gimple_stmt_iterator si;
1003
1004 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1005 {
1006 gimple *phi = gsi_stmt (si);
1007 gimple_set_uid (phi, 0);
1008 add_stmt (phi);
1009 }
1010
1011 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1012 {
1013 gimple *stmt = gsi_stmt (si);
1014 gimple_set_uid (stmt, 0);
1015 if (is_gimple_debug (stmt))
1016 continue;
1017 add_stmt (stmt);
  1018           /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
  1019              third argument is the #pragma omp simd if (x) condition: when 0, the
  1020              loop shouldn't be vectorized; when a non-zero constant, it should
  1021              be vectorized normally; otherwise the loop is versioned, with the
  1022              vectorized copy taken if the condition is non-zero at runtime.  */
1023 if (loop_in->simduid
1024 && is_gimple_call (stmt)
1025 && gimple_call_internal_p (stmt)
1026 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
1027 && gimple_call_num_args (stmt) >= 3
1028 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1029 && (loop_in->simduid
1030 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1031 {
1032 tree arg = gimple_call_arg (stmt, 2);
1033 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1034 simd_if_cond = arg;
1035 else
1036 gcc_assert (integer_nonzerop (arg));
1037 }
1038 }
1039 }
1040
1041 epilogue_vinfos.create (6);
1042 }
1043
1044 /* Free all levels of rgroup CONTROLS. */
1045
1046 void
1047 release_vec_loop_controls (vec<rgroup_controls> *controls)
1048 {
1049 rgroup_controls *rgc;
1050 unsigned int i;
1051 FOR_EACH_VEC_ELT (*controls, i, rgc)
1052 rgc->controls.release ();
1053 controls->release ();
1054 }
1055
1056 /* Free all memory used by the _loop_vec_info, as well as all the
1057 stmt_vec_info structs of all the stmts in the loop. */
1058
1059 _loop_vec_info::~_loop_vec_info ()
1060 {
1061 free (bbs);
1062
1063 release_vec_loop_controls (&masks.rgc_vec);
1064 release_vec_loop_controls (&lens);
1065 delete ivexpr_map;
1066 delete scan_map;
1067 epilogue_vinfos.release ();
1068 delete scalar_costs;
1069 delete vector_costs;
1070
  1071   /* When we release an epilogue vinfo that we do not intend to use,
  1072      avoid clearing AUX of the main loop, which should continue to
1073 point to the main loop vinfo since otherwise we'll leak that. */
1074 if (loop->aux == this)
1075 loop->aux = NULL;
1076 }
1077
1078 /* Return an invariant or register for EXPR and emit necessary
1079 computations in the LOOP_VINFO loop preheader. */
1080
1081 tree
1082 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1083 {
1084 if (is_gimple_reg (expr)
1085 || is_gimple_min_invariant (expr))
1086 return expr;
1087
1088 if (! loop_vinfo->ivexpr_map)
1089 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1090 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1091 if (! cached)
1092 {
1093 gimple_seq stmts = NULL;
1094 cached = force_gimple_operand (unshare_expr (expr),
1095 &stmts, true, NULL_TREE);
1096 if (stmts)
1097 {
1098 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1099 gsi_insert_seq_on_edge_immediate (e, stmts);
1100 }
1101 }
1102 return cached;
1103 }
1104
1105 /* Return true if we can use CMP_TYPE as the comparison type to produce
1106 all masks required to mask LOOP_VINFO. */
1107
1108 static bool
1109 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1110 {
1111 rgroup_controls *rgm;
1112 unsigned int i;
1113 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1114 if (rgm->type != NULL_TREE
1115 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1116 cmp_type, rgm->type,
1117 OPTIMIZE_FOR_SPEED))
1118 return false;
1119 return true;
1120 }
1121
1122 /* Calculate the maximum number of scalars per iteration for every
1123 rgroup in LOOP_VINFO. */
1124
1125 static unsigned int
1126 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1127 {
1128 unsigned int res = 1;
1129 unsigned int i;
1130 rgroup_controls *rgm;
1131 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1132 res = MAX (res, rgm->max_nscalars_per_iter);
1133 return res;
1134 }
1135
1136 /* Calculate the minimum precision necessary to represent:
1137
1138 MAX_NITERS * FACTOR
1139
1140 as an unsigned integer, where MAX_NITERS is the maximum number of
1141 loop header iterations for the original scalar form of LOOP_VINFO. */
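/* For example (illustrative numbers), with MAX_NITERS == 1000 and
   FACTOR == 4 the product is 4000, which needs 12 bits as an unsigned
   value, since 2^11 == 2048 < 4000 <= 4095 == 2^12 - 1.  */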
1142
1143 static unsigned
1144 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1145 {
1146 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1147
1148 /* Get the maximum number of iterations that is representable
1149 in the counter type. */
1150 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1151 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1152
1153 /* Get a more refined estimate for the number of iterations. */
1154 widest_int max_back_edges;
1155 if (max_loop_iterations (loop, &max_back_edges))
1156 max_ni = wi::smin (max_ni, max_back_edges + 1);
1157
1158 /* Work out how many bits we need to represent the limit. */
1159 return wi::min_precision (max_ni * factor, UNSIGNED);
1160 }
1161
1162 /* True if the loop needs peeling or partial vectors when vectorized. */
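/* For example (illustrative numbers), with a known niter count of 100
   and a vectorization factor of 8, 100 % 8 == 4 scalar iterations are
   left over, so peeling or partial vectors are needed; with 96
   iterations and no peeling for alignment or gaps they would not be.  */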
1163
1164 static bool
1165 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1166 {
1167 unsigned HOST_WIDE_INT const_vf;
1168 HOST_WIDE_INT max_niter
1169 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1170
1171 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1172 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1173 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1174 (loop_vinfo));
1175
1176 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1177 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1178 {
1179 /* Work out the (constant) number of iterations that need to be
1180 peeled for reasons other than niters. */
1181 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1182 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1183 peel_niter += 1;
1184 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1185 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1186 return true;
1187 }
1188 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1189 /* ??? When peeling for gaps but not alignment, we could
1190 try to check whether the (variable) niters is known to be
1191 VF * N + 1. That's something of a niche case though. */
1192 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1193 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1194 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1195 < (unsigned) exact_log2 (const_vf))
1196 /* In case of versioning, check if the maximum number of
1197 iterations is greater than th. If they are identical,
1198 the epilogue is unnecessary. */
1199 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1200 || ((unsigned HOST_WIDE_INT) max_niter
1201 > (th / const_vf) * const_vf))))
1202 return true;
1203
1204 return false;
1205 }
1206
1207 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1208 whether we can actually generate the masks required. Return true if so,
1209 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1210
1211 static bool
1212 vect_verify_full_masking (loop_vec_info loop_vinfo)
1213 {
1214 unsigned int min_ni_width;
1215
1216 /* Use a normal loop if there are no statements that need masking.
1217 This only happens in rare degenerate cases: it means that the loop
1218 has no loads, no stores, and no live-out values. */
1219 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1220 return false;
1221
1222 /* Produce the rgroup controls. */
1223 for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1224 {
1225 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1226 tree vectype = mask.first;
1227 unsigned nvectors = mask.second;
1228
1229 if (masks->rgc_vec.length () < nvectors)
1230 masks->rgc_vec.safe_grow_cleared (nvectors, true);
1231 rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1232 /* The number of scalars per iteration and the number of vectors are
1233 both compile-time constants. */
1234 unsigned int nscalars_per_iter
1235 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1236 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1237
1238 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1239 {
1240 rgm->max_nscalars_per_iter = nscalars_per_iter;
1241 rgm->type = truth_type_for (vectype);
1242 rgm->factor = 1;
1243 }
1244 }
1245
1246 unsigned int max_nscalars_per_iter
1247 = vect_get_max_nscalars_per_iter (loop_vinfo);
1248
1249 /* Work out how many bits we need to represent the limit. */
1250 min_ni_width
1251 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1252
1253 /* Find a scalar mode for which WHILE_ULT is supported. */
1254 opt_scalar_int_mode cmp_mode_iter;
1255 tree cmp_type = NULL_TREE;
1256 tree iv_type = NULL_TREE;
1257 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1258 unsigned int iv_precision = UINT_MAX;
1259
1260 if (iv_limit != -1)
1261 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1262 UNSIGNED);
1263
1264 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1265 {
1266 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1267 if (cmp_bits >= min_ni_width
1268 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1269 {
1270 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1271 if (this_type
1272 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1273 {
1274 /* Although we could stop as soon as we find a valid mode,
1275 there are at least two reasons why that's not always the
1276 best choice:
1277
1278 - An IV that's Pmode or wider is more likely to be reusable
1279 in address calculations than an IV that's narrower than
1280 Pmode.
1281
1282 - Doing the comparison in IV_PRECISION or wider allows
1283 a natural 0-based IV, whereas using a narrower comparison
1284 type requires mitigations against wrap-around.
1285
1286 Conversely, if the IV limit is variable, doing the comparison
1287 in a wider type than the original type can introduce
1288 unnecessary extensions, so picking the widest valid mode
1289 is not always a good choice either.
1290
1291 Here we prefer the first IV type that's Pmode or wider,
1292 and the first comparison type that's IV_PRECISION or wider.
1293 (The comparison type must be no wider than the IV type,
1294 to avoid extensions in the vector loop.)
1295
1296 ??? We might want to try continuing beyond Pmode for ILP32
1297 targets if CMP_BITS < IV_PRECISION. */
1298 iv_type = this_type;
1299 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1300 cmp_type = this_type;
1301 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1302 break;
1303 }
1304 }
1305 }
1306
1307 if (!cmp_type)
1308 {
1309 LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1310 return false;
1311 }
1312
1313 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1314 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1315 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1316 return true;
1317 }
1318
1319 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1320 whether we can actually generate AVX512 style masks. Return true if so,
1321 storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */
1322
1323 static bool
1324 vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1325 {
  1326   /* Produce a differently organized rgc_vec and check in a different
  1327      way whether we can produce the masks.  */
1328
1329 /* Use a normal loop if there are no statements that need masking.
1330 This only happens in rare degenerate cases: it means that the loop
1331 has no loads, no stores, and no live-out values. */
1332 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1333 return false;
1334
1335 /* For the decrementing IV we need to represent all values in
1336 [0, niter + niter_skip] where niter_skip is the elements we
1337 skip in the first iteration for prologue peeling. */
1338 tree iv_type = NULL_TREE;
1339 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1340 unsigned int iv_precision = UINT_MAX;
1341 if (iv_limit != -1)
1342 iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1343
1344 /* First compute the type for the IV we use to track the remaining
1345 scalar iterations. */
1346 opt_scalar_int_mode cmp_mode_iter;
1347 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1348 {
1349 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1350 if (cmp_bits >= iv_precision
1351 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1352 {
1353 iv_type = build_nonstandard_integer_type (cmp_bits, true);
1354 if (iv_type)
1355 break;
1356 }
1357 }
1358 if (!iv_type)
1359 return false;
1360
1361 /* Produce the rgroup controls. */
1362 for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1363 {
1364 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1365 tree vectype = mask.first;
1366 unsigned nvectors = mask.second;
1367
1368 /* The number of scalars per iteration and the number of vectors are
1369 both compile-time constants. */
1370 unsigned int nscalars_per_iter
1371 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1372 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1373
1374 /* We index the rgroup_controls vector with nscalars_per_iter
1375 which we keep constant and instead have a varying nvectors,
1376 remembering the vector mask with the fewest nV. */
1377 if (masks->rgc_vec.length () < nscalars_per_iter)
1378 masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1379 rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1380
1381 if (!rgm->type || rgm->factor > nvectors)
1382 {
1383 rgm->type = truth_type_for (vectype);
1384 rgm->compare_type = NULL_TREE;
1385 rgm->max_nscalars_per_iter = nscalars_per_iter;
1386 rgm->factor = nvectors;
1387 rgm->bias_adjusted_ctrl = NULL_TREE;
1388 }
1389 }
1390
1391 /* There is no fixed compare type we are going to use but we have to
1392 be able to get at one for each mask group. */
1393 unsigned int min_ni_width
1394 = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1395
1396 bool ok = true;
1397 for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1398 {
1399 tree mask_type = rgc.type;
1400 if (!mask_type)
1401 continue;
1402
1403 if (TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1404 {
1405 ok = false;
1406 break;
1407 }
1408
1409 /* If iv_type is usable as compare type use that - we can elide the
1410 saturation in that case. */
1411 if (TYPE_PRECISION (iv_type) >= min_ni_width)
1412 {
1413 tree cmp_vectype
1414 = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1415 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1416 rgc.compare_type = cmp_vectype;
1417 }
1418 if (!rgc.compare_type)
1419 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1420 {
1421 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1422 if (cmp_bits >= min_ni_width
1423 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1424 {
1425 tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1426 if (!cmp_type)
1427 continue;
1428
1429 /* Check whether we can produce the mask with cmp_type. */
1430 tree cmp_vectype
1431 = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1432 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1433 {
1434 rgc.compare_type = cmp_vectype;
1435 break;
1436 }
1437 }
1438 }
1439 if (!rgc.compare_type)
1440 {
1441 ok = false;
1442 break;
1443 }
1444 }
1445 if (!ok)
1446 {
1447 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1448 return false;
1449 }
1450
1451 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1452 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1453 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1454 return true;
1455 }
1456
  1457 /* Check whether we can use vector access with length based on precision
1458 comparison. So far, to keep it simple, we only allow the case that the
1459 precision of the target supported length is larger than the precision
1460 required by loop niters. */
1461
1462 static bool
1463 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1464 {
1465 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1466 return false;
1467
1468 machine_mode len_load_mode = get_len_load_store_mode
1469 (loop_vinfo->vector_mode, true).require ();
1470 machine_mode len_store_mode = get_len_load_store_mode
1471 (loop_vinfo->vector_mode, false).require ();
1472
1473 signed char partial_load_bias = internal_len_load_store_bias
1474 (IFN_LEN_LOAD, len_load_mode);
1475
1476 signed char partial_store_bias = internal_len_load_store_bias
1477 (IFN_LEN_STORE, len_store_mode);
1478
1479 gcc_assert (partial_load_bias == partial_store_bias);
1480
1481 if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1482 return false;
1483
1484 /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1485 len_loads with a length of zero. In order to avoid that we prohibit
1486 more than one loop length here. */
1487 if (partial_load_bias == -1
1488 && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1489 return false;
1490
1491 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1492
1493 unsigned int max_nitems_per_iter = 1;
1494 unsigned int i;
1495 rgroup_controls *rgl;
1496 /* Find the maximum number of items per iteration for every rgroup. */
1497 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1498 {
1499 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1500 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1501 }
1502
1503 /* Work out how many bits we need to represent the length limit. */
1504 unsigned int min_ni_prec
1505 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1506
1507 /* Now use the maximum of below precisions for one suitable IV type:
1508 - the IV's natural precision
1509 - the precision needed to hold: the maximum number of scalar
1510 iterations multiplied by the scale factor (min_ni_prec above)
1511 - the Pmode precision
1512
1513 If min_ni_prec is less than the precision of the current niters,
  1514      we prefer to still use the niters type.  Prefer to use Pmode and
1515 wider IV to avoid narrow conversions. */
1516
1517 unsigned int ni_prec
1518 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1519 min_ni_prec = MAX (min_ni_prec, ni_prec);
1520 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1521
1522 tree iv_type = NULL_TREE;
1523 opt_scalar_int_mode tmode_iter;
1524 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1525 {
1526 scalar_mode tmode = tmode_iter.require ();
1527 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1528
1529 /* ??? Do we really want to construct one IV whose precision exceeds
1530 BITS_PER_WORD? */
1531 if (tbits > BITS_PER_WORD)
1532 break;
1533
1534 /* Find the first available standard integral type. */
1535 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1536 {
1537 iv_type = build_nonstandard_integer_type (tbits, true);
1538 break;
1539 }
1540 }
1541
1542 if (!iv_type)
1543 {
1544 if (dump_enabled_p ())
1545 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1546 "can't vectorize with length-based partial vectors"
1547 " because there is no suitable iv type.\n");
1548 return false;
1549 }
1550
1551 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1552 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1553 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1554
1555 return true;
1556 }
1557
1558 /* Calculate the cost of one scalar iteration of the loop. */
1559 static void
1560 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1561 {
1562 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1563 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1564 int nbbs = loop->num_nodes, factor;
1565 int innerloop_iters, i;
1566
1567 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1568
1569 /* Gather costs for statements in the scalar loop. */
1570
1571 /* FORNOW. */
1572 innerloop_iters = 1;
1573 if (loop->inner)
1574 innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1575
1576 for (i = 0; i < nbbs; i++)
1577 {
1578 gimple_stmt_iterator si;
1579 basic_block bb = bbs[i];
1580
1581 if (bb->loop_father == loop->inner)
1582 factor = innerloop_iters;
1583 else
1584 factor = 1;
1585
1586 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1587 {
1588 gimple *stmt = gsi_stmt (si);
1589 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1590
1591 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1592 continue;
1593
1594 /* Skip stmts that are not vectorized inside the loop. */
1595 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1596 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1597 && (!STMT_VINFO_LIVE_P (vstmt_info)
1598 || !VECTORIZABLE_CYCLE_DEF
1599 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1600 continue;
1601
1602 vect_cost_for_stmt kind;
1603 if (STMT_VINFO_DATA_REF (stmt_info))
1604 {
1605 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1606 kind = scalar_load;
1607 else
1608 kind = scalar_store;
1609 }
1610 else if (vect_nop_conversion_p (stmt_info))
1611 continue;
1612 else
1613 kind = scalar_stmt;
1614
1615 /* We are using vect_prologue here to avoid scaling twice
1616 by the inner loop factor. */
1617 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1618 factor, kind, stmt_info, 0, vect_prologue);
1619 }
1620 }
1621
1622 /* Now accumulate cost. */
1623 loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1624 add_stmt_costs (loop_vinfo->scalar_costs,
1625 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1626 loop_vinfo->scalar_costs->finish_cost (nullptr);
1627 }
1628
1629
1630 /* Function vect_analyze_loop_form.
1631
1632 Verify that certain CFG restrictions hold, including:
1633 - the loop has a pre-header
1634 - the loop has a single entry and exit
1635 - the loop exit condition is simple enough
  1636    - the number of iterations can be analyzed, i.e., a countable loop.  The
1637 niter could be analyzed under some assumptions. */
1638
1639 opt_result
1640 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1641 {
1642 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1643
1644 /* Different restrictions apply when we are considering an inner-most loop,
1645 vs. an outer (nested) loop.
1646 (FORNOW. May want to relax some of these restrictions in the future). */
1647
1648 info->inner_loop_cond = NULL;
1649 if (!loop->inner)
1650 {
1651 /* Inner-most loop. We currently require that the number of BBs is
1652 exactly 2 (the header and latch). Vectorizable inner-most loops
1653 look like this:
1654
1655 (pre-header)
1656 |
1657 header <--------+
1658 | | |
1659 | +--> latch --+
1660 |
1661 (exit-bb) */
1662
1663 if (loop->num_nodes != 2)
1664 return opt_result::failure_at (vect_location,
1665 "not vectorized:"
1666 " control flow in loop.\n");
1667
1668 if (empty_block_p (loop->header))
1669 return opt_result::failure_at (vect_location,
1670 "not vectorized: empty loop.\n");
1671 }
1672 else
1673 {
1674 class loop *innerloop = loop->inner;
1675 edge entryedge;
1676
1677 /* Nested loop. We currently require that the loop is doubly-nested,
1678 contains a single inner loop, and the number of BBs is exactly 5.
1679 Vectorizable outer-loops look like this:
1680
1681 (pre-header)
1682 |
1683 header <---+
1684 | |
1685 inner-loop |
1686 | |
1687 tail ------+
1688 |
1689 (exit-bb)
1690
1691 The inner-loop has the properties expected of inner-most loops
1692 as described above. */
1693
1694 if ((loop->inner)->inner || (loop->inner)->next)
1695 return opt_result::failure_at (vect_location,
1696 "not vectorized:"
1697 " multiple nested loops.\n");
1698
1699 if (loop->num_nodes != 5)
1700 return opt_result::failure_at (vect_location,
1701 "not vectorized:"
1702 " control flow in loop.\n");
1703
1704 entryedge = loop_preheader_edge (innerloop);
1705 if (entryedge->src != loop->header
1706 || !single_exit (innerloop)
1707 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1708 return opt_result::failure_at (vect_location,
1709 "not vectorized:"
1710 " unsupported outerloop form.\n");
1711
1712 /* Analyze the inner-loop. */
1713 vect_loop_form_info inner;
1714 opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1715 if (!res)
1716 {
1717 if (dump_enabled_p ())
1718 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1719 "not vectorized: Bad inner loop.\n");
1720 return res;
1721 }
1722
1723 /* Don't support analyzing niter under assumptions for inner
1724 loop. */
1725 if (!integer_onep (inner.assumptions))
1726 return opt_result::failure_at (vect_location,
1727 "not vectorized: Bad inner loop.\n");
1728
1729 if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1730 return opt_result::failure_at (vect_location,
1731 "not vectorized: inner-loop count not"
1732 " invariant.\n");
1733
1734 if (dump_enabled_p ())
1735 dump_printf_loc (MSG_NOTE, vect_location,
1736 "Considering outer-loop vectorization.\n");
1737 info->inner_loop_cond = inner.loop_cond;
1738 }
1739
1740 if (!single_exit (loop))
1741 return opt_result::failure_at (vect_location,
1742 "not vectorized: multiple exits.\n");
1743 if (EDGE_COUNT (loop->header->preds) != 2)
1744 return opt_result::failure_at (vect_location,
1745 "not vectorized:"
1746 " too many incoming edges.\n");
1747
1748 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1749 that the loop is represented as a do-while (with a proper if-guard
1750 before the loop if needed), where the loop header contains all the
1751 executable statements, and the latch is empty. */
1752 if (!empty_block_p (loop->latch)
1753 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1754 return opt_result::failure_at (vect_location,
1755 "not vectorized: latch block not empty.\n");
1756
1757 /* Make sure the exit is not abnormal. */
1758 edge e = single_exit (loop);
1759 if (e->flags & EDGE_ABNORMAL)
1760 return opt_result::failure_at (vect_location,
1761 "not vectorized:"
1762 " abnormal loop exit edge.\n");
1763
1764 info->loop_cond
1765 = vect_get_loop_niters (loop, &info->assumptions,
1766 &info->number_of_iterations,
1767 &info->number_of_iterationsm1);
1768 if (!info->loop_cond)
1769 return opt_result::failure_at
1770 (vect_location,
1771 "not vectorized: complicated exit condition.\n");
1772
1773 if (integer_zerop (info->assumptions)
1774 || !info->number_of_iterations
1775 || chrec_contains_undetermined (info->number_of_iterations))
1776 return opt_result::failure_at
1777 (info->loop_cond,
1778 "not vectorized: number of iterations cannot be computed.\n");
1779
1780 if (integer_zerop (info->number_of_iterations))
1781 return opt_result::failure_at
1782 (info->loop_cond,
1783 "not vectorized: number of iterations = 0.\n");
1784
1785 if (!(tree_fits_shwi_p (info->number_of_iterations)
1786 && tree_to_shwi (info->number_of_iterations) > 0))
1787 {
1788 if (dump_enabled_p ())
1789 {
1790 dump_printf_loc (MSG_NOTE, vect_location,
1791 "Symbolic number of iterations is ");
1792 dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1793 dump_printf (MSG_NOTE, "\n");
1794 }
1795 }
1796
1797 return opt_result::success ();
1798 }
1799
1800 /* Create a loop_vec_info for LOOP with SHARED and the
1801 vect_analyze_loop_form result. */
1802
1803 loop_vec_info
1804 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1805 const vect_loop_form_info *info,
1806 loop_vec_info main_loop_info)
1807 {
1808 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1809 LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1810 LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1811 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1812 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1813 /* Also record the assumptions for versioning. */
1814 if (!integer_onep (info->assumptions) && !main_loop_info)
1815 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1816
1817 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (info->loop_cond);
1818 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1819 if (info->inner_loop_cond)
1820 {
1821 stmt_vec_info inner_loop_cond_info
1822 = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1823 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1824 /* If we have an estimate on the number of iterations of the inner
1825 loop, use that to limit the scale for costing; otherwise use
1826 --param vect-inner-loop-cost-factor literally. */
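/* Illustrative figures only: with --param vect-inner-loop-cost-factor=50
   and an inner loop whose statements are estimated to execute 4 times,
   the factor recorded below is min (4, 50) = 4.  */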
1827 widest_int nit;
1828 if (estimated_stmt_executions (loop->inner, &nit))
1829 LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1830 = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1831 }
1832
1833 return loop_vinfo;
1834 }
1835
1836
1837
1838 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1839 statements update the vectorization factor. */
1840
1841 static void
1842 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1843 {
1844 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1845 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1846 int nbbs = loop->num_nodes;
1847 poly_uint64 vectorization_factor;
1848 int i;
1849
1850 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1851
1852 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1853 gcc_assert (known_ne (vectorization_factor, 0U));
1854
1855 /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1856 vectorization factor of the loop is the unrolling factor required by
1857 the SLP instances. If that unrolling factor is 1, we say that we
1858 perform pure SLP on the loop - cross-iteration parallelism is not
1859 exploited. */
1860 bool only_slp_in_loop = true;
1861 for (i = 0; i < nbbs; i++)
1862 {
1863 basic_block bb = bbs[i];
1864 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1865 gsi_next (&si))
1866 {
1867 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1868 if (!stmt_info)
1869 continue;
1870 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1871 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1872 && !PURE_SLP_STMT (stmt_info))
1873 /* STMT needs both SLP and loop-based vectorization. */
1874 only_slp_in_loop = false;
1875 }
1876 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1877 gsi_next (&si))
1878 {
1879 if (is_gimple_debug (gsi_stmt (si)))
1880 continue;
1881 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1882 stmt_info = vect_stmt_to_vectorize (stmt_info);
1883 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1884 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1885 && !PURE_SLP_STMT (stmt_info))
1886 /* STMT needs both SLP and loop-based vectorization. */
1887 only_slp_in_loop = false;
1888 }
1889 }
1890
1891 if (only_slp_in_loop)
1892 {
1893 if (dump_enabled_p ())
1894 dump_printf_loc (MSG_NOTE, vect_location,
1895 "Loop contains only SLP stmts\n");
1896 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1897 }
1898 else
1899 {
1900 if (dump_enabled_p ())
1901 dump_printf_loc (MSG_NOTE, vect_location,
1902 "Loop contains SLP and non-SLP stmts\n");
1903 /* Both the vectorization factor and unroll factor have the form
1904 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1905 so they must have a common multiple. */
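/* For instance (purely illustrative), with a 16-byte vector mode, a loop
   vectorization factor of 8 (X = 1/2) and an SLP unrolling factor of 4
   (X = 1/4) combine to the common multiple 8 below.  */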
1906 vectorization_factor
1907 = force_common_multiple (vectorization_factor,
1908 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1909 }
1910
1911 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1912 if (dump_enabled_p ())
1913 {
1914 dump_printf_loc (MSG_NOTE, vect_location,
1915 "Updating vectorization factor to ");
1916 dump_dec (MSG_NOTE, vectorization_factor);
1917 dump_printf (MSG_NOTE, ".\n");
1918 }
1919 }
1920
1921 /* Return true if STMT_INFO describes a double reduction phi and if
1922 the other phi in the reduction is also relevant for vectorization.
1923 This rejects cases such as:
1924
1925 outer1:
1926 x_1 = PHI <x_3(outer2), ...>;
1927 ...
1928
1929 inner:
1930 x_2 = ...;
1931 ...
1932
1933 outer2:
1934 x_3 = PHI <x_2(inner)>;
1935
1936 if nothing in x_2 or elsewhere makes x_1 relevant. */
1937
1938 static bool
1939 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1940 {
1941 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1942 return false;
1943
1944 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1945 }
1946
1947 /* Function vect_analyze_loop_operations.
1948
1949 Scan the loop stmts and make sure they are all vectorizable. */
1950
1951 static opt_result
1952 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1953 {
1954 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1955 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1956 int nbbs = loop->num_nodes;
1957 int i;
1958 stmt_vec_info stmt_info;
1959 bool need_to_vectorize = false;
1960 bool ok;
1961
1962 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1963
1964 auto_vec<stmt_info_for_cost> cost_vec;
1965
1966 for (i = 0; i < nbbs; i++)
1967 {
1968 basic_block bb = bbs[i];
1969
1970 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1971 gsi_next (&si))
1972 {
1973 gphi *phi = si.phi ();
1974 ok = true;
1975
1976 stmt_info = loop_vinfo->lookup_stmt (phi);
1977 if (dump_enabled_p ())
1978 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
1979 (gimple *) phi);
1980 if (virtual_operand_p (gimple_phi_result (phi)))
1981 continue;
1982
1983 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1984 (i.e., a phi in the tail of the outer-loop). */
1985 if (! is_loop_header_bb_p (bb))
1986 {
1987 /* FORNOW: we currently don't support the case that these phis
1988 are not used in the outer loop (unless it is a double reduction,
1989 i.e., this phi is a vect_double_reduction_def), because this case
1990 would require us to actually do something here. */
1991 if (STMT_VINFO_LIVE_P (stmt_info)
1992 && !vect_active_double_reduction_p (stmt_info))
1993 return opt_result::failure_at (phi,
1994 "Unsupported loop-closed phi"
1995 " in outer-loop.\n");
1996
1997 /* If PHI is used in the outer loop, we check that its operand
1998 is defined in the inner loop. */
1999 if (STMT_VINFO_RELEVANT_P (stmt_info))
2000 {
2001 tree phi_op;
2002
2003 if (gimple_phi_num_args (phi) != 1)
2004 return opt_result::failure_at (phi, "unsupported phi");
2005
2006 phi_op = PHI_ARG_DEF (phi, 0);
2007 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
2008 if (!op_def_info)
2009 return opt_result::failure_at (phi, "unsupported phi\n");
2010
2011 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
2012 && (STMT_VINFO_RELEVANT (op_def_info)
2013 != vect_used_in_outer_by_reduction))
2014 return opt_result::failure_at (phi, "unsupported phi\n");
2015
2016 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2017 || (STMT_VINFO_DEF_TYPE (stmt_info)
2018 == vect_double_reduction_def))
2019 && !vectorizable_lc_phi (loop_vinfo,
2020 stmt_info, NULL, NULL))
2021 return opt_result::failure_at (phi, "unsupported phi\n");
2022 }
2023
2024 continue;
2025 }
2026
2027 gcc_assert (stmt_info);
2028
2029 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
2030 || STMT_VINFO_LIVE_P (stmt_info))
2031 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
2032 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
2033 /* A scalar-dependence cycle that we don't support. */
2034 return opt_result::failure_at (phi,
2035 "not vectorized:"
2036 " scalar dependence cycle.\n");
2037
2038 if (STMT_VINFO_RELEVANT_P (stmt_info))
2039 {
2040 need_to_vectorize = true;
2041 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
2042 && ! PURE_SLP_STMT (stmt_info))
2043 ok = vectorizable_induction (loop_vinfo,
2044 stmt_info, NULL, NULL,
2045 &cost_vec);
2046 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2047 || (STMT_VINFO_DEF_TYPE (stmt_info)
2048 == vect_double_reduction_def)
2049 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
2050 && ! PURE_SLP_STMT (stmt_info))
2051 ok = vectorizable_reduction (loop_vinfo,
2052 stmt_info, NULL, NULL, &cost_vec);
2053 else if ((STMT_VINFO_DEF_TYPE (stmt_info)
2054 == vect_first_order_recurrence)
2055 && ! PURE_SLP_STMT (stmt_info))
2056 ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
2057 &cost_vec);
2058 }
2059
2060 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
2061 if (ok
2062 && STMT_VINFO_LIVE_P (stmt_info)
2063 && !PURE_SLP_STMT (stmt_info))
2064 ok = vectorizable_live_operation (loop_vinfo,
2065 stmt_info, NULL, NULL, NULL,
2066 -1, false, &cost_vec);
2067
2068 if (!ok)
2069 return opt_result::failure_at (phi,
2070 "not vectorized: relevant phi not "
2071 "supported: %G",
2072 static_cast <gimple *> (phi));
2073 }
2074
2075 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2076 gsi_next (&si))
2077 {
2078 gimple *stmt = gsi_stmt (si);
2079 if (!gimple_clobber_p (stmt)
2080 && !is_gimple_debug (stmt))
2081 {
2082 opt_result res
2083 = vect_analyze_stmt (loop_vinfo,
2084 loop_vinfo->lookup_stmt (stmt),
2085 &need_to_vectorize,
2086 NULL, NULL, &cost_vec);
2087 if (!res)
2088 return res;
2089 }
2090 }
2091 } /* bbs */
2092
2093 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
2094
2095 /* All operations in the loop are either irrelevant (they deal with loop
2096 control or are dead), or are only used outside the loop and can be moved
2097 out of it (e.g. invariants, inductions). The loop can be
2098 optimized away by scalar optimizations. We're better off not
2099 touching this loop. */
2100 if (!need_to_vectorize)
2101 {
2102 if (dump_enabled_p ())
2103 dump_printf_loc (MSG_NOTE, vect_location,
2104 "All the computation can be taken out of the loop.\n");
2105 return opt_result::failure_at
2106 (vect_location,
2107 "not vectorized: redundant loop. no profit to vectorize.\n");
2108 }
2109
2110 return opt_result::success ();
2111 }
2112
2113 /* Return true if we know that the iteration count is smaller than the
2114 vectorization factor. Return false if it isn't, or if we can't be sure
2115 either way. */
2116
2117 static bool
2118 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
2119 {
2120 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2121
2122 HOST_WIDE_INT max_niter;
2123 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2124 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
2125 else
2126 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2127
2128 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
2129 return true;
2130
2131 return false;
2132 }
2133
2134 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
2135 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
2136 definitely no, or -1 if it's worth retrying. */
2137
2138 static int
2139 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
2140 unsigned *suggested_unroll_factor)
2141 {
2142 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2143 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2144
2145 /* Only loops that can handle partially-populated vectors can have iteration
2146 counts less than the vectorization factor. */
2147 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2148 && vect_known_niters_smaller_than_vf (loop_vinfo))
2149 {
2150 if (dump_enabled_p ())
2151 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2152 "not vectorized: iteration count smaller than "
2153 "vectorization factor.\n");
2154 return 0;
2155 }
2156
2157 /* If we know the number of iterations we can do better: for the
2158 epilogue we can also decide whether the main loop leaves us
2159 with enough iterations, preferring a smaller vector epilogue that is
2160 then also possibly used for the case we skip the vector loop. */
2161 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2162 {
2163 widest_int scalar_niters
2164 = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
2165 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2166 {
2167 loop_vec_info orig_loop_vinfo
2168 = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2169 unsigned lowest_vf
2170 = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
2171 int prolog_peeling = 0;
2172 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2173 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo);
2174 if (prolog_peeling >= 0
2175 && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2176 lowest_vf))
2177 {
2178 unsigned gap
2179 = LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0;
2180 scalar_niters = ((scalar_niters - gap - prolog_peeling)
2181 % lowest_vf + gap);
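/* Illustrative example: 105 scalar iterations, no peeling for gaps,
   3 prologue iterations and a main-loop VF of 4 leave
   (105 - 0 - 3) % 4 + 0 = 2 iterations for this epilogue.  */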
2182 }
2183 }
2184 /* Reject vectorizing for a single scalar iteration, even if
2185 we could in principle implement that using partial vectors. */
2186 unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
2187 if (scalar_niters <= peeling_gap + 1)
2188 {
2189 if (dump_enabled_p ())
2190 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2191 "not vectorized: loop only has a single "
2192 "scalar iteration.\n");
2193 return 0;
2194 }
2195
2196 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2197 {
2198 /* Check that the loop processes at least one full vector. */
2199 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2200 if (known_lt (scalar_niters, vf))
2201 {
2202 if (dump_enabled_p ())
2203 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2204 "loop does not have enough iterations "
2205 "to support vectorization.\n");
2206 return 0;
2207 }
2208
2209 /* If we need to peel an extra epilogue iteration to handle data
2210 accesses with gaps, check that there are enough scalar iterations
2211 available.
2212
2213 The check above is redundant with this one when peeling for gaps,
2214 but the distinction is useful for diagnostics. */
2215 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2216 && known_le (scalar_niters, vf))
2217 {
2218 if (dump_enabled_p ())
2219 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2220 "loop does not have enough iterations "
2221 "to support peeling for gaps.\n");
2222 return 0;
2223 }
2224 }
2225 }
2226
2227 /* If using the "very cheap" model, reject cases in which we'd keep
2228 a copy of the scalar code (even if we might be able to vectorize it). */
2229 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2230 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2231 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2232 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2233 {
2234 if (dump_enabled_p ())
2235 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2236 "some scalar iterations would need to be peeled\n");
2237 return 0;
2238 }
2239
2240 int min_profitable_iters, min_profitable_estimate;
2241 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2242 &min_profitable_estimate,
2243 suggested_unroll_factor);
2244
2245 if (min_profitable_iters < 0)
2246 {
2247 if (dump_enabled_p ())
2248 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2249 "not vectorized: vectorization not profitable.\n");
2250 if (dump_enabled_p ())
2251 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2252 "not vectorized: vector version will never be "
2253 "profitable.\n");
2254 return -1;
2255 }
2256
2257 int min_scalar_loop_bound = (param_min_vect_loop_bound
2258 * assumed_vf);
2259
2260 /* Use the cost model only if it is more conservative than the user-specified
2261 threshold. */
2262 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2263 min_profitable_iters);
2264
2265 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2266
2267 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2268 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2269 {
2270 if (dump_enabled_p ())
2271 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2272 "not vectorized: vectorization not profitable.\n");
2273 if (dump_enabled_p ())
2274 dump_printf_loc (MSG_NOTE, vect_location,
2275 "not vectorized: iteration count smaller than user "
2276 "specified loop bound parameter or minimum profitable "
2277 "iterations (whichever is more conservative).\n");
2278 return 0;
2279 }
2280
2281 /* The static profitability threshold min_profitable_estimate includes
2282 the cost of having to check at runtime whether the scalar loop
2283 should be used instead. If it turns out that we don't need or want
2284 such a check, the threshold we should use for the static estimate
2285 is simply the point at which the vector loop becomes more profitable
2286 than the scalar loop. */
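/* Purely illustrative figures: if min_profitable_iters is 8 but the cost
   of the runtime check pushes min_profitable_estimate to 12, then when no
   versioning, peeling or runtime profitability check is needed the static
   estimate can be lowered back to 8, as done below.  */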
2287 if (min_profitable_estimate > min_profitable_iters
2288 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2289 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2290 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2291 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2292 {
2293 if (dump_enabled_p ())
2294 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2295 " choice between the scalar and vector loops\n");
2296 min_profitable_estimate = min_profitable_iters;
2297 }
2298
2299 /* If the vector loop needs multiple iterations to be beneficial then
2300 things are probably too close to call, and the conservative thing
2301 would be to stick with the scalar code. */
2302 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2303 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2304 {
2305 if (dump_enabled_p ())
2306 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2307 "one iteration of the vector loop would be"
2308 " more expensive than the equivalent number of"
2309 " iterations of the scalar loop\n");
2310 return 0;
2311 }
2312
2313 HOST_WIDE_INT estimated_niter;
2314
2315 /* If we are vectorizing an epilogue then we know the maximum number of
2316 scalar iterations it will cover is at least one lower than the
2317 vectorization factor of the main loop. */
2318 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2319 estimated_niter
2320 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2321 else
2322 {
2323 estimated_niter = estimated_stmt_executions_int (loop);
2324 if (estimated_niter == -1)
2325 estimated_niter = likely_max_stmt_executions_int (loop);
2326 }
2327 if (estimated_niter != -1
2328 && ((unsigned HOST_WIDE_INT) estimated_niter
2329 < MAX (th, (unsigned) min_profitable_estimate)))
2330 {
2331 if (dump_enabled_p ())
2332 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2333 "not vectorized: estimated iteration count too "
2334 "small.\n");
2335 if (dump_enabled_p ())
2336 dump_printf_loc (MSG_NOTE, vect_location,
2337 "not vectorized: estimated iteration count smaller "
2338 "than specified loop bound parameter or minimum "
2339 "profitable iterations (whichever is more "
2340 "conservative).\n");
2341 return -1;
2342 }
2343
2344 return 1;
2345 }
2346
2347 static opt_result
2348 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2349 vec<data_reference_p> *datarefs,
2350 unsigned int *n_stmts)
2351 {
2352 *n_stmts = 0;
2353 for (unsigned i = 0; i < loop->num_nodes; i++)
2354 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2355 !gsi_end_p (gsi); gsi_next (&gsi))
2356 {
2357 gimple *stmt = gsi_stmt (gsi);
2358 if (is_gimple_debug (stmt))
2359 continue;
2360 ++(*n_stmts);
2361 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2362 NULL, 0);
2363 if (!res)
2364 {
2365 if (is_gimple_call (stmt) && loop->safelen)
2366 {
2367 tree fndecl = gimple_call_fndecl (stmt), op;
2368 if (fndecl == NULL_TREE
2369 && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2370 {
2371 fndecl = gimple_call_arg (stmt, 0);
2372 gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2373 fndecl = TREE_OPERAND (fndecl, 0);
2374 gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2375 }
2376 if (fndecl != NULL_TREE)
2377 {
2378 cgraph_node *node = cgraph_node::get (fndecl);
2379 if (node != NULL && node->simd_clones != NULL)
2380 {
2381 unsigned int j, n = gimple_call_num_args (stmt);
2382 for (j = 0; j < n; j++)
2383 {
2384 op = gimple_call_arg (stmt, j);
2385 if (DECL_P (op)
2386 || (REFERENCE_CLASS_P (op)
2387 && get_base_address (op)))
2388 break;
2389 }
2390 op = gimple_call_lhs (stmt);
2391 /* Ignore #pragma omp declare simd functions
2392 if they don't have data references in the
2393 call stmt itself. */
2394 if (j == n
2395 && !(op
2396 && (DECL_P (op)
2397 || (REFERENCE_CLASS_P (op)
2398 && get_base_address (op)))))
2399 continue;
2400 }
2401 }
2402 }
2403 return res;
2404 }
2405 /* If dependence analysis will give up due to the limit on the
2406 number of datarefs, stop here and fail fatally. */
2407 if (datarefs->length ()
2408 > (unsigned)param_loop_max_datarefs_for_datadeps)
2409 return opt_result::failure_at (stmt, "exceeded param "
2410 "loop-max-datarefs-for-datadeps\n");
2411 }
2412 return opt_result::success ();
2413 }
2414
2415 /* Look for SLP-only access groups and turn each individual access into its own
2416 group. */
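/* For example, a grouped access with DR_GROUP_SIZE 4 is split into four
   groups of size 1, each becoming its own group leader with DR_GROUP_GAP 3
   (or 0 for strided accesses).  */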
2417 static void
2418 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2419 {
2420 unsigned int i;
2421 struct data_reference *dr;
2422
2423 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2424
2425 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2426 FOR_EACH_VEC_ELT (datarefs, i, dr)
2427 {
2428 gcc_assert (DR_REF (dr));
2429 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2430
2431 /* Check if the access is part of an interleaving chain. */
2432 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2433 {
2434 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2435 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2436 unsigned int group_size = DR_GROUP_SIZE (first_element);
2437
2438 /* Check if this is an SLP-only group. */
2439 if (!STMT_SLP_TYPE (stmt_info)
2440 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2441 {
2442 /* Dissolve the group. */
2443 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2444
2445 stmt_vec_info vinfo = first_element;
2446 while (vinfo)
2447 {
2448 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2449 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2450 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2451 DR_GROUP_SIZE (vinfo) = 1;
2452 if (STMT_VINFO_STRIDED_P (first_element))
2453 DR_GROUP_GAP (vinfo) = 0;
2454 else
2455 DR_GROUP_GAP (vinfo) = group_size - 1;
2456 /* Duplicate and adjust the alignment info; it needs to
2457 be present on each group leader, see dr_misalignment. */
2458 if (vinfo != first_element)
2459 {
2460 dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2461 dr_info2->target_alignment = dr_info->target_alignment;
2462 int misalignment = dr_info->misalignment;
2463 if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2464 {
2465 HOST_WIDE_INT diff
2466 = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2467 - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2468 unsigned HOST_WIDE_INT align_c
2469 = dr_info->target_alignment.to_constant ();
2470 misalignment = (misalignment + diff) % align_c;
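/* E.g. a leader misaligned by 4 bytes, a member whose DR_INIT is
   8 bytes further on and a target alignment of 16 give
   (4 + 8) % 16 = 12.  */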
2471 }
2472 dr_info2->misalignment = misalignment;
2473 }
2474 vinfo = next;
2475 }
2476 }
2477 }
2478 }
2479 }
2480
2481 /* Determine if operating on full vectors for LOOP_VINFO might leave
2482 some scalar iterations still to do. If so, decide how we should
2483 handle those scalar iterations. The possibilities are:
2484
2485 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2486 In this case:
2487
2488 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2489 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2490 LOOP_VINFO_PEELING_FOR_NITER == false
2491
2492 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2493 to handle the remaining scalar iterations. In this case:
2494
2495 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2496 LOOP_VINFO_PEELING_FOR_NITER == true
2497
2498 There are two choices:
2499
2500 (2a) Consider vectorizing the epilogue loop at the same VF as the
2501 main loop, but using partial vectors instead of full vectors.
2502 In this case:
2503
2504 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2505
2506 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2507 In this case:
2508
2509 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2510 */
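/* As a rough illustration, a loop with 1003 scalar iterations and a VF of 8
   is handled either by 126 partial-vector iterations, the last of which has
   only 3 active lanes (case 1), or by 125 full-vector iterations followed by
   an epilogue that covers the remaining 3 scalar iterations (case 2).  */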
2511
2512 opt_result
2513 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
2514 {
2515 /* Determine whether there would be any scalar iterations left over. */
2516 bool need_peeling_or_partial_vectors_p
2517 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2518
2519 /* Decide whether to vectorize the loop with partial vectors. */
2520 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2521 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2522 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2523 && need_peeling_or_partial_vectors_p)
2524 {
2525 /* For partial-vector-usage=1, try to push the handling of partial
2526 vectors to the epilogue, with the main loop continuing to operate
2527 on full vectors.
2528
2529 If we are unrolling we also do not want to use partial vectors. This
2530 is to avoid the overhead of generating multiple masks and also to
2531 avoid having to execute entire iterations of FALSE masked instructions
2532 when dealing with one or fewer full iterations.
2533
2534 ??? We could then end up failing to use partial vectors if we
2535 decide to peel iterations into a prologue, and if the main loop
2536 then ends up processing fewer than VF iterations. */
2537 if ((param_vect_partial_vector_usage == 1
2538 || loop_vinfo->suggested_unroll_factor > 1)
2539 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2540 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2541 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2542 else
2543 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2544 }
2545
2546 if (dump_enabled_p ())
2547 dump_printf_loc (MSG_NOTE, vect_location,
2548 "operating on %s vectors%s.\n",
2549 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2550 ? "partial" : "full",
2551 LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2552 ? " for epilogue loop" : "");
2553
2554 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2555 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2556 && need_peeling_or_partial_vectors_p);
2557
2558 return opt_result::success ();
2559 }
2560
2561 /* Function vect_analyze_loop_2.
2562
2563 Apply a set of analyses on LOOP specified by LOOP_VINFO; the different
2564 analyses record information in some members of LOOP_VINFO. FATAL
2565 indicates whether some analysis hit a fatal error. If the pointer
2566 SUGGESTED_UNROLL_FACTOR is non-NULL, it is meant to be filled with a
2567 worked-out suggested unroll factor, while a NULL pointer indicates that
2568 we are applying the suggested unroll factor. SLP_DONE_FOR_SUGGESTED_UF
2569 holds the SLP decision made when the suggested unroll factor was worked
2570 out. */
2571 static opt_result
2572 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2573 unsigned *suggested_unroll_factor,
2574 bool& slp_done_for_suggested_uf)
2575 {
2576 opt_result ok = opt_result::success ();
2577 int res;
2578 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2579 poly_uint64 min_vf = 2;
2580 loop_vec_info orig_loop_vinfo = NULL;
2581
2582 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2583 loop_vec_info of the first vectorized loop. */
2584 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2585 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2586 else
2587 orig_loop_vinfo = loop_vinfo;
2588 gcc_assert (orig_loop_vinfo);
2589
2590 /* The first group of checks is independent of the vector size. */
2591 fatal = true;
2592
2593 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2594 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2595 return opt_result::failure_at (vect_location,
2596 "not vectorized: simd if(0)\n");
2597
2598 /* Find all data references in the loop (which correspond to vdefs/vuses)
2599 and analyze their evolution in the loop. */
2600
2601 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2602
2603 /* Gather the data references and count stmts in the loop. */
2604 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2605 {
2606 opt_result res
2607 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2608 &LOOP_VINFO_DATAREFS (loop_vinfo),
2609 &LOOP_VINFO_N_STMTS (loop_vinfo));
2610 if (!res)
2611 {
2612 if (dump_enabled_p ())
2613 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2614 "not vectorized: loop contains function "
2615 "calls or data references that cannot "
2616 "be analyzed\n");
2617 return res;
2618 }
2619 loop_vinfo->shared->save_datarefs ();
2620 }
2621 else
2622 loop_vinfo->shared->check_datarefs ();
2623
2624 /* Analyze the data references and also adjust the minimal
2625 vectorization factor according to the loads and stores. */
2626
2627 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2628 if (!ok)
2629 {
2630 if (dump_enabled_p ())
2631 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2632 "bad data references.\n");
2633 return ok;
2634 }
2635
2636 /* Check if we are applying the suggested unroll factor now. */
2637 bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2638 gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2639
2640 /* If the SLP decision was false when the suggested unroll factor was
2641 worked out, and we are now applying that unroll factor, we can simply
2642 skip all SLP-related analyses this time. */
2643 bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2644
2645 /* Classify all cross-iteration scalar data-flow cycles.
2646 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2647 vect_analyze_scalar_cycles (loop_vinfo, slp);
2648
2649 vect_pattern_recog (loop_vinfo);
2650
2651 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2652
2653 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2654 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2655
2656 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2657 if (!ok)
2658 {
2659 if (dump_enabled_p ())
2660 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2661 "bad data access.\n");
2662 return ok;
2663 }
2664
2665 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2666
2667 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2668 if (!ok)
2669 {
2670 if (dump_enabled_p ())
2671 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2672 "unexpected pattern.\n");
2673 return ok;
2674 }
2675
2676 /* The rest of the analysis below depends on the vector size in some way. */
2677 fatal = false;
2678
2679 /* Analyze data dependences between the data-refs in the loop
2680 and adjust the maximum vectorization factor according to
2681 the dependences.
2682 FORNOW: fail at the first data dependence that we encounter. */
2683
2684 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2685 if (!ok)
2686 {
2687 if (dump_enabled_p ())
2688 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2689 "bad data dependence.\n");
2690 return ok;
2691 }
2692 if (max_vf != MAX_VECTORIZATION_FACTOR
2693 && maybe_lt (max_vf, min_vf))
2694 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2695 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2696
2697 ok = vect_determine_vectorization_factor (loop_vinfo);
2698 if (!ok)
2699 {
2700 if (dump_enabled_p ())
2701 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2702 "can't determine vectorization factor.\n");
2703 return ok;
2704 }
2705 if (max_vf != MAX_VECTORIZATION_FACTOR
2706 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2707 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2708
2709 /* Compute the scalar iteration cost. */
2710 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2711
2712 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2713
2714 if (slp)
2715 {
2716 /* Check the SLP opportunities in the loop, analyze and build
2717 SLP trees. */
2718 ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2719 if (!ok)
2720 return ok;
2721
2722 /* If there are any SLP instances mark them as pure_slp. */
2723 slp = vect_make_slp_decision (loop_vinfo);
2724 if (slp)
2725 {
2726 /* Find stmts that need to be both vectorized and SLPed. */
2727 vect_detect_hybrid_slp (loop_vinfo);
2728
2729 /* Update the vectorization factor based on the SLP decision. */
2730 vect_update_vf_for_slp (loop_vinfo);
2731
2732 /* Optimize the SLP graph with the vectorization factor fixed. */
2733 vect_optimize_slp (loop_vinfo);
2734
2735 /* Gather the loads reachable from the SLP graph entries. */
2736 vect_gather_slp_loads (loop_vinfo);
2737 }
2738 }
2739
2740 bool saved_can_use_partial_vectors_p
2741 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2742
2743 /* We don't expect to have to roll back to anything other than an empty
2744 set of rgroups. */
2745 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2746
2747 /* This is the point where we can re-start analysis with SLP forced off. */
2748 start_over:
2749
2750 /* Apply the suggested unrolling factor; this was determined by the backend
2751 during finish_cost the first time we ran the analysis for this
2752 vector mode. */
2753 if (applying_suggested_uf)
2754 LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2755
2756 /* Now the vectorization factor is final. */
2757 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2758 gcc_assert (known_ne (vectorization_factor, 0U));
2759
2760 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2761 {
2762 dump_printf_loc (MSG_NOTE, vect_location,
2763 "vectorization_factor = ");
2764 dump_dec (MSG_NOTE, vectorization_factor);
2765 dump_printf (MSG_NOTE, ", niters = %wd\n",
2766 LOOP_VINFO_INT_NITERS (loop_vinfo));
2767 }
2768
2769 loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2770
2771 /* Analyze the alignment of the data-refs in the loop.
2772 Fail if a data reference is found that cannot be vectorized. */
2773
2774 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2775 if (!ok)
2776 {
2777 if (dump_enabled_p ())
2778 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2779 "bad data alignment.\n");
2780 return ok;
2781 }
2782
2783 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2784 It is important to call pruning after vect_analyze_data_ref_accesses,
2785 since we use grouping information gathered by interleaving analysis. */
2786 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2787 if (!ok)
2788 return ok;
2789
2790 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2791 vectorization, since we do not want to add extra peeling or
2792 add versioning for alignment. */
2793 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2794 /* This pass will decide on using loop versioning and/or loop peeling in
2795 order to enhance the alignment of data references in the loop. */
2796 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2797 if (!ok)
2798 return ok;
2799
2800 if (slp)
2801 {
2802 /* Analyze operations in the SLP instances. Note this may
2803 remove unsupported SLP instances which makes the above
2804 SLP kind detection invalid. */
2805 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2806 vect_slp_analyze_operations (loop_vinfo);
2807 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2808 {
2809 ok = opt_result::failure_at (vect_location,
2810 "unsupported SLP instances\n");
2811 goto again;
2812 }
2813
2814 /* Check whether any load in ALL SLP instances is possibly permuted. */
2815 slp_tree load_node, slp_root;
2816 unsigned i, x;
2817 slp_instance instance;
2818 bool can_use_lanes = true;
2819 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2820 {
2821 slp_root = SLP_INSTANCE_TREE (instance);
2822 int group_size = SLP_TREE_LANES (slp_root);
2823 tree vectype = SLP_TREE_VECTYPE (slp_root);
2824 bool loads_permuted = false;
2825 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2826 {
2827 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2828 continue;
2829 unsigned j;
2830 stmt_vec_info load_info;
2831 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2832 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2833 {
2834 loads_permuted = true;
2835 break;
2836 }
2837 }
2838
2839 /* If the loads and stores can be handled with load/store-lane
2840 instructions, record it and move on to the next instance. */
2841 if (loads_permuted
2842 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2843 && vect_store_lanes_supported (vectype, group_size, false))
2844 {
2845 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2846 {
2847 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2848 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2849 /* Use SLP for strided accesses (or if we can't
2850 load-lanes). */
2851 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2852 || ! vect_load_lanes_supported
2853 (STMT_VINFO_VECTYPE (stmt_vinfo),
2854 DR_GROUP_SIZE (stmt_vinfo), false))
2855 break;
2856 }
2857
2858 can_use_lanes
2859 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2860
2861 if (can_use_lanes && dump_enabled_p ())
2862 dump_printf_loc (MSG_NOTE, vect_location,
2863 "SLP instance %p can use load/store-lanes\n",
2864 (void *) instance);
2865 }
2866 else
2867 {
2868 can_use_lanes = false;
2869 break;
2870 }
2871 }
2872
2873 /* If all SLP instances can use load/store-lanes, abort SLP and try again
2874 with SLP disabled. */
2875 if (can_use_lanes)
2876 {
2877 ok = opt_result::failure_at (vect_location,
2878 "Built SLP cancelled: can use "
2879 "load/store-lanes\n");
2880 if (dump_enabled_p ())
2881 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2882 "Built SLP cancelled: all SLP instances support "
2883 "load/store-lanes\n");
2884 goto again;
2885 }
2886 }
2887
2888 /* Dissolve SLP-only groups. */
2889 vect_dissolve_slp_only_groups (loop_vinfo);
2890
2891 /* Scan all the remaining operations in the loop that are not subject
2892 to SLP and make sure they are vectorizable. */
2893 ok = vect_analyze_loop_operations (loop_vinfo);
2894 if (!ok)
2895 {
2896 if (dump_enabled_p ())
2897 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2898 "bad operation or unsupported loop bound.\n");
2899 return ok;
2900 }
2901
2902 /* For now, we don't expect to mix both masking and length approaches for one
2903 loop; disable partial vectors if both are recorded. */
2904 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2905 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2906 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2907 {
2908 if (dump_enabled_p ())
2909 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2910 "can't vectorize a loop with partial vectors"
2911 " because we don't expect to mix different"
2912 " approaches with partial vectors for the"
2913 " same loop.\n");
2914 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2915 }
2916
2917 /* If we still have the option of using partial vectors,
2918 check whether we can generate the necessary loop controls. */
2919 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
2920 {
2921 if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
2922 {
2923 if (!vect_verify_full_masking (loop_vinfo)
2924 && !vect_verify_full_masking_avx512 (loop_vinfo))
2925 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2926 }
2927 else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
2928 if (!vect_verify_loop_lens (loop_vinfo))
2929 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2930 }
2931
2932 /* If we're vectorizing a loop that uses length "controls" and
2933 can iterate more than once, we apply the decrementing IV approach
2934 to the loop control. */
2935 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2936 && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
2937 && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
2938 && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2939 && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
2940 LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
2941 LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
2942
2943 /* If a loop uses length controls and has a decrementing loop control IV,
2944 we will normally pass that IV through a MIN_EXPR to calculate the
2945 basis for the length controls. E.g. in a loop that processes one
2946 element per scalar iteration, the number of elements would be
2947 MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
2948
2949 This MIN_EXPR approach allows us to use pointer IVs with an invariant
2950 step, since only the final iteration of the vector loop can have
2951 inactive lanes.
2952
2953 However, some targets have a dedicated instruction for calculating the
2954 preferred length, given the total number of elements that still need to
2955 be processed. This is encapsulated in the SELECT_VL internal function.
2956
2957 If the target supports SELECT_VL, we can use it instead of MIN_EXPR
2958 to determine the basis for the length controls. However, unlike the
2959 MIN_EXPR calculation, the SELECT_VL calculation can decide to make
2960 lanes inactive in any iteration of the vector loop, not just the last
2961 iteration. This SELECT_VL approach therefore requires us to use pointer
2962 IVs with variable steps.
2963
2964 Once we've decided how many elements should be processed by one
2965 iteration of the vector loop, we need to populate the rgroup controls.
2966 If a loop has multiple rgroups, we need to make sure that those rgroups
2967 "line up" (that is, they must be consistent about which elements are
2968 active and which aren't). This is done by vect_adjust_loop_lens_control.
2969
2970 In principle, it would be possible to use vect_adjust_loop_lens_control
2971 on either the result of a MIN_EXPR or the result of a SELECT_VL.
2972 However:
2973
2974 (1) In practice, it only makes sense to use SELECT_VL when a vector
2975 operation will be controlled directly by the result. It is not
2976 worth using SELECT_VL if it would only be the input to other
2977 calculations.
2978
2979 (2) If we use SELECT_VL for an rgroup that has N controls, each associated
2980 pointer IV will need N updates by a variable amount (N-1 updates
2981 within the iteration and 1 update to move to the next iteration).
2982
2983 Because of this, we prefer to use the MIN_EXPR approach whenever there
2984 is more than one length control.
2985
2986 In addition, SELECT_VL always operates to a granularity of 1 unit.
2987 If we wanted to use it to control an SLP operation on N consecutive
2988 elements, we would need to make the SELECT_VL inputs measure scalar
2989 iterations (rather than elements) and then multiply the SELECT_VL
2990 result by N. But using SELECT_VL this way is inefficient because
2991 of (1) above.
2992
2993 In addition, we don't apply SELECT_VL to a single rgroup when both of
2994 the following hold:
2995
2996 (a) LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
2997 (b) LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
2998
2999 This is because SELECT_VL (with its variable step) makes SCEV analysis
3000 fail, so we would lose the benefit of subsequent unroll optimizations.
3001 We prefer the MIN_EXPR approach in this situation. */
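/* A rough sketch of the two schemes for a single length control, with N
   counting the scalar iterations left and VF elements per vector iteration
   (details simplified):

     MIN_EXPR approach:    len = MIN (N, VF);         N = N - len;
     SELECT_VL approach:   len = SELECT_VL (N, VF);   N = N - len;

   In the first scheme len equals VF on every iteration except possibly the
   last, so pointer IVs can step by an invariant amount; in the second, len
   may drop below VF on any iteration, so pointer IVs need variable steps.  */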
3002 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
3003 {
3004 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
3005 if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
3006 OPTIMIZE_FOR_SPEED)
3007 && LOOP_VINFO_LENS (loop_vinfo).length () == 1
3008 && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp
3009 && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3010 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
3011 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
3012 }
3013
3014 /* Decide whether this loop_vinfo should use partial vectors or peeling,
3015 assuming that the loop will be used as a main loop. We will redo
3016 this analysis later if we instead decide to use the loop as an
3017 epilogue loop. */
3018 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo);
3019 if (!ok)
3020 return ok;
3021
3022 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
3023 to be able to handle fewer than VF scalars, or needs to have a lower VF
3024 than the main loop. */
3025 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3026 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3027 {
3028 poly_uint64 unscaled_vf
3029 = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
3030 orig_loop_vinfo->suggested_unroll_factor);
3031 if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
3032 return opt_result::failure_at (vect_location,
3033 "Vectorization factor too high for"
3034 " epilogue loop.\n");
3035 }
3036
3037 /* Check the costings of the loop make vectorizing worthwhile. */
3038 res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
3039 if (res < 0)
3040 {
3041 ok = opt_result::failure_at (vect_location,
3042 "Loop costings may not be worthwhile.\n");
3043 goto again;
3044 }
3045 if (!res)
3046 return opt_result::failure_at (vect_location,
3047 "Loop costings not worthwhile.\n");
3048
3049 /* If an epilogue loop is required make sure we can create one. */
3050 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3051 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
3052 {
3053 if (dump_enabled_p ())
3054 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
3055 if (!vect_can_advance_ivs_p (loop_vinfo)
3056 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
3057 single_exit (LOOP_VINFO_LOOP
3058 (loop_vinfo))))
3059 {
3060 ok = opt_result::failure_at (vect_location,
3061 "not vectorized: can't create required "
3062 "epilog loop\n");
3063 goto again;
3064 }
3065 }
3066
3067 /* During peeling, we need to check if number of loop iterations is
3068 enough for both peeled prolog loop and vector loop. This check
3069 can be merged along with threshold check of loop versioning, so
3070 increase threshold for this case if necessary.
3071
3072 If we are analyzing an epilogue we still want to check what its
3073 versioning threshold would be. If we decide to vectorize the epilogues we
3074 will want to use the lowest versioning threshold of all epilogues and main
3075 loop. This will enable us to enter a vectorized epilogue even when
3076 versioning the loop. We can't simply check whether the epilogue requires
3077 versioning though since we may have skipped some versioning checks when
3078 analyzing the epilogue. For instance, checks for alias versioning will be
3079 skipped when dealing with epilogues as we assume we already checked them
3080 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
3081 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
3082 {
3083 poly_uint64 niters_th = 0;
3084 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3085
3086 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3087 {
3088 /* Niters for peeled prolog loop. */
3089 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3090 {
3091 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3092 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
3093 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
3094 }
3095 else
3096 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3097 }
3098
3099 /* Niters for at least one iteration of vectorized loop. */
3100 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3101 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3102 /* One additional iteration because of peeling for gap. */
3103 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3104 niters_th += 1;
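/* Illustrative example: 3 prologue iterations for alignment, a VF of 8 and
   peeling for gaps give a threshold of 3 + 8 + 1 = 12 before the cost-model
   threshold below is taken into account.  */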
3105
3106 /* Use the same condition as vect_transform_loop to decide when to use
3107 the cost to determine a versioning threshold. */
3108 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3109 && ordered_p (th, niters_th))
3110 niters_th = ordered_max (poly_uint64 (th), niters_th);
3111
3112 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
3113 }
3114
3115 gcc_assert (known_eq (vectorization_factor,
3116 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
3117
3118 slp_done_for_suggested_uf = slp;
3119
3120 /* Ok to vectorize! */
3121 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3122 return opt_result::success ();
3123
3124 again:
3125 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
3126 gcc_assert (!ok);
3127
3128 /* Try again with SLP forced off, but if we didn't do any SLP there is
3129 no point in re-trying. */
3130 if (!slp)
3131 return ok;
3132
3133 /* If the SLP decision was true when the suggested unroll factor was
3134 worked out, and we are now applying that unroll factor, we don't need
3135 to re-try any more. */
3136 if (applying_suggested_uf && slp_done_for_suggested_uf)
3137 return ok;
3138
3139 /* If there are reduction chains re-trying will fail anyway. */
3140 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
3141 return ok;
3142
3143 /* Likewise if the grouped loads or stores in the SLP cannot be handled
3144 via interleaving or lane instructions. */
3145 slp_instance instance;
3146 slp_tree node;
3147 unsigned i, j;
3148 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
3149 {
3150 stmt_vec_info vinfo;
3151 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
3152 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
3153 continue;
3154 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3155 unsigned int size = DR_GROUP_SIZE (vinfo);
3156 tree vectype = STMT_VINFO_VECTYPE (vinfo);
3157 if (! vect_store_lanes_supported (vectype, size, false)
3158 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
3159 && ! vect_grouped_store_supported (vectype, size))
3160 return opt_result::failure_at (vinfo->stmt,
3161 "unsupported grouped store\n");
3162 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
3163 {
3164 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
3165 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3166 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
3167 size = DR_GROUP_SIZE (vinfo);
3168 vectype = STMT_VINFO_VECTYPE (vinfo);
3169 if (! vect_load_lanes_supported (vectype, size, false)
3170 && ! vect_grouped_load_supported (vectype, single_element_p,
3171 size))
3172 return opt_result::failure_at (vinfo->stmt,
3173 "unsupported grouped load\n");
3174 }
3175 }
3176
3177 if (dump_enabled_p ())
3178 dump_printf_loc (MSG_NOTE, vect_location,
3179 "re-trying with SLP disabled\n");
3180
3181 /* Roll back state appropriately. No SLP this time. */
3182 slp = false;
3183 /* Restore the vectorization factor as it was without SLP. */
3184 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
3185 /* Free the SLP instances. */
3186 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
3187 vect_free_slp_instance (instance);
3188 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
3189 /* Reset SLP type to loop_vect on all stmts. */
3190 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3191 {
3192 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3193 for (gimple_stmt_iterator si = gsi_start_phis (bb);
3194 !gsi_end_p (si); gsi_next (&si))
3195 {
3196 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3197 STMT_SLP_TYPE (stmt_info) = loop_vect;
3198 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3199 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3200 {
3201 /* vectorizable_reduction adjusts reduction stmt def-types;
3202 restore them to that of the PHI. */
3203 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
3204 = STMT_VINFO_DEF_TYPE (stmt_info);
3205 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
3206 (STMT_VINFO_REDUC_DEF (stmt_info)))
3207 = STMT_VINFO_DEF_TYPE (stmt_info);
3208 }
3209 }
3210 for (gimple_stmt_iterator si = gsi_start_bb (bb);
3211 !gsi_end_p (si); gsi_next (&si))
3212 {
3213 if (is_gimple_debug (gsi_stmt (si)))
3214 continue;
3215 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3216 STMT_SLP_TYPE (stmt_info) = loop_vect;
3217 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3218 {
3219 stmt_vec_info pattern_stmt_info
3220 = STMT_VINFO_RELATED_STMT (stmt_info);
3221 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
3222 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
3223
3224 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
3225 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
3226 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
3227 !gsi_end_p (pi); gsi_next (&pi))
3228 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
3229 = loop_vect;
3230 }
3231 }
3232 }
3233 /* Free optimized alias test DDRS. */
3234 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
3235 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
3236 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
3237 /* Reset target cost data. */
3238 delete loop_vinfo->vector_costs;
3239 loop_vinfo->vector_costs = nullptr;
3240 /* Reset accumulated rgroup information. */
3241 LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
3242 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
3243 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
3244 /* Reset assorted flags. */
3245 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
3246 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
3247 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
3248 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
3249 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3250 = saved_can_use_partial_vectors_p;
3251
3252 goto start_over;
3253 }
3254
3255 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
3256 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
3257 OLD_LOOP_VINFO is better unless something specifically indicates
3258 otherwise.
3259
3260 Note that this deliberately isn't a partial order. */
3261
3262 static bool
3263 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
3264 loop_vec_info old_loop_vinfo)
3265 {
3266 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
3267 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
3268
3269 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
3270 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
3271
3272 /* Always prefer a VF of loop->simdlen over any other VF. */
3273 if (loop->simdlen)
3274 {
3275 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
3276 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
3277 if (new_simdlen_p != old_simdlen_p)
3278 return new_simdlen_p;
3279 }
3280
3281 const auto *old_costs = old_loop_vinfo->vector_costs;
3282 const auto *new_costs = new_loop_vinfo->vector_costs;
3283 if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
3284 return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
3285
3286 return new_costs->better_main_loop_than_p (old_costs);
3287 }
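
/* Illustrative sketch (not part of the vectorizer): with

     #pragma omp simd simdlen(8)
     for (int i = 0; i < n; i++)
       a[i] += b[i];

   a candidate loop_vinfo whose VF is known to equal 8 is preferred over
   one with VF 4 regardless of cost, because the simdlen check above runs
   before any cost comparison.  Only when neither or both candidates match
   simdlen do the target's cost hooks decide.  */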
3288
3289 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
3290 true if we should. */
3291
3292 static bool
3293 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
3294 loop_vec_info old_loop_vinfo)
3295 {
3296 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
3297 return false;
3298
3299 if (dump_enabled_p ())
3300 dump_printf_loc (MSG_NOTE, vect_location,
3301 "***** Preferring vector mode %s to vector mode %s\n",
3302 GET_MODE_NAME (new_loop_vinfo->vector_mode),
3303 GET_MODE_NAME (old_loop_vinfo->vector_mode));
3304 return true;
3305 }
3306
3307 /* Analyze LOOP with VECTOR_MODES[MODE_I], as an epilogue loop if MAIN_LOOP_VINFO
3308 is not NULL. If VECTOR_MODES[MODE_I] is VOIDmode, record the autodetected mode
3309 in AUTODETECTED_VECTOR_MODE and advance MODE_I to the next mode worth analyzing.
3310 Return the loop_vinfo on success and a wrapped null on failure. */
3311
3312 static opt_loop_vec_info
3313 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3314 const vect_loop_form_info *loop_form_info,
3315 loop_vec_info main_loop_vinfo,
3316 const vector_modes &vector_modes, unsigned &mode_i,
3317 machine_mode &autodetected_vector_mode,
3318 bool &fatal)
3319 {
3320 loop_vec_info loop_vinfo
3321 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3322
3323 machine_mode vector_mode = vector_modes[mode_i];
3324 loop_vinfo->vector_mode = vector_mode;
3325 unsigned int suggested_unroll_factor = 1;
3326 bool slp_done_for_suggested_uf = false;
3327
3328 /* Run the main analysis. */
3329 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
3330 &suggested_unroll_factor,
3331 slp_done_for_suggested_uf);
3332 if (dump_enabled_p ())
3333 dump_printf_loc (MSG_NOTE, vect_location,
3334 "***** Analysis %s with vector mode %s\n",
3335 res ? "succeeded" : " failed",
3336 GET_MODE_NAME (loop_vinfo->vector_mode));
3337
3338 if (res && !main_loop_vinfo && suggested_unroll_factor > 1)
3339 {
3340 if (dump_enabled_p ())
3341 dump_printf_loc (MSG_NOTE, vect_location,
3342 "***** Re-trying analysis for unrolling"
3343 " with unroll factor %d and slp %s.\n",
3344 suggested_unroll_factor,
3345 slp_done_for_suggested_uf ? "on" : "off");
3346 loop_vec_info unroll_vinfo
3347 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3348 unroll_vinfo->vector_mode = vector_mode;
3349 unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
3350 opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3351 slp_done_for_suggested_uf);
3352 if (new_res)
3353 {
3354 delete loop_vinfo;
3355 loop_vinfo = unroll_vinfo;
3356 }
3357 else
3358 delete unroll_vinfo;
3359 }
3360
3361 /* Remember the autodetected vector mode. */
3362 if (vector_mode == VOIDmode)
3363 autodetected_vector_mode = loop_vinfo->vector_mode;
3364
3365 /* Advance mode_i, first skipping modes that would yield the
3366 same analysis result. */
3367 while (mode_i + 1 < vector_modes.length ()
3368 && vect_chooses_same_modes_p (loop_vinfo,
3369 vector_modes[mode_i + 1]))
3370 {
3371 if (dump_enabled_p ())
3372 dump_printf_loc (MSG_NOTE, vect_location,
3373 "***** The result for vector mode %s would"
3374 " be the same\n",
3375 GET_MODE_NAME (vector_modes[mode_i + 1]));
3376 mode_i += 1;
3377 }
3378 if (mode_i + 1 < vector_modes.length ()
3379 && VECTOR_MODE_P (autodetected_vector_mode)
3380 && (related_vector_mode (vector_modes[mode_i + 1],
3381 GET_MODE_INNER (autodetected_vector_mode))
3382 == autodetected_vector_mode)
3383 && (related_vector_mode (autodetected_vector_mode,
3384 GET_MODE_INNER (vector_modes[mode_i + 1]))
3385 == vector_modes[mode_i + 1]))
3386 {
3387 if (dump_enabled_p ())
3388 dump_printf_loc (MSG_NOTE, vect_location,
3389 "***** Skipping vector mode %s, which would"
3390 " repeat the analysis for %s\n",
3391 GET_MODE_NAME (vector_modes[mode_i + 1]),
3392 GET_MODE_NAME (autodetected_vector_mode));
3393 mode_i += 1;
3394 }
3395 mode_i++;
3396
3397 if (!res)
3398 {
3399 delete loop_vinfo;
3400 if (fatal)
3401 gcc_checking_assert (main_loop_vinfo == NULL);
3402 return opt_loop_vec_info::propagate_failure (res);
3403 }
3404
3405 return opt_loop_vec_info::success (loop_vinfo);
3406 }
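
/* A hedged example of the mode-skipping logic above, assuming a target
   with 128-bit vectors: if the autodetected mode is V4SI and the next
   candidate is V16QI, then

     related_vector_mode (V16QI, SImode) == V4SI
     related_vector_mode (V4SI, QImode)  == V16QI

   so the two modes are mutually related and analyzing V16QI would only
   repeat the V4SI analysis; mode_i is advanced past it.  The exact mode
   names depend on the target and are used here for illustration only.  */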
3407
3408 /* Function vect_analyze_loop.
3409
3410 Apply a set of analyses on LOOP, and create a loop_vec_info struct
3411 for it. The different analyses will record information in the
3412 loop_vec_info struct. */
3413 opt_loop_vec_info
3414 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3415 {
3416 DUMP_VECT_SCOPE ("analyze_loop_nest");
3417
3418 if (loop_outer (loop)
3419 && loop_vec_info_for_loop (loop_outer (loop))
3420 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3421 return opt_loop_vec_info::failure_at (vect_location,
3422 "outer-loop already vectorized.\n");
3423
3424 if (!find_loop_nest (loop, &shared->loop_nest))
3425 return opt_loop_vec_info::failure_at
3426 (vect_location,
3427 "not vectorized: loop nest containing two or more consecutive inner"
3428 " loops cannot be vectorized\n");
3429
3430 /* Analyze the loop form. */
3431 vect_loop_form_info loop_form_info;
3432 opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3433 if (!res)
3434 {
3435 if (dump_enabled_p ())
3436 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3437 "bad loop form.\n");
3438 return opt_loop_vec_info::propagate_failure (res);
3439 }
3440 if (!integer_onep (loop_form_info.assumptions))
3441 {
3442 /* We consider vectorizing this loop by versioning it under
3443 some assumptions. In order to do this, we need to clear
3444 existing information computed by the scev and niter analyzers. */
3445 scev_reset_htab ();
3446 free_numbers_of_iterations_estimates (loop);
3447 /* Also set a flag for this loop so that subsequent scev and niter
3448 analyses are done under the assumptions. */
3449 loop_constraint_set (loop, LOOP_C_FINITE);
3450 }
3451
3452 auto_vector_modes vector_modes;
3453 /* Autodetect first vector size we try. */
3454 vector_modes.safe_push (VOIDmode);
3455 unsigned int autovec_flags
3456 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3457 loop->simdlen != 0);
3458 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3459 && !unlimited_cost_model (loop));
3460 machine_mode autodetected_vector_mode = VOIDmode;
3461 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3462 unsigned int mode_i = 0;
3463 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3464
3465 /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3466 a mode has not been analyzed. */
3467 auto_vec<poly_uint64, 8> cached_vf_per_mode;
3468 for (unsigned i = 0; i < vector_modes.length (); ++i)
3469 cached_vf_per_mode.safe_push (0);
3470
3471 /* First determine the main loop vectorization mode, either the first
3472 one that works, starting with auto-detecting the vector mode and then
3473 following the target's order of preference, or the one with the
3474 lowest cost if pick_lowest_cost_p. */
3475 while (1)
3476 {
3477 bool fatal;
3478 unsigned int last_mode_i = mode_i;
3479 /* Set cached VF to -1 prior to analysis, which indicates a mode has
3480 failed. */
3481 cached_vf_per_mode[last_mode_i] = -1;
3482 opt_loop_vec_info loop_vinfo
3483 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3484 NULL, vector_modes, mode_i,
3485 autodetected_vector_mode, fatal);
3486 if (fatal)
3487 break;
3488
3489 if (loop_vinfo)
3490 {
3491 /* Analysis has been successful, so update the VF value. The
3492 VF should always be a multiple of the unroll factor and we want to
3493 capture the original VF here. */
3494 cached_vf_per_mode[last_mode_i]
3495 = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3496 loop_vinfo->suggested_unroll_factor);
3497 /* Once we hit the desired simdlen for the first time,
3498 discard any previous attempts. */
3499 if (simdlen
3500 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3501 {
3502 delete first_loop_vinfo;
3503 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3504 simdlen = 0;
3505 }
3506 else if (pick_lowest_cost_p
3507 && first_loop_vinfo
3508 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3509 {
3510 /* Pick loop_vinfo over first_loop_vinfo. */
3511 delete first_loop_vinfo;
3512 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3513 }
3514 if (first_loop_vinfo == NULL)
3515 first_loop_vinfo = loop_vinfo;
3516 else
3517 {
3518 delete loop_vinfo;
3519 loop_vinfo = opt_loop_vec_info::success (NULL);
3520 }
3521
3522 /* Commit to first_loop_vinfo if we have no reason to try
3523 alternatives. */
3524 if (!simdlen && !pick_lowest_cost_p)
3525 break;
3526 }
3527 if (mode_i == vector_modes.length ()
3528 || autodetected_vector_mode == VOIDmode)
3529 break;
3530
3531 /* Try the next biggest vector size. */
3532 if (dump_enabled_p ())
3533 dump_printf_loc (MSG_NOTE, vect_location,
3534 "***** Re-trying analysis with vector mode %s\n",
3535 GET_MODE_NAME (vector_modes[mode_i]));
3536 }
3537 if (!first_loop_vinfo)
3538 return opt_loop_vec_info::propagate_failure (res);
3539
3540 if (dump_enabled_p ())
3541 dump_printf_loc (MSG_NOTE, vect_location,
3542 "***** Choosing vector mode %s\n",
3543 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3544
3545 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3546 enabled, SIMDUID is not set, it is the innermost loop and we have
3547 either already found the loop's SIMDLEN or there was no SIMDLEN to
3548 begin with.
3549 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3550 bool vect_epilogues = (!simdlen
3551 && loop->inner == NULL
3552 && param_vect_epilogues_nomask
3553 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3554 && !loop->simduid);
3555 if (!vect_epilogues)
3556 return first_loop_vinfo;
3557
3558 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3559 poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3560
3561 /* For epilogues start the analysis from the first mode. The motivation
3562 behind starting from the beginning comes from cases where the VECTOR_MODES
3563 array may contain length-agnostic and length-specific modes. Their
3564 ordering is not guaranteed, so we could end up picking a mode for the main
3565 loop that is after the epilogue's optimal mode. */
3566 vector_modes[0] = autodetected_vector_mode;
3567 mode_i = 0;
3568
3569 bool supports_partial_vectors =
3570 partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3571 poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3572
3573 while (1)
3574 {
3575 /* If the target does not support partial vectors we can shorten the
3576 number of modes to analyze for the epilogue as we know we can't pick a
3577 mode that would lead to a VF at least as big as the
3578 FIRST_VINFO_VF. */
3579 if (!supports_partial_vectors
3580 && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3581 {
3582 mode_i++;
3583 if (mode_i == vector_modes.length ())
3584 break;
3585 continue;
3586 }
3587
3588 if (dump_enabled_p ())
3589 dump_printf_loc (MSG_NOTE, vect_location,
3590 "***** Re-trying epilogue analysis with vector "
3591 "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3592
3593 bool fatal;
3594 opt_loop_vec_info loop_vinfo
3595 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3596 first_loop_vinfo,
3597 vector_modes, mode_i,
3598 autodetected_vector_mode, fatal);
3599 if (fatal)
3600 break;
3601
3602 if (loop_vinfo)
3603 {
3604 if (pick_lowest_cost_p)
3605 {
3606 /* Keep trying to roll back vectorization attempts while the
3607 loop_vec_infos they produced were worse than this one. */
3608 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3609 while (!vinfos.is_empty ()
3610 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3611 {
3612 gcc_assert (vect_epilogues);
3613 delete vinfos.pop ();
3614 }
3615 }
3616 /* For now only allow one epilogue loop. */
3617 if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3618 {
3619 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3620 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3621 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3622 || maybe_ne (lowest_th, 0U));
3623 /* Keep track of the known smallest versioning
3624 threshold. */
3625 if (ordered_p (lowest_th, th))
3626 lowest_th = ordered_min (lowest_th, th);
3627 }
3628 else
3629 {
3630 delete loop_vinfo;
3631 loop_vinfo = opt_loop_vec_info::success (NULL);
3632 }
3633
3634 /* For now only allow one epilogue loop, but allow
3635 pick_lowest_cost_p to replace it, so commit to the
3636 first epilogue if we have no reason to try alternatives. */
3637 if (!pick_lowest_cost_p)
3638 break;
3639 }
3640
3641 if (mode_i == vector_modes.length ())
3642 break;
3643
3644 }
3645
3646 if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3647 {
3648 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3649 if (dump_enabled_p ())
3650 dump_printf_loc (MSG_NOTE, vect_location,
3651 "***** Choosing epilogue vector mode %s\n",
3652 GET_MODE_NAME
3653 (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3654 }
3655
3656 return first_loop_vinfo;
3657 }
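
/* Rough illustration of the flow above (details are target-dependent):
   the main loop might be vectorized with, say, a 256-bit mode giving
   VF = 8 for int elements, after which the epilogue analysis restarts
   from the autodetected mode and may settle on a 128-bit mode with
   VF = 4 to handle the remaining iterations.  The epilogue loop_vinfo is
   recorded in first_loop_vinfo->epilogue_vinfos and the smallest known
   versioning threshold is kept in LOOP_VINFO_VERSIONING_THRESHOLD.  */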
3658
3659 /* Return true if there is an in-order reduction function for CODE, storing
3660 it in *REDUC_FN if so. */
3661
3662 static bool
3663 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3664 {
3665 if (code == PLUS_EXPR)
3666 {
3667 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3668 return true;
3669 }
3670 return false;
3671 }
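
/* For illustration only: an in-order (fold-left) reduction of

     double s = init;
     for (int i = 0; i < n; i++)
       s += a[i];

   must accumulate the vector lanes strictly left to right, e.g. for a
   4-lane vector v the result is (((s + v[0]) + v[1]) + v[2]) + v[3],
   which is what IFN_FOLD_LEFT_PLUS provides.  This preserves the scalar
   FP semantics when reassociation is not allowed.  */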
3672
3673 /* Function reduction_fn_for_scalar_code
3674
3675 Input:
3676 CODE - tree_code of a reduction operation.
3677
3678 Output:
3679 REDUC_FN - the corresponding internal function to be used to reduce the
3680 vector of partial results into a single scalar result, or IFN_LAST
3681 if the operation is a supported reduction operation, but does not have
3682 such an internal function.
3683
3684 Return FALSE if CODE currently cannot be vectorized as a reduction. */
3685
3686 bool
3687 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3688 {
3689 if (code.is_tree_code ())
3690 switch (tree_code (code))
3691 {
3692 case MAX_EXPR:
3693 *reduc_fn = IFN_REDUC_MAX;
3694 return true;
3695
3696 case MIN_EXPR:
3697 *reduc_fn = IFN_REDUC_MIN;
3698 return true;
3699
3700 case PLUS_EXPR:
3701 *reduc_fn = IFN_REDUC_PLUS;
3702 return true;
3703
3704 case BIT_AND_EXPR:
3705 *reduc_fn = IFN_REDUC_AND;
3706 return true;
3707
3708 case BIT_IOR_EXPR:
3709 *reduc_fn = IFN_REDUC_IOR;
3710 return true;
3711
3712 case BIT_XOR_EXPR:
3713 *reduc_fn = IFN_REDUC_XOR;
3714 return true;
3715
3716 case MULT_EXPR:
3717 case MINUS_EXPR:
3718 *reduc_fn = IFN_LAST;
3719 return true;
3720
3721 default:
3722 return false;
3723 }
3724 else
3725 switch (combined_fn (code))
3726 {
3727 CASE_CFN_FMAX:
3728 *reduc_fn = IFN_REDUC_FMAX;
3729 return true;
3730
3731 CASE_CFN_FMIN:
3732 *reduc_fn = IFN_REDUC_FMIN;
3733 return true;
3734
3735 default:
3736 return false;
3737 }
3738 }
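
/* Sketch of how the mapping above is used (simplified): for

     int s = 0;
     for (int i = 0; i < n; i++)
       s += a[i];

   the loop body adds into a vector accumulator and the epilogue collapses
   it with IFN_REDUC_PLUS, conceptually s = REDUC_PLUS (vacc).  For codes
   such as MULT_EXPR the function returns true with IFN_LAST, meaning the
   reduction is supported but the epilogue has to fall back to a generic
   reduction sequence (e.g. repeated vector shifts and operations) instead
   of a single internal function.  */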
3739
3740 /* If there is a neutral value X such that a reduction would not be affected
3741 by the introduction of additional X elements, return that X, otherwise
3742 return null. CODE is the code of the reduction and SCALAR_TYPE is type
3743 of the scalar elements. If the reduction has just a single initial value
3744 then INITIAL_VALUE is that value, otherwise it is null. */
3745
3746 tree
3747 neutral_op_for_reduction (tree scalar_type, code_helper code,
3748 tree initial_value)
3749 {
3750 if (code.is_tree_code ())
3751 switch (tree_code (code))
3752 {
3753 case WIDEN_SUM_EXPR:
3754 case DOT_PROD_EXPR:
3755 case SAD_EXPR:
3756 case PLUS_EXPR:
3757 case MINUS_EXPR:
3758 case BIT_IOR_EXPR:
3759 case BIT_XOR_EXPR:
3760 return build_zero_cst (scalar_type);
3761
3762 case MULT_EXPR:
3763 return build_one_cst (scalar_type);
3764
3765 case BIT_AND_EXPR:
3766 return build_all_ones_cst (scalar_type);
3767
3768 case MAX_EXPR:
3769 case MIN_EXPR:
3770 return initial_value;
3771
3772 default:
3773 return NULL_TREE;
3774 }
3775 else
3776 switch (combined_fn (code))
3777 {
3778 CASE_CFN_FMIN:
3779 CASE_CFN_FMAX:
3780 return initial_value;
3781
3782 default:
3783 return NULL_TREE;
3784 }
3785 }
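
/* Illustrative values returned above: PLUS/MINUS/IOR/XOR -> 0,
   MULT -> 1, AND -> all-ones, MIN/MAX -> the initial value (if any).
   The neutral value is what inactive or padding lanes are filled with so
   that, e.g., reducing {x0, x1, 0, 0} with PLUS still yields x0 + x1.  */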
3786
3787 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3788 STMT is printed with a message MSG. */
3789
3790 static void
3791 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3792 {
3793 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3794 }
3795
3796 /* Return true if we need an in-order reduction for operation CODE
3797 on type TYPE, i.e. if the operations cannot be reassociated
3798 without changing the result. */
3799
3800 bool
3801 needs_fold_left_reduction_p (tree type, code_helper code)
3802 {
3803 /* CHECKME: check for !flag_finite_math_only too? */
3804 if (SCALAR_FLOAT_TYPE_P (type))
3805 {
3806 if (code.is_tree_code ())
3807 switch (tree_code (code))
3808 {
3809 case MIN_EXPR:
3810 case MAX_EXPR:
3811 return false;
3812
3813 default:
3814 return !flag_associative_math;
3815 }
3816 else
3817 switch (combined_fn (code))
3818 {
3819 CASE_CFN_FMIN:
3820 CASE_CFN_FMAX:
3821 return false;
3822
3823 default:
3824 return !flag_associative_math;
3825 }
3826 }
3827
3828 if (INTEGRAL_TYPE_P (type))
3829 return (!code.is_tree_code ()
3830 || !operation_no_trapping_overflow (type, tree_code (code)));
3831
3832 if (SAT_FIXED_POINT_TYPE_P (type))
3833 return true;
3834
3835 return false;
3836 }
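
/* Example of why this matters (illustration only): with doubles,
   (1e16 + 1.0) + -1e16 == 0.0 while (1e16 + -1e16) + 1.0 == 1.0, so a
   floating-point PLUS reduction cannot be reassociated across vector
   lanes unless -fassociative-math is in effect; it must stay in order.  */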
3837
3838 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
3839 has a handled computation expression. Store the main reduction
3840 operation in *CODE. */
3841
3842 static bool
3843 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3844 tree loop_arg, code_helper *code,
3845 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3846 {
3847 auto_bitmap visited;
3848 tree lookfor = PHI_RESULT (phi);
3849 ssa_op_iter curri;
3850 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3851 while (USE_FROM_PTR (curr) != loop_arg)
3852 curr = op_iter_next_use (&curri);
3853 curri.i = curri.numops;
3854 do
3855 {
3856 path.safe_push (std::make_pair (curri, curr));
3857 tree use = USE_FROM_PTR (curr);
3858 if (use == lookfor)
3859 break;
3860 gimple *def = SSA_NAME_DEF_STMT (use);
3861 if (gimple_nop_p (def)
3862 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3863 {
3864 pop:
3865 do
3866 {
3867 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3868 curri = x.first;
3869 curr = x.second;
3870 do
3871 curr = op_iter_next_use (&curri);
3872 /* Skip already visited or non-SSA operands (from iterating
3873 over PHI args). */
3874 while (curr != NULL_USE_OPERAND_P
3875 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3876 || ! bitmap_set_bit (visited,
3877 SSA_NAME_VERSION
3878 (USE_FROM_PTR (curr)))));
3879 }
3880 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3881 if (curr == NULL_USE_OPERAND_P)
3882 break;
3883 }
3884 else
3885 {
3886 if (gimple_code (def) == GIMPLE_PHI)
3887 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3888 else
3889 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3890 while (curr != NULL_USE_OPERAND_P
3891 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3892 || ! bitmap_set_bit (visited,
3893 SSA_NAME_VERSION
3894 (USE_FROM_PTR (curr)))))
3895 curr = op_iter_next_use (&curri);
3896 if (curr == NULL_USE_OPERAND_P)
3897 goto pop;
3898 }
3899 }
3900 while (1);
3901 if (dump_file && (dump_flags & TDF_DETAILS))
3902 {
3903 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3904 unsigned i;
3905 std::pair<ssa_op_iter, use_operand_p> *x;
3906 FOR_EACH_VEC_ELT (path, i, x)
3907 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3908 dump_printf (MSG_NOTE, "\n");
3909 }
3910
3911 /* Check whether the reduction path detected is valid. */
3912 bool fail = path.length () == 0;
3913 bool neg = false;
3914 int sign = -1;
3915 *code = ERROR_MARK;
3916 for (unsigned i = 1; i < path.length (); ++i)
3917 {
3918 gimple *use_stmt = USE_STMT (path[i].second);
3919 gimple_match_op op;
3920 if (!gimple_extract_op (use_stmt, &op))
3921 {
3922 fail = true;
3923 break;
3924 }
3925 unsigned int opi = op.num_ops;
3926 if (gassign *assign = dyn_cast<gassign *> (use_stmt))
3927 {
3928 /* The following makes sure we can compute the operand index
3929 easily, plus it mostly disallows chaining via COND_EXPR condition
3930 operands. */
3931 for (opi = 0; opi < op.num_ops; ++opi)
3932 if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
3933 break;
3934 }
3935 else if (gcall *call = dyn_cast<gcall *> (use_stmt))
3936 {
3937 for (opi = 0; opi < op.num_ops; ++opi)
3938 if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
3939 break;
3940 }
3941 if (opi == op.num_ops)
3942 {
3943 fail = true;
3944 break;
3945 }
3946 op.code = canonicalize_code (op.code, op.type);
3947 if (op.code == MINUS_EXPR)
3948 {
3949 op.code = PLUS_EXPR;
3950 /* Track whether we negate the reduction value each iteration. */
3951 if (op.ops[1] == op.ops[opi])
3952 neg = ! neg;
3953 }
3954 if (CONVERT_EXPR_CODE_P (op.code)
3955 && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
3956 ;
3957 else if (*code == ERROR_MARK)
3958 {
3959 *code = op.code;
3960 sign = TYPE_SIGN (op.type);
3961 }
3962 else if (op.code != *code)
3963 {
3964 fail = true;
3965 break;
3966 }
3967 else if ((op.code == MIN_EXPR
3968 || op.code == MAX_EXPR)
3969 && sign != TYPE_SIGN (op.type))
3970 {
3971 fail = true;
3972 break;
3973 }
3974 /* Check that the op is used on only a single stmt. For the
3975 non-value-changing tail and the last stmt, allow out-of-loop uses.
3976 ??? We could relax this and handle arbitrary live stmts by
3977 forcing a scalar epilogue for example. */
3978 imm_use_iterator imm_iter;
3979 gimple *op_use_stmt;
3980 unsigned cnt = 0;
3981 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
3982 if (!is_gimple_debug (op_use_stmt)
3983 && (*code != ERROR_MARK
3984 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3985 {
3986 /* We want to allow x + x but not x < 1 ? x : 2. */
3987 if (is_gimple_assign (op_use_stmt)
3988 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3989 {
3990 use_operand_p use_p;
3991 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3992 cnt++;
3993 }
3994 else
3995 cnt++;
3996 }
3997 if (cnt != 1)
3998 {
3999 fail = true;
4000 break;
4001 }
4002 }
4003 return ! fail && ! neg && *code != ERROR_MARK;
4004 }
4005
4006 bool
4007 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4008 tree loop_arg, enum tree_code code)
4009 {
4010 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4011 code_helper code_;
4012 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
4013 && code_ == code);
4014 }
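
/* A small worked example of a reduction path (for illustration): given

     s_1 = PHI <s_0 (preheader), s_3 (latch)>
     t_2 = ...;
     s_3 = s_1 + t_2;

   starting from the latch value s_3 the walk above reaches the PHI result
   s_1 in one step, the path covers { s_3, s_1 }, every statement on it
   uses PLUS_EXPR, so *CODE is set to PLUS_EXPR and the function returns
   true.  A path mixing, say, PLUS_EXPR and MAX_EXPR would fail instead.  */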
4015
4016
4017
4018 /* Function vect_is_simple_reduction
4019
4020 (1) Detect a cross-iteration def-use cycle that represents a simple
4021 reduction computation. We look for the following pattern:
4022
4023 loop_header:
4024 a1 = phi < a0, a2 >
4025 a3 = ...
4026 a2 = operation (a3, a1)
4027
4028 or
4029
4030 a3 = ...
4031 loop_header:
4032 a1 = phi < a0, a2 >
4033 a2 = operation (a3, a1)
4034
4035 such that:
4036 1. operation is commutative and associative and it is safe to
4037 change the order of the computation
4038 2. no uses for a2 in the loop (a2 is used out of the loop)
4039 3. no uses of a1 in the loop besides the reduction operation
4040 4. no uses of a1 outside the loop.
4041
4042 Conditions 1,4 are tested here.
4043 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
4044
4045 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
4046 nested cycles.
4047
4048 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
4049 reductions:
4050
4051 a1 = phi < a0, a2 >
4052 inner loop (def of a3)
4053 a2 = phi < a3 >
4054
4055 (4) Detect condition expressions, i.e.:
4056 for (int i = 0; i < N; i++)
4057 if (a[i] < val)
4058 ret_val = a[i];
4059
4060 */
4061
4062 static stmt_vec_info
4063 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
4064 bool *double_reduc, bool *reduc_chain_p, bool slp)
4065 {
4066 gphi *phi = as_a <gphi *> (phi_info->stmt);
4067 gimple *phi_use_stmt = NULL;
4068 imm_use_iterator imm_iter;
4069 use_operand_p use_p;
4070
4071 *double_reduc = false;
4072 *reduc_chain_p = false;
4073 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
4074
4075 tree phi_name = PHI_RESULT (phi);
4076 /* ??? If there are no uses of the PHI result the inner loop reduction
4077 won't be detected as possibly double-reduction by vectorizable_reduction
4078 because that tries to walk the PHI arg from the preheader edge which
4079 can be constant. See PR60382. */
4080 if (has_zero_uses (phi_name))
4081 return NULL;
4082 class loop *loop = (gimple_bb (phi))->loop_father;
4083 unsigned nphi_def_loop_uses = 0;
4084 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
4085 {
4086 gimple *use_stmt = USE_STMT (use_p);
4087 if (is_gimple_debug (use_stmt))
4088 continue;
4089
4090 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4091 {
4092 if (dump_enabled_p ())
4093 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4094 "intermediate value used outside loop.\n");
4095
4096 return NULL;
4097 }
4098
4099 nphi_def_loop_uses++;
4100 phi_use_stmt = use_stmt;
4101 }
4102
4103 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
4104 if (TREE_CODE (latch_def) != SSA_NAME)
4105 {
4106 if (dump_enabled_p ())
4107 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4108 "reduction: not ssa_name: %T\n", latch_def);
4109 return NULL;
4110 }
4111
4112 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
4113 if (!def_stmt_info
4114 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
4115 return NULL;
4116
4117 bool nested_in_vect_loop
4118 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
4119 unsigned nlatch_def_loop_uses = 0;
4120 auto_vec<gphi *, 3> lcphis;
4121 bool inner_loop_of_double_reduc = false;
4122 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
4123 {
4124 gimple *use_stmt = USE_STMT (use_p);
4125 if (is_gimple_debug (use_stmt))
4126 continue;
4127 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4128 nlatch_def_loop_uses++;
4129 else
4130 {
4131 /* We can have more than one loop-closed PHI. */
4132 lcphis.safe_push (as_a <gphi *> (use_stmt));
4133 if (nested_in_vect_loop
4134 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
4135 == vect_double_reduction_def))
4136 inner_loop_of_double_reduc = true;
4137 }
4138 }
4139
4140 /* If we are vectorizing an inner reduction, we execute it in the
4141 original order only when we are not dealing with a
4142 double reduction. */
4143 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
4144 {
4145 if (dump_enabled_p ())
4146 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
4147 "detected nested cycle: ");
4148 return def_stmt_info;
4149 }
4150
4151 /* When the inner loop of a double reduction ends up with more than
4152 one loop-closed PHI we have failed to classify alternate such
4153 PHIs as double reduction, leading to wrong code. See PR103237. */
4154 if (inner_loop_of_double_reduc && lcphis.length () != 1)
4155 {
4156 if (dump_enabled_p ())
4157 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4158 "unhandle double reduction\n");
4159 return NULL;
4160 }
4161
4162 /* If this isn't a nested cycle or if the nested cycle reduction value
4163 is used outside of the inner loop we cannot handle uses of the reduction
4164 value. */
4165 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
4166 {
4167 if (dump_enabled_p ())
4168 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4169 "reduction used in loop.\n");
4170 return NULL;
4171 }
4172
4173 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
4174 defined in the inner loop. */
4175 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
4176 {
4177 tree op1 = PHI_ARG_DEF (def_stmt, 0);
4178 if (gimple_phi_num_args (def_stmt) != 1
4179 || TREE_CODE (op1) != SSA_NAME)
4180 {
4181 if (dump_enabled_p ())
4182 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4183 "unsupported phi node definition.\n");
4184
4185 return NULL;
4186 }
4187
4188 /* Verify there is an inner cycle composed of the PHI phi_use_stmt
4189 and the latch definition op1. */
4190 gimple *def1 = SSA_NAME_DEF_STMT (op1);
4191 if (gimple_bb (def1)
4192 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4193 && loop->inner
4194 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
4195 && (is_gimple_assign (def1) || is_gimple_call (def1))
4196 && is_a <gphi *> (phi_use_stmt)
4197 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
4198 && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
4199 loop_latch_edge (loop->inner))))
4200 {
4201 if (dump_enabled_p ())
4202 report_vect_op (MSG_NOTE, def_stmt,
4203 "detected double reduction: ");
4204
4205 *double_reduc = true;
4206 return def_stmt_info;
4207 }
4208
4209 return NULL;
4210 }
4211
4212 /* Look for the expression computing latch_def from the loop PHI result. */
4213 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4214 code_helper code;
4215 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
4216 path))
4217 {
4218 STMT_VINFO_REDUC_CODE (phi_info) = code;
4219 if (code == COND_EXPR && !nested_in_vect_loop)
4220 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
4221
4222 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
4223 reduction chain for which the additional restriction is that
4224 all operations in the chain are the same. */
4225 auto_vec<stmt_vec_info, 8> reduc_chain;
4226 unsigned i;
4227 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
4228 for (i = path.length () - 1; i >= 1; --i)
4229 {
4230 gimple *stmt = USE_STMT (path[i].second);
4231 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
4232 gimple_match_op op;
4233 if (!gimple_extract_op (stmt, &op))
4234 gcc_unreachable ();
4235 if (gassign *assign = dyn_cast<gassign *> (stmt))
4236 STMT_VINFO_REDUC_IDX (stmt_info)
4237 = path[i].second->use - gimple_assign_rhs1_ptr (assign);
4238 else
4239 {
4240 gcall *call = as_a<gcall *> (stmt);
4241 STMT_VINFO_REDUC_IDX (stmt_info)
4242 = path[i].second->use - gimple_call_arg_ptr (call, 0);
4243 }
4244 bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
4245 && (i == 1 || i == path.length () - 1));
4246 if ((op.code != code && !leading_conversion)
4247 /* We can only handle the final value in epilogue
4248 generation for reduction chains. */
4249 || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
4250 is_slp_reduc = false;
4251 /* For reduction chains we support trailing/leading
4252 conversions. We do not store those in the actual chain. */
4253 if (leading_conversion)
4254 continue;
4255 reduc_chain.safe_push (stmt_info);
4256 }
4257 if (slp && is_slp_reduc && reduc_chain.length () > 1)
4258 {
4259 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
4260 {
4261 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
4262 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
4263 }
4264 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
4265 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
4266
4267 /* Save the chain for further analysis in SLP detection. */
4268 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
4269 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
4270
4271 *reduc_chain_p = true;
4272 if (dump_enabled_p ())
4273 dump_printf_loc (MSG_NOTE, vect_location,
4274 "reduction: detected reduction chain\n");
4275 }
4276 else if (dump_enabled_p ())
4277 dump_printf_loc (MSG_NOTE, vect_location,
4278 "reduction: detected reduction\n");
4279
4280 return def_stmt_info;
4281 }
4282
4283 if (dump_enabled_p ())
4284 dump_printf_loc (MSG_NOTE, vect_location,
4285 "reduction: unknown pattern\n");
4286
4287 return NULL;
4288 }
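
/* Sketch of a reduction chain as detected above (illustration only):

     for (int i = 0; i < n; i++)
       {
         s += a[4*i + 0];
         s += a[4*i + 1];
         s += a[4*i + 2];
         s += a[4*i + 3];
       }

   gives a path of four PLUS_EXPR statements, all with single uses, so
   they are linked through REDUC_GROUP_FIRST/NEXT_ELEMENT with group size
   4 and recorded in LOOP_VINFO_REDUCTION_CHAINS for SLP detection.  */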
4289
4290 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4291 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4292 or -1 if not known. */
4293
4294 static int
4295 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
4296 {
4297 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4298 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
4299 {
4300 if (dump_enabled_p ())
4301 dump_printf_loc (MSG_NOTE, vect_location,
4302 "cost model: epilogue peel iters set to vf/2 "
4303 "because loop iterations are unknown .\n");
4304 return assumed_vf / 2;
4305 }
4306 else
4307 {
4308 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4309 peel_iters_prologue = MIN (niters, peel_iters_prologue);
4310 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
4311 /* If we need to peel for gaps but no epilogue peeling would otherwise
4312 be required, we have to peel VF iterations. */
4313 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4314 peel_iters_epilogue = assumed_vf;
4315 return peel_iters_epilogue;
4316 }
4317 }
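
/* Worked example for the known-NITERS case above (numbers made up):
   with NITERS = 100, an assumed VF of 8 and PEEL_ITERS_PROLOGUE = 3,
   the epilogue peels (100 - 3) % 8 = 1 iteration.  If the remainder had
   been 0 but LOOP_VINFO_PEELING_FOR_GAPS were set, a full VF = 8
   iterations would be peeled instead.  */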
4318
4319 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
4320 int
4321 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4322 int *peel_iters_epilogue,
4323 stmt_vector_for_cost *scalar_cost_vec,
4324 stmt_vector_for_cost *prologue_cost_vec,
4325 stmt_vector_for_cost *epilogue_cost_vec)
4326 {
4327 int retval = 0;
4328
4329 *peel_iters_epilogue
4330 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4331
4332 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4333 {
4334 /* If peeled iterations are known but the number of scalar loop
4335 iterations is unknown, count a taken branch per peeled loop. */
4336 if (peel_iters_prologue > 0)
4337 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4338 vect_prologue);
4339 if (*peel_iters_epilogue > 0)
4340 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4341 vect_epilogue);
4342 }
4343
4344 stmt_info_for_cost *si;
4345 int j;
4346 if (peel_iters_prologue)
4347 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4348 retval += record_stmt_cost (prologue_cost_vec,
4349 si->count * peel_iters_prologue,
4350 si->kind, si->stmt_info, si->misalign,
4351 vect_prologue);
4352 if (*peel_iters_epilogue)
4353 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4354 retval += record_stmt_cost (epilogue_cost_vec,
4355 si->count * *peel_iters_epilogue,
4356 si->kind, si->stmt_info, si->misalign,
4357 vect_epilogue);
4358
4359 return retval;
4360 }
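
/* Rough example of the accounting above: if PEEL_ITERS_PROLOGUE is 2 and
   the epilogue works out to 3 iterations, every entry of SCALAR_COST_VEC
   is recorded twice in PROLOGUE_COST_VEC and three times in
   EPILOGUE_COST_VEC; when the scalar iteration count is unknown, one
   cond_branch_taken is additionally charged for each non-empty peel.  */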
4361
4362 /* Function vect_estimate_min_profitable_iters
4363
4364 Return the number of iterations required for the vector version of the
4365 loop to be profitable relative to the cost of the scalar version of the
4366 loop.
4367
4368 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4369 of iterations for vectorization. A value of -1 means loop vectorization
4370 is not profitable. This returned value may be used for a dynamic
4371 profitability check.
4372
4373 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4374 for static check against estimated number of iterations. */
4375
4376 static void
4377 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4378 int *ret_min_profitable_niters,
4379 int *ret_min_profitable_estimate,
4380 unsigned *suggested_unroll_factor)
4381 {
4382 int min_profitable_iters;
4383 int min_profitable_estimate;
4384 int peel_iters_prologue;
4385 int peel_iters_epilogue;
4386 unsigned vec_inside_cost = 0;
4387 int vec_outside_cost = 0;
4388 unsigned vec_prologue_cost = 0;
4389 unsigned vec_epilogue_cost = 0;
4390 int scalar_single_iter_cost = 0;
4391 int scalar_outside_cost = 0;
4392 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4393 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4394 vector_costs *target_cost_data = loop_vinfo->vector_costs;
4395
4396 /* Cost model disabled. */
4397 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4398 {
4399 if (dump_enabled_p ())
4400 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4401 *ret_min_profitable_niters = 0;
4402 *ret_min_profitable_estimate = 0;
4403 return;
4404 }
4405
4406 /* Requires loop versioning tests to handle misalignment. */
4407 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4408 {
4409 /* FIXME: Make cost depend on complexity of individual check. */
4410 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4411 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4412 if (dump_enabled_p ())
4413 dump_printf (MSG_NOTE,
4414 "cost model: Adding cost of checks for loop "
4415 "versioning to treat misalignment.\n");
4416 }
4417
4418 /* Requires loop versioning with alias checks. */
4419 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4420 {
4421 /* FIXME: Make cost depend on complexity of individual check. */
4422 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4423 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4424 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4425 if (len)
4426 /* Count LEN - 1 ANDs and LEN comparisons. */
4427 (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4428 scalar_stmt, vect_prologue);
4429 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4430 if (len)
4431 {
4432 /* Count LEN - 1 ANDs and LEN comparisons. */
4433 unsigned int nstmts = len * 2 - 1;
4434 /* +1 for each bias that needs adding. */
4435 for (unsigned int i = 0; i < len; ++i)
4436 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4437 nstmts += 1;
4438 (void) add_stmt_cost (target_cost_data, nstmts,
4439 scalar_stmt, vect_prologue);
4440 }
4441 if (dump_enabled_p ())
4442 dump_printf (MSG_NOTE,
4443 "cost model: Adding cost of checks for loop "
4444 "versioning aliasing.\n");
4445 }
4446
4447 /* Requires loop versioning with niter checks. */
4448 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4449 {
4450 /* FIXME: Make cost depend on complexity of individual check. */
4451 (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4452 NULL, NULL, NULL_TREE, 0, vect_prologue);
4453 if (dump_enabled_p ())
4454 dump_printf (MSG_NOTE,
4455 "cost model: Adding cost of checks for loop "
4456 "versioning niters.\n");
4457 }
4458
4459 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4460 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4461 vect_prologue);
4462
4463 /* Count statements in scalar loop. Using this as scalar cost for a single
4464 iteration for now.
4465
4466 TODO: Add outer loop support.
4467
4468 TODO: Consider assigning different costs to different scalar
4469 statements. */
4470
4471 scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4472
4473 /* Add additional cost for the peeled instructions in prologue and epilogue
4474 loop. (For fully-masked loops there will be no peeling.)
4475
4476 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4477 at compile-time - we assume it's vf/2 (the worst would be vf-1).
4478
4479 TODO: Build an expression that represents peel_iters for prologue and
4480 epilogue to be used in a run-time test. */
4481
4482 bool prologue_need_br_taken_cost = false;
4483 bool prologue_need_br_not_taken_cost = false;
4484
4485 /* Calculate peel_iters_prologue. */
4486 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4487 peel_iters_prologue = 0;
4488 else if (npeel < 0)
4489 {
4490 peel_iters_prologue = assumed_vf / 2;
4491 if (dump_enabled_p ())
4492 dump_printf (MSG_NOTE, "cost model: "
4493 "prologue peel iters set to vf/2.\n");
4494
4495 /* If peeled iterations are unknown, count a taken branch and a not taken
4496 branch per peeled loop. Even if scalar loop iterations are known,
4497 vector iterations are not known since peeled prologue iterations are
4498 not known. Hence guards remain the same. */
4499 prologue_need_br_taken_cost = true;
4500 prologue_need_br_not_taken_cost = true;
4501 }
4502 else
4503 {
4504 peel_iters_prologue = npeel;
4505 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4506 /* If peeled iterations are known but the number of scalar loop
4507 iterations is unknown, count a taken branch per peeled loop. */
4508 prologue_need_br_taken_cost = true;
4509 }
4510
4511 bool epilogue_need_br_taken_cost = false;
4512 bool epilogue_need_br_not_taken_cost = false;
4513
4514 /* Calculate peel_iters_epilogue. */
4515 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4516 /* We need to peel exactly one iteration for gaps. */
4517 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4518 else if (npeel < 0)
4519 {
4520 /* If peeling for alignment is unknown, the loop bound of the main loop
4521 becomes unknown. */
4522 peel_iters_epilogue = assumed_vf / 2;
4523 if (dump_enabled_p ())
4524 dump_printf (MSG_NOTE, "cost model: "
4525 "epilogue peel iters set to vf/2 because "
4526 "peeling for alignment is unknown.\n");
4527
4528 /* See the same reason above in peel_iters_prologue calculation. */
4529 epilogue_need_br_taken_cost = true;
4530 epilogue_need_br_not_taken_cost = true;
4531 }
4532 else
4533 {
4534 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4535 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4536 /* If peeled iterations are known but the number of scalar loop
4537 iterations is unknown, count a taken branch per peeled loop. */
4538 epilogue_need_br_taken_cost = true;
4539 }
4540
4541 stmt_info_for_cost *si;
4542 int j;
4543 /* Add costs associated with peel_iters_prologue. */
4544 if (peel_iters_prologue)
4545 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4546 {
4547 (void) add_stmt_cost (target_cost_data,
4548 si->count * peel_iters_prologue, si->kind,
4549 si->stmt_info, si->node, si->vectype,
4550 si->misalign, vect_prologue);
4551 }
4552
4553 /* Add costs associated with peel_iters_epilogue. */
4554 if (peel_iters_epilogue)
4555 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4556 {
4557 (void) add_stmt_cost (target_cost_data,
4558 si->count * peel_iters_epilogue, si->kind,
4559 si->stmt_info, si->node, si->vectype,
4560 si->misalign, vect_epilogue);
4561 }
4562
4563 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4564
4565 if (prologue_need_br_taken_cost)
4566 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4567 vect_prologue);
4568
4569 if (prologue_need_br_not_taken_cost)
4570 (void) add_stmt_cost (target_cost_data, 1,
4571 cond_branch_not_taken, vect_prologue);
4572
4573 if (epilogue_need_br_taken_cost)
4574 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4575 vect_epilogue);
4576
4577 if (epilogue_need_br_not_taken_cost)
4578 (void) add_stmt_cost (target_cost_data, 1,
4579 cond_branch_not_taken, vect_epilogue);
4580
4581 /* Take care of special costs for rgroup controls of partial vectors. */
4582 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4583 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4584 == vect_partial_vectors_avx512))
4585 {
4586 /* Calculate how many masks we need to generate. */
4587 unsigned int num_masks = 0;
4588 bool need_saturation = false;
4589 for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4590 if (rgm.type)
4591 {
4592 unsigned nvectors = rgm.factor;
4593 num_masks += nvectors;
4594 if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4595 < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4596 need_saturation = true;
4597 }
4598
4599 /* ??? The target isn't able to identify the costs below as
4600 producing masks so it cannot penalize cases where we'd run
4601 out of mask registers, for example. */
4602
4603 /* ??? We are also failing to account for smaller vector masks
4604 we generate by splitting larger masks in vect_get_loop_mask. */
4605
4606 /* In the worst case, we need to generate each mask in the prologue
4607 and in the loop body. We need one splat per group and one
4608 compare per mask.
4609
4610 Sometimes the prologue mask will fold to a constant,
4611 so the actual prologue cost might be smaller. However, it's
4612 simpler and safer to use the worst-case cost; if this ends up
4613 being the tie-breaker between vectorizing or not, then it's
4614 probably better not to vectorize. */
4615 (void) add_stmt_cost (target_cost_data,
4616 num_masks
4617 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4618 vector_stmt, NULL, NULL, NULL_TREE, 0,
4619 vect_prologue);
4620 (void) add_stmt_cost (target_cost_data,
4621 num_masks
4622 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4623 vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4624
4625 /* When we need saturation we need it both in the prologue and
4626 the epilogue. */
4627 if (need_saturation)
4628 {
4629 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4630 NULL, NULL, NULL_TREE, 0, vect_prologue);
4631 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4632 NULL, NULL, NULL_TREE, 0, vect_body);
4633 }
4634 }
4635 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4636 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4637 == vect_partial_vectors_while_ult))
4638 {
4639 /* Calculate how many masks we need to generate. */
4640 unsigned int num_masks = 0;
4641 rgroup_controls *rgm;
4642 unsigned int num_vectors_m1;
4643 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4644 num_vectors_m1, rgm)
4645 if (rgm->type)
4646 num_masks += num_vectors_m1 + 1;
4647 gcc_assert (num_masks > 0);
4648
4649 /* In the worst case, we need to generate each mask in the prologue
4650 and in the loop body. One of the loop body mask instructions
4651 replaces the comparison in the scalar loop, and since we don't
4652 count the scalar comparison against the scalar body, we shouldn't
4653 count that vector instruction against the vector body either.
4654
4655 Sometimes we can use unpacks instead of generating prologue
4656 masks and sometimes the prologue mask will fold to a constant,
4657 so the actual prologue cost might be smaller. However, it's
4658 simpler and safer to use the worst-case cost; if this ends up
4659 being the tie-breaker between vectorizing or not, then it's
4660 probably better not to vectorize. */
4661 (void) add_stmt_cost (target_cost_data, num_masks,
4662 vector_stmt, NULL, NULL, NULL_TREE, 0,
4663 vect_prologue);
4664 (void) add_stmt_cost (target_cost_data, num_masks - 1,
4665 vector_stmt, NULL, NULL, NULL_TREE, 0,
4666 vect_body);
4667 }
4668 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4669 {
4670 /* Referring to the functions vect_set_loop_condition_partial_vectors
4671 and vect_set_loop_controls_directly, we need to generate each
4672 length in the prologue and in the loop body if required. Although
4673 there are some possible optimizations, we consider the worst case
4674 here. */
4675
4676 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4677 signed char partial_load_store_bias
4678 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4679 bool need_iterate_p
4680 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4681 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4682
4683 /* Calculate how many statements to be added. */
4684 unsigned int prologue_stmts = 0;
4685 unsigned int body_stmts = 0;
4686
4687 rgroup_controls *rgc;
4688 unsigned int num_vectors_m1;
4689 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4690 if (rgc->type)
4691 {
4692 /* May need one SHIFT for nitems_total computation. */
4693 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4694 if (nitems != 1 && !niters_known_p)
4695 prologue_stmts += 1;
4696
4697 /* May need one MAX and one MINUS for wrap around. */
4698 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4699 prologue_stmts += 2;
4700
4701 /* Need one MAX and one MINUS for each batch limit except for
4702 the first one. */
4703 prologue_stmts += num_vectors_m1 * 2;
4704
4705 unsigned int num_vectors = num_vectors_m1 + 1;
4706
4707 /* Need to set up lengths in prologue, only one MIN required
4708 for each since start index is zero. */
4709 prologue_stmts += num_vectors;
4710
4711 /* If we have a non-zero partial load bias, we need one PLUS
4712 to adjust the load length. */
4713 if (partial_load_store_bias != 0)
4714 body_stmts += 1;
4715
4716 /* Each may need two MINs and one MINUS to update lengths in body
4717 for next iteration. */
4718 if (need_iterate_p)
4719 body_stmts += 3 * num_vectors;
4720 }
4721
4722 (void) add_stmt_cost (target_cost_data, prologue_stmts,
4723 scalar_stmt, vect_prologue);
4724 (void) add_stmt_cost (target_cost_data, body_stmts,
4725 scalar_stmt, vect_body);
4726 }
4727
4728 /* FORNOW: The scalar outside cost is incremented in one of the
4729 following ways:
4730
4731 1. The vectorizer checks for alignment and aliasing and generates
4732 a condition that allows dynamic vectorization. A cost model
4733 check is ANDED with the versioning condition. Hence scalar code
4734 path now has the added cost of the versioning check.
4735
4736 if (cost > th & versioning_check)
4737 jmp to vector code
4738
4739 Hence run-time scalar is incremented by not-taken branch cost.
4740
4741 2. The vectorizer then checks if a prologue is required. If the
4742 cost model check was not done before during versioning, it has to
4743 be done before the prologue check.
4744
4745 if (cost <= th)
4746 prologue = scalar_iters
4747 if (prologue == 0)
4748 jmp to vector code
4749 else
4750 execute prologue
4751 if (prologue == num_iters)
4752 go to exit
4753
4754 Hence the run-time scalar cost is incremented by a taken branch,
4755 plus a not-taken branch, plus a taken branch cost.
4756
4757 3. The vectorizer then checks if an epilogue is required. If the
4758 cost model check was not done before during prologue check, it
4759 has to be done with the epilogue check.
4760
4761 if (prologue == 0)
4762 jmp to vector code
4763 else
4764 execute prologue
4765 if (prologue == num_iters)
4766 go to exit
4767 vector code:
4768 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4769 jmp to epilogue
4770
4771 Hence the run-time scalar cost should be incremented by 2 taken
4772 branches.
4773
4774 TODO: The back end may reorder the BBs differently and reverse
4775 conditions/branch directions. Change the estimates below to
4776 something more reasonable. */
4777
4778 /* If the number of iterations is known and we do not do versioning, we can
4779 decide whether to vectorize at compile time. Hence the scalar version
4780 does not carry cost model guard costs. */
4781 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4782 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4783 {
4784 /* Cost model check occurs at versioning. */
4785 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4786 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4787 else
4788 {
4789 /* Cost model check occurs at prologue generation. */
4790 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4791 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4792 + vect_get_stmt_cost (cond_branch_not_taken);
4793 /* Cost model check occurs at epilogue generation. */
4794 else
4795 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4796 }
4797 }
4798
4799 /* Complete the target-specific cost calculations. */
4800 finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
4801 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
4802 suggested_unroll_factor);
4803
4804 if (suggested_unroll_factor && *suggested_unroll_factor > 1
4805 && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4806 && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4807 *suggested_unroll_factor,
4808 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4809 {
4810 if (dump_enabled_p ())
4811 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4812 "can't unroll as unrolled vectorization factor larger"
4813 " than maximum vectorization factor: "
4814 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4815 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4816 *suggested_unroll_factor = 1;
4817 }
4818
4819 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4820
4821 if (dump_enabled_p ())
4822 {
4823 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4824 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4825 vec_inside_cost);
4826 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4827 vec_prologue_cost);
4828 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4829 vec_epilogue_cost);
4830 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4831 scalar_single_iter_cost);
4832 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4833 scalar_outside_cost);
4834 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4835 vec_outside_cost);
4836 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4837 peel_iters_prologue);
4838 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4839 peel_iters_epilogue);
4840 }
4841
4842 /* Calculate number of iterations required to make the vector version
4843 profitable, relative to the loop bodies only. The following condition
4844 must hold true:
4845 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4846 where
4847 SIC = scalar iteration cost, VIC = vector iteration cost,
4848 VOC = vector outside cost, VF = vectorization factor,
4849 NPEEL = prologue iterations + epilogue iterations,
4850 SOC = scalar outside cost for run time cost model check. */
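
/* A made-up numeric instance of the condition above: with SIC = 4,
   VIC = 10, VF = 4, NPEEL = 3, VOC = 20 and SOC = 0, each vector
   iteration saves SIC * VF - VIC = 6 units and the outside overhead is
   VOC - SIC * NPEEL - SOC = 8, so roughly 8 / 6 + 1 = 2 vector
   iterations (about 2 * VF + NPEEL = 11 scalar iterations) are needed
   before the vector version wins; the code below performs the
   equivalent computation with the actual costs.  */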
4851
4852 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4853 - vec_inside_cost);
4854 if (saving_per_viter <= 0)
4855 {
4856 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4857 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4858 "vectorization did not happen for a simd loop");
4859
4860 if (dump_enabled_p ())
4861 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4862 "cost model: the vector iteration cost = %d "
4863 "divided by the scalar iteration cost = %d "
4864 "is greater or equal to the vectorization factor = %d"
4865 ".\n",
4866 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4867 *ret_min_profitable_niters = -1;
4868 *ret_min_profitable_estimate = -1;
4869 return;
4870 }
4871
4872 /* ??? The "if" arm is written to handle all cases; see below for what
4873 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4874 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4875 {
4876 /* Rewriting the condition above in terms of the number of
4877 vector iterations (vniters) rather than the number of
4878 scalar iterations (niters) gives:
4879
4880 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4881
4882 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4883
4884 For integer N, X and Y when X > 0:
4885
4886 N * X > Y <==> N >= (Y /[floor] X) + 1. */
4887 int outside_overhead = (vec_outside_cost
4888 - scalar_single_iter_cost * peel_iters_prologue
4889 - scalar_single_iter_cost * peel_iters_epilogue
4890 - scalar_outside_cost);
4891 /* We're only interested in cases that require at least one
4892 vector iteration. */
4893 int min_vec_niters = 1;
4894 if (outside_overhead > 0)
4895 min_vec_niters = outside_overhead / saving_per_viter + 1;
4896
4897 if (dump_enabled_p ())
4898 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4899 min_vec_niters);
4900
4901 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4902 {
4903 /* Now that we know the minimum number of vector iterations,
4904 find the minimum niters for which the scalar cost is larger:
4905
4906 SIC * niters > VIC * vniters + VOC - SOC
4907
4908 We know that the minimum niters is no more than
4909 vniters * VF + NPEEL, but it might be (and often is) less
4910 than that if a partial vector iteration is cheaper than the
4911 equivalent scalar code. */
4912 int threshold = (vec_inside_cost * min_vec_niters
4913 + vec_outside_cost
4914 - scalar_outside_cost);
4915 if (threshold <= 0)
4916 min_profitable_iters = 1;
4917 else
4918 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4919 }
4920 else
4921 /* Convert the number of vector iterations into a number of
4922 scalar iterations. */
4923 min_profitable_iters = (min_vec_niters * assumed_vf
4924 + peel_iters_prologue
4925 + peel_iters_epilogue);
4926 }
4927 else
4928 {
4929 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4930 * assumed_vf
4931 - vec_inside_cost * peel_iters_prologue
4932 - vec_inside_cost * peel_iters_epilogue);
4933 if (min_profitable_iters <= 0)
4934 min_profitable_iters = 0;
4935 else
4936 {
4937 min_profitable_iters /= saving_per_viter;
4938
4939 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4940 <= (((int) vec_inside_cost * min_profitable_iters)
4941 + (((int) vec_outside_cost - scalar_outside_cost)
4942 * assumed_vf)))
4943 min_profitable_iters++;
4944 }
4945 }
4946
4947 if (dump_enabled_p ())
4948 dump_printf (MSG_NOTE,
4949 " Calculated minimum iters for profitability: %d\n",
4950 min_profitable_iters);
4951
4952 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4953 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4954 /* We want the vectorized loop to execute at least once. */
4955 min_profitable_iters = assumed_vf + peel_iters_prologue;
4956 else if (min_profitable_iters < peel_iters_prologue)
4957 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4958 vectorized loop executes at least once. */
4959 min_profitable_iters = peel_iters_prologue;
4960
4961 if (dump_enabled_p ())
4962 dump_printf_loc (MSG_NOTE, vect_location,
4963 " Runtime profitability threshold = %d\n",
4964 min_profitable_iters);
4965
4966 *ret_min_profitable_niters = min_profitable_iters;
4967
4968 /* Calculate number of iterations required to make the vector version
4969 profitable, relative to the loop bodies only.
4970
4971 The non-vectorized variant costs SIC * niters and must win over the vector
4972 variant at the expected loop trip count. The following condition must hold true:
4973 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4974
4975 if (vec_outside_cost <= 0)
4976 min_profitable_estimate = 0;
4977 /* ??? This "else if" arm is written to handle all cases; see below for
4978 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4979 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4980 {
4981 /* This is a repeat of the code above, but with + SOC rather
4982 than - SOC. */
4983 int outside_overhead = (vec_outside_cost
4984 - scalar_single_iter_cost * peel_iters_prologue
4985 - scalar_single_iter_cost * peel_iters_epilogue
4986 + scalar_outside_cost);
4987 int min_vec_niters = 1;
4988 if (outside_overhead > 0)
4989 min_vec_niters = outside_overhead / saving_per_viter + 1;
4990
4991 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4992 {
4993 int threshold = (vec_inside_cost * min_vec_niters
4994 + vec_outside_cost
4995 + scalar_outside_cost);
4996 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4997 }
4998 else
4999 min_profitable_estimate = (min_vec_niters * assumed_vf
5000 + peel_iters_prologue
5001 + peel_iters_epilogue);
5002 }
5003 else
5004 {
5005 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
5006 * assumed_vf
5007 - vec_inside_cost * peel_iters_prologue
5008 - vec_inside_cost * peel_iters_epilogue)
5009 / ((scalar_single_iter_cost * assumed_vf)
5010 - vec_inside_cost);
5011 }
5012 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
5013 if (dump_enabled_p ())
5014 dump_printf_loc (MSG_NOTE, vect_location,
5015 " Static estimate profitability threshold = %d\n",
5016 min_profitable_estimate);
5017
5018 *ret_min_profitable_estimate = min_profitable_estimate;
5019 }
5020
5021 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
5022 vector elements (not bits) for a vector with NELT elements. */
5023 static void
5024 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
5025 vec_perm_builder *sel)
5026 {
5027 /* The encoding is a single stepped pattern. Any wrap-around is handled
5028 by vec_perm_indices. */
5029 sel->new_vector (nelt, 1, 3);
5030 for (unsigned int i = 0; i < 3; i++)
5031 sel->quick_push (i + offset);
5032 }
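/* For example, calling calc_vec_perm_mask_for_shift with OFFSET = 2 and
   NELT = 8 encodes the stepped series 2, 3, 4, which vec_perm_indices
   extends to 2, 3, 4, 5, 6, 7, 8, 9: a two-operand permute selecting the
   last six elements of the first vector followed by the first two
   elements of the second, i.e. a whole-vector shift by two elements.  */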
5033
5034 /* Checks whether the target supports whole-vector shifts for vectors of mode
5035 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
5036 it supports vec_perm_const with masks for all necessary shift amounts. */
5037 static bool
5038 have_whole_vector_shift (machine_mode mode)
5039 {
5040 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
5041 return true;
5042
5043 /* Variable-length vectors should be handled via the optab. */
5044 unsigned int nelt;
5045 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5046 return false;
5047
5048 vec_perm_builder sel;
5049 vec_perm_indices indices;
5050 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
5051 {
5052 calc_vec_perm_mask_for_shift (i, nelt, &sel);
5053 indices.new_vector (sel, 2, nelt);
5054 if (!can_vec_perm_const_p (mode, mode, indices, false))
5055 return false;
5056 }
5057 return true;
5058 }
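/* For an 8-element vector, for instance, the loop above checks shifts by
   4, 2 and 1 elements, which are the halving steps a shift-based
   reduction of such a vector would use.  */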
5059
5060 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
5061 multiplication operands have differing signs and (b) we intend
5062 to emulate the operation using a series of signed DOT_PROD_EXPRs.
5063 See vect_emulate_mixed_dot_prod for the actual sequence used. */
5064
5065 static bool
5066 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
5067 stmt_vec_info stmt_info)
5068 {
5069 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
5070 if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
5071 return false;
5072
5073 tree rhs1 = gimple_assign_rhs1 (assign);
5074 tree rhs2 = gimple_assign_rhs2 (assign);
5075 if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
5076 return false;
5077
5078 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5079 gcc_assert (reduc_info->is_reduc_info);
5080 return !directly_supported_p (DOT_PROD_EXPR,
5081 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
5082 optab_vector_mixed_sign);
5083 }
5084
5085 /* TODO: There is a close dependency between the vect_model_*_cost and the
5086 vectorizable_* functions. Improve the design to avoid maintenance issues. */
5087
5088 /* Function vect_model_reduction_cost.
5089
5090 Models cost for a reduction operation, including the vector ops
5091 generated within the strip-mine loop in some cases, the initial
5092 definition before the loop, and the epilogue code that must be generated. */
5093
5094 static void
5095 vect_model_reduction_cost (loop_vec_info loop_vinfo,
5096 stmt_vec_info stmt_info, internal_fn reduc_fn,
5097 vect_reduction_type reduction_type,
5098 int ncopies, stmt_vector_for_cost *cost_vec)
5099 {
5100 int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
5101 tree vectype;
5102 machine_mode mode;
5103 class loop *loop = NULL;
5104
5105 if (loop_vinfo)
5106 loop = LOOP_VINFO_LOOP (loop_vinfo);
5107
5108 /* Condition reductions generate two reductions in the loop. */
5109 if (reduction_type == COND_REDUCTION)
5110 ncopies *= 2;
5111
5112 vectype = STMT_VINFO_VECTYPE (stmt_info);
5113 mode = TYPE_MODE (vectype);
5114 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5115
5116 gimple_match_op op;
5117 if (!gimple_extract_op (orig_stmt_info->stmt, &op))
5118 gcc_unreachable ();
5119
5120 bool emulated_mixed_dot_prod
5121 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
5122 if (reduction_type == EXTRACT_LAST_REDUCTION)
5123 /* No extra instructions are needed in the prologue. The loop body
5124 operations are costed in vectorizable_condition. */
5125 inside_cost = 0;
5126 else if (reduction_type == FOLD_LEFT_REDUCTION)
5127 {
5128 /* No extra instructions needed in the prologue. */
5129 prologue_cost = 0;
5130
5131 if (reduc_fn != IFN_LAST)
5132 /* Count one reduction-like operation per vector. */
5133 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
5134 stmt_info, 0, vect_body);
5135 else
5136 {
5137 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
5138 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
5139 inside_cost = record_stmt_cost (cost_vec, nelements,
5140 vec_to_scalar, stmt_info, 0,
5141 vect_body);
5142 inside_cost += record_stmt_cost (cost_vec, nelements,
5143 scalar_stmt, stmt_info, 0,
5144 vect_body);
5145 }
5146 }
5147 else
5148 {
5149 /* Add in the cost of the initial definitions. */
5150 int prologue_stmts;
5151 if (reduction_type == COND_REDUCTION)
5152 /* For cond reductions we have four vectors: initial index, step,
5153 initial result of the data reduction, initial value of the index
5154 reduction. */
5155 prologue_stmts = 4;
5156 else if (emulated_mixed_dot_prod)
5157 /* We need the initial reduction value and two invariants:
5158 one that contains the minimum signed value and one that
5159 contains half of its negative. */
5160 prologue_stmts = 3;
5161 else
5162 prologue_stmts = 1;
5163 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
5164 scalar_to_vec, stmt_info, 0,
5165 vect_prologue);
5166 }
5167
5168 /* Determine cost of epilogue code.
5169
5170 We have a reduction operator that will reduce the vector in one statement.
5171 Also requires scalar extract. */
5172
5173 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
5174 {
5175 if (reduc_fn != IFN_LAST)
5176 {
5177 if (reduction_type == COND_REDUCTION)
5178 {
5179 /* An EQ stmt and a COND_EXPR stmt. */
5180 epilogue_cost += record_stmt_cost (cost_vec, 2,
5181 vector_stmt, stmt_info, 0,
5182 vect_epilogue);
5183 /* Reduction of the max index and a reduction of the found
5184 values. */
5185 epilogue_cost += record_stmt_cost (cost_vec, 2,
5186 vec_to_scalar, stmt_info, 0,
5187 vect_epilogue);
5188 /* A broadcast of the max value. */
5189 epilogue_cost += record_stmt_cost (cost_vec, 1,
5190 scalar_to_vec, stmt_info, 0,
5191 vect_epilogue);
5192 }
5193 else
5194 {
5195 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
5196 stmt_info, 0, vect_epilogue);
5197 epilogue_cost += record_stmt_cost (cost_vec, 1,
5198 vec_to_scalar, stmt_info, 0,
5199 vect_epilogue);
5200 }
5201 }
5202 else if (reduction_type == COND_REDUCTION)
5203 {
5204 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
5205 /* Extraction of scalar elements. */
5206 epilogue_cost += record_stmt_cost (cost_vec,
5207 2 * estimated_nunits,
5208 vec_to_scalar, stmt_info, 0,
5209 vect_epilogue);
5210 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
5211 epilogue_cost += record_stmt_cost (cost_vec,
5212 2 * estimated_nunits - 3,
5213 scalar_stmt, stmt_info, 0,
5214 vect_epilogue);
5215 }
5216 else if (reduction_type == EXTRACT_LAST_REDUCTION
5217 || reduction_type == FOLD_LEFT_REDUCTION)
5218 /* No extra instructions are needed in the epilogue. */
5219 ;
5220 else
5221 {
5222 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5223 tree bitsize = TYPE_SIZE (op.type);
5224 int element_bitsize = tree_to_uhwi (bitsize);
5225 int nelements = vec_size_in_bits / element_bitsize;
5226
5227 if (op.code == COND_EXPR)
5228 op.code = MAX_EXPR;
5229
5230 /* We have a whole vector shift available. */
5231 if (VECTOR_MODE_P (mode)
5232 && directly_supported_p (op.code, vectype)
5233 && have_whole_vector_shift (mode))
5234 {
5235 /* Final reduction via vector shifts and the reduction operator.
5236 Also requires scalar extract. */
5237 epilogue_cost += record_stmt_cost (cost_vec,
5238 exact_log2 (nelements) * 2,
5239 vector_stmt, stmt_info, 0,
5240 vect_epilogue);
5241 epilogue_cost += record_stmt_cost (cost_vec, 1,
5242 vec_to_scalar, stmt_info, 0,
5243 vect_epilogue);
5244 }
5245 else
5246 /* Use extracts and reduction op for final reduction. For N
5247 elements, we have N extracts and N-1 reduction ops. */
5248 epilogue_cost += record_stmt_cost (cost_vec,
5249 nelements + nelements - 1,
5250 vector_stmt, stmt_info, 0,
5251 vect_epilogue);
5252 }
5253 }
5254
5255 if (dump_enabled_p ())
5256 dump_printf (MSG_NOTE,
5257 "vect_model_reduction_cost: inside_cost = %d, "
5258 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
5259 prologue_cost, epilogue_cost);
5260 }
5261
5262 /* SEQ is a sequence of instructions that initialize the reduction
5263 described by REDUC_INFO. Emit them in the appropriate place. */
5264
5265 static void
5266 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
5267 stmt_vec_info reduc_info, gimple *seq)
5268 {
5269 if (reduc_info->reused_accumulator)
5270 {
5271 /* When reusing an accumulator from the main loop, we only need
5272 initialization instructions if the main loop can be skipped.
5273 In that case, emit the initialization instructions at the end
5274 of the guard block that does the skip. */
5275 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5276 gcc_assert (skip_edge);
5277 gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
5278 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5279 }
5280 else
5281 {
5282 /* The normal case: emit the initialization instructions on the
5283 preheader edge. */
5284 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5285 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
5286 }
5287 }
5288
5289 /* Function get_initial_def_for_reduction
5290
5291 Input:
5292 REDUC_INFO - the info_for_reduction
5293 INIT_VAL - the initial value of the reduction variable
5294 NEUTRAL_OP - a value that has no effect on the reduction, as per
5295 neutral_op_for_reduction
5296
5297 Output:
5298 Return a vector variable, initialized according to the operation that
5299 STMT_VINFO performs. This vector will be used as the initial value
5300 of the vector of partial results.
5301
5302 The value we need is a vector in which element 0 has value INIT_VAL
5303 and every other element has value NEUTRAL_OP. */
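/* For example (with illustrative values), a signed add reduction with
   INIT_VAL 5, NEUTRAL_OP 0 and a four-element vector type yields
   { 5, 0, 0, 0 }.  For MIN and MAX reductions the neutral value is the
   initial value itself, so the vector degenerates to a splat of INIT_VAL
   and the cheaper path below is used.  */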
5304
5305 static tree
5306 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
5307 stmt_vec_info reduc_info,
5308 tree init_val, tree neutral_op)
5309 {
5310 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5311 tree scalar_type = TREE_TYPE (init_val);
5312 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
5313 tree init_def;
5314 gimple_seq stmts = NULL;
5315
5316 gcc_assert (vectype);
5317
5318 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
5319 || SCALAR_FLOAT_TYPE_P (scalar_type));
5320
5321 gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
5322 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
5323
5324 if (operand_equal_p (init_val, neutral_op))
5325 {
5326 /* If both elements are equal then the vector described above is
5327 just a splat. */
5328 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5329 init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
5330 }
5331 else
5332 {
5333 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5334 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
5335 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
5336 {
5337 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
5338 element 0. */
5339 init_def = gimple_build_vector_from_val (&stmts, vectype,
5340 neutral_op);
5341 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
5342 vectype, init_def, init_val);
5343 }
5344 else
5345 {
5346 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
5347 tree_vector_builder elts (vectype, 1, 2);
5348 elts.quick_push (init_val);
5349 elts.quick_push (neutral_op);
5350 init_def = gimple_build_vector (&stmts, &elts);
5351 }
5352 }
5353
5354 if (stmts)
5355 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
5356 return init_def;
5357 }
5358
5359 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5360 which performs a reduction involving GROUP_SIZE scalar statements.
5361 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
5362 is nonnull, introducing extra elements of that value will not change the
5363 result. */
5364
5365 static void
5366 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5367 stmt_vec_info reduc_info,
5368 vec<tree> *vec_oprnds,
5369 unsigned int number_of_vectors,
5370 unsigned int group_size, tree neutral_op)
5371 {
5372 vec<tree> &initial_values = reduc_info->reduc_initial_values;
5373 unsigned HOST_WIDE_INT nunits;
5374 unsigned j, number_of_places_left_in_vector;
5375 tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5376 unsigned int i;
5377
5378 gcc_assert (group_size == initial_values.length () || neutral_op);
5379
5380 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5381 created vectors. It is greater than 1 if unrolling is performed.
5382
5383 For example, we have two scalar operands, s1 and s2 (e.g., group of
5384 strided accesses of size two), while NUNITS is four (i.e., four scalars
5385 of this type can be packed in a vector). The output vector will contain
5386 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5387 will be 2).
5388
5389 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5390 vectors containing the operands.
5391
5392 For example, NUNITS is four as before, and the group size is 8
5393 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5394 {s5, s6, s7, s8}. */
5395
5396 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5397 nunits = group_size;
5398
5399 number_of_places_left_in_vector = nunits;
5400 bool constant_p = true;
5401 tree_vector_builder elts (vector_type, nunits, 1);
5402 elts.quick_grow (nunits);
5403 gimple_seq ctor_seq = NULL;
5404 for (j = 0; j < nunits * number_of_vectors; ++j)
5405 {
5406 tree op;
5407 i = j % group_size;
5408
5409 /* Get the def before the loop. In a reduction chain we have only
5410 one initial value; otherwise we have as many initial values as there are PHIs in the group. */
5411 if (i >= initial_values.length () || (j > i && neutral_op))
5412 op = neutral_op;
5413 else
5414 op = initial_values[i];
5415
5416 /* Create 'vect_ = {op0,op1,...,opn}'. */
5417 number_of_places_left_in_vector--;
5418 elts[nunits - number_of_places_left_in_vector - 1] = op;
5419 if (!CONSTANT_CLASS_P (op))
5420 constant_p = false;
5421
5422 if (number_of_places_left_in_vector == 0)
5423 {
5424 tree init;
5425 if (constant_p && !neutral_op
5426 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5427 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5428 /* Build the vector directly from ELTS. */
5429 init = gimple_build_vector (&ctor_seq, &elts);
5430 else if (neutral_op)
5431 {
5432 /* Build a vector of the neutral value and shift the
5433 other elements into place. */
5434 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5435 neutral_op);
5436 int k = nunits;
5437 while (k > 0 && elts[k - 1] == neutral_op)
5438 k -= 1;
5439 while (k > 0)
5440 {
5441 k -= 1;
5442 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5443 vector_type, init, elts[k]);
5444 }
5445 }
5446 else
5447 {
5448 /* First time round, duplicate ELTS to fill the
5449 required number of vectors. */
5450 duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5451 elts, number_of_vectors, *vec_oprnds);
5452 break;
5453 }
5454 vec_oprnds->quick_push (init);
5455
5456 number_of_places_left_in_vector = nunits;
5457 elts.new_vector (vector_type, nunits, 1);
5458 elts.quick_grow (nunits);
5459 constant_p = true;
5460 }
5461 }
5462 if (ctor_seq != NULL)
5463 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5464 }
5465
5466 /* For a statement STMT_INFO taking part in a reduction operation return
5467 the stmt_vec_info the meta information is stored on. */
5468
5469 stmt_vec_info
5470 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5471 {
5472 stmt_info = vect_orig_stmt (stmt_info);
5473 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5474 if (!is_a <gphi *> (stmt_info->stmt)
5475 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5476 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5477 gphi *phi = as_a <gphi *> (stmt_info->stmt);
5478 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5479 {
5480 if (gimple_phi_num_args (phi) == 1)
5481 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5482 }
5483 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5484 {
5485 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5486 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5487 stmt_info = info;
5488 }
5489 return stmt_info;
5490 }
5491
5492 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5493 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5494 return false. */
5495
5496 static bool
5497 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5498 stmt_vec_info reduc_info)
5499 {
5500 loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5501 if (!main_loop_vinfo)
5502 return false;
5503
5504 if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5505 return false;
5506
5507 unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5508 auto_vec<tree, 16> main_loop_results (num_phis);
5509 auto_vec<tree, 16> initial_values (num_phis);
5510 if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5511 {
5512 /* The epilogue loop can be entered either from the main loop or
5513 from an earlier guard block. */
5514 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5515 for (tree incoming_value : reduc_info->reduc_initial_values)
5516 {
5517 /* Look for:
5518
5519 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5520 INITIAL_VALUE(guard block)>. */
5521 gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5522
5523 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5524 gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5525
5526 tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5527 tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5528
5529 main_loop_results.quick_push (from_main_loop);
5530 initial_values.quick_push (from_skip);
5531 }
5532 }
5533 else
5534 /* The main loop dominates the epilogue loop. */
5535 main_loop_results.splice (reduc_info->reduc_initial_values);
5536
5537 /* See if the main loop has the kind of accumulator we need. */
5538 vect_reusable_accumulator *accumulator
5539 = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5540 if (!accumulator
5541 || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5542 || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5543 accumulator->reduc_info->reduc_scalar_results.begin ()))
5544 return false;
5545
5546 /* Handle the case where we can reduce wider vectors to narrower ones. */
5547 tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5548 tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5549 unsigned HOST_WIDE_INT m;
5550 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5551 TYPE_VECTOR_SUBPARTS (vectype), &m))
5552 return false;
5553 /* Check the intermediate vector types and operations are available. */
5554 tree prev_vectype = old_vectype;
5555 poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5556 while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5557 {
5558 intermediate_nunits = exact_div (intermediate_nunits, 2);
5559 tree intermediate_vectype = get_related_vectype_for_scalar_type
5560 (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5561 if (!intermediate_vectype
5562 || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5563 intermediate_vectype)
5564 || !can_vec_extract (TYPE_MODE (prev_vectype),
5565 TYPE_MODE (intermediate_vectype)))
5566 return false;
5567 prev_vectype = intermediate_vectype;
5568 }
5569
5570 /* Non-SLP reductions might apply an adjustment after the reduction
5571 operation, in order to simplify the initialization of the accumulator.
5572 If the epilogue loop carries on from where the main loop left off,
5573 it should apply the same adjustment to the final reduction result.
5574
5575 If the epilogue loop can also be entered directly (rather than via
5576 the main loop), we need to be able to handle that case in the same way,
5577 with the same adjustment. (In principle we could add a PHI node
5578 to select the correct adjustment, but in practice that shouldn't be
5579 necessary.) */
5580 tree main_adjustment
5581 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5582 if (loop_vinfo->main_loop_edge && main_adjustment)
5583 {
5584 gcc_assert (num_phis == 1);
5585 tree initial_value = initial_values[0];
5586 /* Check that we can use INITIAL_VALUE as the adjustment and
5587 initialize the accumulator with a neutral value instead. */
5588 if (!operand_equal_p (initial_value, main_adjustment))
5589 return false;
5590 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5591 initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5592 code, initial_value);
5593 }
5594 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5595 reduc_info->reduc_initial_values.truncate (0);
5596 reduc_info->reduc_initial_values.splice (initial_values);
5597 reduc_info->reused_accumulator = accumulator;
5598 return true;
5599 }
5600
5601 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5602 CODE, appending the emitted stmts to SEQ. Returns a vector def of VECTYPE. */
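/* Rough sketch of the effect, assuming the target supports the modes
   involved: reducing a V8SI VEC_DEF to a V4SI VECTYPE with PLUS_EXPR
   extracts the low and high V4SI halves (directly via vec_extract, or by
   punning through a two-element integer-mode vector) and adds them; a
   larger ratio simply repeats the halving step.  */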
5603
5604 static tree
5605 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5606 gimple_seq *seq)
5607 {
5608 unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5609 unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5610 tree stype = TREE_TYPE (vectype);
5611 tree new_temp = vec_def;
5612 while (nunits > nunits1)
5613 {
5614 nunits /= 2;
5615 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5616 stype, nunits);
5617 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5618
5619 /* The target has to make sure we support lowpart/highpart
5620 extraction, either via direct vector extract or through
5621 an integer mode punning. */
5622 tree dst1, dst2;
5623 gimple *epilog_stmt;
5624 if (convert_optab_handler (vec_extract_optab,
5625 TYPE_MODE (TREE_TYPE (new_temp)),
5626 TYPE_MODE (vectype1))
5627 != CODE_FOR_nothing)
5628 {
5629 /* Extract sub-vectors directly once vec_extract becomes
5630 a conversion optab. */
5631 dst1 = make_ssa_name (vectype1);
5632 epilog_stmt
5633 = gimple_build_assign (dst1, BIT_FIELD_REF,
5634 build3 (BIT_FIELD_REF, vectype1,
5635 new_temp, TYPE_SIZE (vectype1),
5636 bitsize_int (0)));
5637 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5638 dst2 = make_ssa_name (vectype1);
5639 epilog_stmt
5640 = gimple_build_assign (dst2, BIT_FIELD_REF,
5641 build3 (BIT_FIELD_REF, vectype1,
5642 new_temp, TYPE_SIZE (vectype1),
5643 bitsize_int (bitsize)));
5644 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5645 }
5646 else
5647 {
5648 /* Extract via punning to appropriately sized integer mode
5649 vector. */
5650 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5651 tree etype = build_vector_type (eltype, 2);
5652 gcc_assert (convert_optab_handler (vec_extract_optab,
5653 TYPE_MODE (etype),
5654 TYPE_MODE (eltype))
5655 != CODE_FOR_nothing);
5656 tree tem = make_ssa_name (etype);
5657 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5658 build1 (VIEW_CONVERT_EXPR,
5659 etype, new_temp));
5660 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5661 new_temp = tem;
5662 tem = make_ssa_name (eltype);
5663 epilog_stmt
5664 = gimple_build_assign (tem, BIT_FIELD_REF,
5665 build3 (BIT_FIELD_REF, eltype,
5666 new_temp, TYPE_SIZE (eltype),
5667 bitsize_int (0)));
5668 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5669 dst1 = make_ssa_name (vectype1);
5670 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5671 build1 (VIEW_CONVERT_EXPR,
5672 vectype1, tem));
5673 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5674 tem = make_ssa_name (eltype);
5675 epilog_stmt
5676 = gimple_build_assign (tem, BIT_FIELD_REF,
5677 build3 (BIT_FIELD_REF, eltype,
5678 new_temp, TYPE_SIZE (eltype),
5679 bitsize_int (bitsize)));
5680 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5681 dst2 = make_ssa_name (vectype1);
5682 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5683 build1 (VIEW_CONVERT_EXPR,
5684 vectype1, tem));
5685 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5686 }
5687
5688 new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5689 }
5690
5691 return new_temp;
5692 }
5693
5694 /* Function vect_create_epilog_for_reduction
5695
5696 Create code at the loop-epilog to finalize the result of a reduction
5697 computation.
5698
5699 STMT_INFO is the scalar reduction stmt that is being vectorized.
5700 SLP_NODE is an SLP node containing a group of reduction statements. The
5701 first one in this group is STMT_INFO.
5702 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5703 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5704 (counting from 0)
5705
5706 This function:
5707 1. Completes the reduction def-use cycles.
5708 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5709 by calling the function specified by REDUC_FN if available, or by
5710 other means (whole-vector shifts or a scalar loop).
5711 The function also creates a new phi node at the loop exit to preserve
5712 loop-closed form, as illustrated below.
5713
5714 The flow at the entry to this function:
5715
5716 loop:
5717 vec_def = phi <vec_init, null> # REDUCTION_PHI
5718 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5719 s_loop = scalar_stmt # (scalar) STMT_INFO
5720 loop_exit:
5721 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5722 use <s_out0>
5723 use <s_out0>
5724
5725 The above is transformed by this function into:
5726
5727 loop:
5728 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5729 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5730 s_loop = scalar_stmt # (scalar) STMT_INFO
5731 loop_exit:
5732 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5733 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5734 v_out2 = reduce <v_out1>
5735 s_out3 = extract_field <v_out2, 0>
5736 s_out4 = adjust_result <s_out3>
5737 use <s_out4>
5738 use <s_out4>
5739 */
5740
5741 static void
5742 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5743 stmt_vec_info stmt_info,
5744 slp_tree slp_node,
5745 slp_instance slp_node_instance)
5746 {
5747 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5748 gcc_assert (reduc_info->is_reduc_info);
5749 /* For double reductions we need to get at the inner loop reduction
5750 stmt which has the meta info attached. Our stmt_info is that of the
5751 loop-closed PHI of the inner loop which we remember as
5752 def for the reduction PHI generation. */
5753 bool double_reduc = false;
5754 stmt_vec_info rdef_info = stmt_info;
5755 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5756 {
5757 gcc_assert (!slp_node);
5758 double_reduc = true;
5759 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5760 (stmt_info->stmt, 0));
5761 stmt_info = vect_stmt_to_vectorize (stmt_info);
5762 }
5763 gphi *reduc_def_stmt
5764 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5765 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5766 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5767 tree vectype;
5768 machine_mode mode;
5769 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5770 basic_block exit_bb;
5771 tree scalar_dest;
5772 tree scalar_type;
5773 gimple *new_phi = NULL, *phi;
5774 gimple_stmt_iterator exit_gsi;
5775 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5776 gimple *epilog_stmt = NULL;
5777 gimple *exit_phi;
5778 tree bitsize;
5779 tree def;
5780 tree orig_name, scalar_result;
5781 imm_use_iterator imm_iter, phi_imm_iter;
5782 use_operand_p use_p, phi_use_p;
5783 gimple *use_stmt;
5784 auto_vec<tree> reduc_inputs;
5785 int j, i;
5786 vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5787 unsigned int group_size = 1, k;
5788 auto_vec<gimple *> phis;
5789 /* SLP reduction without reduction chain, e.g.,
5790 # a1 = phi <a2, a0>
5791 # b1 = phi <b2, b0>
5792 a2 = operation (a1)
5793 b2 = operation (b1) */
5794 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5795 bool direct_slp_reduc;
5796 tree induction_index = NULL_TREE;
5797
5798 if (slp_node)
5799 group_size = SLP_TREE_LANES (slp_node);
5800
5801 if (nested_in_vect_loop_p (loop, stmt_info))
5802 {
5803 outer_loop = loop;
5804 loop = loop->inner;
5805 gcc_assert (!slp_node && double_reduc);
5806 }
5807
5808 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5809 gcc_assert (vectype);
5810 mode = TYPE_MODE (vectype);
5811
5812 tree induc_val = NULL_TREE;
5813 tree adjustment_def = NULL;
5814 if (slp_node)
5815 ;
5816 else
5817 {
5818 /* Optimize: for induction condition reduction, if we can't use zero
5819 for induc_val, use initial_def. */
5820 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5821 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5822 else if (double_reduc)
5823 ;
5824 else
5825 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5826 }
5827
5828 stmt_vec_info single_live_out_stmt[] = { stmt_info };
5829 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5830 if (slp_reduc)
5831 /* All statements produce live-out values. */
5832 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5833 else if (slp_node)
5834 {
5835 /* The last statement in the reduction chain produces the live-out
5836 value. Note SLP optimization can shuffle scalar stmts to
5837 optimize permutations so we have to search for the last stmt. */
5838 for (k = 0; k < group_size; ++k)
5839 if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
5840 {
5841 single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5842 break;
5843 }
5844 }
5845
5846 unsigned vec_num;
5847 int ncopies;
5848 if (slp_node)
5849 {
5850 vec_num = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
5851 ncopies = 1;
5852 }
5853 else
5854 {
5855 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5856 vec_num = 1;
5857 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5858 }
5859
5860 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5861 which is updated with the current index of the loop for every match of
5862 the original loop's cond_expr (VEC_STMT). This results in a vector
5863 containing the last time the condition passed for that vector lane.
5864 The first match will be a 1 to allow 0 to be used for non-matching
5865 indexes. If there are no matches at all then the vector will be all
5866 zeroes.
5867
5868 PR92772: This algorithm is broken for architectures that support
5869 masked vectors, but do not provide fold_extract_last. */
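/* Illustrative example with made-up values and VF = 4: the induction
   vector starts at { 1, 2, 3, 4 } and steps by 4 each iteration.  A lane
   whose condition matched in the first and third vector iterations ends
   up holding its index from the third iteration (one of 9..12), while a
   lane that never matched keeps the initial 0.  */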
5870 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
5871 {
5872 auto_vec<std::pair<tree, bool>, 2> ccompares;
5873 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
5874 cond_info = vect_stmt_to_vectorize (cond_info);
5875 while (cond_info != reduc_info)
5876 {
5877 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5878 {
5879 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
5880 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5881 ccompares.safe_push
5882 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
5883 STMT_VINFO_REDUC_IDX (cond_info) == 2));
5884 }
5885 cond_info
5886 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
5887 1 + STMT_VINFO_REDUC_IDX
5888 (cond_info)));
5889 cond_info = vect_stmt_to_vectorize (cond_info);
5890 }
5891 gcc_assert (ccompares.length () != 0);
5892
5893 tree indx_before_incr, indx_after_incr;
5894 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5895 int scalar_precision
5896 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5897 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5898 tree cr_index_vector_type = get_related_vectype_for_scalar_type
5899 (TYPE_MODE (vectype), cr_index_scalar_type,
5900 TYPE_VECTOR_SUBPARTS (vectype));
5901
5902 /* First we create a simple vector induction variable which starts
5903 with the values {1,2,3,...} (SERIES_VECT) and increments by the
5904 vector size (STEP). */
5905
5906 /* Create a {1,2,3,...} vector. */
5907 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5908
5909 /* Create a vector of the step value. */
5910 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5911 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5912
5913 /* Create an induction variable. */
5914 gimple_stmt_iterator incr_gsi;
5915 bool insert_after;
5916 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5917 create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
5918 insert_after, &indx_before_incr, &indx_after_incr);
5919
5920 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5921 filled with zeros (VEC_ZERO). */
5922
5923 /* Create a vector of 0s. */
5924 tree zero = build_zero_cst (cr_index_scalar_type);
5925 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5926
5927 /* Create a vector phi node. */
5928 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5929 new_phi = create_phi_node (new_phi_tree, loop->header);
5930 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5931 loop_preheader_edge (loop), UNKNOWN_LOCATION);
5932
5933 /* Now take the condition from the loops original cond_exprs
5934 and produce a new cond_exprs (INDEX_COND_EXPR) which for
5935 every match uses values from the induction variable
5936 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5937 (NEW_PHI_TREE).
5938 Finally, we update the phi (NEW_PHI_TREE) to take the value of
5939 the new cond_expr (INDEX_COND_EXPR). */
5940 gimple_seq stmts = NULL;
5941 for (int i = ccompares.length () - 1; i != -1; --i)
5942 {
5943 tree ccompare = ccompares[i].first;
5944 if (ccompares[i].second)
5945 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5946 cr_index_vector_type,
5947 ccompare,
5948 indx_before_incr, new_phi_tree);
5949 else
5950 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5951 cr_index_vector_type,
5952 ccompare,
5953 new_phi_tree, indx_before_incr);
5954 }
5955 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5956
5957 /* Update the phi with the vec cond. */
5958 induction_index = new_phi_tree;
5959 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5960 loop_latch_edge (loop), UNKNOWN_LOCATION);
5961 }
5962
5963 /* 2. Create epilog code.
5964 The reduction epilog code operates across the elements of the vector
5965 of partial results computed by the vectorized loop.
5966 The reduction epilog code consists of:
5967
5968 step 1: compute the scalar result in a vector (v_out2)
5969 step 2: extract the scalar result (s_out3) from the vector (v_out2)
5970 step 3: adjust the scalar result (s_out3) if needed.
5971
5972 Step 1 can be accomplished using one of the following three schemes:
5973 (scheme 1) using reduc_fn, if available.
5974 (scheme 2) using whole-vector shifts, if available.
5975 (scheme 3) using a scalar loop. In this case steps 1+2 above are
5976 combined.
5977
5978 The overall epilog code looks like this:
5979
5980 s_out0 = phi <s_loop> # original EXIT_PHI
5981 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5982 v_out2 = reduce <v_out1> # step 1
5983 s_out3 = extract_field <v_out2, 0> # step 2
5984 s_out4 = adjust_result <s_out3> # step 3
5985
5986 (step 3 is optional, and steps 1 and 2 may be combined).
5987 Lastly, the uses of s_out0 are replaced by s_out4. */
5988
5989
5990 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5991 v_out1 = phi <VECT_DEF>
5992 Store them in NEW_PHIS. */
5993 if (double_reduc)
5994 loop = outer_loop;
5995 exit_bb = single_exit (loop)->dest;
5996 exit_gsi = gsi_after_labels (exit_bb);
5997 reduc_inputs.create (slp_node ? vec_num : ncopies);
5998 for (unsigned i = 0; i < vec_num; i++)
5999 {
6000 gimple_seq stmts = NULL;
6001 if (slp_node)
6002 def = vect_get_slp_vect_def (slp_node, i);
6003 else
6004 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
6005 for (j = 0; j < ncopies; j++)
6006 {
6007 tree new_def = copy_ssa_name (def);
6008 phi = create_phi_node (new_def, exit_bb);
6009 if (j)
6010 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
6011 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
6012 new_def = gimple_convert (&stmts, vectype, new_def);
6013 reduc_inputs.quick_push (new_def);
6014 }
6015 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6016 }
6017
6018 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
6019 (i.e. when reduc_fn is not available) and in the final adjustment
6020 code (if needed). Also get the original scalar reduction variable as
6021 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
6022 represents a reduction pattern), the tree-code and scalar-def are
6023 taken from the original stmt that the pattern-stmt (STMT) replaces.
6024 Otherwise (it is a regular reduction) - the tree-code and scalar-def
6025 are taken from STMT. */
6026
6027 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6028 if (orig_stmt_info != stmt_info)
6029 {
6030 /* Reduction pattern */
6031 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6032 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
6033 }
6034
6035 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
6036 scalar_type = TREE_TYPE (scalar_dest);
6037 scalar_results.truncate (0);
6038 scalar_results.reserve_exact (group_size);
6039 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
6040 bitsize = TYPE_SIZE (scalar_type);
6041
6042 /* True if we should implement SLP_REDUC using native reduction operations
6043 instead of scalar operations. */
6044 direct_slp_reduc = (reduc_fn != IFN_LAST
6045 && slp_reduc
6046 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
6047
6048 /* In case of reduction chain, e.g.,
6049 # a1 = phi <a3, a0>
6050 a2 = operation (a1)
6051 a3 = operation (a2),
6052
6053 we may end up with more than one vector result. Here we reduce them
6054 to one vector.
6055
6056 The same is true for a SLP reduction, e.g.,
6057 # a1 = phi <a2, a0>
6058 # b1 = phi <b2, b0>
6059 a2 = operation (a1)
6060 b2 = operation (b1),
6061
6062 where we can end up with more than one vector as well. We can
6063 easily accumulate vectors when the number of vector elements is
6064 a multiple of the SLP group size.
6065
6066 The same is true if we couldn't use a single defuse cycle. */
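/* For instance, if two partial-result vectors v0 and v1 were produced,
   the block below first combines them (v0 + v1 for an add reduction) so
   that only a single vector reaches the epilogue schemes further down.  */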
6067 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6068 || direct_slp_reduc
6069 || (slp_reduc
6070 && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
6071 || ncopies > 1)
6072 {
6073 gimple_seq stmts = NULL;
6074 tree single_input = reduc_inputs[0];
6075 for (k = 1; k < reduc_inputs.length (); k++)
6076 single_input = gimple_build (&stmts, code, vectype,
6077 single_input, reduc_inputs[k]);
6078 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6079
6080 reduc_inputs.truncate (0);
6081 reduc_inputs.safe_push (single_input);
6082 }
6083
6084 tree orig_reduc_input = reduc_inputs[0];
6085
6086 /* If this loop is an epilogue loop that can be skipped after the
6087 main loop, we can only share a reduction operation between the
6088 main loop and the epilogue if we put it at the target of the
6089 skip edge.
6090
6091 We can still reuse accumulators if this check fails. Doing so has
6092 the minor(?) benefit of making the epilogue loop's scalar result
6093 independent of the main loop's scalar result. */
6094 bool unify_with_main_loop_p = false;
6095 if (reduc_info->reused_accumulator
6096 && loop_vinfo->skip_this_loop_edge
6097 && single_succ_p (exit_bb)
6098 && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
6099 {
6100 unify_with_main_loop_p = true;
6101
6102 basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
6103 reduc_inputs[0] = make_ssa_name (vectype);
6104 gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
6105 add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
6106 UNKNOWN_LOCATION);
6107 add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
6108 loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
6109 exit_gsi = gsi_after_labels (reduc_block);
6110 }
6111
6112 /* Shouldn't be used beyond this point. */
6113 exit_bb = nullptr;
6114
6115 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6116 && reduc_fn != IFN_LAST)
6117 {
6118 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
6119 various data values where the condition matched and another vector
6120 (INDUCTION_INDEX) containing all the indexes of those matches. We
6121 need to extract the last matching index (which will be the index with
6122 highest value) and use this to index into the data vector.
6123 For the case where there were no matches, the data vector will contain
6124 all default values and the index vector will be all zeros. */
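/* As a made-up illustration: if REDUC_INPUTS[0] is { d0, d1, d2, d3 }
   and INDUCTION_INDEX is { 0, 9, 0, 7 }, the REDUC_MAX below yields 9,
   the comparison selects lane 1, the VEC_COND keeps { 0, d1, 0, 0 } and
   the final REDUC_MAX over the unsigned view extracts d1's bit pattern,
   which is then converted back to the scalar type.  */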
6125
6126 /* Get various versions of the type of the vector of indexes. */
6127 tree index_vec_type = TREE_TYPE (induction_index);
6128 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
6129 tree index_scalar_type = TREE_TYPE (index_vec_type);
6130 tree index_vec_cmp_type = truth_type_for (index_vec_type);
6131
6132 /* Get an unsigned integer version of the type of the data vector. */
6133 int scalar_precision
6134 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6135 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
6136 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
6137 vectype);
6138
6139 /* First we need to create a vector (ZERO_VEC) of zeros and another
6140 vector (MAX_INDEX_VEC) filled with the last matching index, which we
6141 can create using a MAX reduction and then expanding.
6142 In the case where the loop never made any matches, the max index will
6143 be zero. */
6144
6145 /* Vector of {0, 0, 0,...}. */
6146 tree zero_vec = build_zero_cst (vectype);
6147
6148 /* Find maximum value from the vector of found indexes. */
6149 tree max_index = make_ssa_name (index_scalar_type);
6150 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6151 1, induction_index);
6152 gimple_call_set_lhs (max_index_stmt, max_index);
6153 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
6154
6155 /* Vector of {max_index, max_index, max_index,...}. */
6156 tree max_index_vec = make_ssa_name (index_vec_type);
6157 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
6158 max_index);
6159 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
6160 max_index_vec_rhs);
6161 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
6162
6163 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
6164 with the vector (INDUCTION_INDEX) of found indexes, choosing values
6165 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
6166 otherwise. Only one value should match, resulting in a vector
6167 (VEC_COND) with one data value and the rest zeros.
6168 In the case where the loop never made any matches, every index will
6169 match, resulting in a vector with all data values (which will all be
6170 the default value). */
6171
6172 /* Compare the max index vector to the vector of found indexes to find
6173 the position of the max value. */
6174 tree vec_compare = make_ssa_name (index_vec_cmp_type);
6175 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
6176 induction_index,
6177 max_index_vec);
6178 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
6179
6180 /* Use the compare to choose either values from the data vector or
6181 zero. */
6182 tree vec_cond = make_ssa_name (vectype);
6183 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
6184 vec_compare,
6185 reduc_inputs[0],
6186 zero_vec);
6187 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
6188
6189 /* Finally we need to extract the data value from the vector (VEC_COND)
6190 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
6191 reduction, but because this doesn't exist, we can use a MAX reduction
6192 instead. The data value might be signed or a float so we need to cast
6193 it first.
6194 In the case where the loop never made any matches, the data values are
6195 all identical, and so will reduce down correctly. */
6196
6197 /* Make the matched data values unsigned. */
6198 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
6199 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
6200 vec_cond);
6201 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
6202 VIEW_CONVERT_EXPR,
6203 vec_cond_cast_rhs);
6204 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
6205
6206 /* Reduce down to a scalar value. */
6207 tree data_reduc = make_ssa_name (scalar_type_unsigned);
6208 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6209 1, vec_cond_cast);
6210 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
6211 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
6212
6213 /* Convert the reduced value back to the result type and set as the
6214 result. */
6215 gimple_seq stmts = NULL;
6216 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
6217 data_reduc);
6218 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6219 scalar_results.safe_push (new_temp);
6220 }
6221 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6222 && reduc_fn == IFN_LAST)
6223 {
6224 /* Condition reduction without supported IFN_REDUC_MAX. Generate
6225 the equivalent of:
6226 idx_val = induction_index[0];
6227 val = data_reduc[0];
6228 for (i = 1; i < nelts; ++i)
6229 if (induction_index[i] > idx_val)
6230 val = data_reduc[i], idx_val = induction_index[i];
6231 return val; */
6232
6233 tree data_eltype = TREE_TYPE (vectype);
6234 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
6235 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
6236 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
6237 /* Enforced by vectorizable_reduction, which ensures we have target
6238 support before allowing a conditional reduction on variable-length
6239 vectors. */
6240 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
6241 tree idx_val = NULL_TREE, val = NULL_TREE;
6242 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
6243 {
6244 tree old_idx_val = idx_val;
6245 tree old_val = val;
6246 idx_val = make_ssa_name (idx_eltype);
6247 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
6248 build3 (BIT_FIELD_REF, idx_eltype,
6249 induction_index,
6250 bitsize_int (el_size),
6251 bitsize_int (off)));
6252 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6253 val = make_ssa_name (data_eltype);
6254 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
6255 build3 (BIT_FIELD_REF,
6256 data_eltype,
6257 reduc_inputs[0],
6258 bitsize_int (el_size),
6259 bitsize_int (off)));
6260 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6261 if (off != 0)
6262 {
6263 tree new_idx_val = idx_val;
6264 if (off != v_size - el_size)
6265 {
6266 new_idx_val = make_ssa_name (idx_eltype);
6267 epilog_stmt = gimple_build_assign (new_idx_val,
6268 MAX_EXPR, idx_val,
6269 old_idx_val);
6270 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6271 }
6272 tree cond = make_ssa_name (boolean_type_node);
6273 epilog_stmt = gimple_build_assign (cond, GT_EXPR,
6274 idx_val, old_idx_val);
6275 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6276 tree new_val = make_ssa_name (data_eltype);
6277 epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
6278 cond, val, old_val);
6279 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6280 idx_val = new_idx_val;
6281 val = new_val;
6282 }
6283 }
6284 /* Convert the reduced value back to the result type and set as the
6285 result. */
6286 gimple_seq stmts = NULL;
6287 val = gimple_convert (&stmts, scalar_type, val);
6288 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6289 scalar_results.safe_push (val);
6290 }
6291
6292 /* 2.3 Create the reduction code, using one of the three schemes described
6293 above. In SLP we simply need to extract all the elements from the
6294 vector (without reducing them), so we use scalar shifts. */
6295 else if (reduc_fn != IFN_LAST && !slp_reduc)
6296 {
6297 tree tmp;
6298 tree vec_elem_type;
6299
6300 /* Case 1: Create:
6301 v_out2 = reduc_expr <v_out1> */
6302
6303 if (dump_enabled_p ())
6304 dump_printf_loc (MSG_NOTE, vect_location,
6305 "Reduce using direct vector reduction.\n");
6306
6307 gimple_seq stmts = NULL;
6308 vec_elem_type = TREE_TYPE (vectype);
6309 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
6310 vec_elem_type, reduc_inputs[0]);
6311 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6312 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6313
6314 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6315 && induc_val)
6316 {
6317 /* Earlier we set the initial value to be a vector of induc_val
6318 values. Check the result and if it is induc_val then replace
6319 with the original initial value, unless induc_val is
6320 the same as initial_def already. */
6321 tree zcompare = make_ssa_name (boolean_type_node);
6322 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6323 new_temp, induc_val);
6324 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6325 tree initial_def = reduc_info->reduc_initial_values[0];
6326 tmp = make_ssa_name (new_scalar_dest);
6327 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6328 initial_def, new_temp);
6329 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6330 new_temp = tmp;
6331 }
6332
6333 scalar_results.safe_push (new_temp);
6334 }
6335 else if (direct_slp_reduc)
6336 {
6337 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
6338 with the elements for other SLP statements replaced with the
6339 neutral value. We can then do a normal reduction on each vector. */
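/* Made-up illustration with GROUP_SIZE = 2: for a partial-result vector
   { a0, b0, a1, b1, ... } the masked index vector built below is
   { 0, 1, 0, 1, ... }.  For i = 0 the VEC_COND keeps the "a" lanes and
   fills the "b" lanes with the neutral value, and a full-vector
   reduction of the result gives the scalar result for the first SLP
   statement; i = 1 does the same for the "b" lanes.  */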
6340
6341 /* Enforced by vectorizable_reduction. */
6342 gcc_assert (reduc_inputs.length () == 1);
6343 gcc_assert (pow2p_hwi (group_size));
6344
6345 gimple_seq seq = NULL;
6346
6347 /* Build a vector {0, 1, 2, ...}, with the same number of elements
6348 and the same element size as VECTYPE. */
6349 tree index = build_index_vector (vectype, 0, 1);
6350 tree index_type = TREE_TYPE (index);
6351 tree index_elt_type = TREE_TYPE (index_type);
6352 tree mask_type = truth_type_for (index_type);
6353
6354 /* Create a vector that, for each element, identifies which of
6355 the REDUC_GROUP_SIZE results should use it. */
6356 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
6357 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
6358 build_vector_from_val (index_type, index_mask));
6359
6360 /* Get a neutral vector value. This is simply a splat of the neutral
6361 scalar value if we have one, otherwise the initial scalar value
6362 is itself a neutral value. */
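/* For reference: the neutral element is 0 for PLUS, BIT_IOR and BIT_XOR,
   1 for MULT and all-ones for BIT_AND, whereas MIN and MAX have no
   universal neutral value, which is why the initial scalar value is used
   for them below.  */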
6363 tree vector_identity = NULL_TREE;
6364 tree neutral_op = NULL_TREE;
6365 if (slp_node)
6366 {
6367 tree initial_value = NULL_TREE;
6368 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6369 initial_value = reduc_info->reduc_initial_values[0];
6370 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6371 initial_value);
6372 }
6373 if (neutral_op)
6374 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6375 neutral_op);
6376 for (unsigned int i = 0; i < group_size; ++i)
6377 {
6378 /* If there's no universal neutral value, we can use the
6379 initial scalar value from the original PHI. This is used
6380 for MIN and MAX reduction, for example. */
6381 if (!neutral_op)
6382 {
6383 tree scalar_value = reduc_info->reduc_initial_values[i];
6384 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6385 scalar_value);
6386 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6387 scalar_value);
6388 }
6389
6390 /* Calculate the equivalent of:
6391
6392 sel[j] = (index[j] == i);
6393
6394 which selects the elements of REDUC_INPUTS[0] that should
6395 be included in the result. */
6396 tree compare_val = build_int_cst (index_elt_type, i);
6397 compare_val = build_vector_from_val (index_type, compare_val);
6398 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6399 index, compare_val);
6400
6401 /* Calculate the equivalent of:
6402
6403 vec = sel ? reduc_inputs[0] : vector_identity;
6404
6405 VEC is now suitable for a full vector reduction. */
6406 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6407 sel, reduc_inputs[0], vector_identity);
6408
6409 /* Do the reduction and convert it to the appropriate type. */
6410 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6411 TREE_TYPE (vectype), vec);
6412 scalar = gimple_convert (&seq, scalar_type, scalar);
6413 scalar_results.safe_push (scalar);
6414 }
6415 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
6416 }
6417 else
6418 {
6419 bool reduce_with_shift;
6420 tree vec_temp;
6421
6422 gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6423
6424 /* See if the target wants to do the final (shift) reduction
6425 in a vector mode of smaller size and first reduce upper/lower
6426 halves against each other. */
6427 enum machine_mode mode1 = mode;
6428 tree stype = TREE_TYPE (vectype);
6429 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6430 unsigned nunits1 = nunits;
6431 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6432 && reduc_inputs.length () == 1)
6433 {
6434 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6435 /* For SLP reductions we have to make sure lanes match up, but
6436 since we're doing an individual-element final reduction,
6437 reducing the vector width here is even more important.
6438 ??? We could also separate lanes with permutes; for the common
6439 case of a power-of-two group size, odd/even extracts would work. */
6440 if (slp_reduc && nunits != nunits1)
6441 {
6442 nunits1 = least_common_multiple (nunits1, group_size);
6443 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6444 }
6445 }
6446 if (!slp_reduc
6447 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6448 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6449
6450 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6451 stype, nunits1);
6452 reduce_with_shift = have_whole_vector_shift (mode1);
6453 if (!VECTOR_MODE_P (mode1)
6454 || !directly_supported_p (code, vectype1))
6455 reduce_with_shift = false;
6456
6457 /* First reduce the vector to the desired vector size on which we
6458 should do the shift reduction, by combining upper and lower halves. */
6459 gimple_seq stmts = NULL;
6460 new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6461 code, &stmts);
6462 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6463 reduc_inputs[0] = new_temp;
6464
6465 if (reduce_with_shift && !slp_reduc)
6466 {
6467 int element_bitsize = tree_to_uhwi (bitsize);
6468 /* Enforced by vectorizable_reduction, which disallows SLP reductions
6469 for variable-length vectors and also requires direct target support
6470 for loop reductions. */
6471 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6472 int nelements = vec_size_in_bits / element_bitsize;
6473 vec_perm_builder sel;
6474 vec_perm_indices indices;
6475
6476 int elt_offset;
6477
6478 tree zero_vec = build_zero_cst (vectype1);
6479 /* Case 2: Create:
6480 for (offset = nelements/2; offset >= 1; offset/=2)
6481 {
6482 Create: va' = vec_shift <va, offset>
6483 Create: va = vop <va, va'>
6484 } */
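/* A concrete sketch of case 2, assuming nelements == 4, a PLUS reduction
   and va = {a, b, c, d}; the whole-vector shifts fill with zeros:

     step 1: va' = shift va by 2 = {c, d, 0, 0}
             va  = va + va'      = {a+c, b+d, ., .}
     step 2: va' = shift va by 1 = {b+d, ., ., .}
             va  = va + va'      = {a+b+c+d, ., ., .}

   after which lane 0 holds the scalar result (lanes shown as "." hold
   values that are ignored).  */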
6485
6486 tree rhs;
6487
6488 if (dump_enabled_p ())
6489 dump_printf_loc (MSG_NOTE, vect_location,
6490 "Reduce using vector shifts\n");
6491
6492 gimple_seq stmts = NULL;
6493 new_temp = gimple_convert (&stmts, vectype1, new_temp);
6494 for (elt_offset = nelements / 2;
6495 elt_offset >= 1;
6496 elt_offset /= 2)
6497 {
6498 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6499 indices.new_vector (sel, 2, nelements);
6500 tree mask = vect_gen_perm_mask_any (vectype1, indices);
6501 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6502 new_temp, zero_vec, mask);
6503 new_temp = gimple_build (&stmts, code,
6504 vectype1, new_name, new_temp);
6505 }
6506 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6507
6508 /* 2.4 Extract the final scalar result. Create:
6509 s_out3 = extract_field <v_out2, bitpos> */
6510
6511 if (dump_enabled_p ())
6512 dump_printf_loc (MSG_NOTE, vect_location,
6513 "extract scalar result\n");
6514
6515 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6516 bitsize, bitsize_zero_node);
6517 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6518 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6519 gimple_assign_set_lhs (epilog_stmt, new_temp);
6520 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6521 scalar_results.safe_push (new_temp);
6522 }
6523 else
6524 {
6525 /* Case 3: Create:
6526 s = extract_field <v_out2, 0>
6527 for (offset = element_size;
6528 offset < vector_size;
6529 offset += element_size;)
6530 {
6531 Create: s' = extract_field <v_out2, offset>
6532 Create: s = op <s, s'> // For non SLP cases
6533 } */
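/* A plain-C sketch of case 3, assuming a four-element integer vector and
   a PLUS reduction; extract_field becomes a plain array access here:

     int
     reduc_by_scalar_sketch (int v_out2[4])
     {
       int s = v_out2[0];
       for (int i = 1; i < 4; i++)
         s = s + v_out2[i];
       return s;
     }
 */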
6534
6535 if (dump_enabled_p ())
6536 dump_printf_loc (MSG_NOTE, vect_location,
6537 "Reduce using scalar code.\n");
6538
6539 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6540 int element_bitsize = tree_to_uhwi (bitsize);
6541 tree compute_type = TREE_TYPE (vectype);
6542 gimple_seq stmts = NULL;
6543 FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6544 {
6545 int bit_offset;
6546 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6547 vec_temp, bitsize, bitsize_zero_node);
6548
6549 /* In SLP we don't need to apply the reduction operation, so we just
6550 collect s' values in SCALAR_RESULTS. */
6551 if (slp_reduc)
6552 scalar_results.safe_push (new_temp);
6553
6554 for (bit_offset = element_bitsize;
6555 bit_offset < vec_size_in_bits;
6556 bit_offset += element_bitsize)
6557 {
6558 tree bitpos = bitsize_int (bit_offset);
6559 new_name = gimple_build (&stmts, BIT_FIELD_REF,
6560 compute_type, vec_temp,
6561 bitsize, bitpos);
6562 if (slp_reduc)
6563 {
6564 /* In SLP we don't need to apply the reduction operation, so
6565 we just collect s' values in SCALAR_RESULTS. */
6566 new_temp = new_name;
6567 scalar_results.safe_push (new_name);
6568 }
6569 else
6570 new_temp = gimple_build (&stmts, code, compute_type,
6571 new_name, new_temp);
6572 }
6573 }
6574
6575 /* The only case in which we need to reduce scalar results in SLP is
6576 unrolling. If the size of SCALAR_RESULTS is greater than
6577 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
6578 REDUC_GROUP_SIZE. */
6579 if (slp_reduc)
6580 {
6581 tree res, first_res, new_res;
6582
6583 /* Reduce multiple scalar results in case of SLP unrolling. */
6584 for (j = group_size; scalar_results.iterate (j, &res);
6585 j++)
6586 {
6587 first_res = scalar_results[j % group_size];
6588 new_res = gimple_build (&stmts, code, compute_type,
6589 first_res, res);
6590 scalar_results[j % group_size] = new_res;
6591 }
6592 scalar_results.truncate (group_size);
6593 for (k = 0; k < group_size; k++)
6594 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6595 scalar_results[k]);
6596 }
6597 else
6598 {
6599 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
6600 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6601 scalar_results.safe_push (new_temp);
6602 }
6603
6604 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6605 }
6606
6607 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6608 && induc_val)
6609 {
6610 /* Earlier we set the initial value to be a vector of induc_val
6611 values. Check the result and if it is induc_val then replace
6612 it with the original initial value, unless induc_val is
6613 the same as initial_def already. */
6614 tree zcompare = make_ssa_name (boolean_type_node);
6615 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp,
6616 induc_val);
6617 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6618 tree initial_def = reduc_info->reduc_initial_values[0];
6619 tree tmp = make_ssa_name (new_scalar_dest);
6620 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6621 initial_def, new_temp);
6622 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6623 scalar_results[0] = tmp;
6624 }
6625 }
6626
6627 /* 2.5 Adjust the final result by the initial value of the reduction
6628 variable. (When such adjustment is not needed, then
6629 'adjustment_def' is zero). For example, if code is PLUS we create:
6630 new_temp = loop_exit_def + adjustment_def */
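/* As a hedged example: for

     int sum = 10;
     for (int i = 0; i < n; i++)
       sum += a[i];

   the vector accumulator can start from the neutral vector {0, 0, 0, 0}
   and the original initial value 10 (the "adjustment_def") is added back
   only here, after the partial sums have been reduced to a scalar.  */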
6631
6632 if (adjustment_def)
6633 {
6634 gcc_assert (!slp_reduc);
6635 gimple_seq stmts = NULL;
6636 if (double_reduc)
6637 {
6638 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6639 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6640 new_temp = gimple_build (&stmts, code, vectype,
6641 reduc_inputs[0], adjustment_def);
6642 }
6643 else
6644 {
6645 new_temp = scalar_results[0];
6646 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6647 adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6648 adjustment_def);
6649 new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6650 new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6651 new_temp, adjustment_def);
6652 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6653 }
6654
6655 epilog_stmt = gimple_seq_last_stmt (stmts);
6656 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6657 scalar_results[0] = new_temp;
6658 }
6659
6660 /* Record this operation if it could be reused by the epilogue loop. */
6661 if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6662 && reduc_inputs.length () == 1)
6663 loop_vinfo->reusable_accumulators.put (scalar_results[0],
6664 { orig_reduc_input, reduc_info });
6665
6666 if (double_reduc)
6667 loop = outer_loop;
6668
6669 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6670 phis with new adjusted scalar results, i.e., replace use <s_out0>
6671 with use <s_out4>.
6672
6673 Transform:
6674 loop_exit:
6675 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6676 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6677 v_out2 = reduce <v_out1>
6678 s_out3 = extract_field <v_out2, 0>
6679 s_out4 = adjust_result <s_out3>
6680 use <s_out0>
6681 use <s_out0>
6682
6683 into:
6684
6685 loop_exit:
6686 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6687 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6688 v_out2 = reduce <v_out1>
6689 s_out3 = extract_field <v_out2, 0>
6690 s_out4 = adjust_result <s_out3>
6691 use <s_out4>
6692 use <s_out4> */
6693
6694 gcc_assert (live_out_stmts.size () == scalar_results.length ());
6695 for (k = 0; k < live_out_stmts.size (); k++)
6696 {
6697 stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6698 scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6699
6700 phis.create (3);
6701 /* Find the loop-closed-use at the loop exit of the original scalar
6702 result. (The reduction result is expected to have two immediate uses,
6703 one at the latch block, and one at the loop exit). For double
6704 reductions we are looking for exit phis of the outer loop. */
6705 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6706 {
6707 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6708 {
6709 if (!is_gimple_debug (USE_STMT (use_p)))
6710 phis.safe_push (USE_STMT (use_p));
6711 }
6712 else
6713 {
6714 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6715 {
6716 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6717
6718 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6719 {
6720 if (!flow_bb_inside_loop_p (loop,
6721 gimple_bb (USE_STMT (phi_use_p)))
6722 && !is_gimple_debug (USE_STMT (phi_use_p)))
6723 phis.safe_push (USE_STMT (phi_use_p));
6724 }
6725 }
6726 }
6727 }
6728
6729 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6730 {
6731 /* Replace the uses: */
6732 orig_name = PHI_RESULT (exit_phi);
6733
6734 /* Look for a single use at the target of the skip edge. */
6735 if (unify_with_main_loop_p)
6736 {
6737 use_operand_p use_p;
6738 gimple *user;
6739 if (!single_imm_use (orig_name, &use_p, &user))
6740 gcc_unreachable ();
6741 orig_name = gimple_get_lhs (user);
6742 }
6743
6744 scalar_result = scalar_results[k];
6745 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6746 {
6747 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6748 SET_USE (use_p, scalar_result);
6749 update_stmt (use_stmt);
6750 }
6751 }
6752
6753 phis.release ();
6754 }
6755 }
6756
6757 /* Return a vector of type VECTYPE that is equal to the vector select
6758 operation "MASK ? VEC : IDENTITY". Insert the select statements
6759 before GSI. */
6760
6761 static tree
6762 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6763 tree vec, tree identity)
6764 {
6765 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6766 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6767 mask, vec, identity);
6768 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6769 return cond;
6770 }
6771
6772 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6773 order, starting with LHS. Insert the extraction statements before GSI and
6774 associate the new scalar SSA names with variable SCALAR_DEST.
6775 Return the SSA name for the result. */
6776
6777 static tree
6778 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6779 tree_code code, tree lhs, tree vector_rhs)
6780 {
6781 tree vectype = TREE_TYPE (vector_rhs);
6782 tree scalar_type = TREE_TYPE (vectype);
6783 tree bitsize = TYPE_SIZE (scalar_type);
6784 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6785 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6786
6787 for (unsigned HOST_WIDE_INT bit_offset = 0;
6788 bit_offset < vec_size_in_bits;
6789 bit_offset += element_bitsize)
6790 {
6791 tree bitpos = bitsize_int (bit_offset);
6792 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6793 bitsize, bitpos);
6794
6795 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6796 rhs = make_ssa_name (scalar_dest, stmt);
6797 gimple_assign_set_lhs (stmt, rhs);
6798 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6799
6800 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6801 tree new_name = make_ssa_name (scalar_dest, stmt);
6802 gimple_assign_set_lhs (stmt, new_name);
6803 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6804 lhs = new_name;
6805 }
6806 return lhs;
6807 }
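/* For illustration: with a four-element VECTOR_RHS {v0, v1, v2, v3} and
   CODE == PLUS_EXPR the expansion above produces the strictly
   left-to-right sequence

     lhs = (((lhs + v0) + v1) + v2) + v3;

   which preserves the scalar evaluation order required by in-order
   (fold-left) reductions.  */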
6808
6809 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6810 type of the vector input. */
6811
6812 static internal_fn
6813 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6814 {
6815 internal_fn mask_reduc_fn;
6816 internal_fn mask_len_reduc_fn;
6817
6818 switch (reduc_fn)
6819 {
6820 case IFN_FOLD_LEFT_PLUS:
6821 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6822 mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
6823 break;
6824
6825 default:
6826 return IFN_LAST;
6827 }
6828
6829 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6830 OPTIMIZE_FOR_SPEED))
6831 return mask_reduc_fn;
6832 if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
6833 OPTIMIZE_FOR_SPEED))
6834 return mask_len_reduc_fn;
6835 return IFN_LAST;
6836 }
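/* A rough scalar model (not the internal-fn definition itself) of what
   the masked variant computes, assuming a lane mask M:

     res = acc;
     for (i = 0; i < nlanes; i++)
       if (M[i])
         res += vec[i];

   i.e. inactive lanes are skipped while the in-order evaluation of the
   active lanes is preserved; the MASK_LEN variant additionally ignores
   lanes from the given length (adjusted by the bias) onwards.  */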
6837
6838 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6839 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6840 statement. CODE is the operation performed by STMT_INFO and OPS are
6841 its scalar operands. REDUC_INDEX is the index of the operand in
6842 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6843 implements in-order reduction, or IFN_LAST if we should open-code it.
6844 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6845 that should be used to control the operation in a fully-masked loop. */
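/* Why the order matters, as a hedged aside: IEEE FP addition is not
   associative, e.g. in float arithmetic
     (1e30f + -1e30f) + 1.0f == 1.0f
   but
     1e30f + (-1e30f + 1.0f) == 0.0f,
   so without -ffast-math the reduction must keep the original
   left-to-right order instead of reassociating into partial sums.  */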
6846
6847 static bool
6848 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6849 stmt_vec_info stmt_info,
6850 gimple_stmt_iterator *gsi,
6851 gimple **vec_stmt, slp_tree slp_node,
6852 gimple *reduc_def_stmt,
6853 tree_code code, internal_fn reduc_fn,
6854 tree ops[3], tree vectype_in,
6855 int reduc_index, vec_loop_masks *masks,
6856 vec_loop_lens *lens)
6857 {
6858 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6859 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6860 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6861
6862 int ncopies;
6863 if (slp_node)
6864 ncopies = 1;
6865 else
6866 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6867
6868 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6869 gcc_assert (ncopies == 1);
6870 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6871
6872 if (slp_node)
6873 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6874 TYPE_VECTOR_SUBPARTS (vectype_in)));
6875
6876 tree op0 = ops[1 - reduc_index];
6877
6878 int group_size = 1;
6879 stmt_vec_info scalar_dest_def_info;
6880 auto_vec<tree> vec_oprnds0;
6881 if (slp_node)
6882 {
6883 auto_vec<vec<tree> > vec_defs (2);
6884 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6885 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6886 vec_defs[0].release ();
6887 vec_defs[1].release ();
6888 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6889 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6890 }
6891 else
6892 {
6893 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6894 op0, &vec_oprnds0);
6895 scalar_dest_def_info = stmt_info;
6896 }
6897
6898 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6899 tree scalar_type = TREE_TYPE (scalar_dest);
6900 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6901
6902 int vec_num = vec_oprnds0.length ();
6903 gcc_assert (vec_num == 1 || slp_node);
6904 tree vec_elem_type = TREE_TYPE (vectype_out);
6905 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6906
6907 tree vector_identity = NULL_TREE;
6908 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6909 vector_identity = build_zero_cst (vectype_out);
6910
6911 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6912 int i;
6913 tree def0;
6914 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6915 {
6916 gimple *new_stmt;
6917 tree mask = NULL_TREE;
6918 tree len = NULL_TREE;
6919 tree bias = NULL_TREE;
6920 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6921 mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
6922 if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
6923 {
6924 len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
6925 i, 1);
6926 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
6927 bias = build_int_cst (intQI_type_node, biasval);
6928 mask = build_minus_one_cst (truth_type_for (vectype_in));
6929 }
6930
6931 /* Handle MINUS by adding the negative. */
6932 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6933 {
6934 tree negated = make_ssa_name (vectype_out);
6935 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6936 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6937 def0 = negated;
6938 }
6939
6940 if (mask && mask_reduc_fn == IFN_LAST)
6941 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6942 vector_identity);
6943
6944 /* On the first iteration the input is simply the scalar phi
6945 result, and for subsequent iterations it is the output of
6946 the preceding operation. */
6947 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6948 {
6949 if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
6950 new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
6951 def0, mask, len, bias);
6952 else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
6953 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6954 def0, mask);
6955 else
6956 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6957 def0);
6958 /* For chained SLP reductions the output of the previous reduction
6959 operation serves as the input of the next. For the final statement
6960 the output cannot be a temporary - we reuse the original
6961 scalar destination of the last statement. */
6962 if (i != vec_num - 1)
6963 {
6964 gimple_set_lhs (new_stmt, scalar_dest_var);
6965 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6966 gimple_set_lhs (new_stmt, reduc_var);
6967 }
6968 }
6969 else
6970 {
6971 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6972 reduc_var, def0);
6973 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6974 /* Remove the statement, so that we can use the same code paths
6975 as for statements that we've just created. */
6976 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6977 gsi_remove (&tmp_gsi, true);
6978 }
6979
6980 if (i == vec_num - 1)
6981 {
6982 gimple_set_lhs (new_stmt, scalar_dest);
6983 vect_finish_replace_stmt (loop_vinfo,
6984 scalar_dest_def_info,
6985 new_stmt);
6986 }
6987 else
6988 vect_finish_stmt_generation (loop_vinfo,
6989 scalar_dest_def_info,
6990 new_stmt, gsi);
6991
6992 if (slp_node)
6993 slp_node->push_vec_def (new_stmt);
6994 else
6995 {
6996 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6997 *vec_stmt = new_stmt;
6998 }
6999 }
7000
7001 return true;
7002 }
7003
7004 /* Function is_nonwrapping_integer_induction.
7005
7006 Check that STMT_VINFO (which is part of loop LOOP) is an integer
7007 induction and that incrementing it does not cause overflow. */
7008
7009 static bool
7010 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
7011 {
7012 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
7013 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
7014 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
7015 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
7016 widest_int ni, max_loop_value, lhs_max;
7017 wi::overflow_type overflow = wi::OVF_NONE;
7018
7019 /* Make sure the loop is integer based. */
7020 if (TREE_CODE (base) != INTEGER_CST
7021 || TREE_CODE (step) != INTEGER_CST)
7022 return false;
7023
7024 /* Check that the max size of the loop will not wrap. */
7025
7026 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
7027 return true;
7028
7029 if (! max_stmt_executions (loop, &ni))
7030 return false;
7031
7032 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
7033 &overflow);
7034 if (overflow)
7035 return false;
7036
7037 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
7038 TYPE_SIGN (lhs_type), &overflow);
7039 if (overflow)
7040 return false;
7041
7042 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
7043 <= TYPE_PRECISION (lhs_type));
7044 }
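/* A worked example under assumed values: for an unsigned short IV with
   base == 0 and step == 4 in a loop whose statements execute at most
   16383 times, the maximum value computed above is 0 + 4 * 16383 == 65532,
   which needs 16 bits and therefore fits the 16-bit type; with 16384
   executions it would be 65536, needing 17 bits, so the induction could
   wrap and we return false.  */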
7045
7046 /* Check if masking can be supported by inserting a conditional expression.
7047 CODE is the code for the operation. COND_FN is the conditional internal
7048 function, if it exists. VECTYPE_IN is the type of the vector input. */
7049 static bool
7050 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
7051 tree vectype_in)
7052 {
7053 if (cond_fn != IFN_LAST
7054 && direct_internal_fn_supported_p (cond_fn, vectype_in,
7055 OPTIMIZE_FOR_SPEED))
7056 return false;
7057
7058 if (code.is_tree_code ())
7059 switch (tree_code (code))
7060 {
7061 case DOT_PROD_EXPR:
7062 case SAD_EXPR:
7063 return true;
7064
7065 default:
7066 break;
7067 }
7068 return false;
7069 }
7070
7071 /* Insert a conditional expression to enable masked vectorization. CODE is the
7072 code for the operation. VOP is the array of operands. MASK is the loop
7073 mask. GSI is a statement iterator used to place the new conditional
7074 expression. */
7075 static void
7076 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
7077 gimple_stmt_iterator *gsi)
7078 {
7079 switch (tree_code (code))
7080 {
7081 case DOT_PROD_EXPR:
7082 {
7083 tree vectype = TREE_TYPE (vop[1]);
7084 tree zero = build_zero_cst (vectype);
7085 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7086 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7087 mask, vop[1], zero);
7088 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7089 vop[1] = masked_op1;
7090 break;
7091 }
7092
7093 case SAD_EXPR:
7094 {
7095 tree vectype = TREE_TYPE (vop[1]);
7096 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7097 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7098 mask, vop[1], vop[0]);
7099 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7100 vop[1] = masked_op1;
7101 break;
7102 }
7103
7104 default:
7105 gcc_unreachable ();
7106 }
7107 }
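/* The choice of select identity above can be illustrated with scalars
   (a sketch, assuming integer elements):

     DOT_PROD: forcing op1 to 0 works because 0 * x adds nothing,
               i.e. acc + 0 * b[i] == acc;
     SAD:      forcing op1 to op0 works because the absolute difference
               of a value with itself is 0, i.e. acc + |a[i] - a[i]| == acc;

   so inactive lanes leave the reduction value unchanged.  */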
7108
7109 /* Function vectorizable_reduction.
7110
7111 Check if STMT_INFO performs a reduction operation that can be vectorized.
7112 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
7113 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
7114 Return true if STMT_INFO is vectorizable in this way.
7115
7116 This function also handles reduction idioms (patterns) that have been
7117 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
7118 may be of this form:
7119 X = pattern_expr (arg0, arg1, ..., X)
7120 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
7121 sequence that had been detected and replaced by the pattern-stmt
7122 (STMT_INFO).
7123
7124 This function also handles reduction of condition expressions, for example:
7125 for (int i = 0; i < N; i++)
7126 if (a[i] < value)
7127 last = a[i];
7128 This is handled by vectorising the loop and creating an additional vector
7129 containing the loop indexes for which "a[i] < value" was true. In the
7130 function epilogue this is reduced to a single max value and then used to
7131 index into the vector of results.
7132
7133 In some cases of reduction patterns, the type of the reduction variable X is
7134 different than the type of the other arguments of STMT_INFO.
7135 In such cases, the vectype that is used when transforming STMT_INFO into
7136 a vector stmt is different than the vectype that is used to determine the
7137 vectorization factor, because it consists of a different number of elements
7138 than the actual number of elements that are being operated upon in parallel.
7139
7140 For example, consider an accumulation of shorts into an int accumulator.
7141 On some targets it's possible to vectorize this pattern operating on 8
7142 shorts at a time (hence, the vectype for purposes of determining the
7143 vectorization factor should be V8HI); on the other hand, the vectype that
7144 is used to create the vector form is actually V4SI (the type of the result).
7145
7146 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
7147 indicates what is the actual level of parallelism (V8HI in the example), so
7148 that the right vectorization factor would be derived. This vectype
7149 corresponds to the type of arguments to the reduction stmt, and should *NOT*
7150 be used to create the vectorized stmt. The right vectype for the vectorized
7151 stmt is obtained from the type of the result X:
7152 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7153
7154 This means that, contrary to "regular" reductions (or "regular" stmts in
7155 general), the following equation:
7156 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7157 does *NOT* necessarily hold for reduction patterns. */
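/* A concrete instance of the vectype distinction described above,
   assuming 128-bit vectors: for

     short a[N];
     int sum = 0;
     for (int i = 0; i < N; i++)
       sum += a[i];

   the widened-sum pattern consumes eight shorts per vector iteration, so
   the vectype recorded for the vectorization factor is V8HI, while the
   statement that accumulates produces an int vector and thus uses V4SI.  */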
7158
7159 bool
7160 vectorizable_reduction (loop_vec_info loop_vinfo,
7161 stmt_vec_info stmt_info, slp_tree slp_node,
7162 slp_instance slp_node_instance,
7163 stmt_vector_for_cost *cost_vec)
7164 {
7165 tree vectype_in = NULL_TREE;
7166 tree vectype_op[3] = { NULL_TREE, NULL_TREE, NULL_TREE };
7167 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7168 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
7169 stmt_vec_info cond_stmt_vinfo = NULL;
7170 int i;
7171 int ncopies;
7172 bool single_defuse_cycle = false;
7173 bool nested_cycle = false;
7174 bool double_reduc = false;
7175 int vec_num;
7176 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
7177 tree cond_reduc_val = NULL_TREE;
7178
7179 /* Make sure it was already recognized as a reduction computation. */
7180 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
7181 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
7182 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
7183 return false;
7184
7185 /* The stmt we store reduction analysis meta on. */
7186 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7187 reduc_info->is_reduc_info = true;
7188
7189 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7190 {
7191 if (is_a <gphi *> (stmt_info->stmt))
7192 {
7193 if (slp_node)
7194 {
7195 /* We eventually need to set a vector type on invariant
7196 arguments. */
7197 unsigned j;
7198 slp_tree child;
7199 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7200 if (!vect_maybe_update_slp_op_vectype
7201 (child, SLP_TREE_VECTYPE (slp_node)))
7202 {
7203 if (dump_enabled_p ())
7204 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7205 "incompatible vector types for "
7206 "invariants\n");
7207 return false;
7208 }
7209 }
7210 /* Analysis for double-reduction is done on the outer
7211 loop PHI, nested cycles have no further restrictions. */
7212 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
7213 }
7214 else
7215 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7216 return true;
7217 }
7218
7219 stmt_vec_info orig_stmt_of_analysis = stmt_info;
7220 stmt_vec_info phi_info = stmt_info;
7221 if (!is_a <gphi *> (stmt_info->stmt))
7222 {
7223 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7224 return true;
7225 }
7226 if (slp_node)
7227 {
7228 slp_node_instance->reduc_phis = slp_node;
7229 /* ??? We're leaving slp_node to point to the PHIs, we only
7230 need it to get at the number of vector stmts which wasn't
7231 yet initialized for the instance root. */
7232 }
7233 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7234 {
7235 use_operand_p use_p;
7236 gimple *use_stmt;
7237 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7238 &use_p, &use_stmt);
7239 gcc_assert (res);
7240 phi_info = loop_vinfo->lookup_stmt (use_stmt);
7241 }
7242
7243 /* PHIs should not participate in patterns. */
7244 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7245 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7246
7247 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
7248 and compute the reduction chain length. Discover the real
7249 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
7250 tree reduc_def
7251 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
7252 loop_latch_edge
7253 (gimple_bb (reduc_def_phi)->loop_father));
7254 unsigned reduc_chain_length = 0;
7255 bool only_slp_reduc_chain = true;
7256 stmt_info = NULL;
7257 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
7258 while (reduc_def != PHI_RESULT (reduc_def_phi))
7259 {
7260 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
7261 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
7262 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
7263 {
7264 if (dump_enabled_p ())
7265 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7266 "reduction chain broken by patterns.\n");
7267 return false;
7268 }
7269 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
7270 only_slp_reduc_chain = false;
7271 /* For epilogue generation live members of the chain need
7272 to point back to the PHI via their original stmt for
7273 info_for_reduction to work. For SLP we need to look at
7274 all lanes here - even though we will only vectorize from
7275 the SLP node with live lane zero, the other live lanes also
7276 need to be identified as part of a reduction to be able
7277 to skip code generation for them. */
7278 if (slp_for_stmt_info)
7279 {
7280 for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
7281 if (STMT_VINFO_LIVE_P (s))
7282 STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
7283 }
7284 else if (STMT_VINFO_LIVE_P (vdef))
7285 STMT_VINFO_REDUC_DEF (def) = phi_info;
7286 gimple_match_op op;
7287 if (!gimple_extract_op (vdef->stmt, &op))
7288 {
7289 if (dump_enabled_p ())
7290 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7291 "reduction chain includes unsupported"
7292 " statement type.\n");
7293 return false;
7294 }
7295 if (CONVERT_EXPR_CODE_P (op.code))
7296 {
7297 if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7298 {
7299 if (dump_enabled_p ())
7300 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7301 "conversion in the reduction chain.\n");
7302 return false;
7303 }
7304 }
7305 else if (!stmt_info)
7306 /* First non-conversion stmt. */
7307 stmt_info = vdef;
7308 reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
7309 reduc_chain_length++;
7310 if (!stmt_info && slp_node)
7311 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
7312 }
7313 /* PHIs should not participate in patterns. */
7314 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7315
7316 if (nested_in_vect_loop_p (loop, stmt_info))
7317 {
7318 loop = loop->inner;
7319 nested_cycle = true;
7320 }
7321
7322 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
7323 element. */
7324 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7325 {
7326 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
7327 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7328 }
7329 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7330 gcc_assert (slp_node
7331 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
7332
7333 /* 1. Is vectorizable reduction? */
7334 /* Not supportable if the reduction variable is used in the loop, unless
7335 it's a reduction chain. */
7336 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7337 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7338 return false;
7339
7340 /* Reductions that are not used even in an enclosing outer-loop,
7341 are expected to be "live" (used out of the loop). */
7342 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7343 && !STMT_VINFO_LIVE_P (stmt_info))
7344 return false;
7345
7346 /* 2. Has this been recognized as a reduction pattern?
7347
7348 Check if STMT represents a pattern that has been recognized
7349 in earlier analysis stages. For stmts that represent a pattern,
7350 the STMT_VINFO_RELATED_STMT field records the last stmt in
7351 the original sequence that constitutes the pattern. */
7352
7353 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7354 if (orig_stmt_info)
7355 {
7356 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7357 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7358 }
7359
7360 /* 3. Check the operands of the operation. The first operands are defined
7361 inside the loop body. The last operand is the reduction variable,
7362 which is defined by the loop-header-phi. */
7363
7364 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7365 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
7366 gimple_match_op op;
7367 if (!gimple_extract_op (stmt_info->stmt, &op))
7368 gcc_unreachable ();
7369 bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
7370 || op.code == WIDEN_SUM_EXPR
7371 || op.code == SAD_EXPR);
7372
7373 if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7374 && !SCALAR_FLOAT_TYPE_P (op.type))
7375 return false;
7376
7377 /* Do not try to vectorize bit-precision reductions. */
7378 if (!type_has_mode_precision_p (op.type))
7379 return false;
7380
7381 /* For lane-reducing ops we're reducing the number of reduction PHIs,
7382 which means the only use of the PHI may be in the lane-reducing operation. */
7383 if (lane_reduc_code_p
7384 && reduc_chain_length != 1
7385 && !only_slp_reduc_chain)
7386 {
7387 if (dump_enabled_p ())
7388 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7389 "lane-reducing reduction with extra stmts.\n");
7390 return false;
7391 }
7392
7393 /* All uses but the last are expected to be defined in the loop.
7394 The last use is the reduction variable. In case of nested cycle this
7395 assumption is not true: we use reduc_index to record the index of the
7396 reduction variable. */
7397 slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7398 /* We need to skip an extra operand for COND_EXPRs with embedded
7399 comparison. */
7400 unsigned opno_adjust = 0;
7401 if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7402 opno_adjust = 1;
7403 for (i = 0; i < (int) op.num_ops; i++)
7404 {
7405 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7406 if (i == 0 && op.code == COND_EXPR)
7407 continue;
7408
7409 stmt_vec_info def_stmt_info;
7410 enum vect_def_type dt;
7411 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7412 i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7413 &vectype_op[i], &def_stmt_info))
7414 {
7415 if (dump_enabled_p ())
7416 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7417 "use not simple.\n");
7418 return false;
7419 }
7420 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7421 continue;
7422
7423 /* There should be only one cycle def in the stmt, the one
7424 leading to reduc_def. */
7425 if (VECTORIZABLE_CYCLE_DEF (dt))
7426 return false;
7427
7428 if (!vectype_op[i])
7429 vectype_op[i]
7430 = get_vectype_for_scalar_type (loop_vinfo,
7431 TREE_TYPE (op.ops[i]), slp_op[i]);
7432
7433 /* To properly compute ncopies we are interested in the widest
7434 non-reduction input type in case we're looking at a widening
7435 accumulation that we later handle in vect_transform_reduction. */
7436 if (lane_reduc_code_p
7437 && vectype_op[i]
7438 && (!vectype_in
7439 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7440 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
7441 vectype_in = vectype_op[i];
7442
7443 if (op.code == COND_EXPR)
7444 {
7445 /* Record how the non-reduction-def value of COND_EXPR is defined. */
7446 if (dt == vect_constant_def)
7447 {
7448 cond_reduc_dt = dt;
7449 cond_reduc_val = op.ops[i];
7450 }
7451 if (dt == vect_induction_def
7452 && def_stmt_info
7453 && is_nonwrapping_integer_induction (def_stmt_info, loop))
7454 {
7455 cond_reduc_dt = dt;
7456 cond_stmt_vinfo = def_stmt_info;
7457 }
7458 }
7459 }
7460 if (!vectype_in)
7461 vectype_in = STMT_VINFO_VECTYPE (phi_info);
7462 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7463
7464 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
7465 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
7466 /* If we have a condition reduction, see if we can simplify it further. */
7467 if (v_reduc_type == COND_REDUCTION)
7468 {
7469 if (slp_node)
7470 return false;
7471
7472 /* Fail if the reduction value is used in the condition itself. */
7473 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7474 {
7475 if (dump_enabled_p ())
7476 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7477 "condition depends on previous iteration\n");
7478 return false;
7479 }
7480
7481 if (reduc_chain_length == 1
7482 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
7483 vectype_in, OPTIMIZE_FOR_SPEED))
7484 {
7485 if (dump_enabled_p ())
7486 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7487 "optimizing condition reduction with"
7488 " FOLD_EXTRACT_LAST.\n");
7489 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7490 }
7491 else if (cond_reduc_dt == vect_induction_def)
7492 {
7493 tree base
7494 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7495 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7496
7497 gcc_assert (TREE_CODE (base) == INTEGER_CST
7498 && TREE_CODE (step) == INTEGER_CST);
7499 cond_reduc_val = NULL_TREE;
7500 enum tree_code cond_reduc_op_code = ERROR_MARK;
7501 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7502 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7503 ;
7504 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
7505 above base; punt if base is the minimum value of the type for
7506 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
7507 else if (tree_int_cst_sgn (step) == -1)
7508 {
7509 cond_reduc_op_code = MIN_EXPR;
7510 if (tree_int_cst_sgn (base) == -1)
7511 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7512 else if (tree_int_cst_lt (base,
7513 TYPE_MAX_VALUE (TREE_TYPE (base))))
7514 cond_reduc_val
7515 = int_const_binop (PLUS_EXPR, base, integer_one_node);
7516 }
7517 else
7518 {
7519 cond_reduc_op_code = MAX_EXPR;
7520 if (tree_int_cst_sgn (base) == 1)
7521 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7522 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7523 base))
7524 cond_reduc_val
7525 = int_const_binop (MINUS_EXPR, base, integer_one_node);
7526 }
7527 if (cond_reduc_val)
7528 {
7529 if (dump_enabled_p ())
7530 dump_printf_loc (MSG_NOTE, vect_location,
7531 "condition expression based on "
7532 "integer induction.\n");
7533 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7534 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7535 = cond_reduc_val;
7536 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7537 }
7538 }
7539 else if (cond_reduc_dt == vect_constant_def)
7540 {
7541 enum vect_def_type cond_initial_dt;
7542 tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7543 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7544 if (cond_initial_dt == vect_constant_def
7545 && types_compatible_p (TREE_TYPE (cond_initial_val),
7546 TREE_TYPE (cond_reduc_val)))
7547 {
7548 tree e = fold_binary (LE_EXPR, boolean_type_node,
7549 cond_initial_val, cond_reduc_val);
7550 if (e && (integer_onep (e) || integer_zerop (e)))
7551 {
7552 if (dump_enabled_p ())
7553 dump_printf_loc (MSG_NOTE, vect_location,
7554 "condition expression based on "
7555 "compile time constant.\n");
7556 /* Record reduction code at analysis stage. */
7557 STMT_VINFO_REDUC_CODE (reduc_info)
7558 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7559 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7560 }
7561 }
7562 }
7563 }
7564
7565 if (STMT_VINFO_LIVE_P (phi_info))
7566 return false;
7567
7568 if (slp_node)
7569 ncopies = 1;
7570 else
7571 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7572
7573 gcc_assert (ncopies >= 1);
7574
7575 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7576
7577 if (nested_cycle)
7578 {
7579 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7580 == vect_double_reduction_def);
7581 double_reduc = true;
7582 }
7583
7584 /* 4.2. Check support for the epilog operation.
7585
7586 If STMT represents a reduction pattern, then the type of the
7587 reduction variable may be different than the type of the rest
7588 of the arguments. For example, consider the case of accumulation
7589 of shorts into an int accumulator; The original code:
7590 S1: int_a = (int) short_a;
7591 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7592
7593 was replaced with:
7594 STMT: int_acc = widen_sum <short_a, int_acc>
7595
7596 This means that:
7597 1. The tree-code that is used to create the vector operation in the
7598 epilog code (that reduces the partial results) is not the
7599 tree-code of STMT, but is rather the tree-code of the original
7600 stmt from the pattern that STMT is replacing. I.e, in the example
7601 above we want to use 'widen_sum' in the loop, but 'plus' in the
7602 epilog.
7603 2. The type (mode) we use to check available target support
7604 for the vector operation to be created in the *epilog*, is
7605 determined by the type of the reduction variable (in the example
7606 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
7607 However the type (mode) we use to check available target support
7608 for the vector operation to be created *inside the loop*, is
7609 determined by the type of the other arguments to STMT (in the
7610 example we'd check this: optab_handler (widen_sum_optab,
7611 vect_short_mode)).
7612
7613 This is contrary to "regular" reductions, in which the types of all
7614 the arguments are the same as the type of the reduction variable.
7615 For "regular" reductions we can therefore use the same vector type
7616 (and also the same tree-code) when generating the epilog code and
7617 when generating the code inside the loop. */
7618
7619 code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7620 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7621
7622 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7623 if (reduction_type == TREE_CODE_REDUCTION)
7624 {
7625 /* Check whether it's ok to change the order of the computation.
7626 Generally, when vectorizing a reduction we change the order of the
7627 computation. This may change the behavior of the program in some
7628 cases, so we need to check that this is ok. One exception is when
7629 vectorizing an outer-loop: the inner-loop is executed sequentially,
7630 and therefore vectorizing reductions in the inner-loop during
7631 outer-loop vectorization is safe. Likewise, when we are vectorizing
7632 a series of reductions using SLP and the VF is one, the reductions
7633 are performed in scalar order. */
7634 if (slp_node
7635 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7636 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7637 ;
7638 else if (needs_fold_left_reduction_p (op.type, orig_code))
7639 {
7640 /* When vectorizing a reduction chain w/o SLP the reduction PHI
7641 is not directly used in stmt. */
7642 if (!only_slp_reduc_chain
7643 && reduc_chain_length != 1)
7644 {
7645 if (dump_enabled_p ())
7646 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7647 "in-order reduction chain without SLP.\n");
7648 return false;
7649 }
7650 STMT_VINFO_REDUC_TYPE (reduc_info)
7651 = reduction_type = FOLD_LEFT_REDUCTION;
7652 }
7653 else if (!commutative_binary_op_p (orig_code, op.type)
7654 || !associative_binary_op_p (orig_code, op.type))
7655 {
7656 if (dump_enabled_p ())
7657 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7658 "reduction: not commutative/associative");
7659 return false;
7660 }
7661 }
7662
7663 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7664 && ncopies > 1)
7665 {
7666 if (dump_enabled_p ())
7667 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7668 "multiple types in double reduction or condition "
7669 "reduction or fold-left reduction.\n");
7670 return false;
7671 }
7672
7673 internal_fn reduc_fn = IFN_LAST;
7674 if (reduction_type == TREE_CODE_REDUCTION
7675 || reduction_type == FOLD_LEFT_REDUCTION
7676 || reduction_type == INTEGER_INDUC_COND_REDUCTION
7677 || reduction_type == CONST_COND_REDUCTION)
7678 {
7679 if (reduction_type == FOLD_LEFT_REDUCTION
7680 ? fold_left_reduction_fn (orig_code, &reduc_fn)
7681 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7682 {
7683 if (reduc_fn != IFN_LAST
7684 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7685 OPTIMIZE_FOR_SPEED))
7686 {
7687 if (dump_enabled_p ())
7688 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7689 "reduc op not supported by target.\n");
7690
7691 reduc_fn = IFN_LAST;
7692 }
7693 }
7694 else
7695 {
7696 if (!nested_cycle || double_reduc)
7697 {
7698 if (dump_enabled_p ())
7699 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7700 "no reduc code for scalar code.\n");
7701
7702 return false;
7703 }
7704 }
7705 }
7706 else if (reduction_type == COND_REDUCTION)
7707 {
7708 int scalar_precision
7709 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7710 cr_index_scalar_type = make_unsigned_type (scalar_precision);
7711 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7712 vectype_out);
7713
7714 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7715 OPTIMIZE_FOR_SPEED))
7716 reduc_fn = IFN_REDUC_MAX;
7717 }
7718 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7719
7720 if (reduction_type != EXTRACT_LAST_REDUCTION
7721 && (!nested_cycle || double_reduc)
7722 && reduc_fn == IFN_LAST
7723 && !nunits_out.is_constant ())
7724 {
7725 if (dump_enabled_p ())
7726 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7727 "missing target support for reduction on"
7728 " variable-length vectors.\n");
7729 return false;
7730 }
7731
7732 /* For SLP reductions, see if there is a neutral value we can use. */
7733 tree neutral_op = NULL_TREE;
7734 if (slp_node)
7735 {
7736 tree initial_value = NULL_TREE;
7737 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
7738 initial_value = vect_phi_initial_value (reduc_def_phi);
7739 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7740 orig_code, initial_value);
7741 }
7742
7743 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7744 {
7745 /* We can't support in-order reductions of code such as this:
7746
7747 for (int i = 0; i < n1; ++i)
7748 for (int j = 0; j < n2; ++j)
7749 l += a[j];
7750
7751 since GCC effectively transforms the loop when vectorizing:
7752
7753 for (int i = 0; i < n1 / VF; ++i)
7754 for (int j = 0; j < n2; ++j)
7755 for (int k = 0; k < VF; ++k)
7756 l += a[j];
7757
7758 which is a reassociation of the original operation. */
7759 if (dump_enabled_p ())
7760 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7761 "in-order double reduction not supported.\n");
7762
7763 return false;
7764 }
7765
7766 if (reduction_type == FOLD_LEFT_REDUCTION
7767 && slp_node
7768 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7769 {
7770 /* We cannot use in-order reductions in this case because there is
7771 an implicit reassociation of the operations involved. */
7772 if (dump_enabled_p ())
7773 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7774 "in-order unchained SLP reductions not supported.\n");
7775 return false;
7776 }
7777
7778 /* For double reductions, and for SLP reductions with a neutral value,
7779 we construct a variable-length initial vector by loading a vector
7780 full of the neutral value and then shift-and-inserting the start
7781 values into the low-numbered elements. */
7782 if ((double_reduc || neutral_op)
7783 && !nunits_out.is_constant ()
7784 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7785 vectype_out, OPTIMIZE_FOR_SPEED))
7786 {
7787 if (dump_enabled_p ())
7788 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7789 "reduction on variable-length vectors requires"
7790 " target support for a vector-shift-and-insert"
7791 " operation.\n");
7792 return false;
7793 }
7794
7795 /* Check extra constraints for variable-length unchained SLP reductions. */
7796 if (slp_node
7797 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7798 && !nunits_out.is_constant ())
7799 {
7800 /* We checked above that we could build the initial vector when
7801 there's a neutral element value. Check here for the case in
7802 which each SLP statement has its own initial value and in which
7803 that value needs to be repeated for every instance of the
7804 statement within the initial vector. */
7805 unsigned int group_size = SLP_TREE_LANES (slp_node);
7806 if (!neutral_op
7807 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7808 TREE_TYPE (vectype_out)))
7809 {
7810 if (dump_enabled_p ())
7811 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7812 "unsupported form of SLP reduction for"
7813 " variable-length vectors: cannot build"
7814 " initial vector.\n");
7815 return false;
7816 }
7817 /* The epilogue code relies on the number of elements being a multiple
7818 of the group size. The duplicate-and-interleave approach to setting
7819 up the initial vector does too. */
7820 if (!multiple_p (nunits_out, group_size))
7821 {
7822 if (dump_enabled_p ())
7823 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7824 "unsupported form of SLP reduction for"
7825 " variable-length vectors: the vector size"
7826 " is not a multiple of the number of results.\n");
7827 return false;
7828 }
7829 }
7830
7831 if (reduction_type == COND_REDUCTION)
7832 {
7833 widest_int ni;
7834
7835 if (! max_loop_iterations (loop, &ni))
7836 {
7837 if (dump_enabled_p ())
7838 dump_printf_loc (MSG_NOTE, vect_location,
7839 "loop count not known, cannot create cond "
7840 "reduction.\n");
7841 return false;
7842 }
7843 /* Convert backedges to iterations. */
7844 ni += 1;
7845
7846 /* The additional index will be the same type as the condition. Check
7847 that the loop count fits into this type less one (because we use up
7848 the zero slot for when there are no matches). */
7849 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7850 if (wi::geu_p (ni, wi::to_widest (max_index)))
7851 {
7852 if (dump_enabled_p ())
7853 dump_printf_loc (MSG_NOTE, vect_location,
7854 "loop size is greater than data size.\n");
7855 return false;
7856 }
7857 }
7858
7859 /* In case the vectorization factor (VF) is bigger than the number
7860 of elements that we can fit in a vectype (nunits), we have to generate
7861 more than one vector stmt - i.e - we need to "unroll" the
7862 vector stmt by a factor VF/nunits. For more details see documentation
7863 in vectorizable_operation. */
7864
7865 /* If the reduction is used in an outer loop we need to generate
7866 VF intermediate results, like so (e.g. for ncopies=2):
7867 r0 = phi (init, r0)
7868 r1 = phi (init, r1)
7869 r0 = x0 + r0;
7870 r1 = x1 + r1;
7871 (i.e. we generate VF results in 2 registers).
7872 In this case we have a separate def-use cycle for each copy, and therefore
7873 for each copy we get the vector def for the reduction variable from the
7874 respective phi node created for this copy.
7875
7876 Otherwise (the reduction is unused in the loop nest), we can combine
7877 together intermediate results, like so (e.g. for ncopies=2):
7878 r = phi (init, r)
7879 r = x0 + r;
7880 r = x1 + r;
7881 (i.e. we generate VF/2 results in a single register).
7882 In this case for each copy we get the vector def for the reduction variable
7883 from the vectorized reduction operation generated in the previous iteration.
7884
7885 This only works when we see both the reduction PHI and its only consumer
7886 in vectorizable_reduction and there are no intermediate stmts
7887 participating. When unrolling we want each unrolled iteration to have its
7888 own reduction accumulator since one of the main goals of unrolling a
7889 reduction is to reduce the aggregate loop-carried latency. */
7890 if (ncopies > 1
7891 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7892 && reduc_chain_length == 1
7893 && loop_vinfo->suggested_unroll_factor == 1)
7894 single_defuse_cycle = true;
7895
7896 if (single_defuse_cycle || lane_reduc_code_p)
7897 {
7898 gcc_assert (op.code != COND_EXPR);
7899
7900 /* 4. Supportable by target? */
7901 bool ok = true;
7902
7903 /* 4.1. check support for the operation in the loop
7904
7905 This isn't necessary for the lane reduction codes, since they
7906 can only be produced by pattern matching, and it's up to the
7907 pattern matcher to test for support. The main reason for
7908 specifically skipping this step is to avoid rechecking whether
7909 mixed-sign dot-products can be implemented using signed
7910 dot-products. */
7911 machine_mode vec_mode = TYPE_MODE (vectype_in);
7912 if (!lane_reduc_code_p
7913 && !directly_supported_p (op.code, vectype_in, optab_vector))
7914 {
7915 if (dump_enabled_p ())
7916 dump_printf (MSG_NOTE, "op not supported by target.\n");
7917 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7918 || !vect_can_vectorize_without_simd_p (op.code))
7919 ok = false;
7920 else
7921 if (dump_enabled_p ())
7922 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7923 }
7924
7925 if (vect_emulated_vector_p (vectype_in)
7926 && !vect_can_vectorize_without_simd_p (op.code))
7927 {
7928 if (dump_enabled_p ())
7929 dump_printf (MSG_NOTE, "using word mode not possible.\n");
7930 return false;
7931 }
7932
7933 /* lane-reducing operations have to go through vect_transform_reduction.
7934 For the other cases try without the single cycle optimization. */
7935 if (!ok)
7936 {
7937 if (lane_reduc_code_p)
7938 return false;
7939 else
7940 single_defuse_cycle = false;
7941 }
7942 }
7943 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7944
7945 /* If the reduction stmt is one of the patterns that have lane
7946 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
7947 if ((ncopies > 1 && ! single_defuse_cycle)
7948 && lane_reduc_code_p)
7949 {
7950 if (dump_enabled_p ())
7951 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7952 "multi def-use cycle not possible for lane-reducing "
7953 "reduction operation\n");
7954 return false;
7955 }
7956
7957 if (slp_node
7958 && !(!single_defuse_cycle
7959 && !lane_reduc_code_p
7960 && reduction_type != FOLD_LEFT_REDUCTION))
7961 for (i = 0; i < (int) op.num_ops; i++)
7962 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
7963 {
7964 if (dump_enabled_p ())
7965 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7966 "incompatible vector types for invariants\n");
7967 return false;
7968 }
7969
7970 if (slp_node)
7971 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7972 else
7973 vec_num = 1;
7974
7975 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7976 reduction_type, ncopies, cost_vec);
7977 /* Cost the reduction op inside the loop if transformed via
7978 vect_transform_reduction. Otherwise this is costed by the
7979 separate vectorizable_* routines. */
7980 if (single_defuse_cycle || lane_reduc_code_p)
7981 {
7982 int factor = 1;
7983 if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
7984 /* Three dot-products and a subtraction. */
7985 factor = 4;
7986 record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
7987 stmt_info, 0, vect_body);
7988 }
7989
7990 if (dump_enabled_p ()
7991 && reduction_type == FOLD_LEFT_REDUCTION)
7992 dump_printf_loc (MSG_NOTE, vect_location,
7993 "using an in-order (fold-left) reduction.\n");
7994 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7995 /* All but single defuse-cycle optimized, lane-reducing and fold-left
7996 reductions go through their own vectorizable_* routines. */
7997 if (!single_defuse_cycle
7998 && !lane_reduc_code_p
7999 && reduction_type != FOLD_LEFT_REDUCTION)
8000 {
8001 stmt_vec_info tem
8002 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
8003 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
8004 {
8005 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
8006 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
8007 }
8008 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
8009 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
8010 }
8011 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8012 {
8013 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8014 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8015 internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
8016
8017 if (reduction_type != FOLD_LEFT_REDUCTION
8018 && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
8019 && (cond_fn == IFN_LAST
8020 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
8021 OPTIMIZE_FOR_SPEED)))
8022 {
8023 if (dump_enabled_p ())
8024 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8025 "can't operate on partial vectors because"
8026 " no conditional operation is available.\n");
8027 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8028 }
8029 else if (reduction_type == FOLD_LEFT_REDUCTION
8030 && reduc_fn == IFN_LAST
8031 && !expand_vec_cond_expr_p (vectype_in,
8032 truth_type_for (vectype_in),
8033 SSA_NAME))
8034 {
8035 if (dump_enabled_p ())
8036 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8037 "can't operate on partial vectors because"
8038 " no conditional operation is available.\n");
8039 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8040 }
8041 else
8042 {
8043 internal_fn mask_reduc_fn
8044 = get_masked_reduction_fn (reduc_fn, vectype_in);
8045
8046 if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
8047 vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num,
8048 vectype_in, 1);
8049 else
8050 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
8051 vectype_in, NULL);
8052 }
8053 }
8054 return true;
8055 }
8056
8057 /* STMT_INFO is a dot-product reduction whose multiplication operands
8058 have different signs. Emit a sequence to emulate the operation
8059 using a series of signed DOT_PROD_EXPRs and return the last
8060 statement generated. VEC_DEST is the result of the vector operation
8061 and VOP lists its inputs. */
8062
8063 static gassign *
8064 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8065 gimple_stmt_iterator *gsi, tree vec_dest,
8066 tree vop[3])
8067 {
8068 tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
8069 tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
8070 tree narrow_elttype = TREE_TYPE (narrow_vectype);
8071 gimple *new_stmt;
8072
8073 	  /* Make VOP[0] the unsigned operand and VOP[1] the signed operand.  */
8074 if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
8075 std::swap (vop[0], vop[1]);
8076
8077 /* Convert all inputs to signed types. */
8078 for (int i = 0; i < 3; ++i)
8079 if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
8080 {
8081 tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
8082 new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
8083 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8084 vop[i] = tmp;
8085 }
8086
8087 /* In the comments below we assume 8-bit inputs for simplicity,
8088 but the approach works for any full integer type. */
8089
8090 /* Create a vector of -128. */
8091 tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
8092 tree min_narrow = build_vector_from_val (narrow_vectype,
8093 min_narrow_elttype);
8094
8095 /* Create a vector of 64. */
8096 auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
8097 tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
8098 half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
8099
8100 /* Emit: SUB_RES = VOP[0] - 128. */
8101 tree sub_res = make_ssa_name (narrow_vectype);
8102 new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
8103 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8104
8105 /* Emit:
8106
8107 STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
8108 STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
8109 	  STAGE3 = DOT_PROD_EXPR <SUB_RES, VOP[1], STAGE2>;
8110
8111 on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
8112 Doing the two 64 * y steps first allows more time to compute x. */
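     /* Worked example (numbers chosen only for illustration): for x = 200
	(unsigned) and y = -3 (signed), x * y = -600, and indeed
	(200 - 128) * -3 + 64 * -3 + 64 * -3 = -216 - 192 - 192 = -600.  */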
8113 tree stage1 = make_ssa_name (wide_vectype);
8114 new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
8115 vop[1], half_narrow, vop[2]);
8116 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8117
8118 tree stage2 = make_ssa_name (wide_vectype);
8119 new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
8120 vop[1], half_narrow, stage1);
8121 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8122
8123 tree stage3 = make_ssa_name (wide_vectype);
8124 new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
8125 sub_res, vop[1], stage2);
8126 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8127
8128 /* Convert STAGE3 to the reduction type. */
8129 return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
8130 }
8131
8132 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
8133 value. */
8134
8135 bool
8136 vect_transform_reduction (loop_vec_info loop_vinfo,
8137 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
8138 gimple **vec_stmt, slp_tree slp_node)
8139 {
8140 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8141 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8142 int i;
8143 int ncopies;
8144 int vec_num;
8145
8146 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8147 gcc_assert (reduc_info->is_reduc_info);
8148
8149 if (nested_in_vect_loop_p (loop, stmt_info))
8150 {
8151 loop = loop->inner;
8152 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
8153 }
8154
8155 gimple_match_op op;
8156 if (!gimple_extract_op (stmt_info->stmt, &op))
8157 gcc_unreachable ();
8158
8159 /* All uses but the last are expected to be defined in the loop.
8160 The last use is the reduction variable. In case of nested cycle this
8161 assumption is not true: we use reduc_index to record the index of the
8162 reduction variable. */
8163 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
8164 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
8165 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
8166 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8167
8168 if (slp_node)
8169 {
8170 ncopies = 1;
8171 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8172 }
8173 else
8174 {
8175 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8176 vec_num = 1;
8177 }
8178
8179 code_helper code = canonicalize_code (op.code, op.type);
8180 internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
8181 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8182 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8183 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
8184
8185 /* Transform. */
8186 tree new_temp = NULL_TREE;
8187 auto_vec<tree> vec_oprnds0;
8188 auto_vec<tree> vec_oprnds1;
8189 auto_vec<tree> vec_oprnds2;
8190 tree def0;
8191
8192 if (dump_enabled_p ())
8193 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
8194
8195 /* FORNOW: Multiple types are not supported for condition. */
8196 if (code == COND_EXPR)
8197 gcc_assert (ncopies == 1);
8198
8199 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8200
8201 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
8202 if (reduction_type == FOLD_LEFT_REDUCTION)
8203 {
8204 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
8205 gcc_assert (code.is_tree_code ());
8206 return vectorize_fold_left_reduction
8207 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
8208 tree_code (code), reduc_fn, op.ops, vectype_in, reduc_index, masks,
8209 lens);
8210 }
8211
8212 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
8213 gcc_assert (single_defuse_cycle
8214 || code == DOT_PROD_EXPR
8215 || code == WIDEN_SUM_EXPR
8216 || code == SAD_EXPR);
8217
8218 /* Create the destination vector */
8219 tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
8220 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
8221
8222 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8223 single_defuse_cycle && reduc_index == 0
8224 ? NULL_TREE : op.ops[0], &vec_oprnds0,
8225 single_defuse_cycle && reduc_index == 1
8226 ? NULL_TREE : op.ops[1], &vec_oprnds1,
8227 op.num_ops == 3
8228 && !(single_defuse_cycle && reduc_index == 2)
8229 ? op.ops[2] : NULL_TREE, &vec_oprnds2);
8230 if (single_defuse_cycle)
8231 {
8232 gcc_assert (!slp_node);
8233 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8234 op.ops[reduc_index],
8235 reduc_index == 0 ? &vec_oprnds0
8236 : (reduc_index == 1 ? &vec_oprnds1
8237 : &vec_oprnds2));
8238 }
8239
8240 bool emulated_mixed_dot_prod
8241 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
8242 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
8243 {
8244 gimple *new_stmt;
8245 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
8246 if (masked_loop_p && !mask_by_cond_expr)
8247 {
8248 /* No conditional ifns have been defined for dot-product yet. */
8249 gcc_assert (code != DOT_PROD_EXPR);
8250
8251 /* Make sure that the reduction accumulator is vop[0]. */
8252 if (reduc_index == 1)
8253 {
8254 gcc_assert (commutative_binary_op_p (code, op.type));
8255 std::swap (vop[0], vop[1]);
8256 }
8257 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8258 vec_num * ncopies, vectype_in, i);
8259 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
8260 vop[0], vop[1], vop[0]);
8261 new_temp = make_ssa_name (vec_dest, call);
8262 gimple_call_set_lhs (call, new_temp);
8263 gimple_call_set_nothrow (call, true);
8264 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8265 new_stmt = call;
8266 }
8267 else
8268 {
8269 if (op.num_ops == 3)
8270 vop[2] = vec_oprnds2[i];
8271
8272 if (masked_loop_p && mask_by_cond_expr)
8273 {
8274 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8275 vec_num * ncopies, vectype_in, i);
8276 build_vect_cond_expr (code, vop, mask, gsi);
8277 }
8278
8279 if (emulated_mixed_dot_prod)
8280 new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8281 vec_dest, vop);
8282 else if (code.is_internal_fn ())
8283 new_stmt = gimple_build_call_internal (internal_fn (code),
8284 op.num_ops,
8285 vop[0], vop[1], vop[2]);
8286 else
8287 new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8288 vop[0], vop[1], vop[2]);
8289 new_temp = make_ssa_name (vec_dest, new_stmt);
8290 gimple_set_lhs (new_stmt, new_temp);
8291 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8292 }
8293
8294 if (slp_node)
8295 slp_node->push_vec_def (new_stmt);
8296 else if (single_defuse_cycle
8297 && i < ncopies - 1)
8298 {
8299 if (reduc_index == 0)
8300 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
8301 else if (reduc_index == 1)
8302 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
8303 else if (reduc_index == 2)
8304 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
8305 }
8306 else
8307 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8308 }
8309
8310 if (!slp_node)
8311 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8312
8313 return true;
8314 }
8315
8316 /* Transform phase of a cycle PHI. */
8317
8318 bool
8319 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8320 stmt_vec_info stmt_info, gimple **vec_stmt,
8321 slp_tree slp_node, slp_instance slp_node_instance)
8322 {
8323 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8324 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8325 int i;
8326 int ncopies;
8327 int j;
8328 bool nested_cycle = false;
8329 int vec_num;
8330
8331 if (nested_in_vect_loop_p (loop, stmt_info))
8332 {
8333 loop = loop->inner;
8334 nested_cycle = true;
8335 }
8336
8337 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
8338 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
8339 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8340 gcc_assert (reduc_info->is_reduc_info);
8341
8342 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8343 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
8344 /* Leave the scalar phi in place. */
8345 return true;
8346
8347 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8348 /* For a nested cycle we do not fill the above. */
8349 if (!vectype_in)
8350 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
8351 gcc_assert (vectype_in);
8352
8353 if (slp_node)
8354 {
8355 /* The size vect_schedule_slp_instance computes is off for us. */
8356 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
8357 * SLP_TREE_LANES (slp_node), vectype_in);
8358 ncopies = 1;
8359 }
8360 else
8361 {
8362 vec_num = 1;
8363 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8364 }
8365
8366 /* Check whether we should use a single PHI node and accumulate
8367 vectors to one before the backedge. */
8368 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
8369 ncopies = 1;
8370
8371 /* Create the destination vector */
8372 gphi *phi = as_a <gphi *> (stmt_info->stmt);
8373 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8374 vectype_out);
8375
8376 /* Get the loop-entry arguments. */
8377 tree vec_initial_def = NULL_TREE;
8378 auto_vec<tree> vec_initial_defs;
8379 if (slp_node)
8380 {
8381 vec_initial_defs.reserve (vec_num);
8382 if (nested_cycle)
8383 {
8384 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8385 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8386 &vec_initial_defs);
8387 }
8388 else
8389 {
8390 gcc_assert (slp_node == slp_node_instance->reduc_phis);
8391 vec<tree> &initial_values = reduc_info->reduc_initial_values;
8392 vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8393
8394 unsigned int num_phis = stmts.length ();
8395 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
8396 num_phis = 1;
8397 initial_values.reserve (num_phis);
8398 for (unsigned int i = 0; i < num_phis; ++i)
8399 {
8400 gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8401 initial_values.quick_push (vect_phi_initial_value (this_phi));
8402 }
8403 if (vec_num == 1)
8404 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8405 if (!initial_values.is_empty ())
8406 {
8407 tree initial_value
8408 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8409 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8410 tree neutral_op
8411 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8412 code, initial_value);
8413 get_initial_defs_for_reduction (loop_vinfo, reduc_info,
8414 &vec_initial_defs, vec_num,
8415 stmts.length (), neutral_op);
8416 }
8417 }
8418 }
8419 else
8420 {
8421 	      /* Get at the scalar def before the loop that defines the initial
8422 value of the reduction variable. */
8423 tree initial_def = vect_phi_initial_value (phi);
8424 reduc_info->reduc_initial_values.safe_push (initial_def);
8425 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
8426 and we can't use zero for induc_val, use initial_def. Similarly
8427 for REDUC_MIN and initial_def larger than the base. */
8428 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8429 {
8430 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
8431 if (TREE_CODE (initial_def) == INTEGER_CST
8432 && !integer_zerop (induc_val)
8433 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
8434 && tree_int_cst_lt (initial_def, induc_val))
8435 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
8436 && tree_int_cst_lt (induc_val, initial_def))))
8437 {
8438 induc_val = initial_def;
8439 	      /* Communicate to epilogue generation that we used
8440 		 the initial_def.  */
8441 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8442 }
8443 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
8444 }
8445 else if (nested_cycle)
8446 {
8447 /* Do not use an adjustment def as that case is not supported
8448 correctly if ncopies is not one. */
8449 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
8450 ncopies, initial_def,
8451 &vec_initial_defs);
8452 }
8453 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
8454 || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
8455 /* Fill the initial vector with the initial scalar value. */
8456 vec_initial_def
8457 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
8458 initial_def, initial_def);
8459 else
8460 {
8461 if (ncopies == 1)
8462 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8463 if (!reduc_info->reduc_initial_values.is_empty ())
8464 {
8465 initial_def = reduc_info->reduc_initial_values[0];
8466 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8467 tree neutral_op
8468 = neutral_op_for_reduction (TREE_TYPE (initial_def),
8469 code, initial_def);
8470 gcc_assert (neutral_op);
8471 /* Try to simplify the vector initialization by applying an
8472 adjustment after the reduction has been performed. */
8473 if (!reduc_info->reused_accumulator
8474 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8475 && !operand_equal_p (neutral_op, initial_def))
8476 {
8477 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
8478 = initial_def;
8479 initial_def = neutral_op;
8480 }
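	      /* E.g. (values invented): a sum reduction whose scalar initial
		 value is 10 is seeded with the neutral value 0 here, and the
		 10 is applied again when the epilogue reduces the vector
		 accumulator to a scalar.  */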
8481 vec_initial_def
8482 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
8483 initial_def, neutral_op);
8484 }
8485 }
8486 }
8487
8488 if (vec_initial_def)
8489 {
8490 vec_initial_defs.create (ncopies);
8491 for (i = 0; i < ncopies; ++i)
8492 vec_initial_defs.quick_push (vec_initial_def);
8493 }
8494
8495 if (auto *accumulator = reduc_info->reused_accumulator)
8496 {
8497 tree def = accumulator->reduc_input;
8498 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8499 {
8500 unsigned int nreduc;
8501 bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8502 (TREE_TYPE (def)),
8503 TYPE_VECTOR_SUBPARTS (vectype_out),
8504 &nreduc);
8505 gcc_assert (res);
8506 gimple_seq stmts = NULL;
8507 /* Reduce the single vector to a smaller one. */
8508 if (nreduc != 1)
8509 {
8510 /* Perform the reduction in the appropriate type. */
8511 tree rvectype = vectype_out;
8512 if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8513 TREE_TYPE (TREE_TYPE (def))))
8514 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8515 TYPE_VECTOR_SUBPARTS
8516 (vectype_out));
8517 def = vect_create_partial_epilog (def, rvectype,
8518 STMT_VINFO_REDUC_CODE
8519 (reduc_info),
8520 &stmts);
8521 }
8522 /* The epilogue loop might use a different vector mode, like
8523 VNx2DI vs. V2DI. */
8524 if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8525 {
8526 tree reduc_type = build_vector_type_for_mode
8527 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8528 def = gimple_convert (&stmts, reduc_type, def);
8529 }
8530 /* Adjust the input so we pick up the partially reduced value
8531 for the skip edge in vect_create_epilog_for_reduction. */
8532 accumulator->reduc_input = def;
8533 /* And the reduction could be carried out using a different sign. */
8534 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8535 def = gimple_convert (&stmts, vectype_out, def);
8536 if (loop_vinfo->main_loop_edge)
8537 {
8538 /* While we'd like to insert on the edge this will split
8539 blocks and disturb bookkeeping, we also will eventually
8540 need this on the skip edge. Rely on sinking to
8541 fixup optimal placement and insert in the pred. */
8542 gimple_stmt_iterator gsi
8543 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
8544 /* Insert before a cond that eventually skips the
8545 epilogue. */
8546 if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8547 gsi_prev (&gsi);
8548 gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8549 }
8550 else
8551 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8552 stmts);
8553 }
8554 if (loop_vinfo->main_loop_edge)
8555 vec_initial_defs[0]
8556 = vect_get_main_loop_result (loop_vinfo, def,
8557 vec_initial_defs[0]);
8558 else
8559 vec_initial_defs.safe_push (def);
8560 }
8561
8562 /* Generate the reduction PHIs upfront. */
8563 for (i = 0; i < vec_num; i++)
8564 {
8565 tree vec_init_def = vec_initial_defs[i];
8566 for (j = 0; j < ncopies; j++)
8567 {
8568 /* Create the reduction-phi that defines the reduction
8569 operand. */
8570 gphi *new_phi = create_phi_node (vec_dest, loop->header);
8571
8572 /* Set the loop-entry arg of the reduction-phi. */
8573 if (j != 0 && nested_cycle)
8574 vec_init_def = vec_initial_defs[j];
8575 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8576 UNKNOWN_LOCATION);
8577
8578 /* The loop-latch arg is set in epilogue processing. */
8579
8580 if (slp_node)
8581 slp_node->push_vec_def (new_phi);
8582 else
8583 {
8584 if (j == 0)
8585 *vec_stmt = new_phi;
8586 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8587 }
8588 }
8589 }
8590
8591 return true;
8592 }
8593
8594 /* Vectorizes LC PHIs. */
8595
8596 bool
8597 vectorizable_lc_phi (loop_vec_info loop_vinfo,
8598 stmt_vec_info stmt_info, gimple **vec_stmt,
8599 slp_tree slp_node)
8600 {
8601 if (!loop_vinfo
8602 || !is_a <gphi *> (stmt_info->stmt)
8603 || gimple_phi_num_args (stmt_info->stmt) != 1)
8604 return false;
8605
8606 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8607 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8608 return false;
8609
8610 if (!vec_stmt) /* transformation not required. */
8611 {
8612 /* Deal with copies from externs or constants that disguise as
8613 loop-closed PHI nodes (PR97886). */
8614 if (slp_node
8615 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8616 SLP_TREE_VECTYPE (slp_node)))
8617 {
8618 if (dump_enabled_p ())
8619 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8620 "incompatible vector types for invariants\n");
8621 return false;
8622 }
8623 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
8624 return true;
8625 }
8626
8627 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8628 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8629 basic_block bb = gimple_bb (stmt_info->stmt);
8630 edge e = single_pred_edge (bb);
8631 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8632 auto_vec<tree> vec_oprnds;
8633 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
8634 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
8635 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8636 for (unsigned i = 0; i < vec_oprnds.length (); i++)
8637 {
8638 /* Create the vectorized LC PHI node. */
8639 gphi *new_phi = create_phi_node (vec_dest, bb);
8640 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8641 if (slp_node)
8642 slp_node->push_vec_def (new_phi);
8643 else
8644 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8645 }
8646 if (!slp_node)
8647 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8648
8649 return true;
8650 }
8651
8652 /* Vectorizes PHIs. */
8653
8654 bool
8655 vectorizable_phi (vec_info *,
8656 stmt_vec_info stmt_info, gimple **vec_stmt,
8657 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8658 {
8659 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
8660 return false;
8661
8662 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
8663 return false;
8664
8665 tree vectype = SLP_TREE_VECTYPE (slp_node);
8666
8667 if (!vec_stmt) /* transformation not required. */
8668 {
8669 slp_tree child;
8670 unsigned i;
8671 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
8672 if (!child)
8673 {
8674 if (dump_enabled_p ())
8675 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8676 "PHI node with unvectorized backedge def\n");
8677 return false;
8678 }
8679 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
8680 {
8681 if (dump_enabled_p ())
8682 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8683 "incompatible vector types for invariants\n");
8684 return false;
8685 }
8686 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8687 && !useless_type_conversion_p (vectype,
8688 SLP_TREE_VECTYPE (child)))
8689 {
8690 /* With bools we can have mask and non-mask precision vectors
8691 	       or different non-mask precisions.  While pattern recog is
8692 	       supposed to guarantee consistency here, bugs in it can cause
8693 mismatches (PR103489 and PR103800 for example).
8694 Deal with them here instead of ICEing later. */
8695 if (dump_enabled_p ())
8696 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8697 "incompatible vector type setup from "
8698 "bool pattern detection\n");
8699 return false;
8700 }
8701
8702 /* For single-argument PHIs assume coalescing which means zero cost
8703 for the scalar and the vector PHIs. This avoids artificially
8704 favoring the vector path (but may pessimize it in some cases). */
8705 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
8706 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8707 vector_stmt, stmt_info, vectype, 0, vect_body);
8708 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
8709 return true;
8710 }
8711
8712 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8713 basic_block bb = gimple_bb (stmt_info->stmt);
8714 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8715 auto_vec<gphi *> new_phis;
8716 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
8717 {
8718 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8719
8720 /* Skip not yet vectorized defs. */
8721 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8722 && SLP_TREE_VEC_DEFS (child).is_empty ())
8723 continue;
8724
8725 auto_vec<tree> vec_oprnds;
8726 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
8727 if (!new_phis.exists ())
8728 {
8729 new_phis.create (vec_oprnds.length ());
8730 for (unsigned j = 0; j < vec_oprnds.length (); j++)
8731 {
8732 /* Create the vectorized LC PHI node. */
8733 new_phis.quick_push (create_phi_node (vec_dest, bb));
8734 slp_node->push_vec_def (new_phis[j]);
8735 }
8736 }
8737 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
8738 for (unsigned j = 0; j < vec_oprnds.length (); j++)
8739 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
8740 }
8741 /* We should have at least one already vectorized child. */
8742 gcc_assert (new_phis.exists ());
8743
8744 return true;
8745 }
8746
8747 /* Vectorizes first order recurrences. An overview of the transformation
8748 is described below. Suppose we have the following loop.
8749
8750 int t = 0;
8751 for (int i = 0; i < n; ++i)
8752 {
8753 b[i] = a[i] - t;
8754 t = a[i];
8755 }
8756
8757 There is a first-order recurrence on 'a'. For this loop, the scalar IR
8758 looks (simplified) like:
8759
8760 scalar.preheader:
8761 init = 0;
8762
8763 scalar.body:
8764 i = PHI <0(scalar.preheader), i+1(scalar.body)>
8765 	 _2 = PHI <init(scalar.preheader), _1(scalar.body)>
8766 _1 = a[i]
8767 b[i] = _1 - _2
8768 if (i < n) goto scalar.body
8769
8770 	 In this example, _2 is a recurrence because its value depends on the
8771 previous iteration. We vectorize this as (VF = 4)
8772
8773 vector.preheader:
8774 vect_init = vect_cst(..., ..., ..., 0)
8775
8776 vector.body
8777 i = PHI <0(vector.preheader), i+4(vector.body)>
8778 vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
8779 vect_2 = a[i, i+1, i+2, i+3];
8780 vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
8781 b[i, i+1, i+2, i+3] = vect_2 - vect_3
8782 if (..) goto vector.body
8783
8784 In this function, vectorizable_recurr, we code generate both the
8785 vector PHI node and the permute since those together compute the
8786 vectorized value of the scalar PHI. We do not yet have the
8787 backedge value to fill in there nor into the vec_perm. Those
8788 are filled in maybe_set_vectorized_backedge_value and
8789 vect_schedule_scc.
8790
8791 TODO: Since the scalar loop does not have a use of the recurrence
8792 	 outside of the loop, the natural way to implement peeling via
8793 vectorizing the live value doesn't work. For now peeling of loops
8794 with a recurrence is not implemented. For SLP the supported cases
8795 are restricted to those requiring a single vector recurrence PHI. */
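/* A purely illustrative trace of the vector body above (VF = 4,
   a = {7, 9, 4, 6, ...}, values invented): on the first iteration
   vect_1 = {_, _, _, 0}, vect_2 = {7, 9, 4, 6}, the permute selects
   lanes {3, 4, 5, 6} of the concatenation {vect_1, vect_2} giving
   vect_3 = {0, 7, 9, 4}, and the subtraction stores
   b = {7-0, 9-7, 4-9, 6-4}, matching the scalar loop.  */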
8796
8797 bool
8798 vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8799 gimple **vec_stmt, slp_tree slp_node,
8800 stmt_vector_for_cost *cost_vec)
8801 {
8802 if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
8803 return false;
8804
8805 gphi *phi = as_a<gphi *> (stmt_info->stmt);
8806
8807 /* So far we only support first-order recurrence auto-vectorization. */
8808 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
8809 return false;
8810
8811 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8812 unsigned ncopies;
8813 if (slp_node)
8814 ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8815 else
8816 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8817 poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8818 unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
8819 /* We need to be able to make progress with a single vector. */
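  /* E.g. (numbers invented): with 4-element vectors a single recurrence
     (dist == 1) or an SLP group of two lanes passes this check, while a
     group of three lanes is rejected (3 * 2 == 6 > 4).  */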
8820 if (maybe_gt (dist * 2, nunits))
8821 {
8822 if (dump_enabled_p ())
8823 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8824 "first order recurrence exceeds half of "
8825 "a vector\n");
8826 return false;
8827 }
8828
8829 /* First-order recurrence autovectorization needs to handle permutation
8830 with indices = [nunits-1, nunits, nunits+1, ...]. */
8831 vec_perm_builder sel (nunits, 1, 3);
8832 for (int i = 0; i < 3; ++i)
8833 sel.quick_push (nunits - dist + i);
8834 vec_perm_indices indices (sel, 2, nunits);
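  /* E.g. (purely illustrative): for nunits == 4 and a single lane
     (dist == 1) this builds the permutation {3, 4, 5, 6}; for an SLP
     group of two lanes (dist == 2) it builds {2, 3, 4, 5}.  */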
8835
8836 if (!vec_stmt) /* transformation not required. */
8837 {
8838 if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
8839 indices))
8840 return false;
8841
8842 if (slp_node)
8843 {
8844 /* We eventually need to set a vector type on invariant
8845 arguments. */
8846 unsigned j;
8847 slp_tree child;
8848 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
8849 if (!vect_maybe_update_slp_op_vectype
8850 (child, SLP_TREE_VECTYPE (slp_node)))
8851 {
8852 if (dump_enabled_p ())
8853 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8854 "incompatible vector types for "
8855 "invariants\n");
8856 return false;
8857 }
8858 }
8859 /* The recurrence costs the initialization vector and one permute
8860 for each copy. */
8861 unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
8862 stmt_info, 0, vect_prologue);
8863 unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8864 stmt_info, 0, vect_body);
8865 if (dump_enabled_p ())
8866 dump_printf_loc (MSG_NOTE, vect_location,
8867 "vectorizable_recurr: inside_cost = %d, "
8868 "prologue_cost = %d .\n", inside_cost,
8869 prologue_cost);
8870
8871 STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
8872 return true;
8873 }
8874
8875 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
8876 basic_block bb = gimple_bb (phi);
8877 tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
8878 if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
8879 {
8880 gimple_seq stmts = NULL;
8881 preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
8882 gsi_insert_seq_on_edge_immediate (pe, stmts);
8883 }
8884 tree vec_init = build_vector_from_val (vectype, preheader);
8885 vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
8886
8887 /* Create the vectorized first-order PHI node. */
8888 tree vec_dest = vect_get_new_vect_var (vectype,
8889 vect_simple_var, "vec_recur_");
8890 gphi *new_phi = create_phi_node (vec_dest, bb);
8891 add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
8892
8893   /* Insert the shuffles needed for first-order recurrence autovectorization:
8894 result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
8895 tree perm = vect_gen_perm_mask_checked (vectype, indices);
8896
8897 /* Insert the required permute after the latch definition. The
8898 second and later operands are tentative and will be updated when we have
8899 vectorized the latch definition. */
8900 edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
8901 gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
8902 gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
8903 gsi_next (&gsi2);
8904
8905 for (unsigned i = 0; i < ncopies; ++i)
8906 {
8907 vec_dest = make_ssa_name (vectype);
8908 gassign *vperm
8909 = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
8910 i == 0 ? gimple_phi_result (new_phi) : NULL,
8911 NULL, perm);
8912 vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
8913
8914 if (slp_node)
8915 slp_node->push_vec_def (vperm);
8916 else
8917 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
8918 }
8919
8920 if (!slp_node)
8921 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8922 return true;
8923 }
8924
8925 /* Return true if VECTYPE represents a vector that requires lowering
8926 by the vector lowering pass. */
8927
8928 bool
8929 vect_emulated_vector_p (tree vectype)
8930 {
8931 return (!VECTOR_MODE_P (TYPE_MODE (vectype))
8932 && (!VECTOR_BOOLEAN_TYPE_P (vectype)
8933 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
8934 }
8935
8936 /* Return true if we can emulate CODE on an integer mode representation
8937 of a vector. */
8938
8939 bool
8940 vect_can_vectorize_without_simd_p (tree_code code)
8941 {
8942 switch (code)
8943 {
8944 case PLUS_EXPR:
8945 case MINUS_EXPR:
8946 case NEGATE_EXPR:
8947 case BIT_AND_EXPR:
8948 case BIT_IOR_EXPR:
8949 case BIT_XOR_EXPR:
8950 case BIT_NOT_EXPR:
8951 return true;
8952
8953 default:
8954 return false;
8955 }
8956 }
8957
8958 /* Likewise, but taking a code_helper. */
8959
8960 bool
8961 vect_can_vectorize_without_simd_p (code_helper code)
8962 {
8963 return (code.is_tree_code ()
8964 && vect_can_vectorize_without_simd_p (tree_code (code)));
8965 }
8966
8967 /* Create vector init for vectorized iv. */
8968 static tree
8969 vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
8970 tree step_expr, poly_uint64 nunits,
8971 tree vectype,
8972 enum vect_induction_op_type induction_type)
8973 {
8974 unsigned HOST_WIDE_INT const_nunits;
8975 tree vec_shift, vec_init, new_name;
8976 unsigned i;
8977 tree itype = TREE_TYPE (vectype);
8978
8979   /* iv_loop is the loop to be vectorized.  Create the vector holding the
8980      first VF values of the nonlinear IV (X = init_expr, S = step_expr).  */
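  /* Purely illustrative shapes of the result for 4 lanes (values invented):
     shr/shl with step S:  [X, X >> S, X >> 2*S, X >> 3*S]  (or << for shl),
     neg:                  [X, -X, X, -X],
     mul with step S:      [X, X*S, X*S^2, X*S^3].  */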
8981 new_name = gimple_convert (stmts, itype, init_expr);
8982 switch (induction_type)
8983 {
8984 case vect_step_op_shr:
8985 case vect_step_op_shl:
8986 /* Build the Initial value from shift_expr. */
8987 vec_init = gimple_build_vector_from_val (stmts,
8988 vectype,
8989 new_name);
8990 vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
8991 build_zero_cst (itype), step_expr);
8992 vec_init = gimple_build (stmts,
8993 (induction_type == vect_step_op_shr
8994 ? RSHIFT_EXPR : LSHIFT_EXPR),
8995 vectype, vec_init, vec_shift);
8996 break;
8997
8998 case vect_step_op_neg:
8999 {
9000 vec_init = gimple_build_vector_from_val (stmts,
9001 vectype,
9002 new_name);
9003 tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
9004 vectype, vec_init);
9005 /* The encoding has 2 interleaved stepped patterns. */
9006 vec_perm_builder sel (nunits, 2, 3);
9007 sel.quick_grow (6);
9008 for (i = 0; i < 3; i++)
9009 {
9010 sel[2 * i] = i;
9011 sel[2 * i + 1] = i + nunits;
9012 }
9013 vec_perm_indices indices (sel, 2, nunits);
9014 /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
9015 	   fail when vec_init is a const vector.  In that situation the vec_perm is
9016 really needed. */
9017 tree perm_mask_even
9018 = vect_gen_perm_mask_any (vectype, indices);
9019 vec_init = gimple_build (stmts, VEC_PERM_EXPR,
9020 vectype,
9021 vec_init, vec_neg,
9022 perm_mask_even);
9023 }
9024 break;
9025
9026 case vect_step_op_mul:
9027 {
9028 	/* Use an unsigned mult to avoid undefined behavior on signed integer
	   overflow.  */
9029 gcc_assert (nunits.is_constant (&const_nunits));
9030 tree utype = unsigned_type_for (itype);
9031 tree uvectype = build_vector_type (utype,
9032 TYPE_VECTOR_SUBPARTS (vectype));
9033 new_name = gimple_convert (stmts, utype, new_name);
9034 vec_init = gimple_build_vector_from_val (stmts,
9035 uvectype,
9036 new_name);
9037 tree_vector_builder elts (uvectype, const_nunits, 1);
9038 tree elt_step = build_one_cst (utype);
9039
9040 elts.quick_push (elt_step);
9041 for (i = 1; i < const_nunits; i++)
9042 {
9043 	    /* Create: elt_step_i = elt_step_{i-1} * step_expr, i.e. pow (step_expr, i).  */
9044 elt_step = gimple_build (stmts, MULT_EXPR,
9045 utype, elt_step, step_expr);
9046 elts.quick_push (elt_step);
9047 }
9048 /* Create a vector from [new_name_0, new_name_1, ...,
9049 new_name_nunits-1]. */
9050 tree vec_mul = gimple_build_vector (stmts, &elts);
9051 vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
9052 vec_init, vec_mul);
9053 vec_init = gimple_convert (stmts, vectype, vec_init);
9054 }
9055 break;
9056
9057 default:
9058 gcc_unreachable ();
9059 }
9060
9061 return vec_init;
9062 }
9063
9064 /* Peel init_expr by skip_niters for induction_type.  */
9065 tree
9066 vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9067 tree skip_niters, tree step_expr,
9068 enum vect_induction_op_type induction_type)
9069 {
9070 gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
9071 tree type = TREE_TYPE (init_expr);
9072 unsigned prec = TYPE_PRECISION (type);
9073 switch (induction_type)
9074 {
9075 case vect_step_op_neg:
9076 if (TREE_INT_CST_LOW (skip_niters) % 2)
9077 init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
9078 /* else no change. */
9079 break;
9080
9081 case vect_step_op_shr:
9082 case vect_step_op_shl:
9083 skip_niters = gimple_convert (stmts, type, skip_niters);
9084 step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
9085       /* When the shift amount >= precision, we need to avoid undefined
9086 	 behavior.  The original loop has none, and according to the semantics
9087 	 init_expr should be 0 for lshr and ashl, and >>= (prec - 1) for ashr.  */
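      /* E.g. (values invented): for an unsigned char IV with step_expr 3 and
	 skip_niters 3 the combined shift is 9 >= 8 == prec, so init_expr
	 collapses to 0; with skip_niters 2 it is simply shifted by 6.  */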
9088 if (!tree_fits_uhwi_p (step_expr)
9089 || tree_to_uhwi (step_expr) >= prec)
9090 {
9091 if (induction_type == vect_step_op_shl
9092 || TYPE_UNSIGNED (type))
9093 init_expr = build_zero_cst (type);
9094 else
9095 init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
9096 init_expr,
9097 wide_int_to_tree (type, prec - 1));
9098 }
9099 else
9100 init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
9101 ? RSHIFT_EXPR : LSHIFT_EXPR),
9102 type, init_expr, step_expr);
9103 break;
9104
9105 case vect_step_op_mul:
9106 {
9107 tree utype = unsigned_type_for (type);
9108 init_expr = gimple_convert (stmts, utype, init_expr);
9109 unsigned skipn = TREE_INT_CST_LOW (skip_niters);
9110 wide_int begin = wi::to_wide (step_expr);
9111 for (unsigned i = 0; i != skipn - 1; i++)
9112 begin = wi::mul (begin, wi::to_wide (step_expr));
9113 tree mult_expr = wide_int_to_tree (utype, begin);
9114 init_expr = gimple_build (stmts, MULT_EXPR, utype, init_expr, mult_expr);
9115 init_expr = gimple_convert (stmts, type, init_expr);
9116 }
9117 break;
9118
9119 default:
9120 gcc_unreachable ();
9121 }
9122
9123 return init_expr;
9124 }
9125
9126 /* Create vector step for vectorized iv. */
9127 static tree
9128 vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9129 poly_uint64 vf,
9130 enum vect_induction_op_type induction_type)
9131 {
9132 tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9133 tree new_name = NULL;
9134 /* Step should be pow (step, vf) for mult induction. */
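  /* E.g. (values invented): with VF = 4 a mul induction with step 3 gets a
     step of pow (3, 4) = 81, a shift induction with step 2 gets 2 * 4 = 8,
     and neg needs no step at all.  */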
9135 if (induction_type == vect_step_op_mul)
9136 {
9137 gcc_assert (vf.is_constant ());
9138 wide_int begin = wi::to_wide (step_expr);
9139
9140 for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9141 begin = wi::mul (begin, wi::to_wide (step_expr));
9142
9143 new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9144 }
9145 else if (induction_type == vect_step_op_neg)
9146 /* Do nothing. */
9147 ;
9148 else
9149 new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9150 expr, step_expr);
9151 return new_name;
9152 }
9153
9154 static tree
9155 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9156 stmt_vec_info stmt_info,
9157 tree new_name, tree vectype,
9158 enum vect_induction_op_type induction_type)
9159 {
9160 /* No step is needed for neg induction. */
9161 if (induction_type == vect_step_op_neg)
9162 return NULL;
9163
9164 tree t = unshare_expr (new_name);
9165 gcc_assert (CONSTANT_CLASS_P (new_name)
9166 || TREE_CODE (new_name) == SSA_NAME);
9167 tree new_vec = build_vector_from_val (vectype, t);
9168 tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9169 new_vec, vectype, NULL);
9170 return vec_step;
9171 }
9172
9173 /* Update the vectorized iv with vec_step; induc_def is the initial value.  */
9174 static tree
9175 vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9176 tree induc_def, tree vec_step,
9177 enum vect_induction_op_type induction_type)
9178 {
9179 tree vec_def = induc_def;
9180 switch (induction_type)
9181 {
9182 case vect_step_op_mul:
9183 {
9184 	/* Use an unsigned mult to avoid undefined behavior on signed integer
	   overflow.  */
9185 tree uvectype
9186 = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
9187 TYPE_VECTOR_SUBPARTS (vectype));
9188 vec_def = gimple_convert (stmts, uvectype, vec_def);
9189 vec_step = gimple_convert (stmts, uvectype, vec_step);
9190 vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9191 vec_def, vec_step);
9192 vec_def = gimple_convert (stmts, vectype, vec_def);
9193 }
9194 break;
9195
9196 case vect_step_op_shr:
9197 vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9198 vec_def, vec_step);
9199 break;
9200
9201 case vect_step_op_shl:
9202 vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9203 vec_def, vec_step);
9204 break;
9205 case vect_step_op_neg:
9206 vec_def = induc_def;
9207 /* Do nothing. */
9208 break;
9209 default:
9210 gcc_unreachable ();
9211 }
9212
9213 return vec_def;
9214
9215 }
9216
9217 /* Function vectorizable_induction
9218
9219    Check if STMT_INFO performs a nonlinear induction computation that can be
9220 vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9221 a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9222 basic block.
9223 Return true if STMT_INFO is vectorizable in this way. */
9224
9225 static bool
9226 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9227 stmt_vec_info stmt_info,
9228 gimple **vec_stmt, slp_tree slp_node,
9229 stmt_vector_for_cost *cost_vec)
9230 {
9231 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9232 unsigned ncopies;
9233 bool nested_in_vect_loop = false;
9234 class loop *iv_loop;
9235 tree vec_def;
9236 edge pe = loop_preheader_edge (loop);
9237 basic_block new_bb;
9238 tree vec_init, vec_step;
9239 tree new_name;
9240 gimple *new_stmt;
9241 gphi *induction_phi;
9242 tree induc_def, vec_dest;
9243 tree init_expr, step_expr;
9244 tree niters_skip;
9245 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9246 unsigned i;
9247 gimple_stmt_iterator si;
9248
9249 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9250
9251 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9252 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9253 enum vect_induction_op_type induction_type
9254 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9255
9256 gcc_assert (induction_type > vect_step_op_add);
9257
9258 if (slp_node)
9259 ncopies = 1;
9260 else
9261 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9262 gcc_assert (ncopies >= 1);
9263
9264 /* FORNOW. Only handle nonlinear induction in the same loop. */
9265 if (nested_in_vect_loop_p (loop, stmt_info))
9266 {
9267 if (dump_enabled_p ())
9268 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9269 "nonlinear induction in nested loop.\n");
9270 return false;
9271 }
9272
9273 iv_loop = loop;
9274 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9275
9276   /* TODO: Support slp for nonlinear iv.  There should be a separate vector iv
9277      update for each iv and a permutation to generate the wanted vector iv.  */
9278 if (slp_node)
9279 {
9280 if (dump_enabled_p ())
9281 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9282 "SLP induction not supported for nonlinear"
9283 " induction.\n");
9284 return false;
9285 }
9286
9287 if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9288 {
9289 if (dump_enabled_p ())
9290 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9291 "floating point nonlinear induction vectorization"
9292 " not supported.\n");
9293 return false;
9294 }
9295
9296 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9297 init_expr = vect_phi_initial_value (phi);
9298 gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9299 && TREE_CODE (step_expr) == INTEGER_CST);
9300 /* step_expr should be aligned with init_expr,
9301      i.e. for uint64 a >> 1 the step is int, but a vector<uint64> shift is used.  */
9302 step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9303
9304 if (TREE_CODE (init_expr) == INTEGER_CST)
9305 init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9306 else
9307 gcc_assert (tree_nop_conversion_p (TREE_TYPE (vectype),
9308 TREE_TYPE (init_expr)));
9309
9310 switch (induction_type)
9311 {
9312 case vect_step_op_neg:
9313 if (TREE_CODE (init_expr) != INTEGER_CST
9314 && TREE_CODE (init_expr) != REAL_CST)
9315 {
9316 /* Check for backend support of NEGATE_EXPR and vec_perm. */
9317 if (!directly_supported_p (NEGATE_EXPR, vectype))
9318 return false;
9319
9320 /* The encoding has 2 interleaved stepped patterns. */
9321 vec_perm_builder sel (nunits, 2, 3);
9322 machine_mode mode = TYPE_MODE (vectype);
9323 sel.quick_grow (6);
9324 for (i = 0; i < 3; i++)
9325 {
9326 sel[i * 2] = i;
9327 sel[i * 2 + 1] = i + nunits;
9328 }
9329 vec_perm_indices indices (sel, 2, nunits);
9330 if (!can_vec_perm_const_p (mode, mode, indices))
9331 return false;
9332 }
9333 break;
9334
9335 case vect_step_op_mul:
9336 {
9337 /* Check for backend support of MULT_EXPR. */
9338 if (!directly_supported_p (MULT_EXPR, vectype))
9339 return false;
9340
9341 	/* ?? How to construct the vector step for variable-length vectors:
9342 	   [ 1, step, pow (step, 2), pow (step, 4), .. ].  */
9343 if (!vf.is_constant ())
9344 return false;
9345 }
9346 break;
9347
9348 case vect_step_op_shr:
9349 /* Check for backend support of RSHIFT_EXPR. */
9350 if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9351 return false;
9352
9353       /* Don't shift more than the type precision to avoid undefined behavior.  */
9354 if (!tree_fits_uhwi_p (step_expr)
9355 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9356 TYPE_PRECISION (TREE_TYPE (init_expr))))
9357 return false;
9358 break;
9359
9360 case vect_step_op_shl:
9361       /* Check for backend support of LSHIFT_EXPR.  */
9362 if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9363 return false;
9364
9365       /* Don't shift more than the type precision to avoid undefined behavior.  */
9366 if (!tree_fits_uhwi_p (step_expr)
9367 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9368 TYPE_PRECISION (TREE_TYPE (init_expr))))
9369 return false;
9370
9371 break;
9372
9373 default:
9374 gcc_unreachable ();
9375 }
9376
9377 if (!vec_stmt) /* transformation not required. */
9378 {
9379 unsigned inside_cost = 0, prologue_cost = 0;
9380 /* loop cost for vec_loop. Neg induction doesn't have any
9381 inside_cost. */
9382 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9383 stmt_info, 0, vect_body);
9384
9385       /* Neg induction doesn't have any inside_cost,
9386 	 so zero it out.  */
9387 if (induction_type == vect_step_op_neg)
9388 inside_cost = 0;
9389
9390 /* prologue cost for vec_init and vec_step. */
9391 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9392 stmt_info, 0, vect_prologue);
9393
9394 if (dump_enabled_p ())
9395 dump_printf_loc (MSG_NOTE, vect_location,
9396 "vect_model_induction_cost: inside_cost = %d, "
9397 "prologue_cost = %d. \n", inside_cost,
9398 prologue_cost);
9399
9400 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9401 DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9402 return true;
9403 }
9404
9405 /* Transform. */
9406
9407 /* Compute a vector variable, initialized with the first VF values of
9408      the induction variable.  E.g., for an iv with IV_PHI='X' and
9409      step S, for a vector of 4 units and a right-shift evolution, we want
9410      to compute: [X, X >> S, X >> 2*S, X >> 3*S].  */
9411
9412 if (dump_enabled_p ())
9413 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9414
9415 pe = loop_preheader_edge (iv_loop);
9416 /* Find the first insertion point in the BB. */
9417 basic_block bb = gimple_bb (phi);
9418 si = gsi_after_labels (bb);
9419
9420 gimple_seq stmts = NULL;
9421
9422 niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9423 /* If we are using the loop mask to "peel" for alignment then we need
9424 to adjust the start value here. */
9425 if (niters_skip != NULL_TREE)
9426 init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9427 step_expr, induction_type);
9428
9429 vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9430 step_expr, nunits, vectype,
9431 induction_type);
9432 if (stmts)
9433 {
9434 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9435 gcc_assert (!new_bb);
9436 }
9437
9438 stmts = NULL;
9439 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9440 vf, induction_type);
9441 if (stmts)
9442 {
9443 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9444 gcc_assert (!new_bb);
9445 }
9446
9447 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9448 new_name, vectype,
9449 induction_type);
9450 /* Create the following def-use cycle:
9451 loop prolog:
9452 vec_init = ...
9453 vec_step = ...
9454 loop:
9455 vec_iv = PHI <vec_init, vec_loop>
9456 ...
9457 STMT
9458 ...
9459 vec_loop = vec_iv + vec_step; */
9460
9461 /* Create the induction-phi that defines the induction-operand. */
9462 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9463 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9464 induc_def = PHI_RESULT (induction_phi);
9465
9466 /* Create the iv update inside the loop. */
9467 stmts = NULL;
9468 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9469 induc_def, vec_step,
9470 induction_type);
9471
9472 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9473 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9474
9475 /* Set the arguments of the phi node: */
9476 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9477 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9478 UNKNOWN_LOCATION);
9479
9480 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9481 *vec_stmt = induction_phi;
9482
9483   /* In case the vectorization factor (VF) is bigger than the number
9484      of elements that we can fit in a vectype (nunits), we have to generate
9485      more than one vector stmt, i.e. we need to "unroll" the
9486 vector stmt by a factor VF/nunits. For more details see documentation
9487 in vectorizable_operation. */
9488
9489 if (ncopies > 1)
9490 {
9491 stmts = NULL;
9492 /* FORNOW. This restriction should be relaxed. */
9493 gcc_assert (!nested_in_vect_loop);
9494
9495 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9496 nunits, induction_type);
9497
9498 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9499 new_name, vectype,
9500 induction_type);
9501 vec_def = induc_def;
9502 for (i = 1; i < ncopies; i++)
9503 {
9504 /* vec_i = vec_prev + vec_step. */
9505 stmts = NULL;
9506 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9507 vec_def, vec_step,
9508 induction_type);
9509 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9510 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9511 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9512 }
9513 }
9514
9515 if (dump_enabled_p ())
9516 dump_printf_loc (MSG_NOTE, vect_location,
9517 "transform induction: created def-use cycle: %G%G",
9518 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9519
9520 return true;
9521 }
9522
9523 /* Function vectorizable_induction
9524
9525 Check if STMT_INFO performs an induction computation that can be vectorized.
9526 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9527 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9528 Return true if STMT_INFO is vectorizable in this way. */
9529
9530 bool
9531 vectorizable_induction (loop_vec_info loop_vinfo,
9532 stmt_vec_info stmt_info,
9533 gimple **vec_stmt, slp_tree slp_node,
9534 stmt_vector_for_cost *cost_vec)
9535 {
9536 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9537 unsigned ncopies;
9538 bool nested_in_vect_loop = false;
9539 class loop *iv_loop;
9540 tree vec_def;
9541 edge pe = loop_preheader_edge (loop);
9542 basic_block new_bb;
9543 tree new_vec, vec_init, vec_step, t;
9544 tree new_name;
9545 gimple *new_stmt;
9546 gphi *induction_phi;
9547 tree induc_def, vec_dest;
9548 tree init_expr, step_expr;
9549 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9550 unsigned i;
9551 tree expr;
9552 gimple_stmt_iterator si;
9553 enum vect_induction_op_type induction_type
9554 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9555
9556 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9557 if (!phi)
9558 return false;
9559
9560 if (!STMT_VINFO_RELEVANT_P (stmt_info))
9561 return false;
9562
9563 /* Make sure it was recognized as induction computation. */
9564 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9565 return false;
9566
9567 /* Handle nonlinear induction in a separate place. */
9568 if (induction_type != vect_step_op_add)
9569 return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9570 vec_stmt, slp_node, cost_vec);
9571
9572 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9573 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9574
9575 if (slp_node)
9576 ncopies = 1;
9577 else
9578 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9579 gcc_assert (ncopies >= 1);
9580
9581 /* FORNOW. These restrictions should be relaxed. */
9582 if (nested_in_vect_loop_p (loop, stmt_info))
9583 {
9584 imm_use_iterator imm_iter;
9585 use_operand_p use_p;
9586 gimple *exit_phi;
9587 edge latch_e;
9588 tree loop_arg;
9589
9590 if (ncopies > 1)
9591 {
9592 if (dump_enabled_p ())
9593 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9594 "multiple types in nested loop.\n");
9595 return false;
9596 }
9597
9598 exit_phi = NULL;
9599 latch_e = loop_latch_edge (loop->inner);
9600 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9601 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9602 {
9603 gimple *use_stmt = USE_STMT (use_p);
9604 if (is_gimple_debug (use_stmt))
9605 continue;
9606
9607 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9608 {
9609 exit_phi = use_stmt;
9610 break;
9611 }
9612 }
9613 if (exit_phi)
9614 {
9615 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
9616 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
9617 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
9618 {
9619 if (dump_enabled_p ())
9620 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9621 "inner-loop induction only used outside "
9622 "of the outer vectorized loop.\n");
9623 return false;
9624 }
9625 }
9626
9627 nested_in_vect_loop = true;
9628 iv_loop = loop->inner;
9629 }
9630 else
9631 iv_loop = loop;
9632 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9633
9634 if (slp_node && !nunits.is_constant ())
9635 {
9636 /* The current SLP code creates the step value element-by-element. */
9637 if (dump_enabled_p ())
9638 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9639 "SLP induction not supported for variable-length"
9640 " vectors.\n");
9641 return false;
9642 }
9643
9644 if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
9645 {
9646 if (dump_enabled_p ())
9647 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9648 "floating point induction vectorization disabled\n");
9649 return false;
9650 }
9651
9652 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9653 gcc_assert (step_expr != NULL_TREE);
9654 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
9655
9656 /* Check for backend support of PLUS/MINUS_EXPR. */
9657 if (!directly_supported_p (PLUS_EXPR, step_vectype)
9658 || !directly_supported_p (MINUS_EXPR, step_vectype))
9659 return false;
9660
9661 if (!vec_stmt) /* transformation not required. */
9662 {
9663 unsigned inside_cost = 0, prologue_cost = 0;
9664 if (slp_node)
9665 {
9666 /* We eventually need to set a vector type on invariant
9667 arguments. */
9668 unsigned j;
9669 slp_tree child;
9670 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9671 if (!vect_maybe_update_slp_op_vectype
9672 (child, SLP_TREE_VECTYPE (slp_node)))
9673 {
9674 if (dump_enabled_p ())
9675 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9676 "incompatible vector types for "
9677 "invariants\n");
9678 return false;
9679 }
9680 /* loop cost for vec_loop. */
9681 inside_cost
9682 = record_stmt_cost (cost_vec,
9683 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9684 vector_stmt, stmt_info, 0, vect_body);
9685 /* prologue cost for vec_init (if not nested) and step. */
9686 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
9687 scalar_to_vec,
9688 stmt_info, 0, vect_prologue);
9689 }
9690 else /* if (!slp_node) */
9691 {
9692 /* loop cost for vec_loop. */
9693 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9694 stmt_info, 0, vect_body);
9695 /* prologue cost for vec_init and vec_step. */
9696 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9697 stmt_info, 0, vect_prologue);
9698 }
9699 if (dump_enabled_p ())
9700 dump_printf_loc (MSG_NOTE, vect_location,
9701 "vect_model_induction_cost: inside_cost = %d, "
9702 "prologue_cost = %d .\n", inside_cost,
9703 prologue_cost);
9704
9705 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9706 DUMP_VECT_SCOPE ("vectorizable_induction");
9707 return true;
9708 }
9709
9710 /* Transform. */
9711
9712 /* Compute a vector variable, initialized with the first VF values of
9713 the induction variable. E.g., for an iv with IV_PHI='X' and
9714 evolution S, for a vector of 4 units, we want to compute:
9715 [X, X + S, X + 2*S, X + 3*S]. */
9716
9717 if (dump_enabled_p ())
9718 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9719
9720 pe = loop_preheader_edge (iv_loop);
9721 /* Find the first insertion point in the BB. */
9722 basic_block bb = gimple_bb (phi);
9723 si = gsi_after_labels (bb);
9724
9725 /* For SLP induction we have to generate several IVs as for example
9726 with group size 3 we need
9727 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
9728 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
9729 if (slp_node)
9730 {
9731 /* Enforced above. */
9732 unsigned int const_nunits = nunits.to_constant ();
9733
9734 /* The initial values are vectorized, but any lanes > group_size
9735 need adjustment. */
9736 slp_tree init_node
9737 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
9738
9739 /* Gather steps. Since we do not vectorize inductions as
9740 cycles we have to reconstruct the step from SCEV data. */
9741 unsigned group_size = SLP_TREE_LANES (slp_node);
9742 tree *steps = XALLOCAVEC (tree, group_size);
9743 tree *inits = XALLOCAVEC (tree, group_size);
9744 stmt_vec_info phi_info;
9745 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
9746 {
9747 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
9748 if (!init_node)
9749 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
9750 pe->dest_idx);
9751 }
9752
9753 /* Now generate the IVs. */
9754 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9755 gcc_assert ((const_nunits * nvects) % group_size == 0);
9756 unsigned nivs;
9757 if (nested_in_vect_loop)
9758 nivs = nvects;
9759 else
9760 {
9761 /* Compute the number of distinct IVs we need. First reduce
9762 group_size if it is a multiple of const_nunits so we get
9763 one IV for a group_size of 4 but const_nunits 2. */
9764 unsigned group_sizep = group_size;
9765 if (group_sizep % const_nunits == 0)
9766 group_sizep = group_sizep / const_nunits;
9767 nivs = least_common_multiple (group_sizep,
9768 const_nunits) / const_nunits;
9769 }
9770 tree stept = TREE_TYPE (step_vectype);
9771 tree lupdate_mul = NULL_TREE;
9772 if (!nested_in_vect_loop)
9773 {
9774 /* The number of iterations covered in one vector iteration. */
9775 unsigned lup_mul = (nvects * const_nunits) / group_size;
9776 lupdate_mul
9777 = build_vector_from_val (step_vectype,
9778 SCALAR_FLOAT_TYPE_P (stept)
9779 ? build_real_from_wide (stept, lup_mul,
9780 UNSIGNED)
9781 : build_int_cstu (stept, lup_mul));
9782 }
9783 tree peel_mul = NULL_TREE;
9784 gimple_seq init_stmts = NULL;
9785 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
9786 {
9787 if (SCALAR_FLOAT_TYPE_P (stept))
9788 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
9789 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9790 else
9791 peel_mul = gimple_convert (&init_stmts, stept,
9792 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9793 peel_mul = gimple_build_vector_from_val (&init_stmts,
9794 step_vectype, peel_mul);
9795 }
9796 unsigned ivn;
9797 auto_vec<tree> vec_steps;
9798 for (ivn = 0; ivn < nivs; ++ivn)
9799 {
9800 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
9801 tree_vector_builder init_elts (vectype, const_nunits, 1);
9802 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
9803 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
9804 {
9805 /* The scalar steps of the IVs. */
9806 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
9807 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
9808 step_elts.quick_push (elt);
9809 if (!init_node)
9810 {
9811 /* The scalar inits of the IVs if not vectorized. */
9812 elt = inits[(ivn*const_nunits + eltn) % group_size];
9813 if (!useless_type_conversion_p (TREE_TYPE (vectype),
9814 TREE_TYPE (elt)))
9815 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9816 TREE_TYPE (vectype), elt);
9817 init_elts.quick_push (elt);
9818 }
9819 /* The number of steps to add to the initial values. */
9820 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
9821 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
9822 ? build_real_from_wide (stept,
9823 mul_elt, UNSIGNED)
9824 : build_int_cstu (stept, mul_elt));
9825 }
9826 vec_step = gimple_build_vector (&init_stmts, &step_elts);
9827 vec_steps.safe_push (vec_step);
9828 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
9829 if (peel_mul)
9830 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9831 step_mul, peel_mul);
9832 if (!init_node)
9833 vec_init = gimple_build_vector (&init_stmts, &init_elts);
9834
9835 /* Create the induction-phi that defines the induction-operand. */
9836 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
9837 "vec_iv_");
9838 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9839 induc_def = PHI_RESULT (induction_phi);
9840
9841 /* Create the iv update inside the loop */
9842 tree up = vec_step;
9843 if (lupdate_mul)
9844 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9845 vec_step, lupdate_mul);
9846 gimple_seq stmts = NULL;
9847 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
9848 vec_def = gimple_build (&stmts,
9849 PLUS_EXPR, step_vectype, vec_def, up);
9850 vec_def = gimple_convert (&stmts, vectype, vec_def);
9851 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9852 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9853 UNKNOWN_LOCATION);
9854
9855 if (init_node)
9856 vec_init = vect_get_slp_vect_def (init_node, ivn);
9857 if (!nested_in_vect_loop
9858 && !integer_zerop (step_mul))
9859 {
9860 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
9861 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9862 vec_step, step_mul);
9863 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9864 vec_def, up);
9865 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
9866 }
9867
9868 /* Set the arguments of the phi node: */
9869 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9870
9871 slp_node->push_vec_def (induction_phi);
9872 }
9873 if (!nested_in_vect_loop)
9874 {
9875 /* Fill up to the number of vectors we need for the whole group. */
9876 nivs = least_common_multiple (group_size,
9877 const_nunits) / const_nunits;
9878 vec_steps.reserve (nivs-ivn);
9879 for (; ivn < nivs; ++ivn)
9880 {
9881 slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
9882 vec_steps.quick_push (vec_steps[0]);
9883 }
9884 }
9885
9886 /* Re-use IVs when we can. We are generating further vector
9887 stmts by adding VF' * stride to the IVs generated above. */
9888 if (ivn < nvects)
9889 {
9890 unsigned vfp
9891 = least_common_multiple (group_size, const_nunits) / group_size;
9892 tree lupdate_mul
9893 = build_vector_from_val (step_vectype,
9894 SCALAR_FLOAT_TYPE_P (stept)
9895 ? build_real_from_wide (stept,
9896 vfp, UNSIGNED)
9897 : build_int_cstu (stept, vfp));
9898 for (; ivn < nvects; ++ivn)
9899 {
9900 gimple *iv
9901 = SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
9902 tree def = gimple_get_lhs (iv);
9903 if (ivn < 2*nivs)
9904 vec_steps[ivn - nivs]
9905 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9906 vec_steps[ivn - nivs], lupdate_mul);
9907 gimple_seq stmts = NULL;
9908 def = gimple_convert (&stmts, step_vectype, def);
9909 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
9910 def, vec_steps[ivn % nivs]);
9911 def = gimple_convert (&stmts, vectype, def);
9912 if (gimple_code (iv) == GIMPLE_PHI)
9913 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9914 else
9915 {
9916 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
9917 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
9918 }
9919 slp_node->push_vec_def (def);
9920 }
9921 }
9922
9923 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
9924 gcc_assert (!new_bb);
9925
9926 return true;
9927 }
9928
9929 init_expr = vect_phi_initial_value (phi);
9930
9931 gimple_seq stmts = NULL;
9932 if (!nested_in_vect_loop)
9933 {
9934 /* Convert the initial value to the IV update type. */
9935 tree new_type = TREE_TYPE (step_expr);
9936 init_expr = gimple_convert (&stmts, new_type, init_expr);
9937
9938 /* If we are using the loop mask to "peel" for alignment then we need
9939 to adjust the start value here. */
9940 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9941 if (skip_niters != NULL_TREE)
9942 {
9943 if (FLOAT_TYPE_P (vectype))
9944 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
9945 skip_niters);
9946 else
9947 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
9948 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
9949 skip_niters, step_expr);
9950 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
9951 init_expr, skip_step);
9952 }
9953 }
9954
9955 if (stmts)
9956 {
9957 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9958 gcc_assert (!new_bb);
9959 }
9960
9961 /* Create the vector that holds the initial_value of the induction. */
9962 if (nested_in_vect_loop)
9963 {
9964 /* iv_loop is nested in the loop to be vectorized. init_expr had already
9965 been created during vectorization of previous stmts. We obtain it
9966 from the STMT_VINFO_VEC_STMT of the defining stmt. */
9967 auto_vec<tree> vec_inits;
9968 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
9969 init_expr, &vec_inits);
9970 vec_init = vec_inits[0];
9971 /* If the initial value is not of proper type, convert it. */
9972 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
9973 {
9974 new_stmt
9975 = gimple_build_assign (vect_get_new_ssa_name (vectype,
9976 vect_simple_var,
9977 "vec_iv_"),
9978 VIEW_CONVERT_EXPR,
9979 build1 (VIEW_CONVERT_EXPR, vectype,
9980 vec_init));
9981 vec_init = gimple_assign_lhs (new_stmt);
9982 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
9983 new_stmt);
9984 gcc_assert (!new_bb);
9985 }
9986 }
9987 else
9988 {
9989 /* iv_loop is the loop to be vectorized. Create:
9990 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
9991 stmts = NULL;
9992 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
9993
9994 unsigned HOST_WIDE_INT const_nunits;
9995 if (nunits.is_constant (&const_nunits))
9996 {
9997 tree_vector_builder elts (step_vectype, const_nunits, 1);
9998 elts.quick_push (new_name);
9999 for (i = 1; i < const_nunits; i++)
10000 {
10001 /* Create: new_name_i = new_name + step_expr */
10002 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
10003 new_name, step_expr);
10004 elts.quick_push (new_name);
10005 }
10006 /* Create a vector from [new_name_0, new_name_1, ...,
10007 new_name_nunits-1] */
10008 vec_init = gimple_build_vector (&stmts, &elts);
10009 }
10010 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
10011 /* Build the initial value directly from a VEC_SERIES_EXPR. */
10012 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
10013 new_name, step_expr);
10014 else
10015 {
10016 /* Build:
10017 [base, base, base, ...]
10018 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
10019 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
10020 gcc_assert (flag_associative_math);
10021 tree index = build_index_vector (step_vectype, 0, 1);
10022 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10023 new_name);
10024 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
10025 step_expr);
10026 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
10027 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
10028 vec_init, step_vec);
10029 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
10030 vec_init, base_vec);
10031 }
10032 vec_init = gimple_convert (&stmts, vectype, vec_init);
10033
10034 if (stmts)
10035 {
10036 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
10037 gcc_assert (!new_bb);
10038 }
10039 }
10040
10041
10042 /* Create the vector that holds the step of the induction. */
10043 if (nested_in_vect_loop)
10044 /* iv_loop is nested in the loop to be vectorized. Generate:
10045 vec_step = [S, S, S, S] */
10046 new_name = step_expr;
10047 else
10048 {
10049 /* iv_loop is the loop to be vectorized. Generate:
10050 vec_step = [VF*S, VF*S, VF*S, VF*S] */
10051 gimple_seq seq = NULL;
10052 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10053 {
10054 expr = build_int_cst (integer_type_node, vf);
10055 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10056 }
10057 else
10058 expr = build_int_cst (TREE_TYPE (step_expr), vf);
10059 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10060 expr, step_expr);
10061 if (seq)
10062 {
10063 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10064 gcc_assert (!new_bb);
10065 }
10066 }
10067
10068 t = unshare_expr (new_name);
10069 gcc_assert (CONSTANT_CLASS_P (new_name)
10070 || TREE_CODE (new_name) == SSA_NAME);
10071 new_vec = build_vector_from_val (step_vectype, t);
10072 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10073 new_vec, step_vectype, NULL);
10074
10075
10076 /* Create the following def-use cycle:
10077 loop prolog:
10078 vec_init = ...
10079 vec_step = ...
10080 loop:
10081 vec_iv = PHI <vec_init, vec_loop>
10082 ...
10083 STMT
10084 ...
10085 vec_loop = vec_iv + vec_step; */
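/* As a purely illustrative instance (not the exact IL the vectorizer
   emits), for an integer IV with initial value 0 and step 3 and a
   vectorization factor of 4 the cycle above becomes roughly:
     loop prolog:
       vec_init = { 0, 3, 6, 9 }
       vec_step = { 12, 12, 12, 12 }
     loop:
       vec_iv = PHI <vec_init, vec_loop>
       ...
       vec_loop = vec_iv + vec_step;  */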
10086
10087 /* Create the induction-phi that defines the induction-operand. */
10088 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
10089 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10090 induc_def = PHI_RESULT (induction_phi);
10091
10092 /* Create the iv update inside the loop */
10093 stmts = NULL;
10094 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10095 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
10096 vec_def = gimple_convert (&stmts, vectype, vec_def);
10097 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10098 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10099
10100 /* Set the arguments of the phi node: */
10101 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10102 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10103 UNKNOWN_LOCATION);
10104
10105 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
10106 *vec_stmt = induction_phi;
10107
10108 /* In case the vectorization factor (VF) is bigger than the number
10109 of elements that we can fit in a vectype (nunits), we have to generate
10110 more than one vector stmt, i.e. we need to "unroll" the
10111 vector stmt by a factor VF/nunits. For more details see the documentation
10112 in vectorizable_operation. */
10113
10114 if (ncopies > 1)
10115 {
10116 gimple_seq seq = NULL;
10117 /* FORNOW. This restriction should be relaxed. */
10118 gcc_assert (!nested_in_vect_loop);
10119
10120 /* Create the vector that holds the step of the induction. */
10121 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10122 {
10123 expr = build_int_cst (integer_type_node, nunits);
10124 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10125 }
10126 else
10127 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
10128 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10129 expr, step_expr);
10130 if (seq)
10131 {
10132 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10133 gcc_assert (!new_bb);
10134 }
10135
10136 t = unshare_expr (new_name);
10137 gcc_assert (CONSTANT_CLASS_P (new_name)
10138 || TREE_CODE (new_name) == SSA_NAME);
10139 new_vec = build_vector_from_val (step_vectype, t);
10140 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10141 new_vec, step_vectype, NULL);
10142
10143 vec_def = induc_def;
10144 for (i = 1; i < ncopies + 1; i++)
10145 {
10146 /* vec_i = vec_prev + vec_step */
10147 gimple_seq stmts = NULL;
10148 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
10149 vec_def = gimple_build (&stmts,
10150 PLUS_EXPR, step_vectype, vec_def, vec_step);
10151 vec_def = gimple_convert (&stmts, vectype, vec_def);
10152
10153 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10154 if (i < ncopies)
10155 {
10156 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10157 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
10158 }
10159 else
10160 {
10161 /* vec_1 = vec_iv + (VF/n * S)
10162 vec_2 = vec_1 + (VF/n * S)
10163 ...
10164 vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
10165
10166 vec_n is used as vec_loop to save the large step register and
10167 related operations. */
10168 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10169 UNKNOWN_LOCATION);
10170 }
10171 }
10172 }
10173
10174 if (dump_enabled_p ())
10175 dump_printf_loc (MSG_NOTE, vect_location,
10176 "transform induction: created def-use cycle: %G%G",
10177 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
10178
10179 return true;
10180 }
10181
10182 /* Function vectorizable_live_operation.
10183
10184 STMT_INFO computes a value that is used outside the loop. Check if
10185 it can be supported. */
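/* As an illustrative example (a hypothetical loop, not taken from this
   file), the final value of a variable assigned inside the loop and used
   after it is such a live operation:

     for (i = 0; i < n; i++)
       last = a[i];
     use (last);

   After vectorization the scalar result is recovered by extracting the
   relevant lane from the vector definition, either with a BIT_FIELD_REF
   or, for fully-masked loops, with the EXTRACT_LAST internal function, as
   done in the transform code below.  */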
10186
10187 bool
10188 vectorizable_live_operation (vec_info *vinfo,
10189 stmt_vec_info stmt_info,
10190 gimple_stmt_iterator *gsi,
10191 slp_tree slp_node, slp_instance slp_node_instance,
10192 int slp_index, bool vec_stmt_p,
10193 stmt_vector_for_cost *cost_vec)
10194 {
10195 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10196 imm_use_iterator imm_iter;
10197 tree lhs, lhs_type, bitsize;
10198 tree vectype = (slp_node
10199 ? SLP_TREE_VECTYPE (slp_node)
10200 : STMT_VINFO_VECTYPE (stmt_info));
10201 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10202 int ncopies;
10203 gimple *use_stmt;
10204 auto_vec<tree> vec_oprnds;
10205 int vec_entry = 0;
10206 poly_uint64 vec_index = 0;
10207
10208 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
10209
10210 /* If a stmt of a reduction is live, vectorize it via
10211 vect_create_epilog_for_reduction. vectorizable_reduction assessed
10212 validity so just trigger the transform here. */
10213 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
10214 {
10215 if (!vec_stmt_p)
10216 return true;
10217 if (slp_node)
10218 {
10219 /* For reduction chains the meta-info is attached to
10220 the group leader. */
10221 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
10222 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
10223 /* For SLP reductions we vectorize the epilogue for
10224 all involved stmts together. */
10225 else if (slp_index != 0)
10226 return true;
10227 }
10228 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
10229 gcc_assert (reduc_info->is_reduc_info);
10230 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10231 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10232 return true;
10233 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10234 slp_node_instance);
10235 return true;
10236 }
10237
10238 /* If STMT is not relevant and it is a simple assignment and its inputs are
10239 invariant then it can remain in place, unvectorized. The original last
10240 scalar value that it computes will be used. */
10241 if (!STMT_VINFO_RELEVANT_P (stmt_info))
10242 {
10243 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10244 if (dump_enabled_p ())
10245 dump_printf_loc (MSG_NOTE, vect_location,
10246 "statement is simple and uses invariant. Leaving in "
10247 "place.\n");
10248 return true;
10249 }
10250
10251 if (slp_node)
10252 ncopies = 1;
10253 else
10254 ncopies = vect_get_num_copies (loop_vinfo, vectype);
10255
10256 if (slp_node)
10257 {
10258 gcc_assert (slp_index >= 0);
10259
10260 /* Get the last occurrence of the scalar index from the concatenation of
10261 all the slp vectors. Calculate which slp vector it is and the index
10262 within. */
10263 int num_scalar = SLP_TREE_LANES (slp_node);
10264 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10265 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
10266
10267 /* Calculate which vector contains the result, and which lane of
10268 that vector we need. */
10269 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10270 {
10271 if (dump_enabled_p ())
10272 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10273 "Cannot determine which vector holds the"
10274 " final result.\n");
10275 return false;
10276 }
10277 }
10278
10279 if (!vec_stmt_p)
10280 {
10281 /* No transformation required. */
10282 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10283 {
10284 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10285 OPTIMIZE_FOR_SPEED))
10286 {
10287 if (dump_enabled_p ())
10288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10289 "can't operate on partial vectors "
10290 "because the target doesn't support extract "
10291 "last reduction.\n");
10292 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10293 }
10294 else if (slp_node)
10295 {
10296 if (dump_enabled_p ())
10297 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10298 "can't operate on partial vectors "
10299 "because an SLP statement is live after "
10300 "the loop.\n");
10301 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10302 }
10303 else if (ncopies > 1)
10304 {
10305 if (dump_enabled_p ())
10306 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10307 "can't operate on partial vectors "
10308 "because ncopies is greater than 1.\n");
10309 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10310 }
10311 else
10312 {
10313 gcc_assert (ncopies == 1 && !slp_node);
10314 vect_record_loop_mask (loop_vinfo,
10315 &LOOP_VINFO_MASKS (loop_vinfo),
10316 1, vectype, NULL);
10317 }
10318 }
10319 /* ??? Enable for loop costing as well. */
10320 if (!loop_vinfo)
10321 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
10322 0, vect_epilogue);
10323 return true;
10324 }
10325
10326 /* Use the lhs of the original scalar statement. */
10327 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10328 if (dump_enabled_p ())
10329 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10330 "stmt %G", stmt);
10331
10332 lhs = gimple_get_lhs (stmt);
10333 lhs_type = TREE_TYPE (lhs);
10334
10335 bitsize = vector_element_bits_tree (vectype);
10336
10337 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
10338 tree vec_lhs, bitstart;
10339 gimple *vec_stmt;
10340 if (slp_node)
10341 {
10342 gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
10343
10344 /* Get the correct slp vectorized stmt. */
10345 vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10346 vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10347
10348 /* Get entry to use. */
10349 bitstart = bitsize_int (vec_index);
10350 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10351 }
10352 else
10353 {
10354 /* For multiple copies, get the last copy. */
10355 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
10356 vec_lhs = gimple_get_lhs (vec_stmt);
10357
10358 /* Get the last lane in the vector. */
10359 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
10360 }
10361
10362 if (loop_vinfo)
10363 {
10364 /* To ensure the VEC_LHS used by the lane-extraction stmts satisfies the
10365 loop-closed PHI requirement, insert one phi node for it. It looks like:
10366 loop;
10367 BB:
10368 # lhs' = PHI <lhs>
10369 ==>
10370 loop;
10371 BB:
10372 # vec_lhs' = PHI <vec_lhs>
10373 new_tree = lane_extract <vec_lhs', ...>;
10374 lhs' = new_tree; */
10375
10376 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10377 basic_block exit_bb = single_exit (loop)->dest;
10378 gcc_assert (single_pred_p (exit_bb));
10379
10380 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10381 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10382 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
10383
10384 gimple_seq stmts = NULL;
10385 tree new_tree;
10386 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10387 {
10388 /* Emit:
10389
10390 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10391
10392 where VEC_LHS is the vectorized live-out result and MASK is
10393 the loop mask for the final iteration. */
10394 gcc_assert (ncopies == 1 && !slp_node);
10395 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
10396 tree mask = vect_get_loop_mask (loop_vinfo, gsi,
10397 &LOOP_VINFO_MASKS (loop_vinfo),
10398 1, vectype, 0);
10399 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10400 mask, vec_lhs_phi);
10401
10402 /* Convert the extracted vector element to the scalar type. */
10403 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10404 }
10405 else
10406 {
10407 tree bftype = TREE_TYPE (vectype);
10408 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10409 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10410 new_tree = build3 (BIT_FIELD_REF, bftype,
10411 vec_lhs_phi, bitsize, bitstart);
10412 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10413 &stmts, true, NULL_TREE);
10414 }
10415
10416 if (stmts)
10417 {
10418 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
10419 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
10420
10421 /* Remove existing phi from lhs and create one copy from new_tree. */
10422 tree lhs_phi = NULL_TREE;
10423 gimple_stmt_iterator gsi;
10424 for (gsi = gsi_start_phis (exit_bb);
10425 !gsi_end_p (gsi); gsi_next (&gsi))
10426 {
10427 gimple *phi = gsi_stmt (gsi);
10428 if ((gimple_phi_arg_def (phi, 0) == lhs))
10429 {
10430 remove_phi_node (&gsi, false);
10431 lhs_phi = gimple_phi_result (phi);
10432 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10433 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10434 break;
10435 }
10436 }
10437 }
10438
10439 /* Replace uses of lhs with the newly computed result. If the use stmt
10440 is a single-argument PHI, just replace all uses of the PHI result; this
10441 is necessary because the LC SSA PHI defining lhs may precede the new stmt. */
10442 use_operand_p use_p;
10443 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10444 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
10445 && !is_gimple_debug (use_stmt))
10446 {
10447 if (gimple_code (use_stmt) == GIMPLE_PHI
10448 && gimple_phi_num_args (use_stmt) == 1)
10449 {
10450 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
10451 }
10452 else
10453 {
10454 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10455 SET_USE (use_p, new_tree);
10456 }
10457 update_stmt (use_stmt);
10458 }
10459 }
10460 else
10461 {
10462 /* For basic-block vectorization simply insert the lane-extraction. */
10463 tree bftype = TREE_TYPE (vectype);
10464 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10465 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10466 tree new_tree = build3 (BIT_FIELD_REF, bftype,
10467 vec_lhs, bitsize, bitstart);
10468 gimple_seq stmts = NULL;
10469 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10470 &stmts, true, NULL_TREE);
10471 if (TREE_CODE (new_tree) == SSA_NAME
10472 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
10473 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
10474 if (is_a <gphi *> (vec_stmt))
10475 {
10476 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
10477 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10478 }
10479 else
10480 {
10481 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
10482 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
10483 }
10484
10485 /* Replace uses of lhs with the newly computed result. If the use stmt
10486 is a single-argument PHI, just replace all uses of the PHI result; this
10487 is necessary because the LC SSA PHI defining lhs may precede the new stmt. */
10488 use_operand_p use_p;
10489 stmt_vec_info use_stmt_info;
10490 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10491 if (!is_gimple_debug (use_stmt)
10492 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
10493 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
10494 {
10495 /* ??? This can happen when the live lane ends up being
10496 used in a vector construction code-generated by an
10497 external SLP node (and code-generation for that already
10498 happened). See gcc.dg/vect/bb-slp-47.c.
10499 Doing this is what would happen if that vector CTOR
10500 were not code-generated yet so it is not too bad.
10501 ??? In fact we'd likely want to avoid this situation
10502 in the first place. */
10503 if (TREE_CODE (new_tree) == SSA_NAME
10504 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10505 && gimple_code (use_stmt) != GIMPLE_PHI
10506 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
10507 use_stmt))
10508 {
10509 enum tree_code code = gimple_assign_rhs_code (use_stmt);
10510 gcc_checking_assert (code == SSA_NAME
10511 || code == CONSTRUCTOR
10512 || code == VIEW_CONVERT_EXPR
10513 || CONVERT_EXPR_CODE_P (code));
10514 if (dump_enabled_p ())
10515 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10516 "Using original scalar computation for "
10517 "live lane because use preceeds vector "
10518 "def\n");
10519 continue;
10520 }
10521 /* ??? It can also happen that we end up pulling a def into
10522 a loop where replacing out-of-loop uses would require
10523 a new LC SSA PHI node. Retain the original scalar in
10524 those cases as well. PR98064. */
10525 if (TREE_CODE (new_tree) == SSA_NAME
10526 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10527 && (gimple_bb (use_stmt)->loop_father
10528 != gimple_bb (vec_stmt)->loop_father)
10529 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
10530 gimple_bb (use_stmt)->loop_father))
10531 {
10532 if (dump_enabled_p ())
10533 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10534 "Using original scalar computation for "
10535 "live lane because there is an out-of-loop "
10536 "definition for it\n");
10537 continue;
10538 }
10539 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10540 SET_USE (use_p, new_tree);
10541 update_stmt (use_stmt);
10542 }
10543 }
10544
10545 return true;
10546 }
10547
10548 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
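/* This is needed because, after vectorization, the scalar definitions made
   inside the loop may be removed, so debug binds outside the loop that
   still refer to them have their values reset rather than being left to
   reference dead SSA names (summary added for illustration; see the use in
   vect_transform_loop_stmt below).  */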
10549
10550 static void
10551 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
10552 {
10553 ssa_op_iter op_iter;
10554 imm_use_iterator imm_iter;
10555 def_operand_p def_p;
10556 gimple *ustmt;
10557
10558 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
10559 {
10560 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
10561 {
10562 basic_block bb;
10563
10564 if (!is_gimple_debug (ustmt))
10565 continue;
10566
10567 bb = gimple_bb (ustmt);
10568
10569 if (!flow_bb_inside_loop_p (loop, bb))
10570 {
10571 if (gimple_debug_bind_p (ustmt))
10572 {
10573 if (dump_enabled_p ())
10574 dump_printf_loc (MSG_NOTE, vect_location,
10575 "killing debug use\n");
10576
10577 gimple_debug_bind_reset_value (ustmt);
10578 update_stmt (ustmt);
10579 }
10580 else
10581 gcc_unreachable ();
10582 }
10583 }
10584 }
10585 }
10586
10587 /* Given loop represented by LOOP_VINFO, return true if computation of
10588 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
10589 otherwise. */
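/* An illustrative example: if niters is computed in a 32-bit unsigned type
   and the latch can execute UINT_MAX times, then NITERSM1 == UINT_MAX and
   NITERSM1 + 1 wraps around to 0, so this function returns false; for a
   known smaller upper bound on the latch count it returns true.  */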
10590
10591 static bool
10592 loop_niters_no_overflow (loop_vec_info loop_vinfo)
10593 {
10594 /* Constant case. */
10595 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
10596 {
10597 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
10598 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
10599
10600 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
10601 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
10602 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
10603 return true;
10604 }
10605
10606 widest_int max;
10607 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10608 /* Check the upper bound of loop niters. */
10609 if (get_max_loop_iterations (loop, &max))
10610 {
10611 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
10612 signop sgn = TYPE_SIGN (type);
10613 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
10614 if (max < type_max)
10615 return true;
10616 }
10617 return false;
10618 }
10619
10620 /* Return a mask type with half the number of elements as OLD_TYPE,
10621 given that it should have mode NEW_MODE. */
10622
10623 tree
10624 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
10625 {
10626 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
10627 return build_truth_vector_type_for_mode (nunits, new_mode);
10628 }
10629
10630 /* Return a mask type with twice as many elements as OLD_TYPE,
10631 given that it should have mode NEW_MODE. */
10632
10633 tree
10634 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
10635 {
10636 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
10637 return build_truth_vector_type_for_mode (nunits, new_mode);
10638 }
10639
10640 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
10641 contain a sequence of NVECTORS masks that each control a vector of type
10642 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
10643 these vector masks with the vector version of SCALAR_MASK. */
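/* An illustrative, hypothetical use: when if-converting and vectorizing
     if (c[i]) a[i] = b[i];
   with a fully-masked loop, the masked store records a mask for its vector
   type here and passes the scalar condition as SCALAR_MASK, allowing other
   statements guarded by the same condition to be recognized later as able
   to share the combined mask.  */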
10644
10645 void
10646 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
10647 unsigned int nvectors, tree vectype, tree scalar_mask)
10648 {
10649 gcc_assert (nvectors != 0);
10650
10651 if (scalar_mask)
10652 {
10653 scalar_cond_masked_key cond (scalar_mask, nvectors);
10654 loop_vinfo->scalar_cond_masked_set.add (cond);
10655 }
10656
10657 masks->mask_set.add (std::make_pair (vectype, nvectors));
10658 }
10659
10660 /* Given a complete set of masks MASKS, extract mask number INDEX
10661 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10662 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
10663
10664 See the comment above vec_loop_masks for more details about the mask
10665 arrangement. */
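/* An illustrative example of the mask re-use handled below: a mask computed
   for a vector of eight 16-bit elements can also control a vector of four
   32-bit elements, because each adjacent pair of mask elements is known to
   be all-zero or all-one and can therefore be view-converted into a single
   wider mask element.  */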
10666
10667 tree
10668 vect_get_loop_mask (loop_vec_info loop_vinfo,
10669 gimple_stmt_iterator *gsi, vec_loop_masks *masks,
10670 unsigned int nvectors, tree vectype, unsigned int index)
10671 {
10672 if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10673 == vect_partial_vectors_while_ult)
10674 {
10675 rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
10676 tree mask_type = rgm->type;
10677
10678 /* Populate the rgroup's mask array, if this is the first time we've
10679 used it. */
10680 if (rgm->controls.is_empty ())
10681 {
10682 rgm->controls.safe_grow_cleared (nvectors, true);
10683 for (unsigned int i = 0; i < nvectors; ++i)
10684 {
10685 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
10686 /* Provide a dummy definition until the real one is available. */
10687 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10688 rgm->controls[i] = mask;
10689 }
10690 }
10691
10692 tree mask = rgm->controls[index];
10693 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
10694 TYPE_VECTOR_SUBPARTS (vectype)))
10695 {
10696 /* A loop mask for data type X can be reused for data type Y
10697 if X has N times more elements than Y and if Y's elements
10698 are N times bigger than X's. In this case each sequence
10699 of N elements in the loop mask will be all-zero or all-one.
10700 We can then view-convert the mask so that each sequence of
10701 N elements is replaced by a single element. */
10702 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
10703 TYPE_VECTOR_SUBPARTS (vectype)));
10704 gimple_seq seq = NULL;
10705 mask_type = truth_type_for (vectype);
10706 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
10707 if (seq)
10708 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10709 }
10710 return mask;
10711 }
10712 else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10713 == vect_partial_vectors_avx512)
10714 {
10715 /* The number of scalars per iteration and the number of vectors are
10716 both compile-time constants. */
10717 unsigned int nscalars_per_iter
10718 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10719 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10720
10721 rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
10722
10723 /* The stored nV is dependent on the mask type produced. */
10724 gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10725 TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
10726 == rgm->factor);
10727 nvectors = rgm->factor;
10728
10729 /* Populate the rgroup's mask array, if this is the first time we've
10730 used it. */
10731 if (rgm->controls.is_empty ())
10732 {
10733 rgm->controls.safe_grow_cleared (nvectors, true);
10734 for (unsigned int i = 0; i < nvectors; ++i)
10735 {
10736 tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
10737 /* Provide a dummy definition until the real one is available. */
10738 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10739 rgm->controls[i] = mask;
10740 }
10741 }
10742 if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
10743 TYPE_VECTOR_SUBPARTS (vectype)))
10744 return rgm->controls[index];
10745
10746 /* Split the vector if needed. Since we are dealing with integer-mode
10747 masks with AVX512 we can operate on the integer representation and
10748 shift the whole vector. */
10749 unsigned HOST_WIDE_INT factor;
10750 bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
10751 TYPE_VECTOR_SUBPARTS (vectype), &factor);
10752 gcc_assert (ok);
10753 gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
10754 tree mask_type = truth_type_for (vectype);
10755 gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
10756 unsigned vi = index / factor;
10757 unsigned vpart = index % factor;
10758 tree vec = rgm->controls[vi];
10759 gimple_seq seq = NULL;
10760 vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
10761 lang_hooks.types.type_for_mode
10762 (TYPE_MODE (rgm->type), 1), vec);
10763 /* For integer mode masks simply shift the right bits into position. */
10764 if (vpart != 0)
10765 vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
10766 build_int_cst (integer_type_node,
10767 (TYPE_VECTOR_SUBPARTS (vectype)
10768 * vpart)));
10769 vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
10770 (TYPE_MODE (mask_type), 1), vec);
10771 vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
10772 if (seq)
10773 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10774 return vec;
10775 }
10776 else
10777 gcc_unreachable ();
10778 }
10779
10780 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
10781 lengths for controlling an operation on VECTYPE. The operation splits
10782 each element of VECTYPE into FACTOR separate subelements, measuring the
10783 length as a number of these subelements. */
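/* For illustration (a hypothetical case, not specific to any target): if an
   access on a VECTYPE with 32-bit elements is emulated with byte-sized
   loads or stores, FACTOR would be 4 and the recorded length is measured in
   those byte subelements rather than in full elements.  */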
10784
10785 void
10786 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
10787 unsigned int nvectors, tree vectype, unsigned int factor)
10788 {
10789 gcc_assert (nvectors != 0);
10790 if (lens->length () < nvectors)
10791 lens->safe_grow_cleared (nvectors, true);
10792 rgroup_controls *rgl = &(*lens)[nvectors - 1];
10793
10794 /* The number of scalars per iteration, the bytes occupied by a scalar
10795 and the number of vectors are all compile-time constants. */
10796 unsigned int nscalars_per_iter
10797 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10798 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10799
10800 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
10801 {
10802 /* For now, we only support cases in which all loads and stores fall back
10803 to VnQI or none do. */
10804 gcc_assert (!rgl->max_nscalars_per_iter
10805 || (rgl->factor == 1 && factor == 1)
10806 || (rgl->max_nscalars_per_iter * rgl->factor
10807 == nscalars_per_iter * factor));
10808 rgl->max_nscalars_per_iter = nscalars_per_iter;
10809 rgl->type = vectype;
10810 rgl->factor = factor;
10811 }
10812 }
10813
10814 /* Given a complete set of lengths LENS, extract length number INDEX
10815 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10816 where 0 <= INDEX < NVECTORS. Return a value that contains FACTOR
10817 multiplied by the number of elements that should be processed.
10818 Insert any set-up statements before GSI. */
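/* An illustrative example of the re-use case handled below: a length
   computed for a vector of sixteen byte elements can serve a vector of four
   32-bit elements by dividing the stored length by four.  */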
10819
10820 tree
10821 vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
10822 vec_loop_lens *lens, unsigned int nvectors, tree vectype,
10823 unsigned int index, unsigned int factor)
10824 {
10825 rgroup_controls *rgl = &(*lens)[nvectors - 1];
10826 bool use_bias_adjusted_len =
10827 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
10828
10829 /* Populate the rgroup's len array, if this is the first time we've
10830 used it. */
10831 if (rgl->controls.is_empty ())
10832 {
10833 rgl->controls.safe_grow_cleared (nvectors, true);
10834 for (unsigned int i = 0; i < nvectors; ++i)
10835 {
10836 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10837 gcc_assert (len_type != NULL_TREE);
10838
10839 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
10840
10841 /* Provide a dummy definition until the real one is available. */
10842 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
10843 rgl->controls[i] = len;
10844
10845 if (use_bias_adjusted_len)
10846 {
10847 gcc_assert (i == 0);
10848 tree adjusted_len =
10849 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
10850 SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
10851 rgl->bias_adjusted_ctrl = adjusted_len;
10852 }
10853 }
10854 }
10855
10856 if (use_bias_adjusted_len)
10857 return rgl->bias_adjusted_ctrl;
10858
10859 tree loop_len = rgl->controls[index];
10860 if (rgl->factor == 1 && factor == 1)
10861 {
10862 poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
10863 poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
10864 if (maybe_ne (nunits1, nunits2))
10865 {
10866 /* A loop len for data type X can be reused for data type Y
10867 if X has N times more elements than Y and if Y's elements
10868 are N times bigger than X's. */
10869 gcc_assert (multiple_p (nunits1, nunits2));
10870 factor = exact_div (nunits1, nunits2).to_constant ();
10871 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
10872 gimple_seq seq = NULL;
10873 loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
10874 build_int_cst (iv_type, factor));
10875 if (seq)
10876 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10877 }
10878 }
10879 return loop_len;
10880 }
10881
10882 /* Scale profiling counters by estimation for LOOP which is vectorized
10883 by factor VF.
10884 If FLAT is true, the loop we started with had an unrealistically flat
10885 profile. */
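/* For a rough illustration: with VF == 4, a loop whose profile said the
   body executed about 1000 times per entry should afterwards show about 250
   body executions, with the exit edge correspondingly about four times as
   likely.  */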
10886
10887 static void
10888 scale_profile_for_vect_loop (class loop *loop, unsigned vf, bool flat)
10889 {
10890 /* For flat profiles do not scale down proportionally by VF; only
10891 cap by the known iteration count bounds. */
10892 if (flat)
10893 {
10894 if (dump_file && (dump_flags & TDF_DETAILS))
10895 fprintf (dump_file,
10896 "Vectorized loop profile seems flat; not scaling iteration "
10897 "count down by the vectorization factor %i\n", vf);
10898 scale_loop_profile (loop, profile_probability::always (),
10899 get_likely_max_loop_iterations_int (loop));
10900 return;
10901 }
10902 /* The loop body executes VF times fewer iterations and the exit edge becomes VF times more likely. */
10903 edge exit_e = single_exit (loop);
10904 profile_count entry_count = loop_preheader_edge (loop)->count ();
10905
10906 /* If we have an unreliable loop profile, avoid dropping the entry
10907 count below the header count. This can happen when the loop
10908 has an unrealistically low trip count. */
10909 while (vf > 1
10910 && loop->header->count > entry_count
10911 && loop->header->count < entry_count * vf)
10912 {
10913 if (dump_file && (dump_flags & TDF_DETAILS))
10914 fprintf (dump_file,
10915 "Vectorization factor %i seems too large for profile "
10916 "prevoiusly believed to be consistent; reducing.\n", vf);
10917 vf /= 2;
10918 }
10919
10920 if (entry_count.nonzero_p ())
10921 set_edge_probability_and_rescale_others
10922 (exit_e,
10923 entry_count.probability_in (loop->header->count / vf));
10924 /* Avoid producing a very large exit probability when we do not have
10925 a sensible profile. */
10926 else if (exit_e->probability < profile_probability::always () / (vf * 2))
10927 set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
10928 loop->latch->count = single_pred_edge (loop->latch)->count ();
10929
10930 scale_loop_profile (loop, profile_probability::always () / vf,
10931 get_likely_max_loop_iterations_int (loop));
10932 }
10933
10934 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
10935 latch edge values originally defined by it. */
10936
10937 static void
10938 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
10939 stmt_vec_info def_stmt_info)
10940 {
10941 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
10942 if (!def || TREE_CODE (def) != SSA_NAME)
10943 return;
10944 stmt_vec_info phi_info;
10945 imm_use_iterator iter;
10946 use_operand_p use_p;
10947 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
10948 {
10949 gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
10950 if (!phi)
10951 continue;
10952 if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
10953 && (phi_info = loop_vinfo->lookup_stmt (phi))
10954 && STMT_VINFO_RELEVANT_P (phi_info)))
10955 continue;
10956 loop_p loop = gimple_bb (phi)->loop_father;
10957 edge e = loop_latch_edge (loop);
10958 if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
10959 continue;
10960
10961 if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
10962 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
10963 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
10964 {
10965 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
10966 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
10967 gcc_assert (phi_defs.length () == latch_defs.length ());
10968 for (unsigned i = 0; i < phi_defs.length (); ++i)
10969 add_phi_arg (as_a <gphi *> (phi_defs[i]),
10970 gimple_get_lhs (latch_defs[i]), e,
10971 gimple_phi_arg_location (phi, e->dest_idx));
10972 }
10973 else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
10974 {
10975 /* For first order recurrences we have to update both uses of
10976 the latch definition, the one in the PHI node and the one
10977 in the generated VEC_PERM_EXPR. */
10978 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
10979 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
10980 gcc_assert (phi_defs.length () == latch_defs.length ());
10981 tree phidef = gimple_assign_rhs1 (phi_defs[0]);
10982 gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
10983 for (unsigned i = 0; i < phi_defs.length (); ++i)
10984 {
10985 gassign *perm = as_a <gassign *> (phi_defs[i]);
10986 if (i > 0)
10987 gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
10988 gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
10989 update_stmt (perm);
10990 }
10991 add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
10992 gimple_phi_arg_location (phi, e->dest_idx));
10993 }
10994 }
10995 }
10996
10997 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
10998 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
10999 stmt_vec_info. */
11000
11001 static bool
11002 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
11003 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
11004 {
11005 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11006 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11007
11008 if (dump_enabled_p ())
11009 dump_printf_loc (MSG_NOTE, vect_location,
11010 "------>vectorizing statement: %G", stmt_info->stmt);
11011
11012 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11013 vect_loop_kill_debug_uses (loop, stmt_info);
11014
11015 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11016 && !STMT_VINFO_LIVE_P (stmt_info))
11017 return false;
11018
11019 if (STMT_VINFO_VECTYPE (stmt_info))
11020 {
11021 poly_uint64 nunits
11022 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
11023 if (!STMT_SLP_TYPE (stmt_info)
11024 && maybe_ne (nunits, vf)
11025 && dump_enabled_p ())
11026 /* For SLP, VF is set according to the unrolling factor, and not
11027 to the vector size, hence for SLP this message is not valid. */
11028 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11029 }
11030
11031 /* Pure SLP statements have already been vectorized. We still need
11032 to apply loop vectorization to hybrid SLP statements. */
11033 if (PURE_SLP_STMT (stmt_info))
11034 return false;
11035
11036 if (dump_enabled_p ())
11037 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
11038
11039 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
11040 *seen_store = stmt_info;
11041
11042 return true;
11043 }
11044
11045 /* Helper function to pass to simplify_replace_tree to enable replacing trees
11046 in the hash_map with their corresponding values. */
11047
11048 static tree
11049 find_in_mapping (tree t, void *context)
11050 {
11051 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
11052
11053 tree *value = mapping->get (t);
11054 return value ? *value : t;
11055 }
11056
11057 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
11058 original loop that has now been vectorized.
11059
11060 The inits of the data_references need to be advanced with the number of
11061 iterations of the main loop. This has been computed in vect_do_peeling and
11062 is stored in parameter ADVANCE. We first restore the data_references'
11063 initial offsets with the values recorded in ORIG_DRS_INIT.
11064
11065 Since the loop_vec_info of this EPILOGUE was constructed for the original
11066 loop, its stmt_vec_infos all point to the original statements. These need
11067 to be updated to point to their corresponding copies as well as the SSA_NAMES
11068 in their PATTERN_DEF_SEQs and RELATED_STMTs.
11069
11070 The data_references' connections also need to be updated. Their
11071 corresponding dr_vec_infos need to be reconnected to the EPILOGUE's
11072 stmt_vec_infos, their statements need to point to their corresponding copies,
11073 if they are gather loads or scatter stores then their references need to be
11074 updated to point to their corresponding copies, and finally we set
11075 'base_misaligned' to false as we have already peeled for alignment in the
11076 prologue of the main loop. */
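/* Roughly illustrated: with a vectorization factor of 4 and no prologue
   peeling, a scalar loop of N iterations becomes a main vectorized loop
   covering the first N - N % 4 iterations followed by this epilogue for the
   remaining N % 4; ADVANCE then corresponds to the number of scalar
   iterations already consumed before the epilogue starts.  */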
11077
11078 static void
11079 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
11080 {
11081 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
11082 auto_vec<gimple *> stmt_worklist;
11083 hash_map<tree,tree> mapping;
11084 gimple *orig_stmt, *new_stmt;
11085 gimple_stmt_iterator epilogue_gsi;
11086 gphi_iterator epilogue_phi_gsi;
11087 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
11088 basic_block *epilogue_bbs = get_loop_body (epilogue);
11089 unsigned i;
11090
11091 free (LOOP_VINFO_BBS (epilogue_vinfo));
11092 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
11093
11094 /* Advance data_reference's with the number of iterations of the previous
11095 loop and its prologue. */
11096 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
11097
11098
11099 /* The EPILOGUE loop is a copy of the original loop so they share the same
11100 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
11101 point to the copied statements. We also create a mapping of all LHSs in
11102 the original loop and all the LHSs in the EPILOGUE and create worklists to
11103 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
11104 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
11105 {
11106 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
11107 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
11108 {
11109 new_stmt = epilogue_phi_gsi.phi ();
11110
11111 gcc_assert (gimple_uid (new_stmt) > 0);
11112 stmt_vinfo
11113 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11114
11115 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11116 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11117
11118 mapping.put (gimple_phi_result (orig_stmt),
11119 gimple_phi_result (new_stmt));
11120 /* PHI nodes cannot have patterns or related statements. */
11121 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
11122 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
11123 }
11124
11125 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
11126 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
11127 {
11128 new_stmt = gsi_stmt (epilogue_gsi);
11129 if (is_gimple_debug (new_stmt))
11130 continue;
11131
11132 gcc_assert (gimple_uid (new_stmt) > 0);
11133 stmt_vinfo
11134 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11135
11136 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11137 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11138
11139 if (tree old_lhs = gimple_get_lhs (orig_stmt))
11140 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
11141
11142 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
11143 {
11144 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
11145 for (gimple_stmt_iterator gsi = gsi_start (seq);
11146 !gsi_end_p (gsi); gsi_next (&gsi))
11147 stmt_worklist.safe_push (gsi_stmt (gsi));
11148 }
11149
11150 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
11151 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
11152 {
11153 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
11154 stmt_worklist.safe_push (stmt);
11155 /* Set BB such that the assert in
11156 'get_initial_def_for_reduction' is able to determine that
11157 the BB of the related stmt is inside this loop. */
11158 gimple_set_bb (stmt,
11159 gimple_bb (new_stmt));
11160 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
11161 gcc_assert (related_vinfo == NULL
11162 || related_vinfo == stmt_vinfo);
11163 }
11164 }
11165 }
11166
11167 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
11168 using the original main loop and thus need to be updated to refer to the
11169 cloned variables used in the epilogue. */
11170 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
11171 {
11172 gimple *stmt = stmt_worklist[i];
11173 tree *new_op;
11174
11175 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
11176 {
11177 tree op = gimple_op (stmt, j);
11178 if ((new_op = mapping.get(op)))
11179 gimple_set_op (stmt, j, *new_op);
11180 else
11181 {
11182 /* PR92429: The last argument of simplify_replace_tree disables
11183 folding when replacing arguments. This is required as
11184 otherwise we might end up with different statements from the
11185 ones analyzed in vect_analyze_loop, leading to different
11186 vectorization. */
11187 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
11188 &find_in_mapping, &mapping, false);
11189 gimple_set_op (stmt, j, op);
11190 }
11191 }
11192 }
11193
11194 struct data_reference *dr;
11195 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
11196 FOR_EACH_VEC_ELT (datarefs, i, dr)
11197 {
11198 orig_stmt = DR_STMT (dr);
11199 gcc_assert (gimple_uid (orig_stmt) > 0);
11200 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
11201 /* Data references for gather loads and scatter stores do not use the
11202 updated offset we set using ADVANCE. Instead we have to make sure the
11203 reference in each data reference points to the corresponding copy of
11204 the original in the epilogue. */
11205 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
11206 == VMAT_GATHER_SCATTER)
11207 {
11208 DR_REF (dr)
11209 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
11210 &find_in_mapping, &mapping);
11211 DR_BASE_ADDRESS (dr)
11212 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
11213 &find_in_mapping, &mapping);
11214 }
11215 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
11216 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
11217 /* The vector size of the epilogue is smaller than that of the main loop,
11218 so the required alignment is either the same or lower. This means the dr
11219 will by definition be aligned. */
11220 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
11221 }
11222
11223 epilogue_vinfo->shared->datarefs_copy.release ();
11224 epilogue_vinfo->shared->save_datarefs ();
11225 }
11226
11227 /* Function vect_transform_loop.
11228
11229 The analysis phase has determined that the loop is vectorizable.
11230 Vectorize the loop - create vectorized stmts to replace the scalar
11231 stmts in the loop, and update the loop exit condition.
11232 Returns the scalar epilogue loop, if any. */
11233
11234 class loop *
11235 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11236 {
11237 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11238 class loop *epilogue = NULL;
11239 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11240 int nbbs = loop->num_nodes;
11241 int i;
11242 tree niters_vector = NULL_TREE;
11243 tree step_vector = NULL_TREE;
11244 tree niters_vector_mult_vf = NULL_TREE;
11245 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11246 unsigned int lowest_vf = constant_lower_bound (vf);
11247 gimple *stmt;
11248 bool check_profitability = false;
11249 unsigned int th;
11250 bool flat = maybe_flat_loop_profile (loop);
11251
11252 DUMP_VECT_SCOPE ("vec_transform_loop");
11253
11254 loop_vinfo->shared->check_datarefs ();
11255
11256 /* Use the more conservative vectorization threshold. If the number
11257 of iterations is constant, assume the cost check has been performed
11258 by our caller. If the threshold makes all loops profitable that
11259 run at least the (estimated) vectorization factor number of times,
11260 checking is pointless, too. */
11261 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11262 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11263 {
11264 if (dump_enabled_p ())
11265 dump_printf_loc (MSG_NOTE, vect_location,
11266 "Profitability threshold is %d loop iterations.\n",
11267 th);
11268 check_profitability = true;
11269 }
11270
11271 /* Make sure there exists a single-predecessor exit bb. Do this before
11272 versioning. */
11273 edge e = single_exit (loop);
11274 if (! single_pred_p (e->dest))
11275 {
11276 split_loop_exit_edge (e, true);
11277 if (dump_enabled_p ())
11278 dump_printf (MSG_NOTE, "split exit edge\n");
11279 }
11280
11281 /* Version the loop first, if required, so the profitability check
11282 comes first. */
11283
11284 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11285 {
11286 class loop *sloop
11287 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11288 sloop->force_vectorize = false;
11289 check_profitability = false;
11290 }
11291
11292 /* Make sure there exists a single-predecessor exit bb also on the
11293 scalar loop copy. Do this after versioning but before peeling
11294 so the CFG structure is fine for both the scalar and the if-converted
11295 loop, and slpeel_duplicate_current_defs_from_edges faces matched
11296 loop-closed PHI nodes on the exit. */
11297 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11298 {
11299 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
11300 if (! single_pred_p (e->dest))
11301 {
11302 split_loop_exit_edge (e, true);
11303 if (dump_enabled_p ())
11304 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11305 }
11306 }
11307
11308 tree niters = vect_build_loop_niters (loop_vinfo);
11309 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11310 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11311 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
11312 tree advance;
11313 drs_init_vec orig_drs_init;
11314
11315 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11316 &step_vector, &niters_vector_mult_vf, th,
11317 check_profitability, niters_no_overflow,
11318 &advance);
11319 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11320 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11321 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11322 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11323
11324 if (niters_vector == NULL_TREE)
11325 {
11326 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11327 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11328 && known_eq (lowest_vf, vf))
11329 {
11330 niters_vector
11331 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11332 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11333 step_vector = build_one_cst (TREE_TYPE (niters));
11334 }
11335 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11336 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11337 &step_vector, niters_no_overflow);
11338 else
11339 /* vect_do_peeling subtracted the number of peeled prologue
11340 iterations from LOOP_VINFO_NITERS. */
11341 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11342 &niters_vector, &step_vector,
11343 niters_no_overflow);
11344 }
11345
11346 /* 1) Make sure the loop header has exactly two entries
11347 2) Make sure we have a preheader basic block. */
11348
11349 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11350
11351 split_edge (loop_preheader_edge (loop));
11352
11353 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11354 /* This will deal with any possible peeling. */
11355 vect_prepare_for_masked_peels (loop_vinfo);
11356
11357 /* Schedule the SLP instances first, then handle loop vectorization
11358 below. */
11359 if (!loop_vinfo->slp_instances.is_empty ())
11360 {
11361 DUMP_VECT_SCOPE ("scheduling SLP instances");
11362 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11363 }
11364
11365 /* FORNOW: the vectorizer supports only loops whose body consists
11366 of one basic block (header + empty latch). When the vectorizer
11367 supports more involved loop forms, the order in which the BBs are
11368 traversed will need to be reconsidered. */
11369
11370 for (i = 0; i < nbbs; i++)
11371 {
11372 basic_block bb = bbs[i];
11373 stmt_vec_info stmt_info;
11374
11375 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11376 gsi_next (&si))
11377 {
11378 gphi *phi = si.phi ();
11379 if (dump_enabled_p ())
11380 dump_printf_loc (MSG_NOTE, vect_location,
11381 "------>vectorizing phi: %G", (gimple *) phi);
11382 stmt_info = loop_vinfo->lookup_stmt (phi);
11383 if (!stmt_info)
11384 continue;
11385
11386 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11387 vect_loop_kill_debug_uses (loop, stmt_info);
11388
11389 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11390 && !STMT_VINFO_LIVE_P (stmt_info))
11391 continue;
11392
11393 if (STMT_VINFO_VECTYPE (stmt_info)
11394 && (maybe_ne
11395 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
11396 && dump_enabled_p ())
11397 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11398
11399 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
11400 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
11401 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
11402 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
11403 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
11404 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
11405 && ! PURE_SLP_STMT (stmt_info))
11406 {
11407 if (dump_enabled_p ())
11408 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
11409 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
11410 }
11411 }
11412
11413 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11414 gsi_next (&si))
11415 {
11416 gphi *phi = si.phi ();
11417 stmt_info = loop_vinfo->lookup_stmt (phi);
11418 if (!stmt_info)
11419 continue;
11420
11421 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11422 && !STMT_VINFO_LIVE_P (stmt_info))
11423 continue;
11424
11425 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
11426 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
11427 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
11428 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
11429 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
11430 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
11431 && ! PURE_SLP_STMT (stmt_info))
11432 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
11433 }
11434
11435 for (gimple_stmt_iterator si = gsi_start_bb (bb);
11436 !gsi_end_p (si);)
11437 {
11438 stmt = gsi_stmt (si);
11439 /* During vectorization remove existing clobber stmts. */
11440 if (gimple_clobber_p (stmt))
11441 {
11442 unlink_stmt_vdef (stmt);
11443 gsi_remove (&si, true);
11444 release_defs (stmt);
11445 }
11446 else
11447 {
11448 /* Ignore vector stmts created in the outer loop. */
11449 stmt_info = loop_vinfo->lookup_stmt (stmt);
11450
11451 /* vector stmts created in the outer-loop during vectorization of
11452 stmts in an inner-loop may not have a stmt_info, and do not
11453 need to be vectorized. */
11454 stmt_vec_info seen_store = NULL;
11455 if (stmt_info)
11456 {
11457 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
11458 {
11459 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
11460 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
11461 !gsi_end_p (subsi); gsi_next (&subsi))
11462 {
11463 stmt_vec_info pat_stmt_info
11464 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
11465 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
11466 &si, &seen_store);
11467 }
11468 stmt_vec_info pat_stmt_info
11469 = STMT_VINFO_RELATED_STMT (stmt_info);
11470 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
11471 &si, &seen_store))
11472 maybe_set_vectorized_backedge_value (loop_vinfo,
11473 pat_stmt_info);
11474 }
11475 else
11476 {
11477 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
11478 &seen_store))
11479 maybe_set_vectorized_backedge_value (loop_vinfo,
11480 stmt_info);
11481 }
11482 }
11483 gsi_next (&si);
11484 if (seen_store)
11485 {
11486 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
11487 /* Interleaving. The vectorization of the
11488 interleaving chain was completed - free all
11489 the stores in the chain. */
11490 vect_remove_stores (loop_vinfo,
11491 DR_GROUP_FIRST_ELEMENT (seen_store));
11492 else
11493 /* Free the attached stmt_vec_info and remove the stmt. */
11494 loop_vinfo->remove_stmt (stmt_info);
11495 }
11496 }
11497 }
11498
11499 /* Stub out scalar statements that must not survive vectorization.
11500 Doing this here helps with grouped statements, or statements that
11501 are involved in patterns. */
11502 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
11503 !gsi_end_p (gsi); gsi_next (&gsi))
11504 {
11505 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
11506 if (!call || !gimple_call_internal_p (call))
11507 continue;
11508 internal_fn ifn = gimple_call_internal_fn (call);
11509 if (ifn == IFN_MASK_LOAD)
11510 {
11511 tree lhs = gimple_get_lhs (call);
11512 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11513 {
11514 tree zero = build_zero_cst (TREE_TYPE (lhs));
11515 gimple *new_stmt = gimple_build_assign (lhs, zero);
11516 gsi_replace (&gsi, new_stmt, true);
11517 }
11518 }
11519 else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
11520 {
11521 tree lhs = gimple_get_lhs (call);
11522 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11523 {
11524 tree else_arg
11525 = gimple_call_arg (call, gimple_call_num_args (call) - 1);
11526 gimple *new_stmt = gimple_build_assign (lhs, else_arg);
11527 gsi_replace (&gsi, new_stmt, true);
11528 }
11529 }
11530 }
11531 } /* BBs in loop */
11532
11533 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
11534 a zero NITERS becomes a nonzero NITERS_VECTOR. */
11535 if (integer_onep (step_vector))
11536 niters_no_overflow = true;
11537 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
11538 niters_vector_mult_vf, !niters_no_overflow);
11539
11540 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
11541
11542 /* True if the final iteration might not handle a full vector's
11543 worth of scalar iterations. */
11544 bool final_iter_may_be_partial
11545 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
11546 /* The minimum number of iterations performed by the epilogue. This
11547 is 1 when peeling for gaps because we always need a final scalar
11548 iteration. */
11549 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
11550 /* +1 to convert latch counts to loop iteration counts,
11551 -min_epilogue_iters to remove iterations that cannot be performed
11552 by the vector code. */
11553 int bias_for_lowest = 1 - min_epilogue_iters;
11554 int bias_for_assumed = bias_for_lowest;
11555 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
11556 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
11557 {
11558 /* When the amount of peeling is known at compile time, the first
11559 iteration will have exactly alignment_npeels active elements.
11560 In the worst case it will have at least one. */
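/* E.g. (hypothetical values): with lowest_vf == 4 and a compile-time
alignment_npeels of 3, min_first_active is 3 and bias_for_lowest below
grows by 4 - 3 == 1. */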
11561 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
11562 bias_for_lowest += lowest_vf - min_first_active;
11563 bias_for_assumed += assumed_vf - min_first_active;
11564 }
11565 /* In these calculations the "- 1" converts loop iteration counts
11566 back to latch counts. */
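/* As a worked example with assumed values: with VF == 4, no peeling for gaps
and no partial vectors, bias_for_lowest is 1; an upper bound of 11 latch
iterations (12 scalar iterations) then becomes (11 + 1) / 4 - 1 == 2 latch
iterations of the vector loop, i.e. at most 3 vector iterations. */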
11567 if (loop->any_upper_bound)
11568 {
11569 loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
11570 loop->nb_iterations_upper_bound
11571 = (final_iter_may_be_partial
11572 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
11573 lowest_vf) - 1
11574 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
11575 lowest_vf) - 1);
11576 if (main_vinfo
11577 /* Both peeling for alignment and peeling for gaps can end up
11578 with the scalar epilogue running for more than VF-1 iterations. */
11579 && !main_vinfo->peeling_for_alignment
11580 && !main_vinfo->peeling_for_gaps)
11581 {
11582 unsigned int bound;
11583 poly_uint64 main_iters
11584 = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
11585 LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
11586 main_iters
11587 = upper_bound (main_iters,
11588 LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
11589 if (can_div_away_from_zero_p (main_iters,
11590 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
11591 &bound))
11592 loop->nb_iterations_upper_bound
11593 = wi::umin ((widest_int) (bound - 1),
11594 loop->nb_iterations_upper_bound);
11595 }
11596 }
11597 if (loop->any_likely_upper_bound)
11598 loop->nb_iterations_likely_upper_bound
11599 = (final_iter_may_be_partial
11600 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
11601 + bias_for_lowest, lowest_vf) - 1
11602 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
11603 + bias_for_lowest, lowest_vf) - 1);
11604 if (loop->any_estimate)
11605 loop->nb_iterations_estimate
11606 = (final_iter_may_be_partial
11607 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
11608 assumed_vf) - 1
11609 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
11610 assumed_vf) - 1);
11611 scale_profile_for_vect_loop (loop, assumed_vf, flat);
11612
11613 if (dump_enabled_p ())
11614 {
11615 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
11616 {
11617 dump_printf_loc (MSG_NOTE, vect_location,
11618 "LOOP VECTORIZED\n");
11619 if (loop->inner)
11620 dump_printf_loc (MSG_NOTE, vect_location,
11621 "OUTER LOOP VECTORIZED\n");
11622 dump_printf (MSG_NOTE, "\n");
11623 }
11624 else
11625 dump_printf_loc (MSG_NOTE, vect_location,
11626 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
11627 GET_MODE_NAME (loop_vinfo->vector_mode));
11628 }
11629
11630 /* Loops vectorized with a variable factor won't benefit from
11631 unrolling/peeling. */
11632 if (!vf.is_constant ())
11633 {
11634 loop->unroll = 1;
11635 if (dump_enabled_p ())
11636 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
11637 " variable-length vectorization factor\n");
11638 }
11639 /* Free SLP instances here because otherwise stmt reference counting
11640 won't work. */
11641 slp_instance instance;
11642 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
11643 vect_free_slp_instance (instance);
11644 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
11645 /* Clear the safelen field since its value is invalid after vectorization,
11646 as the vectorized loop can have loop-carried dependencies. */
11647 loop->safelen = 0;
11648
11649 if (epilogue)
11650 {
11651 update_epilogue_loop_vinfo (epilogue, advance);
11652
11653 epilogue->simduid = loop->simduid;
11654 epilogue->force_vectorize = loop->force_vectorize;
11655 epilogue->dont_vectorize = false;
11656 }
11657
11658 return epilogue;
11659 }
11660
11661 /* The code below tries to perform a simple optimization - revert
11662 if-conversion for masked stores, i.e. if the mask of a store is zero, do
11663 not perform the store and, if possible, skip the stored-value producers too.
11664 For example,
11665 for (i=0; i<n; i++)
11666 if (c[i])
11667 {
11668 p1[i] += 1;
11669 p2[i] = p3[i] +2;
11670 }
11671 this transformation will produce the following semi-hammock:
11672
11673 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
11674 {
11675 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
11676 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
11677 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
11678 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
11679 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
11680 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
11681 }
11682 */
11683
11684 void
11685 optimize_mask_stores (class loop *loop)
11686 {
11687 basic_block *bbs = get_loop_body (loop);
11688 unsigned nbbs = loop->num_nodes;
11689 unsigned i;
11690 basic_block bb;
11691 class loop *bb_loop;
11692 gimple_stmt_iterator gsi;
11693 gimple *stmt;
11694 auto_vec<gimple *> worklist;
11695 auto_purge_vect_location sentinel;
11696
11697 vect_location = find_loop_location (loop);
11698 /* Pick up all masked stores in the loop, if any. */
11699 for (i = 0; i < nbbs; i++)
11700 {
11701 bb = bbs[i];
11702 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
11703 gsi_next (&gsi))
11704 {
11705 stmt = gsi_stmt (gsi);
11706 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
11707 worklist.safe_push (stmt);
11708 }
11709 }
11710
11711 free (bbs);
11712 if (worklist.is_empty ())
11713 return;
11714
11715 /* Loop has masked stores. */
11716 while (!worklist.is_empty ())
11717 {
11718 gimple *last, *last_store;
11719 edge e, efalse;
11720 tree mask;
11721 basic_block store_bb, join_bb;
11722 gimple_stmt_iterator gsi_to;
11723 tree vdef, new_vdef;
11724 gphi *phi;
11725 tree vectype;
11726 tree zero;
11727
11728 last = worklist.pop ();
11729 mask = gimple_call_arg (last, 2);
11730 bb = gimple_bb (last);
11731 /* Create then_bb and an if-then structure in the CFG; then_bb belongs to
11732 the same loop as if_bb. It could be different from LOOP when a
11733 two-level loop nest is vectorized and the mask_store belongs to the
11734 inner one. */
11735 e = split_block (bb, last);
11736 bb_loop = bb->loop_father;
11737 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
11738 join_bb = e->dest;
11739 store_bb = create_empty_bb (bb);
11740 add_bb_to_loop (store_bb, bb_loop);
11741 e->flags = EDGE_TRUE_VALUE;
11742 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
11743 /* Put STORE_BB on the likely path. */
11744 efalse->probability = profile_probability::likely ();
11745 e->probability = efalse->probability.invert ();
11746 store_bb->count = efalse->count ();
11747 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
11748 if (dom_info_available_p (CDI_DOMINATORS))
11749 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
11750 if (dump_enabled_p ())
11751 dump_printf_loc (MSG_NOTE, vect_location,
11752 "Create new block %d to sink mask stores.",
11753 store_bb->index);
11754 /* Create vector comparison with boolean result. */
11755 vectype = TREE_TYPE (mask);
11756 zero = build_zero_cst (vectype);
11757 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
11758 gsi = gsi_last_bb (bb);
11759 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
11760 /* Create new PHI node for vdef of the last masked store:
11761 .MEM_2 = VDEF <.MEM_1>
11762 will be converted to
11763 .MEM_3 = VDEF <.MEM_1>
11764 and a new PHI node will be created in the join bb
11765 .MEM_2 = PHI <.MEM_1, .MEM_3>
11766 */
11767 vdef = gimple_vdef (last);
11768 new_vdef = make_ssa_name (gimple_vop (cfun), last);
11769 gimple_set_vdef (last, new_vdef);
11770 phi = create_phi_node (vdef, join_bb);
11771 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
11772
11773 /* Put all masked stores with the same mask into STORE_BB if possible. */
11774 while (true)
11775 {
11776 gimple_stmt_iterator gsi_from;
11777 gimple *stmt1 = NULL;
11778
11779 /* Move masked store to STORE_BB. */
11780 last_store = last;
11781 gsi = gsi_for_stmt (last);
11782 gsi_from = gsi;
11783 /* Shift GSI to the previous stmt for further traversal. */
11784 gsi_prev (&gsi);
11785 gsi_to = gsi_start_bb (store_bb);
11786 gsi_move_before (&gsi_from, &gsi_to);
11787 /* Set GSI_TO to the start of the now non-empty block. */
11788 gsi_to = gsi_start_bb (store_bb);
11789 if (dump_enabled_p ())
11790 dump_printf_loc (MSG_NOTE, vect_location,
11791 "Move stmt to created bb\n%G", last);
11792 /* Move all stored value producers if possible. */
11793 while (!gsi_end_p (gsi))
11794 {
11795 tree lhs;
11796 imm_use_iterator imm_iter;
11797 use_operand_p use_p;
11798 bool res;
11799
11800 /* Skip debug statements. */
11801 if (is_gimple_debug (gsi_stmt (gsi)))
11802 {
11803 gsi_prev (&gsi);
11804 continue;
11805 }
11806 stmt1 = gsi_stmt (gsi);
11807 /* Do not consider statements writing to memory or having a
11808 volatile operand. */
11809 if (gimple_vdef (stmt1)
11810 || gimple_has_volatile_ops (stmt1))
11811 break;
11812 gsi_from = gsi;
11813 gsi_prev (&gsi);
11814 lhs = gimple_get_lhs (stmt1);
11815 if (!lhs)
11816 break;
11817
11818 /* LHS of vectorized stmt must be SSA_NAME. */
11819 if (TREE_CODE (lhs) != SSA_NAME)
11820 break;
11821
11822 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11823 {
11824 /* Remove dead scalar statement. */
11825 if (has_zero_uses (lhs))
11826 {
11827 gsi_remove (&gsi_from, true);
11828 continue;
11829 }
11830 }
11831
11832 /* Check that LHS does not have uses outside of STORE_BB. */
11833 res = true;
11834 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
11835 {
11836 gimple *use_stmt;
11837 use_stmt = USE_STMT (use_p);
11838 if (is_gimple_debug (use_stmt))
11839 continue;
11840 if (gimple_bb (use_stmt) != store_bb)
11841 {
11842 res = false;
11843 break;
11844 }
11845 }
11846 if (!res)
11847 break;
11848
11849 if (gimple_vuse (stmt1)
11850 && gimple_vuse (stmt1) != gimple_vuse (last_store))
11851 break;
11852
11853 /* Can move STMT1 to STORE_BB. */
11854 if (dump_enabled_p ())
11855 dump_printf_loc (MSG_NOTE, vect_location,
11856 "Move stmt to created bb\n%G", stmt1);
11857 gsi_move_before (&gsi_from, &gsi_to);
11858 /* Shift GSI_TO for further insertion. */
11859 gsi_prev (&gsi_to);
11860 }
11861 /* Put other masked stores with the same mask into STORE_BB. */
11862 if (worklist.is_empty ()
11863 || gimple_call_arg (worklist.last (), 2) != mask
11864 || worklist.last () != stmt1)
11865 break;
11866 last = worklist.pop ();
11867 }
11868 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
11869 }
11870 }
11871
11872 /* Decide whether it is possible to use a zero-based induction variable
11873 when vectorizing LOOP_VINFO with partial vectors. If it is, return
11874 the value that the induction variable must be able to hold in order
11875 to ensure that the rgroups eventually have no active vector elements.
11876 Return -1 otherwise. */
11877
11878 widest_int
11879 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
11880 {
11881 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
11882 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11883 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
11884
11885 /* Calculate the value that the induction variable must be able
11886 to hit in order to ensure that we end the loop with an all-false mask.
11887 This involves adding the maximum number of inactive trailing scalar
11888 iterations. */
11889 widest_int iv_limit = -1;
11890 if (max_loop_iterations (loop, &iv_limit))
11891 {
11892 if (niters_skip)
11893 {
11894 /* Add the maximum number of skipped iterations to the
11895 maximum iteration count. */
11896 if (TREE_CODE (niters_skip) == INTEGER_CST)
11897 iv_limit += wi::to_widest (niters_skip);
11898 else
11899 iv_limit += max_vf - 1;
11900 }
11901 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
11902 /* Make a conservatively-correct assumption. */
11903 iv_limit += max_vf - 1;
11904
11905 /* IV_LIMIT is the maximum number of latch iterations, which is also
11906 the maximum in-range IV value. Round this value down to the previous
11907 vector alignment boundary and then add an extra full iteration. */
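/* A worked example with assumed values: for a constant VF of 4 (so max_vf
and known_alignment (vf) are both 4) and iv_limit == 10, this computes
(10 & -4) + 4 == 8 + 4 == 12. */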
11908 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11909 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
11910 }
11911 return iv_limit;
11912 }
11913
11914 /* For the given rgroup_controls RGC, check whether an induction variable
11915 would ever hit a value that produces a set of all-false masks or zero
11916 lengths before wrapping around. Return true if it's possible to wrap
11917 around before hitting the desired value, otherwise return false. */
11918
11919 bool
11920 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
11921 {
11922 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
11923
11924 if (iv_limit == -1)
11925 return true;
11926
11927 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11928 unsigned int compare_precision = TYPE_PRECISION (compare_type);
11929 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
11930
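/* Illustrative check with assumed values: if iv_limit * nitems is 70000 and
COMPARE_TYPE has 16-bit precision, 70000 needs 17 bits, so the IV could wrap
before producing an all-false mask and we return true. */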
11931 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
11932 return true;
11933
11934 return false;
11935 }