/* gcc/tree-vect-loop.cc  (web-viewer page header removed).  */
1 /* Loop Vectorization
2 Copyright (C) 2003-2024 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #define INCLUDE_ALGORITHM
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "memmodel.h"
36 #include "optabs.h"
37 #include "diagnostic-core.h"
38 #include "fold-const.h"
39 #include "stor-layout.h"
40 #include "cfganal.h"
41 #include "gimplify.h"
42 #include "gimple-iterator.h"
43 #include "gimplify-me.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop-niter.h"
47 #include "tree-ssa-loop.h"
48 #include "cfgloop.h"
49 #include "tree-scalar-evolution.h"
50 #include "tree-vectorizer.h"
51 #include "gimple-fold.h"
52 #include "cgraph.h"
53 #include "tree-cfg.h"
54 #include "tree-if-conv.h"
55 #include "internal-fn.h"
56 #include "tree-vector-builder.h"
57 #include "vec-perm-indices.h"
58 #include "tree-eh.h"
59 #include "case-cfn-macros.h"
60 #include "langhooks.h"
61
62 /* Loop Vectorization Pass.
63
64 This pass tries to vectorize loops.
65
66 For example, the vectorizer transforms the following simple loop:
67
68 short a[N]; short b[N]; short c[N]; int i;
69
70 for (i=0; i<N; i++){
71 a[i] = b[i] + c[i];
72 }
73
74 as if it was manually vectorized by rewriting the source code into:
75
76 typedef int __attribute__((mode(V8HI))) v8hi;
77 short a[N]; short b[N]; short c[N]; int i;
78 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
79 v8hi va, vb, vc;
80
81 for (i=0; i<N/8; i++){
82 vb = pb[i];
83 vc = pc[i];
84 va = vb + vc;
85 pa[i] = va;
86 }
87
88 The main entry to this pass is vectorize_loops(), in which
89 the vectorizer applies a set of analyses on a given set of loops,
90 followed by the actual vectorization transformation for the loops that
91 had successfully passed the analysis phase.
92 Throughout this pass we make a distinction between two types of
93 data: scalars (which are represented by SSA_NAMES), and memory references
94 ("data-refs"). These two types of data require different handling both
95 during analysis and transformation. The types of data-refs that the
   vectorizer currently supports are ARRAY_REFS whose base is an array DECL
97 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
98 accesses are required to have a simple (consecutive) access pattern.
99
100 Analysis phase:
101 ===============
102 The driver for the analysis phase is vect_analyze_loop().
103 It applies a set of analyses, some of which rely on the scalar evolution
104 analyzer (scev) developed by Sebastian Pop.
105
106 During the analysis phase the vectorizer records some information
107 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
108 loop, as well as general information about the loop as a whole, which is
109 recorded in a "loop_vec_info" struct attached to each loop.
110
111 Transformation phase:
112 =====================
113 The loop transformation phase scans all the stmts in the loop, and
114 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
115 the loop that needs to be vectorized. It inserts the vector code sequence
116 just before the scalar stmt S, and records a pointer to the vector code
117 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
118 attached to S). This pointer will be used for the vectorization of following
119 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
120 otherwise, we rely on dead code elimination for removing it.
121
122 For example, say stmt S1 was vectorized into stmt VS1:
123
124 VS1: vb = px[i];
125 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
126 S2: a = b;
127
128 To vectorize stmt S2, the vectorizer first finds the stmt that defines
129 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
130 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
131 resulting sequence would be:
132
133 VS1: vb = px[i];
134 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
135 VS2: va = vb;
136 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
137
138 Operands that are not SSA_NAMEs, are data-refs that appear in
139 load/store operations (like 'x[i]' in S1), and are handled differently.
140
141 Target modeling:
142 =================
143 Currently the only target specific information that is used is the
144 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
145 Targets that can support different sizes of vectors, for now will need
146 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
147 flexibility will be added in the future.
148
   Since we only vectorize operations whose vector form can be
150 expressed using existing tree codes, to verify that an operation is
151 supported, the vectorizer checks the relevant optab at the relevant
152 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
153 the value found is CODE_FOR_nothing, then there's no target support, and
154 we can't vectorize the stmt.
155
156 For additional information on this project see:
157 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
158 */
159
160 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
161 unsigned *);
162 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
163 bool *, bool *, bool);
164
/* Subroutine of vect_determine_vf_for_stmt that handles only one
   statement.  VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
   may already be set for general statements (not just data refs).
   On success, record the statement's vector type (if any) and raise
   the running vectorization factor *VF to accommodate it.  */

static opt_result
vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
			      bool vectype_maybe_set_p,
			      poly_uint64 *vf)
{
  gimple *stmt = stmt_info->stmt;

  /* Statements that are neither relevant nor live, and clobbers, do not
     constrain the VF; skip them successfully.  */
  if ((!STMT_VINFO_RELEVANT_P (stmt_info)
       && !STMT_VINFO_LIVE_P (stmt_info))
      || gimple_clobber_p (stmt))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
      return opt_result::success ();
    }

  /* STMT_VECTYPE is the type for the operation itself, NUNITS_VECTYPE
     the type that decides the number of units (they can differ, e.g. for
     widening/narrowing operations).  */
  tree stmt_vectype, nunits_vectype;
  opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
						   &stmt_vectype,
						   &nunits_vectype);
  if (!res)
    return res;

  if (stmt_vectype)
    {
      if (STMT_VINFO_VECTYPE (stmt_info))
	/* The only case when a vectype had been already set is for stmts
	   that contain a data ref, or for "pattern-stmts" (stmts generated
	   by the vectorizer to represent/replace a certain idiom).  In
	   that case it must agree with what we computed here.  */
	gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
		     || vectype_maybe_set_p)
		    && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
      else
	STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
    }

  /* Let the units-determining type bump the vectorization factor.  */
  if (nunits_vectype)
    vect_update_max_nunits (vf, nunits_vectype);

  return opt_result::success ();
}
210
/* Subroutine of vect_determine_vectorization_factor.  Set the vector
   types of STMT_INFO and all attached pattern statements and update
   the vectorization factor VF accordingly.  Return true on success
   or false if something prevented vectorization.  */

static opt_result
vect_determine_vf_for_stmt (vec_info *vinfo,
			    stmt_vec_info stmt_info, poly_uint64 *vf)
{
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
		     stmt_info->stmt);
  /* The original statement: its vectype may not be set yet, hence
     VECTYPE_MAYBE_SET_P is false here.  */
  opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
  if (!res)
    return res;

  if (STMT_VINFO_IN_PATTERN_P (stmt_info)
      && STMT_VINFO_RELATED_STMT (stmt_info))
    {
      gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
      /* Switch over to the main pattern statement replacing STMT_INFO.  */
      stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);

      /* If a pattern statement has def stmts, analyze them too.  */
      for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
	   !gsi_end_p (si); gsi_next (&si))
	{
	  stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "==> examining pattern def stmt: %G",
			     def_stmt_info->stmt);
	  res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
	  if (!res)
	    return res;
	}

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "==> examining pattern statement: %G",
			 stmt_info->stmt);
      /* Pattern recognition may already have assigned a vectype, so
	 VECTYPE_MAYBE_SET_P is true for the pattern stmt itself.  */
      res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
      if (!res)
	return res;
    }

  return opt_result::success ();
}
258
/* Function vect_determine_vectorization_factor

   Determine the vectorization factor (VF).  VF is the number of data elements
   that are operated upon in parallel in a single iteration of the vectorized
   loop.  For example, when vectorizing a loop that operates on 4byte elements,
   on a target with vector size (VS) 16byte, the VF is set to 4, since 4
   elements can fit in a single vector register.

   We currently support vectorization of loops in which all types operated upon
   are of the same size.  Therefore this function currently sets VF according to
   the size of the types operated upon, and fails if there are multiple sizes
   in the loop.

   VF is also the factor by which the loop iterations are strip-mined, e.g.:
   original loop:
	for (i=0; i<N; i++){
	  a[i] = b[i] + c[i];
	}

   vectorized loop:
	for (i=0; i<N; i+=VF){
	  a[i:VF] = b[i:VF] + c[i:VF];
	}
*/

static opt_result
vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  unsigned nbbs = loop->num_nodes;
  poly_uint64 vectorization_factor = 1;
  tree scalar_type = NULL_TREE;
  gphi *phi;
  tree vectype;
  stmt_vec_info stmt_info;
  unsigned i;

  DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      /* PHIs are handled directly here: a relevant/live PHI gets its
	 vectype from the type of its result.  */
      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  phi = si.phi ();
	  stmt_info = loop_vinfo->lookup_stmt (phi);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
			     (gimple *) phi);

	  gcc_assert (stmt_info);

	  if (STMT_VINFO_RELEVANT_P (stmt_info)
	      || STMT_VINFO_LIVE_P (stmt_info))
	    {
	      /* PHIs never had a vectype assigned before this point.  */
	      gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
	      scalar_type = TREE_TYPE (PHI_RESULT (phi));

	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "get vectype for scalar type: %T\n",
				 scalar_type);

	      vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
	      if (!vectype)
		return opt_result::failure_at (phi,
					       "not vectorized: unsupported "
					       "data-type %T\n",
					       scalar_type);
	      STMT_VINFO_VECTYPE (stmt_info) = vectype;

	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
				 vectype);

	      if (dump_enabled_p ())
		{
		  dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
		  dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
		  dump_printf (MSG_NOTE, "\n");
		}

	      vect_update_max_nunits (&vectorization_factor, vectype);
	    }
	}

      /* Non-PHI statements (and their pattern stmts) are handled by
	 vect_determine_vf_for_stmt.  Debug stmts are ignored.  */
      for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  if (is_gimple_debug (gsi_stmt (si)))
	    continue;
	  stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
	  opt_result res
	    = vect_determine_vf_for_stmt (loop_vinfo,
					  stmt_info, &vectorization_factor);
	  if (!res)
	    return res;
	}
    }

  /* TODO: Analyze cost.  Decide if worth while to vectorize.  */
  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
      dump_dec (MSG_NOTE, vectorization_factor);
      dump_printf (MSG_NOTE, "\n");
    }

  /* A VF of (at most) 1 means nothing is actually vectorized.  */
  if (known_le (vectorization_factor, 1U))
    return opt_result::failure_at (vect_location,
				   "not vectorized: unsupported data-type\n");
  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
  return opt_result::success ();
}
376
377
/* Function vect_is_simple_iv_evolution.

   FORNOW: A simple evolution of an induction variables in the loop is
   considered a polynomial evolution.

   ACCESS_FN is the scalar-evolution access function of a PHI in loop
   LOOP_NB.  On success return true and store the initial value in *INIT
   and the per-iteration step in *STEP.  */

static bool
vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
			     tree * step)
{
  tree init_expr;
  tree step_expr;
  tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
  basic_block bb;

  /* When there is no evolution in this loop, the evolution function
     is not "simple".  */
  if (evolution_part == NULL_TREE)
    return false;

  /* When the evolution is a polynomial of degree >= 2
     the evolution function is not "simple".  */
  if (tree_is_chrec (evolution_part))
    return false;

  step_expr = evolution_part;
  init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
		     step_expr, init_expr);

  /* Output the pair even if the step is subsequently rejected below.  */
  *init = init_expr;
  *step = step_expr;

  /* The step must be loop-invariant and usable: an INTEGER_CST, a
     REAL_CST under -fassociative-math, or an SSA_NAME not defined
     inside the loop (with floats again requiring -fassociative-math).
     Note (bb = ...) inside the condition: BB is assigned as a side
     effect and then tested for being inside the loop.  */
  if (TREE_CODE (step_expr) != INTEGER_CST
      && (TREE_CODE (step_expr) != SSA_NAME
	  || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
	      && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
	  || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
	      && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
		  || !flag_associative_math)))
      && (TREE_CODE (step_expr) != REAL_CST
	  || !flag_associative_math))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "step unknown.\n");
      return false;
    }

  return true;
}
430
/* Function vect_is_nonlinear_iv_evolution

   Only support nonlinear induction for integer type
   1. neg
   2. mul by constant
   3. lshift/rshift by constant.

   For neg induction, return a fake step as integer -1.

   LOOP_PHI_NODE is the candidate PHI; on success *INIT and *STEP are
   set and STMT_INFO's evolution fields are filled in.  */
static bool
vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
				gphi* loop_phi_node, tree *init, tree *step)
{
  tree init_expr, ev_expr, result, op1, op2;
  gimple* def;

  /* The PHI must merge exactly the preheader and latch values.  */
  if (gimple_phi_num_args (loop_phi_node) != 2)
    return false;

  init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
  ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));

  /* Support nonlinear induction only for integer type.  */
  if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
    return false;

  *init = init_expr;
  result = PHI_RESULT (loop_phi_node);

  /* The middle clause uses the comma operator: it assigns DEF as a side
     effect and always evaluates to false, so the next clause can safely
     inspect DEF.  */
  if (TREE_CODE (ev_expr) != SSA_NAME
      || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
      || !is_gimple_assign (def))
    return false;

  enum tree_code t_code = gimple_assign_rhs_code (def);
  switch (t_code)
    {
    case NEGATE_EXPR:
      /* x = -x: must negate the PHI result itself.  */
      if (gimple_assign_rhs1 (def) != result)
	return false;
      *step = build_int_cst (TREE_TYPE (init_expr), -1);
      STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
      break;

    case RSHIFT_EXPR:
    case LSHIFT_EXPR:
    case MULT_EXPR:
      /* x = x <</>>/* CST: first operand must be the PHI result and the
	 shift amount/multiplier must be a constant.  */
      op1 = gimple_assign_rhs1 (def);
      op2 = gimple_assign_rhs2 (def);
      if (TREE_CODE (op2) != INTEGER_CST
	  || op1 != result)
	return false;
      *step = op2;
      if (t_code == LSHIFT_EXPR)
	STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
      else if (t_code == RSHIFT_EXPR)
	STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
      /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul.  */
      else
	STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
      break;

    default:
      return false;
    }

  STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
  STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;

  return true;
}
501
502 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
503 what we are assuming is a double reduction. For example, given
504 a structure like this:
505
506 outer1:
507 x_1 = PHI <x_4(outer2), ...>;
508 ...
509
510 inner:
511 x_2 = PHI <x_1(outer1), ...>;
512 ...
513 x_3 = ...;
514 ...
515
516 outer2:
517 x_4 = PHI <x_3(inner)>;
518 ...
519
520 outer loop analysis would treat x_1 as a double reduction phi and
521 this function would then return true for x_2. */
522
523 static bool
524 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
525 {
526 use_operand_p use_p;
527 ssa_op_iter op_iter;
528 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
529 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
530 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
531 return true;
532 return false;
533 }
534
535 /* Returns true if Phi is a first-order recurrence. A first-order
536 recurrence is a non-reduction recurrence relation in which the value of
537 the recurrence in the current loop iteration equals a value defined in
538 the previous iteration. */
539
540 static bool
541 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
542 gphi *phi)
543 {
544 /* A nested cycle isn't vectorizable as first order recurrence. */
545 if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
546 return false;
547
548 /* Ensure the loop latch definition is from within the loop. */
549 edge latch = loop_latch_edge (loop);
550 tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
551 if (TREE_CODE (ldef) != SSA_NAME
552 || SSA_NAME_IS_DEFAULT_DEF (ldef)
553 || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
554 || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
555 return false;
556
557 tree def = gimple_phi_result (phi);
558
559 /* Ensure every use_stmt of the phi node is dominated by the latch
560 definition. */
561 imm_use_iterator imm_iter;
562 use_operand_p use_p;
563 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
564 if (!is_gimple_debug (USE_STMT (use_p))
565 && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
566 || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
567 USE_STMT (use_p))))
568 return false;
569
570 /* First-order recurrence autovectorization needs shuffle vector. */
571 tree scalar_type = TREE_TYPE (def);
572 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
573 if (!vectype)
574 return false;
575
576 return true;
577 }
578
/* Function vect_analyze_scalar_cycles_1.

   Examine the cross iteration def-use cycles of scalar variables
   in LOOP.  LOOP_VINFO represents the loop that is now being
   considered for vectorization (can be LOOP, or an outer-loop
   enclosing LOOP).  SLP indicates there will be some subsequent
   slp analyses or not.

   Works in two phases: first every loop-header PHI is classified as
   an induction or pushed on a worklist; then the worklist entries are
   classified as (double) reductions, nested cycles, first-order
   recurrences, or left unknown.  */

static void
vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
			      bool slp)
{
  basic_block bb = loop->header;
  tree init, step;
  auto_vec<stmt_vec_info, 64> worklist;
  gphi_iterator gsi;
  bool double_reduc, reduc_chain;

  DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");

  /* First - identify all inductions.  Reduction detection assumes that all the
     inductions have been identified, therefore, this order must not be
     changed.  */
  for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
    {
      gphi *phi = gsi.phi ();
      tree access_fn = NULL;
      tree def = PHI_RESULT (phi);
      stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
			 (gimple *) phi);

      /* Skip virtual phi's.  The data dependences that are associated with
	 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
      if (virtual_operand_p (def))
	continue;

      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;

      /* Analyze the evolution function.  */
      access_fn = analyze_scalar_evolution (loop, def);
      if (access_fn)
	{
	  STRIP_NOPS (access_fn);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Access function of PHI: %T\n", access_fn);
	  STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
	    = initial_condition_in_loop_num (access_fn, loop->num);
	  STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
	    = evolution_part_in_loop_num (access_fn, loop->num);
	}

      /* Not a recognizable linear IV (or, for the loop being vectorized,
	 not a recognizable nonlinear IV either): defer to phase two.  */
      if ((!access_fn
	   || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
	   || !vect_is_simple_iv_evolution (loop->num, access_fn,
					    &init, &step)
	   || (LOOP_VINFO_LOOP (loop_vinfo) != loop
	       && TREE_CODE (step) != INTEGER_CST))
	  /* Only handle nonlinear iv for same loop.  */
	  && (LOOP_VINFO_LOOP (loop_vinfo) != loop
	      || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
						  phi, &init, &step)))
	{
	  worklist.safe_push (stmt_vinfo);
	  continue;
	}

      gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
		  != NULL_TREE);
      gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
    }


  /* Second - identify all reductions and nested cycles.  */
  while (worklist.length () > 0)
    {
      stmt_vec_info stmt_vinfo = worklist.pop ();
      gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
      tree def = PHI_RESULT (phi);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
			 (gimple *) phi);

      gcc_assert (!virtual_operand_p (def)
		  && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);

      stmt_vec_info reduc_stmt_info
	= vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
				    &reduc_chain, slp);
      if (reduc_stmt_info)
	{
	  /* Cross-link the PHI and the reduction statement.  */
	  STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
	  STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
	  if (double_reduc)
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Detected double reduction.\n");

	      STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
	      STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
	    }
	  else
	    {
	      if (loop != LOOP_VINFO_LOOP (loop_vinfo))
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
				     "Detected vectorizable nested cycle.\n");

		  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
		}
	      else
		{
		  if (dump_enabled_p ())
		    dump_printf_loc (MSG_NOTE, vect_location,
				     "Detected reduction.\n");

		  STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
		  STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
		  /* Store the reduction cycles for possible vectorization in
		     loop-aware SLP if it was not detected as reduction
		     chain.  */
		  if (! reduc_chain)
		    LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
		      (reduc_stmt_info);
		}
	    }
	}
      else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
	STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
      else
	if (dump_enabled_p ())
	  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			   "Unknown def-use cycle pattern.\n");
    }
}
724
725
726 /* Function vect_analyze_scalar_cycles.
727
728 Examine the cross iteration def-use cycles of scalar variables, by
729 analyzing the loop-header PHIs of scalar variables. Classify each
730 cycle as one of the following: invariant, induction, reduction, unknown.
731 We do that for the loop represented by LOOP_VINFO, and also to its
732 inner-loop, if exists.
733 Examples for scalar cycles:
734
735 Example1: reduction:
736
737 loop1:
738 for (i=0; i<N; i++)
739 sum += a[i];
740
741 Example2: induction:
742
743 loop2:
744 for (i=0; i<N; i++)
745 a[i] = i; */
746
747 static void
748 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
749 {
750 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
751
752 vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
753
754 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
755 Reductions in such inner-loop therefore have different properties than
756 the reductions in the nest that gets vectorized:
757 1. When vectorized, they are executed in the same order as in the original
758 scalar loop, so we can't change the order of computation when
759 vectorizing them.
760 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
761 current checks are too strict. */
762
763 if (loop->inner)
764 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
765 }
766
/* Transfer group and reduction information from STMT_INFO to its
   pattern stmt.  Walks the reduction chain starting at STMT_INFO and
   builds the parallel chain over the related pattern stmts, headed by
   the pattern stmt of the chain's first element.  */

static void
vect_fixup_reduc_chain (stmt_vec_info stmt_info)
{
  /* FIRSTP is the pattern stmt replacing the chain head; it must not
     already belong to a group while STMT_INFO must.  */
  stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
  stmt_vec_info stmtp;
  gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
	      && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
  REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
  /* For each chain member, point its pattern stmt at FIRSTP and link
     it to the pattern stmt of the next member (if any).  */
  do
    {
      stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
      gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
			   == STMT_VINFO_DEF_TYPE (stmt_info));
      REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
      stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
      if (stmt_info)
	REDUC_GROUP_NEXT_ELEMENT (stmtp)
	  = STMT_VINFO_RELATED_STMT (stmt_info);
    }
  while (stmt_info);
}
791
/* Fixup scalar cycles that now have their stmts detected as patterns.
   For each recorded reduction chain either retarget the chain to the
   pattern stmts (when every member was uniformly pattern-recognized and
   kept a valid reduction index) or dissolve the chain and fall back to
   treating it as a regular reduction.  */

static void
vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
{
  stmt_vec_info first;
  unsigned i;

  FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
    {
      /* Scan the chain; NEXT ends up NULL iff all members agree with
	 FIRST on pattern status and have a valid reduction index.  */
      stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
      while (next)
	{
	  if ((STMT_VINFO_IN_PATTERN_P (next)
	       != STMT_VINFO_IN_PATTERN_P (first))
	      || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
	    break;
	  next = REDUC_GROUP_NEXT_ELEMENT (next);
	}
      /* If all reduction chain members are well-formed patterns adjust
	 the group to group the pattern stmts instead.  */
      if (! next
	  && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
	{
	  if (STMT_VINFO_IN_PATTERN_P (first))
	    {
	      vect_fixup_reduc_chain (first);
	      LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
		= STMT_VINFO_RELATED_STMT (first);
	    }
	}
      /* If not all stmt in the chain are patterns or if we failed
	 to update STMT_VINFO_REDUC_IDX dissolve the chain and handle
	 it as regular reduction instead.  */
      else
	{
	  stmt_vec_info vinfo = first;
	  stmt_vec_info last = NULL;
	  while (vinfo)
	    {
	      next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
	      REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
	      REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
	      last = vinfo;
	      vinfo = next;
	    }
	  STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
	    = vect_internal_def;
	  /* Record the tail of the dissolved chain as a plain reduction.  */
	  loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
	  /* unordered_remove compacts the vector; step back so the swapped
	     in element is visited too.  */
	  LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
	  --i;
	}
    }
}
846
/* Function vect_get_loop_niters.

   Determine how many iterations the loop is executed and place it
   in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
   in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
   niter information holds in ASSUMPTIONS.

   MAIN_EXIT is the exit the niter information is computed from; all
   other exits only contribute their controlling GIMPLE_COND.

   Return the loop exit conditions.  */


static vec<gcond *>
vect_get_loop_niters (class loop *loop, const_edge main_exit, tree *assumptions,
		      tree *number_of_iterations, tree *number_of_iterationsm1)
{
  auto_vec<edge> exits = get_loop_exit_edges (loop);
  vec<gcond *> conds;
  conds.create (exits.length ());
  class tree_niter_desc niter_desc;
  tree niter_assumptions, niter, may_be_zero;

  /* Pessimistic defaults in case analysis below fails.  */
  *assumptions = boolean_true_node;
  *number_of_iterationsm1 = chrec_dont_know;
  *number_of_iterations = chrec_dont_know;

  DUMP_VECT_SCOPE ("get_loop_niters");

  if (exits.is_empty ())
    return conds;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "Loop has %d exits.\n",
		     exits.length ());

  edge exit;
  unsigned int i;
  FOR_EACH_VEC_ELT (exits, i, exit)
    {
      gcond *cond = get_loop_exit_condition (exit);
      if (cond)
	conds.safe_push (cond);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Analyzing exit %d...\n", i);

      /* Only the main exit determines the iteration count.  */
      if (exit != main_exit)
	continue;

      may_be_zero = NULL_TREE;
      if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
	  || chrec_contains_undetermined (niter_desc.niter))
	continue;

      niter_assumptions = niter_desc.assumptions;
      may_be_zero = niter_desc.may_be_zero;
      niter = niter_desc.niter;

      /* Normalize a trivially-false may_be_zero to "absent".  */
      if (may_be_zero && integer_zerop (may_be_zero))
	may_be_zero = NULL_TREE;

      if (may_be_zero)
	{
	  if (COMPARISON_CLASS_P (may_be_zero))
	    {
	      /* Try to combine may_be_zero with assumptions, this can simplify
		 computation of niter expression.  */
	      if (niter_assumptions && !integer_nonzerop (niter_assumptions))
		niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
						 niter_assumptions,
						 fold_build1 (TRUTH_NOT_EXPR,
							      boolean_type_node,
							      may_be_zero));
	      else
		niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
				     build_int_cst (TREE_TYPE (niter), 0),
				     rewrite_to_non_trapping_overflow (niter));

	      may_be_zero = NULL_TREE;
	    }
	  else if (integer_nonzerop (may_be_zero))
	    {
	      /* The loop provably runs once: 0 latch iterations,
		 1 header execution.  */
	      *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
	      *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
	      continue;
	    }
	  else
	    continue;
	}

      /* Loop assumptions are based off the normal exit.  */
      *assumptions = niter_assumptions;
      *number_of_iterationsm1 = niter;

      /* We want the number of loop header executions which is the number
	 of latch executions plus one.
	 ??? For UINT_MAX latch executions this number overflows to zero
	 for loops like do { n++; } while (n != 0);  */
      if (niter && !chrec_contains_undetermined (niter))
	{
	  niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter),
			       unshare_expr (niter),
			       build_int_cst (TREE_TYPE (niter), 1));
	  if (TREE_CODE (niter) == INTEGER_CST
	      && TREE_CODE (*number_of_iterationsm1) != INTEGER_CST)
	    {
	      /* If we manage to fold niter + 1 into INTEGER_CST even when
		 niter is some complex expression, ensure back
		 *number_of_iterationsm1 is an INTEGER_CST as well.  See
		 PR113210.  */
	      *number_of_iterationsm1
		= fold_build2 (PLUS_EXPR, TREE_TYPE (niter), niter,
			       build_minus_one_cst (TREE_TYPE (niter)));
	    }
	}
      *number_of_iterations = niter;
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "All loop exits successfully analyzed.\n");

  return conds;
}
968
/* Determine the main loop exit for the vectorizer.  */

edge
vec_init_loop_exit_info (class loop *loop)
{
  /* Before we begin we must first determine which exit is the main one and
     which are auxilary exits.  */
  auto_vec<edge> exits = get_loop_exit_edges (loop);
  /* A single-exit loop trivially has that exit as the main one.  */
  if (exits.length () == 1)
    return exits[0];

  /* If we have multiple exits we only support counting IV at the moment.
     Analyze all exits and return the last one we can analyze.  */
  class tree_niter_desc niter_desc;
  edge candidate = NULL;
  for (edge exit : exits)
    {
      /* Exits without an analyzable exit condition cannot serve as the
	 main exit.  */
      if (!get_loop_exit_condition (exit))
	continue;

      /* Only consider exits whose iteration count analysis succeeded and
	 produced a determined niter expression.  */
      if (number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
	  && !chrec_contains_undetermined (niter_desc.niter))
	{
	  tree may_be_zero = niter_desc.may_be_zero;
	  if ((integer_zerop (may_be_zero)
	       /* As we are handling may_be_zero that's not false by
		  rewriting niter to may_be_zero ? 0 : niter we require
		  an empty latch.  */
	       || (single_pred_p (loop->latch)
		   && exit->src == single_pred (loop->latch)
		   && (integer_nonzerop (may_be_zero)
		       || COMPARISON_CLASS_P (may_be_zero))))
	      /* Among viable exits prefer one whose source block is
		 dominated by the current candidate's, i.e. a later exit.  */
	      && (!candidate
		  || dominated_by_p (CDI_DOMINATORS, exit->src,
				     candidate->src)))
	    candidate = exit;
	}
    }

  /* NULL when no exit qualified; callers treat that as "cannot
     vectorize this multi-exit loop".  */
  return candidate;
}
1010
1011 /* Function bb_in_loop_p
1012
1013 Used as predicate for dfs order traversal of the loop bbs. */
1014
1015 static bool
1016 bb_in_loop_p (const_basic_block bb, const void *data)
1017 {
1018 const class loop *const loop = (const class loop *)data;
1019 if (flow_bb_inside_loop_p (loop, bb))
1020 return true;
1021 return false;
1022 }
1023
1024
/* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
   stmt_vec_info structs for all the stmts in LOOP_IN.  */

_loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
  : vec_info (vec_info::loop, shared),
    loop (loop_in),
    bbs (XCNEWVEC (basic_block, loop->num_nodes)),
    num_itersm1 (NULL_TREE),
    num_iters (NULL_TREE),
    num_iters_unchanged (NULL_TREE),
    num_iters_assumptions (NULL_TREE),
    vector_costs (nullptr),
    scalar_costs (nullptr),
    th (0),
    versioning_threshold (0),
    vectorization_factor (0),
    main_loop_edge (nullptr),
    skip_main_loop_edge (nullptr),
    skip_this_loop_edge (nullptr),
    reusable_accumulators (),
    suggested_unroll_factor (1),
    max_vectorization_factor (0),
    mask_skip_niters (NULL_TREE),
    rgroup_compare_type (NULL_TREE),
    simd_if_cond (NULL_TREE),
    partial_vector_style (vect_partial_vectors_none),
    unaligned_dr (NULL),
    peeling_for_alignment (0),
    ptr_mask (0),
    ivexpr_map (NULL),
    scan_map (NULL),
    slp_unrolling_factor (1),
    inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
    vectorizable (false),
    can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
    using_partial_vectors_p (false),
    using_decrementing_iv_p (false),
    using_select_vl_p (false),
    epil_using_partial_vectors_p (false),
    partial_load_store_bias (0),
    peeling_for_gaps (false),
    peeling_for_niter (false),
    early_breaks (false),
    no_data_dependencies (false),
    has_mask_store (false),
    scalar_loop_scaling (profile_probability::uninitialized ()),
    scalar_loop (NULL),
    orig_loop_info (NULL),
    vec_loop_iv_exit (NULL),
    vec_epilogue_loop_iv_exit (NULL),
    scalar_loop_iv_exit (NULL)
{
  /* CHECKME: We want to visit all BBs before their successors (except for
     latch blocks, for which this assertion wouldn't hold).  In the simple
     case of the loop forms we allow, a dfs order of the BBs would the same
     as reversed postorder traversal, so we are safe.  */

  unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
					  bbs, loop->num_nodes, loop);
  gcc_assert (nbbs == loop->num_nodes);

  /* Create a stmt_vec_info for every PHI and every non-debug statement
     in the loop body, resetting UIDs so add_stmt can assign fresh ones.  */
  for (unsigned int i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];
      gimple_stmt_iterator si;

      for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
	{
	  gimple *phi = gsi_stmt (si);
	  gimple_set_uid (phi, 0);
	  add_stmt (phi);
	}

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
	{
	  gimple *stmt = gsi_stmt (si);
	  gimple_set_uid (stmt, 0);
	  /* Debug statements get a UID but no stmt_vec_info.  */
	  if (is_gimple_debug (stmt))
	    continue;
	  add_stmt (stmt);
	  /* If .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
	     third argument is the #pragma omp simd if (x) condition, when 0,
	     loop shouldn't be vectorized, when non-zero constant, it should
	     be vectorized normally, otherwise versioned with vectorized loop
	     done if the condition is non-zero at runtime.  */
	  if (loop_in->simduid
	      && is_gimple_call (stmt)
	      && gimple_call_internal_p (stmt)
	      && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
	      && gimple_call_num_args (stmt) >= 3
	      && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
	      && (loop_in->simduid
		  == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
	    {
	      tree arg = gimple_call_arg (stmt, 2);
	      if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
		simd_if_cond = arg;
	      else
		gcc_assert (integer_nonzerop (arg));
	    }
	}
    }

  epilogue_vinfos.create (6);
}
1130
1131 /* Free all levels of rgroup CONTROLS. */
1132
1133 void
1134 release_vec_loop_controls (vec<rgroup_controls> *controls)
1135 {
1136 rgroup_controls *rgc;
1137 unsigned int i;
1138 FOR_EACH_VEC_ELT (*controls, i, rgc)
1139 rgc->controls.release ();
1140 controls->release ();
1141 }
1142
/* Free all memory used by the _loop_vec_info, as well as all the
   stmt_vec_info structs of all the stmts in the loop.  */

_loop_vec_info::~_loop_vec_info ()
{
  /* The basic-block array was XCNEWVEC-allocated in the constructor.  */
  free (bbs);

  release_vec_loop_controls (&masks.rgc_vec);
  release_vec_loop_controls (&lens);
  delete ivexpr_map;
  delete scan_map;
  epilogue_vinfos.release ();
  delete scalar_costs;
  delete vector_costs;

  /* When we release an epiloge vinfo that we do not intend to use
     avoid clearing AUX of the main loop which should continue to
     point to the main loop vinfo since otherwise we'll leak that.  */
  if (loop->aux == this)
    loop->aux = NULL;
}
1164
1165 /* Return an invariant or register for EXPR and emit necessary
1166 computations in the LOOP_VINFO loop preheader. */
1167
1168 tree
1169 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1170 {
1171 if (is_gimple_reg (expr)
1172 || is_gimple_min_invariant (expr))
1173 return expr;
1174
1175 if (! loop_vinfo->ivexpr_map)
1176 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1177 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1178 if (! cached)
1179 {
1180 gimple_seq stmts = NULL;
1181 cached = force_gimple_operand (unshare_expr (expr),
1182 &stmts, true, NULL_TREE);
1183 if (stmts)
1184 {
1185 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1186 gsi_insert_seq_on_edge_immediate (e, stmts);
1187 }
1188 }
1189 return cached;
1190 }
1191
1192 /* Return true if we can use CMP_TYPE as the comparison type to produce
1193 all masks required to mask LOOP_VINFO. */
1194
1195 static bool
1196 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1197 {
1198 rgroup_controls *rgm;
1199 unsigned int i;
1200 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1201 if (rgm->type != NULL_TREE
1202 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1203 cmp_type, rgm->type,
1204 OPTIMIZE_FOR_SPEED))
1205 return false;
1206 return true;
1207 }
1208
1209 /* Calculate the maximum number of scalars per iteration for every
1210 rgroup in LOOP_VINFO. */
1211
1212 static unsigned int
1213 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1214 {
1215 unsigned int res = 1;
1216 unsigned int i;
1217 rgroup_controls *rgm;
1218 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1219 res = MAX (res, rgm->max_nscalars_per_iter);
1220 return res;
1221 }
1222
1223 /* Calculate the minimum precision necessary to represent:
1224
1225 MAX_NITERS * FACTOR
1226
1227 as an unsigned integer, where MAX_NITERS is the maximum number of
1228 loop header iterations for the original scalar form of LOOP_VINFO. */
1229
1230 static unsigned
1231 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1232 {
1233 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1234
1235 /* Get the maximum number of iterations that is representable
1236 in the counter type. */
1237 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1238 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1239
1240 /* Get a more refined estimate for the number of iterations. */
1241 widest_int max_back_edges;
1242 if (max_loop_iterations (loop, &max_back_edges))
1243 max_ni = wi::smin (max_ni, max_back_edges + 1);
1244
1245 /* Work out how many bits we need to represent the limit. */
1246 return wi::min_precision (max_ni * factor, UNSIGNED);
1247 }
1248
/* True if the loop needs peeling or partial vectors when vectorized.  */

static bool
vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
{
  unsigned HOST_WIDE_INT const_vf;
  HOST_WIDE_INT max_niter
    = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));

  /* For an epilogue loop inherit the cost-model threshold of the
     main loop when none was computed for this vinfo.  */
  unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
  if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
    th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
					  (loop_vinfo));

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
    {
      /* Work out the (constant) number of iterations that need to be
	 peeled for reasons other than niters.  */
      unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
	peel_niter += 1;
      /* Known niters: need peeling iff what remains after prologue
	 peeling is not an exact multiple of the VF.  */
      if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
		       LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
	return true;
    }
  else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
	   /* ??? When peeling for gaps but not alignment, we could
	      try to check whether the (variable) niters is known to be
	      VF * N + 1.  That's something of a niche case though.  */
	   || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
	   || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
	   || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
		< (unsigned) exact_log2 (const_vf))
	       /* In case of versioning, check if the maximum number of
		  iterations is greater than th.  If they are identical,
		  the epilogue is unnecessary.  */
	       && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
		   || ((unsigned HOST_WIDE_INT) max_niter
		       /* We'd like to use LOOP_VINFO_VERSIONING_THRESHOLD
			  but that's only computed later based on our result.
			  The following is the most conservative approximation.  */
		       > (std::max ((unsigned HOST_WIDE_INT) th,
				    const_vf) / const_vf) * const_vf))))
    return true;

  return false;
}
1297
/* Each statement in LOOP_VINFO can be masked where necessary.  Check
   whether we can actually generate the masks required.  Return true if so,
   storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE.  */

static bool
vect_verify_full_masking (loop_vec_info loop_vinfo)
{
  unsigned int min_ni_width;

  /* Use a normal loop if there are no statements that need masking.
     This only happens in rare degenerate cases: it means that the loop
     has no loads, no stores, and no live-out values.  */
  if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
    return false;

  /* Produce the rgroup controls.  */
  for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
    {
      vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
      tree vectype = mask.first;
      unsigned nvectors = mask.second;

      /* rgc_vec is indexed by the number of vectors minus one.  */
      if (masks->rgc_vec.length () < nvectors)
	masks->rgc_vec.safe_grow_cleared (nvectors, true);
      rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
      /* The number of scalars per iteration and the number of vectors are
	 both compile-time constants.  */
      unsigned int nscalars_per_iter
	= exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
		     LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();

      /* Record the widest requirement seen for this rgroup.  */
      if (rgm->max_nscalars_per_iter < nscalars_per_iter)
	{
	  rgm->max_nscalars_per_iter = nscalars_per_iter;
	  rgm->type = truth_type_for (vectype);
	  rgm->factor = 1;
	}
    }

  unsigned int max_nscalars_per_iter
    = vect_get_max_nscalars_per_iter (loop_vinfo);

  /* Work out how many bits we need to represent the limit.  */
  min_ni_width
    = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);

  /* Find a scalar mode for which WHILE_ULT is supported.  */
  opt_scalar_int_mode cmp_mode_iter;
  tree cmp_type = NULL_TREE;
  tree iv_type = NULL_TREE;
  widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
  unsigned int iv_precision = UINT_MAX;

  if (iv_limit != -1)
    iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
				      UNSIGNED);

  FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
    {
      unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
      if (cmp_bits >= min_ni_width
	  && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
	{
	  tree this_type = build_nonstandard_integer_type (cmp_bits, true);
	  if (this_type
	      && can_produce_all_loop_masks_p (loop_vinfo, this_type))
	    {
	      /* Although we could stop as soon as we find a valid mode,
		 there are at least two reasons why that's not always the
		 best choice:

		 - An IV that's Pmode or wider is more likely to be reusable
		   in address calculations than an IV that's narrower than
		   Pmode.

		 - Doing the comparison in IV_PRECISION or wider allows
		   a natural 0-based IV, whereas using a narrower comparison
		   type requires mitigations against wrap-around.

		 Conversely, if the IV limit is variable, doing the comparison
		 in a wider type than the original type can introduce
		 unnecessary extensions, so picking the widest valid mode
		 is not always a good choice either.

		 Here we prefer the first IV type that's Pmode or wider,
		 and the first comparison type that's IV_PRECISION or wider.
		 (The comparison type must be no wider than the IV type,
		 to avoid extensions in the vector loop.)

		 ??? We might want to try continuing beyond Pmode for ILP32
		 targets if CMP_BITS < IV_PRECISION.  */
	      iv_type = this_type;
	      if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
		cmp_type = this_type;
	      if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
		break;
	    }
	}
    }

  /* No usable comparison type found: drop the rgroups and decline.  */
  if (!cmp_type)
    {
      LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
      return false;
    }

  LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
  LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
  LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
  return true;
}
1409
/* Each statement in LOOP_VINFO can be masked where necessary.  Check
   whether we can actually generate AVX512 style masks.  Return true if so,
   storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE.  */

static bool
vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
{
  /* Produce differently organized rgc_vec and differently check
     we can produce masks.  */

  /* Use a normal loop if there are no statements that need masking.
     This only happens in rare degenerate cases: it means that the loop
     has no loads, no stores, and no live-out values.  */
  if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
    return false;

  /* For the decrementing IV we need to represent all values in
     [0, niter + niter_skip] where niter_skip is the elements we
     skip in the first iteration for prologue peeling.  */
  tree iv_type = NULL_TREE;
  widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
  unsigned int iv_precision = UINT_MAX;
  if (iv_limit != -1)
    iv_precision = wi::min_precision (iv_limit, UNSIGNED);

  /* First compute the type for the IV we use to track the remaining
     scalar iterations.  */
  opt_scalar_int_mode cmp_mode_iter;
  FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
    {
      unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
      if (cmp_bits >= iv_precision
	  && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
	{
	  iv_type = build_nonstandard_integer_type (cmp_bits, true);
	  if (iv_type)
	    break;
	}
    }
  if (!iv_type)
    return false;

  /* Produce the rgroup controls.  */
  for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
    {
      vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
      tree vectype = mask.first;
      unsigned nvectors = mask.second;

      /* The number of scalars per iteration and the number of vectors are
	 both compile-time constants.  */
      unsigned int nscalars_per_iter
	= exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
		     LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();

      /* We index the rgroup_controls vector with nscalars_per_iter
	 which we keep constant and instead have a varying nvectors,
	 remembering the vector mask with the fewest nV.  */
      if (masks->rgc_vec.length () < nscalars_per_iter)
	masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
      rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];

      if (!rgm->type || rgm->factor > nvectors)
	{
	  rgm->type = truth_type_for (vectype);
	  rgm->compare_type = NULL_TREE;
	  rgm->max_nscalars_per_iter = nscalars_per_iter;
	  rgm->factor = nvectors;
	  rgm->bias_adjusted_ctrl = NULL_TREE;
	}
    }

  /* There is no fixed compare type we are going to use but we have to
     be able to get at one for each mask group.  */
  unsigned int min_ni_width
    = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);

  bool ok = true;
  for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
    {
      tree mask_type = rgc.type;
      /* Skip unused rgroup slots.  */
      if (!mask_type)
	continue;

      /* For now vect_get_loop_mask only supports integer mode masks
	 when we need to split it.  */
      if (GET_MODE_CLASS (TYPE_MODE (mask_type)) != MODE_INT
	  || TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
	{
	  ok = false;
	  break;
	}

      /* If iv_type is usable as compare type use that - we can elide the
	 saturation in that case.   */
      if (TYPE_PRECISION (iv_type) >= min_ni_width)
	{
	  tree cmp_vectype
	    = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
	  if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
	    rgc.compare_type = cmp_vectype;
	}
      /* Otherwise fall back to the narrowest integer mode for which the
	 target can expand the required vector comparison.  */
      if (!rgc.compare_type)
	FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
	  {
	    unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
	    if (cmp_bits >= min_ni_width
		&& targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
	      {
		tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
		if (!cmp_type)
		  continue;

		/* Check whether we can produce the mask with cmp_type.  */
		tree cmp_vectype
		  = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
		if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
		  {
		    rgc.compare_type = cmp_vectype;
		    break;
		  }
	      }
	  }
      if (!rgc.compare_type)
	{
	  ok = false;
	  break;
	}
    }
  if (!ok)
    {
      release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
      return false;
    }

  /* error_mark_node signals "per-rgroup compare types", recorded in
     each rgroup's compare_type above.  */
  LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
  LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
  LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
  return true;
}
1550
/* Check whether we can use vector access with length based on precision
   comparison.  So far, to keep it simple, we only allow the case that the
   precision of the target supported length is larger than the precision
   required by loop niters.  */

static bool
vect_verify_loop_lens (loop_vec_info loop_vinfo)
{
  if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
    return false;

  /* The target must support length-controlled loads and stores for the
     chosen vector mode.  */
  machine_mode len_load_mode, len_store_mode;
  if (!get_len_load_store_mode (loop_vinfo->vector_mode, true)
	 .exists (&len_load_mode))
    return false;
  if (!get_len_load_store_mode (loop_vinfo->vector_mode, false)
	 .exists (&len_store_mode))
    return false;

  signed char partial_load_bias = internal_len_load_store_bias
    (IFN_LEN_LOAD, len_load_mode);

  signed char partial_store_bias = internal_len_load_store_bias
    (IFN_LEN_STORE, len_store_mode);

  /* Loads and stores must agree on the bias.  */
  gcc_assert (partial_load_bias == partial_store_bias);

  if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
    return false;

  /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
     len_loads with a length of zero.  In order to avoid that we prohibit
     more than one loop length here.  */
  if (partial_load_bias == -1
      && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
    return false;

  LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;

  unsigned int max_nitems_per_iter = 1;
  unsigned int i;
  rgroup_controls *rgl;
  /* Find the maximum number of items per iteration for every rgroup.  */
  FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
    {
      unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
      max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
    }

  /* Work out how many bits we need to represent the length limit.  */
  unsigned int min_ni_prec
    = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);

  /* Now use the maximum of below precisions for one suitable IV type:
     - the IV's natural precision
     - the precision needed to hold: the maximum number of scalar
       iterations multiplied by the scale factor (min_ni_prec above)
     - the Pmode precision

     If min_ni_prec is less than the precision of the current niters,
     we prefer to still use the niters type.  Prefer to use Pmode and
     wider IV to avoid narrow conversions.  */

  unsigned int ni_prec
    = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
  min_ni_prec = MAX (min_ni_prec, ni_prec);
  min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));

  tree iv_type = NULL_TREE;
  opt_scalar_int_mode tmode_iter;
  FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
    {
      scalar_mode tmode = tmode_iter.require ();
      unsigned int tbits = GET_MODE_BITSIZE (tmode);

      /* ??? Do we really want to construct one IV whose precision exceeds
	 BITS_PER_WORD?  */
      if (tbits > BITS_PER_WORD)
	break;

      /* Find the first available standard integral type.  */
      if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
	{
	  iv_type = build_nonstandard_integer_type (tbits, true);
	  break;
	}
    }

  if (!iv_type)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "can't vectorize with length-based partial vectors"
			 " because there is no suitable iv type.\n");
      return false;
    }

  LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
  LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
  LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;

  return true;
}
1654
/* Calculate the cost of one scalar iteration of the loop.  */
static void
vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes, factor;
  int innerloop_iters, i;

  DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");

  /* Gather costs for statements in the scalar loop.  */

  /* FORNOW.  */
  innerloop_iters = 1;
  if (loop->inner)
    innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);

  for (i = 0; i < nbbs; i++)
    {
      gimple_stmt_iterator si;
      basic_block bb = bbs[i];

      /* Statements in the inner loop execute more often; scale their
	 cost accordingly.  */
      if (bb->loop_father == loop->inner)
	factor = innerloop_iters;
      else
	factor = 1;

      for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
	{
	  gimple *stmt = gsi_stmt (si);
	  stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);

	  if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
	    continue;

	  /* Skip stmts that are not vectorized inside the loop.  */
	  stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
	  if (!STMT_VINFO_RELEVANT_P (vstmt_info)
	      && (!STMT_VINFO_LIVE_P (vstmt_info)
		  || !VECTORIZABLE_CYCLE_DEF
			(STMT_VINFO_DEF_TYPE (vstmt_info))))
	    continue;

	  /* Classify the statement as a load, store or generic
	     scalar operation for costing purposes.  */
	  vect_cost_for_stmt kind;
	  if (STMT_VINFO_DATA_REF (stmt_info))
	    {
	      if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
		kind = scalar_load;
	      else
		kind = scalar_store;
	    }
	  else if (vect_nop_conversion_p (stmt_info))
	    continue;
	  else
	    kind = scalar_stmt;

	  /* We are using vect_prologue here to avoid scaling twice
	     by the inner loop factor.  */
	  record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
			    factor, kind, stmt_info, 0, vect_prologue);
	}
    }

  /* Now accumulate cost.  */
  loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
  add_stmt_costs (loop_vinfo->scalar_costs,
		  &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
  loop_vinfo->scalar_costs->finish_cost (nullptr);
}
1725
/* Function vect_analyze_loop_form.

   Verify that certain CFG restrictions hold, including:
   - the loop has a pre-header
   - the loop has a single entry
   - nested loops can have only a single exit.
   - the loop exit condition is simple enough
   - the number of iterations can be analyzed, i.e, a countable loop.  The
     niter could be analyzed under some assumptions.

   Fills in INFO on success; returns an opt_result describing the
   failure reason otherwise.  */

opt_result
vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
{
  DUMP_VECT_SCOPE ("vect_analyze_loop_form");

  /* Pick the exit the vectorizer will treat as the main one.  */
  edge exit_e = vec_init_loop_exit_info (loop);
  if (!exit_e)
    return opt_result::failure_at (vect_location,
				   "not vectorized:"
				   " could not determine main exit from"
				   " loop with multiple exits.\n");
  info->loop_exit = exit_e;
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "using as main loop exit: %d -> %d [AUX: %p]\n",
		     exit_e->src->index, exit_e->dest->index, exit_e->aux);

  /* Check if we have any control flow that doesn't leave the loop.  */
  class loop *v_loop = loop->inner ? loop->inner : loop;
  basic_block *bbs = get_loop_body (v_loop);
  for (unsigned i = 0; i < v_loop->num_nodes; i++)
    if (EDGE_COUNT (bbs[i]->succs) != 1
	&& (EDGE_COUNT (bbs[i]->succs) != 2
	    || !loop_exits_from_bb_p (bbs[i]->loop_father, bbs[i])))
      {
	free (bbs);
	return opt_result::failure_at (vect_location,
				       "not vectorized:"
				       " unsupported control flow in loop.\n");
      }
  free (bbs);

  /* Different restrictions apply when we are considering an inner-most loop,
     vs. an outer (nested) loop.
     (FORNOW. May want to relax some of these restrictions in the future).  */

  info->inner_loop_cond = NULL;
  if (!loop->inner)
    {
      /* Inner-most loop.  */

      if (empty_block_p (loop->header))
	return opt_result::failure_at (vect_location,
				       "not vectorized: empty loop.\n");
    }
  else
    {
      class loop *innerloop = loop->inner;
      edge entryedge;

      /* Nested loop.  We currently require that the loop is doubly-nested,
	 contains a single inner loop with a single exit to the block
	 with the single exit condition in the outer loop.
	 Vectorizable outer-loops look like this:

			(pre-header)
			   |
			  header <---+
			   |         |
			  inner-loop |
			   |         |
			  tail ------+
			   |
			(exit-bb)

	 The inner-loop also has the properties expected of inner-most loops
	 as described above.  */

      if ((loop->inner)->inner || (loop->inner)->next)
	return opt_result::failure_at (vect_location,
				       "not vectorized:"
				       " multiple nested loops.\n");

      entryedge = loop_preheader_edge (innerloop);
      if (entryedge->src != loop->header
	  || !single_exit (innerloop)
	  || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
	return opt_result::failure_at (vect_location,
				       "not vectorized:"
				       " unsupported outerloop form.\n");

      /* Analyze the inner-loop.  */
      vect_loop_form_info inner;
      opt_result res = vect_analyze_loop_form (loop->inner, &inner);
      if (!res)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "not vectorized: Bad inner loop.\n");
	  return res;
	}

      /* Don't support analyzing niter under assumptions for inner
	 loop.  */
      if (!integer_onep (inner.assumptions))
	return opt_result::failure_at (vect_location,
				       "not vectorized: Bad inner loop.\n");

      if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
	return opt_result::failure_at (vect_location,
				       "not vectorized: inner-loop count not"
				       " invariant.\n");

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Considering outer-loop vectorization.\n");
      info->inner_loop_cond = inner.conds[0];
    }

  /* The pre-header edge plus the latch edge.  */
  if (EDGE_COUNT (loop->header->preds) != 2)
    return opt_result::failure_at (vect_location,
				   "not vectorized:"
				   " too many incoming edges.\n");

  /* We assume that the latch is empty.  */
  if (!empty_block_p (loop->latch)
      || !gimple_seq_empty_p (phi_nodes (loop->latch)))
    return opt_result::failure_at (vect_location,
				   "not vectorized: latch block not empty.\n");

  /* Make sure there is no abnormal exit.  */
  auto_vec<edge> exits = get_loop_exit_edges (loop);
  for (edge e : exits)
    {
      if (e->flags & EDGE_ABNORMAL)
	return opt_result::failure_at (vect_location,
				       "not vectorized:"
				       " abnormal loop exit edge.\n");
    }

  info->conds
    = vect_get_loop_niters (loop, exit_e, &info->assumptions,
			    &info->number_of_iterations,
			    &info->number_of_iterationsm1);
  if (info->conds.is_empty ())
    return opt_result::failure_at
	(vect_location,
	 "not vectorized: complicated exit condition.\n");

  /* Determine what the primary and alternate exit conds are.  */
  for (unsigned i = 0; i < info->conds.length (); i++)
    {
      gcond *cond = info->conds[i];
      /* Move the main exit's condition to slot 0.  */
      if (exit_e->src == gimple_bb (cond))
	std::swap (info->conds[0], info->conds[i]);
    }

  if (integer_zerop (info->assumptions)
      || !info->number_of_iterations
      || chrec_contains_undetermined (info->number_of_iterations))
    return opt_result::failure_at
      (info->conds[0],
       "not vectorized: number of iterations cannot be computed.\n");

  if (integer_zerop (info->number_of_iterations))
    return opt_result::failure_at
      (info->conds[0],
       "not vectorized: number of iterations = 0.\n");

  /* A symbolic iteration count is still vectorizable; just note it.  */
  if (!(tree_fits_shwi_p (info->number_of_iterations)
	&& tree_to_shwi (info->number_of_iterations) > 0))
    {
      if (dump_enabled_p ())
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "Symbolic number of iterations is ");
	  dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
	  dump_printf (MSG_NOTE, "\n");
	}
    }

  return opt_result::success ();
}
1909
/* Create a loop_vec_info for LOOP with SHARED and the
   vect_analyze_loop_form result INFO.  MAIN_LOOP_INFO, if non-NULL, is
   the loop_vec_info of the main vectorized loop when LOOP is being
   analyzed as its epilogue.  */

loop_vec_info
vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
			const vect_loop_form_info *info,
			loop_vec_info main_loop_info)
{
  loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
  LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
  LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
  LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
  /* Also record the assumptions for versioning.  Epilogues inherit the
     versioning decision from the main loop, so skip them here.  */
  if (!integer_onep (info->assumptions) && !main_loop_info)
    LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;

  for (gcond *cond : info->conds)
    {
      stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (cond);
      STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
      /* Mark the statement as a condition.  */
      STMT_VINFO_DEF_TYPE (loop_cond_info) = vect_condition_def;
    }

  /* info->conds[0] is the primary (IV) exit condition; any remaining
     conditions are alternate exits.  */
  for (unsigned i = 1; i < info->conds.length (); i ++)
    LOOP_VINFO_LOOP_CONDS (loop_vinfo).safe_push (info->conds[i]);
  LOOP_VINFO_LOOP_IV_COND (loop_vinfo) = info->conds[0];

  LOOP_VINFO_IV_EXIT (loop_vinfo) = info->loop_exit;

  /* Check to see if we're vectorizing multiple exits.  */
  LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
    = !LOOP_VINFO_LOOP_CONDS (loop_vinfo).is_empty ();

  if (info->inner_loop_cond)
    {
      stmt_vec_info inner_loop_cond_info
	= loop_vinfo->lookup_stmt (info->inner_loop_cond);
      STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
      /* If we have an estimate on the number of iterations of the inner
	 loop use that to limit the scale for costing, otherwise use
	 --param vect-inner-loop-cost-factor literally.  */
      widest_int nit;
      if (estimated_stmt_executions (loop->inner, &nit))
	LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
	  = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
    }

  return loop_vinfo;
}
1961
1962
1963
1964 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1965 statements update the vectorization factor. */
1966
1967 static void
1968 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1969 {
1970 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1971 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1972 int nbbs = loop->num_nodes;
1973 poly_uint64 vectorization_factor;
1974 int i;
1975
1976 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1977
1978 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1979 gcc_assert (known_ne (vectorization_factor, 0U));
1980
1981 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1982 vectorization factor of the loop is the unrolling factor required by
1983 the SLP instances. If that unrolling factor is 1, we say, that we
1984 perform pure SLP on loop - cross iteration parallelism is not
1985 exploited. */
1986 bool only_slp_in_loop = true;
1987 for (i = 0; i < nbbs; i++)
1988 {
1989 basic_block bb = bbs[i];
1990 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1991 gsi_next (&si))
1992 {
1993 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1994 if (!stmt_info)
1995 continue;
1996 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1997 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1998 && !PURE_SLP_STMT (stmt_info))
1999 /* STMT needs both SLP and loop-based vectorization. */
2000 only_slp_in_loop = false;
2001 }
2002 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2003 gsi_next (&si))
2004 {
2005 if (is_gimple_debug (gsi_stmt (si)))
2006 continue;
2007 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2008 stmt_info = vect_stmt_to_vectorize (stmt_info);
2009 if ((STMT_VINFO_RELEVANT_P (stmt_info)
2010 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2011 && !PURE_SLP_STMT (stmt_info))
2012 /* STMT needs both SLP and loop-based vectorization. */
2013 only_slp_in_loop = false;
2014 }
2015 }
2016
2017 if (only_slp_in_loop)
2018 {
2019 if (dump_enabled_p ())
2020 dump_printf_loc (MSG_NOTE, vect_location,
2021 "Loop contains only SLP stmts\n");
2022 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
2023 }
2024 else
2025 {
2026 if (dump_enabled_p ())
2027 dump_printf_loc (MSG_NOTE, vect_location,
2028 "Loop contains SLP and non-SLP stmts\n");
2029 /* Both the vectorization factor and unroll factor have the form
2030 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
2031 so they must have a common multiple. */
2032 vectorization_factor
2033 = force_common_multiple (vectorization_factor,
2034 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
2035 }
2036
2037 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
2038 if (dump_enabled_p ())
2039 {
2040 dump_printf_loc (MSG_NOTE, vect_location,
2041 "Updating vectorization factor to ");
2042 dump_dec (MSG_NOTE, vectorization_factor);
2043 dump_printf (MSG_NOTE, ".\n");
2044 }
2045 }
2046
2047 /* Return true if STMT_INFO describes a double reduction phi and if
2048 the other phi in the reduction is also relevant for vectorization.
2049 This rejects cases such as:
2050
2051 outer1:
2052 x_1 = PHI <x_3(outer2), ...>;
2053 ...
2054
2055 inner:
2056 x_2 = ...;
2057 ...
2058
2059 outer2:
2060 x_3 = PHI <x_2(inner)>;
2061
2062 if nothing in x_2 or elsewhere makes x_1 relevant. */
2063
2064 static bool
2065 vect_active_double_reduction_p (stmt_vec_info stmt_info)
2066 {
2067 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
2068 return false;
2069
2070 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
2071 }
2072
/* Function vect_analyze_loop_operations.

   Scan the loop stmts and make sure they are all vectorizable.
   Returns success if every relevant stmt can be vectorized and at
   least one stmt actually needs vectorization.  */

static opt_result
vect_analyze_loop_operations (loop_vec_info loop_vinfo)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  int i;
  stmt_vec_info stmt_info;
  bool need_to_vectorize = false;
  bool ok;

  DUMP_VECT_SCOPE ("vect_analyze_loop_operations");

  /* Costs recorded by the vectorizable_* routines below; committed to
     the loop's vector_costs once the whole scan succeeds.  */
  auto_vec<stmt_info_for_cost> cost_vec;

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];

      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  gphi *phi = si.phi ();
	  ok = true;

	  stmt_info = loop_vinfo->lookup_stmt (phi);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
			     (gimple *) phi);
	  /* Virtual (memory SSA) phis need no vectorization.  */
	  if (virtual_operand_p (gimple_phi_result (phi)))
	    continue;

	  /* Inner-loop loop-closed exit phi in outer-loop vectorization
	     (i.e., a phi in the tail of the outer-loop).  */
	  if (! is_loop_header_bb_p (bb))
	    {
	      /* FORNOW: we currently don't support the case that these phis
		 are not used in the outerloop (unless it is double reduction,
		 i.e., this phi is vect_reduction_def), cause this case
		 requires to actually do something here.  */
	      if (STMT_VINFO_LIVE_P (stmt_info)
		  && !vect_active_double_reduction_p (stmt_info))
		return opt_result::failure_at (phi,
					       "Unsupported loop-closed phi"
					       " in outer-loop.\n");

	      /* If PHI is used in the outer loop, we check that its operand
		 is defined in the inner loop.  */
	      if (STMT_VINFO_RELEVANT_P (stmt_info))
		{
		  tree phi_op;

		  if (gimple_phi_num_args (phi) != 1)
		    return opt_result::failure_at (phi, "unsupported phi");

		  phi_op = PHI_ARG_DEF (phi, 0);
		  stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
		  if (!op_def_info)
		    return opt_result::failure_at (phi, "unsupported phi\n");

		  if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
		      && (STMT_VINFO_RELEVANT (op_def_info)
			  != vect_used_in_outer_by_reduction))
		    return opt_result::failure_at (phi, "unsupported phi\n");

		  if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
		       || (STMT_VINFO_DEF_TYPE (stmt_info)
			   == vect_double_reduction_def))
		      && !vectorizable_lc_phi (loop_vinfo,
					       stmt_info, NULL, NULL))
		    return opt_result::failure_at (phi, "unsupported phi\n");
		}

	      continue;
	    }

	  gcc_assert (stmt_info);

	  if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
	       || STMT_VINFO_LIVE_P (stmt_info))
	      && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
	      && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
	    /* A scalar-dependence cycle that we don't support.  */
	    return opt_result::failure_at (phi,
					   "not vectorized:"
					   " scalar dependence cycle.\n");

	  /* Dispatch on the def kind to the matching analysis routine;
	     pure-SLP phis are analyzed by the SLP machinery instead.  */
	  if (STMT_VINFO_RELEVANT_P (stmt_info))
	    {
	      need_to_vectorize = true;
	      if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
		  && ! PURE_SLP_STMT (stmt_info))
		ok = vectorizable_induction (loop_vinfo,
					     stmt_info, NULL, NULL,
					     &cost_vec);
	      else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
			|| (STMT_VINFO_DEF_TYPE (stmt_info)
			    == vect_double_reduction_def)
			|| STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
		       && ! PURE_SLP_STMT (stmt_info))
		ok = vectorizable_reduction (loop_vinfo,
					     stmt_info, NULL, NULL, &cost_vec);
	      else if ((STMT_VINFO_DEF_TYPE (stmt_info)
			== vect_first_order_recurrence)
		       && ! PURE_SLP_STMT (stmt_info))
		ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
					  &cost_vec);
	    }

	  /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
	  if (ok
	      && STMT_VINFO_LIVE_P (stmt_info)
	      && !PURE_SLP_STMT (stmt_info))
	    ok = vectorizable_live_operation (loop_vinfo, stmt_info, NULL, NULL,
					      -1, false, &cost_vec);

	  if (!ok)
	    return opt_result::failure_at (phi,
					   "not vectorized: relevant phi not "
					   "supported: %G",
					   static_cast <gimple *> (phi));
	}

      for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  gimple *stmt = gsi_stmt (si);
	  /* Clobbers and debug stmts are irrelevant to vectorization.  */
	  if (!gimple_clobber_p (stmt)
	      && !is_gimple_debug (stmt))
	    {
	      opt_result res
		= vect_analyze_stmt (loop_vinfo,
				     loop_vinfo->lookup_stmt (stmt),
				     &need_to_vectorize,
				     NULL, NULL, &cost_vec);
	      if (!res)
		return res;
	    }
	}
    } /* bbs */

  add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);

  /* All operations in the loop are either irrelevant (deal with loop
     control, or dead), or only used outside the loop and can be moved
     out of the loop (e.g. invariants, inductions).  The loop can be
     optimized away by scalar optimizations.  We're better off not
     touching this loop.  */
  if (!need_to_vectorize)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "All the computation can be taken out of the loop.\n");
      return opt_result::failure_at
	(vect_location,
	 "not vectorized: redundant loop. no profit to vectorize.\n");
    }

  return opt_result::success ();
}
2237
2238 /* Return true if we know that the iteration count is smaller than the
2239 vectorization factor. Return false if it isn't, or if we can't be sure
2240 either way. */
2241
2242 static bool
2243 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
2244 {
2245 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2246
2247 HOST_WIDE_INT max_niter;
2248 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2249 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
2250 else
2251 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2252
2253 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
2254 return true;
2255
2256 return false;
2257 }
2258
/* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
   is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
   definitely no, or -1 if it's worth retrying.  If non-NULL,
   *SUGGESTED_UNROLL_FACTOR is filled in via
   vect_estimate_min_profitable_iters.  */

static int
vect_analyze_loop_costing (loop_vec_info loop_vinfo,
			   unsigned *suggested_unroll_factor)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);

  /* Only loops that can handle partially-populated vectors can have iteration
     counts less than the vectorization factor.  */
  if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
      && vect_known_niters_smaller_than_vf (loop_vinfo))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: iteration count smaller than "
			 "vectorization factor.\n");
      return 0;
    }

  /* If we know the number of iterations we can do better, for the
     epilogue we can also decide whether the main loop leaves us
     with enough iterations, preferring a smaller vector epilogue that
     is then also possibly used for the case we skip the vector loop.  */
  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
    {
      widest_int scalar_niters
	= wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
      if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
	{
	  loop_vec_info orig_loop_vinfo
	    = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
	  unsigned lowest_vf
	    = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
	  int prolog_peeling = 0;
	  if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
	    prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo);
	  if (prolog_peeling >= 0
	      && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
			   lowest_vf))
	    {
	      /* Number of scalar iterations left for the epilogue:
		 the remainder of (niters - gap - prolog peeling) modulo
		 the main loop's VF, plus the gap iteration if the main
		 loop peels for gaps.  */
	      unsigned gap
		= LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0;
	      scalar_niters = ((scalar_niters - gap - prolog_peeling)
			       % lowest_vf + gap);
	    }
	}
      /* Reject vectorizing for a single scalar iteration, even if
	 we could in principle implement that using partial vectors.  */
      unsigned peeling_gap = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
      if (scalar_niters <= peeling_gap + 1)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "not vectorized: loop only has a single "
			     "scalar iteration.\n");
	  return 0;
	}

      if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
	{
	  /* Check that the loop processes at least one full vector.  */
	  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
	  if (known_lt (scalar_niters, vf))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "loop does not have enough iterations "
				 "to support vectorization.\n");
	      return 0;
	    }

	  /* If we need to peel an extra epilogue iteration to handle data
	     accesses with gaps, check that there are enough scalar iterations
	     available.

	     The check above is redundant with this one when peeling for gaps,
	     but the distinction is useful for diagnostics.  */
	  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
	      && known_le (scalar_niters, vf))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "loop does not have enough iterations "
				 "to support peeling for gaps.\n");
	      return 0;
	    }
	}
    }

  /* If using the "very cheap" model. reject cases in which we'd keep
     a copy of the scalar code (even if we might be able to vectorize it).  */
  if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
      && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
	  || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
	  || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "some scalar iterations would need to be peeled\n");
      return 0;
    }

  int min_profitable_iters, min_profitable_estimate;
  vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
				      &min_profitable_estimate,
				      suggested_unroll_factor);

  /* A negative threshold means the vector loop can never win.  */
  if (min_profitable_iters < 0)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: vectorization not profitable.\n");
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: vector version will never be "
			 "profitable.\n");
      return -1;
    }

  int min_scalar_loop_bound = (param_min_vect_loop_bound
			       * assumed_vf);

  /* Use the cost model only if it is more conservative than user specified
     threshold.  */
  unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
				    min_profitable_iters);

  LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: vectorization not profitable.\n");
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "not vectorized: iteration count smaller than user "
			 "specified loop bound parameter or minimum profitable "
			 "iterations (whichever is more conservative).\n");
      return 0;
    }

  /* The static profitability threshold min_profitable_estimate includes
     the cost of having to check at runtime whether the scalar loop
     should be used instead.  If it turns out that we don't need or want
     such a check, the threshold we should use for the static estimate
     is simply the point at which the vector loop becomes more profitable
     than the scalar loop.  */
  if (min_profitable_estimate > min_profitable_iters
      && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
      && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
      && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
      && !vect_apply_runtime_profitability_check_p (loop_vinfo))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
			 " choice between the scalar and vector loops\n");
      min_profitable_estimate = min_profitable_iters;
    }

  /* If the vector loop needs multiple iterations to be beneficial then
     things are probably too close to call, and the conservative thing
     would be to stick with the scalar code.  */
  if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
      && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "one iteration of the vector loop would be"
			 " more expensive than the equivalent number of"
			 " iterations of the scalar loop\n");
      return 0;
    }

  HOST_WIDE_INT estimated_niter;

  /* If we are vectorizing an epilogue then we know the maximum number of
     scalar iterations it will cover is at least one lower than the
     vectorization factor of the main loop.  */
  if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
    estimated_niter
      = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
  else
    {
      estimated_niter = estimated_stmt_executions_int (loop);
      if (estimated_niter == -1)
	estimated_niter = likely_max_stmt_executions_int (loop);
    }
  if (estimated_niter != -1
      && ((unsigned HOST_WIDE_INT) estimated_niter
	  < MAX (th, (unsigned) min_profitable_estimate)))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "not vectorized: estimated iteration count too "
			 "small.\n");
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "not vectorized: estimated iteration count smaller "
			 "than specified loop bound parameter or minimum "
			 "profitable iterations (whichever is more "
			 "conservative).\n");
      return -1;
    }

  return 1;
}
2471
/* Find all data references in LOOP, whose body is given by the blocks
   in BBS, and record them in *DATAREFS.  Count the non-debug statements
   scanned in *N_STMTS.  Fail when a statement's data reference cannot
   be analyzed, except for calls in a loop with a safelen attribute
   that have "#pragma omp declare simd" clones and no data reference in
   the call stmt itself, which are skipped.  Also fail if the number of
   data references exceeds param loop-max-datarefs-for-datadeps.  */

static opt_result
vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
			   vec<data_reference_p> *datarefs,
			   unsigned int *n_stmts)
{
  *n_stmts = 0;
  for (unsigned i = 0; i < loop->num_nodes; i++)
    for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
	 !gsi_end_p (gsi); gsi_next (&gsi))
      {
	gimple *stmt = gsi_stmt (gsi);
	if (is_gimple_debug (stmt))
	  continue;
	++(*n_stmts);
	opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
							NULL, 0);
	if (!res)
	  {
	    if (is_gimple_call (stmt) && loop->safelen)
	      {
		tree fndecl = gimple_call_fndecl (stmt), op;
		/* For an IFN_MASK_CALL the real callee is passed as the
		   first argument, wrapped in an ADDR_EXPR.  */
		if (fndecl == NULL_TREE
		    && gimple_call_internal_p (stmt, IFN_MASK_CALL))
		  {
		    fndecl = gimple_call_arg (stmt, 0);
		    gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
		    fndecl = TREE_OPERAND (fndecl, 0);
		    gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
		  }
		if (fndecl != NULL_TREE)
		  {
		    cgraph_node *node = cgraph_node::get (fndecl);
		    if (node != NULL && node->simd_clones != NULL)
		      {
			/* Look for a data reference among the call's
			   arguments or its lhs: a decl or an analyzable
			   memory reference.  */
			unsigned int j, n = gimple_call_num_args (stmt);
			for (j = 0; j < n; j++)
			  {
			    op = gimple_call_arg (stmt, j);
			    if (DECL_P (op)
				|| (REFERENCE_CLASS_P (op)
				    && get_base_address (op)))
			      break;
			  }
			op = gimple_call_lhs (stmt);
			/* Ignore #pragma omp declare simd functions
			   if they don't have data references in the
			   call stmt itself.  */
			if (j == n
			    && !(op
				 && (DECL_P (op)
				     || (REFERENCE_CLASS_P (op)
					 && get_base_address (op)))))
			  continue;
		      }
		  }
	      }
	    return res;
	  }
	/* If dependence analysis will give up due to the limit on the
	   number of datarefs stop here and fail fatally.  */
	if (datarefs->length ()
	    > (unsigned)param_loop_max_datarefs_for_datadeps)
	  return opt_result::failure_at (stmt, "exceeded param "
					 "loop-max-datarefs-for-datadeps\n");
      }
  return opt_result::success ();
}
2539
/* Look for SLP-only access groups and turn each individual access into its own
   group.  Such groups were marked by the data-ref analysis as only
   vectorizable with SLP; when SLP did not apply to a member we must
   undo the grouping so loop-based vectorization can handle the
   accesses individually.  */
static void
vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
{
  unsigned int i;
  struct data_reference *dr;

  DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");

  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      gcc_assert (DR_REF (dr));
      stmt_vec_info stmt_info
	= vect_stmt_to_vectorize (loop_vinfo->lookup_stmt (DR_STMT (dr)));

      /* Check if the load is a part of an interleaving chain.  */
      if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
	{
	  stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
	  dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
	  unsigned int group_size = DR_GROUP_SIZE (first_element);

	  /* Check if SLP-only groups.  */
	  if (!STMT_SLP_TYPE (stmt_info)
	      && STMT_VINFO_SLP_VECT_ONLY (first_element))
	    {
	      /* Dissolve the group.  */
	      STMT_VINFO_SLP_VECT_ONLY (first_element) = false;

	      /* Walk the chain, making each member its own group of
		 size one.  */
	      stmt_vec_info vinfo = first_element;
	      while (vinfo)
		{
		  stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
		  DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
		  DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
		  DR_GROUP_SIZE (vinfo) = 1;
		  if (STMT_VINFO_STRIDED_P (first_element)
		      /* We cannot handle stores with gaps.  */
		      || DR_IS_WRITE (dr_info->dr))
		    {
		      STMT_VINFO_STRIDED_P (vinfo) = true;
		      DR_GROUP_GAP (vinfo) = 0;
		    }
		  else
		    /* A dissolved member skips the other group_size - 1
		       elements between consecutive accesses.  */
		    DR_GROUP_GAP (vinfo) = group_size - 1;
		  /* Duplicate and adjust alignment info, it needs to
		     be present on each group leader, see dr_misalignment.  */
		  if (vinfo != first_element)
		    {
		      dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
		      dr_info2->target_alignment = dr_info->target_alignment;
		      int misalignment = dr_info->misalignment;
		      if (misalignment != DR_MISALIGNMENT_UNKNOWN)
			{
			  /* Shift the leader's misalignment by this
			     member's offset from the leader.  */
			  HOST_WIDE_INT diff
			    = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
			       - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
			  unsigned HOST_WIDE_INT align_c
			    = dr_info->target_alignment.to_constant ();
			  misalignment = (misalignment + diff) % align_c;
			}
		      dr_info2->misalignment = misalignment;
		    }
		  vinfo = next;
		}
	    }
	}
    }
}
2611
2612 /* Determine if operating on full vectors for LOOP_VINFO might leave
2613 some scalar iterations still to do. If so, decide how we should
2614 handle those scalar iterations. The possibilities are:
2615
2616 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2617 In this case:
2618
2619 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2620 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2621 LOOP_VINFO_PEELING_FOR_NITER == false
2622
2623 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2624 to handle the remaining scalar iterations. In this case:
2625
2626 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2627 LOOP_VINFO_PEELING_FOR_NITER == true
2628
2629 There are two choices:
2630
2631 (2a) Consider vectorizing the epilogue loop at the same VF as the
2632 main loop, but using partial vectors instead of full vectors.
2633 In this case:
2634
2635 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2636
2637 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2638 In this case:
2639
2640 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2641 */
2642
2643 opt_result
2644 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
2645 {
2646 /* Determine whether there would be any scalar iterations left over. */
2647 bool need_peeling_or_partial_vectors_p
2648 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2649
2650 /* Decide whether to vectorize the loop with partial vectors. */
2651 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2652 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2653 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2654 && need_peeling_or_partial_vectors_p)
2655 {
2656 /* For partial-vector-usage=1, try to push the handling of partial
2657 vectors to the epilogue, with the main loop continuing to operate
2658 on full vectors.
2659
2660 If we are unrolling we also do not want to use partial vectors. This
2661 is to avoid the overhead of generating multiple masks and also to
2662 avoid having to execute entire iterations of FALSE masked instructions
2663 when dealing with one or less full iterations.
2664
2665 ??? We could then end up failing to use partial vectors if we
2666 decide to peel iterations into a prologue, and if the main loop
2667 then ends up processing fewer than VF iterations. */
2668 if ((param_vect_partial_vector_usage == 1
2669 || loop_vinfo->suggested_unroll_factor > 1)
2670 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2671 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2672 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2673 else
2674 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2675 }
2676
2677 if (dump_enabled_p ())
2678 dump_printf_loc (MSG_NOTE, vect_location,
2679 "operating on %s vectors%s.\n",
2680 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2681 ? "partial" : "full",
2682 LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2683 ? " for epilogue loop" : "");
2684
2685 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2686 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2687 && need_peeling_or_partial_vectors_p);
2688
2689 /* We set LOOP_VINFO_USING_SELECT_VL_P as true before loop vectorization
2690 analysis that we don't know whether the loop is vectorized by partial
2691 vectors (More details see tree-vect-loop-manip.cc).
2692
2693 However, SELECT_VL vectorizaton style should only applied on partial
2694 vectorization since SELECT_VL is the GIMPLE IR that calculates the
2695 number of elements to be process for each iteration.
2696
2697 After loop vectorization analysis, Clear LOOP_VINFO_USING_SELECT_VL_P
2698 if it is not partial vectorized loop. */
2699 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2700 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = false;
2701
2702 return opt_result::success ();
2703 }
2704
2705 /* Function vect_analyze_loop_2.
2706
2707 Apply a set of analyses on LOOP specified by LOOP_VINFO, the different
2708 analyses will record information in some members of LOOP_VINFO. FATAL
2709 indicates if some analysis meets fatal error. If one non-NULL pointer
2710 SUGGESTED_UNROLL_FACTOR is provided, it's intent to be filled with one
2711 worked out suggested unroll factor, while one NULL pointer shows it's
2712 going to apply the suggested unroll factor. SLP_DONE_FOR_SUGGESTED_UF
2713 is to hold the slp decision when the suggested unroll factor is worked
2714 out. */
2715 static opt_result
2716 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2717 unsigned *suggested_unroll_factor,
2718 bool& slp_done_for_suggested_uf)
2719 {
2720 opt_result ok = opt_result::success ();
2721 int res;
2722 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2723 poly_uint64 min_vf = 2;
2724 loop_vec_info orig_loop_vinfo = NULL;
2725
2726 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2727 loop_vec_info of the first vectorized loop. */
2728 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2729 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2730 else
2731 orig_loop_vinfo = loop_vinfo;
2732 gcc_assert (orig_loop_vinfo);
2733
2734 /* The first group of checks is independent of the vector size. */
2735 fatal = true;
2736
2737 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2738 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2739 return opt_result::failure_at (vect_location,
2740 "not vectorized: simd if(0)\n");
2741
2742 /* Find all data references in the loop (which correspond to vdefs/vuses)
2743 and analyze their evolution in the loop. */
2744
2745 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2746
2747 /* Gather the data references and count stmts in the loop. */
2748 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2749 {
2750 opt_result res
2751 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2752 &LOOP_VINFO_DATAREFS (loop_vinfo),
2753 &LOOP_VINFO_N_STMTS (loop_vinfo));
2754 if (!res)
2755 {
2756 if (dump_enabled_p ())
2757 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2758 "not vectorized: loop contains function "
2759 "calls or data references that cannot "
2760 "be analyzed\n");
2761 return res;
2762 }
2763 loop_vinfo->shared->save_datarefs ();
2764 }
2765 else
2766 loop_vinfo->shared->check_datarefs ();
2767
2768 /* Analyze the data references and also adjust the minimal
2769 vectorization factor according to the loads and stores. */
2770
2771 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2772 if (!ok)
2773 {
2774 if (dump_enabled_p ())
2775 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2776 "bad data references.\n");
2777 return ok;
2778 }
2779
2780 /* Check if we are applying unroll factor now. */
2781 bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2782 gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2783
2784 /* If the slp decision is false when suggested unroll factor is worked
2785 out, and we are applying suggested unroll factor, we can simply skip
2786 all slp related analyses this time. */
2787 bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2788
2789 /* Classify all cross-iteration scalar data-flow cycles.
2790 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2791 vect_analyze_scalar_cycles (loop_vinfo, slp);
2792
2793 vect_pattern_recog (loop_vinfo);
2794
2795 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2796
2797 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2798 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2799
2800 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2801 if (!ok)
2802 {
2803 if (dump_enabled_p ())
2804 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2805 "bad data access.\n");
2806 return ok;
2807 }
2808
2809 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2810
2811 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2812 if (!ok)
2813 {
2814 if (dump_enabled_p ())
2815 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2816 "unexpected pattern.\n");
2817 return ok;
2818 }
2819
2820 /* While the rest of the analysis below depends on it in some way. */
2821 fatal = false;
2822
2823 /* Analyze data dependences between the data-refs in the loop
2824 and adjust the maximum vectorization factor according to
2825 the dependences.
2826 FORNOW: fail at the first data dependence that we encounter. */
2827
2828 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2829 if (!ok)
2830 {
2831 if (dump_enabled_p ())
2832 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2833 "bad data dependence.\n");
2834 return ok;
2835 }
2836 if (max_vf != MAX_VECTORIZATION_FACTOR
2837 && maybe_lt (max_vf, min_vf))
2838 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2839 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2840
2841 ok = vect_determine_vectorization_factor (loop_vinfo);
2842 if (!ok)
2843 {
2844 if (dump_enabled_p ())
2845 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2846 "can't determine vectorization factor.\n");
2847 return ok;
2848 }
2849
2850 /* Compute the scalar iteration cost. */
2851 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2852
2853 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2854
2855 if (slp)
2856 {
2857 /* Check the SLP opportunities in the loop, analyze and build
2858 SLP trees. */
2859 ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2860 if (!ok)
2861 return ok;
2862
2863 /* If there are any SLP instances mark them as pure_slp. */
2864 slp = vect_make_slp_decision (loop_vinfo);
2865 if (slp)
2866 {
2867 /* Find stmts that need to be both vectorized and SLPed. */
2868 vect_detect_hybrid_slp (loop_vinfo);
2869
2870 /* Update the vectorization factor based on the SLP decision. */
2871 vect_update_vf_for_slp (loop_vinfo);
2872
2873 /* Optimize the SLP graph with the vectorization factor fixed. */
2874 vect_optimize_slp (loop_vinfo);
2875
2876 /* Gather the loads reachable from the SLP graph entries. */
2877 vect_gather_slp_loads (loop_vinfo);
2878 }
2879 }
2880
2881 bool saved_can_use_partial_vectors_p
2882 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2883
2884 /* We don't expect to have to roll back to anything other than an empty
2885 set of rgroups. */
2886 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2887
2888 /* This is the point where we can re-start analysis with SLP forced off. */
2889 start_over:
2890
2891 /* Apply the suggested unrolling factor, this was determined by the backend
2892 during finish_cost the first time we ran the analyzis for this
2893 vector mode. */
2894 if (applying_suggested_uf)
2895 LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2896
2897 /* Now the vectorization factor is final. */
2898 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2899 gcc_assert (known_ne (vectorization_factor, 0U));
2900
2901 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2902 {
2903 dump_printf_loc (MSG_NOTE, vect_location,
2904 "vectorization_factor = ");
2905 dump_dec (MSG_NOTE, vectorization_factor);
2906 dump_printf (MSG_NOTE, ", niters = %wd\n",
2907 LOOP_VINFO_INT_NITERS (loop_vinfo));
2908 }
2909
2910 if (max_vf != MAX_VECTORIZATION_FACTOR
2911 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2912 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2913
2914 loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2915
2916 /* Analyze the alignment of the data-refs in the loop.
2917 Fail if a data reference is found that cannot be vectorized. */
2918
2919 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2920 if (!ok)
2921 {
2922 if (dump_enabled_p ())
2923 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2924 "bad data alignment.\n");
2925 return ok;
2926 }
2927
2928 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2929 It is important to call pruning after vect_analyze_data_ref_accesses,
2930 since we use grouping information gathered by interleaving analysis. */
2931 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2932 if (!ok)
2933 return ok;
2934
2935 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2936 vectorization, since we do not want to add extra peeling or
2937 add versioning for alignment. */
2938 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2939 /* This pass will decide on using loop versioning and/or loop peeling in
2940 order to enhance the alignment of data references in the loop. */
2941 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2942 if (!ok)
2943 return ok;
2944
2945 if (slp)
2946 {
2947 /* Analyze operations in the SLP instances. Note this may
2948 remove unsupported SLP instances which makes the above
2949 SLP kind detection invalid. */
2950 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2951 vect_slp_analyze_operations (loop_vinfo);
2952 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2953 {
2954 ok = opt_result::failure_at (vect_location,
2955 "unsupported SLP instances\n");
2956 goto again;
2957 }
2958
2959 /* Check whether any load in ALL SLP instances is possibly permuted. */
2960 slp_tree load_node, slp_root;
2961 unsigned i, x;
2962 slp_instance instance;
2963 bool can_use_lanes = true;
2964 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2965 {
2966 slp_root = SLP_INSTANCE_TREE (instance);
2967 int group_size = SLP_TREE_LANES (slp_root);
2968 tree vectype = SLP_TREE_VECTYPE (slp_root);
2969 bool loads_permuted = false;
2970 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2971 {
2972 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2973 continue;
2974 unsigned j;
2975 stmt_vec_info load_info;
2976 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2977 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2978 {
2979 loads_permuted = true;
2980 break;
2981 }
2982 }
2983
2984 /* If the loads and stores can be handled with load/store-lane
2985 instructions record it and move on to the next instance. */
2986 if (loads_permuted
2987 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2988 && vect_store_lanes_supported (vectype, group_size, false)
2989 != IFN_LAST)
2990 {
2991 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2992 if (STMT_VINFO_GROUPED_ACCESS
2993 (SLP_TREE_REPRESENTATIVE (load_node)))
2994 {
2995 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2996 (SLP_TREE_REPRESENTATIVE (load_node));
2997 /* Use SLP for strided accesses (or if we can't
2998 load-lanes). */
2999 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
3000 || vect_load_lanes_supported
3001 (STMT_VINFO_VECTYPE (stmt_vinfo),
3002 DR_GROUP_SIZE (stmt_vinfo), false) == IFN_LAST)
3003 break;
3004 }
3005
3006 can_use_lanes
3007 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
3008
3009 if (can_use_lanes && dump_enabled_p ())
3010 dump_printf_loc (MSG_NOTE, vect_location,
3011 "SLP instance %p can use load/store-lanes\n",
3012 (void *) instance);
3013 }
3014 else
3015 {
3016 can_use_lanes = false;
3017 break;
3018 }
3019 }
3020
3021 /* If all SLP instances can use load/store-lanes abort SLP and try again
3022 with SLP disabled. */
3023 if (can_use_lanes)
3024 {
3025 ok = opt_result::failure_at (vect_location,
3026 "Built SLP cancelled: can use "
3027 "load/store-lanes\n");
3028 if (dump_enabled_p ())
3029 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3030 "Built SLP cancelled: all SLP instances support "
3031 "load/store-lanes\n");
3032 goto again;
3033 }
3034 }
3035
3036 /* Dissolve SLP-only groups. */
3037 vect_dissolve_slp_only_groups (loop_vinfo);
3038
3039 /* Scan all the remaining operations in the loop that are not subject
3040 to SLP and make sure they are vectorizable. */
3041 ok = vect_analyze_loop_operations (loop_vinfo);
3042 if (!ok)
3043 {
3044 if (dump_enabled_p ())
3045 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3046 "bad operation or unsupported loop bound.\n");
3047 return ok;
3048 }
3049
3050 /* For now, we don't expect to mix both masking and length approaches for one
3051 loop, disable it if both are recorded. */
3052 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3053 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
3054 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
3055 {
3056 if (dump_enabled_p ())
3057 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3058 "can't vectorize a loop with partial vectors"
3059 " because we don't expect to mix different"
3060 " approaches with partial vectors for the"
3061 " same loop.\n");
3062 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3063 }
3064
3065 /* If we still have the option of using partial vectors,
3066 check whether we can generate the necessary loop controls. */
3067 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
3068 {
3069 if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
3070 {
3071 if (!vect_verify_full_masking (loop_vinfo)
3072 && !vect_verify_full_masking_avx512 (loop_vinfo))
3073 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3074 }
3075 else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
3076 if (!vect_verify_loop_lens (loop_vinfo))
3077 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
3078 }
3079
3080 /* If we're vectorizing a loop that uses length "controls" and
3081 can iterate more than once, we apply decrementing IV approach
3082 in loop control. */
3083 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3084 && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
3085 && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
3086 && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3087 && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
3088 LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
3089 LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
3090
3091 /* If a loop uses length controls and has a decrementing loop control IV,
3092 we will normally pass that IV through a MIN_EXPR to calcaluate the
3093 basis for the length controls. E.g. in a loop that processes one
3094 element per scalar iteration, the number of elements would be
3095 MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
3096
3097 This MIN_EXPR approach allows us to use pointer IVs with an invariant
3098 step, since only the final iteration of the vector loop can have
3099 inactive lanes.
3100
3101 However, some targets have a dedicated instruction for calculating the
3102 preferred length, given the total number of elements that still need to
3103 be processed. This is encapsulated in the SELECT_VL internal function.
3104
3105 If the target supports SELECT_VL, we can use it instead of MIN_EXPR
3106 to determine the basis for the length controls. However, unlike the
3107 MIN_EXPR calculation, the SELECT_VL calculation can decide to make
3108 lanes inactive in any iteration of the vector loop, not just the last
3109 iteration. This SELECT_VL approach therefore requires us to use pointer
3110 IVs with variable steps.
3111
3112 Once we've decided how many elements should be processed by one
3113 iteration of the vector loop, we need to populate the rgroup controls.
3114 If a loop has multiple rgroups, we need to make sure that those rgroups
3115 "line up" (that is, they must be consistent about which elements are
3116 active and which aren't). This is done by vect_adjust_loop_lens_control.
3117
3118 In principle, it would be possible to use vect_adjust_loop_lens_control
3119 on either the result of a MIN_EXPR or the result of a SELECT_VL.
3120 However:
3121
3122 (1) In practice, it only makes sense to use SELECT_VL when a vector
3123 operation will be controlled directly by the result. It is not
3124 worth using SELECT_VL if it would only be the input to other
3125 calculations.
3126
3127 (2) If we use SELECT_VL for an rgroup that has N controls, each associated
3128 pointer IV will need N updates by a variable amount (N-1 updates
3129 within the iteration and 1 update to move to the next iteration).
3130
3131 Because of this, we prefer to use the MIN_EXPR approach whenever there
3132 is more than one length control.
3133
3134 In addition, SELECT_VL always operates to a granularity of 1 unit.
3135 If we wanted to use it to control an SLP operation on N consecutive
3136 elements, we would need to make the SELECT_VL inputs measure scalar
3137 iterations (rather than elements) and then multiply the SELECT_VL
3138 result by N. But using SELECT_VL this way is inefficient because
3139 of (1) above.
3140
3141 2. We don't apply SELECT_VL on single-rgroup when both (1) and (2) are
3142 satisfied:
3143
3144 (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
3145 (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
3146
3147 Since SELECT_VL (variable step) will make SCEV analysis failed and then
3148 we will fail to gain benefits of following unroll optimizations. We prefer
3149 using the MIN_EXPR approach in this situation. */
3150 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
3151 {
3152 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
3153 if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
3154 OPTIMIZE_FOR_SPEED)
3155 && LOOP_VINFO_LENS (loop_vinfo).length () == 1
3156 && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp
3157 && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3158 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
3159 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
3160 }
3161
3162 /* Decide whether this loop_vinfo should use partial vectors or peeling,
3163 assuming that the loop will be used as a main loop. We will redo
3164 this analysis later if we instead decide to use the loop as an
3165 epilogue loop. */
3166 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo);
3167 if (!ok)
3168 return ok;
3169
3170 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
3171 to be able to handle fewer than VF scalars, or needs to have a lower VF
3172 than the main loop. */
3173 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3174 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3175 {
3176 poly_uint64 unscaled_vf
3177 = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
3178 orig_loop_vinfo->suggested_unroll_factor);
3179 if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
3180 return opt_result::failure_at (vect_location,
3181 "Vectorization factor too high for"
3182 " epilogue loop.\n");
3183 }
3184
3185 /* Check the costings of the loop make vectorizing worthwhile. */
3186 res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
3187 if (res < 0)
3188 {
3189 ok = opt_result::failure_at (vect_location,
3190 "Loop costings may not be worthwhile.\n");
3191 goto again;
3192 }
3193 if (!res)
3194 return opt_result::failure_at (vect_location,
3195 "Loop costings not worthwhile.\n");
3196
3197 /* If an epilogue loop is required make sure we can create one. */
3198 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3199 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
3200 || LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
3201 {
3202 if (dump_enabled_p ())
3203 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
3204 if (!vect_can_advance_ivs_p (loop_vinfo)
3205 || !slpeel_can_duplicate_loop_p (loop,
3206 LOOP_VINFO_IV_EXIT (loop_vinfo),
3207 LOOP_VINFO_IV_EXIT (loop_vinfo)))
3208 {
3209 ok = opt_result::failure_at (vect_location,
3210 "not vectorized: can't create required "
3211 "epilog loop\n");
3212 goto again;
3213 }
3214 }
3215
3216 /* During peeling, we need to check if number of loop iterations is
3217 enough for both peeled prolog loop and vector loop. This check
3218 can be merged along with threshold check of loop versioning, so
3219 increase threshold for this case if necessary.
3220
3221 If we are analyzing an epilogue we still want to check what its
3222 versioning threshold would be. If we decide to vectorize the epilogues we
3223 will want to use the lowest versioning threshold of all epilogues and main
3224 loop. This will enable us to enter a vectorized epilogue even when
3225 versioning the loop. We can't simply check whether the epilogue requires
3226 versioning though since we may have skipped some versioning checks when
3227 analyzing the epilogue. For instance, checks for alias versioning will be
3228 skipped when dealing with epilogues as we assume we already checked them
3229 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
3230 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
3231 {
3232 poly_uint64 niters_th = 0;
3233 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3234
3235 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3236 {
3237 /* Niters for peeled prolog loop. */
3238 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3239 {
3240 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3241 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
3242 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
3243 }
3244 else
3245 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3246 }
3247
3248 /* Niters for at least one iteration of vectorized loop. */
3249 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3250 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3251 /* One additional iteration because of peeling for gap. */
3252 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3253 niters_th += 1;
3254
3255 /* Use the same condition as vect_transform_loop to decide when to use
3256 the cost to determine a versioning threshold. */
3257 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3258 && ordered_p (th, niters_th))
3259 niters_th = ordered_max (poly_uint64 (th), niters_th);
3260
3261 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
3262 }
3263
3264 gcc_assert (known_eq (vectorization_factor,
3265 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
3266
3267 slp_done_for_suggested_uf = slp;
3268
3269 /* Ok to vectorize! */
3270 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3271 return opt_result::success ();
3272
3273 again:
3274 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
3275 gcc_assert (!ok);
3276
3277 /* Try again with SLP forced off but if we didn't do any SLP there is
3278 no point in re-trying. */
3279 if (!slp)
3280 return ok;
3281
3282 /* If the slp decision is true when suggested unroll factor is worked
3283 out, and we are applying suggested unroll factor, we don't need to
3284 re-try any more. */
3285 if (applying_suggested_uf && slp_done_for_suggested_uf)
3286 return ok;
3287
3288 /* If there are reduction chains re-trying will fail anyway. */
3289 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
3290 return ok;
3291
3292 /* Likewise if the grouped loads or stores in the SLP cannot be handled
3293 via interleaving or lane instructions. */
3294 slp_instance instance;
3295 slp_tree node;
3296 unsigned i, j;
3297 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
3298 {
3299 stmt_vec_info vinfo;
3300 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
3301 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
3302 continue;
3303 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3304 unsigned int size = DR_GROUP_SIZE (vinfo);
3305 tree vectype = STMT_VINFO_VECTYPE (vinfo);
3306 if (vect_store_lanes_supported (vectype, size, false) == IFN_LAST
3307 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
3308 && ! vect_grouped_store_supported (vectype, size))
3309 return opt_result::failure_at (vinfo->stmt,
3310 "unsupported grouped store\n");
3311 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
3312 {
3313 vinfo = SLP_TREE_REPRESENTATIVE (node);
3314 if (STMT_VINFO_GROUPED_ACCESS (vinfo))
3315 {
3316 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3317 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
3318 size = DR_GROUP_SIZE (vinfo);
3319 vectype = STMT_VINFO_VECTYPE (vinfo);
3320 if (vect_load_lanes_supported (vectype, size, false) == IFN_LAST
3321 && ! vect_grouped_load_supported (vectype, single_element_p,
3322 size))
3323 return opt_result::failure_at (vinfo->stmt,
3324 "unsupported grouped load\n");
3325 }
3326 }
3327 }
3328
3329 if (dump_enabled_p ())
3330 dump_printf_loc (MSG_NOTE, vect_location,
3331 "re-trying with SLP disabled\n");
3332
3333 /* Roll back state appropriately. No SLP this time. */
3334 slp = false;
3335 /* Restore vectorization factor as it were without SLP. */
3336 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
3337 /* Free the SLP instances. */
3338 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
3339 vect_free_slp_instance (instance);
3340 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
3341 /* Reset SLP type to loop_vect on all stmts. */
3342 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3343 {
3344 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3345 for (gimple_stmt_iterator si = gsi_start_phis (bb);
3346 !gsi_end_p (si); gsi_next (&si))
3347 {
3348 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3349 STMT_SLP_TYPE (stmt_info) = loop_vect;
3350 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3351 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3352 {
3353 /* vectorizable_reduction adjusts reduction stmt def-types,
3354 restore them to that of the PHI. */
3355 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
3356 = STMT_VINFO_DEF_TYPE (stmt_info);
3357 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
3358 (STMT_VINFO_REDUC_DEF (stmt_info)))
3359 = STMT_VINFO_DEF_TYPE (stmt_info);
3360 }
3361 }
3362 for (gimple_stmt_iterator si = gsi_start_bb (bb);
3363 !gsi_end_p (si); gsi_next (&si))
3364 {
3365 if (is_gimple_debug (gsi_stmt (si)))
3366 continue;
3367 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3368 STMT_SLP_TYPE (stmt_info) = loop_vect;
3369 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3370 {
3371 stmt_vec_info pattern_stmt_info
3372 = STMT_VINFO_RELATED_STMT (stmt_info);
3373 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
3374 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
3375
3376 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
3377 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
3378 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
3379 !gsi_end_p (pi); gsi_next (&pi))
3380 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
3381 = loop_vect;
3382 }
3383 }
3384 }
3385 /* Free optimized alias test DDRS. */
3386 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
3387 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
3388 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
3389 /* Reset target cost data. */
3390 delete loop_vinfo->vector_costs;
3391 loop_vinfo->vector_costs = nullptr;
3392 /* Reset accumulated rgroup information. */
3393 LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
3394 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
3395 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
3396 /* Reset assorted flags. */
3397 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
3398 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
3399 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
3400 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
3401 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3402 = saved_can_use_partial_vectors_p;
3403 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
3404
3405 goto start_over;
3406 }
3407
3408 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
3409 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
3410 OLD_LOOP_VINFO is better unless something specifically indicates
3411 otherwise.
3412
3413 Note that this deliberately isn't a partial order. */
3414
3415 static bool
3416 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
3417 loop_vec_info old_loop_vinfo)
3418 {
3419 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
3420 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
3421
3422 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
3423 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
3424
3425 /* Always prefer a VF of loop->simdlen over any other VF. */
3426 if (loop->simdlen)
3427 {
3428 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
3429 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
3430 if (new_simdlen_p != old_simdlen_p)
3431 return new_simdlen_p;
3432 }
3433
3434 const auto *old_costs = old_loop_vinfo->vector_costs;
3435 const auto *new_costs = new_loop_vinfo->vector_costs;
3436 if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
3437 return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
3438
3439 return new_costs->better_main_loop_than_p (old_costs);
3440 }
3441
3442 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
3443 true if we should. */
3444
3445 static bool
3446 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
3447 loop_vec_info old_loop_vinfo)
3448 {
3449 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
3450 return false;
3451
3452 if (dump_enabled_p ())
3453 dump_printf_loc (MSG_NOTE, vect_location,
3454 "***** Preferring vector mode %s to vector mode %s\n",
3455 GET_MODE_NAME (new_loop_vinfo->vector_mode),
3456 GET_MODE_NAME (old_loop_vinfo->vector_mode));
3457 return true;
3458 }
3459
/* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
   not NULL.  Set AUTODETECTED_VECTOR_MODE if VOIDmode and advance
   MODE_I to the next mode useful to analyze.
   Return the loop_vinfo on success and wrapped null on failure.

   FATAL is set by vect_analyze_loop_2; when the analysis fails with
   FATAL set we assert this was a main-loop analysis
   (MAIN_LOOP_VINFO == NULL).  */

static opt_loop_vec_info
vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
		     const vect_loop_form_info *loop_form_info,
		     loop_vec_info main_loop_vinfo,
		     const vector_modes &vector_modes, unsigned &mode_i,
		     machine_mode &autodetected_vector_mode,
		     bool &fatal)
{
  loop_vec_info loop_vinfo
    = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);

  machine_mode vector_mode = vector_modes[mode_i];
  loop_vinfo->vector_mode = vector_mode;
  unsigned int suggested_unroll_factor = 1;
  bool slp_done_for_suggested_uf = false;

  /* Run the main analysis.  */
  opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
					&suggested_unroll_factor,
					slp_done_for_suggested_uf);
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "***** Analysis %s with vector mode %s\n",
		     res ? "succeeded" : " failed",
		     GET_MODE_NAME (loop_vinfo->vector_mode));

  /* If the target's cost model suggested unrolling the main loop,
     redo the whole analysis with the suggested unroll factor applied
     and keep the unrolled variant only if that re-analysis also
     succeeds; otherwise fall back to the un-unrolled LOOP_VINFO.  */
  if (res && !main_loop_vinfo && suggested_unroll_factor > 1)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "***** Re-trying analysis for unrolling"
			 " with unroll factor %d and slp %s.\n",
			 suggested_unroll_factor,
			 slp_done_for_suggested_uf ? "on" : "off");
      loop_vec_info unroll_vinfo
	= vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
      unroll_vinfo->vector_mode = vector_mode;
      unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
      opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
						slp_done_for_suggested_uf);
      if (new_res)
	{
	  delete loop_vinfo;
	  loop_vinfo = unroll_vinfo;
	}
      else
	delete unroll_vinfo;
    }

  /* Remember the autodetected vector mode.  */
  if (vector_mode == VOIDmode)
    autodetected_vector_mode = loop_vinfo->vector_mode;

  /* Advance mode_i, first skipping modes that would result in the
     same analysis result.  */
  while (mode_i + 1 < vector_modes.length ()
	 && vect_chooses_same_modes_p (loop_vinfo,
				       vector_modes[mode_i + 1]))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "***** The result for vector mode %s would"
			 " be the same\n",
			 GET_MODE_NAME (vector_modes[mode_i + 1]));
      mode_i += 1;
    }
  /* Also skip the next mode if it and the autodetected mode map onto
     each other via related_vector_mode in both directions, i.e. they
     describe the same vectors and the analysis would only repeat.  */
  if (mode_i + 1 < vector_modes.length ()
      && VECTOR_MODE_P (autodetected_vector_mode)
      && (related_vector_mode (vector_modes[mode_i + 1],
			       GET_MODE_INNER (autodetected_vector_mode))
	  == autodetected_vector_mode)
      && (related_vector_mode (autodetected_vector_mode,
			       GET_MODE_INNER (vector_modes[mode_i + 1]))
	  == vector_modes[mode_i + 1]))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "***** Skipping vector mode %s, which would"
			 " repeat the analysis for %s\n",
			 GET_MODE_NAME (vector_modes[mode_i + 1]),
			 GET_MODE_NAME (autodetected_vector_mode));
      mode_i += 1;
    }
  mode_i++;

  /* On failure the caller gets a wrapped null; LOOP_VINFO is no longer
     needed and is freed here.  */
  if (!res)
    {
      delete loop_vinfo;
      if (fatal)
	gcc_checking_assert (main_loop_vinfo == NULL);
      return opt_loop_vec_info::propagate_failure (res);
    }

  return opt_loop_vec_info::success (loop_vinfo);
}
3560
3561 /* Function vect_analyze_loop.
3562
3563 Apply a set of analyses on LOOP, and create a loop_vec_info struct
3564 for it. The different analyses will record information in the
3565 loop_vec_info struct. */
3566 opt_loop_vec_info
3567 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3568 {
3569 DUMP_VECT_SCOPE ("analyze_loop_nest");
3570
/* Do not vectorize a loop nested inside an outer loop that has already
   been marked vectorizable -- the inner loop would be transformed as
   part of the outer-loop vectorization.  */
3571 if (loop_outer (loop)
3572 && loop_vec_info_for_loop (loop_outer (loop))
3573 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3574 return opt_loop_vec_info::failure_at (vect_location,
3575 "outer-loop already vectorized.\n");
3576
3577 if (!find_loop_nest (loop, &shared->loop_nest))
3578 return opt_loop_vec_info::failure_at
3579 (vect_location,
3580 "not vectorized: loop nest containing two or more consecutive inner"
3581 " loops cannot be vectorized\n");
3582
3583 /* Analyze the loop form. */
3584 vect_loop_form_info loop_form_info;
3585 opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3586 if (!res)
3587 {
3588 if (dump_enabled_p ())
3589 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3590 "bad loop form.\n");
3591 return opt_loop_vec_info::propagate_failure (res);
3592 }
3593 if (!integer_onep (loop_form_info.assumptions))
3594 {
3595 /* We consider to vectorize this loop by versioning it under
3596 some assumptions. In order to do this, we need to clear
3597 existing information computed by scev and niter analyzer. */
3598 scev_reset_htab ();
3599 free_numbers_of_iterations_estimates (loop);
3600 /* Also set flag for this loop so that following scev and niter
3601 analysis are done under the assumptions. */
3602 loop_constraint_set (loop, LOOP_C_FINITE);
3603 }
3604 else
3605 /* Clear the existing niter information to make sure the nonwrapping flag
3606 will be calculated and set appropriately. */
3607 free_numbers_of_iterations_estimates (loop);
3608
3609 auto_vector_modes vector_modes;
3610 /* Autodetect first vector size we try. */
3611 vector_modes.safe_push (VOIDmode);
3612 unsigned int autovec_flags
3613 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3614 loop->simdlen != 0);
3615 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3616 && !unlimited_cost_model (loop));
3617 machine_mode autodetected_vector_mode = VOIDmode;
3618 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3619 unsigned int mode_i = 0;
3620 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3621
3622 /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3623 a mode has not been analyzed. */
3624 auto_vec<poly_uint64, 8> cached_vf_per_mode;
3625 for (unsigned i = 0; i < vector_modes.length (); ++i)
3626 cached_vf_per_mode.safe_push (0);
3627
3628 /* First determine the main loop vectorization mode, either the first
3629 one that works, starting with auto-detecting the vector mode and then
3630 following the targets order of preference, or the one with the
3631 lowest cost if pick_lowest_cost_p. */
3632 while (1)
3633 {
3634 bool fatal;
3635 unsigned int last_mode_i = mode_i;
3636 /* Set cached VF to -1 prior to analysis, which indicates a mode has
3637 failed. */
3638 cached_vf_per_mode[last_mode_i] = -1;
/* NOTE(review): vect_analyze_loop_1 appears to advance MODE_I and fill
   in AUTODETECTED_VECTOR_MODE and FATAL through reference parameters --
   confirm against its definition; the loop-exit tests below rely on
   that.  */
3639 opt_loop_vec_info loop_vinfo
3640 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3641 NULL, vector_modes, mode_i,
3642 autodetected_vector_mode, fatal);
3643 if (fatal)
3644 break;
3645
3646 if (loop_vinfo)
3647 {
3648 /* Analysis has been successful so update the VF value. The
3649 VF should always be a multiple of unroll_factor and we want to
3650 capture the original VF here. */
3651 cached_vf_per_mode[last_mode_i]
3652 = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3653 loop_vinfo->suggested_unroll_factor);
3654 /* Once we hit the desired simdlen for the first time,
3655 discard any previous attempts. */
3656 if (simdlen
3657 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3658 {
3659 delete first_loop_vinfo;
3660 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3661 simdlen = 0;
3662 }
3663 else if (pick_lowest_cost_p
3664 && first_loop_vinfo
3665 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3666 {
3667 /* Pick loop_vinfo over first_loop_vinfo. */
3668 delete first_loop_vinfo;
3669 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3670 }
3671 if (first_loop_vinfo == NULL)
3672 first_loop_vinfo = loop_vinfo;
3673 else
3674 {
3675 delete loop_vinfo;
3676 loop_vinfo = opt_loop_vec_info::success (NULL);
3677 }
3678
3679 /* Commit to first_loop_vinfo if we have no reason to try
3680 alternatives. */
3681 if (!simdlen && !pick_lowest_cost_p)
3682 break;
3683 }
3684 if (mode_i == vector_modes.length ()
3685 || autodetected_vector_mode == VOIDmode)
3686 break;
3687
3688 /* Try the next biggest vector size. */
3689 if (dump_enabled_p ())
3690 dump_printf_loc (MSG_NOTE, vect_location,
3691 "***** Re-trying analysis with vector mode %s\n",
3692 GET_MODE_NAME (vector_modes[mode_i]));
3693 }
3694 if (!first_loop_vinfo)
3695 return opt_loop_vec_info::propagate_failure (res);
3696
3697 if (dump_enabled_p ())
3698 dump_printf_loc (MSG_NOTE, vect_location,
3699 "***** Choosing vector mode %s\n",
3700 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3701
3702 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3703 enabled, SIMDUID is not set, it is the innermost loop and we have
3704 either already found the loop's SIMDLEN or there was no SIMDLEN to
3705 begin with.
3706 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3707 bool vect_epilogues = (!simdlen
3708 && loop->inner == NULL
3709 && param_vect_epilogues_nomask
3710 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3711 /* No code motion support for multiple epilogues so for now
3712 not supported when multiple exits. */
3713 && !LOOP_VINFO_EARLY_BREAKS (first_loop_vinfo)
3714 && !loop->simduid);
3715 if (!vect_epilogues)
3716 return first_loop_vinfo;
3717
3718 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3719 poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3720
3721 /* For epilogues start the analysis from the first mode. The motivation
3722 behind starting from the beginning comes from cases where the VECTOR_MODES
3723 array may contain length-agnostic and length-specific modes. Their
3724 ordering is not guaranteed, so we could end up picking a mode for the main
3725 loop that is after the epilogue's optimal mode. */
3726 vector_modes[0] = autodetected_vector_mode;
3727 mode_i = 0;
3728
3729 bool supports_partial_vectors =
3730 partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3731 poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3732
3733 while (1)
3734 {
3735 /* If the target does not support partial vectors we can shorten the
3736 number of modes to analyze for the epilogue as we know we can't pick a
3737 mode that would lead to a VF at least as big as the
3738 FIRST_VINFO_VF. */
3739 if (!supports_partial_vectors
3740 && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3741 {
3742 mode_i++;
3743 if (mode_i == vector_modes.length ())
3744 break;
3745 continue;
3746 }
3747
3748 if (dump_enabled_p ())
3749 dump_printf_loc (MSG_NOTE, vect_location,
3750 "***** Re-trying epilogue analysis with vector "
3751 "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3752
3753 bool fatal;
3754 opt_loop_vec_info loop_vinfo
3755 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3756 first_loop_vinfo,
3757 vector_modes, mode_i,
3758 autodetected_vector_mode, fatal);
3759 if (fatal)
3760 break;
3761
3762 if (loop_vinfo)
3763 {
3764 if (pick_lowest_cost_p)
3765 {
3766 /* Keep trying to roll back vectorization attempts while the
3767 loop_vec_infos they produced were worse than this one. */
3768 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3769 while (!vinfos.is_empty ()
3770 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3771 {
3772 gcc_assert (vect_epilogues);
3773 delete vinfos.pop ();
3774 }
3775 }
3776 /* For now only allow one epilogue loop. */
3777 if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3778 {
3779 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3780 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3781 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3782 || maybe_ne (lowest_th, 0U));
3783 /* Keep track of the known smallest versioning
3784 threshold. */
3785 if (ordered_p (lowest_th, th))
3786 lowest_th = ordered_min (lowest_th, th);
3787 }
3788 else
3789 {
3790 delete loop_vinfo;
3791 loop_vinfo = opt_loop_vec_info::success (NULL);
3792 }
3793
3794 /* For now only allow one epilogue loop, but allow
3795 pick_lowest_cost_p to replace it, so commit to the
3796 first epilogue if we have no reason to try alternatives. */
3797 if (!pick_lowest_cost_p)
3798 break;
3799 }
3800
3801 if (mode_i == vector_modes.length ())
3802 break;
3803
3804 }
3805
3806 if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3807 {
3808 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3809 if (dump_enabled_p ())
3810 dump_printf_loc (MSG_NOTE, vect_location,
3811 "***** Choosing epilogue vector mode %s\n",
3812 GET_MODE_NAME
3813 (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3814 }
3815
3816 return first_loop_vinfo;
3817 }
3818
3819 /* Return true if there is an in-order reduction function for CODE, storing
3820 it in *REDUC_FN if so. */
3821
3822 static bool
3823 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3824 {
3825 /* We support MINUS_EXPR by negating the operand. This also preserves an
3826 initial -0.0 since -0.0 - 0.0 (neutral op for MINUS_EXPR) == -0.0 +
3827 (-0.0) = -0.0. */
3828 if (code == PLUS_EXPR || code == MINUS_EXPR)
3829 {
3830 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3831 return true;
3832 }
3833 return false;
3834 }
3835
3836 /* Function reduction_fn_for_scalar_code
3837
3838 Input:
3839 CODE - tree_code of a reduction operations.
3840
3841 Output:
3842 REDUC_FN - the corresponding internal function to be used to reduce the
3843 vector of partial results into a single scalar result, or IFN_LAST
3844 if the operation is a supported reduction operation, but does not have
3845 such an internal function.
3846
3847 Return FALSE if CODE currently cannot be vectorized as reduction. */
3848
3849 bool
3850 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3851 {
3852 if (code.is_tree_code ())
3853 switch (tree_code (code))
3854 {
3855 case MAX_EXPR:
3856 *reduc_fn = IFN_REDUC_MAX;
3857 return true;
3858
3859 case MIN_EXPR:
3860 *reduc_fn = IFN_REDUC_MIN;
3861 return true;
3862
3863 case PLUS_EXPR:
3864 *reduc_fn = IFN_REDUC_PLUS;
3865 return true;
3866
3867 case BIT_AND_EXPR:
3868 *reduc_fn = IFN_REDUC_AND;
3869 return true;
3870
3871 case BIT_IOR_EXPR:
3872 *reduc_fn = IFN_REDUC_IOR;
3873 return true;
3874
3875 case BIT_XOR_EXPR:
3876 *reduc_fn = IFN_REDUC_XOR;
3877 return true;
3878
3879 case MULT_EXPR:
3880 case MINUS_EXPR:
3881 *reduc_fn = IFN_LAST;
3882 return true;
3883
3884 default:
3885 return false;
3886 }
3887 else
3888 switch (combined_fn (code))
3889 {
3890 CASE_CFN_FMAX:
3891 *reduc_fn = IFN_REDUC_FMAX;
3892 return true;
3893
3894 CASE_CFN_FMIN:
3895 *reduc_fn = IFN_REDUC_FMIN;
3896 return true;
3897
3898 default:
3899 return false;
3900 }
3901 }
3902
3903 /* If there is a neutral value X such that a reduction would not be affected
3904 by the introduction of additional X elements, return that X, otherwise
3905 return null. CODE is the code of the reduction and SCALAR_TYPE is type
3906 of the scalar elements. If the reduction has just a single initial value
3907 then INITIAL_VALUE is that value, otherwise it is null.
3908 If AS_INITIAL is TRUE the value is supposed to be used as initial value.
3909 In that case no signed zero is returned. */
3910
3911 tree
3912 neutral_op_for_reduction (tree scalar_type, code_helper code,
3913 tree initial_value, bool as_initial)
3914 {
3915 if (code.is_tree_code ())
3916 switch (tree_code (code))
3917 {
3918 case DOT_PROD_EXPR:
3919 case SAD_EXPR:
3920 case MINUS_EXPR:
3921 case BIT_IOR_EXPR:
3922 case BIT_XOR_EXPR:
3923 return build_zero_cst (scalar_type);
3924 case WIDEN_SUM_EXPR:
3925 case PLUS_EXPR:
3926 if (!as_initial && HONOR_SIGNED_ZEROS (scalar_type))
3927 return build_real (scalar_type, dconstm0);
3928 else
3929 return build_zero_cst (scalar_type);
3930
3931 case MULT_EXPR:
3932 return build_one_cst (scalar_type);
3933
3934 case BIT_AND_EXPR:
3935 return build_all_ones_cst (scalar_type);
3936
3937 case MAX_EXPR:
3938 case MIN_EXPR:
3939 return initial_value;
3940
3941 default:
3942 return NULL_TREE;
3943 }
3944 else
3945 switch (combined_fn (code))
3946 {
3947 CASE_CFN_FMIN:
3948 CASE_CFN_FMAX:
3949 return initial_value;
3950
3951 default:
3952 return NULL_TREE;
3953 }
3954 }
3955
3956 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3957 STMT is printed with a message MSG. */
3958
3959 static void
3960 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3961 {
/* Print MSG immediately followed by the textual form of STMT ("%G")
   at the current vectorizer dump location, classified as MSG_TYPE.  */
3962 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3963 }
3964
3965 /* Return true if we need an in-order reduction for operation CODE
3966 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
3967 overflow must wrap. */
3968
3969 bool
3970 needs_fold_left_reduction_p (tree type, code_helper code)
3971 {
3972 /* CHECKME: check for !flag_finite_math_only too? */
3973 if (SCALAR_FLOAT_TYPE_P (type))
3974 {
3975 if (code.is_tree_code ())
3976 switch (tree_code (code))
3977 {
3978 case MIN_EXPR:
3979 case MAX_EXPR:
3980 return false;
3981
3982 default:
3983 return !flag_associative_math;
3984 }
3985 else
3986 switch (combined_fn (code))
3987 {
3988 CASE_CFN_FMIN:
3989 CASE_CFN_FMAX:
3990 return false;
3991
3992 default:
3993 return !flag_associative_math;
3994 }
3995 }
3996
3997 if (INTEGRAL_TYPE_P (type))
3998 return (!code.is_tree_code ()
3999 || !operation_no_trapping_overflow (type, tree_code (code)));
4000
4001 if (SAT_FIXED_POINT_TYPE_P (type))
4002 return true;
4003
4004 return false;
4005 }
4006
4007 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
4008 has a handled computation expression. Store the main reduction
4009 operation in *CODE. */
4010
4011 static bool
4012 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4013 tree loop_arg, code_helper *code,
4014 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
4015 {
4016 auto_bitmap visited;
4017 tree lookfor = PHI_RESULT (phi);
4018 ssa_op_iter curri;
/* Position CURR at the PHI argument equal to LOOP_ARG so the walk
   starts from the latch value.  */
4019 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
4020 while (USE_FROM_PTR (curr) != loop_arg)
4021 curr = op_iter_next_use (&curri);
/* NOTE(review): marking the iterator exhausted here makes the
   backtracking step below move past this PHI use immediately -- this
   pokes at ssa_op_iter internals; confirm against ssa-iterators.h.  */
4022 curri.i = curri.numops;
4023 do
4024 {
4025 path.safe_push (std::make_pair (curri, curr));
4026 tree use = USE_FROM_PTR (curr);
4027 if (use == lookfor)
4028 break;
4029 gimple *def = SSA_NAME_DEF_STMT (use);
4030 if (gimple_nop_p (def)
4031 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
4032 {
4033 pop:
4034 do
4035 {
4036 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
4037 curri = x.first;
4038 curr = x.second;
4039 do
4040 curr = op_iter_next_use (&curri);
4041 /* Skip already visited or non-SSA operands (from iterating
4042 over PHI args). */
4043 while (curr != NULL_USE_OPERAND_P
4044 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4045 || ! bitmap_set_bit (visited,
4046 SSA_NAME_VERSION
4047 (USE_FROM_PTR (curr)))));
4048 }
4049 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
4050 if (curr == NULL_USE_OPERAND_P)
4051 break;
4052 }
4053 else
4054 {
4055 if (gimple_code (def) == GIMPLE_PHI)
4056 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
4057 else
4058 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
4059 while (curr != NULL_USE_OPERAND_P
4060 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
4061 || ! bitmap_set_bit (visited,
4062 SSA_NAME_VERSION
4063 (USE_FROM_PTR (curr)))))
4064 curr = op_iter_next_use (&curri);
4065 if (curr == NULL_USE_OPERAND_P)
4066 goto pop;
4067 }
4068 }
4069 while (1);
4070 if (dump_file && (dump_flags & TDF_DETAILS))
4071 {
4072 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
4073 unsigned i;
4074 std::pair<ssa_op_iter, use_operand_p> *x;
4075 FOR_EACH_VEC_ELT (path, i, x)
4076 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
4077 dump_printf (MSG_NOTE, "\n");
4078 }
4079
4080 /* Check whether the reduction path detected is valid. */
4081 bool fail = path.length () == 0;
4082 bool neg = false;
4083 int sign = -1;
4084 *code = ERROR_MARK;
4085 for (unsigned i = 1; i < path.length (); ++i)
4086 {
4087 gimple *use_stmt = USE_STMT (path[i].second);
4088 gimple_match_op op;
4089 if (!gimple_extract_op (use_stmt, &op))
4090 {
4091 fail = true;
4092 break;
4093 }
4094 unsigned int opi = op.num_ops;
4095 if (gassign *assign = dyn_cast<gassign *> (use_stmt))
4096 {
4097 /* The following make sure we can compute the operand index
4098 easily plus it mostly disallows chaining via COND_EXPR condition
4099 operands. */
4100 for (opi = 0; opi < op.num_ops; ++opi)
4101 if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
4102 break;
4103 }
4104 else if (gcall *call = dyn_cast<gcall *> (use_stmt))
4105 {
4106 for (opi = 0; opi < op.num_ops; ++opi)
4107 if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
4108 break;
4109 }
/* OPI still equal to num_ops means the chained use was not found among
   the statement's operands.  */
4110 if (opi == op.num_ops)
4111 {
4112 fail = true;
4113 break;
4114 }
4115 op.code = canonicalize_code (op.code, op.type);
4116 if (op.code == MINUS_EXPR)
4117 {
4118 op.code = PLUS_EXPR;
4119 /* Track whether we negate the reduction value each iteration. */
4120 if (op.ops[1] == op.ops[opi])
4121 neg = ! neg;
4122 }
4123 else if (op.code == IFN_COND_SUB)
4124 {
4125 op.code = IFN_COND_ADD;
4126 /* Track whether we negate the reduction value each iteration. */
4127 if (op.ops[2] == op.ops[opi])
4128 neg = ! neg;
4129 }
4130 if (CONVERT_EXPR_CODE_P (op.code)
4131 && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
4132 ;
4133 else if (*code == ERROR_MARK)
4134 {
4135 *code = op.code;
4136 sign = TYPE_SIGN (op.type);
4137 }
4138 else if (op.code != *code)
4139 {
4140 fail = true;
4141 break;
4142 }
4143 else if ((op.code == MIN_EXPR
4144 || op.code == MAX_EXPR)
4145 && sign != TYPE_SIGN (op.type))
4146 {
4147 fail = true;
4148 break;
4149 }
4150 /* Check there's only a single stmt the op is used on. For the
4151 not value-changing tail and the last stmt allow out-of-loop uses.
4152 ??? We could relax this and handle arbitrary live stmts by
4153 forcing a scalar epilogue for example. */
4154 imm_use_iterator imm_iter;
4155 use_operand_p use_p;
4156 gimple *op_use_stmt;
4157 unsigned cnt = 0;
4158 bool cond_fn_p = op.code.is_internal_fn ()
4159 && (conditional_internal_fn_code (internal_fn (op.code))
4160 != ERROR_MARK);
4161
4162 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
4163 {
4164 /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4165 op1 twice (once as definition, once as else) in the same operation.
4166 Allow this. */
4167 if (cond_fn_p && op_use_stmt == use_stmt)
4168 {
4169 gcall *call = as_a<gcall *> (use_stmt);
4170 unsigned else_pos
4171 = internal_fn_else_index (internal_fn (op.code));
4172
4173 for (unsigned int j = 0; j < gimple_call_num_args (call); ++j)
4174 {
4175 if (j == else_pos)
4176 continue;
4177 if (gimple_call_arg (call, j) == op.ops[opi])
4178 cnt++;
4179 }
4180 }
4181 else if (!is_gimple_debug (op_use_stmt)
4182 && (*code != ERROR_MARK
4183 || flow_bb_inside_loop_p (loop,
4184 gimple_bb (op_use_stmt))))
4185 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4186 cnt++;
4187 }
4188
4189 if (cnt != 1)
4190 {
4191 fail = true;
4192 break;
4193 }
4194 }
4195 return ! fail && ! neg && *code != ERROR_MARK;
4196 }
4197
4198 bool
4199 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
4200 tree loop_arg, enum tree_code code)
4201 {
4202 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4203 code_helper code_;
4204 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
4205 && code_ == code);
4206 }
4207
4208
4209
4210 /* Function vect_is_simple_reduction
4211
4212 (1) Detect a cross-iteration def-use cycle that represents a simple
4213 reduction computation. We look for the following pattern:
4214
4215 loop_header:
4216 a1 = phi < a0, a2 >
4217 a3 = ...
4218 a2 = operation (a3, a1)
4219
4220 or
4221
4222 a3 = ...
4223 loop_header:
4224 a1 = phi < a0, a2 >
4225 a2 = operation (a3, a1)
4226
4227 such that:
4228 1. operation is commutative and associative and it is safe to
4229 change the order of the computation
4230 2. no uses for a2 in the loop (a2 is used out of the loop)
4231 3. no uses of a1 in the loop besides the reduction operation
4232 4. no uses of a1 outside the loop.
4233
4234 Conditions 1,4 are tested here.
4235 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
4236
4237 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
4238 nested cycles.
4239
4240 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
4241 reductions:
4242
4243 a1 = phi < a0, a2 >
4244 inner loop (def of a3)
4245 a2 = phi < a3 >
4246
4247 (4) Detect condition expressions, ie:
4248 for (int i = 0; i < N; i++)
4249 if (a[i] < val)
4250 ret_val = a[i];
4251
4252 */
4253
4254 static stmt_vec_info
4255 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
4256 bool *double_reduc, bool *reduc_chain_p, bool slp)
4257 {
4258 gphi *phi = as_a <gphi *> (phi_info->stmt);
4259 gimple *phi_use_stmt = NULL;
4260 imm_use_iterator imm_iter;
4261 use_operand_p use_p;
4262
4263 *double_reduc = false;
4264 *reduc_chain_p = false;
4265 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
4266
4267 tree phi_name = PHI_RESULT (phi);
4268 /* ??? If there are no uses of the PHI result the inner loop reduction
4269 won't be detected as possibly double-reduction by vectorizable_reduction
4270 because that tries to walk the PHI arg from the preheader edge which
4271 can be constant. See PR60382. */
4272 if (has_zero_uses (phi_name))
4273 return NULL;
4274 class loop *loop = (gimple_bb (phi))->loop_father;
/* Count in-loop uses of the PHI result; any use outside LOOP makes the
   cycle unhandled.  */
4275 unsigned nphi_def_loop_uses = 0;
4276 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
4277 {
4278 gimple *use_stmt = USE_STMT (use_p);
4279 if (is_gimple_debug (use_stmt))
4280 continue;
4281
4282 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4283 {
4284 if (dump_enabled_p ())
4285 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4286 "intermediate value used outside loop.\n");
4287
4288 return NULL;
4289 }
4290
4291 /* In case of a COND_OP (mask, op1, op2, op1) reduction we might have
4292 op1 twice (once as definition, once as else) in the same operation.
4293 Only count it as one. */
4294 if (use_stmt != phi_use_stmt)
4295 {
4296 nphi_def_loop_uses++;
4297 phi_use_stmt = use_stmt;
4298 }
4299 }
4300
/* The reduction is defined by the value flowing into the PHI over the
   latch edge.  */
4301 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
4302 if (TREE_CODE (latch_def) != SSA_NAME)
4303 {
4304 if (dump_enabled_p ())
4305 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4306 "reduction: not ssa_name: %T\n", latch_def);
4307 return NULL;
4308 }
4309
4310 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
4311 if (!def_stmt_info
4312 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
4313 return NULL;
4314
4315 bool nested_in_vect_loop
4316 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
4317 unsigned nlatch_def_loop_uses = 0;
4318 auto_vec<gphi *, 3> lcphis;
4319 bool inner_loop_of_double_reduc = false;
4320 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
4321 {
4322 gimple *use_stmt = USE_STMT (use_p);
4323 if (is_gimple_debug (use_stmt))
4324 continue;
4325 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4326 nlatch_def_loop_uses++;
4327 else
4328 {
4329 /* We can have more than one loop-closed PHI. */
4330 lcphis.safe_push (as_a <gphi *> (use_stmt));
4331 if (nested_in_vect_loop
4332 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
4333 == vect_double_reduction_def))
4334 inner_loop_of_double_reduc = true;
4335 }
4336 }
4337
4338 /* If we are vectorizing an inner reduction we are executing that
4339 in the original order only in case we are not dealing with a
4340 double reduction. */
4341 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
4342 {
4343 if (dump_enabled_p ())
4344 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
4345 "detected nested cycle: ");
4346 return def_stmt_info;
4347 }
4348
4349 /* When the inner loop of a double reduction ends up with more than
4350 one loop-closed PHI we have failed to classify alternate such
4351 PHIs as double reduction, leading to wrong code. See PR103237. */
4352 if (inner_loop_of_double_reduc && lcphis.length () != 1)
4353 {
4354 if (dump_enabled_p ())
4355 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4356 "unhandle double reduction\n");
4357 return NULL;
4358 }
4359
4360 /* If this isn't a nested cycle or if the nested cycle reduction value
4361 is used outside of the inner loop we cannot handle uses of the reduction
4362 value. */
4363 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
4364 {
4365 if (dump_enabled_p ())
4366 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4367 "reduction used in loop.\n")
4368 return NULL;
4369 }
4370
4371 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
4372 defined in the inner loop. */
4373 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
4374 {
4375 tree op1 = PHI_ARG_DEF (def_stmt, 0);
4376 if (gimple_phi_num_args (def_stmt) != 1
4377 || TREE_CODE (op1) != SSA_NAME)
4378 {
4379 if (dump_enabled_p ())
4380 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4381 "unsupported phi node definition.\n");
4382
4383 return NULL;
4384 }
4385
4386 /* Verify there is an inner cycle composed of the PHI phi_use_stmt
4387 and the latch definition op1. */
4388 gimple *def1 = SSA_NAME_DEF_STMT (op1);
4389 if (gimple_bb (def1)
4390 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4391 && loop->inner
4392 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
4393 && (is_gimple_assign (def1) || is_gimple_call (def1))
4394 && is_a <gphi *> (phi_use_stmt)
4395 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
4396 && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
4397 loop_latch_edge (loop->inner))))
4398 {
4399 if (dump_enabled_p ())
4400 report_vect_op (MSG_NOTE, def_stmt,
4401 "detected double reduction: ");
4402
4403 *double_reduc = true;
4404 return def_stmt_info;
4405 }
4406
4407 return NULL;
4408 }
4409
4410 /* Look for the expression computing latch_def from the loop PHI result. */
4411 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4412 code_helper code;
4413 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
4414 path))
4415 {
4416 STMT_VINFO_REDUC_CODE (phi_info) = code;
4417 if (code == COND_EXPR && !nested_in_vect_loop)
4418 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
4419
4420 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
4421 reduction chain for which the additional restriction is that
4422 all operations in the chain are the same. */
4423 auto_vec<stmt_vec_info, 8> reduc_chain;
4424 unsigned i;
4425 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
/* Walk the path from the latch definition back to the PHI (PATH is
   stored in that reverse order, hence the descending loop).  */
4426 for (i = path.length () - 1; i >= 1; --i)
4427 {
4428 gimple *stmt = USE_STMT (path[i].second);
4429 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
4430 gimple_match_op op;
4431 if (!gimple_extract_op (stmt, &op))
4432 gcc_unreachable ();
4433 if (gassign *assign = dyn_cast<gassign *> (stmt))
4434 STMT_VINFO_REDUC_IDX (stmt_info)
4435 = path[i].second->use - gimple_assign_rhs1_ptr (assign);
4436 else
4437 {
4438 gcall *call = as_a<gcall *> (stmt);
4439 STMT_VINFO_REDUC_IDX (stmt_info)
4440 = path[i].second->use - gimple_call_arg_ptr (call, 0);
4441 }
4442 bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
4443 && (i == 1 || i == path.length () - 1));
4444 if ((op.code != code && !leading_conversion)
4445 /* We can only handle the final value in epilogue
4446 generation for reduction chains. */
4447 || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
4448 is_slp_reduc = false;
4449 /* For reduction chains we support a trailing/leading
4450 conversions. We do not store those in the actual chain. */
4451 if (leading_conversion)
4452 continue;
4453 reduc_chain.safe_push (stmt_info);
4454 }
4455 if (slp && is_slp_reduc && reduc_chain.length () > 1)
4456 {
/* Link the chain members through REDUC_GROUP_{FIRST,NEXT}_ELEMENT,
   terminated with a NULL next pointer.  */
4457 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
4458 {
4459 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
4460 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
4461 }
4462 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
4463 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
4464
4465 /* Save the chain for further analysis in SLP detection. */
4466 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
4467 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
4468
4469 *reduc_chain_p = true;
4470 if (dump_enabled_p ())
4471 dump_printf_loc (MSG_NOTE, vect_location,
4472 "reduction: detected reduction chain\n");
4473 }
4474 else if (dump_enabled_p ())
4475 dump_printf_loc (MSG_NOTE, vect_location,
4476 "reduction: detected reduction\n");
4477
4478 return def_stmt_info;
4479 }
4480
4481 if (dump_enabled_p ())
4482 dump_printf_loc (MSG_NOTE, vect_location,
4483 "reduction: unknown pattern\n");
4484
4485 return NULL;
4486 }
4487
4488 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4489 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4490 or -1 if not known. */
4491
4492 static int
4493 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
4494 {
4495 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4496 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
4497 {
4498 if (dump_enabled_p ())
4499 dump_printf_loc (MSG_NOTE, vect_location,
4500 "cost model: epilogue peel iters set to vf/2 "
4501 "because loop iterations are unknown .\n");
4502 return assumed_vf / 2;
4503 }
4504 else
4505 {
4506 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4507 peel_iters_prologue = MIN (niters, peel_iters_prologue);
4508 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
4509 /* If we need to peel for gaps, but no peeling is required, we have to
4510 peel VF iterations. */
4511 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4512 peel_iters_epilogue = assumed_vf;
4513 return peel_iters_epilogue;
4514 }
4515 }
4516
4517 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
4518 int
4519 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4520 int *peel_iters_epilogue,
4521 stmt_vector_for_cost *scalar_cost_vec,
4522 stmt_vector_for_cost *prologue_cost_vec,
4523 stmt_vector_for_cost *epilogue_cost_vec)
4524 {
4525 int retval = 0;
4526
4527 *peel_iters_epilogue
4528 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4529
4530 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4531 {
4532 /* If peeled iterations are known but number of scalar loop
4533 iterations are unknown, count a taken branch per peeled loop. */
4534 if (peel_iters_prologue > 0)
4535 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4536 vect_prologue);
4537 if (*peel_iters_epilogue > 0)
4538 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4539 vect_epilogue);
4540 }
4541
4542 stmt_info_for_cost *si;
4543 int j;
4544 if (peel_iters_prologue)
4545 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4546 retval += record_stmt_cost (prologue_cost_vec,
4547 si->count * peel_iters_prologue,
4548 si->kind, si->stmt_info, si->misalign,
4549 vect_prologue);
4550 if (*peel_iters_epilogue)
4551 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4552 retval += record_stmt_cost (epilogue_cost_vec,
4553 si->count * *peel_iters_epilogue,
4554 si->kind, si->stmt_info, si->misalign,
4555 vect_epilogue);
4556
4557 return retval;
4558 }
4559
/* Function vect_estimate_min_profitable_iters

   Return the number of iterations required for the vector version of the
   loop to be profitable relative to the cost of the scalar version of the
   loop.

   *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
   of iterations for vectorization.  -1 value means loop vectorization
   is not profitable.  This returned value may be used for dynamic
   profitability check.

   *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
   for static check against estimated number of iterations.

   SUGGESTED_UNROLL_FACTOR, if nonnull, is filled in by the target via
   finish_cost with its preferred unrolling factor for the vector loop;
   it is reset to 1 here if unrolling would push the vectorization
   factor beyond LOOP_VINFO_MAX_VECT_FACTOR.  */

static void
vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
				    int *ret_min_profitable_niters,
				    int *ret_min_profitable_estimate,
				    unsigned *suggested_unroll_factor)
{
  int min_profitable_iters;
  int min_profitable_estimate;
  int peel_iters_prologue;
  int peel_iters_epilogue;
  unsigned vec_inside_cost = 0;
  int vec_outside_cost = 0;
  unsigned vec_prologue_cost = 0;
  unsigned vec_epilogue_cost = 0;
  int scalar_single_iter_cost = 0;
  int scalar_outside_cost = 0;
  int assumed_vf = vect_vf_for_cost (loop_vinfo);
  int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
  vector_costs *target_cost_data = loop_vinfo->vector_costs;

  /* Cost model disabled.  */
  if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
      *ret_min_profitable_niters = 0;
      *ret_min_profitable_estimate = 0;
      return;
    }

  /* Requires loop versioning tests to handle misalignment.  */
  if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
    {
      /*  FIXME: Make cost depend on complexity of individual check.  */
      unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
      (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
      if (dump_enabled_p ())
	dump_printf (MSG_NOTE,
		     "cost model: Adding cost of checks for loop "
		     "versioning to treat misalignment.\n");
    }

  /* Requires loop versioning with alias checks.  */
  if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
    {
      /*  FIXME: Make cost depend on complexity of individual check.  */
      unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
      (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
      len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
      if (len)
	/* Count LEN - 1 ANDs and LEN comparisons.  */
	(void) add_stmt_cost (target_cost_data, len * 2 - 1,
			      scalar_stmt, vect_prologue);
      len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
      if (len)
	{
	  /* Count LEN - 1 ANDs and LEN comparisons.  */
	  unsigned int nstmts = len * 2 - 1;
	  /* +1 for each bias that needs adding.  */
	  for (unsigned int i = 0; i < len; ++i)
	    if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
	      nstmts += 1;
	  (void) add_stmt_cost (target_cost_data, nstmts,
				scalar_stmt, vect_prologue);
	}
      if (dump_enabled_p ())
	dump_printf (MSG_NOTE,
		     "cost model: Adding cost of checks for loop "
		     "versioning aliasing.\n");
    }

  /* Requires loop versioning with niter checks.  */
  if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
    {
      /*  FIXME: Make cost depend on complexity of individual check.  */
      (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
			    NULL, NULL, NULL_TREE, 0, vect_prologue);
      if (dump_enabled_p ())
	dump_printf (MSG_NOTE,
		     "cost model: Adding cost of checks for loop "
		     "versioning niters.\n");
    }

  if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
    /* One branch to select the versioned loop at runtime.  */
    (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
			  vect_prologue);

  /* Count statements in scalar loop.  Using this as scalar cost for a single
     iteration for now.

     TODO: Add outer loop support.

     TODO: Consider assigning different costs to different scalar
     statements.  */

  scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();

  /* Add additional cost for the peeled instructions in prologue and epilogue
     loop.  (For fully-masked loops there will be no peeling.)

     FORNOW: If we don't know the value of peel_iters for prologue or epilogue
     at compile-time - we assume it's vf/2 (the worst would be vf-1).

     TODO: Build an expression that represents peel_iters for prologue and
     epilogue to be used in a run-time test.  */

  bool prologue_need_br_taken_cost = false;
  bool prologue_need_br_not_taken_cost = false;

  /* Calculate peel_iters_prologue.  */
  if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
    peel_iters_prologue = 0;
  else if (npeel < 0)
    {
      peel_iters_prologue = assumed_vf / 2;
      if (dump_enabled_p ())
	dump_printf (MSG_NOTE, "cost model: "
		     "prologue peel iters set to vf/2.\n");

      /* If peeled iterations are unknown, count a taken branch and a not taken
	 branch per peeled loop.  Even if scalar loop iterations are known,
	 vector iterations are not known since peeled prologue iterations are
	 not known.  Hence guards remain the same.  */
      prologue_need_br_taken_cost = true;
      prologue_need_br_not_taken_cost = true;
    }
  else
    {
      peel_iters_prologue = npeel;
      if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
	/* If peeled iterations are known but number of scalar loop
	   iterations are unknown, count a taken branch per peeled loop.  */
	prologue_need_br_taken_cost = true;
    }

  bool epilogue_need_br_taken_cost = false;
  bool epilogue_need_br_not_taken_cost = false;

  /* Calculate peel_iters_epilogue.  */
  if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    /* We need to peel exactly one iteration for gaps.  */
    peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
  else if (npeel < 0)
    {
      /* If peeling for alignment is unknown, loop bound of main loop
	 becomes unknown.  */
      peel_iters_epilogue = assumed_vf / 2;
      if (dump_enabled_p ())
	dump_printf (MSG_NOTE, "cost model: "
		     "epilogue peel iters set to vf/2 because "
		     "peeling for alignment is unknown.\n");

      /* See the same reason above in peel_iters_prologue calculation.  */
      epilogue_need_br_taken_cost = true;
      epilogue_need_br_not_taken_cost = true;
    }
  else
    {
      peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
      if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
	/* If peeled iterations are known but number of scalar loop
	   iterations are unknown, count a taken branch per peeled loop.  */
	epilogue_need_br_taken_cost = true;
    }

  stmt_info_for_cost *si;
  int j;
  /* Add costs associated with peel_iters_prologue: each peeled iteration
     executes the scalar loop body once.  */
  if (peel_iters_prologue)
    FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
      {
	(void) add_stmt_cost (target_cost_data,
			      si->count * peel_iters_prologue, si->kind,
			      si->stmt_info, si->node, si->vectype,
			      si->misalign, vect_prologue);
      }

  /* Add costs associated with peel_iters_epilogue.  */
  if (peel_iters_epilogue)
    FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
      {
	(void) add_stmt_cost (target_cost_data,
			      si->count * peel_iters_epilogue, si->kind,
			      si->stmt_info, si->node, si->vectype,
			      si->misalign, vect_epilogue);
      }

  /* Add possible cond_branch_taken/cond_branch_not_taken cost.  */

  if (prologue_need_br_taken_cost)
    (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
			  vect_prologue);

  if (prologue_need_br_not_taken_cost)
    (void) add_stmt_cost (target_cost_data, 1,
			  cond_branch_not_taken, vect_prologue);

  if (epilogue_need_br_taken_cost)
    (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
			  vect_epilogue);

  if (epilogue_need_br_not_taken_cost)
    (void) add_stmt_cost (target_cost_data, 1,
			  cond_branch_not_taken, vect_epilogue);

  /* Take care of special costs for rgroup controls of partial vectors.  */
  if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
      && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
	  == vect_partial_vectors_avx512))
    {
      /* Calculate how many masks we need to generate.  */
      unsigned int num_masks = 0;
      bool need_saturation = false;
      for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
	if (rgm.type)
	  {
	    unsigned nvectors = rgm.factor;
	    num_masks += nvectors;
	    /* A comparison type narrower than the IV type means the IV
	       value has to be saturated before the compare.  */
	    if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
		< TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
	      need_saturation = true;
	  }

      /* ???  The target isn't able to identify the costs below as
	 producing masks so it cannot penalize cases where we'd run
	 out of mask registers for example.  */

      /* ???  We are also failing to account for smaller vector masks
	 we generate by splitting larger masks in vect_get_loop_mask.  */

      /* In the worst case, we need to generate each mask in the prologue
	 and in the loop body.  We need one splat per group and one
	 compare per mask.

	 Sometimes the prologue mask will fold to a constant,
	 so the actual prologue cost might be smaller.  However, it's
	 simpler and safer to use the worst-case cost; if this ends up
	 being the tie-breaker between vectorizing or not, then it's
	 probably better not to vectorize.  */
      (void) add_stmt_cost (target_cost_data,
			    num_masks
			    + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
			    vector_stmt, NULL, NULL, NULL_TREE, 0,
			    vect_prologue);
      (void) add_stmt_cost (target_cost_data,
			    num_masks
			    + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
			    vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);

      /* When we need saturation we need it both in the prologue and
	 the epilogue.  */
      if (need_saturation)
	{
	  (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
				NULL, NULL, NULL_TREE, 0, vect_prologue);
	  (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
				NULL, NULL, NULL_TREE, 0, vect_body);
	}
    }
  else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
	   && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
	       == vect_partial_vectors_while_ult))
    {
      /* Calculate how many masks we need to generate.  */
      unsigned int num_masks = 0;
      rgroup_controls *rgm;
      unsigned int num_vectors_m1;
      FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
			num_vectors_m1, rgm)
	if (rgm->type)
	  num_masks += num_vectors_m1 + 1;
      gcc_assert (num_masks > 0);

      /* In the worst case, we need to generate each mask in the prologue
	 and in the loop body.  One of the loop body mask instructions
	 replaces the comparison in the scalar loop, and since we don't
	 count the scalar comparison against the scalar body, we shouldn't
	 count that vector instruction against the vector body either.

	 Sometimes we can use unpacks instead of generating prologue
	 masks and sometimes the prologue mask will fold to a constant,
	 so the actual prologue cost might be smaller.  However, it's
	 simpler and safer to use the worst-case cost; if this ends up
	 being the tie-breaker between vectorizing or not, then it's
	 probably better not to vectorize.  */
      (void) add_stmt_cost (target_cost_data, num_masks,
			    vector_stmt, NULL, NULL, NULL_TREE, 0,
			    vect_prologue);
      (void) add_stmt_cost (target_cost_data, num_masks - 1,
			    vector_stmt, NULL, NULL, NULL_TREE, 0,
			    vect_body);
    }
  else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
    {
      /* Referring to the functions vect_set_loop_condition_partial_vectors
	 and vect_set_loop_controls_directly, we need to generate each
	 length in the prologue and in the loop body if required.  Although
	 there are some possible optimizations, we consider the worst case
	 here.  */

      bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
      signed char partial_load_store_bias
	= LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
      bool need_iterate_p
	= (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
	   && !vect_known_niters_smaller_than_vf (loop_vinfo));

      /* Calculate how many statements to be added.  */
      unsigned int prologue_stmts = 0;
      unsigned int body_stmts = 0;

      rgroup_controls *rgc;
      unsigned int num_vectors_m1;
      FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
	if (rgc->type)
	  {
	    /* May need one SHIFT for nitems_total computation.  */
	    unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
	    if (nitems != 1 && !niters_known_p)
	      prologue_stmts += 1;

	    /* May need one MAX and one MINUS for wrap around.  */
	    if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
	      prologue_stmts += 2;

	    /* Need one MAX and one MINUS for each batch limit excepting for
	       the 1st one.  */
	    prologue_stmts += num_vectors_m1 * 2;

	    unsigned int num_vectors = num_vectors_m1 + 1;

	    /* Need to set up lengths in prologue, only one MIN required
	       for each since start index is zero.  */
	    prologue_stmts += num_vectors;

	    /* If we have a non-zero partial load bias, we need one PLUS
	       to adjust the load length.  */
	    if (partial_load_store_bias != 0)
	      body_stmts += 1;

	    unsigned int length_update_cost = 0;
	    if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
	      /* For decrement IV style, each only needs a single SELECT_VL
		 or MIN since beginning to calculate the number of elements
		 need to be processed in current iteration.  */
	      length_update_cost = 1;
	    else
	      /* For increment IV style, each may need two MINs and one
		 MINUS to update lengths in body for next iteration.  */
	      length_update_cost = 3;

	    if (need_iterate_p)
	      body_stmts += length_update_cost * num_vectors;
	  }

      (void) add_stmt_cost (target_cost_data, prologue_stmts,
			    scalar_stmt, vect_prologue);
      (void) add_stmt_cost (target_cost_data, body_stmts,
			    scalar_stmt, vect_body);
    }

  /* FORNOW: The scalar outside cost is incremented in one of the
     following ways:

     1. The vectorizer checks for alignment and aliasing and generates
     a condition that allows dynamic vectorization.  A cost model
     check is ANDED with the versioning condition.  Hence scalar code
     path now has the added cost of the versioning check.

       if (cost > th & versioning_check)
	 jmp to vector code

     Hence run-time scalar is incremented by not-taken branch cost.

     2. The vectorizer then checks if a prologue is required.  If the
     cost model check was not done before during versioning, it has to
     be done before the prologue check.

       if (cost <= th)
	 prologue = scalar_iters
       if (prologue == 0)
	 jmp to vector code
       else
	 execute prologue
       if (prologue == num_iters)
	 go to exit

     Hence the run-time scalar cost is incremented by a taken branch,
     plus a not-taken branch, plus a taken branch cost.

     3. The vectorizer then checks if an epilogue is required.  If the
     cost model check was not done before during prologue check, it
     has to be done with the epilogue check.

       if (prologue == 0)
	 jmp to vector code
       else
	 execute prologue
       if (prologue == num_iters)
	 go to exit
       vector code:
	 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
	   jmp to epilogue

     Hence the run-time scalar cost should be incremented by 2 taken
     branches.

     TODO: The back end may reorder the BBS's differently and reverse
     conditions/branch directions.  Change the estimates below to
     something more reasonable.  */

  /* If the number of iterations is known and we do not do versioning, we can
     decide whether to vectorize at compile time.  Hence the scalar version
     do not carry cost model guard costs.  */
  if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      || LOOP_REQUIRES_VERSIONING (loop_vinfo))
    {
      /* Cost model check occurs at versioning.  */
      if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
	scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
      else
	{
	  /* Cost model check occurs at prologue generation.  */
	  if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
	    scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
	      + vect_get_stmt_cost (cond_branch_not_taken);
	  /* Cost model check occurs at epilogue generation.  */
	  else
	    scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
	}
    }

  /* Complete the target-specific cost calculations.  */
  finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
	       &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
	       suggested_unroll_factor);

  /* Reject a target-suggested unroll factor that would make the unrolled
     vectorization factor exceed the maximum supported one.  */
  if (suggested_unroll_factor && *suggested_unroll_factor > 1
      && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
      && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
		    *suggested_unroll_factor,
		    LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "can't unroll as unrolled vectorization factor larger"
			 " than maximum vectorization factor: "
			 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
			 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
      *suggested_unroll_factor = 1;
    }

  vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);

  if (dump_enabled_p ())
    {
      dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
      dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
		   vec_inside_cost);
      dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
		   vec_prologue_cost);
      dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
		   vec_epilogue_cost);
      dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
		   scalar_single_iter_cost);
      dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
		   scalar_outside_cost);
      dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
		   vec_outside_cost);
      dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
		   peel_iters_prologue);
      dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
		   peel_iters_epilogue);
    }

  /* Calculate number of iterations required to make the vector version
     profitable, relative to the loop bodies only.  The following condition
     must hold true:
     SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
     where
     SIC = scalar iteration cost, VIC = vector iteration cost,
     VOC = vector outside cost, VF = vectorization factor,
     NPEEL = prologue iterations + epilogue iterations,
     SOC = scalar outside cost for run time cost model check.  */

  /* Cost saved by one vector iteration relative to the ASSUMED_VF scalar
     iterations it replaces; non-positive means vectorizing can never pay
     off in the loop body.  */
  int saving_per_viter = (scalar_single_iter_cost * assumed_vf
			  - vec_inside_cost);
  if (saving_per_viter <= 0)
    {
      if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
	warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
		    "vectorization did not happen for a simd loop");

      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "cost model: the vector iteration cost = %d "
			 "divided by the scalar iteration cost = %d "
			 "is greater or equal to the vectorization factor = %d"
			 ".\n",
			 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
      *ret_min_profitable_niters = -1;
      *ret_min_profitable_estimate = -1;
      return;
    }

  /* ??? The "if" arm is written to handle all cases; see below for what
     we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
  if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    {
      /* Rewriting the condition above in terms of the number of
	 vector iterations (vniters) rather than the number of
	 scalar iterations (niters) gives:

	 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC

	 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC

	 For integer N, X and Y when X > 0:

	 N * X > Y <==> N >= (Y /[floor] X) + 1.  */
      int outside_overhead = (vec_outside_cost
			      - scalar_single_iter_cost * peel_iters_prologue
			      - scalar_single_iter_cost * peel_iters_epilogue
			      - scalar_outside_cost);
      /* We're only interested in cases that require at least one
	 vector iteration.  */
      int min_vec_niters = 1;
      if (outside_overhead > 0)
	min_vec_niters = outside_overhead / saving_per_viter + 1;

      if (dump_enabled_p ())
	dump_printf (MSG_NOTE, "  Minimum number of vector iterations: %d\n",
		     min_vec_niters);

      if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
	{
	  /* Now that we know the minimum number of vector iterations,
	     find the minimum niters for which the scalar cost is larger:

	     SIC * niters > VIC * vniters + VOC - SOC

	     We know that the minimum niters is no more than
	     vniters * VF + NPEEL, but it might be (and often is) less
	     than that if a partial vector iteration is cheaper than the
	     equivalent scalar code.  */
	  int threshold = (vec_inside_cost * min_vec_niters
			   + vec_outside_cost
			   - scalar_outside_cost);
	  if (threshold <= 0)
	    min_profitable_iters = 1;
	  else
	    min_profitable_iters = threshold / scalar_single_iter_cost + 1;
	}
      else
	/* Convert the number of vector iterations into a number of
	   scalar iterations.  */
	min_profitable_iters = (min_vec_niters * assumed_vf
				+ peel_iters_prologue
				+ peel_iters_epilogue);
    }
  else
    {
      min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
			      * assumed_vf
			      - vec_inside_cost * peel_iters_prologue
			      - vec_inside_cost * peel_iters_epilogue);
      if (min_profitable_iters <= 0)
	min_profitable_iters = 0;
      else
	{
	  min_profitable_iters /= saving_per_viter;

	  /* The division above floors; bump by one if the costs still
	     tie or favor the scalar loop at the computed count.  */
	  if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
	      <= (((int) vec_inside_cost * min_profitable_iters)
		  + (((int) vec_outside_cost - scalar_outside_cost)
		     * assumed_vf)))
	    min_profitable_iters++;
	}
    }

  if (dump_enabled_p ())
    dump_printf (MSG_NOTE,
		 "  Calculated minimum iters for profitability: %d\n",
		 min_profitable_iters);

  if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
      && min_profitable_iters < (assumed_vf + peel_iters_prologue))
    /* We want the vectorized loop to execute at least once.  */
    min_profitable_iters = assumed_vf + peel_iters_prologue;
  else if (min_profitable_iters < peel_iters_prologue)
    /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
       vectorized loop executes at least once.  */
    min_profitable_iters = peel_iters_prologue;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "  Runtime profitability threshold = %d\n",
		     min_profitable_iters);

  *ret_min_profitable_niters = min_profitable_iters;

  /* Calculate number of iterations required to make the vector version
     profitable, relative to the loop bodies only.

     Non-vectorized variant is SIC * niters and it must win over vector
     variant on the expected loop trip count.  The following condition must hold true:
     SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC  */

  if (vec_outside_cost <= 0)
    min_profitable_estimate = 0;
  /* ??? This "else if" arm is written to handle all cases; see below for
     what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P.  */
  else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    {
      /* This is a repeat of the code above, but with + SOC rather
	 than - SOC.  */
      int outside_overhead = (vec_outside_cost
			      - scalar_single_iter_cost * peel_iters_prologue
			      - scalar_single_iter_cost * peel_iters_epilogue
			      + scalar_outside_cost);
      int min_vec_niters = 1;
      if (outside_overhead > 0)
	min_vec_niters = outside_overhead / saving_per_viter + 1;

      if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
	{
	  int threshold = (vec_inside_cost * min_vec_niters
			   + vec_outside_cost
			   + scalar_outside_cost);
	  min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
	}
      else
	min_profitable_estimate = (min_vec_niters * assumed_vf
				   + peel_iters_prologue
				   + peel_iters_epilogue);
    }
  else
    {
      min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
				 * assumed_vf
				 - vec_inside_cost * peel_iters_prologue
				 - vec_inside_cost * peel_iters_epilogue)
				 / ((scalar_single_iter_cost * assumed_vf)
				    - vec_inside_cost);
    }
  min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "  Static estimate profitability threshold = %d\n",
		     min_profitable_estimate);

  *ret_min_profitable_estimate = min_profitable_estimate;
}
5227
5228 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
5229 vector elements (not bits) for a vector with NELT elements. */
5230 static void
5231 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
5232 vec_perm_builder *sel)
5233 {
5234 /* The encoding is a single stepped pattern. Any wrap-around is handled
5235 by vec_perm_indices. */
5236 sel->new_vector (nelt, 1, 3);
5237 for (unsigned int i = 0; i < 3; i++)
5238 sel->quick_push (i + offset);
5239 }
5240
5241 /* Checks whether the target supports whole-vector shifts for vectors of mode
5242 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
5243 it supports vec_perm_const with masks for all necessary shift amounts. */
5244 static bool
5245 have_whole_vector_shift (machine_mode mode)
5246 {
5247 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
5248 return true;
5249
5250 /* Variable-length vectors should be handled via the optab. */
5251 unsigned int nelt;
5252 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5253 return false;
5254
5255 vec_perm_builder sel;
5256 vec_perm_indices indices;
5257 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
5258 {
5259 calc_vec_perm_mask_for_shift (i, nelt, &sel);
5260 indices.new_vector (sel, 2, nelt);
5261 if (!can_vec_perm_const_p (mode, mode, indices, false))
5262 return false;
5263 }
5264 return true;
5265 }
5266
5267 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
5268 multiplication operands have differing signs and (b) we intend
5269 to emulate the operation using a series of signed DOT_PROD_EXPRs.
5270 See vect_emulate_mixed_dot_prod for the actual sequence used. */
5271
5272 static bool
5273 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
5274 stmt_vec_info stmt_info)
5275 {
5276 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
5277 if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
5278 return false;
5279
5280 tree rhs1 = gimple_assign_rhs1 (assign);
5281 tree rhs2 = gimple_assign_rhs2 (assign);
5282 if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
5283 return false;
5284
5285 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5286 gcc_assert (reduc_info->is_reduc_info);
5287 return !directly_supported_p (DOT_PROD_EXPR,
5288 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
5289 optab_vector_mixed_sign);
5290 }
5291
5292 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
5293 functions. Design better to avoid maintenance issues. */
5294
/* Function vect_model_reduction_cost.

   Models cost for a reduction operation, including the vector ops
   generated within the strip-mine loop in some cases, the initial
   definition before the loop, and the epilogue code that must be generated.

   The costs are recorded in COST_VEC.  REDUC_FN is the internal function
   implementing the reduction, or IFN_LAST if none is available;
   REDUCTION_TYPE selects the costing strategy and NCOPIES is the number
   of vector statements generated per scalar iteration.  */

static void
vect_model_reduction_cost (loop_vec_info loop_vinfo,
			   stmt_vec_info stmt_info, internal_fn reduc_fn,
			   vect_reduction_type reduction_type,
			   int ncopies, stmt_vector_for_cost *cost_vec)
{
  int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
  tree vectype;
  machine_mode mode;
  class loop *loop = NULL;

  if (loop_vinfo)
    loop = LOOP_VINFO_LOOP (loop_vinfo);

  /* Condition reductions generate two reductions in the loop.  */
  if (reduction_type == COND_REDUCTION)
    ncopies *= 2;

  vectype = STMT_VINFO_VECTYPE (stmt_info);
  mode = TYPE_MODE (vectype);
  stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);

  /* Extract the operation so op.code/op.type describe the scalar
     reduction statement.  */
  gimple_match_op op;
  if (!gimple_extract_op (orig_stmt_info->stmt, &op))
    gcc_unreachable ();

  bool emulated_mixed_dot_prod
    = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
  if (reduction_type == EXTRACT_LAST_REDUCTION)
    /* No extra instructions are needed in the prologue.  The loop body
       operations are costed in vectorizable_condition.  */
    inside_cost = 0;
  else if (reduction_type == FOLD_LEFT_REDUCTION)
    {
      /* No extra instructions needed in the prologue.  */
      prologue_cost = 0;

      if (reduc_fn != IFN_LAST)
	/* Count one reduction-like operation per vector.  */
	inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
					stmt_info, 0, vect_body);
      else
	{
	  /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
	  unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
	  inside_cost = record_stmt_cost (cost_vec, nelements,
					  vec_to_scalar, stmt_info, 0,
					  vect_body);
	  inside_cost += record_stmt_cost (cost_vec, nelements,
					   scalar_stmt, stmt_info, 0,
					   vect_body);
	}
    }
  else
    {
      /* Add in the cost of the initial definitions.  */
      int prologue_stmts;
      if (reduction_type == COND_REDUCTION)
	/* For cond reductions we have four vectors: initial index, step,
	   initial result of the data reduction, initial value of the index
	   reduction.  */
	prologue_stmts = 4;
      else if (emulated_mixed_dot_prod)
	/* We need the initial reduction value and two invariants:
	   one that contains the minimum signed value and one that
	   contains half of its negative.  */
	prologue_stmts = 3;
      else
	prologue_stmts = 1;
      prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
					 scalar_to_vec, stmt_info, 0,
					 vect_prologue);
    }

  /* Determine cost of epilogue code.

     We have a reduction operator that will reduce the vector in one statement.
     Also requires scalar extract.  */

  if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
    {
      if (reduc_fn != IFN_LAST)
	{
	  if (reduction_type == COND_REDUCTION)
	    {
	      /* An EQ stmt and an COND_EXPR stmt.  */
	      epilogue_cost += record_stmt_cost (cost_vec, 2,
						 vector_stmt, stmt_info, 0,
						 vect_epilogue);
	      /* Reduction of the max index and a reduction of the found
		 values.  */
	      epilogue_cost += record_stmt_cost (cost_vec, 2,
						 vec_to_scalar, stmt_info, 0,
						 vect_epilogue);
	      /* A broadcast of the max value.  */
	      epilogue_cost += record_stmt_cost (cost_vec, 1,
						 scalar_to_vec, stmt_info, 0,
						 vect_epilogue);
	    }
	  else
	    {
	      epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
						 stmt_info, 0, vect_epilogue);
	      epilogue_cost += record_stmt_cost (cost_vec, 1,
						 vec_to_scalar, stmt_info, 0,
						 vect_epilogue);
	    }
	}
      else if (reduction_type == COND_REDUCTION)
	{
	  unsigned estimated_nunits = vect_nunits_for_cost (vectype);
	  /* Extraction of scalar elements.  */
	  epilogue_cost += record_stmt_cost (cost_vec,
					     2 * estimated_nunits,
					     vec_to_scalar, stmt_info, 0,
					     vect_epilogue);
	  /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
	  epilogue_cost += record_stmt_cost (cost_vec,
					     2 * estimated_nunits - 3,
					     scalar_stmt, stmt_info, 0,
					     vect_epilogue);
	}
      else if (reduction_type == EXTRACT_LAST_REDUCTION
	       || reduction_type == FOLD_LEFT_REDUCTION)
	/* No extra instructions needed in the epilogue.  */
	;
      else
	{
	  int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
	  tree bitsize = TYPE_SIZE (op.type);
	  int element_bitsize = tree_to_uhwi (bitsize);
	  int nelements = vec_size_in_bits / element_bitsize;

	  if (op.code == COND_EXPR)
	    op.code = MAX_EXPR;

	  /* We have a whole vector shift available.  */
	  if (VECTOR_MODE_P (mode)
	      && directly_supported_p (op.code, vectype)
	      && have_whole_vector_shift (mode))
	    {
	      /* Final reduction via vector shifts and the reduction operator.
		 Also requires scalar extract.  Log2 halving steps, each a
		 shift plus a reduction op.  */
	      epilogue_cost += record_stmt_cost (cost_vec,
						 exact_log2 (nelements) * 2,
						 vector_stmt, stmt_info, 0,
						 vect_epilogue);
	      epilogue_cost += record_stmt_cost (cost_vec, 1,
						 vec_to_scalar, stmt_info, 0,
						 vect_epilogue);
	    }
	  else
	    /* Use extracts and reduction op for final reduction.  For N
	       elements, we have N extracts and N-1 reduction ops.  */
	    epilogue_cost += record_stmt_cost (cost_vec,
					       nelements + nelements - 1,
					       vector_stmt, stmt_info, 0,
					       vect_epilogue);
	}
    }

  if (dump_enabled_p ())
    dump_printf (MSG_NOTE,
		 "vect_model_reduction_cost: inside_cost = %d, "
		 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
		 prologue_cost, epilogue_cost);
}
5468
5469 /* SEQ is a sequence of instructions that initialize the reduction
5470 described by REDUC_INFO. Emit them in the appropriate place. */
5471
5472 static void
5473 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
5474 stmt_vec_info reduc_info, gimple *seq)
5475 {
5476 if (reduc_info->reused_accumulator)
5477 {
5478 /* When reusing an accumulator from the main loop, we only need
5479 initialization instructions if the main loop can be skipped.
5480 In that case, emit the initialization instructions at the end
5481 of the guard block that does the skip. */
5482 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5483 gcc_assert (skip_edge);
5484 gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
5485 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5486 }
5487 else
5488 {
5489 /* The normal case: emit the initialization instructions on the
5490 preheader edge. */
5491 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5492 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
5493 }
5494 }
5495
5496 /* Function get_initial_def_for_reduction
5497
5498 Input:
5499 REDUC_INFO - the info_for_reduction
5500 INIT_VAL - the initial value of the reduction variable
5501 NEUTRAL_OP - a value that has no effect on the reduction, as per
5502 neutral_op_for_reduction
5503
5504 Output:
5505 Return a vector variable, initialized according to the operation that
5506 STMT_VINFO performs. This vector will be used as the initial value
5507 of the vector of partial results.
5508
5509 The value we need is a vector in which element 0 has value INIT_VAL
5510 and every other element has value NEUTRAL_OP. */
5511
5512 static tree
5513 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
5514 stmt_vec_info reduc_info,
5515 tree init_val, tree neutral_op)
5516 {
5517 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5518 tree scalar_type = TREE_TYPE (init_val);
5519 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
5520 tree init_def;
5521 gimple_seq stmts = NULL;
5522
5523 gcc_assert (vectype);
5524
5525 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
5526 || SCALAR_FLOAT_TYPE_P (scalar_type));
5527
5528 gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
5529 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
5530
5531 if (operand_equal_p (init_val, neutral_op))
5532 {
5533 /* If both elements are equal then the vector described above is
5534 just a splat. */
5535 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5536 init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
5537 }
5538 else
5539 {
5540 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5541 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
5542 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
5543 {
5544 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
5545 element 0. */
5546 init_def = gimple_build_vector_from_val (&stmts, vectype,
5547 neutral_op);
5548 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
5549 vectype, init_def, init_val);
5550 }
5551 else
5552 {
5553 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
5554 tree_vector_builder elts (vectype, 1, 2);
5555 elts.quick_push (init_val);
5556 elts.quick_push (neutral_op);
5557 init_def = gimple_build_vector (&stmts, &elts);
5558 }
5559 }
5560
5561 if (stmts)
5562 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
5563 return init_def;
5564 }
5565
5566 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5567 which performs a reduction involving GROUP_SIZE scalar statements.
5568 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
5569 is nonnull, introducing extra elements of that value will not change the
5570 result. */
5571
static void
get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
				stmt_vec_info reduc_info,
				vec<tree> *vec_oprnds,
				unsigned int number_of_vectors,
				unsigned int group_size, tree neutral_op)
{
  vec<tree> &initial_values = reduc_info->reduc_initial_values;
  unsigned HOST_WIDE_INT nunits;
  unsigned j, number_of_places_left_in_vector;
  tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
  unsigned int i;

  /* Without a neutral value there must be one initial value per lane.  */
  gcc_assert (group_size == initial_values.length () || neutral_op);

  /* NUMBER_OF_COPIES is the number of times we need to use the same values in
     created vectors. It is greater than 1 if unrolling is performed.

     For example, we have two scalar operands, s1 and s2 (e.g., group of
     strided accesses of size two), while NUNITS is four (i.e., four scalars
     of this type can be packed in a vector). The output vector will contain
     two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
     will be 2).

     If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
     vectors containing the operands.

     For example, NUNITS is four as before, and the group size is 8
     (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
     {s5, s6, s7, s8}. */

  /* For variable-length vectors pretend each vector holds GROUP_SIZE
     elements; the neutral-op path below handles the trailing lanes.  */
  if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
    nunits = group_size;

  number_of_places_left_in_vector = nunits;
  bool constant_p = true;
  tree_vector_builder elts (vector_type, nunits, 1);
  elts.quick_grow (nunits);
  gimple_seq ctor_seq = NULL;
  /* Fill NUNITS lanes at a time, emitting one vector def whenever the
     builder is full.  */
  for (j = 0; j < nunits * number_of_vectors; ++j)
    {
      tree op;
      i = j % group_size;

      /* Get the def before the loop.  In reduction chain we have only
	 one initial value.  Else we have as many as PHIs in the group.  */
      if (i >= initial_values.length () || (j > i && neutral_op))
	op = neutral_op;
      else
	{
	  /* Cache the conversion in INITIAL_VALUES so later copies of
	     the same lane reuse it.  */
	  if (!useless_type_conversion_p (TREE_TYPE (vector_type),
					  TREE_TYPE (initial_values[i])))
	    initial_values[i] = gimple_convert (&ctor_seq,
						TREE_TYPE (vector_type),
						initial_values[i]);
	  op = initial_values[i];
	}

      /* Create 'vect_ = {op0,op1,...,opn}'.  */
      number_of_places_left_in_vector--;
      elts[nunits - number_of_places_left_in_vector - 1] = op;
      if (!CONSTANT_CLASS_P (op))
	constant_p = false;

      if (number_of_places_left_in_vector == 0)
	{
	  tree init;
	  if (constant_p && !neutral_op
	      ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
	      : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
	    /* Build the vector directly from ELTS.  */
	    init = gimple_build_vector (&ctor_seq, &elts);
	  else if (neutral_op)
	    {
	      /* Build a vector of the neutral value and shift the
		 other elements into place.  */
	      init = gimple_build_vector_from_val (&ctor_seq, vector_type,
						   neutral_op);
	      int k = nunits;
	      /* Trailing neutral lanes need no shift-insert.  */
	      while (k > 0 && elts[k - 1] == neutral_op)
		k -= 1;
	      while (k > 0)
		{
		  k -= 1;
		  init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
				       vector_type, init, elts[k]);
		}
	    }
	  else
	    {
	      /* First time round, duplicate ELTS to fill the
		 required number of vectors.  */
	      duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
					elts, number_of_vectors, *vec_oprnds);
	      /* duplicate_and_interleave produced all vectors at once,
		 so the fill loop is done.  */
	      break;
	    }
	  vec_oprnds->quick_push (init);

	  /* Restart the builder for the next vector def.  */
	  number_of_places_left_in_vector = nunits;
	  elts.new_vector (vector_type, nunits, 1);
	  elts.quick_grow (nunits);
	  constant_p = true;
	}
    }
  if (ctor_seq != NULL)
    vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
}
5679
5680 /* For a statement STMT_INFO taking part in a reduction operation return
5681 the stmt_vec_info the meta information is stored on. */
5682
5683 stmt_vec_info
5684 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5685 {
5686 stmt_info = vect_orig_stmt (stmt_info);
5687 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5688 if (!is_a <gphi *> (stmt_info->stmt)
5689 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5690 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5691 gphi *phi = as_a <gphi *> (stmt_info->stmt);
5692 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5693 {
5694 if (gimple_phi_num_args (phi) == 1)
5695 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5696 }
5697 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5698 {
5699 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5700 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5701 stmt_info = info;
5702 }
5703 return stmt_info;
5704 }
5705
5706 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5707 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5708 return false. */
5709
static bool
vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
				stmt_vec_info reduc_info)
{
  /* Only epilogue loops (which record their main loop) can reuse.  */
  loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
  if (!main_loop_vinfo)
    return false;

  /* Reuse is only implemented for plain tree-code reductions.  */
  if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
    return false;

  unsigned int num_phis = reduc_info->reduc_initial_values.length ();
  auto_vec<tree, 16> main_loop_results (num_phis);
  auto_vec<tree, 16> initial_values (num_phis);
  if (edge main_loop_edge = loop_vinfo->main_loop_edge)
    {
      /* The epilogue loop can be entered either from the main loop or
	 from an earlier guard block.  */
      edge skip_edge = loop_vinfo->skip_main_loop_edge;
      for (tree incoming_value : reduc_info->reduc_initial_values)
	{
	  /* Look for:

	       INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
				    INITIAL_VALUE(guard block)>.  */
	  gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);

	  gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
	  gcc_assert (gimple_bb (phi) == main_loop_edge->dest);

	  tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
	  tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);

	  main_loop_results.quick_push (from_main_loop);
	  initial_values.quick_push (from_skip);
	}
    }
  else
    /* The main loop dominates the epilogue loop.  */
    main_loop_results.splice (reduc_info->reduc_initial_values);

  /* See if the main loop has the kind of accumulator we need.  */
  vect_reusable_accumulator *accumulator
    = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
  if (!accumulator
      || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
      || !std::equal (main_loop_results.begin (), main_loop_results.end (),
		      accumulator->reduc_info->reduc_scalar_results.begin ()))
    return false;

  /* Handle the case where we can reduce wider vectors to narrower ones.  */
  tree vectype = STMT_VINFO_VECTYPE (reduc_info);
  tree old_vectype = TREE_TYPE (accumulator->reduc_input);
  unsigned HOST_WIDE_INT m;
  if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
			    TYPE_VECTOR_SUBPARTS (vectype), &m))
    return false;
  /* Check the intermediate vector types and operations are available.
     Each halving step needs the reduction operation on the intermediate
     type and a way of extracting the halves of the previous type.  */
  tree prev_vectype = old_vectype;
  poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
  while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
    {
      intermediate_nunits = exact_div (intermediate_nunits, 2);
      tree intermediate_vectype = get_related_vectype_for_scalar_type
	(TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
      if (!intermediate_vectype
	  || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
				    intermediate_vectype)
	  || !can_vec_extract (TYPE_MODE (prev_vectype),
			       TYPE_MODE (intermediate_vectype)))
	return false;
      prev_vectype = intermediate_vectype;
    }

  /* Non-SLP reductions might apply an adjustment after the reduction
     operation, in order to simplify the initialization of the accumulator.
     If the epilogue loop carries on from where the main loop left off,
     it should apply the same adjustment to the final reduction result.

     If the epilogue loop can also be entered directly (rather than via
     the main loop), we need to be able to handle that case in the same way,
     with the same adjustment.  (In principle we could add a PHI node
     to select the correct adjustment, but in practice that shouldn't be
     necessary.)  */
  tree main_adjustment
    = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
  if (loop_vinfo->main_loop_edge && main_adjustment)
    {
      gcc_assert (num_phis == 1);
      tree initial_value = initial_values[0];
      /* Check that we can use INITIAL_VALUE as the adjustment and
	 initialize the accumulator with a neutral value instead.  */
      if (!operand_equal_p (initial_value, main_adjustment))
	return false;
      code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
      initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
						    code, initial_value);
    }
  /* All checks passed: commit the reuse by rewriting REDUC_INFO's
     initial values and remembering the accumulator.  */
  STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
  reduc_info->reduc_initial_values.truncate (0);
  reduc_info->reduc_initial_values.splice (initial_values);
  reduc_info->reused_accumulator = accumulator;
  return true;
}
5814
5815 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5816 CODE emitting stmts before GSI. Returns a vector def of VECTYPE. */
5817
static tree
vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
			    gimple_seq *seq)
{
  /* Both vector types must have constant (fixed) unit counts here.  */
  unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
  unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
  tree stype = TREE_TYPE (vectype);
  tree new_temp = vec_def;
  /* Repeatedly split the current vector into two halves and combine
     them with CODE until we reach the width of VECTYPE.  */
  while (nunits > nunits1)
    {
      nunits /= 2;
      tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
							   stype, nunits);
      /* Size in bits of one half, and the bit offset of the high half.  */
      unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));

      /* The target has to make sure we support lowpart/highpart
	 extraction, either via direct vector extract or through
	 an integer mode punning.  */
      tree dst1, dst2;
      gimple *epilog_stmt;
      if (convert_optab_handler (vec_extract_optab,
				 TYPE_MODE (TREE_TYPE (new_temp)),
				 TYPE_MODE (vectype1))
	  != CODE_FOR_nothing)
	{
	  /* Extract sub-vectors directly once vec_extract becomes
	     a conversion optab.  */
	  dst1 = make_ssa_name (vectype1);
	  epilog_stmt
	    = gimple_build_assign (dst1, BIT_FIELD_REF,
				   build3 (BIT_FIELD_REF, vectype1,
					   new_temp, TYPE_SIZE (vectype1),
					   bitsize_int (0)));
	  gimple_seq_add_stmt_without_update (seq, epilog_stmt);
	  dst2 = make_ssa_name (vectype1);
	  epilog_stmt
	    = gimple_build_assign (dst2, BIT_FIELD_REF,
				   build3 (BIT_FIELD_REF, vectype1,
					   new_temp, TYPE_SIZE (vectype1),
					   bitsize_int (bitsize)));
	  gimple_seq_add_stmt_without_update (seq, epilog_stmt);
	}
      else
	{
	  /* Extract via punning to appropriately sized integer mode
	     vector.  */
	  tree eltype = build_nonstandard_integer_type (bitsize, 1);
	  tree etype = build_vector_type (eltype, 2);
	  gcc_assert (convert_optab_handler (vec_extract_optab,
					     TYPE_MODE (etype),
					     TYPE_MODE (eltype))
		      != CODE_FOR_nothing);
	  /* View-convert the whole vector to a two-element integer
	     vector, then extract each integer element and view-convert
	     it back to the half vector type.  */
	  tree tem = make_ssa_name (etype);
	  epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
					     build1 (VIEW_CONVERT_EXPR,
						     etype, new_temp));
	  gimple_seq_add_stmt_without_update (seq, epilog_stmt);
	  new_temp = tem;
	  tem = make_ssa_name (eltype);
	  epilog_stmt
	    = gimple_build_assign (tem, BIT_FIELD_REF,
				   build3 (BIT_FIELD_REF, eltype,
					   new_temp, TYPE_SIZE (eltype),
					   bitsize_int (0)));
	  gimple_seq_add_stmt_without_update (seq, epilog_stmt);
	  dst1 = make_ssa_name (vectype1);
	  epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
					     build1 (VIEW_CONVERT_EXPR,
						     vectype1, tem));
	  gimple_seq_add_stmt_without_update (seq, epilog_stmt);
	  tem = make_ssa_name (eltype);
	  epilog_stmt
	    = gimple_build_assign (tem, BIT_FIELD_REF,
				   build3 (BIT_FIELD_REF, eltype,
					   new_temp, TYPE_SIZE (eltype),
					   bitsize_int (bitsize)));
	  gimple_seq_add_stmt_without_update (seq, epilog_stmt);
	  dst2 = make_ssa_name (vectype1);
	  epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
					     build1 (VIEW_CONVERT_EXPR,
						     vectype1, tem));
	  gimple_seq_add_stmt_without_update (seq, epilog_stmt);
	}

      /* Combine the two halves with the reduction operation.  */
      new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
    }

  return new_temp;
}
5907
5908 /* Function vect_create_epilog_for_reduction
5909
5910 Create code at the loop-epilog to finalize the result of a reduction
5911 computation.
5912
5913 STMT_INFO is the scalar reduction stmt that is being vectorized.
5914 SLP_NODE is an SLP node containing a group of reduction statements. The
5915 first one in this group is STMT_INFO.
5916 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5917 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5918 (counting from 0)
5919 LOOP_EXIT is the edge to update in the merge block. In the case of a single
5920 exit this edge is always the main loop exit.
5921
5922 This function:
5923 1. Completes the reduction def-use cycles.
5924 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5925 by calling the function specified by REDUC_FN if available, or by
5926 other means (whole-vector shifts or a scalar loop).
5927 The function also creates a new phi node at the loop exit to preserve
5928 loop-closed form, as illustrated below.
5929
5930 The flow at the entry to this function:
5931
5932 loop:
5933 vec_def = phi <vec_init, null> # REDUCTION_PHI
5934 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5935 s_loop = scalar_stmt # (scalar) STMT_INFO
5936 loop_exit:
5937 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5938 use <s_out0>
5939 use <s_out0>
5940
5941 The above is transformed by this function into:
5942
5943 loop:
5944 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5945 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5946 s_loop = scalar_stmt # (scalar) STMT_INFO
5947 loop_exit:
5948 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5949 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5950 v_out2 = reduce <v_out1>
5951 s_out3 = extract_field <v_out2, 0>
5952 s_out4 = adjust_result <s_out3>
5953 use <s_out4>
5954 use <s_out4>
5955 */
5956
5957 static void
5958 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5959 stmt_vec_info stmt_info,
5960 slp_tree slp_node,
5961 slp_instance slp_node_instance,
5962 edge loop_exit)
5963 {
5964 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5965 gcc_assert (reduc_info->is_reduc_info);
5966 /* For double reductions we need to get at the inner loop reduction
5967 stmt which has the meta info attached. Our stmt_info is that of the
5968 loop-closed PHI of the inner loop which we remember as
5969 def for the reduction PHI generation. */
5970 bool double_reduc = false;
5971 stmt_vec_info rdef_info = stmt_info;
5972 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5973 {
5974 gcc_assert (!slp_node);
5975 double_reduc = true;
5976 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5977 (stmt_info->stmt, 0));
5978 stmt_info = vect_stmt_to_vectorize (stmt_info);
5979 }
5980 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5981 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5982 tree vectype;
5983 machine_mode mode;
5984 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5985 basic_block exit_bb;
5986 tree scalar_dest;
5987 tree scalar_type;
5988 gimple *new_phi = NULL, *phi = NULL;
5989 gimple_stmt_iterator exit_gsi;
5990 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5991 gimple *epilog_stmt = NULL;
5992 gimple *exit_phi;
5993 tree bitsize;
5994 tree def;
5995 tree orig_name, scalar_result;
5996 imm_use_iterator imm_iter, phi_imm_iter;
5997 use_operand_p use_p, phi_use_p;
5998 gimple *use_stmt;
5999 auto_vec<tree> reduc_inputs;
6000 int j, i;
6001 vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
6002 unsigned int group_size = 1, k;
6003 /* SLP reduction without reduction chain, e.g.,
6004 # a1 = phi <a2, a0>
6005 # b1 = phi <b2, b0>
6006 a2 = operation (a1)
6007 b2 = operation (b1) */
6008 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
6009 bool direct_slp_reduc;
6010 tree induction_index = NULL_TREE;
6011
6012 if (slp_node)
6013 group_size = SLP_TREE_LANES (slp_node);
6014
6015 if (nested_in_vect_loop_p (loop, stmt_info))
6016 {
6017 outer_loop = loop;
6018 loop = loop->inner;
6019 gcc_assert (!slp_node && double_reduc);
6020 }
6021
6022 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
6023 gcc_assert (vectype);
6024 mode = TYPE_MODE (vectype);
6025
6026 tree induc_val = NULL_TREE;
6027 tree adjustment_def = NULL;
6028 if (slp_node)
6029 ;
6030 else
6031 {
6032 /* Optimize: for induction condition reduction, if we can't use zero
6033 for induc_val, use initial_def. */
6034 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6035 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
6036 else if (double_reduc)
6037 ;
6038 else
6039 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
6040 }
6041
6042 stmt_vec_info single_live_out_stmt[] = { stmt_info };
6043 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
6044 if (slp_reduc)
6045 /* All statements produce live-out values. */
6046 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
6047
6048 unsigned vec_num;
6049 int ncopies;
6050 if (slp_node)
6051 {
6052 vec_num = SLP_TREE_VEC_DEFS (slp_node_instance->reduc_phis).length ();
6053 ncopies = 1;
6054 }
6055 else
6056 {
6057 vec_num = 1;
6058 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
6059 }
6060
6061 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
6062 which is updated with the current index of the loop for every match of
6063 the original loop's cond_expr (VEC_STMT). This results in a vector
6064 containing the last time the condition passed for that vector lane.
6065 The first match will be a 1 to allow 0 to be used for non-matching
6066 indexes. If there are no matches at all then the vector will be all
6067 zeroes.
6068
6069 PR92772: This algorithm is broken for architectures that support
6070 masked vectors, but do not provide fold_extract_last. */
6071 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
6072 {
6073 auto_vec<std::pair<tree, bool>, 2> ccompares;
6074 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
6075 cond_info = vect_stmt_to_vectorize (cond_info);
6076 while (cond_info != reduc_info)
6077 {
6078 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
6079 {
6080 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
6081 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
6082 ccompares.safe_push
6083 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
6084 STMT_VINFO_REDUC_IDX (cond_info) == 2));
6085 }
6086 cond_info
6087 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
6088 1 + STMT_VINFO_REDUC_IDX
6089 (cond_info)));
6090 cond_info = vect_stmt_to_vectorize (cond_info);
6091 }
6092 gcc_assert (ccompares.length () != 0);
6093
6094 tree indx_before_incr, indx_after_incr;
6095 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
6096 int scalar_precision
6097 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
6098 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
6099 tree cr_index_vector_type = get_related_vectype_for_scalar_type
6100 (TYPE_MODE (vectype), cr_index_scalar_type,
6101 TYPE_VECTOR_SUBPARTS (vectype));
6102
6103 /* First we create a simple vector induction variable which starts
6104 with the values {1,2,3,...} (SERIES_VECT) and increments by the
6105 vector size (STEP). */
6106
6107 /* Create a {1,2,3,...} vector. */
6108 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
6109
6110 /* Create a vector of the step value. */
6111 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6112 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6113
6114 /* Create an induction variable. */
6115 gimple_stmt_iterator incr_gsi;
6116 bool insert_after;
6117 vect_iv_increment_position (loop_exit, &incr_gsi, &insert_after);
6118 create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
6119 insert_after, &indx_before_incr, &indx_after_incr);
6120
6121 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6122 filled with zeros (VEC_ZERO). */
6123
6124 /* Create a vector of 0s. */
6125 tree zero = build_zero_cst (cr_index_scalar_type);
6126 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6127
6128 /* Create a vector phi node. */
6129 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6130 new_phi = create_phi_node (new_phi_tree, loop->header);
6131 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
6132 loop_preheader_edge (loop), UNKNOWN_LOCATION);
6133
6134 /* Now take the condition from the loops original cond_exprs
6135 and produce a new cond_exprs (INDEX_COND_EXPR) which for
6136 every match uses values from the induction variable
6137 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
6138 (NEW_PHI_TREE).
6139 Finally, we update the phi (NEW_PHI_TREE) to take the value of
6140 the new cond_expr (INDEX_COND_EXPR). */
6141 gimple_seq stmts = NULL;
6142 for (int i = ccompares.length () - 1; i != -1; --i)
6143 {
6144 tree ccompare = ccompares[i].first;
6145 if (ccompares[i].second)
6146 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6147 cr_index_vector_type,
6148 ccompare,
6149 indx_before_incr, new_phi_tree);
6150 else
6151 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
6152 cr_index_vector_type,
6153 ccompare,
6154 new_phi_tree, indx_before_incr);
6155 }
6156 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
6157
6158 /* Update the phi with the vec cond. */
6159 induction_index = new_phi_tree;
6160 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
6161 loop_latch_edge (loop), UNKNOWN_LOCATION);
6162 }
6163
6164 /* 2. Create epilog code.
6165 The reduction epilog code operates across the elements of the vector
6166 of partial results computed by the vectorized loop.
6167 The reduction epilog code consists of:
6168
6169 step 1: compute the scalar result in a vector (v_out2)
6170 step 2: extract the scalar result (s_out3) from the vector (v_out2)
6171 step 3: adjust the scalar result (s_out3) if needed.
6172
6173 Step 1 can be accomplished using one the following three schemes:
6174 (scheme 1) using reduc_fn, if available.
6175 (scheme 2) using whole-vector shifts, if available.
6176 (scheme 3) using a scalar loop. In this case steps 1+2 above are
6177 combined.
6178
6179 The overall epilog code looks like this:
6180
6181 s_out0 = phi <s_loop> # original EXIT_PHI
6182 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6183 v_out2 = reduce <v_out1> # step 1
6184 s_out3 = extract_field <v_out2, 0> # step 2
6185 s_out4 = adjust_result <s_out3> # step 3
6186
6187 (step 3 is optional, and steps 1 and 2 may be combined).
6188 Lastly, the uses of s_out0 are replaced by s_out4. */
6189
6190
6191 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
6192 v_out1 = phi <VECT_DEF>
6193 Store them in NEW_PHIS. */
6194 if (double_reduc)
6195 loop = outer_loop;
6196 /* We need to reduce values in all exits. */
6197 exit_bb = loop_exit->dest;
6198 exit_gsi = gsi_after_labels (exit_bb);
6199 reduc_inputs.create (slp_node ? vec_num : ncopies);
6200 for (unsigned i = 0; i < vec_num; i++)
6201 {
6202 gimple_seq stmts = NULL;
6203 if (slp_node)
6204 def = vect_get_slp_vect_def (slp_node, i);
6205 else
6206 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
6207 for (j = 0; j < ncopies; j++)
6208 {
6209 tree new_def = copy_ssa_name (def);
6210 phi = create_phi_node (new_def, exit_bb);
6211 if (j)
6212 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
6213 if (LOOP_VINFO_IV_EXIT (loop_vinfo) == loop_exit)
6214 SET_PHI_ARG_DEF (phi, loop_exit->dest_idx, def);
6215 else
6216 {
6217 for (unsigned k = 0; k < gimple_phi_num_args (phi); k++)
6218 SET_PHI_ARG_DEF (phi, k, def);
6219 }
6220 new_def = gimple_convert (&stmts, vectype, new_def);
6221 reduc_inputs.quick_push (new_def);
6222 }
6223 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6224 }
6225
6226 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
6227 (i.e. when reduc_fn is not available) and in the final adjustment
6228 code (if needed). Also get the original scalar reduction variable as
6229 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
6230 represents a reduction pattern), the tree-code and scalar-def are
6231 taken from the original stmt that the pattern-stmt (STMT) replaces.
6232 Otherwise (it is a regular reduction) - the tree-code and scalar-def
6233 are taken from STMT. */
6234
6235 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6236 if (orig_stmt_info != stmt_info)
6237 {
6238 /* Reduction pattern */
6239 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6240 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
6241 }
6242
6243 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
6244 scalar_type = TREE_TYPE (scalar_dest);
6245 scalar_results.truncate (0);
6246 scalar_results.reserve_exact (group_size);
6247 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
6248 bitsize = TYPE_SIZE (scalar_type);
6249
6250 /* True if we should implement SLP_REDUC using native reduction operations
6251 instead of scalar operations. */
6252 direct_slp_reduc = (reduc_fn != IFN_LAST
6253 && slp_reduc
6254 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
6255
6256 /* In case of reduction chain, e.g.,
6257 # a1 = phi <a3, a0>
6258 a2 = operation (a1)
6259 a3 = operation (a2),
6260
6261 we may end up with more than one vector result. Here we reduce them
6262 to one vector.
6263
6264 The same is true for a SLP reduction, e.g.,
6265 # a1 = phi <a2, a0>
6266 # b1 = phi <b2, b0>
6267 a2 = operation (a1)
6268 b2 = operation (a2),
6269
6270 where we can end up with more than one vector as well. We can
6271 easily accumulate vectors when the number of vector elements is
6272 a multiple of the SLP group size.
6273
6274 The same is true if we couldn't use a single defuse cycle. */
6275 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6276 || direct_slp_reduc
6277 || (slp_reduc
6278 && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
6279 || ncopies > 1)
6280 {
6281 gimple_seq stmts = NULL;
6282 tree single_input = reduc_inputs[0];
6283 for (k = 1; k < reduc_inputs.length (); k++)
6284 single_input = gimple_build (&stmts, code, vectype,
6285 single_input, reduc_inputs[k]);
6286 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6287
6288 reduc_inputs.truncate (0);
6289 reduc_inputs.safe_push (single_input);
6290 }
6291
6292 tree orig_reduc_input = reduc_inputs[0];
6293
6294 /* If this loop is an epilogue loop that can be skipped after the
6295 main loop, we can only share a reduction operation between the
6296 main loop and the epilogue if we put it at the target of the
6297 skip edge.
6298
6299 We can still reuse accumulators if this check fails. Doing so has
6300 the minor(?) benefit of making the epilogue loop's scalar result
6301 independent of the main loop's scalar result. */
6302 bool unify_with_main_loop_p = false;
6303 if (reduc_info->reused_accumulator
6304 && loop_vinfo->skip_this_loop_edge
6305 && single_succ_p (exit_bb)
6306 && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
6307 {
6308 unify_with_main_loop_p = true;
6309
6310 basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
6311 reduc_inputs[0] = make_ssa_name (vectype);
6312 gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
6313 add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
6314 UNKNOWN_LOCATION);
6315 add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
6316 loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
6317 exit_gsi = gsi_after_labels (reduc_block);
6318 }
6319
6320 /* Shouldn't be used beyond this point. */
6321 exit_bb = nullptr;
6322
6323 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6324 && reduc_fn != IFN_LAST)
6325 {
6326 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
6327 various data values where the condition matched and another vector
6328 (INDUCTION_INDEX) containing all the indexes of those matches. We
6329 need to extract the last matching index (which will be the index with
6330 highest value) and use this to index into the data vector.
6331 For the case where there were no matches, the data vector will contain
6332 all default values and the index vector will be all zeros. */
6333
6334 /* Get various versions of the type of the vector of indexes. */
6335 tree index_vec_type = TREE_TYPE (induction_index);
6336 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
6337 tree index_scalar_type = TREE_TYPE (index_vec_type);
6338 tree index_vec_cmp_type = truth_type_for (index_vec_type);
6339
6340 /* Get an unsigned integer version of the type of the data vector. */
6341 int scalar_precision
6342 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6343 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
6344 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
6345 vectype);
6346
6347 /* First we need to create a vector (ZERO_VEC) of zeros and another
6348 vector (MAX_INDEX_VEC) filled with the last matching index, which we
6349 can create using a MAX reduction and then expanding.
6350 In the case where the loop never made any matches, the max index will
6351 be zero. */
6352
6353 /* Vector of {0, 0, 0,...}. */
6354 tree zero_vec = build_zero_cst (vectype);
6355
6356 /* Find maximum value from the vector of found indexes. */
6357 tree max_index = make_ssa_name (index_scalar_type);
6358 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6359 1, induction_index);
6360 gimple_call_set_lhs (max_index_stmt, max_index);
6361 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
6362
6363 /* Vector of {max_index, max_index, max_index,...}. */
6364 tree max_index_vec = make_ssa_name (index_vec_type);
6365 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
6366 max_index);
6367 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
6368 max_index_vec_rhs);
6369 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
6370
6371 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
6372 with the vector (INDUCTION_INDEX) of found indexes, choosing values
6373 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
6374 otherwise. Only one value should match, resulting in a vector
6375 (VEC_COND) with one data value and the rest zeros.
6376 In the case where the loop never made any matches, every index will
6377 match, resulting in a vector with all data values (which will all be
6378 the default value). */
6379
6380 /* Compare the max index vector to the vector of found indexes to find
6381 the position of the max value. */
6382 tree vec_compare = make_ssa_name (index_vec_cmp_type);
6383 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
6384 induction_index,
6385 max_index_vec);
6386 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
6387
6388 /* Use the compare to choose either values from the data vector or
6389 zero. */
6390 tree vec_cond = make_ssa_name (vectype);
6391 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
6392 vec_compare,
6393 reduc_inputs[0],
6394 zero_vec);
6395 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
6396
6397 /* Finally we need to extract the data value from the vector (VEC_COND)
6398 into a scalar (MATCHED_DATA_REDUC). Logically we want to do a OR
6399 reduction, but because this doesn't exist, we can use a MAX reduction
6400 instead. The data value might be signed or a float so we need to cast
6401 it first.
6402 In the case where the loop never made any matches, the data values are
6403 all identical, and so will reduce down correctly. */
6404
6405 /* Make the matched data values unsigned. */
6406 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
6407 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
6408 vec_cond);
6409 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
6410 VIEW_CONVERT_EXPR,
6411 vec_cond_cast_rhs);
6412 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
6413
6414 /* Reduce down to a scalar value. */
6415 tree data_reduc = make_ssa_name (scalar_type_unsigned);
6416 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6417 1, vec_cond_cast);
6418 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
6419 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
6420
6421 /* Convert the reduced value back to the result type and set as the
6422 result. */
6423 gimple_seq stmts = NULL;
6424 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
6425 data_reduc);
6426 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6427 scalar_results.safe_push (new_temp);
6428 }
6429 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6430 && reduc_fn == IFN_LAST)
6431 {
6432 /* Condition reduction without supported IFN_REDUC_MAX. Generate
6433 idx = 0;
6434 idx_val = induction_index[0];
6435 val = data_reduc[0];
6436 for (idx = 0, val = init, i = 0; i < nelts; ++i)
6437 if (induction_index[i] > idx_val)
6438 val = data_reduc[i], idx_val = induction_index[i];
6439 return val; */
6440
6441 tree data_eltype = TREE_TYPE (vectype);
6442 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
6443 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
6444 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
6445 /* Enforced by vectorizable_reduction, which ensures we have target
6446 support before allowing a conditional reduction on variable-length
6447 vectors. */
6448 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
6449 tree idx_val = NULL_TREE, val = NULL_TREE;
6450 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
6451 {
6452 tree old_idx_val = idx_val;
6453 tree old_val = val;
6454 idx_val = make_ssa_name (idx_eltype);
6455 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
6456 build3 (BIT_FIELD_REF, idx_eltype,
6457 induction_index,
6458 bitsize_int (el_size),
6459 bitsize_int (off)));
6460 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6461 val = make_ssa_name (data_eltype);
6462 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
6463 build3 (BIT_FIELD_REF,
6464 data_eltype,
6465 reduc_inputs[0],
6466 bitsize_int (el_size),
6467 bitsize_int (off)));
6468 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6469 if (off != 0)
6470 {
6471 tree new_idx_val = idx_val;
6472 if (off != v_size - el_size)
6473 {
6474 new_idx_val = make_ssa_name (idx_eltype);
6475 epilog_stmt = gimple_build_assign (new_idx_val,
6476 MAX_EXPR, idx_val,
6477 old_idx_val);
6478 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6479 }
6480 tree cond = make_ssa_name (boolean_type_node);
6481 epilog_stmt = gimple_build_assign (cond, GT_EXPR,
6482 idx_val, old_idx_val);
6483 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6484 tree new_val = make_ssa_name (data_eltype);
6485 epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
6486 cond, val, old_val);
6487 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6488 idx_val = new_idx_val;
6489 val = new_val;
6490 }
6491 }
6492 /* Convert the reduced value back to the result type and set as the
6493 result. */
6494 gimple_seq stmts = NULL;
6495 val = gimple_convert (&stmts, scalar_type, val);
6496 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6497 scalar_results.safe_push (val);
6498 }
6499
6500 /* 2.3 Create the reduction code, using one of the three schemes described
6501 above. In SLP we simply need to extract all the elements from the
6502 vector (without reducing them), so we use scalar shifts. */
6503 else if (reduc_fn != IFN_LAST && !slp_reduc)
6504 {
6505 tree tmp;
6506 tree vec_elem_type;
6507
6508 /* Case 1: Create:
6509 v_out2 = reduc_expr <v_out1> */
6510
6511 if (dump_enabled_p ())
6512 dump_printf_loc (MSG_NOTE, vect_location,
6513 "Reduce using direct vector reduction.\n");
6514
6515 gimple_seq stmts = NULL;
6516 vec_elem_type = TREE_TYPE (vectype);
6517 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
6518 vec_elem_type, reduc_inputs[0]);
6519 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6520 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6521
6522 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6523 && induc_val)
6524 {
6525 /* Earlier we set the initial value to be a vector if induc_val
6526 values. Check the result and if it is induc_val then replace
6527 with the original initial value, unless induc_val is
6528 the same as initial_def already. */
6529 tree zcompare = make_ssa_name (boolean_type_node);
6530 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6531 new_temp, induc_val);
6532 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6533 tree initial_def = reduc_info->reduc_initial_values[0];
6534 tmp = make_ssa_name (new_scalar_dest);
6535 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6536 initial_def, new_temp);
6537 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6538 new_temp = tmp;
6539 }
6540
6541 scalar_results.safe_push (new_temp);
6542 }
6543 else if (direct_slp_reduc)
6544 {
6545 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
6546 with the elements for other SLP statements replaced with the
6547 neutral value. We can then do a normal reduction on each vector. */
6548
6549 /* Enforced by vectorizable_reduction. */
6550 gcc_assert (reduc_inputs.length () == 1);
6551 gcc_assert (pow2p_hwi (group_size));
6552
6553 gimple_seq seq = NULL;
6554
6555 /* Build a vector {0, 1, 2, ...}, with the same number of elements
6556 and the same element size as VECTYPE. */
6557 tree index = build_index_vector (vectype, 0, 1);
6558 tree index_type = TREE_TYPE (index);
6559 tree index_elt_type = TREE_TYPE (index_type);
6560 tree mask_type = truth_type_for (index_type);
6561
6562 /* Create a vector that, for each element, identifies which of
6563 the REDUC_GROUP_SIZE results should use it. */
6564 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
6565 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
6566 build_vector_from_val (index_type, index_mask));
6567
6568 /* Get a neutral vector value. This is simply a splat of the neutral
6569 scalar value if we have one, otherwise the initial scalar value
6570 is itself a neutral value. */
6571 tree vector_identity = NULL_TREE;
6572 tree neutral_op = NULL_TREE;
6573 if (slp_node)
6574 {
6575 tree initial_value = NULL_TREE;
6576 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6577 initial_value = reduc_info->reduc_initial_values[0];
6578 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6579 initial_value, false);
6580 }
6581 if (neutral_op)
6582 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6583 neutral_op);
6584 for (unsigned int i = 0; i < group_size; ++i)
6585 {
	     /* If there's no universal neutral value, we can use the
6587 initial scalar value from the original PHI. This is used
6588 for MIN and MAX reduction, for example. */
6589 if (!neutral_op)
6590 {
6591 tree scalar_value = reduc_info->reduc_initial_values[i];
6592 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6593 scalar_value);
6594 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6595 scalar_value);
6596 }
6597
6598 /* Calculate the equivalent of:
6599
6600 sel[j] = (index[j] == i);
6601
6602 which selects the elements of REDUC_INPUTS[0] that should
6603 be included in the result. */
6604 tree compare_val = build_int_cst (index_elt_type, i);
6605 compare_val = build_vector_from_val (index_type, compare_val);
6606 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6607 index, compare_val);
6608
6609 /* Calculate the equivalent of:
6610
		 vec = sel ? reduc_inputs[0] : vector_identity;
6612
6613 VEC is now suitable for a full vector reduction. */
6614 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6615 sel, reduc_inputs[0], vector_identity);
6616
6617 /* Do the reduction and convert it to the appropriate type. */
6618 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6619 TREE_TYPE (vectype), vec);
6620 scalar = gimple_convert (&seq, scalar_type, scalar);
6621 scalar_results.safe_push (scalar);
6622 }
6623 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
6624 }
6625 else
6626 {
6627 bool reduce_with_shift;
6628 tree vec_temp;
6629
6630 gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6631
6632 /* See if the target wants to do the final (shift) reduction
6633 in a vector mode of smaller size and first reduce upper/lower
6634 halves against each other. */
6635 enum machine_mode mode1 = mode;
6636 tree stype = TREE_TYPE (vectype);
6637 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6638 unsigned nunits1 = nunits;
6639 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6640 && reduc_inputs.length () == 1)
6641 {
6642 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6643 /* For SLP reductions we have to make sure lanes match up, but
6644 since we're doing individual element final reduction reducing
6645 vector width here is even more important.
6646 ??? We can also separate lanes with permutes, for the common
6647 case of power-of-two group-size odd/even extracts would work. */
6648 if (slp_reduc && nunits != nunits1)
6649 {
6650 nunits1 = least_common_multiple (nunits1, group_size);
6651 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6652 }
6653 }
6654 if (!slp_reduc
6655 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6656 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6657
6658 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6659 stype, nunits1);
6660 reduce_with_shift = have_whole_vector_shift (mode1);
6661 if (!VECTOR_MODE_P (mode1)
6662 || !directly_supported_p (code, vectype1))
6663 reduce_with_shift = false;
6664
6665 /* First reduce the vector to the desired vector size we should
6666 do shift reduction on by combining upper and lower halves. */
6667 gimple_seq stmts = NULL;
6668 new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6669 code, &stmts);
6670 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6671 reduc_inputs[0] = new_temp;
6672
6673 if (reduce_with_shift && !slp_reduc)
6674 {
6675 int element_bitsize = tree_to_uhwi (bitsize);
6676 /* Enforced by vectorizable_reduction, which disallows SLP reductions
6677 for variable-length vectors and also requires direct target support
6678 for loop reductions. */
6679 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6680 int nelements = vec_size_in_bits / element_bitsize;
6681 vec_perm_builder sel;
6682 vec_perm_indices indices;
6683
6684 int elt_offset;
6685
6686 tree zero_vec = build_zero_cst (vectype1);
6687 /* Case 2: Create:
6688 for (offset = nelements/2; offset >= 1; offset/=2)
6689 {
6690 Create: va' = vec_shift <va, offset>
6691 Create: va = vop <va, va'>
6692 } */
6693
6694 tree rhs;
6695
6696 if (dump_enabled_p ())
6697 dump_printf_loc (MSG_NOTE, vect_location,
6698 "Reduce using vector shifts\n");
6699
6700 gimple_seq stmts = NULL;
6701 new_temp = gimple_convert (&stmts, vectype1, new_temp);
6702 for (elt_offset = nelements / 2;
6703 elt_offset >= 1;
6704 elt_offset /= 2)
6705 {
6706 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6707 indices.new_vector (sel, 2, nelements);
6708 tree mask = vect_gen_perm_mask_any (vectype1, indices);
6709 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6710 new_temp, zero_vec, mask);
6711 new_temp = gimple_build (&stmts, code,
6712 vectype1, new_name, new_temp);
6713 }
6714 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6715
6716 /* 2.4 Extract the final scalar result. Create:
6717 s_out3 = extract_field <v_out2, bitpos> */
6718
6719 if (dump_enabled_p ())
6720 dump_printf_loc (MSG_NOTE, vect_location,
6721 "extract scalar result\n");
6722
6723 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6724 bitsize, bitsize_zero_node);
6725 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6726 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6727 gimple_assign_set_lhs (epilog_stmt, new_temp);
6728 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6729 scalar_results.safe_push (new_temp);
6730 }
6731 else
6732 {
6733 /* Case 3: Create:
6734 s = extract_field <v_out2, 0>
6735 for (offset = element_size;
6736 offset < vector_size;
6737 offset += element_size;)
6738 {
6739 Create: s' = extract_field <v_out2, offset>
6740 Create: s = op <s, s'> // For non SLP cases
6741 } */
6742
6743 if (dump_enabled_p ())
6744 dump_printf_loc (MSG_NOTE, vect_location,
6745 "Reduce using scalar code.\n");
6746
6747 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6748 int element_bitsize = tree_to_uhwi (bitsize);
6749 tree compute_type = TREE_TYPE (vectype);
6750 gimple_seq stmts = NULL;
6751 FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6752 {
6753 int bit_offset;
6754 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6755 vec_temp, bitsize, bitsize_zero_node);
6756
6757 /* In SLP we don't need to apply reduction operation, so we just
6758 collect s' values in SCALAR_RESULTS. */
6759 if (slp_reduc)
6760 scalar_results.safe_push (new_temp);
6761
6762 for (bit_offset = element_bitsize;
6763 bit_offset < vec_size_in_bits;
6764 bit_offset += element_bitsize)
6765 {
6766 tree bitpos = bitsize_int (bit_offset);
6767 new_name = gimple_build (&stmts, BIT_FIELD_REF,
6768 compute_type, vec_temp,
6769 bitsize, bitpos);
6770 if (slp_reduc)
6771 {
6772 /* In SLP we don't need to apply reduction operation, so
6773 we just collect s' values in SCALAR_RESULTS. */
6774 new_temp = new_name;
6775 scalar_results.safe_push (new_name);
6776 }
6777 else
6778 new_temp = gimple_build (&stmts, code, compute_type,
6779 new_name, new_temp);
6780 }
6781 }
6782
6783 /* The only case where we need to reduce scalar results in SLP, is
6784 unrolling. If the size of SCALAR_RESULTS is greater than
6785 REDUC_GROUP_SIZE, we reduce them combining elements modulo
6786 REDUC_GROUP_SIZE. */
6787 if (slp_reduc)
6788 {
6789 tree res, first_res, new_res;
6790
6791 /* Reduce multiple scalar results in case of SLP unrolling. */
6792 for (j = group_size; scalar_results.iterate (j, &res);
6793 j++)
6794 {
6795 first_res = scalar_results[j % group_size];
6796 new_res = gimple_build (&stmts, code, compute_type,
6797 first_res, res);
6798 scalar_results[j % group_size] = new_res;
6799 }
6800 scalar_results.truncate (group_size);
6801 for (k = 0; k < group_size; k++)
6802 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6803 scalar_results[k]);
6804 }
6805 else
6806 {
6807 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
6808 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6809 scalar_results.safe_push (new_temp);
6810 }
6811
6812 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6813 }
6814
6815 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6816 && induc_val)
6817 {
6818 /* Earlier we set the initial value to be a vector if induc_val
6819 values. Check the result and if it is induc_val then replace
6820 with the original initial value, unless induc_val is
6821 the same as initial_def already. */
6822 tree zcompare = make_ssa_name (boolean_type_node);
6823 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp,
6824 induc_val);
6825 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6826 tree initial_def = reduc_info->reduc_initial_values[0];
6827 tree tmp = make_ssa_name (new_scalar_dest);
6828 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6829 initial_def, new_temp);
6830 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6831 scalar_results[0] = tmp;
6832 }
6833 }
6834
6835 /* 2.5 Adjust the final result by the initial value of the reduction
6836 variable. (When such adjustment is not needed, then
6837 'adjustment_def' is zero). For example, if code is PLUS we create:
6838 new_temp = loop_exit_def + adjustment_def */
6839
6840 if (adjustment_def)
6841 {
6842 gcc_assert (!slp_reduc);
6843 gimple_seq stmts = NULL;
6844 if (double_reduc)
6845 {
6846 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6847 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6848 new_temp = gimple_build (&stmts, code, vectype,
6849 reduc_inputs[0], adjustment_def);
6850 }
6851 else
6852 {
6853 new_temp = scalar_results[0];
6854 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6855 adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6856 adjustment_def);
6857 new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6858 new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6859 new_temp, adjustment_def);
6860 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6861 }
6862
6863 epilog_stmt = gimple_seq_last_stmt (stmts);
6864 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6865 scalar_results[0] = new_temp;
6866 }
6867
6868 /* Record this operation if it could be reused by the epilogue loop. */
6869 if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6870 && reduc_inputs.length () == 1)
6871 loop_vinfo->reusable_accumulators.put (scalar_results[0],
6872 { orig_reduc_input, reduc_info });
6873
6874 if (double_reduc)
6875 loop = outer_loop;
6876
6877 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6878 phis with new adjusted scalar results, i.e., replace use <s_out0>
6879 with use <s_out4>.
6880
6881 Transform:
6882 loop_exit:
6883 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6884 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6885 v_out2 = reduce <v_out1>
6886 s_out3 = extract_field <v_out2, 0>
6887 s_out4 = adjust_result <s_out3>
6888 use <s_out0>
6889 use <s_out0>
6890
6891 into:
6892
6893 loop_exit:
6894 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6895 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6896 v_out2 = reduce <v_out1>
6897 s_out3 = extract_field <v_out2, 0>
6898 s_out4 = adjust_result <s_out3>
6899 use <s_out4>
6900 use <s_out4> */
6901
6902 gcc_assert (live_out_stmts.size () == scalar_results.length ());
6903 auto_vec<gimple *> phis;
6904 for (k = 0; k < live_out_stmts.size (); k++)
6905 {
6906 stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6907 scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6908
6909 /* Find the loop-closed-use at the loop exit of the original scalar
6910 result. (The reduction result is expected to have two immediate uses,
6911 one at the latch block, and one at the loop exit). For double
6912 reductions we are looking for exit phis of the outer loop. */
6913 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6914 {
6915 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6916 {
6917 if (!is_gimple_debug (USE_STMT (use_p))
6918 && gimple_bb (USE_STMT (use_p)) == loop_exit->dest)
6919 phis.safe_push (USE_STMT (use_p));
6920 }
6921 else
6922 {
6923 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6924 {
6925 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6926
6927 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6928 {
6929 if (!flow_bb_inside_loop_p (loop,
6930 gimple_bb (USE_STMT (phi_use_p)))
6931 && !is_gimple_debug (USE_STMT (phi_use_p)))
6932 phis.safe_push (USE_STMT (phi_use_p));
6933 }
6934 }
6935 }
6936 }
6937
6938 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6939 {
6940 /* Replace the uses: */
6941 orig_name = PHI_RESULT (exit_phi);
6942
6943 /* Look for a single use at the target of the skip edge. */
6944 if (unify_with_main_loop_p)
6945 {
6946 use_operand_p use_p;
6947 gimple *user;
6948 if (!single_imm_use (orig_name, &use_p, &user))
6949 gcc_unreachable ();
6950 orig_name = gimple_get_lhs (user);
6951 }
6952
6953 scalar_result = scalar_results[k];
6954 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6955 {
6956 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6957 SET_USE (use_p, scalar_result);
6958 update_stmt (use_stmt);
6959 }
6960 }
6961
6962 phis.truncate (0);
6963 }
6964 }
6965
6966 /* Return a vector of type VECTYPE that is equal to the vector select
6967 operation "MASK ? VEC : IDENTITY". Insert the select statements
6968 before GSI. */
6969
6970 static tree
6971 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6972 tree vec, tree identity)
6973 {
6974 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6975 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6976 mask, vec, identity);
6977 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6978 return cond;
6979 }
6980
/* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
   order, starting with LHS.  Insert the extraction statements before GSI and
   associate the new scalar SSA names with variable SCALAR_DEST.
   If MASK is nonzero mask the input and then operate on it unconditionally.
   Return the SSA name for the result.  */

static tree
vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
		       tree_code code, tree lhs, tree vector_rhs,
		       tree mask)
{
  tree vectype = TREE_TYPE (vector_rhs);
  tree scalar_type = TREE_TYPE (vectype);
  tree bitsize = TYPE_SIZE (scalar_type);
  unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
  unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);

  /* Re-create a VEC_COND_EXPR to mask the input here in order to be able
     to perform an unconditional element-wise reduction of it.  Inactive
     lanes are replaced by the operation's neutral value so that including
     them in the fold does not change the result.  */
  if (mask)
    {
      tree masked_vector_rhs = make_temp_ssa_name (vectype, NULL,
						   "masked_vector_rhs");
      tree neutral_op = neutral_op_for_reduction (scalar_type, code, NULL_TREE,
						  false);
      tree vector_identity = build_vector_from_val (vectype, neutral_op);
      gassign *select = gimple_build_assign (masked_vector_rhs, VEC_COND_EXPR,
					     mask, vector_rhs, vector_identity);
      gsi_insert_before (gsi, select, GSI_SAME_STMT);
      vector_rhs = masked_vector_rhs;
    }

  /* Walk the vector one element at a time, extracting each element with a
     BIT_FIELD_REF and folding it into the running scalar result LHS.  */
  for (unsigned HOST_WIDE_INT bit_offset = 0;
       bit_offset < vec_size_in_bits;
       bit_offset += element_bitsize)
    {
      tree bitpos = bitsize_int (bit_offset);
      tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
			 bitsize, bitpos);

      /* rhs = vector_rhs[bit_offset / element_bitsize];  */
      gassign *stmt = gimple_build_assign (scalar_dest, rhs);
      rhs = make_ssa_name (scalar_dest, stmt);
      gimple_assign_set_lhs (stmt, rhs);
      gsi_insert_before (gsi, stmt, GSI_SAME_STMT);

      /* lhs = lhs CODE rhs — strictly left-to-right, which preserves the
	 scalar evaluation order (important for FP reductions).  */
      stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
      tree new_name = make_ssa_name (scalar_dest, stmt);
      gimple_assign_set_lhs (stmt, new_name);
      gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
      lhs = new_name;
    }
  return lhs;
}
7034
7035 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
7036 type of the vector input. */
7037
7038 static internal_fn
7039 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
7040 {
7041 internal_fn mask_reduc_fn;
7042 internal_fn mask_len_reduc_fn;
7043
7044 switch (reduc_fn)
7045 {
7046 case IFN_FOLD_LEFT_PLUS:
7047 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
7048 mask_len_reduc_fn = IFN_MASK_LEN_FOLD_LEFT_PLUS;
7049 break;
7050
7051 default:
7052 return IFN_LAST;
7053 }
7054
7055 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
7056 OPTIMIZE_FOR_SPEED))
7057 return mask_reduc_fn;
7058 if (direct_internal_fn_supported_p (mask_len_reduc_fn, vectype_in,
7059 OPTIMIZE_FOR_SPEED))
7060 return mask_len_reduc_fn;
7061 return IFN_LAST;
7062 }
7063
7064 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
7065 statement that sets the live-out value. REDUC_DEF_STMT is the phi
7066 statement. CODE is the operation performed by STMT_INFO and OPS are
7067 its scalar operands. REDUC_INDEX is the index of the operand in
7068 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
7069 implements in-order reduction, or IFN_LAST if we should open-code it.
7070 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
7071 that should be used to control the operation in a fully-masked loop. */
7072
7073 static bool
7074 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
7075 stmt_vec_info stmt_info,
7076 gimple_stmt_iterator *gsi,
7077 gimple **vec_stmt, slp_tree slp_node,
7078 gimple *reduc_def_stmt,
7079 code_helper code, internal_fn reduc_fn,
7080 tree *ops, int num_ops, tree vectype_in,
7081 int reduc_index, vec_loop_masks *masks,
7082 vec_loop_lens *lens)
7083 {
7084 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7085 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7086 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
7087
7088 int ncopies;
7089 if (slp_node)
7090 ncopies = 1;
7091 else
7092 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7093
7094 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
7095 gcc_assert (ncopies == 1);
7096
7097 bool is_cond_op = false;
7098 if (!code.is_tree_code ())
7099 {
7100 code = conditional_internal_fn_code (internal_fn (code));
7101 gcc_assert (code != ERROR_MARK);
7102 is_cond_op = true;
7103 }
7104
7105 gcc_assert (TREE_CODE_LENGTH (tree_code (code)) == binary_op);
7106
7107 if (slp_node)
7108 {
7109 if (is_cond_op)
7110 {
7111 if (dump_enabled_p ())
7112 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7113 "fold-left reduction on SLP not supported.\n");
7114 return false;
7115 }
7116
7117 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
7118 TYPE_VECTOR_SUBPARTS (vectype_in)));
7119 }
7120
7121 /* The operands either come from a binary operation or an IFN_COND operation.
7122 The former is a gimple assign with binary rhs and the latter is a
7123 gimple call with four arguments. */
7124 gcc_assert (num_ops == 2 || num_ops == 4);
7125 tree op0, opmask;
7126 if (!is_cond_op)
7127 op0 = ops[1 - reduc_index];
7128 else
7129 {
7130 op0 = ops[2 + (1 - reduc_index)];
7131 opmask = ops[0];
7132 gcc_assert (!slp_node);
7133 }
7134
7135 int group_size = 1;
7136 stmt_vec_info scalar_dest_def_info;
7137 auto_vec<tree> vec_oprnds0, vec_opmask;
7138 if (slp_node)
7139 {
7140 auto_vec<vec<tree> > vec_defs (2);
7141 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
7142 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
7143 vec_defs[0].release ();
7144 vec_defs[1].release ();
7145 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7146 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
7147 }
7148 else
7149 {
7150 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7151 op0, &vec_oprnds0);
7152 scalar_dest_def_info = stmt_info;
7153
7154 /* For an IFN_COND_OP we also need the vector mask operand. */
7155 if (is_cond_op)
7156 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7157 opmask, &vec_opmask);
7158 }
7159
7160 gimple *sdef = vect_orig_stmt (scalar_dest_def_info)->stmt;
7161 tree scalar_dest = gimple_get_lhs (sdef);
7162 tree scalar_type = TREE_TYPE (scalar_dest);
7163 tree reduc_var = gimple_phi_result (reduc_def_stmt);
7164
7165 int vec_num = vec_oprnds0.length ();
7166 gcc_assert (vec_num == 1 || slp_node);
7167 tree vec_elem_type = TREE_TYPE (vectype_out);
7168 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
7169
7170 tree vector_identity = NULL_TREE;
7171 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7172 {
7173 vector_identity = build_zero_cst (vectype_out);
7174 if (!HONOR_SIGNED_ZEROS (vectype_out))
7175 ;
7176 else
7177 {
7178 gcc_assert (!HONOR_SIGN_DEPENDENT_ROUNDING (vectype_out));
7179 vector_identity = const_unop (NEGATE_EXPR, vectype_out,
7180 vector_identity);
7181 }
7182 }
7183
7184 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
7185 int i;
7186 tree def0;
7187 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7188 {
7189 gimple *new_stmt;
7190 tree mask = NULL_TREE;
7191 tree len = NULL_TREE;
7192 tree bias = NULL_TREE;
7193 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7194 mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
7195 else if (is_cond_op)
7196 mask = vec_opmask[0];
7197 if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
7198 {
7199 len = vect_get_loop_len (loop_vinfo, gsi, lens, vec_num, vectype_in,
7200 i, 1);
7201 signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
7202 bias = build_int_cst (intQI_type_node, biasval);
7203 if (!is_cond_op)
7204 mask = build_minus_one_cst (truth_type_for (vectype_in));
7205 }
7206
7207 /* Handle MINUS by adding the negative. */
7208 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
7209 {
7210 tree negated = make_ssa_name (vectype_out);
7211 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
7212 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
7213 def0 = negated;
7214 }
7215
7216 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
7217 && mask && mask_reduc_fn == IFN_LAST)
7218 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
7219 vector_identity);
7220
7221 /* On the first iteration the input is simply the scalar phi
7222 result, and for subsequent iterations it is the output of
7223 the preceding operation. */
7224 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
7225 {
7226 if (mask && len && mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
7227 new_stmt = gimple_build_call_internal (mask_reduc_fn, 5, reduc_var,
7228 def0, mask, len, bias);
7229 else if (mask && mask_reduc_fn == IFN_MASK_FOLD_LEFT_PLUS)
7230 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
7231 def0, mask);
7232 else
7233 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
7234 def0);
7235 /* For chained SLP reductions the output of the previous reduction
7236 operation serves as the input of the next. For the final statement
7237 the output cannot be a temporary - we reuse the original
7238 scalar destination of the last statement. */
7239 if (i != vec_num - 1)
7240 {
7241 gimple_set_lhs (new_stmt, scalar_dest_var);
7242 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
7243 gimple_set_lhs (new_stmt, reduc_var);
7244 }
7245 }
7246 else
7247 {
7248 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var,
7249 tree_code (code), reduc_var, def0,
7250 mask);
7251 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
7252 /* Remove the statement, so that we can use the same code paths
7253 as for statements that we've just created. */
7254 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
7255 gsi_remove (&tmp_gsi, true);
7256 }
7257
7258 if (i == vec_num - 1)
7259 {
7260 gimple_set_lhs (new_stmt, scalar_dest);
7261 vect_finish_replace_stmt (loop_vinfo,
7262 scalar_dest_def_info,
7263 new_stmt);
7264 }
7265 else
7266 vect_finish_stmt_generation (loop_vinfo,
7267 scalar_dest_def_info,
7268 new_stmt, gsi);
7269
7270 if (slp_node)
7271 slp_node->push_vec_def (new_stmt);
7272 else
7273 {
7274 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7275 *vec_stmt = new_stmt;
7276 }
7277 }
7278
7279 return true;
7280 }
7281
7282 /* Function is_nonwrapping_integer_induction.
7283
   Check if STMT_VINFO (which is part of loop LOOP) both increments and
   does not cause overflow.  */
7286
7287 static bool
7288 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
7289 {
7290 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
7291 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
7292 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
7293 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
7294 widest_int ni, max_loop_value, lhs_max;
7295 wi::overflow_type overflow = wi::OVF_NONE;
7296
7297 /* Make sure the loop is integer based. */
7298 if (TREE_CODE (base) != INTEGER_CST
7299 || TREE_CODE (step) != INTEGER_CST)
7300 return false;
7301
7302 /* Check that the max size of the loop will not wrap. */
7303
7304 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
7305 return true;
7306
7307 if (! max_stmt_executions (loop, &ni))
7308 return false;
7309
7310 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
7311 &overflow);
7312 if (overflow)
7313 return false;
7314
7315 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
7316 TYPE_SIGN (lhs_type), &overflow);
7317 if (overflow)
7318 return false;
7319
7320 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
7321 <= TYPE_PRECISION (lhs_type));
7322 }
7323
7324 /* Check if masking can be supported by inserting a conditional expression.
7325 CODE is the code for the operation. COND_FN is the conditional internal
7326 function, if it exists. VECTYPE_IN is the type of the vector input. */
7327 static bool
7328 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
7329 tree vectype_in)
7330 {
7331 if (cond_fn != IFN_LAST
7332 && direct_internal_fn_supported_p (cond_fn, vectype_in,
7333 OPTIMIZE_FOR_SPEED))
7334 return false;
7335
7336 if (code.is_tree_code ())
7337 switch (tree_code (code))
7338 {
7339 case DOT_PROD_EXPR:
7340 case SAD_EXPR:
7341 return true;
7342
7343 default:
7344 break;
7345 }
7346 return false;
7347 }
7348
7349 /* Insert a conditional expression to enable masked vectorization. CODE is the
7350 code for the operation. VOP is the array of operands. MASK is the loop
7351 mask. GSI is a statement iterator used to place the new conditional
7352 expression. */
7353 static void
7354 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
7355 gimple_stmt_iterator *gsi)
7356 {
7357 switch (tree_code (code))
7358 {
7359 case DOT_PROD_EXPR:
7360 {
7361 tree vectype = TREE_TYPE (vop[1]);
7362 tree zero = build_zero_cst (vectype);
7363 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7364 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7365 mask, vop[1], zero);
7366 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7367 vop[1] = masked_op1;
7368 break;
7369 }
7370
7371 case SAD_EXPR:
7372 {
7373 tree vectype = TREE_TYPE (vop[1]);
7374 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7375 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7376 mask, vop[1], vop[0]);
7377 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7378 vop[1] = masked_op1;
7379 break;
7380 }
7381
7382 default:
7383 gcc_unreachable ();
7384 }
7385 }
7386
7387 /* Function vectorizable_reduction.
7388
7389 Check if STMT_INFO performs a reduction operation that can be vectorized.
7390 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
7391 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
7392 Return true if STMT_INFO is vectorizable in this way.
7393
7394 This function also handles reduction idioms (patterns) that have been
7395 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
7396 may be of this form:
7397 X = pattern_expr (arg0, arg1, ..., X)
7398 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
7399 sequence that had been detected and replaced by the pattern-stmt
7400 (STMT_INFO).
7401
7402 This function also handles reduction of condition expressions, for example:
7403 for (int i = 0; i < N; i++)
7404 if (a[i] < value)
7405 last = a[i];
7406 This is handled by vectorising the loop and creating an additional vector
7407 containing the loop indexes for which "a[i] < value" was true. In the
7408 function epilogue this is reduced to a single max value and then used to
7409 index into the vector of results.
7410
7411 In some cases of reduction patterns, the type of the reduction variable X is
7412 different than the type of the other arguments of STMT_INFO.
7413 In such cases, the vectype that is used when transforming STMT_INFO into
7414 a vector stmt is different than the vectype that is used to determine the
7415 vectorization factor, because it consists of a different number of elements
7416 than the actual number of elements that are being operated upon in parallel.
7417
7418 For example, consider an accumulation of shorts into an int accumulator.
7419 On some targets it's possible to vectorize this pattern operating on 8
7420 shorts at a time (hence, the vectype for purposes of determining the
7421 vectorization factor should be V8HI); on the other hand, the vectype that
7422 is used to create the vector form is actually V4SI (the type of the result).
7423
7424 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
7425 indicates what is the actual level of parallelism (V8HI in the example), so
7426 that the right vectorization factor would be derived. This vectype
7427 corresponds to the type of arguments to the reduction stmt, and should *NOT*
7428 be used to create the vectorized stmt. The right vectype for the vectorized
7429 stmt is obtained from the type of the result X:
7430 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7431
7432 This means that, contrary to "regular" reductions (or "regular" stmts in
7433 general), the following equation:
7434 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7435 does *NOT* necessarily hold for reduction patterns. */
7436
7437 bool
7438 vectorizable_reduction (loop_vec_info loop_vinfo,
7439 stmt_vec_info stmt_info, slp_tree slp_node,
7440 slp_instance slp_node_instance,
7441 stmt_vector_for_cost *cost_vec)
7442 {
7443 tree vectype_in = NULL_TREE;
7444 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7445 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
7446 stmt_vec_info cond_stmt_vinfo = NULL;
7447 int i;
7448 int ncopies;
7449 bool single_defuse_cycle = false;
7450 bool nested_cycle = false;
7451 bool double_reduc = false;
7452 int vec_num;
7453 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
7454 tree cond_reduc_val = NULL_TREE;
7455
7456 /* Make sure it was already recognized as a reduction computation. */
7457 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
7458 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
7459 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
7460 return false;
7461
7462 /* The stmt we store reduction analysis meta on. */
7463 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7464 reduc_info->is_reduc_info = true;
7465
7466 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7467 {
7468 if (is_a <gphi *> (stmt_info->stmt))
7469 {
7470 if (slp_node)
7471 {
7472 /* We eventually need to set a vector type on invariant
7473 arguments. */
7474 unsigned j;
7475 slp_tree child;
7476 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7477 if (!vect_maybe_update_slp_op_vectype
7478 (child, SLP_TREE_VECTYPE (slp_node)))
7479 {
7480 if (dump_enabled_p ())
7481 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7482 "incompatible vector types for "
7483 "invariants\n");
7484 return false;
7485 }
7486 }
7487 /* Analysis for double-reduction is done on the outer
7488 loop PHI, nested cycles have no further restrictions. */
7489 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
7490 }
7491 else
7492 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7493 return true;
7494 }
7495
7496 stmt_vec_info orig_stmt_of_analysis = stmt_info;
7497 stmt_vec_info phi_info = stmt_info;
7498 if (!is_a <gphi *> (stmt_info->stmt))
7499 {
7500 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7501 return true;
7502 }
7503 if (slp_node)
7504 {
7505 slp_node_instance->reduc_phis = slp_node;
7506 /* ??? We're leaving slp_node to point to the PHIs, we only
7507 need it to get at the number of vector stmts which wasn't
7508 yet initialized for the instance root. */
7509 }
7510 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7511 {
7512 use_operand_p use_p;
7513 gimple *use_stmt;
7514 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7515 &use_p, &use_stmt);
7516 gcc_assert (res);
7517 phi_info = loop_vinfo->lookup_stmt (use_stmt);
7518 }
7519
7520 /* PHIs should not participate in patterns. */
7521 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7522 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7523
7524 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
7525 and compute the reduction chain length. Discover the real
7526 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
7527 tree reduc_def
7528 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
7529 loop_latch_edge
7530 (gimple_bb (reduc_def_phi)->loop_father));
7531 unsigned reduc_chain_length = 0;
7532 bool only_slp_reduc_chain = true;
7533 stmt_info = NULL;
7534 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
7535 while (reduc_def != PHI_RESULT (reduc_def_phi))
7536 {
7537 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
7538 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
7539 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
7540 {
7541 if (dump_enabled_p ())
7542 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7543 "reduction chain broken by patterns.\n");
7544 return false;
7545 }
7546 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
7547 only_slp_reduc_chain = false;
7548 /* For epilogue generation live members of the chain need
7549 to point back to the PHI via their original stmt for
7550 info_for_reduction to work. For SLP we need to look at
7551 all lanes here - even though we only will vectorize from
7552 the SLP node with live lane zero the other live lanes also
7553 need to be identified as part of a reduction to be able
7554 to skip code generation for them. */
7555 if (slp_for_stmt_info)
7556 {
7557 for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
7558 if (STMT_VINFO_LIVE_P (s))
7559 STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
7560 }
7561 else if (STMT_VINFO_LIVE_P (vdef))
7562 STMT_VINFO_REDUC_DEF (def) = phi_info;
7563 gimple_match_op op;
7564 if (!gimple_extract_op (vdef->stmt, &op))
7565 {
7566 if (dump_enabled_p ())
7567 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7568 "reduction chain includes unsupported"
7569 " statement type.\n");
7570 return false;
7571 }
7572 if (CONVERT_EXPR_CODE_P (op.code))
7573 {
7574 if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7575 {
7576 if (dump_enabled_p ())
7577 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7578 "conversion in the reduction chain.\n");
7579 return false;
7580 }
7581 }
7582 else if (!stmt_info)
7583 /* First non-conversion stmt. */
7584 stmt_info = vdef;
7585 reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
7586 reduc_chain_length++;
7587 if (!stmt_info && slp_node)
7588 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
7589 }
7590 /* PHIs should not participate in patterns. */
7591 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7592
7593 if (nested_in_vect_loop_p (loop, stmt_info))
7594 {
7595 loop = loop->inner;
7596 nested_cycle = true;
7597 }
7598
7599 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
7600 element. */
7601 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7602 {
7603 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
7604 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7605 }
7606 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7607 gcc_assert (slp_node
7608 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
7609
7610 /* 1. Is vectorizable reduction? */
7611 /* Not supportable if the reduction variable is used in the loop, unless
7612 it's a reduction chain. */
7613 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7614 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7615 return false;
7616
7617 /* Reductions that are not used even in an enclosing outer-loop,
7618 are expected to be "live" (used out of the loop). */
7619 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7620 && !STMT_VINFO_LIVE_P (stmt_info))
7621 return false;
7622
7623 /* 2. Has this been recognized as a reduction pattern?
7624
7625 Check if STMT represents a pattern that has been recognized
7626 in earlier analysis stages. For stmts that represent a pattern,
7627 the STMT_VINFO_RELATED_STMT field records the last stmt in
7628 the original sequence that constitutes the pattern. */
7629
7630 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7631 if (orig_stmt_info)
7632 {
7633 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7634 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7635 }
7636
7637 /* 3. Check the operands of the operation. The first operands are defined
7638 inside the loop body. The last operand is the reduction variable,
7639 which is defined by the loop-header-phi. */
7640
7641 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7642 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
7643 gimple_match_op op;
7644 if (!gimple_extract_op (stmt_info->stmt, &op))
7645 gcc_unreachable ();
7646 bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
7647 || op.code == WIDEN_SUM_EXPR
7648 || op.code == SAD_EXPR);
7649
7650 if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7651 && !SCALAR_FLOAT_TYPE_P (op.type))
7652 return false;
7653
7654 /* Do not try to vectorize bit-precision reductions. */
7655 if (!type_has_mode_precision_p (op.type))
7656 return false;
7657
7658 /* For lane-reducing ops we're reducing the number of reduction PHIs
7659 which means the only use of that may be in the lane-reducing operation. */
7660 if (lane_reduc_code_p
7661 && reduc_chain_length != 1
7662 && !only_slp_reduc_chain)
7663 {
7664 if (dump_enabled_p ())
7665 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7666 "lane-reducing reduction with extra stmts.\n");
7667 return false;
7668 }
7669
7670 /* All uses but the last are expected to be defined in the loop.
7671 The last use is the reduction variable. In case of nested cycle this
7672 assumption is not true: we use reduc_index to record the index of the
7673 reduction variable. */
7674 slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7675 tree *vectype_op = XALLOCAVEC (tree, op.num_ops);
7676 /* We need to skip an extra operand for COND_EXPRs with embedded
7677 comparison. */
7678 unsigned opno_adjust = 0;
7679 if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7680 opno_adjust = 1;
7681 for (i = 0; i < (int) op.num_ops; i++)
7682 {
7683 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7684 if (i == 0 && op.code == COND_EXPR)
7685 continue;
7686
7687 stmt_vec_info def_stmt_info;
7688 enum vect_def_type dt;
7689 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7690 i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7691 &vectype_op[i], &def_stmt_info))
7692 {
7693 if (dump_enabled_p ())
7694 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7695 "use not simple.\n");
7696 return false;
7697 }
7698 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7699 continue;
7700
7701 /* For an IFN_COND_OP we might hit the reduction definition operand
7702 twice (once as definition, once as else). */
7703 if (op.ops[i] == op.ops[STMT_VINFO_REDUC_IDX (stmt_info)])
7704 continue;
7705
7706 /* There should be only one cycle def in the stmt, the one
7707 leading to reduc_def. */
7708 if (VECTORIZABLE_CYCLE_DEF (dt))
7709 return false;
7710
7711 if (!vectype_op[i])
7712 vectype_op[i]
7713 = get_vectype_for_scalar_type (loop_vinfo,
7714 TREE_TYPE (op.ops[i]), slp_op[i]);
7715
7716 /* To properly compute ncopies we are interested in the widest
7717 non-reduction input type in case we're looking at a widening
7718 accumulation that we later handle in vect_transform_reduction. */
7719 if (lane_reduc_code_p
7720 && vectype_op[i]
7721 && (!vectype_in
7722 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7723 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
7724 vectype_in = vectype_op[i];
7725
7726 /* Record how the non-reduction-def value of COND_EXPR is defined.
7727 ??? For a chain of multiple CONDs we'd have to match them up all. */
7728 if (op.code == COND_EXPR && reduc_chain_length == 1)
7729 {
7730 if (dt == vect_constant_def)
7731 {
7732 cond_reduc_dt = dt;
7733 cond_reduc_val = op.ops[i];
7734 }
7735 else if (dt == vect_induction_def
7736 && def_stmt_info
7737 && is_nonwrapping_integer_induction (def_stmt_info, loop))
7738 {
7739 cond_reduc_dt = dt;
7740 cond_stmt_vinfo = def_stmt_info;
7741 }
7742 }
7743 }
7744 if (!vectype_in)
7745 vectype_in = STMT_VINFO_VECTYPE (phi_info);
7746 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7747
7748 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
7749 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
7750 /* If we have a condition reduction, see if we can simplify it further. */
7751 if (v_reduc_type == COND_REDUCTION)
7752 {
7753 if (slp_node)
7754 return false;
7755
7756 /* When the condition uses the reduction value in the condition, fail. */
7757 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7758 {
7759 if (dump_enabled_p ())
7760 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7761 "condition depends on previous iteration\n");
7762 return false;
7763 }
7764
7765 if (reduc_chain_length == 1
7766 && (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST, vectype_in,
7767 OPTIMIZE_FOR_SPEED)
7768 || direct_internal_fn_supported_p (IFN_LEN_FOLD_EXTRACT_LAST,
7769 vectype_in,
7770 OPTIMIZE_FOR_SPEED)))
7771 {
7772 if (dump_enabled_p ())
7773 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7774 "optimizing condition reduction with"
7775 " FOLD_EXTRACT_LAST.\n");
7776 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7777 }
7778 else if (cond_reduc_dt == vect_induction_def)
7779 {
7780 tree base
7781 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7782 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7783
7784 gcc_assert (TREE_CODE (base) == INTEGER_CST
7785 && TREE_CODE (step) == INTEGER_CST);
7786 cond_reduc_val = NULL_TREE;
7787 enum tree_code cond_reduc_op_code = ERROR_MARK;
7788 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7789 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7790 ;
7791 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
7792 above base; punt if base is the minimum value of the type for
7793 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
7794 else if (tree_int_cst_sgn (step) == -1)
7795 {
7796 cond_reduc_op_code = MIN_EXPR;
7797 if (tree_int_cst_sgn (base) == -1)
7798 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7799 else if (tree_int_cst_lt (base,
7800 TYPE_MAX_VALUE (TREE_TYPE (base))))
7801 cond_reduc_val
7802 = int_const_binop (PLUS_EXPR, base, integer_one_node);
7803 }
7804 else
7805 {
7806 cond_reduc_op_code = MAX_EXPR;
7807 if (tree_int_cst_sgn (base) == 1)
7808 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7809 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7810 base))
7811 cond_reduc_val
7812 = int_const_binop (MINUS_EXPR, base, integer_one_node);
7813 }
7814 if (cond_reduc_val)
7815 {
7816 if (dump_enabled_p ())
7817 dump_printf_loc (MSG_NOTE, vect_location,
7818 "condition expression based on "
7819 "integer induction.\n");
7820 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7821 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7822 = cond_reduc_val;
7823 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7824 }
7825 }
7826 else if (cond_reduc_dt == vect_constant_def)
7827 {
7828 enum vect_def_type cond_initial_dt;
7829 tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7830 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7831 if (cond_initial_dt == vect_constant_def
7832 && types_compatible_p (TREE_TYPE (cond_initial_val),
7833 TREE_TYPE (cond_reduc_val)))
7834 {
7835 tree e = fold_binary (LE_EXPR, boolean_type_node,
7836 cond_initial_val, cond_reduc_val);
7837 if (e && (integer_onep (e) || integer_zerop (e)))
7838 {
7839 if (dump_enabled_p ())
7840 dump_printf_loc (MSG_NOTE, vect_location,
7841 "condition expression based on "
7842 "compile time constant.\n");
7843 /* Record reduction code at analysis stage. */
7844 STMT_VINFO_REDUC_CODE (reduc_info)
7845 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7846 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7847 }
7848 }
7849 }
7850 }
7851
7852 if (STMT_VINFO_LIVE_P (phi_info))
7853 return false;
7854
7855 if (slp_node)
7856 ncopies = 1;
7857 else
7858 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7859
7860 gcc_assert (ncopies >= 1);
7861
7862 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7863
7864 if (nested_cycle)
7865 {
7866 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7867 == vect_double_reduction_def);
7868 double_reduc = true;
7869 }
7870
7871 /* 4.2. Check support for the epilog operation.
7872
7873 If STMT represents a reduction pattern, then the type of the
7874 reduction variable may be different than the type of the rest
7875 of the arguments. For example, consider the case of accumulation
7876 of shorts into an int accumulator; The original code:
7877 S1: int_a = (int) short_a;
7878 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7879
7880 was replaced with:
7881 STMT: int_acc = widen_sum <short_a, int_acc>
7882
7883 This means that:
7884 1. The tree-code that is used to create the vector operation in the
7885 epilog code (that reduces the partial results) is not the
7886 tree-code of STMT, but is rather the tree-code of the original
7887 stmt from the pattern that STMT is replacing. I.e, in the example
7888 above we want to use 'widen_sum' in the loop, but 'plus' in the
7889 epilog.
7890 2. The type (mode) we use to check available target support
7891 for the vector operation to be created in the *epilog*, is
7892 determined by the type of the reduction variable (in the example
7893 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
7894 However the type (mode) we use to check available target support
7895 for the vector operation to be created *inside the loop*, is
7896 determined by the type of the other arguments to STMT (in the
7897 example we'd check this: optab_handler (widen_sum_optab,
7898 vect_short_mode)).
7899
7900 This is contrary to "regular" reductions, in which the types of all
7901 the arguments are the same as the type of the reduction variable.
7902 For "regular" reductions we can therefore use the same vector type
7903 (and also the same tree-code) when generating the epilog code and
7904 when generating the code inside the loop. */
7905
7906 code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7907
7908 /* If conversion might have created a conditional operation like
7909 IFN_COND_ADD already. Use the internal code for the following checks. */
7910 if (orig_code.is_internal_fn ())
7911 {
7912 tree_code new_code = conditional_internal_fn_code (internal_fn (orig_code));
7913 orig_code = new_code != ERROR_MARK ? new_code : orig_code;
7914 }
7915
7916 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7917
7918 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7919 if (reduction_type == TREE_CODE_REDUCTION)
7920 {
7921 /* Check whether it's ok to change the order of the computation.
7922 Generally, when vectorizing a reduction we change the order of the
7923 computation. This may change the behavior of the program in some
7924 cases, so we need to check that this is ok. One exception is when
7925 vectorizing an outer-loop: the inner-loop is executed sequentially,
7926 and therefore vectorizing reductions in the inner-loop during
7927 outer-loop vectorization is safe. Likewise when we are vectorizing
7928 a series of reductions using SLP and the VF is one the reductions
7929 are performed in scalar order. */
7930 if (slp_node
7931 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7932 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7933 ;
7934 else if (needs_fold_left_reduction_p (op.type, orig_code))
7935 {
7936 /* When vectorizing a reduction chain w/o SLP the reduction PHI
7937 is not directy used in stmt. */
7938 if (!only_slp_reduc_chain
7939 && reduc_chain_length != 1)
7940 {
7941 if (dump_enabled_p ())
7942 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7943 "in-order reduction chain without SLP.\n");
7944 return false;
7945 }
7946 STMT_VINFO_REDUC_TYPE (reduc_info)
7947 = reduction_type = FOLD_LEFT_REDUCTION;
7948 }
7949 else if (!commutative_binary_op_p (orig_code, op.type)
7950 || !associative_binary_op_p (orig_code, op.type))
7951 {
7952 if (dump_enabled_p ())
7953 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7954 "reduction: not commutative/associative\n");
7955 return false;
7956 }
7957 }
7958
7959 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7960 && ncopies > 1)
7961 {
7962 if (dump_enabled_p ())
7963 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7964 "multiple types in double reduction or condition "
7965 "reduction or fold-left reduction.\n");
7966 return false;
7967 }
7968
7969 internal_fn reduc_fn = IFN_LAST;
7970 if (reduction_type == TREE_CODE_REDUCTION
7971 || reduction_type == FOLD_LEFT_REDUCTION
7972 || reduction_type == INTEGER_INDUC_COND_REDUCTION
7973 || reduction_type == CONST_COND_REDUCTION)
7974 {
7975 if (reduction_type == FOLD_LEFT_REDUCTION
7976 ? fold_left_reduction_fn (orig_code, &reduc_fn)
7977 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7978 {
7979 if (reduc_fn != IFN_LAST
7980 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7981 OPTIMIZE_FOR_SPEED))
7982 {
7983 if (dump_enabled_p ())
7984 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7985 "reduc op not supported by target.\n");
7986
7987 reduc_fn = IFN_LAST;
7988 }
7989 }
7990 else
7991 {
7992 if (!nested_cycle || double_reduc)
7993 {
7994 if (dump_enabled_p ())
7995 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7996 "no reduc code for scalar code.\n");
7997
7998 return false;
7999 }
8000 }
8001 }
8002 else if (reduction_type == COND_REDUCTION)
8003 {
8004 int scalar_precision
8005 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
8006 cr_index_scalar_type = make_unsigned_type (scalar_precision);
8007 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
8008 vectype_out);
8009
8010 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
8011 OPTIMIZE_FOR_SPEED))
8012 reduc_fn = IFN_REDUC_MAX;
8013 }
8014 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
8015
8016 if (reduction_type != EXTRACT_LAST_REDUCTION
8017 && (!nested_cycle || double_reduc)
8018 && reduc_fn == IFN_LAST
8019 && !nunits_out.is_constant ())
8020 {
8021 if (dump_enabled_p ())
8022 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8023 "missing target support for reduction on"
8024 " variable-length vectors.\n");
8025 return false;
8026 }
8027
8028 /* For SLP reductions, see if there is a neutral value we can use. */
8029 tree neutral_op = NULL_TREE;
8030 if (slp_node)
8031 {
8032 tree initial_value = NULL_TREE;
8033 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
8034 initial_value = vect_phi_initial_value (reduc_def_phi);
8035 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8036 orig_code, initial_value);
8037 }
8038
8039 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
8040 {
8041 /* We can't support in-order reductions of code such as this:
8042
8043 for (int i = 0; i < n1; ++i)
8044 for (int j = 0; j < n2; ++j)
8045 l += a[j];
8046
8047 since GCC effectively transforms the loop when vectorizing:
8048
8049 for (int i = 0; i < n1 / VF; ++i)
8050 for (int j = 0; j < n2; ++j)
8051 for (int k = 0; k < VF; ++k)
8052 l += a[j];
8053
8054 which is a reassociation of the original operation. */
8055 if (dump_enabled_p ())
8056 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8057 "in-order double reduction not supported.\n");
8058
8059 return false;
8060 }
8061
8062 if (reduction_type == FOLD_LEFT_REDUCTION
8063 && slp_node
8064 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8065 {
8066 /* We cannot use in-order reductions in this case because there is
8067 an implicit reassociation of the operations involved. */
8068 if (dump_enabled_p ())
8069 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8070 "in-order unchained SLP reductions not supported.\n");
8071 return false;
8072 }
8073
8074 /* For double reductions, and for SLP reductions with a neutral value,
8075 we construct a variable-length initial vector by loading a vector
8076 full of the neutral value and then shift-and-inserting the start
8077 values into the low-numbered elements. */
8078 if ((double_reduc || neutral_op)
8079 && !nunits_out.is_constant ()
8080 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
8081 vectype_out, OPTIMIZE_FOR_SPEED))
8082 {
8083 if (dump_enabled_p ())
8084 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8085 "reduction on variable-length vectors requires"
8086 " target support for a vector-shift-and-insert"
8087 " operation.\n");
8088 return false;
8089 }
8090
8091 /* Check extra constraints for variable-length unchained SLP reductions. */
8092 if (slp_node
8093 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
8094 && !nunits_out.is_constant ())
8095 {
8096 /* We checked above that we could build the initial vector when
8097 there's a neutral element value. Check here for the case in
8098 which each SLP statement has its own initial value and in which
8099 that value needs to be repeated for every instance of the
8100 statement within the initial vector. */
8101 unsigned int group_size = SLP_TREE_LANES (slp_node);
8102 if (!neutral_op
8103 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
8104 TREE_TYPE (vectype_out)))
8105 {
8106 if (dump_enabled_p ())
8107 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8108 "unsupported form of SLP reduction for"
8109 " variable-length vectors: cannot build"
8110 " initial vector.\n");
8111 return false;
8112 }
8113 /* The epilogue code relies on the number of elements being a multiple
8114 of the group size. The duplicate-and-interleave approach to setting
8115 up the initial vector does too. */
8116 if (!multiple_p (nunits_out, group_size))
8117 {
8118 if (dump_enabled_p ())
8119 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8120 "unsupported form of SLP reduction for"
8121 " variable-length vectors: the vector size"
8122 " is not a multiple of the number of results.\n");
8123 return false;
8124 }
8125 }
8126
8127 if (reduction_type == COND_REDUCTION)
8128 {
8129 widest_int ni;
8130
8131 if (! max_loop_iterations (loop, &ni))
8132 {
8133 if (dump_enabled_p ())
8134 dump_printf_loc (MSG_NOTE, vect_location,
8135 "loop count not known, cannot create cond "
8136 "reduction.\n");
8137 return false;
8138 }
8139 /* Convert backedges to iterations. */
8140 ni += 1;
8141
8142 /* The additional index will be the same type as the condition. Check
8143 that the loop can fit into this less one (because we'll use up the
8144 zero slot for when there are no matches). */
8145 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
8146 if (wi::geu_p (ni, wi::to_widest (max_index)))
8147 {
8148 if (dump_enabled_p ())
8149 dump_printf_loc (MSG_NOTE, vect_location,
8150 "loop size is greater than data size.\n");
8151 return false;
8152 }
8153 }
8154
8155 /* In case the vectorization factor (VF) is bigger than the number
8156 of elements that we can fit in a vectype (nunits), we have to generate
8157 more than one vector stmt - i.e - we need to "unroll" the
8158 vector stmt by a factor VF/nunits. For more details see documentation
8159 in vectorizable_operation. */
8160
8161 /* If the reduction is used in an outer loop we need to generate
8162 VF intermediate results, like so (e.g. for ncopies=2):
8163 r0 = phi (init, r0)
8164 r1 = phi (init, r1)
8165 r0 = x0 + r0;
8166 r1 = x1 + r1;
8167 (i.e. we generate VF results in 2 registers).
8168 In this case we have a separate def-use cycle for each copy, and therefore
8169 for each copy we get the vector def for the reduction variable from the
8170 respective phi node created for this copy.
8171
8172 Otherwise (the reduction is unused in the loop nest), we can combine
8173 together intermediate results, like so (e.g. for ncopies=2):
8174 r = phi (init, r)
8175 r = x0 + r;
8176 r = x1 + r;
8177 (i.e. we generate VF/2 results in a single register).
8178 In this case for each copy we get the vector def for the reduction variable
8179 from the vectorized reduction operation generated in the previous iteration.
8180
8181 This only works when we see both the reduction PHI and its only consumer
8182 in vectorizable_reduction and there are no intermediate stmts
8183 participating. When unrolling we want each unrolled iteration to have its
8184 own reduction accumulator since one of the main goals of unrolling a
8185 reduction is to reduce the aggregate loop-carried latency. */
8186 if (ncopies > 1
8187 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
8188 && reduc_chain_length == 1
8189 && loop_vinfo->suggested_unroll_factor == 1)
8190 single_defuse_cycle = true;
8191
8192 if (single_defuse_cycle || lane_reduc_code_p)
8193 {
8194 gcc_assert (op.code != COND_EXPR);
8195
8196 /* 4. Supportable by target? */
8197 bool ok = true;
8198
8199 /* 4.1. check support for the operation in the loop
8200
8201 This isn't necessary for the lane reduction codes, since they
8202 can only be produced by pattern matching, and it's up to the
8203 pattern matcher to test for support. The main reason for
8204 specifically skipping this step is to avoid rechecking whether
8205 mixed-sign dot-products can be implemented using signed
8206 dot-products. */
8207 machine_mode vec_mode = TYPE_MODE (vectype_in);
8208 if (!lane_reduc_code_p
8209 && !directly_supported_p (op.code, vectype_in, optab_vector))
8210 {
8211 if (dump_enabled_p ())
8212 dump_printf (MSG_NOTE, "op not supported by target.\n");
8213 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
8214 || !vect_can_vectorize_without_simd_p (op.code))
8215 ok = false;
8216 else
8217 if (dump_enabled_p ())
8218 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
8219 }
8220
8221 if (vect_emulated_vector_p (vectype_in)
8222 && !vect_can_vectorize_without_simd_p (op.code))
8223 {
8224 if (dump_enabled_p ())
8225 dump_printf (MSG_NOTE, "using word mode not possible.\n");
8226 return false;
8227 }
8228
8229 /* lane-reducing operations have to go through vect_transform_reduction.
8230 For the other cases try without the single cycle optimization. */
8231 if (!ok)
8232 {
8233 if (lane_reduc_code_p)
8234 return false;
8235 else
8236 single_defuse_cycle = false;
8237 }
8238 }
8239 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
8240
8241 /* If the reduction stmt is one of the patterns that have lane
8242 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
8243 if ((ncopies > 1 && ! single_defuse_cycle)
8244 && lane_reduc_code_p)
8245 {
8246 if (dump_enabled_p ())
8247 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8248 "multi def-use cycle not possible for lane-reducing "
8249 "reduction operation\n");
8250 return false;
8251 }
8252
8253 if (slp_node
8254 && !(!single_defuse_cycle
8255 && !lane_reduc_code_p
8256 && reduction_type != FOLD_LEFT_REDUCTION))
8257 for (i = 0; i < (int) op.num_ops; i++)
8258 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
8259 {
8260 if (dump_enabled_p ())
8261 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8262 "incompatible vector types for invariants\n");
8263 return false;
8264 }
8265
8266 if (slp_node)
8267 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8268 else
8269 vec_num = 1;
8270
8271 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
8272 reduction_type, ncopies, cost_vec);
8273 /* Cost the reduction op inside the loop if transformed via
8274 vect_transform_reduction. Otherwise this is costed by the
8275 separate vectorizable_* routines. */
8276 if (single_defuse_cycle || lane_reduc_code_p)
8277 {
8278 int factor = 1;
8279 if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
8280 /* Three dot-products and a subtraction. */
8281 factor = 4;
8282 record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
8283 stmt_info, 0, vect_body);
8284 }
8285
8286 if (dump_enabled_p ()
8287 && reduction_type == FOLD_LEFT_REDUCTION)
8288 dump_printf_loc (MSG_NOTE, vect_location,
8289 "using an in-order (fold-left) reduction.\n");
8290 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
8291 /* All but single defuse-cycle optimized, lane-reducing and fold-left
8292 reductions go through their own vectorizable_* routines. */
8293 if (!single_defuse_cycle
8294 && !lane_reduc_code_p
8295 && reduction_type != FOLD_LEFT_REDUCTION)
8296 {
8297 stmt_vec_info tem
8298 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
8299 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
8300 {
8301 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
8302 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
8303 }
8304 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
8305 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
8306 }
8307 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8308 {
8309 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8310 vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
8311 internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
8312
8313 if (reduction_type != FOLD_LEFT_REDUCTION
8314 && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
8315 && (cond_fn == IFN_LAST
8316 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
8317 OPTIMIZE_FOR_SPEED)))
8318 {
8319 if (dump_enabled_p ())
8320 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8321 "can't operate on partial vectors because"
8322 " no conditional operation is available.\n");
8323 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8324 }
8325 else if (reduction_type == FOLD_LEFT_REDUCTION
8326 && reduc_fn == IFN_LAST
8327 && !expand_vec_cond_expr_p (vectype_in,
8328 truth_type_for (vectype_in),
8329 SSA_NAME))
8330 {
8331 if (dump_enabled_p ())
8332 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8333 "can't operate on partial vectors because"
8334 " no conditional operation is available.\n");
8335 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8336 }
8337 else if (reduction_type == FOLD_LEFT_REDUCTION
8338 && internal_fn_mask_index (reduc_fn) == -1
8339 && FLOAT_TYPE_P (vectype_in)
8340 && HONOR_SIGN_DEPENDENT_ROUNDING (vectype_in))
8341 {
8342 if (dump_enabled_p ())
8343 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8344 "can't operate on partial vectors because"
8345 " signed zeros cannot be preserved.\n");
8346 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8347 }
8348 else
8349 {
8350 internal_fn mask_reduc_fn
8351 = get_masked_reduction_fn (reduc_fn, vectype_in);
8352
8353 if (mask_reduc_fn == IFN_MASK_LEN_FOLD_LEFT_PLUS)
8354 vect_record_loop_len (loop_vinfo, lens, ncopies * vec_num,
8355 vectype_in, 1);
8356 else
8357 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
8358 vectype_in, NULL);
8359 }
8360 }
8361 return true;
8362 }
8363
8364 /* STMT_INFO is a dot-product reduction whose multiplication operands
8365 have different signs. Emit a sequence to emulate the operation
8366 using a series of signed DOT_PROD_EXPRs and return the last
8367 statement generated. VEC_DEST is the result of the vector operation
8368 and VOP lists its inputs. */
8369
8370 static gassign *
8371 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8372 gimple_stmt_iterator *gsi, tree vec_dest,
8373 tree vop[3])
8374 {
8375 tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
8376 tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
8377 tree narrow_elttype = TREE_TYPE (narrow_vectype);
8378 gimple *new_stmt;
8379
8380 /* Make VOP[0] the unsigned operand VOP[1] the signed operand. */
8381 if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
8382 std::swap (vop[0], vop[1]);
8383
8384 /* Convert all inputs to signed types. */
8385 for (int i = 0; i < 3; ++i)
8386 if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
8387 {
8388 tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
8389 new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
8390 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8391 vop[i] = tmp;
8392 }
8393
8394 /* In the comments below we assume 8-bit inputs for simplicity,
8395 but the approach works for any full integer type. */
8396
8397 /* Create a vector of -128. */
8398 tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
8399 tree min_narrow = build_vector_from_val (narrow_vectype,
8400 min_narrow_elttype);
8401
8402 /* Create a vector of 64. */
8403 auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
8404 tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
8405 half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
8406
8407 /* Emit: SUB_RES = VOP[0] - 128. */
8408 tree sub_res = make_ssa_name (narrow_vectype);
8409 new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
8410 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8411
8412 /* Emit:
8413
8414 STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
8415 STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
8416 STAGE3 = DOT_PROD_EXPR <SUB_RES, -128, STAGE2>;
8417
8418 on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
8419 Doing the two 64 * y steps first allows more time to compute x. */
8420 tree stage1 = make_ssa_name (wide_vectype);
8421 new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
8422 vop[1], half_narrow, vop[2]);
8423 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8424
8425 tree stage2 = make_ssa_name (wide_vectype);
8426 new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
8427 vop[1], half_narrow, stage1);
8428 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8429
8430 tree stage3 = make_ssa_name (wide_vectype);
8431 new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
8432 sub_res, vop[1], stage2);
8433 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8434
8435 /* Convert STAGE3 to the reduction type. */
8436 return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
8437 }
8438
/* Transform the definition stmt STMT_INFO of a reduction PHI backedge
   value, emitting the vectorized reduction statements into the loop body.
   Fold-left (in-order) reductions are dispatched to
   vectorize_fold_left_reduction; all other kinds are handled here.  */

bool
vect_transform_reduction (loop_vec_info loop_vinfo,
			  stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
			  gimple **vec_stmt, slp_tree slp_node)
{
  tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  int i;
  int ncopies;
  int vec_num;

  stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
  gcc_assert (reduc_info->is_reduc_info);

  /* A reduction in the inner loop of an outer loop must be part of a
     double reduction.  */
  if (nested_in_vect_loop_p (loop, stmt_info))
    {
      loop = loop->inner;
      gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
    }

  gimple_match_op op;
  if (!gimple_extract_op (stmt_info->stmt, &op))
    gcc_unreachable ();

  /* All uses but the last are expected to be defined in the loop.
     The last use is the reduction variable.  In case of nested cycle this
     assumption is not true: we use reduc_index to record the index of the
     reduction variable.  */
  stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
  gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
  int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
  tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);

  /* With SLP the copies are implicit in the number of SLP vector stmts;
     otherwise derive the copy count from the input vector type.  */
  if (slp_node)
    {
      ncopies = 1;
      vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
    }
  else
    {
      ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
      vec_num = 1;
    }

  code_helper code = canonicalize_code (op.code, op.type);
  internal_fn cond_fn = get_conditional_internal_fn (code, op.type);

  vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
  vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
  bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);

  /* Transform.  */
  tree new_temp = NULL_TREE;
  auto_vec<tree> vec_oprnds0;
  auto_vec<tree> vec_oprnds1;
  auto_vec<tree> vec_oprnds2;
  tree def0;

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");

  /* FORNOW: Multiple types are not supported for condition.  */
  if (code == COND_EXPR)
    gcc_assert (ncopies == 1);

  /* A binary COND_OP reduction must have the same definition and else
     value.  */
  bool cond_fn_p = code.is_internal_fn ()
    && conditional_internal_fn_code (internal_fn (code)) != ERROR_MARK;
  if (cond_fn_p)
    {
      gcc_assert (code == IFN_COND_ADD || code == IFN_COND_SUB
		  || code == IFN_COND_MUL || code == IFN_COND_AND
		  || code == IFN_COND_IOR || code == IFN_COND_XOR
		  || code == IFN_COND_MIN || code == IFN_COND_MAX);
      gcc_assert (op.num_ops == 4
		  && (op.ops[reduc_index]
		      == op.ops[internal_fn_else_index ((internal_fn) code)]));
    }

  bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);

  /* In-order reductions are handled by a dedicated routine; everything
     below assumes the operation can be reassociated.  */
  vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
  if (reduction_type == FOLD_LEFT_REDUCTION)
    {
      internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
      gcc_assert (code.is_tree_code () || cond_fn_p);
      return vectorize_fold_left_reduction
	  (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
	   code, reduc_fn, op.ops, op.num_ops, vectype_in,
	   reduc_index, masks, lens);
    }

  /* Only lane-reducing codes reach here without being forced into a
     single def-use cycle (see the assertion below).  */
  bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
  gcc_assert (single_defuse_cycle
	      || code == DOT_PROD_EXPR
	      || code == WIDEN_SUM_EXPR
	      || code == SAD_EXPR);

  /* Create the destination vector  */
  tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
  tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);

  /* Get NCOPIES vector definitions for all operands except the reduction
     definition.  */
  if (!cond_fn_p)
    {
      vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
			 single_defuse_cycle && reduc_index == 0
			 ? NULL_TREE : op.ops[0], &vec_oprnds0,
			 single_defuse_cycle && reduc_index == 1
			 ? NULL_TREE : op.ops[1], &vec_oprnds1,
			 op.num_ops == 3
			 && !(single_defuse_cycle && reduc_index == 2)
			 ? op.ops[2] : NULL_TREE, &vec_oprnds2);
    }
  else
    {
      /* For a conditional operation pass the truth type as mask
	 vectype.  */
      gcc_assert (single_defuse_cycle
		  && (reduc_index == 1 || reduc_index == 2));
      vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
			 op.ops[0], truth_type_for (vectype_in), &vec_oprnds0,
			 reduc_index == 1 ? NULL_TREE : op.ops[1],
			 NULL_TREE, &vec_oprnds1,
			 reduc_index == 2 ? NULL_TREE : op.ops[2],
			 NULL_TREE, &vec_oprnds2);
    }

  /* For single def-use cycles get one copy of the vectorized reduction
     definition.  Later copies are fed from the lhs of the previous
     iteration's statement (see the safe_push calls below).  */
  if (single_defuse_cycle)
    {
      gcc_assert (!slp_node);
      vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
				     op.ops[reduc_index],
				     reduc_index == 0 ? &vec_oprnds0
				     : (reduc_index == 1 ? &vec_oprnds1
					: &vec_oprnds2));
    }

  /* Emit one vector statement per copy.  */
  bool emulated_mixed_dot_prod
    = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
  FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
    {
      gimple *new_stmt;
      tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
      if (masked_loop_p && !mask_by_cond_expr)
	{
	  /* No conditional ifns have been defined for dot-product yet.  */
	  gcc_assert (code != DOT_PROD_EXPR);

	  /* Make sure that the reduction accumulator is vop[0].  */
	  if (reduc_index == 1)
	    {
	      gcc_assert (commutative_binary_op_p (code, op.type));
	      std::swap (vop[0], vop[1]);
	    }
	  /* Use the accumulator as the else value so inactive lanes pass
	     it through unchanged.  */
	  tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
					  vec_num * ncopies, vectype_in, i);
	  gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
						    vop[0], vop[1], vop[0]);
	  new_temp = make_ssa_name (vec_dest, call);
	  gimple_call_set_lhs (call, new_temp);
	  gimple_call_set_nothrow (call, true);
	  vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
	  new_stmt = call;
	}
      else
	{
	  if (op.num_ops >= 3)
	    vop[2] = vec_oprnds2[i];

	  /* Apply the loop mask by rewriting the non-accumulator operand
	     through a VEC_COND_EXPR.  */
	  if (masked_loop_p && mask_by_cond_expr)
	    {
	      tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
					      vec_num * ncopies, vectype_in, i);
	      build_vect_cond_expr (code, vop, mask, gsi);
	    }

	  if (emulated_mixed_dot_prod)
	    new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
						    vec_dest, vop);

	  else if (code.is_internal_fn () && !cond_fn_p)
	    new_stmt = gimple_build_call_internal (internal_fn (code),
						   op.num_ops,
						   vop[0], vop[1], vop[2]);
	  else if (code.is_internal_fn () && cond_fn_p)
	    /* A conditional reduction repeats the accumulator (vop[1]) as
	       the else operand, matching the assertion above.  */
	    new_stmt = gimple_build_call_internal (internal_fn (code),
						   op.num_ops,
						   vop[0], vop[1], vop[2],
						   vop[1]);
	  else
	    new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
					    vop[0], vop[1], vop[2]);
	  new_temp = make_ssa_name (vec_dest, new_stmt);
	  gimple_set_lhs (new_stmt, new_temp);
	  vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
	}

      if (slp_node)
	slp_node->push_vec_def (new_stmt);
      else if (single_defuse_cycle
	       && i < ncopies - 1)
	{
	  /* Chain the accumulator into the next copy's operand slot.  */
	  if (reduc_index == 0)
	    vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
	  else if (reduc_index == 1)
	    vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
	  else if (reduc_index == 2)
	    vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
	}
      else
	STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
    }

  /* For non-SLP report the first generated vector statement.  */
  if (!slp_node)
    *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];

  return true;
}
8665
/* Transform phase of a cycle PHI: create the vectorized PHI nodes for a
   reduction or nested cycle and set their loop-entry arguments.  The
   loop-latch arguments are filled in later, during epilogue processing.  */

bool
vect_transform_cycle_phi (loop_vec_info loop_vinfo,
			  stmt_vec_info stmt_info, gimple **vec_stmt,
			  slp_tree slp_node, slp_instance slp_node_instance)
{
  tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  int i;
  int ncopies;
  int j;
  bool nested_cycle = false;
  int vec_num;

  if (nested_in_vect_loop_p (loop, stmt_info))
    {
      loop = loop->inner;
      nested_cycle = true;
    }

  stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
  reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
  stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
  gcc_assert (reduc_info->is_reduc_info);

  if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
      || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
    /* Leave the scalar phi in place.  */
    return true;

  tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
  /* For a nested cycle we do not fill the above.  */
  if (!vectype_in)
    vectype_in = STMT_VINFO_VECTYPE (stmt_info);
  gcc_assert (vectype_in);

  if (slp_node)
    {
      /* The size vect_schedule_slp_instance computes is off for us.  */
      vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
				      * SLP_TREE_LANES (slp_node), vectype_in);
      ncopies = 1;
    }
  else
    {
      vec_num = 1;
      ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
    }

  /* Check whether we should use a single PHI node and accumulate
     vectors to one before the backedge.  */
  if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
    ncopies = 1;

  /* Create the destination vector  */
  gphi *phi = as_a <gphi *> (stmt_info->stmt);
  tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
					       vectype_out);

  /* Get the loop-entry arguments.  */
  tree vec_initial_def = NULL_TREE;
  auto_vec<tree> vec_initial_defs;
  if (slp_node)
    {
      vec_initial_defs.reserve (vec_num);
      if (nested_cycle)
	{
	  /* For a nested cycle the initial values are the vectorized defs
	     of the PHI's preheader argument.  */
	  unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
	  vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
			     &vec_initial_defs);
	}
      else
	{
	  gcc_assert (slp_node == slp_node_instance->reduc_phis);
	  vec<tree> &initial_values = reduc_info->reduc_initial_values;
	  vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);

	  /* Collect the scalar initial value of each PHI; a reduction
	     chain contributes only one.  */
	  unsigned int num_phis = stmts.length ();
	  if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
	    num_phis = 1;
	  initial_values.reserve (num_phis);
	  for (unsigned int i = 0; i < num_phis; ++i)
	    {
	      gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
	      initial_values.quick_push (vect_phi_initial_value (this_phi));
	    }
	  if (vec_num == 1)
	    vect_find_reusable_accumulator (loop_vinfo, reduc_info);
	  if (!initial_values.is_empty ())
	    {
	      tree initial_value
		= (num_phis == 1 ? initial_values[0] : NULL_TREE);
	      code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
	      tree neutral_op
		= neutral_op_for_reduction (TREE_TYPE (vectype_out),
					    code, initial_value);
	      get_initial_defs_for_reduction (loop_vinfo, reduc_info,
					      &vec_initial_defs, vec_num,
					      stmts.length (), neutral_op);
	    }
	}
    }
  else
    {
      /* Get at the scalar def before the loop, that defines the initial
	 value of the reduction variable.  */
      tree initial_def = vect_phi_initial_value (phi);
      reduc_info->reduc_initial_values.safe_push (initial_def);
      /* Optimize: if initial_def is for REDUC_MAX smaller than the base
	 and we can't use zero for induc_val, use initial_def.  Similarly
	 for REDUC_MIN and initial_def larger than the base.  */
      if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
	{
	  tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
	  if (TREE_CODE (initial_def) == INTEGER_CST
	      && !integer_zerop (induc_val)
	      && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
		   && tree_int_cst_lt (initial_def, induc_val))
		  || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
		      && tree_int_cst_lt (induc_val, initial_def))))
	    {
	      induc_val = initial_def;
	      /* Communicate we used the initial_def to epilogue
		 generation.  */
	      STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
	    }
	  vec_initial_def = build_vector_from_val (vectype_out, induc_val);
	}
      else if (nested_cycle)
	{
	  /* Do not use an adjustment def as that case is not supported
	     correctly if ncopies is not one.  */
	  vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
					 ncopies, initial_def,
					 &vec_initial_defs);
	}
      else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
	       || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
	/* Fill the initial vector with the initial scalar value.  */
	vec_initial_def
	  = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
					   initial_def, initial_def);
      else
	{
	  if (ncopies == 1)
	    vect_find_reusable_accumulator (loop_vinfo, reduc_info);
	  if (!reduc_info->reduc_initial_values.is_empty ())
	    {
	      initial_def = reduc_info->reduc_initial_values[0];
	      code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
	      tree neutral_op
		= neutral_op_for_reduction (TREE_TYPE (initial_def),
					    code, initial_def);
	      gcc_assert (neutral_op);
	      /* Try to simplify the vector initialization by applying an
		 adjustment after the reduction has been performed.  */
	      if (!reduc_info->reused_accumulator
		  && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
		  && !operand_equal_p (neutral_op, initial_def))
		{
		  STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
		    = initial_def;
		  initial_def = neutral_op;
		}
	      vec_initial_def
		= get_initial_def_for_reduction (loop_vinfo, reduc_info,
						 initial_def, neutral_op);
	    }
	}
    }

  /* A single initial vector serves every copy.  */
  if (vec_initial_def)
    {
      vec_initial_defs.create (ncopies);
      for (i = 0; i < ncopies; ++i)
	vec_initial_defs.quick_push (vec_initial_def);
    }

  /* When reusing an accumulator from the main loop, massage it into a
     form compatible with this loop's vector type.  */
  if (auto *accumulator = reduc_info->reused_accumulator)
    {
      tree def = accumulator->reduc_input;
      if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
	{
	  unsigned int nreduc;
	  bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
					    (TREE_TYPE (def)),
					  TYPE_VECTOR_SUBPARTS (vectype_out),
					  &nreduc);
	  gcc_assert (res);
	  gimple_seq stmts = NULL;
	  /* Reduce the single vector to a smaller one.  */
	  if (nreduc != 1)
	    {
	      /* Perform the reduction in the appropriate type.  */
	      tree rvectype = vectype_out;
	      if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
					      TREE_TYPE (TREE_TYPE (def))))
		rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
					      TYPE_VECTOR_SUBPARTS
						(vectype_out));
	      def = vect_create_partial_epilog (def, rvectype,
						STMT_VINFO_REDUC_CODE
						  (reduc_info),
						&stmts);
	    }
	  /* The epilogue loop might use a different vector mode, like
	     VNx2DI vs. V2DI.  */
	  if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
	    {
	      tree reduc_type = build_vector_type_for_mode
		(TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
	      def = gimple_convert (&stmts, reduc_type, def);
	    }
	  /* Adjust the input so we pick up the partially reduced value
	     for the skip edge in vect_create_epilog_for_reduction.  */
	  accumulator->reduc_input = def;
	  /* And the reduction could be carried out using a different sign.  */
	  if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
	    def = gimple_convert (&stmts, vectype_out, def);
	  if (loop_vinfo->main_loop_edge)
	    {
	      /* While we'd like to insert on the edge this will split
		 blocks and disturb bookkeeping, we also will eventually
		 need this on the skip edge.  Rely on sinking to
		 fixup optimal placement and insert in the pred.  */
	      gimple_stmt_iterator gsi
		= gsi_last_bb (loop_vinfo->main_loop_edge->src);
	      /* Insert before a cond that eventually skips the
		 epilogue.  */
	      if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
		gsi_prev (&gsi);
	      gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
	    }
	  else
	    gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
					      stmts);
	}
      if (loop_vinfo->main_loop_edge)
	vec_initial_defs[0]
	  = vect_get_main_loop_result (loop_vinfo, def,
				       vec_initial_defs[0]);
      else
	vec_initial_defs.safe_push (def);
    }

  /* Generate the reduction PHIs upfront.  */
  for (i = 0; i < vec_num; i++)
    {
      tree vec_init_def = vec_initial_defs[i];
      for (j = 0; j < ncopies; j++)
	{
	  /* Create the reduction-phi that defines the reduction
	     operand.  */
	  gphi *new_phi = create_phi_node (vec_dest, loop->header);

	  /* Set the loop-entry arg of the reduction-phi.  */
	  if (j != 0 && nested_cycle)
	    vec_init_def = vec_initial_defs[j];
	  add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
		       UNKNOWN_LOCATION);

	  /* The loop-latch arg is set in epilogue processing.  */

	  if (slp_node)
	    slp_node->push_vec_def (new_phi);
	  else
	    {
	      if (j == 0)
		*vec_stmt = new_phi;
	      STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
	    }
	}
    }

  return true;
}
8943
8944 /* Vectorizes LC PHIs. */
8945
8946 bool
8947 vectorizable_lc_phi (loop_vec_info loop_vinfo,
8948 stmt_vec_info stmt_info, gimple **vec_stmt,
8949 slp_tree slp_node)
8950 {
8951 if (!loop_vinfo
8952 || !is_a <gphi *> (stmt_info->stmt)
8953 || gimple_phi_num_args (stmt_info->stmt) != 1)
8954 return false;
8955
8956 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8957 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8958 return false;
8959
8960 if (!vec_stmt) /* transformation not required. */
8961 {
8962 /* Deal with copies from externs or constants that disguise as
8963 loop-closed PHI nodes (PR97886). */
8964 if (slp_node
8965 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8966 SLP_TREE_VECTYPE (slp_node)))
8967 {
8968 if (dump_enabled_p ())
8969 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8970 "incompatible vector types for invariants\n");
8971 return false;
8972 }
8973 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
8974 return true;
8975 }
8976
8977 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8978 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8979 basic_block bb = gimple_bb (stmt_info->stmt);
8980 edge e = single_pred_edge (bb);
8981 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8982 auto_vec<tree> vec_oprnds;
8983 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
8984 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
8985 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8986 for (unsigned i = 0; i < vec_oprnds.length (); i++)
8987 {
8988 /* Create the vectorized LC PHI node. */
8989 gphi *new_phi = create_phi_node (vec_dest, bb);
8990 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8991 if (slp_node)
8992 slp_node->push_vec_def (new_phi);
8993 else
8994 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8995 }
8996 if (!slp_node)
8997 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8998
8999 return true;
9000 }
9001
/* Vectorizes PHIs.  Only SLP-vectorized internal-def PHIs are handled;
   at analysis time verify all incoming defs are vectorizable with a
   compatible vector type and record the PHI cost, at transform time
   create the vector PHI nodes and fill in the already-available
   incoming arguments (the remaining ones are filled in later by the
   SLP scheduler).  */

bool
vectorizable_phi (vec_info *,
		  stmt_vec_info stmt_info, gimple **vec_stmt,
		  slp_tree slp_node, stmt_vector_for_cost *cost_vec)
{
  if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
    return false;

  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
    return false;

  tree vectype = SLP_TREE_VECTYPE (slp_node);

  if (!vec_stmt) /* transformation not required.  */
    {
      slp_tree child;
      unsigned i;
      /* Every incoming edge needs a vectorized (or invariant) definition
	 of a type compatible with the PHI's vector type.  */
      FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
	if (!child)
	  {
	    if (dump_enabled_p ())
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "PHI node with unvectorized backedge def\n");
	    return false;
	  }
	else if (!vect_maybe_update_slp_op_vectype (child, vectype))
	  {
	    if (dump_enabled_p ())
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "incompatible vector types for invariants\n");
	    return false;
	  }
	else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
		 && !useless_type_conversion_p (vectype,
						SLP_TREE_VECTYPE (child)))
	  {
	    /* With bools we can have mask and non-mask precision vectors
	       or different non-mask precisions.  While pattern recog is
	       supposed to guarantee consistency here, bugs in it can cause
	       mismatches (PR103489 and PR103800 for example).
	       Deal with them here instead of ICEing later.  */
	    if (dump_enabled_p ())
	      dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			       "incompatible vector type setup from "
			       "bool pattern detection\n");
	    return false;
	  }

      /* For single-argument PHIs assume coalescing which means zero cost
	 for the scalar and the vector PHIs.  This avoids artificially
	 favoring the vector path (but may pessimize it in some cases).  */
      if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
	record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
			  vector_stmt, stmt_info, vectype, 0, vect_body);
      STMT_VINFO_TYPE (stmt_info) = phi_info_type;
      return true;
    }

  tree scalar_dest = gimple_phi_result (stmt_info->stmt);
  basic_block bb = gimple_bb (stmt_info->stmt);
  tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
  auto_vec<gphi *> new_phis;
  for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
    {
      slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];

      /* Skip not yet vectorized defs.  Their PHI arguments are filled
	 in later, once they are available.  */
      if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
	  && SLP_TREE_VEC_DEFS (child).is_empty ())
	continue;

      auto_vec<tree> vec_oprnds;
      vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
      if (!new_phis.exists ())
	{
	  /* The first already-vectorized child determines how many
	     vector PHIs to create; create them lazily here.  */
	  new_phis.create (vec_oprnds.length ());
	  for (unsigned j = 0; j < vec_oprnds.length (); j++)
	    {
	      /* Create the vectorized LC PHI node.  */
	      new_phis.quick_push (create_phi_node (vec_dest, bb));
	      slp_node->push_vec_def (new_phis[j]);
	    }
	}
      edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
      for (unsigned j = 0; j < vec_oprnds.length (); j++)
	add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
    }
  /* We should have at least one already vectorized child.  */
  gcc_assert (new_phis.exists ());

  return true;
}
9096
/* Vectorizes first order recurrences.  An overview of the transformation
   is described below.  Suppose we have the following loop.

     int t = 0;
     for (int i = 0; i < n; ++i)
       {
	 b[i] = a[i] - t;
	 t = a[i];
       }

   There is a first-order recurrence on 'a'.  For this loop, the scalar IR
   looks (simplified) like:

    scalar.preheader:
      init = 0;

    scalar.body:
      i = PHI <0(scalar.preheader), i+1(scalar.body)>
      _2 = PHI <init(scalar.preheader), _1(scalar.body)>
      _1 = a[i]
      b[i] = _1 - _2
      if (i < n) goto scalar.body

   In this example, _2 is a recurrence because its value depends on the
   previous iteration.  We vectorize this as (VF = 4)

    vector.preheader:
      vect_init = vect_cst(..., ..., ..., 0)

    vector.body
      i = PHI <0(vector.preheader), i+4(vector.body)>
      vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
      vect_2 = a[i, i+1, i+2, i+3];
      vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
      b[i, i+1, i+2, i+3] = vect_2 - vect_3
      if (..) goto vector.body

   In this function, vectorizable_recurr, we code generate both the
   vector PHI node and the permute since those together compute the
   vectorized value of the scalar PHI.  We do not yet have the
   backedge value to fill in there nor into the vec_perm.  Those
   are filled in maybe_set_vectorized_backedge_value and
   vect_schedule_scc.

   TODO:  Since the scalar loop does not have a use of the recurrence
   outside of the loop the natural way to implement peeling via
   vectorizing the live value doesn't work.  For now peeling of loops
   with a recurrence is not implemented.  For SLP the supported cases
   are restricted to those requiring a single vector recurrence PHI.  */

bool
vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
		     gimple **vec_stmt, slp_tree slp_node,
		     stmt_vector_for_cost *cost_vec)
{
  if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
    return false;

  gphi *phi = as_a<gphi *> (stmt_info->stmt);

  /* So far we only support first-order recurrence auto-vectorization.  */
  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
    return false;

  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  unsigned ncopies;
  if (slp_node)
    ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
  else
    ncopies = vect_get_num_copies (loop_vinfo, vectype);
  poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  /* DIST is the number of lanes the recurrence spans; for SLP each
     group lane carries its own recurrence value.  */
  unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
  /* We need to be able to make progress with a single vector.  */
  if (maybe_gt (dist * 2, nunits))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "first order recurrence exceeds half of "
			 "a vector\n");
      return false;
    }

  /* First-order recurrence autovectorization needs to handle permutation
     with indices = [nunits-1, nunits, nunits+1, ...].  */
  vec_perm_builder sel (nunits, 1, 3);
  for (int i = 0; i < 3; ++i)
    sel.quick_push (nunits - dist + i);
  vec_perm_indices indices (sel, 2, nunits);

  if (!vec_stmt) /* transformation not required.  */
    {
      if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
				 indices))
	return false;

      if (slp_node)
	{
	  /* We eventually need to set a vector type on invariant
	     arguments.  */
	  unsigned j;
	  slp_tree child;
	  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
	    if (!vect_maybe_update_slp_op_vectype
		  (child, SLP_TREE_VECTYPE (slp_node)))
	      {
		if (dump_enabled_p ())
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "incompatible vector types for "
				   "invariants\n");
		return false;
	      }
	}

      /* Verify we have set up compatible types.  */
      edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
      tree latch_vectype = NULL_TREE;
      if (slp_node)
	{
	  slp_tree latch_def = SLP_TREE_CHILDREN (slp_node)[le->dest_idx];
	  latch_vectype = SLP_TREE_VECTYPE (latch_def);
	}
      else
	{
	  tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, le);
	  if (TREE_CODE (latch_def) == SSA_NAME)
	    {
	      stmt_vec_info latch_def_info = loop_vinfo->lookup_def (latch_def);
	      latch_def_info = vect_stmt_to_vectorize (latch_def_info);
	      latch_vectype = STMT_VINFO_VECTYPE (latch_def_info);
	    }
	}
      if (!types_compatible_p (latch_vectype, vectype))
	return false;

      /* The recurrence costs the initialization vector and one permute
	 for each copy.  */
      unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
						 stmt_info, 0, vect_prologue);
      unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
					       stmt_info, 0, vect_body);
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "vectorizable_recurr: inside_cost = %d, "
			 "prologue_cost = %d .\n", inside_cost,
			 prologue_cost);

      STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
      return true;
    }

  /* Transform: splat the preheader value into the initial vector,
     converting it to the vector element type first if needed.  */
  edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
  basic_block bb = gimple_bb (phi);
  tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
  if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
    {
      gimple_seq stmts = NULL;
      preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
      gsi_insert_seq_on_edge_immediate (pe, stmts);
    }
  tree vec_init = build_vector_from_val (vectype, preheader);
  vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);

  /* Create the vectorized first-order PHI node.  */
  tree vec_dest = vect_get_new_vect_var (vectype,
					 vect_simple_var, "vec_recur_");
  gphi *new_phi = create_phi_node (vec_dest, bb);
  add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);

  /* Insert the shuffles for the first-order recurrence autovectorization:
       result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>.  */
  tree perm = vect_gen_perm_mask_checked (vectype, indices);

  /* Insert the required permute after the latch definition.  The
     second and later operands are tentative and will be updated when we have
     vectorized the latch definition.  */
  edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
  gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
  gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
  gsi_next (&gsi2);

  for (unsigned i = 0; i < ncopies; ++i)
    {
      vec_dest = make_ssa_name (vectype);
      gassign *vperm
	= gimple_build_assign (vec_dest, VEC_PERM_EXPR,
			       i == 0 ? gimple_phi_result (new_phi) : NULL,
			       NULL, perm);
      vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);

      if (slp_node)
	slp_node->push_vec_def (vperm);
      else
	STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
    }

  if (!slp_node)
    *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
  return true;
}
9296
9297 /* Return true if VECTYPE represents a vector that requires lowering
9298 by the vector lowering pass. */
9299
9300 bool
9301 vect_emulated_vector_p (tree vectype)
9302 {
9303 return (!VECTOR_MODE_P (TYPE_MODE (vectype))
9304 && (!VECTOR_BOOLEAN_TYPE_P (vectype)
9305 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
9306 }
9307
9308 /* Return true if we can emulate CODE on an integer mode representation
9309 of a vector. */
9310
9311 bool
9312 vect_can_vectorize_without_simd_p (tree_code code)
9313 {
9314 switch (code)
9315 {
9316 case PLUS_EXPR:
9317 case MINUS_EXPR:
9318 case NEGATE_EXPR:
9319 case BIT_AND_EXPR:
9320 case BIT_IOR_EXPR:
9321 case BIT_XOR_EXPR:
9322 case BIT_NOT_EXPR:
9323 return true;
9324
9325 default:
9326 return false;
9327 }
9328 }
9329
9330 /* Likewise, but taking a code_helper. */
9331
9332 bool
9333 vect_can_vectorize_without_simd_p (code_helper code)
9334 {
9335 return (code.is_tree_code ()
9336 && vect_can_vectorize_without_simd_p (tree_code (code)));
9337 }
9338
/* Create vector init for vectorized iv.  Emit into STMTS the statements
   computing the initial vector [X, f(X), f(f(X)), ...] for a nonlinear
   induction with initial value INIT_EXPR and per-iteration step STEP_EXPR,
   where f is determined by INDUCTION_TYPE (shift, negate or multiply).
   NUNITS is the number of vector lanes and VECTYPE the vector type of
   the result.  */
static tree
vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
			       tree step_expr, poly_uint64 nunits,
			       tree vectype,
			       enum vect_induction_op_type induction_type)
{
  unsigned HOST_WIDE_INT const_nunits;
  tree vec_shift, vec_init, new_name;
  unsigned i;
  tree itype = TREE_TYPE (vectype);

  /* iv_loop is the loop to be vectorized.  Create:
     vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr).  */
  new_name = gimple_convert (stmts, itype, init_expr);
  switch (induction_type)
    {
    case vect_step_op_shr:
    case vect_step_op_shl:
      /* Build the initial value from shift_expr: splat X and shift lane i
	 by i * S via a VEC_SERIES shift-amount vector [0, S, 2*S, ...].  */
      vec_init = gimple_build_vector_from_val (stmts,
					       vectype,
					       new_name);
      vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
				build_zero_cst (itype), step_expr);
      vec_init = gimple_build (stmts,
			       (induction_type == vect_step_op_shr
				? RSHIFT_EXPR : LSHIFT_EXPR),
			       vectype, vec_init, vec_shift);
      break;

    case vect_step_op_neg:
      {
	/* Interleave X and -X to get [X, -X, X, -X, ...].  */
	vec_init = gimple_build_vector_from_val (stmts,
						 vectype,
						 new_name);
	tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
				     vectype, vec_init);
	/* The encoding has 2 interleaved stepped patterns.  */
	vec_perm_builder sel (nunits, 2, 3);
	sel.quick_grow (6);
	for (i = 0; i < 3; i++)
	  {
	    sel[2 * i] = i;
	    sel[2 * i + 1] = i + nunits;
	  }
	vec_perm_indices indices (sel, 2, nunits);
	/* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
	   fail when vec_init is const vector.  In that situation vec_perm is
	   not really needed.  */
	tree perm_mask_even
	  = vect_gen_perm_mask_any (vectype, indices);
	vec_init = gimple_build (stmts, VEC_PERM_EXPR,
				 vectype,
				 vec_init, vec_neg,
				 perm_mask_even);
      }
      break;

    case vect_step_op_mul:
      {
	/* Use unsigned mult to avoid UD integer overflow.  */
	gcc_assert (nunits.is_constant (&const_nunits));
	tree utype = unsigned_type_for (itype);
	tree uvectype = build_vector_type (utype,
					   TYPE_VECTOR_SUBPARTS (vectype));
	new_name = gimple_convert (stmts, utype, new_name);
	vec_init = gimple_build_vector_from_val (stmts,
						 uvectype,
						 new_name);
	tree_vector_builder elts (uvectype, const_nunits, 1);
	tree elt_step = build_one_cst (utype);

	elts.quick_push (elt_step);
	for (i = 1; i < const_nunits; i++)
	  {
	    /* Create: elt_step = pow (step, i) for lane i.  */
	    elt_step = gimple_build (stmts, MULT_EXPR,
				     utype, elt_step, step_expr);
	    elts.quick_push (elt_step);
	  }
	/* Create a vector from [new_name_0, new_name_1, ...,
	   new_name_nunits-1].  */
	tree vec_mul = gimple_build_vector (stmts, &elts);
	vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
				 vec_init, vec_mul);
	vec_init = gimple_convert (stmts, vectype, vec_init);
      }
      break;

    default:
      gcc_unreachable ();
    }

  return vec_init;
}
9435
/* Peel init_expr by skip_niter for induction_type.  I.e. advance the
   nonlinear induction's initial value INIT_EXPR by SKIP_NITERS (a compile
   time constant) applications of STEP_EXPR, emitting any needed statements
   into STMTS, and return the peeled initial value.  */
tree
vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
			     tree skip_niters, tree step_expr,
			     enum vect_induction_op_type induction_type)
{
  gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
  tree type = TREE_TYPE (init_expr);
  unsigned prec = TYPE_PRECISION (type);
  switch (induction_type)
    {
    case vect_step_op_neg:
      /* Negation is self-inverse: only an odd number of skipped
	 iterations changes the value.  */
      if (TREE_INT_CST_LOW (skip_niters) % 2)
	init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
      /* else no change.  */
      break;

    case vect_step_op_shr:
    case vect_step_op_shl:
      skip_niters = gimple_convert (stmts, type, skip_niters);
      step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
      /* When shift amount >= precision, need to avoid UD.
	 In the original loop, there's no UD, and according to semantic,
	 init_expr should be 0 for lshr, ashl, and >>= (prec - 1) for ashr.  */
      if (!tree_fits_uhwi_p (step_expr)
	  || tree_to_uhwi (step_expr) >= prec)
	{
	  if (induction_type == vect_step_op_shl
	      || TYPE_UNSIGNED (type))
	    init_expr = build_zero_cst (type);
	  else
	    init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
				      init_expr,
				      wide_int_to_tree (type, prec - 1));
	}
      else
	init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
					  ? RSHIFT_EXPR : LSHIFT_EXPR),
				  type, init_expr, step_expr);
      break;

    case vect_step_op_mul:
      {
	/* Compute step_expr ** skip_niters modulo 2**prec with GMP and
	   fold it into a single multiplication; done in the unsigned
	   type to avoid UD from signed overflow.  */
	tree utype = unsigned_type_for (type);
	init_expr = gimple_convert (stmts, utype, init_expr);
	wide_int skipn = wi::to_wide (skip_niters);
	wide_int begin = wi::to_wide (step_expr);
	auto_mpz base, exp, mod, res;
	wi::to_mpz (begin, base, TYPE_SIGN (type));
	wi::to_mpz (skipn, exp, UNSIGNED);
	mpz_ui_pow_ui (mod, 2, TYPE_PRECISION (type));
	mpz_powm (res, base, exp, mod);
	begin = wi::from_mpz (utype, res, true);
	tree mult_expr = wide_int_to_tree (utype, begin);
	init_expr = gimple_build (stmts, MULT_EXPR, utype,
				  init_expr, mult_expr);
	init_expr = gimple_convert (stmts, type, init_expr);
      }
      break;

    default:
      gcc_unreachable ();
    }

  return init_expr;
}
9502
9503 /* Create vector step for vectorized iv. */
9504 static tree
9505 vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9506 poly_uint64 vf,
9507 enum vect_induction_op_type induction_type)
9508 {
9509 tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9510 tree new_name = NULL;
9511 /* Step should be pow (step, vf) for mult induction. */
9512 if (induction_type == vect_step_op_mul)
9513 {
9514 gcc_assert (vf.is_constant ());
9515 wide_int begin = wi::to_wide (step_expr);
9516
9517 for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9518 begin = wi::mul (begin, wi::to_wide (step_expr));
9519
9520 new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9521 }
9522 else if (induction_type == vect_step_op_neg)
9523 /* Do nothing. */
9524 ;
9525 else
9526 new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9527 expr, step_expr);
9528 return new_name;
9529 }
9530
9531 static tree
9532 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9533 stmt_vec_info stmt_info,
9534 tree new_name, tree vectype,
9535 enum vect_induction_op_type induction_type)
9536 {
9537 /* No step is needed for neg induction. */
9538 if (induction_type == vect_step_op_neg)
9539 return NULL;
9540
9541 tree t = unshare_expr (new_name);
9542 gcc_assert (CONSTANT_CLASS_P (new_name)
9543 || TREE_CODE (new_name) == SSA_NAME);
9544 tree new_vec = build_vector_from_val (vectype, t);
9545 tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9546 new_vec, vectype, NULL);
9547 return vec_step;
9548 }
9549
9550 /* Update vectorized iv with vect_step, induc_def is init. */
9551 static tree
9552 vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9553 tree induc_def, tree vec_step,
9554 enum vect_induction_op_type induction_type)
9555 {
9556 tree vec_def = induc_def;
9557 switch (induction_type)
9558 {
9559 case vect_step_op_mul:
9560 {
9561 /* Use unsigned mult to avoid UD integer overflow. */
9562 tree uvectype
9563 = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
9564 TYPE_VECTOR_SUBPARTS (vectype));
9565 vec_def = gimple_convert (stmts, uvectype, vec_def);
9566 vec_step = gimple_convert (stmts, uvectype, vec_step);
9567 vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9568 vec_def, vec_step);
9569 vec_def = gimple_convert (stmts, vectype, vec_def);
9570 }
9571 break;
9572
9573 case vect_step_op_shr:
9574 vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9575 vec_def, vec_step);
9576 break;
9577
9578 case vect_step_op_shl:
9579 vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9580 vec_def, vec_step);
9581 break;
9582 case vect_step_op_neg:
9583 vec_def = induc_def;
9584 /* Do nothing. */
9585 break;
9586 default:
9587 gcc_unreachable ();
9588 }
9589
9590 return vec_def;
9591
9592 }
9593
/* Function vectorizable_nonlinear_induction

   Check if STMT_INFO performs a nonlinear induction computation that can be
   vectorized.  If VEC_STMT is also passed, vectorize the induction PHI: create
   a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
   basic block.
   Return true if STMT_INFO is vectorizable in this way.  */

static bool
vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
				  stmt_vec_info stmt_info,
				  gimple **vec_stmt, slp_tree slp_node,
				  stmt_vector_for_cost *cost_vec)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  unsigned ncopies;
  bool nested_in_vect_loop = false;
  class loop *iv_loop;
  tree vec_def;
  edge pe = loop_preheader_edge (loop);
  basic_block new_bb;
  tree vec_init, vec_step;
  tree new_name;
  gimple *new_stmt;
  gphi *induction_phi;
  tree induc_def, vec_dest;
  tree init_expr, step_expr;
  tree niters_skip;
  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  unsigned i;
  gimple_stmt_iterator si;

  gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);

  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
  enum vect_induction_op_type induction_type
    = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);

  /* This function only handles nonlinear inductions; linear ones are
     handled elsewhere.  */
  gcc_assert (induction_type > vect_step_op_add);

  if (slp_node)
    ncopies = 1;
  else
    ncopies = vect_get_num_copies (loop_vinfo, vectype);
  gcc_assert (ncopies >= 1);

  /* FORNOW. Only handle nonlinear induction in the same loop.  */
  if (nested_in_vect_loop_p (loop, stmt_info))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "nonlinear induction in nested loop.\n");
      return false;
    }

  iv_loop = loop;
  gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);

  /* TODO: Support slp for nonlinear iv. There should be separate vector iv
     update for each iv and a permutation to generate wanted vector iv.  */
  if (slp_node)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "SLP induction not supported for nonlinear"
			 " induction.\n");
      return false;
    }

  if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "floating point nonlinear induction vectorization"
			 " not supported.\n");
      return false;
    }

  step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
  init_expr = vect_phi_initial_value (phi);
  gcc_assert (step_expr != NULL_TREE && init_expr != NULL
	      && TREE_CODE (step_expr) == INTEGER_CST);
  /* step_expr should be aligned with init_expr,
     .i.e. uint64 a >> 1, step is int, but vector<uint64> shift is used.  */
  step_expr = fold_convert (TREE_TYPE (vectype), step_expr);

  if (TREE_CODE (init_expr) == INTEGER_CST)
    init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
  else if (!tree_nop_conversion_p (TREE_TYPE (vectype), TREE_TYPE (init_expr)))
    {
      /* INIT_EXPR could be a bit_field, bail out for such case.  */
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "nonlinear induction vectorization failed:"
			 " component type of vectype is not a nop conversion"
			 " from type of init_expr.\n");
      return false;
    }

  /* Per-kind target support checks.  */
  switch (induction_type)
    {
    case vect_step_op_neg:
      if (maybe_eq (TYPE_VECTOR_SUBPARTS (vectype), 1u))
	return false;
      if (TREE_CODE (init_expr) != INTEGER_CST
	  && TREE_CODE (init_expr) != REAL_CST)
	{
	  /* Check for backend support of NEGATE_EXPR and vec_perm.  */
	  if (!directly_supported_p (NEGATE_EXPR, vectype))
	    return false;

	  /* The encoding has 2 interleaved stepped patterns.  */
	  vec_perm_builder sel (nunits, 2, 3);
	  machine_mode mode = TYPE_MODE (vectype);
	  sel.quick_grow (6);
	  for (i = 0; i < 3; i++)
	    {
	      sel[i * 2] = i;
	      sel[i * 2 + 1] = i + nunits;
	    }
	  vec_perm_indices indices (sel, 2, nunits);
	  if (!can_vec_perm_const_p (mode, mode, indices))
	    return false;
	}
      break;

    case vect_step_op_mul:
      {
	/* Check for backend support of MULT_EXPR.  */
	if (!directly_supported_p (MULT_EXPR, vectype))
	  return false;

	/* ?? How to construct vector step for variable number vector.
	   [ 1, step, pow (step, 2), pow (step, 4), .. ].  */
	if (!vf.is_constant ())
	  return false;
      }
      break;

    case vect_step_op_shr:
      /* Check for backend support of RSHIFT_EXPR.  */
      if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
	return false;

      /* Don't shift more than type precision to avoid UD.  */
      if (!tree_fits_uhwi_p (step_expr)
	  || maybe_ge (nunits * tree_to_uhwi (step_expr),
		       TYPE_PRECISION (TREE_TYPE (init_expr))))
	return false;
      break;

    case vect_step_op_shl:
      /* Check for backend support of LSHIFT_EXPR.  */
      if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
	return false;

      /* Don't shift more than type precision to avoid UD.  */
      if (!tree_fits_uhwi_p (step_expr)
	  || maybe_ge (nunits * tree_to_uhwi (step_expr),
		       TYPE_PRECISION (TREE_TYPE (init_expr))))
	return false;

      break;

    default:
      gcc_unreachable ();
    }

  if (!vec_stmt) /* transformation not required.  */
    {
      unsigned inside_cost = 0, prologue_cost = 0;
      /* loop cost for vec_loop.  */
      inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
				      stmt_info, 0, vect_body);

      /* Neg induction doesn't have any inside_cost: all the work is
	 done in the prologue.  */
      if (induction_type == vect_step_op_neg)
	inside_cost = 0;

      /* prologue cost for vec_init and vec_step.  */
      prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
					stmt_info, 0, vect_prologue);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "vect_model_induction_cost: inside_cost = %d, "
			 "prologue_cost = %d. \n", inside_cost,
			 prologue_cost);

      STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
      DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
      return true;
    }

  /* Transform.  */

  /* Compute a vector variable, initialized with the first VF values of
     the induction variable.  E.g., for an iv with IV_PHI='X' and
     evolution S, for a vector of 4 units, we want to compute:
     [X, X + S, X + 2*S, X + 3*S].  */

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");

  pe = loop_preheader_edge (iv_loop);
  /* Find the first insertion point in the BB.  */
  basic_block bb = gimple_bb (phi);
  si = gsi_after_labels (bb);

  gimple_seq stmts = NULL;

  niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
  /* If we are using the loop mask to "peel" for alignment then we need
     to adjust the start value here.  */
  if (niters_skip != NULL_TREE)
    init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
					     step_expr, induction_type);

  vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
					    step_expr, nunits, vectype,
					    induction_type);
  if (stmts)
    {
      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
      gcc_assert (!new_bb);
    }

  stmts = NULL;
  new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
					    vf, induction_type);
  if (stmts)
    {
      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
      gcc_assert (!new_bb);
    }

  vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
						new_name, vectype,
						induction_type);
  /* Create the following def-use cycle:
     loop prolog:
	  vec_init = ...
	  vec_step = ...
     loop:
	  vec_iv = PHI <vec_init, vec_loop>
	  ...
	  STMT
	  ...
	  vec_loop = vec_iv + vec_step;  */

  /* Create the induction-phi that defines the induction-operand.  */
  vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
  induction_phi = create_phi_node (vec_dest, iv_loop->header);
  induc_def = PHI_RESULT (induction_phi);

  /* Create the iv update inside the loop.  */
  stmts = NULL;
  vec_def = vect_update_nonlinear_iv (&stmts, vectype,
				      induc_def, vec_step,
				      induction_type);

  gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
  new_stmt = SSA_NAME_DEF_STMT (vec_def);

  /* Set the arguments of the phi node:  */
  add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
  add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
	       UNKNOWN_LOCATION);

  STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
  *vec_stmt = induction_phi;

  /* In case that vectorization factor (VF) is bigger than the number
     of elements that we can fit in a vectype (nunits), we have to generate
     more than one vector stmt - i.e - we need to "unroll" the
     vector stmt by a factor VF/nunits.  For more details see documentation
     in vectorizable_operation.  */

  if (ncopies > 1)
    {
      stmts = NULL;
      /* FORNOW. This restriction should be relaxed.  */
      gcc_assert (!nested_in_vect_loop);

      /* The per-copy step advances by NUNITS scalar iterations.  */
      new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
						nunits, induction_type);

      vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
						    new_name, vectype,
						    induction_type);
      vec_def = induc_def;
      for (i = 1; i < ncopies; i++)
	{
	  /* vec_i = vec_prev + vec_step.  */
	  stmts = NULL;
	  vec_def = vect_update_nonlinear_iv (&stmts, vectype,
					      vec_def, vec_step,
					      induction_type);
	  gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
	  new_stmt = SSA_NAME_DEF_STMT (vec_def);
	  STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
	}
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "transform induction: created def-use cycle: %G%G",
		     (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));

  return true;
}
9908
9909 /* Function vectorizable_induction
9910
9911 Check if STMT_INFO performs an induction computation that can be vectorized.
9912 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9913 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9914 Return true if STMT_INFO is vectorizable in this way. */
9915
bool
vectorizable_induction (loop_vec_info loop_vinfo,
			stmt_vec_info stmt_info,
			gimple **vec_stmt, slp_tree slp_node,
			stmt_vector_for_cost *cost_vec)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  unsigned ncopies;
  bool nested_in_vect_loop = false;
  class loop *iv_loop;
  tree vec_def;
  edge pe = loop_preheader_edge (loop);
  basic_block new_bb;
  tree new_vec, vec_init, vec_step, t;
  tree new_name;
  gimple *new_stmt;
  gphi *induction_phi;
  tree induc_def, vec_dest;
  tree init_expr, step_expr;
  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  unsigned i;
  tree expr;
  gimple_stmt_iterator si;
  enum vect_induction_op_type induction_type
    = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);

  /* An induction is represented by a loop-header PHI; fail for any
     other statement kind.  */
  gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
  if (!phi)
    return false;

  if (!STMT_VINFO_RELEVANT_P (stmt_info))
    return false;

  /* Make sure it was recognized as induction computation.  */
  if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
    return false;

  /* Handle nonlinear induction in a separate place.  */
  if (induction_type != vect_step_op_add)
    return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
					     vec_stmt, slp_node, cost_vec);

  tree vectype = STMT_VINFO_VECTYPE (stmt_info);
  poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);

  if (slp_node)
    ncopies = 1;
  else
    ncopies = vect_get_num_copies (loop_vinfo, vectype);
  gcc_assert (ncopies >= 1);

  /* FORNOW. These restrictions should be relaxed.  */
  if (nested_in_vect_loop_p (loop, stmt_info))
    {
      imm_use_iterator imm_iter;
      use_operand_p use_p;
      gimple *exit_phi;
      edge latch_e;
      tree loop_arg;

      if (ncopies > 1)
	{
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			     "multiple types in nested loop.\n");
	  return false;
	}

      /* Look for a use of the inner-loop latch value outside the inner
	 loop; if the only such use is a PHI that is neither relevant nor
	 live we cannot handle it.  */
      exit_phi = NULL;
      latch_e = loop_latch_edge (loop->inner);
      loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
	{
	  gimple *use_stmt = USE_STMT (use_p);
	  if (is_gimple_debug (use_stmt))
	    continue;

	  if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
	    {
	      exit_phi = use_stmt;
	      break;
	    }
	}
      if (exit_phi)
	{
	  stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
	  if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
		&& !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				 "inner-loop induction only used outside "
				 "of the outer vectorized loop.\n");
	      return false;
	    }
	}

      nested_in_vect_loop = true;
      iv_loop = loop->inner;
    }
  else
    iv_loop = loop;
  gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);

  if (slp_node && !nunits.is_constant ())
    {
      /* The current SLP code creates the step value element-by-element.  */
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "SLP induction not supported for variable-length"
			 " vectors.\n");
      return false;
    }

  if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "floating point induction vectorization disabled\n");
      return false;
    }

  step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
  gcc_assert (step_expr != NULL_TREE);
  if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
      && !type_has_mode_precision_p (TREE_TYPE (step_expr)))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
			 "bit-precision induction vectorization not "
			 "supported.\n");
      return false;
    }
  tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);

  /* Check for backend support of PLUS/MINUS_EXPR.  */
  if (!directly_supported_p (PLUS_EXPR, step_vectype)
      || !directly_supported_p (MINUS_EXPR, step_vectype))
    return false;

  if (!vec_stmt) /* transformation not required.  */
    {
      unsigned inside_cost = 0, prologue_cost = 0;
      if (slp_node)
	{
	  /* We eventually need to set a vector type on invariant
	     arguments.  */
	  unsigned j;
	  slp_tree child;
	  FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
	    if (!vect_maybe_update_slp_op_vectype
		(child, SLP_TREE_VECTYPE (slp_node)))
	      {
		if (dump_enabled_p ())
		  dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
				   "incompatible vector types for "
				   "invariants\n");
		return false;
	      }
	  /* loop cost for vec_loop.  */
	  inside_cost
	    = record_stmt_cost (cost_vec,
				SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
				vector_stmt, stmt_info, 0, vect_body);
	  /* prologue cost for vec_init (if not nested) and step.  */
	  prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
					    scalar_to_vec,
					    stmt_info, 0, vect_prologue);
	}
      else /* if (!slp_node) */
	{
	  /* loop cost for vec_loop.  */
	  inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
					  stmt_info, 0, vect_body);
	  /* prologue cost for vec_init and vec_step.  */
	  prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
					    stmt_info, 0, vect_prologue);
	}
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "vect_model_induction_cost: inside_cost = %d, "
			 "prologue_cost = %d .\n", inside_cost,
			 prologue_cost);

      STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
      DUMP_VECT_SCOPE ("vectorizable_induction");
      return true;
    }

  /* Transform.  */

  /* Compute a vector variable, initialized with the first VF values of
     the induction variable.  E.g., for an iv with IV_PHI='X' and
     evolution S, for a vector of 4 units, we want to compute:
     [X, X + S, X + 2*S, X + 3*S].  */

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");

  pe = loop_preheader_edge (iv_loop);
  /* Find the first insertion point in the BB.  */
  basic_block bb = gimple_bb (phi);
  si = gsi_after_labels (bb);

  /* For SLP induction we have to generate several IVs as for example
     with group size 3 we need
     [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
     [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2].  */
  if (slp_node)
    {
      /* Enforced above.  */
      unsigned int const_nunits = nunits.to_constant ();

      /* The initial values are vectorized, but any lanes > group_size
	 need adjustment.  */
      slp_tree init_node
	= SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];

      /* Gather steps.  Since we do not vectorize inductions as
	 cycles we have to reconstruct the step from SCEV data.  */
      unsigned group_size = SLP_TREE_LANES (slp_node);
      tree *steps = XALLOCAVEC (tree, group_size);
      tree *inits = XALLOCAVEC (tree, group_size);
      stmt_vec_info phi_info;
      FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
	{
	  steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
	  if (!init_node)
	    inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
					   pe->dest_idx);
	}

      /* Now generate the IVs.  */
      unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
      gcc_assert ((const_nunits * nvects) % group_size == 0);
      unsigned nivs;
      if (nested_in_vect_loop)
	nivs = nvects;
      else
	{
	  /* Compute the number of distinct IVs we need.  First reduce
	     group_size if it is a multiple of const_nunits so we get
	     one IV for a group_size of 4 but const_nunits 2.  */
	  unsigned group_sizep = group_size;
	  if (group_sizep % const_nunits == 0)
	    group_sizep = group_sizep / const_nunits;
	  nivs = least_common_multiple (group_sizep,
					const_nunits) / const_nunits;
	}
      tree stept = TREE_TYPE (step_vectype);
      tree lupdate_mul = NULL_TREE;
      if (!nested_in_vect_loop)
	{
	  /* The number of iterations covered in one vector iteration.  */
	  unsigned lup_mul = (nvects * const_nunits) / group_size;
	  lupdate_mul
	    = build_vector_from_val (step_vectype,
				     SCALAR_FLOAT_TYPE_P (stept)
				     ? build_real_from_wide (stept, lup_mul,
							     UNSIGNED)
				     : build_int_cstu (stept, lup_mul));
	}
      tree peel_mul = NULL_TREE;
      gimple_seq init_stmts = NULL;
      /* If we mask-peel for alignment the start values have to be
	 adjusted by the number of skipped scalar iterations.  */
      if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
	{
	  if (SCALAR_FLOAT_TYPE_P (stept))
	    peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
				     LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
	  else
	    peel_mul = gimple_convert (&init_stmts, stept,
				       LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
	  peel_mul = gimple_build_vector_from_val (&init_stmts,
						   step_vectype, peel_mul);
	}
      unsigned ivn;
      auto_vec<tree> vec_steps;
      for (ivn = 0; ivn < nivs; ++ivn)
	{
	  /* Build per-lane step, init and multiplier vectors for this IV;
	     lanes cycle through the group members.  */
	  tree_vector_builder step_elts (step_vectype, const_nunits, 1);
	  tree_vector_builder init_elts (vectype, const_nunits, 1);
	  tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
	  for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
	    {
	      /* The scalar steps of the IVs.  */
	      tree elt = steps[(ivn*const_nunits + eltn) % group_size];
	      elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
	      step_elts.quick_push (elt);
	      if (!init_node)
		{
		  /* The scalar inits of the IVs if not vectorized.  */
		  elt = inits[(ivn*const_nunits + eltn) % group_size];
		  if (!useless_type_conversion_p (TREE_TYPE (vectype),
						  TREE_TYPE (elt)))
		    elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
					TREE_TYPE (vectype), elt);
		  init_elts.quick_push (elt);
		}
	      /* The number of steps to add to the initial values.  */
	      unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
	      mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
				   ? build_real_from_wide (stept,
							   mul_elt, UNSIGNED)
				   : build_int_cstu (stept, mul_elt));
	    }
	  vec_step = gimple_build_vector (&init_stmts, &step_elts);
	  vec_steps.safe_push (vec_step);
	  tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
	  if (peel_mul)
	    step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
				     step_mul, peel_mul);
	  if (!init_node)
	    vec_init = gimple_build_vector (&init_stmts, &init_elts);

	  /* Create the induction-phi that defines the induction-operand.  */
	  vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
					    "vec_iv_");
	  induction_phi = create_phi_node (vec_dest, iv_loop->header);
	  induc_def = PHI_RESULT (induction_phi);

	  /* Create the iv update inside the loop.  */
	  tree up = vec_step;
	  if (lupdate_mul)
	    up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
			       vec_step, lupdate_mul);
	  gimple_seq stmts = NULL;
	  vec_def = gimple_convert (&stmts, step_vectype, induc_def);
	  vec_def = gimple_build (&stmts,
				  PLUS_EXPR, step_vectype, vec_def, up);
	  vec_def = gimple_convert (&stmts, vectype, vec_def);
	  gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
	  add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
		       UNKNOWN_LOCATION);

	  if (init_node)
	    vec_init = vect_get_slp_vect_def (init_node, ivn);
	  /* Bias the initial value by step_mul steps unless that is a
	     no-op.  */
	  if (!nested_in_vect_loop
	      && !integer_zerop (step_mul))
	    {
	      vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
	      up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
				 vec_step, step_mul);
	      vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
				      vec_def, up);
	      vec_init = gimple_convert (&init_stmts, vectype, vec_def);
	    }

	  /* Set the arguments of the phi node:  */
	  add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);

	  slp_node->push_vec_def (induction_phi);
	}
      if (!nested_in_vect_loop)
	{
	  /* Fill up to the number of vectors we need for the whole group.  */
	  nivs = least_common_multiple (group_size,
					const_nunits) / const_nunits;
	  vec_steps.reserve (nivs-ivn);
	  for (; ivn < nivs; ++ivn)
	    {
	      slp_node->push_vec_def (SLP_TREE_VEC_DEFS (slp_node)[0]);
	      vec_steps.quick_push (vec_steps[0]);
	    }
	}

      /* Re-use IVs when we can.  We are generating further vector
	 stmts by adding VF' * stride to the IVs generated above.  */
      if (ivn < nvects)
	{
	  unsigned vfp
	    = least_common_multiple (group_size, const_nunits) / group_size;
	  tree lupdate_mul
	    = build_vector_from_val (step_vectype,
				     SCALAR_FLOAT_TYPE_P (stept)
				     ? build_real_from_wide (stept,
							     vfp, UNSIGNED)
				     : build_int_cstu (stept, vfp));
	  for (; ivn < nvects; ++ivn)
	    {
	      gimple *iv
		= SSA_NAME_DEF_STMT (SLP_TREE_VEC_DEFS (slp_node)[ivn - nivs]);
	      tree def = gimple_get_lhs (iv);
	      /* Scale the step only the first time around; afterwards the
		 cached vec_steps entry is already scaled.  */
	      if (ivn < 2*nivs)
		vec_steps[ivn - nivs]
		  = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
				  vec_steps[ivn - nivs], lupdate_mul);
	      gimple_seq stmts = NULL;
	      def = gimple_convert (&stmts, step_vectype, def);
	      def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
				  def, vec_steps[ivn % nivs]);
	      def = gimple_convert (&stmts, vectype, def);
	      /* Insert after the re-used IV's definition (before the first
		 stmt when the IV is defined by the header PHI).  */
	      if (gimple_code (iv) == GIMPLE_PHI)
		gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
	      else
		{
		  gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
		  gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
		}
	      slp_node->push_vec_def (def);
	    }
	}

      new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
      gcc_assert (!new_bb);

      return true;
    }

  init_expr = vect_phi_initial_value (phi);

  gimple_seq stmts = NULL;
  if (!nested_in_vect_loop)
    {
      /* Convert the initial value to the IV update type.  */
      tree new_type = TREE_TYPE (step_expr);
      init_expr = gimple_convert (&stmts, new_type, init_expr);

      /* If we are using the loop mask to "peel" for alignment then we need
	 to adjust the start value here.  */
      tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
      if (skip_niters != NULL_TREE)
	{
	  if (FLOAT_TYPE_P (vectype))
	    skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
					skip_niters);
	  else
	    skip_niters = gimple_convert (&stmts, new_type, skip_niters);
	  tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
					 skip_niters, step_expr);
	  init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
				    init_expr, skip_step);
	}
    }

  if (stmts)
    {
      new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
      gcc_assert (!new_bb);
    }

  /* Create the vector that holds the initial_value of the induction.  */
  if (nested_in_vect_loop)
    {
      /* iv_loop is nested in the loop to be vectorized.  init_expr had already
	 been created during vectorization of previous stmts.  We obtain it
	 from the STMT_VINFO_VEC_STMT of the defining stmt.  */
      auto_vec<tree> vec_inits;
      vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
				     init_expr, &vec_inits);
      vec_init = vec_inits[0];
      /* If the initial value is not of proper type, convert it.  */
      if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
	{
	  new_stmt
	    = gimple_build_assign (vect_get_new_ssa_name (vectype,
							  vect_simple_var,
							  "vec_iv_"),
				   VIEW_CONVERT_EXPR,
				   build1 (VIEW_CONVERT_EXPR, vectype,
					   vec_init));
	  vec_init = gimple_assign_lhs (new_stmt);
	  new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
						 new_stmt);
	  gcc_assert (!new_bb);
	}
    }
  else
    {
      /* iv_loop is the loop to be vectorized.  Create:
	 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
      stmts = NULL;
      new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);

      unsigned HOST_WIDE_INT const_nunits;
      if (nunits.is_constant (&const_nunits))
	{
	  tree_vector_builder elts (step_vectype, const_nunits, 1);
	  elts.quick_push (new_name);
	  for (i = 1; i < const_nunits; i++)
	    {
	      /* Create: new_name_i = new_name + step_expr.  */
	      new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
				       new_name, step_expr);
	      elts.quick_push (new_name);
	    }
	  /* Create a vector from [new_name_0, new_name_1, ...,
	     new_name_nunits-1]  */
	  vec_init = gimple_build_vector (&stmts, &elts);
	}
      else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
	/* Build the initial value directly from a VEC_SERIES_EXPR.  */
	vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
				 new_name, step_expr);
      else
	{
	  /* Build:
	       [base, base, base, ...]
	       + (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
	  gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
	  gcc_assert (flag_associative_math);
	  tree index = build_index_vector (step_vectype, 0, 1);
	  tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
							new_name);
	  tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
							step_expr);
	  vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
	  vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
				   vec_init, step_vec);
	  vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
				   vec_init, base_vec);
	}
      vec_init = gimple_convert (&stmts, vectype, vec_init);

      if (stmts)
	{
	  new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
	  gcc_assert (!new_bb);
	}
    }


  /* Create the vector that holds the step of the induction.  */
  gimple_stmt_iterator *step_iv_si = NULL;
  if (nested_in_vect_loop)
    /* iv_loop is nested in the loop to be vectorized.  Generate:
       vec_step = [S, S, S, S]  */
    new_name = step_expr;
  else if (LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo))
    {
      /* When we're using loop_len produced by SELECT_VL, the non-final
	 iterations are not always processing VF elements.  So vectorize
	 induction variable instead of

	   _21 = vect_vec_iv_.6_22 + { VF, ... };

	 We should generate:

	   _35 = .SELECT_VL (ivtmp_33, VF);
	   vect_cst__22 = [vec_duplicate_expr] _35;
	   _21 = vect_vec_iv_.6_22 + vect_cst__22;  */
      gcc_assert (!slp_node);
      gimple_seq seq = NULL;
      vec_loop_lens *lens = &LOOP_VINFO_LENS (loop_vinfo);
      tree len = vect_get_loop_len (loop_vinfo, NULL, lens, 1, vectype, 0, 0);
      expr = force_gimple_operand (fold_convert (TREE_TYPE (step_expr),
						 unshare_expr (len)),
				   &seq, true, NULL_TREE);
      new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr), expr,
			       step_expr);
      /* The step now depends on the per-iteration length, so it must be
	 materialized inside the loop rather than on the preheader edge.  */
      gsi_insert_seq_before (&si, seq, GSI_SAME_STMT);
      step_iv_si = &si;
    }
  else
    {
      /* iv_loop is the loop to be vectorized.  Generate:
	 vec_step = [VF*S, VF*S, VF*S, VF*S]  */
      gimple_seq seq = NULL;
      if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
	{
	  expr = build_int_cst (integer_type_node, vf);
	  expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
	}
      else
	expr = build_int_cst (TREE_TYPE (step_expr), vf);
      new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
			       expr, step_expr);
      if (seq)
	{
	  new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
	  gcc_assert (!new_bb);
	}
    }

  t = unshare_expr (new_name);
  gcc_assert (CONSTANT_CLASS_P (new_name)
	      || TREE_CODE (new_name) == SSA_NAME);
  new_vec = build_vector_from_val (step_vectype, t);
  vec_step = vect_init_vector (loop_vinfo, stmt_info,
			       new_vec, step_vectype, step_iv_si);


  /* Create the following def-use cycle:
     loop prolog:
	 vec_init = ...
	 vec_step = ...
     loop:
	 vec_iv = PHI <vec_init, vec_loop>
	 ...
	 STMT
	 ...
	 vec_loop = vec_iv + vec_step;  */

  /* Create the induction-phi that defines the induction-operand.  */
  vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
  induction_phi = create_phi_node (vec_dest, iv_loop->header);
  induc_def = PHI_RESULT (induction_phi);

  /* Create the iv update inside the loop.  */
  stmts = NULL;
  vec_def = gimple_convert (&stmts, step_vectype, induc_def);
  vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
  vec_def = gimple_convert (&stmts, vectype, vec_def);
  gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
  new_stmt = SSA_NAME_DEF_STMT (vec_def);

  /* Set the arguments of the phi node:  */
  add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
  add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
	       UNKNOWN_LOCATION);

  STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
  *vec_stmt = induction_phi;

  /* In case that vectorization factor (VF) is bigger than the number
     of elements that we can fit in a vectype (nunits), we have to generate
     more than one vector stmt - i.e - we need to "unroll" the
     vector stmt by a factor VF/nunits.  For more details see documentation
     in vectorizable_operation.  */

  if (ncopies > 1)
    {
      gimple_seq seq = NULL;
      /* FORNOW. This restriction should be relaxed.  */
      gcc_assert (!nested_in_vect_loop);
      /* We expect LOOP_VINFO_USING_SELECT_VL_P to be false if ncopies > 1.  */
      gcc_assert (!LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo));

      /* Create the vector that holds the step of the induction.  */
      if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
	{
	  expr = build_int_cst (integer_type_node, nunits);
	  expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
	}
      else
	expr = build_int_cst (TREE_TYPE (step_expr), nunits);
      new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
			       expr, step_expr);
      if (seq)
	{
	  new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
	  gcc_assert (!new_bb);
	}

      t = unshare_expr (new_name);
      gcc_assert (CONSTANT_CLASS_P (new_name)
		  || TREE_CODE (new_name) == SSA_NAME);
      new_vec = build_vector_from_val (step_vectype, t);
      vec_step = vect_init_vector (loop_vinfo, stmt_info,
				   new_vec, step_vectype, NULL);

      vec_def = induc_def;
      /* Deliberately iterates ncopies (not ncopies - 1) times: the final
	 iteration produces the latch value of the induction PHI instead of
	 another vectorized copy (see the comment in the else arm below).  */
      for (i = 1; i < ncopies + 1; i++)
	{
	  /* vec_i = vec_prev + vec_step.  */
	  gimple_seq stmts = NULL;
	  vec_def = gimple_convert (&stmts, step_vectype, vec_def);
	  vec_def = gimple_build (&stmts,
				  PLUS_EXPR, step_vectype, vec_def, vec_step);
	  vec_def = gimple_convert (&stmts, vectype, vec_def);

	  gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
	  if (i < ncopies)
	    {
	      new_stmt = SSA_NAME_DEF_STMT (vec_def);
	      STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
	    }
	  else
	    {
	      /* vec_1 = vec_iv + (VF/n * S)
		 vec_2 = vec_1 + (VF/n * S)
		 ...
		 vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop

		 vec_n is used as vec_loop to save the large step register and
		 related operations.  */
	      add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
			   UNKNOWN_LOCATION);
	    }
	}
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
		     "transform induction: created def-use cycle: %G%G",
		     (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));

  return true;
}
10604
10605 /* Function vectorizable_live_operation_1.
10606
   Helper function for vectorizable_live_operation.  */
10608
static tree
vectorizable_live_operation_1 (loop_vec_info loop_vinfo,
			       stmt_vec_info stmt_info, basic_block exit_bb,
			       tree vectype, int ncopies, slp_tree slp_node,
			       tree bitsize, tree bitstart, tree vec_lhs,
			       tree lhs_type, gimple_stmt_iterator *exit_gsi)
{
  /* Multiple predecessors are only expected with early breaks.  */
  gcc_assert (single_pred_p (exit_bb) || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));

  /* Make the vectorized live-out value available in EXIT_BB through a
     PHI; every incoming edge carries the same value VEC_LHS.  */
  tree vec_lhs_phi = copy_ssa_name (vec_lhs);
  gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
  for (unsigned i = 0; i < gimple_phi_num_args (phi); i++)
    SET_PHI_ARG_DEF (phi, i, vec_lhs);

  gimple_seq stmts = NULL;
  tree new_tree;

  /* If bitstart is 0 then we can use a BIT_FIELD_REF  */
  if (integer_zerop (bitstart))
    {
      tree scalar_res = gimple_build (&stmts, BIT_FIELD_REF, TREE_TYPE (vectype),
				      vec_lhs_phi, bitsize, bitstart);

      /* Convert the extracted vector element to the scalar type.  */
      new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
    }
  else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
    {
      /* Emit:

	   SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>

	 where VEC_LHS is the vectorized live-out result and LEN is the
	 loop length of the final iteration (BIAS is the target's partial
	 load/store bias).  */
      gcc_assert (ncopies == 1 && !slp_node);
      gimple_seq tem = NULL;
      gimple_stmt_iterator gsi = gsi_last (tem);
      tree len = vect_get_loop_len (loop_vinfo, &gsi,
				    &LOOP_VINFO_LENS (loop_vinfo),
				    1, vectype, 0, 0);

      /* BIAS - 1.  */
      signed char biasval = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
      tree bias_minus_one
	= int_const_binop (MINUS_EXPR,
			   build_int_cst (TREE_TYPE (len), biasval),
			   build_one_cst (TREE_TYPE (len)));

      /* LAST_INDEX = LEN + (BIAS - 1).  */
      tree last_index = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (len),
				      len, bias_minus_one);

      /* SCALAR_RES = VEC_EXTRACT <VEC_LHS, LEN + BIAS - 1>.  */
      tree scalar_res
	= gimple_build (&stmts, CFN_VEC_EXTRACT, TREE_TYPE (vectype),
			vec_lhs_phi, last_index);

      /* Convert the extracted vector element to the scalar type.  */
      new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
    }
  else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
    {
      /* Emit:

	   SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>

	 where VEC_LHS is the vectorized live-out result and MASK is
	 the loop mask for the final iteration.  */
      gcc_assert (!slp_node);
      tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
      gimple_seq tem = NULL;
      gimple_stmt_iterator gsi = gsi_last (tem);
      tree mask = vect_get_loop_mask (loop_vinfo, &gsi,
				      &LOOP_VINFO_MASKS (loop_vinfo),
				      1, vectype, 0);
      tree scalar_res;
      gimple_seq_add_seq (&stmts, tem);

      scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
				 mask, vec_lhs_phi);

      /* Convert the extracted vector element to the scalar type.  */
      new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
    }
  else
    {
      /* Extract the element at BITSTART with a BIT_FIELD_REF; vector
	 booleans first need an integer type of the same width since
	 BIT_FIELD_REF cannot produce a boolean vector element.  */
      tree bftype = TREE_TYPE (vectype);
      if (VECTOR_BOOLEAN_TYPE_P (vectype))
	bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
      new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
      new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
				       &stmts, true, NULL_TREE);
    }

  /* Insert the extraction code at the start of the exit block and tell
     the caller where it ended up.  */
  *exit_gsi = gsi_after_labels (exit_bb);
  if (stmts)
    gsi_insert_seq_before (exit_gsi, stmts, GSI_SAME_STMT);

  return new_tree;
}
10709
10710 /* Function vectorizable_live_operation.
10711
10712 STMT_INFO computes a value that is used outside the loop. Check if
10713 it can be supported. */
10714
10715 bool
10716 vectorizable_live_operation (vec_info *vinfo, stmt_vec_info stmt_info,
10717 slp_tree slp_node, slp_instance slp_node_instance,
10718 int slp_index, bool vec_stmt_p,
10719 stmt_vector_for_cost *cost_vec)
10720 {
10721 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10722 imm_use_iterator imm_iter;
10723 tree lhs, lhs_type, bitsize;
10724 tree vectype = (slp_node
10725 ? SLP_TREE_VECTYPE (slp_node)
10726 : STMT_VINFO_VECTYPE (stmt_info));
10727 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10728 int ncopies;
10729 gimple *use_stmt;
10730 use_operand_p use_p;
10731 auto_vec<tree> vec_oprnds;
10732 int vec_entry = 0;
10733 poly_uint64 vec_index = 0;
10734
10735 gcc_assert (STMT_VINFO_LIVE_P (stmt_info)
10736 || LOOP_VINFO_EARLY_BREAKS (loop_vinfo));
10737
10738 /* If a stmt of a reduction is live, vectorize it via
10739 vect_create_epilog_for_reduction. vectorizable_reduction assessed
10740 validity so just trigger the transform here. */
10741 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
10742 {
10743 if (!vec_stmt_p)
10744 return true;
10745 /* For SLP reductions we vectorize the epilogue for all involved stmts
10746 together. */
10747 if (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info) && slp_index != 0)
10748 return true;
10749 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
10750 gcc_assert (reduc_info->is_reduc_info);
10751 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10752 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10753 return true;
10754
10755 if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo)
10756 || !LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
10757 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10758 slp_node_instance,
10759 LOOP_VINFO_IV_EXIT (loop_vinfo));
10760
10761 /* If early break we only have to materialize the reduction on the merge
10762 block, but we have to find an alternate exit first. */
10763 if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
10764 {
10765 slp_tree phis_node = slp_node ? slp_node_instance->reduc_phis : NULL;
10766 for (auto exit : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
10767 if (exit != LOOP_VINFO_IV_EXIT (loop_vinfo))
10768 {
10769 vect_create_epilog_for_reduction (loop_vinfo, reduc_info,
10770 phis_node, slp_node_instance,
10771 exit);
10772 break;
10773 }
10774 if (LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo))
10775 vect_create_epilog_for_reduction (loop_vinfo, reduc_info,
10776 phis_node, slp_node_instance,
10777 LOOP_VINFO_IV_EXIT (loop_vinfo));
10778 }
10779
10780 return true;
10781 }
10782
10783 /* If STMT is not relevant and it is a simple assignment and its inputs are
10784 invariant then it can remain in place, unvectorized. The original last
10785 scalar value that it computes will be used. */
10786 if (!STMT_VINFO_RELEVANT_P (stmt_info))
10787 {
10788 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10789 if (dump_enabled_p ())
10790 dump_printf_loc (MSG_NOTE, vect_location,
10791 "statement is simple and uses invariant. Leaving in "
10792 "place.\n");
10793 return true;
10794 }
10795
10796 if (slp_node)
10797 ncopies = 1;
10798 else
10799 ncopies = vect_get_num_copies (loop_vinfo, vectype);
10800
10801 if (slp_node)
10802 {
10803 gcc_assert (slp_index >= 0);
10804
10805 /* Get the last occurrence of the scalar index from the concatenation of
10806 all the slp vectors. Calculate which slp vector it is and the index
10807 within. */
10808 int num_scalar = SLP_TREE_LANES (slp_node);
10809 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10810 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
10811
10812 /* Calculate which vector contains the result, and which lane of
10813 that vector we need. */
10814 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10815 {
10816 if (dump_enabled_p ())
10817 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10818 "Cannot determine which vector holds the"
10819 " final result.\n");
10820 return false;
10821 }
10822 }
10823
10824 if (!vec_stmt_p)
10825 {
10826 /* No transformation required. */
10827 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10828 {
10829 if (slp_node)
10830 {
10831 if (dump_enabled_p ())
10832 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10833 "can't operate on partial vectors "
10834 "because an SLP statement is live after "
10835 "the loop.\n");
10836 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10837 }
10838 else if (ncopies > 1)
10839 {
10840 if (dump_enabled_p ())
10841 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10842 "can't operate on partial vectors "
10843 "because ncopies is greater than 1.\n");
10844 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10845 }
10846 else
10847 {
10848 gcc_assert (ncopies == 1 && !slp_node);
10849 if (direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10850 OPTIMIZE_FOR_SPEED))
10851 vect_record_loop_mask (loop_vinfo,
10852 &LOOP_VINFO_MASKS (loop_vinfo),
10853 1, vectype, NULL);
10854 else if (can_vec_extract_var_idx_p (
10855 TYPE_MODE (vectype), TYPE_MODE (TREE_TYPE (vectype))))
10856 vect_record_loop_len (loop_vinfo,
10857 &LOOP_VINFO_LENS (loop_vinfo),
10858 1, vectype, 1);
10859 else
10860 {
10861 if (dump_enabled_p ())
10862 dump_printf_loc (
10863 MSG_MISSED_OPTIMIZATION, vect_location,
10864 "can't operate on partial vectors "
10865 "because the target doesn't support extract "
10866 "last reduction.\n");
10867 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10868 }
10869 }
10870 }
10871 /* ??? Enable for loop costing as well. */
10872 if (!loop_vinfo)
10873 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
10874 0, vect_epilogue);
10875 return true;
10876 }
10877
10878 /* Use the lhs of the original scalar statement. */
10879 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10880 if (dump_enabled_p ())
10881 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10882 "stmt %G", stmt);
10883
10884 lhs = gimple_get_lhs (stmt);
10885 lhs_type = TREE_TYPE (lhs);
10886
10887 bitsize = vector_element_bits_tree (vectype);
10888
10889 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
10890 tree vec_lhs, vec_lhs0, bitstart;
10891 gimple *vec_stmt, *vec_stmt0;
10892 if (slp_node)
10893 {
10894 gcc_assert (!loop_vinfo
10895 || (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
10896 && !LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo)));
10897
10898 /* Get the correct slp vectorized stmt. */
10899 vec_lhs = SLP_TREE_VEC_DEFS (slp_node)[vec_entry];
10900 vec_stmt = SSA_NAME_DEF_STMT (vec_lhs);
10901
10902 /* In case we need to early break vectorize also get the first stmt. */
10903 vec_lhs0 = SLP_TREE_VEC_DEFS (slp_node)[0];
10904 vec_stmt0 = SSA_NAME_DEF_STMT (vec_lhs0);
10905
10906 /* Get entry to use. */
10907 bitstart = bitsize_int (vec_index);
10908 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10909 }
10910 else
10911 {
10912 /* For multiple copies, get the last copy. */
10913 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
10914 vec_lhs = gimple_get_lhs (vec_stmt);
10915
10916 /* In case we need to early break vectorize also get the first stmt. */
10917 vec_stmt0 = STMT_VINFO_VEC_STMTS (stmt_info)[0];
10918 vec_lhs0 = gimple_get_lhs (vec_stmt0);
10919
10920 /* Get the last lane in the vector. */
10921 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
10922 }
10923
10924 if (loop_vinfo)
10925 {
10926 /* Ensure the VEC_LHS for lane extraction stmts satisfy loop-closed PHI
10927 requirement, insert one phi node for it. It looks like:
10928 loop;
10929 BB:
10930 # lhs' = PHI <lhs>
10931 ==>
10932 loop;
10933 BB:
10934 # vec_lhs' = PHI <vec_lhs>
10935 new_tree = lane_extract <vec_lhs', ...>;
10936 lhs' = new_tree; */
10937
10938 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10939 /* Check if we have a loop where the chosen exit is not the main exit,
10940 in these cases for an early break we restart the iteration the vector code
10941 did. For the live values we want the value at the start of the iteration
10942 rather than at the end. */
10943 edge main_e = LOOP_VINFO_IV_EXIT (loop_vinfo);
10944 bool all_exits_as_early_p = LOOP_VINFO_EARLY_BREAKS_VECT_PEELED (loop_vinfo);
10945 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10946 if (!is_gimple_debug (use_stmt)
10947 && !flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
10948 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10949 {
10950 edge e = gimple_phi_arg_edge (as_a <gphi *> (use_stmt),
10951 phi_arg_index_from_use (use_p));
10952 gcc_assert (loop_exit_edge_p (loop, e));
10953 bool main_exit_edge = e == main_e;
10954 tree tmp_vec_lhs = vec_lhs;
10955 tree tmp_bitstart = bitstart;
10956
10957 /* For early exit where the exit is not in the BB that leads
10958 to the latch then we're restarting the iteration in the
10959 scalar loop. So get the first live value. */
10960 if ((all_exits_as_early_p || !main_exit_edge)
10961 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
10962 {
10963 tmp_vec_lhs = vec_lhs0;
10964 tmp_bitstart = build_zero_cst (TREE_TYPE (bitstart));
10965 }
10966
10967 gimple_stmt_iterator exit_gsi;
10968 tree new_tree
10969 = vectorizable_live_operation_1 (loop_vinfo, stmt_info,
10970 e->dest, vectype, ncopies,
10971 slp_node, bitsize,
10972 tmp_bitstart, tmp_vec_lhs,
10973 lhs_type, &exit_gsi);
10974
10975 auto gsi = gsi_for_stmt (use_stmt);
10976 tree lhs_phi = gimple_phi_result (use_stmt);
10977 remove_phi_node (&gsi, false);
10978 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10979 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10980 break;
10981 }
10982
10983 /* There a no further out-of-loop uses of lhs by LC-SSA construction. */
10984 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10985 gcc_assert (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)));
10986 }
10987 else
10988 {
10989 /* For basic-block vectorization simply insert the lane-extraction. */
10990 tree bftype = TREE_TYPE (vectype);
10991 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10992 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10993 tree new_tree = build3 (BIT_FIELD_REF, bftype,
10994 vec_lhs, bitsize, bitstart);
10995 gimple_seq stmts = NULL;
10996 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10997 &stmts, true, NULL_TREE);
10998 if (TREE_CODE (new_tree) == SSA_NAME
10999 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
11000 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
11001 if (is_a <gphi *> (vec_stmt))
11002 {
11003 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
11004 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
11005 }
11006 else
11007 {
11008 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
11009 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
11010 }
11011
11012 /* Replace use of lhs with newly computed result. If the use stmt is a
11013 single arg PHI, just replace all uses of PHI result. It's necessary
11014 because lcssa PHI defining lhs may be before newly inserted stmt. */
11015 use_operand_p use_p;
11016 stmt_vec_info use_stmt_info;
11017 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
11018 if (!is_gimple_debug (use_stmt)
11019 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
11020 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
11021 {
11022 /* ??? This can happen when the live lane ends up being
11023 rooted in a vector construction code-generated by an
11024 external SLP node (and code-generation for that already
11025 happened). See gcc.dg/vect/bb-slp-47.c.
11026 Doing this is what would happen if that vector CTOR
11027 were not code-generated yet so it is not too bad.
11028 ??? In fact we'd likely want to avoid this situation
11029 in the first place. */
11030 if (TREE_CODE (new_tree) == SSA_NAME
11031 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
11032 && gimple_code (use_stmt) != GIMPLE_PHI
11033 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
11034 use_stmt))
11035 {
11036 if (dump_enabled_p ())
11037 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11038 "Using original scalar computation for "
11039 "live lane because use preceeds vector "
11040 "def\n");
11041 continue;
11042 }
11043 /* ??? It can also happen that we end up pulling a def into
11044 a loop where replacing out-of-loop uses would require
11045 a new LC SSA PHI node. Retain the original scalar in
11046 those cases as well. PR98064. */
11047 if (TREE_CODE (new_tree) == SSA_NAME
11048 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
11049 && (gimple_bb (use_stmt)->loop_father
11050 != gimple_bb (vec_stmt)->loop_father)
11051 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
11052 gimple_bb (use_stmt)->loop_father))
11053 {
11054 if (dump_enabled_p ())
11055 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
11056 "Using original scalar computation for "
11057 "live lane because there is an out-of-loop "
11058 "definition for it\n");
11059 continue;
11060 }
11061 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
11062 SET_USE (use_p, new_tree);
11063 update_stmt (use_stmt);
11064 }
11065 }
11066
11067 return true;
11068 }
11069
11070 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
11071
11072 static void
11073 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
11074 {
11075 ssa_op_iter op_iter;
11076 imm_use_iterator imm_iter;
11077 def_operand_p def_p;
11078 gimple *ustmt;
11079
11080 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
11081 {
11082 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
11083 {
11084 basic_block bb;
11085
11086 if (!is_gimple_debug (ustmt))
11087 continue;
11088
11089 bb = gimple_bb (ustmt);
11090
11091 if (!flow_bb_inside_loop_p (loop, bb))
11092 {
11093 if (gimple_debug_bind_p (ustmt))
11094 {
11095 if (dump_enabled_p ())
11096 dump_printf_loc (MSG_NOTE, vect_location,
11097 "killing debug use\n");
11098
11099 gimple_debug_bind_reset_value (ustmt);
11100 update_stmt (ustmt);
11101 }
11102 else
11103 gcc_unreachable ();
11104 }
11105 }
11106 }
11107 }
11108
11109 /* Given loop represented by LOOP_VINFO, return true if computation of
11110 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
11111 otherwise. */
11112
11113 static bool
11114 loop_niters_no_overflow (loop_vec_info loop_vinfo)
11115 {
11116 /* Constant case. */
11117 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
11118 {
11119 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
11120 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
11121
11122 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
11123 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
11124 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
11125 return true;
11126 }
11127
11128 widest_int max;
11129 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11130 /* Check the upper bound of loop niters. */
11131 if (get_max_loop_iterations (loop, &max))
11132 {
11133 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
11134 signop sgn = TYPE_SIGN (type);
11135 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
11136 if (max < type_max)
11137 return true;
11138 }
11139 return false;
11140 }
11141
11142 /* Return a mask type with half the number of elements as OLD_TYPE,
11143 given that it should have mode NEW_MODE. */
11144
11145 tree
11146 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
11147 {
11148 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
11149 return build_truth_vector_type_for_mode (nunits, new_mode);
11150 }
11151
11152 /* Return a mask type with twice as many elements as OLD_TYPE,
11153 given that it should have mode NEW_MODE. */
11154
11155 tree
11156 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
11157 {
11158 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
11159 return build_truth_vector_type_for_mode (nunits, new_mode);
11160 }
11161
11162 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
11163 contain a sequence of NVECTORS masks that each control a vector of type
11164 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
11165 these vector masks with the vector version of SCALAR_MASK. */
11166
11167 void
11168 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
11169 unsigned int nvectors, tree vectype, tree scalar_mask)
11170 {
11171 gcc_assert (nvectors != 0);
11172
11173 if (scalar_mask)
11174 {
11175 scalar_cond_masked_key cond (scalar_mask, nvectors);
11176 loop_vinfo->scalar_cond_masked_set.add (cond);
11177 }
11178
11179 masks->mask_set.add (std::make_pair (vectype, nvectors));
11180 }
11181
/* Given a complete set of masks MASKS, extract mask number INDEX
   for an rgroup that operates on NVECTORS vectors of type VECTYPE,
   where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.

   See the comment above vec_loop_masks for more details about the mask
   arrangement.  */

tree
vect_get_loop_mask (loop_vec_info loop_vinfo,
		    gimple_stmt_iterator *gsi, vec_loop_masks *masks,
		    unsigned int nvectors, tree vectype, unsigned int index)
{
  if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
      == vect_partial_vectors_while_ult)
    {
      /* WHILE_ULT style: rgroups are indexed by the number of vectors.  */
      rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
      tree mask_type = rgm->type;

      /* Populate the rgroup's mask array, if this is the first time we've
	 used it.  */
      if (rgm->controls.is_empty ())
	{
	  rgm->controls.safe_grow_cleared (nvectors, true);
	  for (unsigned int i = 0; i < nvectors; ++i)
	    {
	      tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
	      /* Provide a dummy definition until the real one is available.  */
	      SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
	      rgm->controls[i] = mask;
	    }
	}

      tree mask = rgm->controls[index];
      if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
		    TYPE_VECTOR_SUBPARTS (vectype)))
	{
	  /* A loop mask for data type X can be reused for data type Y
	     if X has N times more elements than Y and if Y's elements
	     are N times bigger than X's.  In this case each sequence
	     of N elements in the loop mask will be all-zero or all-one.
	     We can then view-convert the mask so that each sequence of
	     N elements is replaced by a single element.  */
	  gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
				  TYPE_VECTOR_SUBPARTS (vectype)));
	  gimple_seq seq = NULL;
	  mask_type = truth_type_for (vectype);
	  mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
	  if (seq)
	    gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
	}
      return mask;
    }
  else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
	   == vect_partial_vectors_avx512)
    {
      /* The number of scalars per iteration and the number of vectors are
	 both compile-time constants.  */
      unsigned int nscalars_per_iter
	= exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
		     LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();

      /* AVX512 style: rgroups are indexed by scalars per iteration.  */
      rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];

      /* The stored nV is dependent on the mask type produced.  */
      gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
			     TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
		  == rgm->factor);
      nvectors = rgm->factor;

      /* Populate the rgroup's mask array, if this is the first time we've
	 used it.  */
      if (rgm->controls.is_empty ())
	{
	  rgm->controls.safe_grow_cleared (nvectors, true);
	  for (unsigned int i = 0; i < nvectors; ++i)
	    {
	      tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
	      /* Provide a dummy definition until the real one is available.  */
	      SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
	      rgm->controls[i] = mask;
	    }
	}
      /* If the stored mask has exactly the requested number of elements it
	 can be used directly.  */
      if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
		    TYPE_VECTOR_SUBPARTS (vectype)))
	return rgm->controls[index];

      /* Split the vector if needed.  Since we are dealing with integer mode
	 masks with AVX512 we can operate on the integer representation
	 performing the whole vector shifting.  */
      unsigned HOST_WIDE_INT factor;
      bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
				     TYPE_VECTOR_SUBPARTS (vectype), &factor);
      gcc_assert (ok);
      gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
      tree mask_type = truth_type_for (vectype);
      gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
      /* INDEX selects sub-mask VPART within stored control vector VI.  */
      unsigned vi = index / factor;
      unsigned vpart = index % factor;
      tree vec = rgm->controls[vi];
      gimple_seq seq = NULL;
      vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
			  lang_hooks.types.type_for_mode
				(TYPE_MODE (rgm->type), 1), vec);
      /* For integer mode masks simply shift the right bits into position.  */
      if (vpart != 0)
	vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
			    build_int_cst (integer_type_node,
					   (TYPE_VECTOR_SUBPARTS (vectype)
					    * vpart)));
      /* Truncate to the integer mode of the requested mask type and
	 view-convert back to a vector mask.  */
      vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
				    (TYPE_MODE (mask_type), 1), vec);
      vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
      if (seq)
	gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
      return vec;
    }
  else
    gcc_unreachable ();
}
11301
/* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
   lengths for controlling an operation on VECTYPE.  The operation splits
   each element of VECTYPE into FACTOR separate subelements, measuring the
   length as a number of these subelements.  */

void
vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
		      unsigned int nvectors, tree vectype, unsigned int factor)
{
  gcc_assert (nvectors != 0);
  if (lens->length () < nvectors)
    lens->safe_grow_cleared (nvectors, true);
  rgroup_controls *rgl = &(*lens)[nvectors - 1];

  /* The number of scalars per iteration, scalar occupied bytes and
     the number of vectors are all compile-time constants.  */
  unsigned int nscalars_per_iter
    = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
		 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();

  /* Keep track of the widest (most scalars per iteration) request seen
     for this rgroup; that determines the length control type.  */
  if (rgl->max_nscalars_per_iter < nscalars_per_iter)
    {
      /* For now, we only support cases in which all loads and stores fall back
	 to VnQI or none do.  */
      gcc_assert (!rgl->max_nscalars_per_iter
		  || (rgl->factor == 1 && factor == 1)
		  || (rgl->max_nscalars_per_iter * rgl->factor
		      == nscalars_per_iter * factor));
      rgl->max_nscalars_per_iter = nscalars_per_iter;
      rgl->type = vectype;
      rgl->factor = factor;
    }
}
11335
/* Given a complete set of lengths LENS, extract length number INDEX
   for an rgroup that operates on NVECTORS vectors of type VECTYPE,
   where 0 <= INDEX < NVECTORS.  Return a value that contains FACTOR
   multiplied by the number of elements that should be processed.
   Insert any set-up statements before GSI.  */

tree
vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
		   vec_loop_lens *lens, unsigned int nvectors, tree vectype,
		   unsigned int index, unsigned int factor)
{
  rgroup_controls *rgl = &(*lens)[nvectors - 1];
  /* A nonzero target load/store bias means the lengths need adjusting;
     in that case a single pre-adjusted control is used.  */
  bool use_bias_adjusted_len =
    LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;

  /* Populate the rgroup's len array, if this is the first time we've
     used it.  */
  if (rgl->controls.is_empty ())
    {
      rgl->controls.safe_grow_cleared (nvectors, true);
      for (unsigned int i = 0; i < nvectors; ++i)
	{
	  tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
	  gcc_assert (len_type != NULL_TREE);

	  tree len = make_temp_ssa_name (len_type, NULL, "loop_len");

	  /* Provide a dummy definition until the real one is available.  */
	  SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
	  rgl->controls[i] = len;

	  if (use_bias_adjusted_len)
	    {
	      /* The bias-adjusted form is only supported for a single
		 control.  */
	      gcc_assert (i == 0);
	      tree adjusted_len =
		make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
	      SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
	      rgl->bias_adjusted_ctrl = adjusted_len;
	    }
	}
    }

  if (use_bias_adjusted_len)
    return rgl->bias_adjusted_ctrl;

  tree loop_len = rgl->controls[index];
  if (rgl->factor == 1 && factor == 1)
    {
      poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
      poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
      if (maybe_ne (nunits1, nunits2))
	{
	  /* A loop len for data type X can be reused for data type Y
	     if X has N times more elements than Y and if Y's elements
	     are N times bigger than X's.  Scale the recorded length
	     down accordingly.  */
	  gcc_assert (multiple_p (nunits1, nunits2));
	  factor = exact_div (nunits1, nunits2).to_constant ();
	  tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
	  gimple_seq seq = NULL;
	  loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
				   build_int_cst (iv_type, factor));
	  if (seq)
	    gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
	}
    }
  return loop_len;
}
11403
11404 /* Scale profiling counters by estimation for LOOP which is vectorized
11405 by factor VF.
11406 If FLAT is true, the loop we started with had unrealistically flat
11407 profile. */
11408
11409 static void
11410 scale_profile_for_vect_loop (class loop *loop, edge exit_e, unsigned vf, bool flat)
11411 {
11412 /* For flat profiles do not scale down proportionally by VF and only
11413 cap by known iteration count bounds. */
11414 if (flat)
11415 {
11416 if (dump_file && (dump_flags & TDF_DETAILS))
11417 fprintf (dump_file,
11418 "Vectorized loop profile seems flat; not scaling iteration "
11419 "count down by the vectorization factor %i\n", vf);
11420 scale_loop_profile (loop, profile_probability::always (),
11421 get_likely_max_loop_iterations_int (loop));
11422 return;
11423 }
11424 /* Loop body executes VF fewer times and exit increases VF times. */
11425 profile_count entry_count = loop_preheader_edge (loop)->count ();
11426
11427 /* If we have unreliable loop profile avoid dropping entry
11428 count bellow header count. This can happen since loops
11429 has unrealistically low trip counts. */
11430 while (vf > 1
11431 && loop->header->count > entry_count
11432 && loop->header->count < entry_count * vf)
11433 {
11434 if (dump_file && (dump_flags & TDF_DETAILS))
11435 fprintf (dump_file,
11436 "Vectorization factor %i seems too large for profile "
11437 "prevoiusly believed to be consistent; reducing.\n", vf);
11438 vf /= 2;
11439 }
11440
11441 if (entry_count.nonzero_p ())
11442 set_edge_probability_and_rescale_others
11443 (exit_e,
11444 entry_count.probability_in (loop->header->count / vf));
11445 /* Avoid producing very large exit probability when we do not have
11446 sensible profile. */
11447 else if (exit_e->probability < profile_probability::always () / (vf * 2))
11448 set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
11449 loop->latch->count = single_pred_edge (loop->latch)->count ();
11450
11451 scale_loop_profile (loop, profile_probability::always () / vf,
11452 get_likely_max_loop_iterations_int (loop));
11453 }
11454
/* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
   latch edge values originally defined by it.  */

static void
maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
				     stmt_vec_info def_stmt_info)
{
  tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
  if (!def || TREE_CODE (def) != SSA_NAME)
    return;
  stmt_vec_info phi_info;
  imm_use_iterator iter;
  use_operand_p use_p;
  /* Look for relevant loop-header PHIs whose latch argument is DEF.  */
  FOR_EACH_IMM_USE_FAST (use_p, iter, def)
    {
      gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
      if (!phi)
	continue;
      if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
	    && (phi_info = loop_vinfo->lookup_stmt (phi))
	    && STMT_VINFO_RELEVANT_P (phi_info)))
	continue;
      loop_p loop = gimple_bb (phi)->loop_father;
      edge e = loop_latch_edge (loop);
      if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
	continue;

      if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
	  && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
	  && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
	{
	  /* Add the vectorized latch definitions as backedge values of
	     the corresponding vectorized PHIs, one per copy.  */
	  vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
	  vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
	  gcc_assert (phi_defs.length () == latch_defs.length ());
	  for (unsigned i = 0; i < phi_defs.length (); ++i)
	    add_phi_arg (as_a <gphi *> (phi_defs[i]),
			 gimple_get_lhs (latch_defs[i]), e,
			 gimple_phi_arg_location (phi, e->dest_idx));
	}
      else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
	{
	  /* For first order recurrences we have to update both uses of
	     the latch definition, the one in the PHI node and the one
	     in the generated VEC_PERM_EXPR.  */
	  vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
	  vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
	  gcc_assert (phi_defs.length () == latch_defs.length ());
	  /* The vectorized PHI is the definition of the first permute's
	     first operand.  */
	  tree phidef = gimple_assign_rhs1 (phi_defs[0]);
	  gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
	  for (unsigned i = 0; i < phi_defs.length (); ++i)
	    {
	      gassign *perm = as_a <gassign *> (phi_defs[i]);
	      /* Each permute after the first takes the previous copy's
		 latch def as its first operand.  */
	      if (i > 0)
		gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
	      gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
	      update_stmt (perm);
	    }
	  /* The last latch def becomes the backedge value of the
	     vectorized PHI.  */
	  add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
		       gimple_phi_arg_location (phi, e->dest_idx));
	}
    }
}
11517
11518 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
11519 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
11520 stmt_vec_info. */
11521
11522 static bool
11523 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
11524 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
11525 {
11526 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11527 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11528
11529 if (dump_enabled_p ())
11530 dump_printf_loc (MSG_NOTE, vect_location,
11531 "------>vectorizing statement: %G", stmt_info->stmt);
11532
11533 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11534 vect_loop_kill_debug_uses (loop, stmt_info);
11535
11536 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11537 && !STMT_VINFO_LIVE_P (stmt_info))
11538 {
11539 if (is_gimple_call (stmt_info->stmt)
11540 && gimple_call_internal_p (stmt_info->stmt, IFN_MASK_CALL))
11541 {
11542 gcc_assert (!gimple_call_lhs (stmt_info->stmt));
11543 *seen_store = stmt_info;
11544 return false;
11545 }
11546 return false;
11547 }
11548
11549 if (STMT_VINFO_VECTYPE (stmt_info))
11550 {
11551 poly_uint64 nunits
11552 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
11553 if (!STMT_SLP_TYPE (stmt_info)
11554 && maybe_ne (nunits, vf)
11555 && dump_enabled_p ())
11556 /* For SLP VF is set according to unrolling factor, and not
11557 to vector size, hence for SLP this print is not valid. */
11558 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11559 }
11560
11561 /* Pure SLP statements have already been vectorized. We still need
11562 to apply loop vectorization to hybrid SLP statements. */
11563 if (PURE_SLP_STMT (stmt_info))
11564 return false;
11565
11566 if (dump_enabled_p ())
11567 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
11568
11569 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
11570 *seen_store = stmt_info;
11571
11572 return true;
11573 }
11574
11575 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
11576 in the hash_map with its corresponding values. */
11577
11578 static tree
11579 find_in_mapping (tree t, void *context)
11580 {
11581 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
11582
11583 tree *value = mapping->get (t);
11584 return value ? *value : t;
11585 }
11586
11587 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
11588 original loop that has now been vectorized.
11589
11590 The inits of the data_references need to be advanced with the number of
11591 iterations of the main loop. This has been computed in vect_do_peeling and
11592 is stored in parameter ADVANCE. We first restore the data_references
11593 initial offset with the values recored in ORIG_DRS_INIT.
11594
11595 Since the loop_vec_info of this EPILOGUE was constructed for the original
11596 loop, its stmt_vec_infos all point to the original statements. These need
11597 to be updated to point to their corresponding copies as well as the SSA_NAMES
11598 in their PATTERN_DEF_SEQs and RELATED_STMTs.
11599
11600 The data_reference's connections also need to be updated. Their
11601 corresponding dr_vec_info need to be reconnected to the EPILOGUE's
11602 stmt_vec_infos, their statements need to point to their corresponding copy,
11603 if they are gather loads or scatter stores then their reference needs to be
11604 updated to point to its corresponding copy. */
11605
static void
update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
{
  loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
  auto_vec<gimple *> stmt_worklist;
  /* Maps SSA names (and DR trees) of the original loop to their
     counterparts in the epilogue copy.  */
  hash_map<tree,tree> mapping;
  gimple *orig_stmt, *new_stmt;
  gimple_stmt_iterator epilogue_gsi;
  gphi_iterator epilogue_phi_gsi;
  stmt_vec_info stmt_vinfo = NULL, related_vinfo;
  basic_block *epilogue_bbs = get_loop_body (epilogue);
  unsigned i;

  /* Install the epilogue's basic blocks in place of the ones recorded
     for the original loop.  */
  free (LOOP_VINFO_BBS (epilogue_vinfo));
  LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;

  /* Advance data_reference's with the number of iterations of the previous
     loop and its prologue.  */
  vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);


  /* The EPILOGUE loop is a copy of the original loop so they share the same
     gimple UIDs.  In this loop we update the loop_vec_info of the EPILOGUE to
     point to the copied statements.  We also create a mapping of all LHS' in
     the original loop and all the LHS' in the EPILOGUE and create worklists to
     update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs.  */
  for (unsigned i = 0; i < epilogue->num_nodes; ++i)
    {
      for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
	   !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
	{
	  new_stmt = epilogue_phi_gsi.phi ();

	  /* UIDs are shared with the original loop; use them to find the
	     stmt_vec_info to re-point at the copy.  */
	  gcc_assert (gimple_uid (new_stmt) > 0);
	  stmt_vinfo
	    = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];

	  orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
	  STMT_VINFO_STMT (stmt_vinfo) = new_stmt;

	  mapping.put (gimple_phi_result (orig_stmt),
		       gimple_phi_result (new_stmt));
	  /* PHI nodes can not have patterns or related statements.  */
	  gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
		      && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
	}

      for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
	   !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
	{
	  new_stmt = gsi_stmt (epilogue_gsi);
	  if (is_gimple_debug (new_stmt))
	    continue;

	  gcc_assert (gimple_uid (new_stmt) > 0);
	  stmt_vinfo
	    = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];

	  orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
	  STMT_VINFO_STMT (stmt_vinfo) = new_stmt;

	  if (tree old_lhs = gimple_get_lhs (orig_stmt))
	    mapping.put (old_lhs, gimple_get_lhs (new_stmt));

	  /* Pattern def-seq statements still refer to the original loop's
	     SSA names; queue them so their operands get remapped below.  */
	  if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
	    {
	      gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
	      for (gimple_stmt_iterator gsi = gsi_start (seq);
		   !gsi_end_p (gsi); gsi_next (&gsi))
		stmt_worklist.safe_push (gsi_stmt (gsi));
	    }

	  related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
	  if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
	    {
	      gimple *stmt = STMT_VINFO_STMT (related_vinfo);
	      stmt_worklist.safe_push (stmt);
	      /* Set BB such that the assert in
		 'get_initial_def_for_reduction' is able to determine that
		 the BB of the related stmt is inside this loop.  */
	      gimple_set_bb (stmt,
			     gimple_bb (new_stmt));
	      related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
	      gcc_assert (related_vinfo == NULL
			  || related_vinfo == stmt_vinfo);
	    }
	}
    }

  /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
     using the original main loop and thus need to be updated to refer to the
     cloned variables used in the epilogue.  */
  for (unsigned i = 0; i < stmt_worklist.length (); ++i)
    {
      gimple *stmt = stmt_worklist[i];
      tree *new_op;

      /* Operand 0 is the LHS, already handled via MAPPING above; start
	 remapping at operand 1.  */
      for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
	{
	  tree op = gimple_op (stmt, j);
	  if ((new_op = mapping.get(op)))
	    gimple_set_op (stmt, j, *new_op);
	  else
	    {
	      /* PR92429: The last argument of simplify_replace_tree disables
		 folding when replacing arguments.  This is required as
		 otherwise you might end up with different statements than the
		 ones analyzed in vect_loop_analyze, leading to different
		 vectorization.  */
	      op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
					  &find_in_mapping, &mapping, false);
	      gimple_set_op (stmt, j, op);
	    }
	}
    }

  struct data_reference *dr;
  vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
  FOR_EACH_VEC_ELT (datarefs, i, dr)
    {
      orig_stmt = DR_STMT (dr);
      gcc_assert (gimple_uid (orig_stmt) > 0);
      stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
      /* Data references for gather loads and scatter stores do not use the
	 updated offset we set using ADVANCE.  Instead we have to make sure the
	 reference in the data references point to the corresponding copy of
	 the original in the epilogue.  Make sure to update both
	 gather/scatters recognized by dataref analysis and also other
	 refs that get_load_store_type classified as VMAT_GATHER_SCATTER.  */
      auto vstmt_vinfo = vect_stmt_to_vectorize (stmt_vinfo);
      if (STMT_VINFO_MEMORY_ACCESS_TYPE (vstmt_vinfo) == VMAT_GATHER_SCATTER
	  || STMT_VINFO_GATHER_SCATTER_P (vstmt_vinfo))
	{
	  DR_REF (dr)
	    = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
				     &find_in_mapping, &mapping);
	  DR_BASE_ADDRESS (dr)
	    = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
				     &find_in_mapping, &mapping);
	}
      DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
      stmt_vinfo->dr_aux.stmt = stmt_vinfo;
    }

  /* The data references were modified above; record the updated set.  */
  epilogue_vinfo->shared->datarefs_copy.release ();
  epilogue_vinfo->shared->save_datarefs ();
}
11753
/* When vectorizing early break statements, instructions that happen before
   the early break in the current BB need to be moved to after the early
   break.  This function deals with that and assumes that any validity
   checks have already been performed.

   While moving the instructions, if it encounters a VUSE or VDEF it
   corrects the VUSEs as it moves the statements along.  The statements are
   inserted at LOOP_VINFO_EARLY_BRK_DEST_BB, the destination block recorded
   during analysis.  */
11762
static void
move_early_exit_stmts (loop_vec_info loop_vinfo)
{
  DUMP_VECT_SCOPE ("move_early_exit_stmts");

  /* Nothing to do when analysis recorded no stores to move.  */
  if (LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo).is_empty ())
    return;

  /* Move all stmts that need moving.  */
  basic_block dest_bb = LOOP_VINFO_EARLY_BRK_DEST_BB (loop_vinfo);
  gimple_stmt_iterator dest_gsi = gsi_after_labels (dest_bb);

  /* Tracks the virtual operand defined by the last statement moved (or
     elided PHI crossed), so the loads below can be rewired to it.  */
  tree last_seen_vuse = NULL_TREE;
  for (gimple *stmt : LOOP_VINFO_EARLY_BRK_STORES (loop_vinfo))
    {
      /* We have to update crossed degenerate virtual PHIs.  Simply
	 elide them.  */
      if (gphi *vphi = dyn_cast <gphi *> (stmt))
	{
	  tree vdef = gimple_phi_result (vphi);
	  tree vuse = gimple_phi_arg_def (vphi, 0);
	  imm_use_iterator iter;
	  use_operand_p use_p;
	  gimple *use_stmt;
	  /* Forward all uses of the PHI result to its single argument.  */
	  FOR_EACH_IMM_USE_STMT (use_stmt, iter, vdef)
	    {
	      FOR_EACH_IMM_USE_ON_STMT (use_p, iter)
		SET_USE (use_p, vuse);
	    }
	  auto gsi = gsi_for_stmt (stmt);
	  remove_phi_node (&gsi, true);
	  last_seen_vuse = vuse;
	  continue;
	}

      /* Check to see if statement is still required for vect or has been
	 elided.  */
      auto stmt_info = loop_vinfo->lookup_stmt (stmt);
      if (!stmt_info)
	continue;

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "moving stmt %G", stmt);

      gimple_stmt_iterator stmt_gsi = gsi_for_stmt (stmt);
      gsi_move_before (&stmt_gsi, &dest_gsi, GSI_NEW_STMT);
      last_seen_vuse = gimple_vuse (stmt);
    }

  /* Update all the stmts with their new reaching VUSES.  */
  for (auto p : LOOP_VINFO_EARLY_BRK_VUSES (loop_vinfo))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "updating vuse to %T for load %G",
			 last_seen_vuse, p);
      gimple_set_vuse (p, last_seen_vuse);
      update_stmt (p);
    }

  /* And update the LC PHIs on exits.  Only exits not dominated by DEST_BB
     need fixing, since those dominated by it already see the moved defs.  */
  for (edge e : get_loop_exit_edges (LOOP_VINFO_LOOP (loop_vinfo)))
    if (!dominated_by_p (CDI_DOMINATORS, e->src, dest_bb))
      if (gphi *phi = get_virtual_phi (e->dest))
	SET_PHI_ARG_DEF_ON_EDGE (phi, e, last_seen_vuse);
}
11829
11830 /* Function vect_transform_loop.
11831
11832 The analysis phase has determined that the loop is vectorizable.
   Vectorize the loop - create vectorized stmts to replace the scalar
11834 stmts in the loop, and update the loop exit condition.
11835 Returns scalar epilogue loop if any. */
11836
class loop *
vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
{
  class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  class loop *epilogue = NULL;
  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  int nbbs = loop->num_nodes;
  int i;
  tree niters_vector = NULL_TREE;
  tree step_vector = NULL_TREE;
  tree niters_vector_mult_vf = NULL_TREE;
  poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  unsigned int lowest_vf = constant_lower_bound (vf);
  gimple *stmt;
  bool check_profitability = false;
  unsigned int th;
  bool flat = maybe_flat_loop_profile (loop);

  DUMP_VECT_SCOPE ("vec_transform_loop");

  loop_vinfo->shared->check_datarefs ();

  /* Use the more conservative vectorization threshold.  If the number
     of iterations is constant assume the cost check has been performed
     by our caller.  If the threshold makes all loops profitable that
     run at least the (estimated) vectorization factor number of times
     checking is pointless, too.  */
  th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
  if (vect_apply_runtime_profitability_check_p (loop_vinfo))
    {
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Profitability threshold is %d loop iterations.\n",
			 th);
      check_profitability = true;
    }

  /* Make sure there exists a single-predecessor exit bb.  Do this before
     versioning.   */
  edge e = LOOP_VINFO_IV_EXIT (loop_vinfo);
  if (! single_pred_p (e->dest) && !LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
    {
      split_loop_exit_edge (e, true);
      if (dump_enabled_p ())
	dump_printf (MSG_NOTE, "split exit edge\n");
    }

  /* Version the loop first, if required, so the profitability check
     comes first.  */

  if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
    {
      class loop *sloop
	= vect_loop_versioning (loop_vinfo, loop_vectorized_call);
      sloop->force_vectorize = false;
      /* The runtime cost check was folded into the versioning condition.  */
      check_profitability = false;
    }

  /* Make sure there exists a single-predecessor exit bb also on the
     scalar loop copy.  Do this after versioning but before peeling
     so CFG structure is fine for both scalar and if-converted loop
     to make slpeel_duplicate_current_defs_from_edges face matched
     loop closed PHI nodes on the exit.  */
  if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
    {
      e = LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo);
      if (! single_pred_p (e->dest))
	{
	  split_loop_exit_edge (e, true);
	  if (dump_enabled_p ())
	    dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
	}
    }

  tree niters = vect_build_loop_niters (loop_vinfo);
  LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
  tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
  bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
  tree advance;
  drs_init_vec orig_drs_init;

  /* Peel prologue/epilogue as needed; ADVANCE receives the iteration
     advance used later to update the epilogue's data references.  */
  epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
			      &step_vector, &niters_vector_mult_vf, th,
			      check_profitability, niters_no_overflow,
			      &advance);
  if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
      && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
    {
      /* Ifcvt duplicates loop preheader, loop body and produces a basic
	 block after loop exit.  We need to scale all that.  */
      basic_block preheader
	= loop_preheader_edge (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))->src;
      preheader->count
	= preheader->count.apply_probability
	    (LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
      scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
			      LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
      LOOP_VINFO_SCALAR_IV_EXIT (loop_vinfo)->dest->count = preheader->count;
    }

  if (niters_vector == NULL_TREE)
    {
      if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
	  && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
	  && known_eq (lowest_vf, vf))
	{
	  /* Constant iteration count and constant VF: compute the vector
	     loop trip count directly.  */
	  niters_vector
	    = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
			     LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
	  step_vector = build_one_cst (TREE_TYPE (niters));
	}
      else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
	vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
				     &step_vector, niters_no_overflow);
      else
	/* vect_do_peeling subtracted the number of peeled prologue
	   iterations from LOOP_VINFO_NITERS.  */
	vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
				     &niters_vector, &step_vector,
				     niters_no_overflow);
    }

  /* 1) Make sure the loop header has exactly two entries
     2) Make sure we have a preheader basic block.  */

  gcc_assert (EDGE_COUNT (loop->header->preds) == 2);

  split_edge (loop_preheader_edge (loop));

  if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
    /* This will deal with any possible peeling.  */
    vect_prepare_for_masked_peels (loop_vinfo);

  /* Handle any code motion that we need to for early-break vectorization after
     we've done peeling but just before we start vectorizing.  */
  if (LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
    move_early_exit_stmts (loop_vinfo);

  /* Schedule the SLP instances first, then handle loop vectorization
     below.  */
  if (!loop_vinfo->slp_instances.is_empty ())
    {
      DUMP_VECT_SCOPE ("scheduling SLP instances");
      vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
    }

  /* FORNOW: the vectorizer supports only loops which body consist
     of one basic block (header + empty latch).  When the vectorizer will
     support more involved loop forms, the order by which the BBs are
     traversed need to be reconsidered.  */

  for (i = 0; i < nbbs; i++)
    {
      basic_block bb = bbs[i];
      stmt_vec_info stmt_info;

      /* First pass over PHIs: transform relevant/live non-SLP PHIs.  */
      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  gphi *phi = si.phi ();
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "------>vectorizing phi: %G", (gimple *) phi);
	  stmt_info = loop_vinfo->lookup_stmt (phi);
	  if (!stmt_info)
	    continue;

	  if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
	    vect_loop_kill_debug_uses (loop, stmt_info);

	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
	      && !STMT_VINFO_LIVE_P (stmt_info))
	    continue;

	  if (STMT_VINFO_VECTYPE (stmt_info)
	      && (maybe_ne
		  (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
	      && dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");

	  if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
	      && ! PURE_SLP_STMT (stmt_info))
	    {
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
	      vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
	    }
	}

      /* Second pass over PHIs: fill in backedge values of the vectorized
	 PHIs created above.  */
      for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
	   gsi_next (&si))
	{
	  gphi *phi = si.phi ();
	  stmt_info = loop_vinfo->lookup_stmt (phi);
	  if (!stmt_info)
	    continue;

	  if (!STMT_VINFO_RELEVANT_P (stmt_info)
	      && !STMT_VINFO_LIVE_P (stmt_info))
	    continue;

	  if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
	       || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
	      && ! PURE_SLP_STMT (stmt_info))
	    maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
	}

      for (gimple_stmt_iterator si = gsi_start_bb (bb);
	   !gsi_end_p (si);)
	{
	  stmt = gsi_stmt (si);
	  /* During vectorization remove existing clobber stmts.  */
	  if (gimple_clobber_p (stmt))
	    {
	      unlink_stmt_vdef (stmt);
	      gsi_remove (&si, true);
	      release_defs (stmt);
	    }
	  else
	    {
	      /* Ignore vector stmts created in the outer loop.  */
	      stmt_info = loop_vinfo->lookup_stmt (stmt);

	      /* vector stmts created in the outer-loop during vectorization of
		 stmts in an inner-loop may not have a stmt_info, and do not
		 need to be vectorized.  */
	      stmt_vec_info seen_store = NULL;
	      if (stmt_info)
		{
		  if (STMT_VINFO_IN_PATTERN_P (stmt_info))
		    {
		      /* Transform the pattern def sequence first, then the
			 main pattern statement.  */
		      gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
		      for (gimple_stmt_iterator subsi = gsi_start (def_seq);
			   !gsi_end_p (subsi); gsi_next (&subsi))
			{
			  stmt_vec_info pat_stmt_info
			    = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
			  vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
						    &si, &seen_store);
			}
		      stmt_vec_info pat_stmt_info
			= STMT_VINFO_RELATED_STMT (stmt_info);
		      if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
						    &si, &seen_store))
			maybe_set_vectorized_backedge_value (loop_vinfo,
							     pat_stmt_info);
		    }
		  else
		    {
		      if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
						    &seen_store))
			maybe_set_vectorized_backedge_value (loop_vinfo,
							     stmt_info);
		    }
		}
	      gsi_next (&si);
	      if (seen_store)
		{
		  if (STMT_VINFO_GROUPED_ACCESS (seen_store))
		    /* Interleaving.  If IS_STORE is TRUE, the
		       vectorization of the interleaving chain was
		       completed - free all the stores in the chain.  */
		    vect_remove_stores (loop_vinfo,
					DR_GROUP_FIRST_ELEMENT (seen_store));
		  else
		    /* Free the attached stmt_vec_info and remove the stmt.  */
		    loop_vinfo->remove_stmt (stmt_info);
		}
	    }
	}

      /* Stub out scalar statements that must not survive vectorization.
	 Doing this here helps with grouped statements, or statements that
	 are involved in patterns.  */
      for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
	   !gsi_end_p (gsi); gsi_next (&gsi))
	{
	  gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
	  if (!call || !gimple_call_internal_p (call))
	    continue;
	  internal_fn ifn = gimple_call_internal_fn (call);
	  if (ifn == IFN_MASK_LOAD)
	    {
	      tree lhs = gimple_get_lhs (call);
	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
		{
		  /* Replace a leftover scalar MASK_LOAD with a zero
		     assignment to its LHS.  */
		  tree zero = build_zero_cst (TREE_TYPE (lhs));
		  gimple *new_stmt = gimple_build_assign (lhs, zero);
		  gsi_replace (&gsi, new_stmt, true);
		}
	    }
	  else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
	    {
	      tree lhs = gimple_get_lhs (call);
	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
		{
		  /* Replace a leftover scalar conditional internal fn with
		     an assignment of its "else" argument (the last one).  */
		  tree else_arg
		    = gimple_call_arg (call, gimple_call_num_args (call) - 1);
		  gimple *new_stmt = gimple_build_assign (lhs, else_arg);
		  gsi_replace (&gsi, new_stmt, true);
		}
	    }
	}
    }				/* BBs in loop */

  /* The vectorization factor is always > 1, so if we use an IV increment of 1,
     a zero NITERS becomes a nonzero NITERS_VECTOR.  */
  if (integer_onep (step_vector))
    niters_no_overflow = true;
  vect_set_loop_condition (loop, LOOP_VINFO_IV_EXIT (loop_vinfo), loop_vinfo,
			   niters_vector, step_vector, niters_vector_mult_vf,
			   !niters_no_overflow);

  unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);

  /* True if the final iteration might not handle a full vector's
     worth of scalar iterations.  */
  bool final_iter_may_be_partial
    = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
      || LOOP_VINFO_EARLY_BREAKS (loop_vinfo);

  /* +1 to convert latch counts to loop iteration counts.  */
  int bias_for_lowest = 1;

  /* When we are peeling for gaps then we take away one scalar iteration
     from the vector loop.  Thus we can adjust the upper bound by one
     scalar iteration.  But only when we know the bound applies to the
     IV exit test which might not be true when we have multiple exits.  */
  if (!LOOP_VINFO_EARLY_BREAKS (loop_vinfo))
    bias_for_lowest -= LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;

  int bias_for_assumed = bias_for_lowest;
  int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
  if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
    {
      /* When the amount of peeling is known at compile time, the first
	 iteration will have exactly alignment_npeels active elements.
	 In the worst case it will have at least one.  */
      int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
      bias_for_lowest += lowest_vf - min_first_active;
      bias_for_assumed += assumed_vf - min_first_active;
    }
  /* In these calculations the "- 1" converts loop iteration counts
     back to latch counts.  */
  if (loop->any_upper_bound)
    {
      loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
      loop->nb_iterations_upper_bound
	= (final_iter_may_be_partial
	   ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
			    lowest_vf) - 1
	   : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
			     lowest_vf) - 1);
      if (main_vinfo
	  /* Both peeling for alignment and peeling for gaps can end up
	     with the scalar epilogue running for more than VF-1 iterations.  */
	  && !main_vinfo->peeling_for_alignment
	  && !main_vinfo->peeling_for_gaps)
	{
	  unsigned int bound;
	  poly_uint64 main_iters
	    = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
			   LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
	  main_iters
	    = upper_bound (main_iters,
			   LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
	  if (can_div_away_from_zero_p (main_iters,
					LOOP_VINFO_VECT_FACTOR (loop_vinfo),
					&bound))
	    loop->nb_iterations_upper_bound
	      = wi::umin ((bound_wide_int) (bound - 1),
			  loop->nb_iterations_upper_bound);
	}
    }
  if (loop->any_likely_upper_bound)
    loop->nb_iterations_likely_upper_bound
      = (final_iter_may_be_partial
	 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
			  + bias_for_lowest, lowest_vf) - 1
	 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
			   + bias_for_lowest, lowest_vf) - 1);
  if (loop->any_estimate)
    loop->nb_iterations_estimate
      = (final_iter_may_be_partial
	 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
			  assumed_vf) - 1
	 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
			   assumed_vf) - 1);
  scale_profile_for_vect_loop (loop, LOOP_VINFO_IV_EXIT (loop_vinfo),
			       assumed_vf, flat);

  if (dump_enabled_p ())
    {
      if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
	{
	  dump_printf_loc (MSG_NOTE, vect_location,
			   "LOOP VECTORIZED\n");
	  if (loop->inner)
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "OUTER LOOP VECTORIZED\n");
	  dump_printf (MSG_NOTE, "\n");
	}
      else
	dump_printf_loc (MSG_NOTE, vect_location,
			 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
			 GET_MODE_NAME (loop_vinfo->vector_mode));
    }

  /* Loops vectorized with a variable factor won't benefit from
     unrolling/peeling.  */
  if (!vf.is_constant ())
    {
      loop->unroll = 1;
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
			 " variable-length vectorization factor\n");
    }
  /* Free SLP instances here because otherwise stmt reference counting
     won't work.  */
  slp_instance instance;
  FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
    vect_free_slp_instance (instance);
  LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
  /* Clear-up safelen field since its value is invalid after vectorization
     since vectorized loop can have loop-carried dependencies.  */
  loop->safelen = 0;

  if (epilogue)
    {
      /* Re-point the epilogue's loop_vec_info at the copied statements and
	 advance its data references past the main loop's iterations.  */
      update_epilogue_loop_vinfo (epilogue, advance);

      epilogue->simduid = loop->simduid;
      epilogue->force_vectorize = loop->force_vectorize;
      epilogue->dont_vectorize = false;
    }

  return epilogue;
}
12284
12285 /* The code below is trying to perform simple optimization - revert
12286 if-conversion for masked stores, i.e. if the mask of a store is zero
12287 do not perform it and all stored value producers also if possible.
12288 For example,
12289 for (i=0; i<n; i++)
12290 if (c[i])
12291 {
12292 p1[i] += 1;
12293 p2[i] = p3[i] +2;
12294 }
12295 this transformation will produce the following semi-hammock:
12296
12297 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
12298 {
12299 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
12300 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
12301 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
12302 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
12303 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
12304 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
12305 }
12306 */
12307
void
optimize_mask_stores (class loop *loop)
{
  basic_block *bbs = get_loop_body (loop);
  unsigned nbbs = loop->num_nodes;
  unsigned i;
  basic_block bb;
  class loop *bb_loop;
  gimple_stmt_iterator gsi;
  gimple *stmt;
  auto_vec<gimple *> worklist;
  auto_purge_vect_location sentinel;

  vect_location = find_loop_location (loop);
  /* Pick up all masked stores in loop if any.  */
  for (i = 0; i < nbbs; i++)
    {
      bb = bbs[i];
      for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
	   gsi_next (&gsi))
	{
	  stmt = gsi_stmt (gsi);
	  if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
	    worklist.safe_push (stmt);
	}
    }

  free (bbs);
  if (worklist.is_empty ())
    return;

  /* Loop has masked stores.  */
  while (!worklist.is_empty ())
    {
      gimple *last, *last_store;
      edge e, efalse;
      tree mask;
      basic_block store_bb, join_bb;
      gimple_stmt_iterator gsi_to;
      tree vdef, new_vdef;
      gphi *phi;
      tree vectype;
      tree zero;

      /* LAST is the masked store we are sinking; MASK_STORE's mask is
	 its third argument.  */
      last = worklist.pop ();
      mask = gimple_call_arg (last, 2);
      bb = gimple_bb (last);
      /* Create then_bb and if-then structure in CFG, then_bb belongs to
	 the same loop as if_bb.  It could be different to LOOP when two
	 level loop-nest is vectorized and mask_store belongs to the inner
	 one.  */
      e = split_block (bb, last);
      bb_loop = bb->loop_father;
      gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
      join_bb = e->dest;
      store_bb = create_empty_bb (bb);
      add_bb_to_loop (store_bb, bb_loop);
      e->flags = EDGE_TRUE_VALUE;
      efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
      /* Put STORE_BB to likely part.  */
      efalse->probability = profile_probability::likely ();
      e->probability = efalse->probability.invert ();
      store_bb->count = efalse->count ();
      make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
      if (dom_info_available_p (CDI_DOMINATORS))
	set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, vect_location,
			 "Create new block %d to sink mask stores.",
			 store_bb->index);
      /* Create vector comparison with boolean result.  */
      vectype = TREE_TYPE (mask);
      zero = build_zero_cst (vectype);
      /* Guard: skip STORE_BB entirely when the whole mask is zero.  */
      stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
      gsi = gsi_last_bb (bb);
      gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
      /* Create new PHI node for vdef of the last masked store:
	 .MEM_2 = VDEF <.MEM_1>
	 will be converted to
	 .MEM.3 = VDEF <.MEM_1>
	 and new PHI node will be created in join bb
	 .MEM_2 = PHI <.MEM_1, .MEM_3>
      */
      vdef = gimple_vdef (last);
      new_vdef = make_ssa_name (gimple_vop (cfun), last);
      gimple_set_vdef (last, new_vdef);
      phi = create_phi_node (vdef, join_bb);
      add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);

      /* Put all masked stores with the same mask to STORE_BB if possible.  */
      while (true)
	{
	  gimple_stmt_iterator gsi_from;
	  gimple *stmt1 = NULL;

	  /* Move masked store to STORE_BB.  */
	  last_store = last;
	  gsi = gsi_for_stmt (last);
	  gsi_from = gsi;
	  /* Shift GSI to the previous stmt for further traversal.  */
	  gsi_prev (&gsi);
	  gsi_to = gsi_start_bb (store_bb);
	  gsi_move_before (&gsi_from, &gsi_to);
	  /* Setup GSI_TO to the non-empty block start.  */
	  gsi_to = gsi_start_bb (store_bb);
	  if (dump_enabled_p ())
	    dump_printf_loc (MSG_NOTE, vect_location,
			     "Move stmt to created bb\n%G", last);
	  /* Move all stored value producers if possible.  Walk backwards
	     from the store; stop at the first stmt that cannot be sunk.  */
	  while (!gsi_end_p (gsi))
	    {
	      tree lhs;
	      imm_use_iterator imm_iter;
	      use_operand_p use_p;
	      bool res;

	      /* Skip debug statements.  */
	      if (is_gimple_debug (gsi_stmt (gsi)))
		{
		  gsi_prev (&gsi);
		  continue;
		}
	      stmt1 = gsi_stmt (gsi);
	      /* Do not consider statements writing to memory or having
		 volatile operand.  */
	      if (gimple_vdef (stmt1)
		  || gimple_has_volatile_ops (stmt1))
		break;
	      gsi_from = gsi;
	      gsi_prev (&gsi);
	      lhs = gimple_get_lhs (stmt1);
	      if (!lhs)
		break;

	      /* LHS of vectorized stmt must be SSA_NAME.  */
	      if (TREE_CODE (lhs) != SSA_NAME)
		break;

	      if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
		{
		  /* Remove dead scalar statement.  */
		  if (has_zero_uses (lhs))
		    {
		      gsi_remove (&gsi_from, true);
		      continue;
		    }
		}

	      /* Check that LHS does not have uses outside of STORE_BB.  */
	      res = true;
	      FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
		{
		  gimple *use_stmt;
		  use_stmt = USE_STMT (use_p);
		  if (is_gimple_debug (use_stmt))
		    continue;
		  if (gimple_bb (use_stmt) != store_bb)
		    {
		      res = false;
		      break;
		    }
		}
	      if (!res)
		break;

	      /* Sinking past a memory read with a different reaching
		 virtual use would be invalid.  */
	      if (gimple_vuse (stmt1)
		  && gimple_vuse (stmt1) != gimple_vuse (last_store))
		break;

	      /* Can move STMT1 to STORE_BB.  */
	      if (dump_enabled_p ())
		dump_printf_loc (MSG_NOTE, vect_location,
				 "Move stmt to created bb\n%G", stmt1);
	      gsi_move_before (&gsi_from, &gsi_to);
	      /* Shift GSI_TO for further insertion.  */
	      gsi_prev (&gsi_to);
	    }
	  /* Put other masked stores with the same mask to STORE_BB.  */
	  if (worklist.is_empty ()
	      || gimple_call_arg (worklist.last (), 2) != mask
	      || worklist.last () != stmt1)
	    break;
	  last = worklist.pop ();
	}
      /* Close the virtual SSA web: the bypass edge E carries the VUSE
	 reaching the first sunk store.  */
      add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
    }
}
12495
12496 /* Decide whether it is possible to use a zero-based induction variable
12497 when vectorizing LOOP_VINFO with partial vectors. If it is, return
12498 the value that the induction variable must be able to hold in order
12499 to ensure that the rgroups eventually have no active vector elements.
12500 Return -1 otherwise. */
12501
12502 widest_int
12503 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
12504 {
12505 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
12506 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
12507 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
12508
12509 /* Calculate the value that the induction variable must be able
12510 to hit in order to ensure that we end the loop with an all-false mask.
12511 This involves adding the maximum number of inactive trailing scalar
12512 iterations. */
12513 widest_int iv_limit = -1;
12514 if (max_loop_iterations (loop, &iv_limit))
12515 {
12516 if (niters_skip)
12517 {
12518 /* Add the maximum number of skipped iterations to the
12519 maximum iteration count. */
12520 if (TREE_CODE (niters_skip) == INTEGER_CST)
12521 iv_limit += wi::to_widest (niters_skip);
12522 else
12523 iv_limit += max_vf - 1;
12524 }
12525 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
12526 /* Make a conservatively-correct assumption. */
12527 iv_limit += max_vf - 1;
12528
12529 /* IV_LIMIT is the maximum number of latch iterations, which is also
12530 the maximum in-range IV value. Round this value down to the previous
12531 vector alignment boundary and then add an extra full iteration. */
12532 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
12533 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
12534 }
12535 return iv_limit;
12536 }
12537
12538 /* For the given rgroup_controls RGC, check whether an induction variable
12539 would ever hit a value that produces a set of all-false masks or zero
12540 lengths before wrapping around. Return true if it's possible to wrap
12541 around before hitting the desirable value, otherwise return false. */
12542
12543 bool
12544 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
12545 {
12546 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
12547
12548 if (iv_limit == -1)
12549 return true;
12550
12551 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
12552 unsigned int compare_precision = TYPE_PRECISION (compare_type);
12553 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
12554
12555 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
12556 return true;
12557
12558 return false;
12559 }