gcc/tree-vect-loop.cc
1 /* Loop Vectorization
2 Copyright (C) 2003-2023 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #define INCLUDE_ALGORITHM
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "diagnostic-core.h"
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "cfganal.h"
39 #include "gimplify.h"
40 #include "gimple-iterator.h"
41 #include "gimplify-me.h"
42 #include "tree-ssa-loop-ivopts.h"
43 #include "tree-ssa-loop-manip.h"
44 #include "tree-ssa-loop-niter.h"
45 #include "tree-ssa-loop.h"
46 #include "cfgloop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57 #include "case-cfn-macros.h"
58 #include "langhooks.h"
59
60 /* Loop Vectorization Pass.
61
62 This pass tries to vectorize loops.
63
64 For example, the vectorizer transforms the following simple loop:
65
66 short a[N]; short b[N]; short c[N]; int i;
67
68 for (i=0; i<N; i++){
69 a[i] = b[i] + c[i];
70 }
71
   72    as if it had been manually vectorized by rewriting the source code into:
73
74 typedef int __attribute__((mode(V8HI))) v8hi;
75 short a[N]; short b[N]; short c[N]; int i;
76 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
77 v8hi va, vb, vc;
78
79 for (i=0; i<N/8; i++){
80 vb = pb[i];
81 vc = pc[i];
82 va = vb + vc;
83 pa[i] = va;
84 }
85
86 The main entry to this pass is vectorize_loops(), in which
87 the vectorizer applies a set of analyses on a given set of loops,
88 followed by the actual vectorization transformation for the loops that
89 had successfully passed the analysis phase.
90 Throughout this pass we make a distinction between two types of
91 data: scalars (which are represented by SSA_NAMES), and memory references
92 ("data-refs"). These two types of data require different handling both
93 during analysis and transformation. The types of data-refs that the
   94    vectorizer currently supports are ARRAY_REFS whose base is an array DECL
95 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
96 accesses are required to have a simple (consecutive) access pattern.
97
98 Analysis phase:
99 ===============
100 The driver for the analysis phase is vect_analyze_loop().
101 It applies a set of analyses, some of which rely on the scalar evolution
102 analyzer (scev) developed by Sebastian Pop.
103
104 During the analysis phase the vectorizer records some information
105 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
106 loop, as well as general information about the loop as a whole, which is
107 recorded in a "loop_vec_info" struct attached to each loop.
108
109 Transformation phase:
110 =====================
111 The loop transformation phase scans all the stmts in the loop, and
112 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
113 the loop that needs to be vectorized. It inserts the vector code sequence
114 just before the scalar stmt S, and records a pointer to the vector code
115 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
116 attached to S). This pointer will be used for the vectorization of following
117 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
118 otherwise, we rely on dead code elimination for removing it.
119
120 For example, say stmt S1 was vectorized into stmt VS1:
121
122 VS1: vb = px[i];
123 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
124 S2: a = b;
125
126 To vectorize stmt S2, the vectorizer first finds the stmt that defines
127 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
128 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
129 resulting sequence would be:
130
131 VS1: vb = px[i];
132 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
133 VS2: va = vb;
134 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
135
  136    Operands that are not SSA_NAMEs are data-refs that appear in
137 load/store operations (like 'x[i]' in S1), and are handled differently.
138
139 Target modeling:
140 =================
  141    Currently the only target-specific information that is used is the
  142    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
  143    Targets that can support different vector sizes will, for now, need
144 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
145 flexibility will be added in the future.
146
  147    Since we only vectorize operations whose vector form can be
  148    expressed using existing tree codes, to verify that an operation is
  149    supported, the vectorizer checks the relevant optab at the relevant
  150    machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
151 the value found is CODE_FOR_nothing, then there's no target support, and
152 we can't vectorize the stmt.
153
154 For additional information on this project see:
155 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
156 */
157
158 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *,
159 unsigned *);
160 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
161 bool *, bool *, bool);
162
163 /* Subroutine of vect_determine_vf_for_stmt that handles only one
164 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
165 may already be set for general statements (not just data refs). */
166
167 static opt_result
168 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
169 bool vectype_maybe_set_p,
170 poly_uint64 *vf)
171 {
172 gimple *stmt = stmt_info->stmt;
173
174 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
175 && !STMT_VINFO_LIVE_P (stmt_info))
176 || gimple_clobber_p (stmt))
177 {
178 if (dump_enabled_p ())
179 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
180 return opt_result::success ();
181 }
182
183 tree stmt_vectype, nunits_vectype;
184 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
185 &stmt_vectype,
186 &nunits_vectype);
187 if (!res)
188 return res;
189
190 if (stmt_vectype)
191 {
192 if (STMT_VINFO_VECTYPE (stmt_info))
  193	/* The only case when a vectype has already been set is for stmts
194 that contain a data ref, or for "pattern-stmts" (stmts generated
195 by the vectorizer to represent/replace a certain idiom). */
196 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
197 || vectype_maybe_set_p)
198 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
199 else
200 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
201 }
202
203 if (nunits_vectype)
204 vect_update_max_nunits (vf, nunits_vectype);
205
206 return opt_result::success ();
207 }
208
209 /* Subroutine of vect_determine_vectorization_factor. Set the vector
210 types of STMT_INFO and all attached pattern statements and update
211 the vectorization factor VF accordingly. Return true on success
212 or false if something prevented vectorization. */
213
214 static opt_result
215 vect_determine_vf_for_stmt (vec_info *vinfo,
216 stmt_vec_info stmt_info, poly_uint64 *vf)
217 {
218 if (dump_enabled_p ())
219 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
220 stmt_info->stmt);
221 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
222 if (!res)
223 return res;
224
225 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
226 && STMT_VINFO_RELATED_STMT (stmt_info))
227 {
228 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
229 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
230
231 /* If a pattern statement has def stmts, analyze them too. */
232 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
233 !gsi_end_p (si); gsi_next (&si))
234 {
235 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
236 if (dump_enabled_p ())
237 dump_printf_loc (MSG_NOTE, vect_location,
238 "==> examining pattern def stmt: %G",
239 def_stmt_info->stmt);
240 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
241 if (!res)
242 return res;
243 }
244
245 if (dump_enabled_p ())
246 dump_printf_loc (MSG_NOTE, vect_location,
247 "==> examining pattern statement: %G",
248 stmt_info->stmt);
249 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
250 if (!res)
251 return res;
252 }
253
254 return opt_result::success ();
255 }
256
257 /* Function vect_determine_vectorization_factor
258
259 Determine the vectorization factor (VF). VF is the number of data elements
260 that are operated upon in parallel in a single iteration of the vectorized
  261    loop.  For example, when vectorizing a loop that operates on 4-byte elements,
  262    on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
263 elements can fit in a single vector register.
264
265 We currently support vectorization of loops in which all types operated upon
266 are of the same size. Therefore this function currently sets VF according to
267 the size of the types operated upon, and fails if there are multiple sizes
268 in the loop.
269
270 VF is also the factor by which the loop iterations are strip-mined, e.g.:
271 original loop:
272 for (i=0; i<N; i++){
273 a[i] = b[i] + c[i];
274 }
275
276 vectorized loop:
277 for (i=0; i<N; i+=VF){
278 a[i:VF] = b[i:VF] + c[i:VF];
279 }
280 */
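/* A worked example of the above (added for illustration, using the same
   16-byte vector size as the v8hi example in the file-header comment):
   a loop whose statements all operate on 2-byte "short" elements gets
   VF = 8, since one V8HI vector holds 8 such elements; with 8-byte
   "double" elements the VF would instead be 2.  */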
281
282 static opt_result
283 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
284 {
285 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
286 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
287 unsigned nbbs = loop->num_nodes;
288 poly_uint64 vectorization_factor = 1;
289 tree scalar_type = NULL_TREE;
290 gphi *phi;
291 tree vectype;
292 stmt_vec_info stmt_info;
293 unsigned i;
294
295 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
296
297 for (i = 0; i < nbbs; i++)
298 {
299 basic_block bb = bbs[i];
300
301 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
302 gsi_next (&si))
303 {
304 phi = si.phi ();
305 stmt_info = loop_vinfo->lookup_stmt (phi);
306 if (dump_enabled_p ())
307 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
308 (gimple *) phi);
309
310 gcc_assert (stmt_info);
311
312 if (STMT_VINFO_RELEVANT_P (stmt_info)
313 || STMT_VINFO_LIVE_P (stmt_info))
314 {
315 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
316 scalar_type = TREE_TYPE (PHI_RESULT (phi));
317
318 if (dump_enabled_p ())
319 dump_printf_loc (MSG_NOTE, vect_location,
320 "get vectype for scalar type: %T\n",
321 scalar_type);
322
323 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
324 if (!vectype)
325 return opt_result::failure_at (phi,
326 "not vectorized: unsupported "
327 "data-type %T\n",
328 scalar_type);
329 STMT_VINFO_VECTYPE (stmt_info) = vectype;
330
331 if (dump_enabled_p ())
332 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
333 vectype);
334
335 if (dump_enabled_p ())
336 {
337 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
338 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
339 dump_printf (MSG_NOTE, "\n");
340 }
341
342 vect_update_max_nunits (&vectorization_factor, vectype);
343 }
344 }
345
346 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
347 gsi_next (&si))
348 {
349 if (is_gimple_debug (gsi_stmt (si)))
350 continue;
351 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
352 opt_result res
353 = vect_determine_vf_for_stmt (loop_vinfo,
354 stmt_info, &vectorization_factor);
355 if (!res)
356 return res;
357 }
358 }
359
360 /* TODO: Analyze cost. Decide if worth while to vectorize. */
361 if (dump_enabled_p ())
362 {
363 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
364 dump_dec (MSG_NOTE, vectorization_factor);
365 dump_printf (MSG_NOTE, "\n");
366 }
367
368 if (known_le (vectorization_factor, 1U))
369 return opt_result::failure_at (vect_location,
370 "not vectorized: unsupported data-type\n");
371 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
372 return opt_result::success ();
373 }
374
375
376 /* Function vect_is_simple_iv_evolution.
377
  378    FORNOW: A simple evolution of an induction variable in the loop is
379 considered a polynomial evolution. */
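/* For illustration (an example added here, with arbitrary names): for the
   classic induction

     for (i = 0; i < n; i++)
       p = p + 4;

   scev computes the access function of p as the chrec {p_0, +, 4}_loop,
   so the function below returns *INIT = p_0 and *STEP = 4.  */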
380
381 static bool
382 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
383 tree * step)
384 {
385 tree init_expr;
386 tree step_expr;
387 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
388 basic_block bb;
389
390 /* When there is no evolution in this loop, the evolution function
391 is not "simple". */
392 if (evolution_part == NULL_TREE)
393 return false;
394
395 /* When the evolution is a polynomial of degree >= 2
396 the evolution function is not "simple". */
397 if (tree_is_chrec (evolution_part))
398 return false;
399
400 step_expr = evolution_part;
401 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
402
403 if (dump_enabled_p ())
404 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
405 step_expr, init_expr);
406
407 *init = init_expr;
408 *step = step_expr;
409
410 if (TREE_CODE (step_expr) != INTEGER_CST
411 && (TREE_CODE (step_expr) != SSA_NAME
412 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
413 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
414 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
415 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
416 || !flag_associative_math)))
417 && (TREE_CODE (step_expr) != REAL_CST
418 || !flag_associative_math))
419 {
420 if (dump_enabled_p ())
421 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
422 "step unknown.\n");
423 return false;
424 }
425
426 return true;
427 }
428
429 /* Function vect_is_nonlinear_iv_evolution
430
  431    Only support nonlinear induction for integer types:
432 1. neg
433 2. mul by constant
434 3. lshift/rshift by constant.
435
436 For neg induction, return a fake step as integer -1. */
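/* For illustration (an example added here, with arbitrary names), a loop
   such as

     for (i = 0; i < n; i++)
       {
         a[i] = x;
         x = x << 1;
       }

   has a nonlinear induction for x; the function below classifies it as
   vect_step_op_shl, with *INIT the initial value of x and *STEP the
   shift amount 1.  */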
437 static bool
438 vect_is_nonlinear_iv_evolution (class loop* loop, stmt_vec_info stmt_info,
439 gphi* loop_phi_node, tree *init, tree *step)
440 {
441 tree init_expr, ev_expr, result, op1, op2;
442 gimple* def;
443
444 if (gimple_phi_num_args (loop_phi_node) != 2)
445 return false;
446
447 init_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_preheader_edge (loop));
448 ev_expr = PHI_ARG_DEF_FROM_EDGE (loop_phi_node, loop_latch_edge (loop));
449
450 /* Support nonlinear induction only for integer type. */
451 if (!INTEGRAL_TYPE_P (TREE_TYPE (init_expr)))
452 return false;
453
454 *init = init_expr;
455 result = PHI_RESULT (loop_phi_node);
456
457 if (TREE_CODE (ev_expr) != SSA_NAME
458 || ((def = SSA_NAME_DEF_STMT (ev_expr)), false)
459 || !is_gimple_assign (def))
460 return false;
461
462 enum tree_code t_code = gimple_assign_rhs_code (def);
463 switch (t_code)
464 {
465 case NEGATE_EXPR:
466 if (gimple_assign_rhs1 (def) != result)
467 return false;
468 *step = build_int_cst (TREE_TYPE (init_expr), -1);
469 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_neg;
470 break;
471
472 case RSHIFT_EXPR:
473 case LSHIFT_EXPR:
474 case MULT_EXPR:
475 op1 = gimple_assign_rhs1 (def);
476 op2 = gimple_assign_rhs2 (def);
477 if (TREE_CODE (op2) != INTEGER_CST
478 || op1 != result)
479 return false;
480 *step = op2;
481 if (t_code == LSHIFT_EXPR)
482 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shl;
483 else if (t_code == RSHIFT_EXPR)
484 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_shr;
485 /* NEGATE_EXPR and MULT_EXPR are both vect_step_op_mul. */
486 else
487 STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info) = vect_step_op_mul;
488 break;
489
490 default:
491 return false;
492 }
493
494 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_info) = *init;
495 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info) = *step;
496
497 return true;
498 }
499
500 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
501 what we are assuming is a double reduction. For example, given
502 a structure like this:
503
504 outer1:
505 x_1 = PHI <x_4(outer2), ...>;
506 ...
507
508 inner:
509 x_2 = PHI <x_1(outer1), ...>;
510 ...
511 x_3 = ...;
512 ...
513
514 outer2:
515 x_4 = PHI <x_3(inner)>;
516 ...
517
518 outer loop analysis would treat x_1 as a double reduction phi and
519 this function would then return true for x_2. */
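/* At the source level such a double reduction typically comes from an
   accumulation carried across both loops of a nest, e.g. (illustrative
   example, arbitrary names):

     for (i = 0; i < n; i++)
       for (j = 0; j < m; j++)
         sum += a[i][j];

   where, when the outer loop is analyzed, the outer PHI for sum plays
   the role of x_1 above and the inner-loop PHI for sum plays the role
   of x_2.  */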
520
521 static bool
522 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
523 {
524 use_operand_p use_p;
525 ssa_op_iter op_iter;
526 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
527 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
528 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
529 return true;
530 return false;
531 }
532
  533 /* Return true if PHI is a first-order recurrence.  A first-order
534 recurrence is a non-reduction recurrence relation in which the value of
535 the recurrence in the current loop iteration equals a value defined in
536 the previous iteration. */
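/* For illustration (an example added here, with arbitrary names), the PHI
   for t in

     t = 0;
     for (i = 0; i < n; i++)
       {
         b[i] = a[i] - t;
         t = a[i];
       }

   is such a first-order recurrence: each iteration uses the value of
   a[i] that was stored into t by the previous iteration.  */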
537
538 static bool
539 vect_phi_first_order_recurrence_p (loop_vec_info loop_vinfo, class loop *loop,
540 gphi *phi)
541 {
542 /* A nested cycle isn't vectorizable as first order recurrence. */
543 if (LOOP_VINFO_LOOP (loop_vinfo) != loop)
544 return false;
545
546 /* Ensure the loop latch definition is from within the loop. */
547 edge latch = loop_latch_edge (loop);
548 tree ldef = PHI_ARG_DEF_FROM_EDGE (phi, latch);
549 if (TREE_CODE (ldef) != SSA_NAME
550 || SSA_NAME_IS_DEFAULT_DEF (ldef)
551 || is_a <gphi *> (SSA_NAME_DEF_STMT (ldef))
552 || !flow_bb_inside_loop_p (loop, gimple_bb (SSA_NAME_DEF_STMT (ldef))))
553 return false;
554
555 tree def = gimple_phi_result (phi);
556
557 /* Ensure every use_stmt of the phi node is dominated by the latch
558 definition. */
559 imm_use_iterator imm_iter;
560 use_operand_p use_p;
561 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
562 if (!is_gimple_debug (USE_STMT (use_p))
563 && (SSA_NAME_DEF_STMT (ldef) == USE_STMT (use_p)
564 || !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (ldef),
565 USE_STMT (use_p))))
566 return false;
567
568 /* First-order recurrence autovectorization needs shuffle vector. */
569 tree scalar_type = TREE_TYPE (def);
570 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
571 if (!vectype)
572 return false;
573
574 return true;
575 }
576
577 /* Function vect_analyze_scalar_cycles_1.
578
579 Examine the cross iteration def-use cycles of scalar variables
580 in LOOP. LOOP_VINFO represents the loop that is now being
581 considered for vectorization (can be LOOP, or an outer-loop
  582    enclosing LOOP).  SLP indicates whether there will be any subsequent
  583    SLP analyses.  */
584
585 static void
586 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop,
587 bool slp)
588 {
589 basic_block bb = loop->header;
590 tree init, step;
591 auto_vec<stmt_vec_info, 64> worklist;
592 gphi_iterator gsi;
593 bool double_reduc, reduc_chain;
594
595 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
596
597 /* First - identify all inductions. Reduction detection assumes that all the
598 inductions have been identified, therefore, this order must not be
599 changed. */
600 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
601 {
602 gphi *phi = gsi.phi ();
603 tree access_fn = NULL;
604 tree def = PHI_RESULT (phi);
605 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
606
607 if (dump_enabled_p ())
608 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
609 (gimple *) phi);
610
  611	  /* Skip virtual PHIs.  The data dependences that are associated with
612 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
613 if (virtual_operand_p (def))
614 continue;
615
616 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
617
618 /* Analyze the evolution function. */
619 access_fn = analyze_scalar_evolution (loop, def);
620 if (access_fn)
621 {
622 STRIP_NOPS (access_fn);
623 if (dump_enabled_p ())
624 dump_printf_loc (MSG_NOTE, vect_location,
625 "Access function of PHI: %T\n", access_fn);
626 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
627 = initial_condition_in_loop_num (access_fn, loop->num);
628 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
629 = evolution_part_in_loop_num (access_fn, loop->num);
630 }
631
632 if ((!access_fn
633 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
634 || !vect_is_simple_iv_evolution (loop->num, access_fn,
635 &init, &step)
636 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
637 && TREE_CODE (step) != INTEGER_CST))
  638	       /* Only handle nonlinear IVs for the same loop.  */
639 && (LOOP_VINFO_LOOP (loop_vinfo) != loop
640 || !vect_is_nonlinear_iv_evolution (loop, stmt_vinfo,
641 phi, &init, &step)))
642 {
643 worklist.safe_push (stmt_vinfo);
644 continue;
645 }
646
647 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
648 != NULL_TREE);
649 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
650
651 if (dump_enabled_p ())
652 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
653 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
654 }
655
656
657 /* Second - identify all reductions and nested cycles. */
658 while (worklist.length () > 0)
659 {
660 stmt_vec_info stmt_vinfo = worklist.pop ();
661 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
662 tree def = PHI_RESULT (phi);
663
664 if (dump_enabled_p ())
665 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G",
666 (gimple *) phi);
667
668 gcc_assert (!virtual_operand_p (def)
669 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
670
671 stmt_vec_info reduc_stmt_info
672 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
673 &reduc_chain, slp);
674 if (reduc_stmt_info)
675 {
676 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
677 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
678 if (double_reduc)
679 {
680 if (dump_enabled_p ())
681 dump_printf_loc (MSG_NOTE, vect_location,
682 "Detected double reduction.\n");
683
684 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
685 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
686 }
687 else
688 {
689 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
690 {
691 if (dump_enabled_p ())
692 dump_printf_loc (MSG_NOTE, vect_location,
693 "Detected vectorizable nested cycle.\n");
694
695 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
696 }
697 else
698 {
699 if (dump_enabled_p ())
700 dump_printf_loc (MSG_NOTE, vect_location,
701 "Detected reduction.\n");
702
703 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
704 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
705 /* Store the reduction cycles for possible vectorization in
706 loop-aware SLP if it was not detected as reduction
707 chain. */
708 if (! reduc_chain)
709 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
710 (reduc_stmt_info);
711 }
712 }
713 }
714 else if (vect_phi_first_order_recurrence_p (loop_vinfo, loop, phi))
715 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_first_order_recurrence;
716 else
717 if (dump_enabled_p ())
718 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
719 "Unknown def-use cycle pattern.\n");
720 }
721 }
722
723
724 /* Function vect_analyze_scalar_cycles.
725
726 Examine the cross iteration def-use cycles of scalar variables, by
727 analyzing the loop-header PHIs of scalar variables. Classify each
728 cycle as one of the following: invariant, induction, reduction, unknown.
  729    We do that for the loop represented by LOOP_VINFO, and also for its
  730    inner-loop, if it exists.
731 Examples for scalar cycles:
732
733 Example1: reduction:
734
735 loop1:
736 for (i=0; i<N; i++)
737 sum += a[i];
738
739 Example2: induction:
740
741 loop2:
742 for (i=0; i<N; i++)
743 a[i] = i; */
744
745 static void
746 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo, bool slp)
747 {
748 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
749
750 vect_analyze_scalar_cycles_1 (loop_vinfo, loop, slp);
751
752 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
  753      Reductions in such an inner-loop therefore have different properties than
754 the reductions in the nest that gets vectorized:
755 1. When vectorized, they are executed in the same order as in the original
756 scalar loop, so we can't change the order of computation when
757 vectorizing them.
758 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
759 current checks are too strict. */
760
761 if (loop->inner)
762 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner, slp);
763 }
764
765 /* Transfer group and reduction information from STMT_INFO to its
766 pattern stmt. */
767
768 static void
769 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
770 {
771 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
772 stmt_vec_info stmtp;
773 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
774 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
775 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
776 do
777 {
778 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
779 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
780 == STMT_VINFO_DEF_TYPE (stmt_info));
781 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
782 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
783 if (stmt_info)
784 REDUC_GROUP_NEXT_ELEMENT (stmtp)
785 = STMT_VINFO_RELATED_STMT (stmt_info);
786 }
787 while (stmt_info);
788 }
789
790 /* Fixup scalar cycles that now have their stmts detected as patterns. */
791
792 static void
793 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
794 {
795 stmt_vec_info first;
796 unsigned i;
797
798 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
799 {
800 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
801 while (next)
802 {
803 if ((STMT_VINFO_IN_PATTERN_P (next)
804 != STMT_VINFO_IN_PATTERN_P (first))
805 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
806 break;
807 next = REDUC_GROUP_NEXT_ELEMENT (next);
808 }
  809       /* If all reduction chain members are well-formed patterns, adjust
810 the group to group the pattern stmts instead. */
811 if (! next
812 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
813 {
814 if (STMT_VINFO_IN_PATTERN_P (first))
815 {
816 vect_fixup_reduc_chain (first);
817 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
818 = STMT_VINFO_RELATED_STMT (first);
819 }
820 }
  821       /* If not all stmts in the chain are patterns, or if we failed
  822	     to update STMT_VINFO_REDUC_IDX, dissolve the chain and handle
  823	     it as a regular reduction instead.  */
824 else
825 {
826 stmt_vec_info vinfo = first;
827 stmt_vec_info last = NULL;
828 while (vinfo)
829 {
830 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
831 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
832 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
833 last = vinfo;
834 vinfo = next;
835 }
836 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
837 = vect_internal_def;
838 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
839 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
840 --i;
841 }
842 }
843 }
844
845 /* Function vect_get_loop_niters.
846
  847    Determine how many iterations the loop executes and place the count
848 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
849 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
850 niter information holds in ASSUMPTIONS.
851
852 Return the loop exit condition. */
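/* For example (illustrative): for a simple counted loop

     for (i = 0; i < n; i++)
       ...

   with n > 0, the latch runs n - 1 times, so NUMBER_OF_ITERATIONSM1 is
   n - 1 and NUMBER_OF_ITERATIONS, the number of header executions, is n.  */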
853
854
855 static gcond *
856 vect_get_loop_niters (class loop *loop, tree *assumptions,
857 tree *number_of_iterations, tree *number_of_iterationsm1)
858 {
859 edge exit = single_exit (loop);
860 class tree_niter_desc niter_desc;
861 tree niter_assumptions, niter, may_be_zero;
862 gcond *cond = get_loop_exit_condition (loop);
863
864 *assumptions = boolean_true_node;
865 *number_of_iterationsm1 = chrec_dont_know;
866 *number_of_iterations = chrec_dont_know;
867 DUMP_VECT_SCOPE ("get_loop_niters");
868
869 if (!exit)
870 return cond;
871
872 may_be_zero = NULL_TREE;
873 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
874 || chrec_contains_undetermined (niter_desc.niter))
875 return cond;
876
877 niter_assumptions = niter_desc.assumptions;
878 may_be_zero = niter_desc.may_be_zero;
879 niter = niter_desc.niter;
880
881 if (may_be_zero && integer_zerop (may_be_zero))
882 may_be_zero = NULL_TREE;
883
884 if (may_be_zero)
885 {
886 if (COMPARISON_CLASS_P (may_be_zero))
887 {
888 /* Try to combine may_be_zero with assumptions, this can simplify
889 computation of niter expression. */
890 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
891 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
892 niter_assumptions,
893 fold_build1 (TRUTH_NOT_EXPR,
894 boolean_type_node,
895 may_be_zero));
896 else
897 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
898 build_int_cst (TREE_TYPE (niter), 0),
899 rewrite_to_non_trapping_overflow (niter));
900
901 may_be_zero = NULL_TREE;
902 }
903 else if (integer_nonzerop (may_be_zero))
904 {
905 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
906 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
907 return cond;
908 }
909 else
910 return cond;
911 }
912
913 *assumptions = niter_assumptions;
914 *number_of_iterationsm1 = niter;
915
916 /* We want the number of loop header executions which is the number
917 of latch executions plus one.
918 ??? For UINT_MAX latch executions this number overflows to zero
919 for loops like do { n++; } while (n != 0); */
920 if (niter && !chrec_contains_undetermined (niter))
921 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
922 build_int_cst (TREE_TYPE (niter), 1));
923 *number_of_iterations = niter;
924
925 return cond;
926 }
927
928 /* Function bb_in_loop_p
929
930 Used as predicate for dfs order traversal of the loop bbs. */
931
932 static bool
933 bb_in_loop_p (const_basic_block bb, const void *data)
934 {
935 const class loop *const loop = (const class loop *)data;
936 if (flow_bb_inside_loop_p (loop, bb))
937 return true;
938 return false;
939 }
940
941
942 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
943 stmt_vec_info structs for all the stmts in LOOP_IN. */
944
945 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
946 : vec_info (vec_info::loop, shared),
947 loop (loop_in),
948 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
949 num_itersm1 (NULL_TREE),
950 num_iters (NULL_TREE),
951 num_iters_unchanged (NULL_TREE),
952 num_iters_assumptions (NULL_TREE),
953 vector_costs (nullptr),
954 scalar_costs (nullptr),
955 th (0),
956 versioning_threshold (0),
957 vectorization_factor (0),
958 main_loop_edge (nullptr),
959 skip_main_loop_edge (nullptr),
960 skip_this_loop_edge (nullptr),
961 reusable_accumulators (),
962 suggested_unroll_factor (1),
963 max_vectorization_factor (0),
964 mask_skip_niters (NULL_TREE),
965 rgroup_compare_type (NULL_TREE),
966 simd_if_cond (NULL_TREE),
967 partial_vector_style (vect_partial_vectors_none),
968 unaligned_dr (NULL),
969 peeling_for_alignment (0),
970 ptr_mask (0),
971 ivexpr_map (NULL),
972 scan_map (NULL),
973 slp_unrolling_factor (1),
974 inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
975 vectorizable (false),
976 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
977 using_partial_vectors_p (false),
978 using_decrementing_iv_p (false),
979 using_select_vl_p (false),
980 epil_using_partial_vectors_p (false),
981 partial_load_store_bias (0),
982 peeling_for_gaps (false),
983 peeling_for_niter (false),
984 no_data_dependencies (false),
985 has_mask_store (false),
986 scalar_loop_scaling (profile_probability::uninitialized ()),
987 scalar_loop (NULL),
988 orig_loop_info (NULL)
989 {
990 /* CHECKME: We want to visit all BBs before their successors (except for
991 latch blocks, for which this assertion wouldn't hold). In the simple
  992      case of the loop forms we allow, a dfs order of the BBs would be the same
993 as reversed postorder traversal, so we are safe. */
994
995 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
996 bbs, loop->num_nodes, loop);
997 gcc_assert (nbbs == loop->num_nodes);
998
999 for (unsigned int i = 0; i < nbbs; i++)
1000 {
1001 basic_block bb = bbs[i];
1002 gimple_stmt_iterator si;
1003
1004 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1005 {
1006 gimple *phi = gsi_stmt (si);
1007 gimple_set_uid (phi, 0);
1008 add_stmt (phi);
1009 }
1010
1011 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1012 {
1013 gimple *stmt = gsi_stmt (si);
1014 gimple_set_uid (stmt, 0);
1015 if (is_gimple_debug (stmt))
1016 continue;
1017 add_stmt (stmt);
 1018	  /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments,
 1019	     the third argument is the #pragma omp simd if (x) condition: when 0,
 1020	     the loop shouldn't be vectorized; when a non-zero constant, it should
 1021	     be vectorized normally; otherwise the loop is versioned, with the
 1022	     vectorized copy executed only if the condition is non-zero at runtime. */
1023 if (loop_in->simduid
1024 && is_gimple_call (stmt)
1025 && gimple_call_internal_p (stmt)
1026 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
1027 && gimple_call_num_args (stmt) >= 3
1028 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
1029 && (loop_in->simduid
1030 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
1031 {
1032 tree arg = gimple_call_arg (stmt, 2);
1033 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
1034 simd_if_cond = arg;
1035 else
1036 gcc_assert (integer_nonzerop (arg));
1037 }
1038 }
1039 }
1040
1041 epilogue_vinfos.create (6);
1042 }
1043
1044 /* Free all levels of rgroup CONTROLS. */
1045
1046 void
1047 release_vec_loop_controls (vec<rgroup_controls> *controls)
1048 {
1049 rgroup_controls *rgc;
1050 unsigned int i;
1051 FOR_EACH_VEC_ELT (*controls, i, rgc)
1052 rgc->controls.release ();
1053 controls->release ();
1054 }
1055
1056 /* Free all memory used by the _loop_vec_info, as well as all the
1057 stmt_vec_info structs of all the stmts in the loop. */
1058
1059 _loop_vec_info::~_loop_vec_info ()
1060 {
1061 free (bbs);
1062
1063 release_vec_loop_controls (&masks.rgc_vec);
1064 release_vec_loop_controls (&lens);
1065 delete ivexpr_map;
1066 delete scan_map;
1067 epilogue_vinfos.release ();
1068 delete scalar_costs;
1069 delete vector_costs;
1070
 1071   /* When we release an epilogue vinfo that we do not intend to use,
 1072      avoid clearing AUX of the main loop, which should continue to
 1073      point to the main loop vinfo, since otherwise we'll leak that. */
1074 if (loop->aux == this)
1075 loop->aux = NULL;
1076 }
1077
1078 /* Return an invariant or register for EXPR and emit necessary
1079 computations in the LOOP_VINFO loop preheader. */
1080
1081 tree
1082 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1083 {
1084 if (is_gimple_reg (expr)
1085 || is_gimple_min_invariant (expr))
1086 return expr;
1087
1088 if (! loop_vinfo->ivexpr_map)
1089 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1090 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1091 if (! cached)
1092 {
1093 gimple_seq stmts = NULL;
1094 cached = force_gimple_operand (unshare_expr (expr),
1095 &stmts, true, NULL_TREE);
1096 if (stmts)
1097 {
1098 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1099 gsi_insert_seq_on_edge_immediate (e, stmts);
1100 }
1101 }
1102 return cached;
1103 }
1104
1105 /* Return true if we can use CMP_TYPE as the comparison type to produce
1106 all masks required to mask LOOP_VINFO. */
1107
1108 static bool
1109 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1110 {
1111 rgroup_controls *rgm;
1112 unsigned int i;
1113 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1114 if (rgm->type != NULL_TREE
1115 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1116 cmp_type, rgm->type,
1117 OPTIMIZE_FOR_SPEED))
1118 return false;
1119 return true;
1120 }
1121
1122 /* Calculate the maximum number of scalars per iteration for every
1123 rgroup in LOOP_VINFO. */
1124
1125 static unsigned int
1126 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1127 {
1128 unsigned int res = 1;
1129 unsigned int i;
1130 rgroup_controls *rgm;
1131 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec, i, rgm)
1132 res = MAX (res, rgm->max_nscalars_per_iter);
1133 return res;
1134 }
1135
1136 /* Calculate the minimum precision necessary to represent:
1137
1138 MAX_NITERS * FACTOR
1139
1140 as an unsigned integer, where MAX_NITERS is the maximum number of
1141 loop header iterations for the original scalar form of LOOP_VINFO. */
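/* For example (a worked calculation added for illustration): if the scalar
   loop runs at most 1000 iterations and FACTOR is 4, the product 4000
   requires 12 bits as an unsigned value, since 2^11 <= 4000 < 2^12.  */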
1142
1143 static unsigned
1144 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1145 {
1146 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1147
1148 /* Get the maximum number of iterations that is representable
1149 in the counter type. */
1150 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1151 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1152
1153 /* Get a more refined estimate for the number of iterations. */
1154 widest_int max_back_edges;
1155 if (max_loop_iterations (loop, &max_back_edges))
1156 max_ni = wi::smin (max_ni, max_back_edges + 1);
1157
1158 /* Work out how many bits we need to represent the limit. */
1159 return wi::min_precision (max_ni * factor, UNSIGNED);
1160 }
1161
1162 /* True if the loop needs peeling or partial vectors when vectorized. */
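/* For example (illustrative numbers): with a known iteration count of 100,
   a vectorization factor of 8 and no peeling for alignment or gaps,
   100 is not a multiple of 8, so the function below returns true: the
   remaining 4 iterations need an epilogue or partial vectors.  */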
1163
1164 static bool
1165 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1166 {
1167 unsigned HOST_WIDE_INT const_vf;
1168 HOST_WIDE_INT max_niter
1169 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1170
1171 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1172 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1173 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1174 (loop_vinfo));
1175
1176 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1177 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1178 {
1179 /* Work out the (constant) number of iterations that need to be
1180 peeled for reasons other than niters. */
1181 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1182 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1183 peel_niter += 1;
1184 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1185 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1186 return true;
1187 }
1188 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1189 /* ??? When peeling for gaps but not alignment, we could
1190 try to check whether the (variable) niters is known to be
1191 VF * N + 1. That's something of a niche case though. */
1192 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1193 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1194 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1195 < (unsigned) exact_log2 (const_vf))
1196 /* In case of versioning, check if the maximum number of
1197 iterations is greater than th. If they are identical,
1198 the epilogue is unnecessary. */
1199 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1200 || ((unsigned HOST_WIDE_INT) max_niter
1201 > (th / const_vf) * const_vf))))
1202 return true;
1203
1204 return false;
1205 }
1206
1207 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1208 whether we can actually generate the masks required. Return true if so,
1209 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
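/* As an illustration of the mask generation being checked for (assuming a
   target with WHILE_ULT support): with a scalar IV i counting processed
   elements, the mask for one vector is conceptually WHILE_ULT (i, niters),
   i.e. lane j is active iff i + j < niters, so the final, partial
   iteration is handled by deactivating the excess lanes.  */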
1210
1211 static bool
1212 vect_verify_full_masking (loop_vec_info loop_vinfo)
1213 {
1214 unsigned int min_ni_width;
1215
1216 /* Use a normal loop if there are no statements that need masking.
1217 This only happens in rare degenerate cases: it means that the loop
1218 has no loads, no stores, and no live-out values. */
1219 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1220 return false;
1221
1222 /* Produce the rgroup controls. */
1223 for (auto mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1224 {
1225 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1226 tree vectype = mask.first;
1227 unsigned nvectors = mask.second;
1228
1229 if (masks->rgc_vec.length () < nvectors)
1230 masks->rgc_vec.safe_grow_cleared (nvectors, true);
1231 rgroup_controls *rgm = &(*masks).rgc_vec[nvectors - 1];
1232 /* The number of scalars per iteration and the number of vectors are
1233 both compile-time constants. */
1234 unsigned int nscalars_per_iter
1235 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1236 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1237
1238 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
1239 {
1240 rgm->max_nscalars_per_iter = nscalars_per_iter;
1241 rgm->type = truth_type_for (vectype);
1242 rgm->factor = 1;
1243 }
1244 }
1245
1246 unsigned int max_nscalars_per_iter
1247 = vect_get_max_nscalars_per_iter (loop_vinfo);
1248
1249 /* Work out how many bits we need to represent the limit. */
1250 min_ni_width
1251 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1252
1253 /* Find a scalar mode for which WHILE_ULT is supported. */
1254 opt_scalar_int_mode cmp_mode_iter;
1255 tree cmp_type = NULL_TREE;
1256 tree iv_type = NULL_TREE;
1257 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1258 unsigned int iv_precision = UINT_MAX;
1259
1260 if (iv_limit != -1)
1261 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1262 UNSIGNED);
1263
1264 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1265 {
1266 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1267 if (cmp_bits >= min_ni_width
1268 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1269 {
1270 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1271 if (this_type
1272 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1273 {
1274 /* Although we could stop as soon as we find a valid mode,
1275 there are at least two reasons why that's not always the
1276 best choice:
1277
1278 - An IV that's Pmode or wider is more likely to be reusable
1279 in address calculations than an IV that's narrower than
1280 Pmode.
1281
1282 - Doing the comparison in IV_PRECISION or wider allows
1283 a natural 0-based IV, whereas using a narrower comparison
1284 type requires mitigations against wrap-around.
1285
1286 Conversely, if the IV limit is variable, doing the comparison
1287 in a wider type than the original type can introduce
1288 unnecessary extensions, so picking the widest valid mode
1289 is not always a good choice either.
1290
1291 Here we prefer the first IV type that's Pmode or wider,
1292 and the first comparison type that's IV_PRECISION or wider.
1293 (The comparison type must be no wider than the IV type,
1294 to avoid extensions in the vector loop.)
1295
1296 ??? We might want to try continuing beyond Pmode for ILP32
1297 targets if CMP_BITS < IV_PRECISION. */
1298 iv_type = this_type;
1299 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1300 cmp_type = this_type;
1301 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1302 break;
1303 }
1304 }
1305 }
1306
1307 if (!cmp_type)
1308 {
1309 LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.release ();
1310 return false;
1311 }
1312
1313 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1314 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1315 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_while_ult;
1316 return true;
1317 }
1318
1319 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1320 whether we can actually generate AVX512 style masks. Return true if so,
1321 storing the type of the scalar IV in LOOP_VINFO_RGROUP_IV_TYPE. */
1322
1323 static bool
1324 vect_verify_full_masking_avx512 (loop_vec_info loop_vinfo)
1325 {
 1326   /* Produce a differently organized rgc_vec and check differently
 1327      whether we can produce the masks.  */
1328
1329 /* Use a normal loop if there are no statements that need masking.
1330 This only happens in rare degenerate cases: it means that the loop
1331 has no loads, no stores, and no live-out values. */
1332 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1333 return false;
1334
1335 /* For the decrementing IV we need to represent all values in
 1336      [0, niter + niter_skip] where niter_skip is the number of elements we
1337 skip in the first iteration for prologue peeling. */
1338 tree iv_type = NULL_TREE;
1339 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1340 unsigned int iv_precision = UINT_MAX;
1341 if (iv_limit != -1)
1342 iv_precision = wi::min_precision (iv_limit, UNSIGNED);
1343
1344 /* First compute the type for the IV we use to track the remaining
1345 scalar iterations. */
1346 opt_scalar_int_mode cmp_mode_iter;
1347 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1348 {
1349 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1350 if (cmp_bits >= iv_precision
1351 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1352 {
1353 iv_type = build_nonstandard_integer_type (cmp_bits, true);
1354 if (iv_type)
1355 break;
1356 }
1357 }
1358 if (!iv_type)
1359 return false;
1360
1361 /* Produce the rgroup controls. */
1362 for (auto const &mask : LOOP_VINFO_MASKS (loop_vinfo).mask_set)
1363 {
1364 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
1365 tree vectype = mask.first;
1366 unsigned nvectors = mask.second;
1367
1368 /* The number of scalars per iteration and the number of vectors are
1369 both compile-time constants. */
1370 unsigned int nscalars_per_iter
1371 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
1372 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
1373
 1374	/* We index the rgroup_controls vector with nscalars_per_iter,
 1375	   which we keep constant, and instead have a varying nvectors,
 1376	   remembering the vector mask with the fewest nvectors (nV). */
1377 if (masks->rgc_vec.length () < nscalars_per_iter)
1378 masks->rgc_vec.safe_grow_cleared (nscalars_per_iter, true);
1379 rgroup_controls *rgm = &(*masks).rgc_vec[nscalars_per_iter - 1];
1380
1381 if (!rgm->type || rgm->factor > nvectors)
1382 {
1383 rgm->type = truth_type_for (vectype);
1384 rgm->compare_type = NULL_TREE;
1385 rgm->max_nscalars_per_iter = nscalars_per_iter;
1386 rgm->factor = nvectors;
1387 rgm->bias_adjusted_ctrl = NULL_TREE;
1388 }
1389 }
1390
1391 /* There is no fixed compare type we are going to use but we have to
1392 be able to get at one for each mask group. */
1393 unsigned int min_ni_width
1394 = wi::min_precision (vect_max_vf (loop_vinfo), UNSIGNED);
1395
1396 bool ok = true;
1397 for (auto &rgc : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
1398 {
1399 tree mask_type = rgc.type;
1400 if (!mask_type)
1401 continue;
1402
1403 if (TYPE_PRECISION (TREE_TYPE (mask_type)) != 1)
1404 {
1405 ok = false;
1406 break;
1407 }
1408
 1409       /* If iv_type is usable as the compare type, use that - we can
 1410	  elide the saturation in that case. */
1411 if (TYPE_PRECISION (iv_type) >= min_ni_width)
1412 {
1413 tree cmp_vectype
1414 = build_vector_type (iv_type, TYPE_VECTOR_SUBPARTS (mask_type));
1415 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1416 rgc.compare_type = cmp_vectype;
1417 }
1418 if (!rgc.compare_type)
1419 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1420 {
1421 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1422 if (cmp_bits >= min_ni_width
1423 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1424 {
1425 tree cmp_type = build_nonstandard_integer_type (cmp_bits, true);
1426 if (!cmp_type)
1427 continue;
1428
1429 /* Check whether we can produce the mask with cmp_type. */
1430 tree cmp_vectype
1431 = build_vector_type (cmp_type, TYPE_VECTOR_SUBPARTS (mask_type));
1432 if (expand_vec_cmp_expr_p (cmp_vectype, mask_type, LT_EXPR))
1433 {
1434 rgc.compare_type = cmp_vectype;
1435 break;
1436 }
1437 }
1438 }
1439 if (!rgc.compare_type)
1440 {
1441 ok = false;
1442 break;
1443 }
1444 }
1445 if (!ok)
1446 {
1447 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
1448 return false;
1449 }
1450
1451 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = error_mark_node;
1452 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1453 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_avx512;
1454 return true;
1455 }
1456
 1457 /* Check whether we can use vector access with length based on precision
 1458    comparison.  So far, to keep it simple, we only allow the case that the
 1459    precision of the target-supported length is larger than the precision
1460 required by loop niters. */
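/* For example (illustrative, ignoring any load/store length bias): with
   VF = 4 and niters = 10, a length-controlled loop executes three vector
   iterations with lengths 4, 4 and 2, the final length covering only the
   remaining scalar iterations.  */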
1461
1462 static bool
1463 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1464 {
1465 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1466 return false;
1467
1468 machine_mode len_load_mode = get_len_load_store_mode
1469 (loop_vinfo->vector_mode, true).require ();
1470 machine_mode len_store_mode = get_len_load_store_mode
1471 (loop_vinfo->vector_mode, false).require ();
1472
1473 signed char partial_load_bias = internal_len_load_store_bias
1474 (IFN_LEN_LOAD, len_load_mode);
1475
1476 signed char partial_store_bias = internal_len_load_store_bias
1477 (IFN_LEN_STORE, len_store_mode);
1478
1479 gcc_assert (partial_load_bias == partial_store_bias);
1480
1481 if (partial_load_bias == VECT_PARTIAL_BIAS_UNSUPPORTED)
1482 return false;
1483
1484 /* If the backend requires a bias of -1 for LEN_LOAD, we must not emit
1485 len_loads with a length of zero. In order to avoid that we prohibit
1486 more than one loop length here. */
1487 if (partial_load_bias == -1
1488 && LOOP_VINFO_LENS (loop_vinfo).length () > 1)
1489 return false;
1490
1491 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) = partial_load_bias;
1492
1493 unsigned int max_nitems_per_iter = 1;
1494 unsigned int i;
1495 rgroup_controls *rgl;
1496 /* Find the maximum number of items per iteration for every rgroup. */
1497 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1498 {
1499 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1500 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1501 }
1502
1503 /* Work out how many bits we need to represent the length limit. */
1504 unsigned int min_ni_prec
1505 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1506
 1507   /* Now use the maximum of the precisions below for one suitable IV type:
1508 - the IV's natural precision
1509 - the precision needed to hold: the maximum number of scalar
1510 iterations multiplied by the scale factor (min_ni_prec above)
1511 - the Pmode precision
1512
1513 If min_ni_prec is less than the precision of the current niters,
 1514      we prefer to still use the niters type.  Prefer to use Pmode and
 1515      a wider IV to avoid narrow conversions.  */
1516
1517 unsigned int ni_prec
1518 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1519 min_ni_prec = MAX (min_ni_prec, ni_prec);
1520 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1521
1522 tree iv_type = NULL_TREE;
1523 opt_scalar_int_mode tmode_iter;
1524 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1525 {
1526 scalar_mode tmode = tmode_iter.require ();
1527 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1528
1529 /* ??? Do we really want to construct one IV whose precision exceeds
1530 BITS_PER_WORD? */
1531 if (tbits > BITS_PER_WORD)
1532 break;
1533
1534 /* Find the first available standard integral type. */
1535 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1536 {
1537 iv_type = build_nonstandard_integer_type (tbits, true);
1538 break;
1539 }
1540 }
1541
1542 if (!iv_type)
1543 {
1544 if (dump_enabled_p ())
1545 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1546 "can't vectorize with length-based partial vectors"
1547 " because there is no suitable iv type.\n");
1548 return false;
1549 }
1550
1551 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1552 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1553 LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) = vect_partial_vectors_len;
1554
1555 return true;
1556 }
1557
1558 /* Calculate the cost of one scalar iteration of the loop. */
1559 static void
1560 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1561 {
1562 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1563 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1564 int nbbs = loop->num_nodes, factor;
1565 int innerloop_iters, i;
1566
1567 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1568
1569 /* Gather costs for statements in the scalar loop. */
1570
1571 /* FORNOW. */
1572 innerloop_iters = 1;
1573 if (loop->inner)
1574 innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1575
1576 for (i = 0; i < nbbs; i++)
1577 {
1578 gimple_stmt_iterator si;
1579 basic_block bb = bbs[i];
1580
1581 if (bb->loop_father == loop->inner)
1582 factor = innerloop_iters;
1583 else
1584 factor = 1;
1585
1586 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1587 {
1588 gimple *stmt = gsi_stmt (si);
1589 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1590
1591 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1592 continue;
1593
1594 /* Skip stmts that are not vectorized inside the loop. */
1595 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1596 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1597 && (!STMT_VINFO_LIVE_P (vstmt_info)
1598 || !VECTORIZABLE_CYCLE_DEF
1599 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1600 continue;
1601
1602 vect_cost_for_stmt kind;
1603 if (STMT_VINFO_DATA_REF (stmt_info))
1604 {
1605 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1606 kind = scalar_load;
1607 else
1608 kind = scalar_store;
1609 }
1610 else if (vect_nop_conversion_p (stmt_info))
1611 continue;
1612 else
1613 kind = scalar_stmt;
1614
1615 /* We are using vect_prologue here to avoid scaling twice
1616 by the inner loop factor. */
1617 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1618 factor, kind, stmt_info, 0, vect_prologue);
1619 }
1620 }
1621
1622 /* Now accumulate cost. */
1623 loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1624 add_stmt_costs (loop_vinfo->scalar_costs,
1625 &LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo));
1626 loop_vinfo->scalar_costs->finish_cost (nullptr);
1627 }
1628
1629
1630 /* Function vect_analyze_loop_form.
1631
1632 Verify that certain CFG restrictions hold, including:
1633 - the loop has a pre-header
1634 - the loop has a single entry and exit
1635 - the loop exit condition is simple enough
 1636    - the number of iterations can be analyzed, i.e., a countable loop.  The
1637 niter could be analyzed under some assumptions. */
1638
1639 opt_result
1640 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1641 {
1642 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1643
1644 /* Different restrictions apply when we are considering an inner-most loop,
1645 vs. an outer (nested) loop.
1646 (FORNOW. May want to relax some of these restrictions in the future). */
1647
1648 info->inner_loop_cond = NULL;
1649 if (!loop->inner)
1650 {
1651 /* Inner-most loop. We currently require that the number of BBs is
1652 exactly 2 (the header and latch). Vectorizable inner-most loops
1653 look like this:
1654
1655 (pre-header)
1656 |
1657 header <--------+
1658 | | |
1659 | +--> latch --+
1660 |
1661 (exit-bb) */
1662
1663 if (loop->num_nodes != 2)
1664 return opt_result::failure_at (vect_location,
1665 "not vectorized:"
1666 " control flow in loop.\n");
1667
1668 if (empty_block_p (loop->header))
1669 return opt_result::failure_at (vect_location,
1670 "not vectorized: empty loop.\n");
1671 }
1672 else
1673 {
1674 class loop *innerloop = loop->inner;
1675 edge entryedge;
1676
1677 /* Nested loop. We currently require that the loop is doubly-nested,
1678 contains a single inner loop, and the number of BBs is exactly 5.
1679 Vectorizable outer-loops look like this:
1680
1681 (pre-header)
1682 |
1683 header <---+
1684 | |
1685 inner-loop |
1686 | |
1687 tail ------+
1688 |
1689 (exit-bb)
1690
1691 The inner-loop has the properties expected of inner-most loops
1692 as described above. */
1693
1694 if ((loop->inner)->inner || (loop->inner)->next)
1695 return opt_result::failure_at (vect_location,
1696 "not vectorized:"
1697 " multiple nested loops.\n");
1698
1699 if (loop->num_nodes != 5)
1700 return opt_result::failure_at (vect_location,
1701 "not vectorized:"
1702 " control flow in loop.\n");
1703
1704 entryedge = loop_preheader_edge (innerloop);
1705 if (entryedge->src != loop->header
1706 || !single_exit (innerloop)
1707 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1708 return opt_result::failure_at (vect_location,
1709 "not vectorized:"
1710 " unsupported outerloop form.\n");
1711
1712 /* Analyze the inner-loop. */
1713 vect_loop_form_info inner;
1714 opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1715 if (!res)
1716 {
1717 if (dump_enabled_p ())
1718 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1719 "not vectorized: Bad inner loop.\n");
1720 return res;
1721 }
1722
1723 /* Don't support analyzing niter under assumptions for inner
1724 loop. */
1725 if (!integer_onep (inner.assumptions))
1726 return opt_result::failure_at (vect_location,
1727 "not vectorized: Bad inner loop.\n");
1728
1729 if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1730 return opt_result::failure_at (vect_location,
1731 "not vectorized: inner-loop count not"
1732 " invariant.\n");
1733
1734 if (dump_enabled_p ())
1735 dump_printf_loc (MSG_NOTE, vect_location,
1736 "Considering outer-loop vectorization.\n");
1737 info->inner_loop_cond = inner.loop_cond;
1738 }
1739
1740 if (!single_exit (loop))
1741 return opt_result::failure_at (vect_location,
1742 "not vectorized: multiple exits.\n");
1743 if (EDGE_COUNT (loop->header->preds) != 2)
1744 return opt_result::failure_at (vect_location,
1745 "not vectorized:"
1746 " too many incoming edges.\n");
1747
1748 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1749 that the loop is represented as a do-while (with a proper if-guard
1750 before the loop if needed), where the loop header contains all the
1751 executable statements, and the latch is empty. */
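/* Illustrative only: a source loop "for (i = 0; i < n; i++) body;" is
   expected to have been rewritten by earlier passes into the guarded
   do-while form

     if (n > 0)
       do { body; i++; } while (i < n);

   so that the exit test is the last statement of the header and the latch
   carries no code.  */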
1752 if (!empty_block_p (loop->latch)
1753 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1754 return opt_result::failure_at (vect_location,
1755 "not vectorized: latch block not empty.\n");
1756
1757 /* Make sure the exit is not abnormal. */
1758 edge e = single_exit (loop);
1759 if (e->flags & EDGE_ABNORMAL)
1760 return opt_result::failure_at (vect_location,
1761 "not vectorized:"
1762 " abnormal loop exit edge.\n");
1763
1764 info->loop_cond
1765 = vect_get_loop_niters (loop, &info->assumptions,
1766 &info->number_of_iterations,
1767 &info->number_of_iterationsm1);
1768 if (!info->loop_cond)
1769 return opt_result::failure_at
1770 (vect_location,
1771 "not vectorized: complicated exit condition.\n");
1772
1773 if (integer_zerop (info->assumptions)
1774 || !info->number_of_iterations
1775 || chrec_contains_undetermined (info->number_of_iterations))
1776 return opt_result::failure_at
1777 (info->loop_cond,
1778 "not vectorized: number of iterations cannot be computed.\n");
1779
1780 if (integer_zerop (info->number_of_iterations))
1781 return opt_result::failure_at
1782 (info->loop_cond,
1783 "not vectorized: number of iterations = 0.\n");
1784
1785 if (!(tree_fits_shwi_p (info->number_of_iterations)
1786 && tree_to_shwi (info->number_of_iterations) > 0))
1787 {
1788 if (dump_enabled_p ())
1789 {
1790 dump_printf_loc (MSG_NOTE, vect_location,
1791 "Symbolic number of iterations is ");
1792 dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1793 dump_printf (MSG_NOTE, "\n");
1794 }
1795 }
1796
1797 return opt_result::success ();
1798 }
1799
1800 /* Create a loop_vec_info for LOOP with SHARED and the
1801 vect_analyze_loop_form result. */
1802
1803 loop_vec_info
1804 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1805 const vect_loop_form_info *info,
1806 loop_vec_info main_loop_info)
1807 {
1808 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1809 LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1810 LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1811 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1812 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1813 /* Also record the assumptions for versioning. */
1814 if (!integer_onep (info->assumptions) && !main_loop_info)
1815 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1816
1817 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (info->loop_cond);
1818 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1819 if (info->inner_loop_cond)
1820 {
1821 stmt_vec_info inner_loop_cond_info
1822 = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1823 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1824 /* If we have an estimate of the number of iterations of the inner
1825 loop, use that to limit the scale for costing; otherwise use
1826 --param vect-inner-loop-cost-factor literally. */
1827 widest_int nit;
1828 if (estimated_stmt_executions (loop->inner, &nit))
1829 LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1830 = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
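/* Hypothetical numbers: if the statements of the inner loop are estimated
   to execute 8 times while --param vect-inner-loop-cost-factor is 50, the
   factor used for costing is clamped to 8.  */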
1831 }
1832
1833 return loop_vinfo;
1834 }
1835
1836
1837
1838 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1839 statements update the vectorization factor. */
1840
1841 static void
1842 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1843 {
1844 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1845 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1846 int nbbs = loop->num_nodes;
1847 poly_uint64 vectorization_factor;
1848 int i;
1849
1850 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1851
1852 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1853 gcc_assert (known_ne (vectorization_factor, 0U));
1854
1855 /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1856 vectorization factor of the loop is the unrolling factor required by
1857 the SLP instances. If that unrolling factor is 1, we say that we
1858 perform pure SLP on the loop - cross-iteration parallelism is not
1859 exploited. */
1860 bool only_slp_in_loop = true;
1861 for (i = 0; i < nbbs; i++)
1862 {
1863 basic_block bb = bbs[i];
1864 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1865 gsi_next (&si))
1866 {
1867 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1868 if (!stmt_info)
1869 continue;
1870 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1871 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1872 && !PURE_SLP_STMT (stmt_info))
1873 /* STMT needs both SLP and loop-based vectorization. */
1874 only_slp_in_loop = false;
1875 }
1876 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1877 gsi_next (&si))
1878 {
1879 if (is_gimple_debug (gsi_stmt (si)))
1880 continue;
1881 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1882 stmt_info = vect_stmt_to_vectorize (stmt_info);
1883 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1884 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1885 && !PURE_SLP_STMT (stmt_info))
1886 /* STMT needs both SLP and loop-based vectorization. */
1887 only_slp_in_loop = false;
1888 }
1889 }
1890
1891 if (only_slp_in_loop)
1892 {
1893 if (dump_enabled_p ())
1894 dump_printf_loc (MSG_NOTE, vect_location,
1895 "Loop contains only SLP stmts\n");
1896 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1897 }
1898 else
1899 {
1900 if (dump_enabled_p ())
1901 dump_printf_loc (MSG_NOTE, vect_location,
1902 "Loop contains SLP and non-SLP stmts\n");
1903 /* Both the vectorization factor and unroll factor have the form
1904 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1905 so they must have a common multiple. */
1906 vectorization_factor
1907 = force_common_multiple (vectorization_factor,
1908 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1909 }
1910
1911 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1912 if (dump_enabled_p ())
1913 {
1914 dump_printf_loc (MSG_NOTE, vect_location,
1915 "Updating vectorization factor to ");
1916 dump_dec (MSG_NOTE, vectorization_factor);
1917 dump_printf (MSG_NOTE, ".\n");
1918 }
1919 }
1920
1921 /* Return true if STMT_INFO describes a double reduction phi and if
1922 the other phi in the reduction is also relevant for vectorization.
1923 This rejects cases such as:
1924
1925 outer1:
1926 x_1 = PHI <x_3(outer2), ...>;
1927 ...
1928
1929 inner:
1930 x_2 = ...;
1931 ...
1932
1933 outer2:
1934 x_3 = PHI <x_2(inner)>;
1935
1936 if nothing in x_2 or elsewhere makes x_1 relevant. */
1937
1938 static bool
1939 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1940 {
1941 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1942 return false;
1943
1944 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1945 }
1946
1947 /* Function vect_analyze_loop_operations.
1948
1949 Scan the loop stmts and make sure they are all vectorizable. */
1950
1951 static opt_result
1952 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1953 {
1954 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1955 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1956 int nbbs = loop->num_nodes;
1957 int i;
1958 stmt_vec_info stmt_info;
1959 bool need_to_vectorize = false;
1960 bool ok;
1961
1962 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1963
1964 auto_vec<stmt_info_for_cost> cost_vec;
1965
1966 for (i = 0; i < nbbs; i++)
1967 {
1968 basic_block bb = bbs[i];
1969
1970 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1971 gsi_next (&si))
1972 {
1973 gphi *phi = si.phi ();
1974 ok = true;
1975
1976 stmt_info = loop_vinfo->lookup_stmt (phi);
1977 if (dump_enabled_p ())
1978 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G",
1979 (gimple *) phi);
1980 if (virtual_operand_p (gimple_phi_result (phi)))
1981 continue;
1982
1983 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1984 (i.e., a phi in the tail of the outer-loop). */
1985 if (! is_loop_header_bb_p (bb))
1986 {
1987 /* FORNOW: we currently don't support the case that these phis
1988 are not used in the outer loop (unless it is a double reduction,
1989 i.e., this phi is a vect_reduction_def), because that case
1990 would require us to actually do something here. */
1991 if (STMT_VINFO_LIVE_P (stmt_info)
1992 && !vect_active_double_reduction_p (stmt_info))
1993 return opt_result::failure_at (phi,
1994 "Unsupported loop-closed phi"
1995 " in outer-loop.\n");
1996
1997 /* If PHI is used in the outer loop, we check that its operand
1998 is defined in the inner loop. */
1999 if (STMT_VINFO_RELEVANT_P (stmt_info))
2000 {
2001 tree phi_op;
2002
2003 if (gimple_phi_num_args (phi) != 1)
2004 return opt_result::failure_at (phi, "unsupported phi");
2005
2006 phi_op = PHI_ARG_DEF (phi, 0);
2007 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
2008 if (!op_def_info)
2009 return opt_result::failure_at (phi, "unsupported phi\n");
2010
2011 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
2012 && (STMT_VINFO_RELEVANT (op_def_info)
2013 != vect_used_in_outer_by_reduction))
2014 return opt_result::failure_at (phi, "unsupported phi\n");
2015
2016 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
2017 || (STMT_VINFO_DEF_TYPE (stmt_info)
2018 == vect_double_reduction_def))
2019 && !vectorizable_lc_phi (loop_vinfo,
2020 stmt_info, NULL, NULL))
2021 return opt_result::failure_at (phi, "unsupported phi\n");
2022 }
2023
2024 continue;
2025 }
2026
2027 gcc_assert (stmt_info);
2028
2029 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
2030 || STMT_VINFO_LIVE_P (stmt_info))
2031 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def
2032 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
2033 /* A scalar-dependence cycle that we don't support. */
2034 return opt_result::failure_at (phi,
2035 "not vectorized:"
2036 " scalar dependence cycle.\n");
2037
2038 if (STMT_VINFO_RELEVANT_P (stmt_info))
2039 {
2040 need_to_vectorize = true;
2041 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
2042 && ! PURE_SLP_STMT (stmt_info))
2043 ok = vectorizable_induction (loop_vinfo,
2044 stmt_info, NULL, NULL,
2045 &cost_vec);
2046 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2047 || (STMT_VINFO_DEF_TYPE (stmt_info)
2048 == vect_double_reduction_def)
2049 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
2050 && ! PURE_SLP_STMT (stmt_info))
2051 ok = vectorizable_reduction (loop_vinfo,
2052 stmt_info, NULL, NULL, &cost_vec);
2053 else if ((STMT_VINFO_DEF_TYPE (stmt_info)
2054 == vect_first_order_recurrence)
2055 && ! PURE_SLP_STMT (stmt_info))
2056 ok = vectorizable_recurr (loop_vinfo, stmt_info, NULL, NULL,
2057 &cost_vec);
2058 }
2059
2060 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
2061 if (ok
2062 && STMT_VINFO_LIVE_P (stmt_info)
2063 && !PURE_SLP_STMT (stmt_info))
2064 ok = vectorizable_live_operation (loop_vinfo,
2065 stmt_info, NULL, NULL, NULL,
2066 -1, false, &cost_vec);
2067
2068 if (!ok)
2069 return opt_result::failure_at (phi,
2070 "not vectorized: relevant phi not "
2071 "supported: %G",
2072 static_cast <gimple *> (phi));
2073 }
2074
2075 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
2076 gsi_next (&si))
2077 {
2078 gimple *stmt = gsi_stmt (si);
2079 if (!gimple_clobber_p (stmt)
2080 && !is_gimple_debug (stmt))
2081 {
2082 opt_result res
2083 = vect_analyze_stmt (loop_vinfo,
2084 loop_vinfo->lookup_stmt (stmt),
2085 &need_to_vectorize,
2086 NULL, NULL, &cost_vec);
2087 if (!res)
2088 return res;
2089 }
2090 }
2091 } /* bbs */
2092
2093 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
2094
2095 /* All operations in the loop are either irrelevant (deal with loop
2096 control, or dead), or only used outside the loop and can be moved
2097 out of the loop (e.g. invariants, inductions). The loop can be
2098 optimized away by scalar optimizations. We're better off not
2099 touching this loop. */
2100 if (!need_to_vectorize)
2101 {
2102 if (dump_enabled_p ())
2103 dump_printf_loc (MSG_NOTE, vect_location,
2104 "All the computation can be taken out of the loop.\n");
2105 return opt_result::failure_at
2106 (vect_location,
2107 "not vectorized: redundant loop. no profit to vectorize.\n");
2108 }
2109
2110 return opt_result::success ();
2111 }
2112
2113 /* Return true if we know that the iteration count is smaller than the
2114 vectorization factor. Return false if it isn't, or if we can't be sure
2115 either way. */
2116
2117 static bool
2118 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
2119 {
2120 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2121
2122 HOST_WIDE_INT max_niter;
2123 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2124 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
2125 else
2126 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2127
2128 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
2129 return true;
2130
2131 return false;
2132 }
2133
2134 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
2135 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
2136 definitely no, or -1 if it's worth retrying. */
2137
2138 static int
2139 vect_analyze_loop_costing (loop_vec_info loop_vinfo,
2140 unsigned *suggested_unroll_factor)
2141 {
2142 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2143 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
2144
2145 /* Only loops that can handle partially-populated vectors can have iteration
2146 counts less than the vectorization factor. */
2147 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2148 && vect_known_niters_smaller_than_vf (loop_vinfo))
2149 {
2150 if (dump_enabled_p ())
2151 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2152 "not vectorized: iteration count smaller than "
2153 "vectorization factor.\n");
2154 return 0;
2155 }
2156
2157 /* If we know the number of iterations we can do better: for the
2158 epilogue we can also decide whether the main loop leaves us
2159 with enough iterations, preferring a smaller vector epilogue that is
2160 then also possibly used for the case in which we skip the vector loop. */
2161 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2162 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2163 {
2164 widest_int scalar_niters
2165 = wi::to_widest (LOOP_VINFO_NITERSM1 (loop_vinfo)) + 1;
2166 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2167 {
2168 loop_vec_info orig_loop_vinfo
2169 = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2170 unsigned lowest_vf
2171 = constant_lower_bound (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo));
2172 int prolog_peeling = 0;
2173 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2174 prolog_peeling = LOOP_VINFO_PEELING_FOR_ALIGNMENT (orig_loop_vinfo);
2175 if (prolog_peeling >= 0
2176 && known_eq (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
2177 lowest_vf))
2178 {
2179 unsigned gap
2180 = LOOP_VINFO_PEELING_FOR_GAPS (orig_loop_vinfo) ? 1 : 0;
2181 scalar_niters = ((scalar_niters - gap - prolog_peeling)
2182 % lowest_vf + gap);
2183 }
2184 }
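/* A worked example with hypothetical numbers: for 100 scalar iterations,
   a main-loop VF of 16, prologue peeling of 4 and no peeling for gaps,
   the epilogue is left with (100 - 4) % 16 = 0 iterations, so it is
   rejected by the check just below.  */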
2185
2186 /* Check that the loop processes at least one full vector. */
2187 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2188 if (known_lt (scalar_niters, vf))
2189 {
2190 if (dump_enabled_p ())
2191 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2192 "loop does not have enough iterations "
2193 "to support vectorization.\n");
2194 return 0;
2195 }
2196
2197 /* If we need to peel an extra epilogue iteration to handle data
2198 accesses with gaps, check that there are enough scalar iterations
2199 available.
2200
2201 The check above is redundant with this one when peeling for gaps,
2202 but the distinction is useful for diagnostics. */
2203 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2204 && known_le (scalar_niters, vf))
2205 {
2206 if (dump_enabled_p ())
2207 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2208 "loop does not have enough iterations "
2209 "to support peeling for gaps.\n");
2210 return 0;
2211 }
2212 }
2213
2214 /* If using the "very cheap" model, reject cases in which we'd keep
2215 a copy of the scalar code (even if we might be able to vectorize it). */
2216 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2217 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2218 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2219 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2220 {
2221 if (dump_enabled_p ())
2222 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2223 "some scalar iterations would need to be peeled\n");
2224 return 0;
2225 }
2226
2227 int min_profitable_iters, min_profitable_estimate;
2228 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2229 &min_profitable_estimate,
2230 suggested_unroll_factor);
2231
2232 if (min_profitable_iters < 0)
2233 {
2234 if (dump_enabled_p ())
2235 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2236 "not vectorized: vectorization not profitable.\n");
2237 if (dump_enabled_p ())
2238 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2239 "not vectorized: vector version will never be "
2240 "profitable.\n");
2241 return -1;
2242 }
2243
2244 int min_scalar_loop_bound = (param_min_vect_loop_bound
2245 * assumed_vf);
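/* Hypothetical example: with --param min-vect-loop-bound=2 and an assumed
   VF of 8, MIN_SCALAR_LOOP_BOUND is 16, so the threshold TH computed below
   is at least 16 and a loop with a known iteration count below TH is
   rejected as not profitable.  */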
2246
2247 /* Use the cost model only if it is more conservative than the
2248 user-specified threshold. */
2249 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2250 min_profitable_iters);
2251
2252 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2253
2254 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2255 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2256 {
2257 if (dump_enabled_p ())
2258 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2259 "not vectorized: vectorization not profitable.\n");
2260 if (dump_enabled_p ())
2261 dump_printf_loc (MSG_NOTE, vect_location,
2262 "not vectorized: iteration count smaller than user "
2263 "specified loop bound parameter or minimum profitable "
2264 "iterations (whichever is more conservative).\n");
2265 return 0;
2266 }
2267
2268 /* The static profitability threshold min_profitable_estimate includes
2269 the cost of having to check at runtime whether the scalar loop
2270 should be used instead. If it turns out that we don't need or want
2271 such a check, the threshold we should use for the static estimate
2272 is simply the point at which the vector loop becomes more profitable
2273 than the scalar loop. */
2274 if (min_profitable_estimate > min_profitable_iters
2275 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
2276 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2277 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2278 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
2279 {
2280 if (dump_enabled_p ())
2281 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
2282 " choice between the scalar and vector loops\n");
2283 min_profitable_estimate = min_profitable_iters;
2284 }
2285
2286 /* If the vector loop needs multiple iterations to be beneficial then
2287 things are probably too close to call, and the conservative thing
2288 would be to stick with the scalar code. */
2289 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
2290 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
2291 {
2292 if (dump_enabled_p ())
2293 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2294 "one iteration of the vector loop would be"
2295 " more expensive than the equivalent number of"
2296 " iterations of the scalar loop\n");
2297 return 0;
2298 }
2299
2300 HOST_WIDE_INT estimated_niter;
2301
2302 /* If we are vectorizing an epilogue then we know the maximum number of
2303 scalar iterations it will cover is at least one lower than the
2304 vectorization factor of the main loop. */
2305 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2306 estimated_niter
2307 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
2308 else
2309 {
2310 estimated_niter = estimated_stmt_executions_int (loop);
2311 if (estimated_niter == -1)
2312 estimated_niter = likely_max_stmt_executions_int (loop);
2313 }
2314 if (estimated_niter != -1
2315 && ((unsigned HOST_WIDE_INT) estimated_niter
2316 < MAX (th, (unsigned) min_profitable_estimate)))
2317 {
2318 if (dump_enabled_p ())
2319 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2320 "not vectorized: estimated iteration count too "
2321 "small.\n");
2322 if (dump_enabled_p ())
2323 dump_printf_loc (MSG_NOTE, vect_location,
2324 "not vectorized: estimated iteration count smaller "
2325 "than specified loop bound parameter or minimum "
2326 "profitable iterations (whichever is more "
2327 "conservative).\n");
2328 return -1;
2329 }
2330
2331 return 1;
2332 }
2333
2334 static opt_result
2335 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
2336 vec<data_reference_p> *datarefs,
2337 unsigned int *n_stmts)
2338 {
2339 *n_stmts = 0;
2340 for (unsigned i = 0; i < loop->num_nodes; i++)
2341 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2342 !gsi_end_p (gsi); gsi_next (&gsi))
2343 {
2344 gimple *stmt = gsi_stmt (gsi);
2345 if (is_gimple_debug (stmt))
2346 continue;
2347 ++(*n_stmts);
2348 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
2349 NULL, 0);
2350 if (!res)
2351 {
2352 if (is_gimple_call (stmt) && loop->safelen)
2353 {
2354 tree fndecl = gimple_call_fndecl (stmt), op;
2355 if (fndecl == NULL_TREE
2356 && gimple_call_internal_p (stmt, IFN_MASK_CALL))
2357 {
2358 fndecl = gimple_call_arg (stmt, 0);
2359 gcc_checking_assert (TREE_CODE (fndecl) == ADDR_EXPR);
2360 fndecl = TREE_OPERAND (fndecl, 0);
2361 gcc_checking_assert (TREE_CODE (fndecl) == FUNCTION_DECL);
2362 }
2363 if (fndecl != NULL_TREE)
2364 {
2365 cgraph_node *node = cgraph_node::get (fndecl);
2366 if (node != NULL && node->simd_clones != NULL)
2367 {
2368 unsigned int j, n = gimple_call_num_args (stmt);
2369 for (j = 0; j < n; j++)
2370 {
2371 op = gimple_call_arg (stmt, j);
2372 if (DECL_P (op)
2373 || (REFERENCE_CLASS_P (op)
2374 && get_base_address (op)))
2375 break;
2376 }
2377 op = gimple_call_lhs (stmt);
2378 /* Ignore #pragma omp declare simd functions
2379 if they don't have data references in the
2380 call stmt itself. */
2381 if (j == n
2382 && !(op
2383 && (DECL_P (op)
2384 || (REFERENCE_CLASS_P (op)
2385 && get_base_address (op)))))
2386 continue;
2387 }
2388 }
2389 }
2390 return res;
2391 }
2392 /* If dependence analysis will give up due to the limit on the
2393 number of datarefs, stop here and fail fatally. */
2394 if (datarefs->length ()
2395 > (unsigned)param_loop_max_datarefs_for_datadeps)
2396 return opt_result::failure_at (stmt, "exceeded param "
2397 "loop-max-datarefs-for-datadeps\n");
2398 }
2399 return opt_result::success ();
2400 }
2401
2402 /* Look for SLP-only access groups and turn each individual access into its own
2403 group. */
2404 static void
2405 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2406 {
2407 unsigned int i;
2408 struct data_reference *dr;
2409
2410 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2411
2412 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2413 FOR_EACH_VEC_ELT (datarefs, i, dr)
2414 {
2415 gcc_assert (DR_REF (dr));
2416 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2417
2418 /* Check if the load is a part of an interleaving chain. */
2419 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2420 {
2421 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2422 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2423 unsigned int group_size = DR_GROUP_SIZE (first_element);
2424
2425 /* Check if this is an SLP-only group. */
2426 if (!STMT_SLP_TYPE (stmt_info)
2427 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2428 {
2429 /* Dissolve the group. */
2430 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2431
2432 stmt_vec_info vinfo = first_element;
2433 while (vinfo)
2434 {
2435 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2436 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2437 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2438 DR_GROUP_SIZE (vinfo) = 1;
2439 if (STMT_VINFO_STRIDED_P (first_element))
2440 DR_GROUP_GAP (vinfo) = 0;
2441 else
2442 DR_GROUP_GAP (vinfo) = group_size - 1;
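/* For instance, when a group of 4 accesses a[4*i] .. a[4*i+3] is
   dissolved, each new single-element group keeps a gap of 3 so that the
   distance to the next access of the same dataref is still described
   correctly.  */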
2443 /* Duplicate and adjust alignment info, it needs to
2444 be present on each group leader, see dr_misalignment. */
2445 if (vinfo != first_element)
2446 {
2447 dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2448 dr_info2->target_alignment = dr_info->target_alignment;
2449 int misalignment = dr_info->misalignment;
2450 if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2451 {
2452 HOST_WIDE_INT diff
2453 = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2454 - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2455 unsigned HOST_WIDE_INT align_c
2456 = dr_info->target_alignment.to_constant ();
2457 misalignment = (misalignment + diff) % align_c;
2458 }
2459 dr_info2->misalignment = misalignment;
2460 }
2461 vinfo = next;
2462 }
2463 }
2464 }
2465 }
2466 }
2467
2468 /* Determine if operating on full vectors for LOOP_VINFO might leave
2469 some scalar iterations still to do. If so, decide how we should
2470 handle those scalar iterations. The possibilities are:
2471
2472 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2473 In this case:
2474
2475 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2476 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2477 LOOP_VINFO_PEELING_FOR_NITER == false
2478
2479 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2480 to handle the remaining scalar iterations. In this case:
2481
2482 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2483 LOOP_VINFO_PEELING_FOR_NITER == true
2484
2485 There are two choices:
2486
2487 (2a) Consider vectorizing the epilogue loop at the same VF as the
2488 main loop, but using partial vectors instead of full vectors.
2489 In this case:
2490
2491 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2492
2493 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2494 In this case:
2495
2496 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2497 */
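/* An illustrative example, not tied to any particular target: for a loop
   over N int elements with VF 4, a target with predicated (masked) vector
   operations can take route (1) and execute CEIL (N / 4) partially-masked
   vector iterations with no scalar leftovers, whereas a target without
   predication takes route (2) and handles the N % 4 remaining iterations in
   an epilogue loop, which may itself be vectorized either at the same VF
   with partial vectors (2a) or at a lower VF (2b).  */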
2498
2499 opt_result
2500 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo)
2501 {
2502 /* Determine whether there would be any scalar iterations left over. */
2503 bool need_peeling_or_partial_vectors_p
2504 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2505
2506 /* Decide whether to vectorize the loop with partial vectors. */
2507 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2508 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2509 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2510 && need_peeling_or_partial_vectors_p)
2511 {
2512 /* For partial-vector-usage=1, try to push the handling of partial
2513 vectors to the epilogue, with the main loop continuing to operate
2514 on full vectors.
2515
2516 If we are unrolling we also do not want to use partial vectors. This
2517 is to avoid the overhead of generating multiple masks and also to
2518 avoid having to execute entire iterations of FALSE-masked instructions
2519 when dealing with one or fewer full iterations.
2520
2521 ??? We could then end up failing to use partial vectors if we
2522 decide to peel iterations into a prologue, and if the main loop
2523 then ends up processing fewer than VF iterations. */
2524 if ((param_vect_partial_vector_usage == 1
2525 || loop_vinfo->suggested_unroll_factor > 1)
2526 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2527 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2528 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2529 else
2530 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2531 }
2532
2533 if (dump_enabled_p ())
2534 dump_printf_loc (MSG_NOTE, vect_location,
2535 "operating on %s vectors%s.\n",
2536 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2537 ? "partial" : "full",
2538 LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2539 ? " for epilogue loop" : "");
2540
2541 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2542 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2543 && need_peeling_or_partial_vectors_p);
2544
2545 return opt_result::success ();
2546 }
2547
2548 /* Function vect_analyze_loop_2.
2549
2550 Apply a set of analyses on LOOP specified by LOOP_VINFO; the different
2551 analyses record information in some members of LOOP_VINFO. FATAL
2552 indicates whether some analysis hits a fatal error. If the non-NULL
2553 pointer SUGGESTED_UNROLL_FACTOR is provided, it is intended to be filled
2554 with the worked-out suggested unroll factor, while a NULL pointer means
2555 we are going to apply the suggested unroll factor. SLP_DONE_FOR_SUGGESTED_UF
2556 holds the SLP decision made when the suggested unroll factor was worked
2557 out. */
2558 static opt_result
2559 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal,
2560 unsigned *suggested_unroll_factor,
2561 bool& slp_done_for_suggested_uf)
2562 {
2563 opt_result ok = opt_result::success ();
2564 int res;
2565 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2566 poly_uint64 min_vf = 2;
2567 loop_vec_info orig_loop_vinfo = NULL;
2568
2569 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2570 loop_vec_info of the first vectorized loop. */
2571 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2572 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2573 else
2574 orig_loop_vinfo = loop_vinfo;
2575 gcc_assert (orig_loop_vinfo);
2576
2577 /* The first group of checks is independent of the vector size. */
2578 fatal = true;
2579
2580 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2581 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2582 return opt_result::failure_at (vect_location,
2583 "not vectorized: simd if(0)\n");
2584
2585 /* Find all data references in the loop (which correspond to vdefs/vuses)
2586 and analyze their evolution in the loop. */
2587
2588 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2589
2590 /* Gather the data references and count stmts in the loop. */
2591 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2592 {
2593 opt_result res
2594 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2595 &LOOP_VINFO_DATAREFS (loop_vinfo),
2596 &LOOP_VINFO_N_STMTS (loop_vinfo));
2597 if (!res)
2598 {
2599 if (dump_enabled_p ())
2600 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2601 "not vectorized: loop contains function "
2602 "calls or data references that cannot "
2603 "be analyzed\n");
2604 return res;
2605 }
2606 loop_vinfo->shared->save_datarefs ();
2607 }
2608 else
2609 loop_vinfo->shared->check_datarefs ();
2610
2611 /* Analyze the data references and also adjust the minimal
2612 vectorization factor according to the loads and stores. */
2613
2614 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2615 if (!ok)
2616 {
2617 if (dump_enabled_p ())
2618 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2619 "bad data references.\n");
2620 return ok;
2621 }
2622
2623 /* Check if we are applying unroll factor now. */
2624 bool applying_suggested_uf = loop_vinfo->suggested_unroll_factor > 1;
2625 gcc_assert (!applying_suggested_uf || !suggested_unroll_factor);
2626
2627 /* If the SLP decision was false when the suggested unroll factor was
2628 worked out, and we are now applying that suggested unroll factor, we
2629 can simply skip all SLP-related analyses this time. */
2630 bool slp = !applying_suggested_uf || slp_done_for_suggested_uf;
2631
2632 /* Classify all cross-iteration scalar data-flow cycles.
2633 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2634 vect_analyze_scalar_cycles (loop_vinfo, slp);
2635
2636 vect_pattern_recog (loop_vinfo);
2637
2638 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2639
2640 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2641 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2642
2643 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2644 if (!ok)
2645 {
2646 if (dump_enabled_p ())
2647 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2648 "bad data access.\n");
2649 return ok;
2650 }
2651
2652 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2653
2654 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2655 if (!ok)
2656 {
2657 if (dump_enabled_p ())
2658 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2659 "unexpected pattern.\n");
2660 return ok;
2661 }
2662
2663 /* The rest of the analysis below depends on the vector size in some way, so failures from here on are no longer fatal. */
2664 fatal = false;
2665
2666 /* Analyze data dependences between the data-refs in the loop
2667 and adjust the maximum vectorization factor according to
2668 the dependences.
2669 FORNOW: fail at the first data dependence that we encounter. */
2670
2671 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2672 if (!ok)
2673 {
2674 if (dump_enabled_p ())
2675 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2676 "bad data dependence.\n");
2677 return ok;
2678 }
2679 if (max_vf != MAX_VECTORIZATION_FACTOR
2680 && maybe_lt (max_vf, min_vf))
2681 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2682 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2683
2684 ok = vect_determine_vectorization_factor (loop_vinfo);
2685 if (!ok)
2686 {
2687 if (dump_enabled_p ())
2688 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2689 "can't determine vectorization factor.\n");
2690 return ok;
2691 }
2692 if (max_vf != MAX_VECTORIZATION_FACTOR
2693 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2694 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2695
2696 /* Compute the scalar iteration cost. */
2697 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2698
2699 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2700
2701 if (slp)
2702 {
2703 /* Check the SLP opportunities in the loop, analyze and build
2704 SLP trees. */
2705 ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2706 if (!ok)
2707 return ok;
2708
2709 /* If there are any SLP instances mark them as pure_slp. */
2710 slp = vect_make_slp_decision (loop_vinfo);
2711 if (slp)
2712 {
2713 /* Find stmts that need to be both vectorized and SLPed. */
2714 vect_detect_hybrid_slp (loop_vinfo);
2715
2716 /* Update the vectorization factor based on the SLP decision. */
2717 vect_update_vf_for_slp (loop_vinfo);
2718
2719 /* Optimize the SLP graph with the vectorization factor fixed. */
2720 vect_optimize_slp (loop_vinfo);
2721
2722 /* Gather the loads reachable from the SLP graph entries. */
2723 vect_gather_slp_loads (loop_vinfo);
2724 }
2725 }
2726
2727 bool saved_can_use_partial_vectors_p
2728 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2729
2730 /* We don't expect to have to roll back to anything other than an empty
2731 set of rgroups. */
2732 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2733
2734 /* This is the point where we can re-start analysis with SLP forced off. */
2735 start_over:
2736
2737 /* Apply the suggested unrolling factor; this was determined by the backend
2738 during finish_cost the first time we ran the analysis for this
2739 vector mode. */
2740 if (applying_suggested_uf)
2741 LOOP_VINFO_VECT_FACTOR (loop_vinfo) *= loop_vinfo->suggested_unroll_factor;
2742
2743 /* Now the vectorization factor is final. */
2744 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2745 gcc_assert (known_ne (vectorization_factor, 0U));
2746
2747 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2748 {
2749 dump_printf_loc (MSG_NOTE, vect_location,
2750 "vectorization_factor = ");
2751 dump_dec (MSG_NOTE, vectorization_factor);
2752 dump_printf (MSG_NOTE, ", niters = %wd\n",
2753 LOOP_VINFO_INT_NITERS (loop_vinfo));
2754 }
2755
2756 loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2757
2758 /* Analyze the alignment of the data-refs in the loop.
2759 Fail if a data reference is found that cannot be vectorized. */
2760
2761 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2762 if (!ok)
2763 {
2764 if (dump_enabled_p ())
2765 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2766 "bad data alignment.\n");
2767 return ok;
2768 }
2769
2770 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2771 It is important to call pruning after vect_analyze_data_ref_accesses,
2772 since we use grouping information gathered by interleaving analysis. */
2773 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2774 if (!ok)
2775 return ok;
2776
2777 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2778 vectorization, since we do not want to add extra peeling or
2779 add versioning for alignment. */
2780 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2781 /* This pass will decide on using loop versioning and/or loop peeling in
2782 order to enhance the alignment of data references in the loop. */
2783 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2784 if (!ok)
2785 return ok;
2786
2787 if (slp)
2788 {
2789 /* Analyze operations in the SLP instances. Note this may
2790 remove unsupported SLP instances which makes the above
2791 SLP kind detection invalid. */
2792 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2793 vect_slp_analyze_operations (loop_vinfo);
2794 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2795 {
2796 ok = opt_result::failure_at (vect_location,
2797 "unsupported SLP instances\n");
2798 goto again;
2799 }
2800
2801 /* Check whether any load in ALL SLP instances is possibly permuted. */
2802 slp_tree load_node, slp_root;
2803 unsigned i, x;
2804 slp_instance instance;
2805 bool can_use_lanes = true;
2806 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2807 {
2808 slp_root = SLP_INSTANCE_TREE (instance);
2809 int group_size = SLP_TREE_LANES (slp_root);
2810 tree vectype = SLP_TREE_VECTYPE (slp_root);
2811 bool loads_permuted = false;
2812 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2813 {
2814 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2815 continue;
2816 unsigned j;
2817 stmt_vec_info load_info;
2818 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2819 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2820 {
2821 loads_permuted = true;
2822 break;
2823 }
2824 }
2825
2826 /* If the loads and stores can be handled with load/store-lane
2827 instructions record it and move on to the next instance. */
2828 if (loads_permuted
2829 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2830 && vect_store_lanes_supported (vectype, group_size, false))
2831 {
2832 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2833 {
2834 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2835 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2836 /* Use SLP for strided accesses (or if we can't use
2837 load-lanes). */
2838 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2839 || ! vect_load_lanes_supported
2840 (STMT_VINFO_VECTYPE (stmt_vinfo),
2841 DR_GROUP_SIZE (stmt_vinfo), false))
2842 break;
2843 }
2844
2845 can_use_lanes
2846 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2847
2848 if (can_use_lanes && dump_enabled_p ())
2849 dump_printf_loc (MSG_NOTE, vect_location,
2850 "SLP instance %p can use load/store-lanes\n",
2851 (void *) instance);
2852 }
2853 else
2854 {
2855 can_use_lanes = false;
2856 break;
2857 }
2858 }
2859
2860 /* If all SLP instances can use load/store-lanes, abort SLP and try again
2861 with SLP disabled. */
2862 if (can_use_lanes)
2863 {
2864 ok = opt_result::failure_at (vect_location,
2865 "Built SLP cancelled: can use "
2866 "load/store-lanes\n");
2867 if (dump_enabled_p ())
2868 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2869 "Built SLP cancelled: all SLP instances support "
2870 "load/store-lanes\n");
2871 goto again;
2872 }
2873 }
2874
2875 /* Dissolve SLP-only groups. */
2876 vect_dissolve_slp_only_groups (loop_vinfo);
2877
2878 /* Scan all the remaining operations in the loop that are not subject
2879 to SLP and make sure they are vectorizable. */
2880 ok = vect_analyze_loop_operations (loop_vinfo);
2881 if (!ok)
2882 {
2883 if (dump_enabled_p ())
2884 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2885 "bad operation or unsupported loop bound.\n");
2886 return ok;
2887 }
2888
2889 /* For now, we don't expect to mix both the masking and the length approach
2890 for one loop, so disable the use of partial vectors if both are recorded. */
2891 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2892 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2893 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2894 {
2895 if (dump_enabled_p ())
2896 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2897 "can't vectorize a loop with partial vectors"
2898 " because we don't expect to mix different"
2899 " approaches with partial vectors for the"
2900 " same loop.\n");
2901 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2902 }
2903
2904 /* If we still have the option of using partial vectors,
2905 check whether we can generate the necessary loop controls. */
2906 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
2907 {
2908 if (!LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
2909 {
2910 if (!vect_verify_full_masking (loop_vinfo)
2911 && !vect_verify_full_masking_avx512 (loop_vinfo))
2912 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2913 }
2914 else /* !LOOP_VINFO_LENS (loop_vinfo).is_empty () */
2915 if (!vect_verify_loop_lens (loop_vinfo))
2916 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2917 }
2918
2919 /* If we're vectorizing a loop that uses length "controls" and
2920 can iterate more than once, we apply the decrementing IV approach
2921 to the loop control. */
2922 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2923 && LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo) == vect_partial_vectors_len
2924 && LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) == 0
2925 && !(LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2926 && known_le (LOOP_VINFO_INT_NITERS (loop_vinfo),
2927 LOOP_VINFO_VECT_FACTOR (loop_vinfo))))
2928 LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo) = true;
2929
2930 /* If a loop uses length controls and has a decrementing loop control IV,
2931 we will normally pass that IV through a MIN_EXPR to calculate the
2932 basis for the length controls. E.g. in a loop that processes one
2933 element per scalar iteration, the number of elements would be
2934 MIN_EXPR <N, VF>, where N is the number of scalar iterations left.
2935
2936 This MIN_EXPR approach allows us to use pointer IVs with an invariant
2937 step, since only the final iteration of the vector loop can have
2938 inactive lanes.
2939
2940 However, some targets have a dedicated instruction for calculating the
2941 preferred length, given the total number of elements that still need to
2942 be processed. This is encapsulated in the SELECT_VL internal function.
2943
2944 If the target supports SELECT_VL, we can use it instead of MIN_EXPR
2945 to determine the basis for the length controls. However, unlike the
2946 MIN_EXPR calculation, the SELECT_VL calculation can decide to make
2947 lanes inactive in any iteration of the vector loop, not just the last
2948 iteration. This SELECT_VL approach therefore requires us to use pointer
2949 IVs with variable steps.
2950
2951 Once we've decided how many elements should be processed by one
2952 iteration of the vector loop, we need to populate the rgroup controls.
2953 If a loop has multiple rgroups, we need to make sure that those rgroups
2954 "line up" (that is, they must be consistent about which elements are
2955 active and which aren't). This is done by vect_adjust_loop_lens_control.
2956
2957 In principle, it would be possible to use vect_adjust_loop_lens_control
2958 on either the result of a MIN_EXPR or the result of a SELECT_VL.
2959 However:
2960
2961 (1) In practice, it only makes sense to use SELECT_VL when a vector
2962 operation will be controlled directly by the result. It is not
2963 worth using SELECT_VL if it would only be the input to other
2964 calculations.
2965
2966 (2) If we use SELECT_VL for an rgroup that has N controls, each associated
2967 pointer IV will need N updates by a variable amount (N-1 updates
2968 within the iteration and 1 update to move to the next iteration).
2969
2970 Because of this, we prefer to use the MIN_EXPR approach whenever there
2971 is more than one length control.
2972
2973 In addition, SELECT_VL always operates to a granularity of 1 unit.
2974 If we wanted to use it to control an SLP operation on N consecutive
2975 elements, we would need to make the SELECT_VL inputs measure scalar
2976 iterations (rather than elements) and then multiply the SELECT_VL
2977 result by N. But using SELECT_VL this way is inefficient because
2978 of (1) above.
2979
2980 Furthermore, we don't apply SELECT_VL to a single-rgroup loop when both
2981 (1) and (2) are satisfied:
2982
2983 (1). LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) is true.
2984 (2). LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant () is true.
2985
2986 This is because SELECT_VL (with its variable step) makes SCEV analysis
2987 fail, and we would then lose the benefit of subsequent unroll
2988 optimizations. We prefer using the MIN_EXPR approach in this situation. */
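/* A rough sketch of the two styles for a single rgroup; the names REMAIN,
   PTR and SIZE are invented for illustration.  With MIN_EXPR:

     len = MIN_EXPR <remain, VF>;
     ... len-controlled vector operation on *ptr ...
     ptr = ptr + VF * SIZE;
     remain = remain - len;

   and with SELECT_VL:

     len = SELECT_VL (remain, VF);
     ... len-controlled vector operation on *ptr ...
     ptr = ptr + len * SIZE;
     remain = remain - len;

   With MIN_EXPR only the last vector iteration can have inactive lanes, so
   pointer IVs can use the invariant step VF * SIZE; with SELECT_VL any
   iteration may process fewer than VF elements, so pointer IVs have to be
   advanced by the variable LEN.  */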
2989 if (LOOP_VINFO_USING_DECREMENTING_IV_P (loop_vinfo))
2990 {
2991 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
2992 if (direct_internal_fn_supported_p (IFN_SELECT_VL, iv_type,
2993 OPTIMIZE_FOR_SPEED)
2994 && LOOP_VINFO_LENS (loop_vinfo).length () == 1
2995 && LOOP_VINFO_LENS (loop_vinfo)[0].factor == 1 && !slp
2996 && (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2997 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant ()))
2998 LOOP_VINFO_USING_SELECT_VL_P (loop_vinfo) = true;
2999 }
3000
3001 /* Decide whether this loop_vinfo should use partial vectors or peeling,
3002 assuming that the loop will be used as a main loop. We will redo
3003 this analysis later if we instead decide to use the loop as an
3004 epilogue loop. */
3005 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo);
3006 if (!ok)
3007 return ok;
3008
3009 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
3010 to be able to handle fewer than VF scalars, or needs to have a lower VF
3011 than the main loop. */
3012 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
3013 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3014 {
3015 poly_uint64 unscaled_vf
3016 = exact_div (LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo),
3017 orig_loop_vinfo->suggested_unroll_factor);
3018 if (maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo), unscaled_vf))
3019 return opt_result::failure_at (vect_location,
3020 "Vectorization factor too high for"
3021 " epilogue loop.\n");
3022 }
3023
3024 /* Check the costings of the loop make vectorizing worthwhile. */
3025 res = vect_analyze_loop_costing (loop_vinfo, suggested_unroll_factor);
3026 if (res < 0)
3027 {
3028 ok = opt_result::failure_at (vect_location,
3029 "Loop costings may not be worthwhile.\n");
3030 goto again;
3031 }
3032 if (!res)
3033 return opt_result::failure_at (vect_location,
3034 "Loop costings not worthwhile.\n");
3035
3036 /* If an epilogue loop is required, make sure we can create one. */
3037 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
3038 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
3039 {
3040 if (dump_enabled_p ())
3041 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
3042 if (!vect_can_advance_ivs_p (loop_vinfo)
3043 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
3044 single_exit (LOOP_VINFO_LOOP
3045 (loop_vinfo))))
3046 {
3047 ok = opt_result::failure_at (vect_location,
3048 "not vectorized: can't create required "
3049 "epilog loop\n");
3050 goto again;
3051 }
3052 }
3053
3054 /* During peeling, we need to check if the number of loop iterations is
3055 enough for both the peeled prolog loop and the vector loop. This check
3056 can be merged along with threshold check of loop versioning, so
3057 increase threshold for this case if necessary.
3058
3059 If we are analyzing an epilogue we still want to check what its
3060 versioning threshold would be. If we decide to vectorize the epilogues we
3061 will want to use the lowest versioning threshold of all epilogues and main
3062 loop. This will enable us to enter a vectorized epilogue even when
3063 versioning the loop. We can't simply check whether the epilogue requires
3064 versioning though since we may have skipped some versioning checks when
3065 analyzing the epilogue. For instance, checks for alias versioning will be
3066 skipped when dealing with epilogues as we assume we already checked them
3067 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
3068 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
3069 {
3070 poly_uint64 niters_th = 0;
3071 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
3072
3073 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
3074 {
3075 /* Niters for peeled prolog loop. */
3076 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3077 {
3078 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
3079 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
3080 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
3081 }
3082 else
3083 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3084 }
3085
3086 /* Niters for at least one iteration of vectorized loop. */
3087 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3088 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3089 /* One additional iteration because of peeling for gap. */
3090 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3091 niters_th += 1;
3092
3093 /* Use the same condition as vect_transform_loop to decide when to use
3094 the cost to determine a versioning threshold. */
3095 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
3096 && ordered_p (th, niters_th))
3097 niters_th = ordered_max (poly_uint64 (th), niters_th);
3098
3099 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
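/* Hypothetical numbers: with prologue peeling of 3, VF 8, no peeling for
   gaps, full (not partial) vectors and a cost-model threshold of 20,
   NITERS_TH starts as 3 + 8 = 11 and is then raised to MAX (20, 11) = 20
   when the runtime profitability check is applied.  */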
3100 }
3101
3102 gcc_assert (known_eq (vectorization_factor,
3103 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
3104
3105 slp_done_for_suggested_uf = slp;
3106
3107 /* Ok to vectorize! */
3108 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
3109 return opt_result::success ();
3110
3111 again:
3112 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
3113 gcc_assert (!ok);
3114
3115 /* Try again with SLP forced off but if we didn't do any SLP there is
3116 no point in re-trying. */
3117 if (!slp)
3118 return ok;
3119
3120 /* If the SLP decision was true when the suggested unroll factor was
3121 worked out, and we are now applying that suggested unroll factor, we
3122 don't need to re-try any more. */
3123 if (applying_suggested_uf && slp_done_for_suggested_uf)
3124 return ok;
3125
3126 /* If there are reduction chains re-trying will fail anyway. */
3127 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
3128 return ok;
3129
3130 /* Likewise if the grouped loads or stores in the SLP cannot be handled
3131 via interleaving or lane instructions. */
3132 slp_instance instance;
3133 slp_tree node;
3134 unsigned i, j;
3135 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
3136 {
3137 stmt_vec_info vinfo;
3138 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
3139 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
3140 continue;
3141 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3142 unsigned int size = DR_GROUP_SIZE (vinfo);
3143 tree vectype = STMT_VINFO_VECTYPE (vinfo);
3144 if (! vect_store_lanes_supported (vectype, size, false)
3145 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
3146 && ! vect_grouped_store_supported (vectype, size))
3147 return opt_result::failure_at (vinfo->stmt,
3148 "unsupported grouped store\n");
3149 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
3150 {
3151 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
3152 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
3153 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
3154 size = DR_GROUP_SIZE (vinfo);
3155 vectype = STMT_VINFO_VECTYPE (vinfo);
3156 if (! vect_load_lanes_supported (vectype, size, false)
3157 && ! vect_grouped_load_supported (vectype, single_element_p,
3158 size))
3159 return opt_result::failure_at (vinfo->stmt,
3160 "unsupported grouped load\n");
3161 }
3162 }
3163
3164 if (dump_enabled_p ())
3165 dump_printf_loc (MSG_NOTE, vect_location,
3166 "re-trying with SLP disabled\n");
3167
3168 /* Roll back state appropriately. No SLP this time. */
3169 slp = false;
3170 /* Restore the vectorization factor as it was without SLP. */
3171 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
3172 /* Free the SLP instances. */
3173 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
3174 vect_free_slp_instance (instance);
3175 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
3176 /* Reset SLP type to loop_vect on all stmts. */
3177 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
3178 {
3179 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
3180 for (gimple_stmt_iterator si = gsi_start_phis (bb);
3181 !gsi_end_p (si); gsi_next (&si))
3182 {
3183 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3184 STMT_SLP_TYPE (stmt_info) = loop_vect;
3185 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
3186 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
3187 {
3188 /* vectorizable_reduction adjusts reduction stmt def-types;
3189 restore them to that of the PHI. */
3190 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
3191 = STMT_VINFO_DEF_TYPE (stmt_info);
3192 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
3193 (STMT_VINFO_REDUC_DEF (stmt_info)))
3194 = STMT_VINFO_DEF_TYPE (stmt_info);
3195 }
3196 }
3197 for (gimple_stmt_iterator si = gsi_start_bb (bb);
3198 !gsi_end_p (si); gsi_next (&si))
3199 {
3200 if (is_gimple_debug (gsi_stmt (si)))
3201 continue;
3202 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
3203 STMT_SLP_TYPE (stmt_info) = loop_vect;
3204 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
3205 {
3206 stmt_vec_info pattern_stmt_info
3207 = STMT_VINFO_RELATED_STMT (stmt_info);
3208 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
3209 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
3210
3211 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
3212 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
3213 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
3214 !gsi_end_p (pi); gsi_next (&pi))
3215 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
3216 = loop_vect;
3217 }
3218 }
3219 }
3220 /* Free optimized alias test DDRS. */
3221 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
3222 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
3223 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
3224 /* Reset target cost data. */
3225 delete loop_vinfo->vector_costs;
3226 loop_vinfo->vector_costs = nullptr;
3227 /* Reset accumulated rgroup information. */
3228 LOOP_VINFO_MASKS (loop_vinfo).mask_set.empty ();
3229 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo).rgc_vec);
3230 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
3231 /* Reset assorted flags. */
3232 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
3233 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
3234 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
3235 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
3236 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
3237 = saved_can_use_partial_vectors_p;
3238
3239 goto start_over;
3240 }
3241
3242 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
3243 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
3244 OLD_LOOP_VINFO is better unless something specifically indicates
3245 otherwise.
3246
3247 Note that this deliberately isn't a partial order. */
3248
3249 static bool
3250 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
3251 loop_vec_info old_loop_vinfo)
3252 {
3253 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
3254 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
3255
3256 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
3257 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
3258
3259 /* Always prefer a VF of loop->simdlen over any other VF. */
3260 if (loop->simdlen)
3261 {
3262 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
3263 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
3264 if (new_simdlen_p != old_simdlen_p)
3265 return new_simdlen_p;
3266 }
3267
3268 const auto *old_costs = old_loop_vinfo->vector_costs;
3269 const auto *new_costs = new_loop_vinfo->vector_costs;
3270 if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
3271 return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
3272
3273 return new_costs->better_main_loop_than_p (old_costs);
3274 }
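
/* A minimal illustration of the simdlen preference above, assuming a
   hypothetical loop annotated with an OpenMP simdlen clause:

     #pragma omp simd simdlen(8)
     for (int i = 0; i < n; i++)
       a[i] = b[i] + c[i];

   Here loop->simdlen is 8, so a candidate loop_vinfo whose VF is known to
   equal 8 is preferred over any other candidate before the cost comparison
   is consulted at all.  */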
3275
3276 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
3277 true if we should. */
3278
3279 static bool
3280 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
3281 loop_vec_info old_loop_vinfo)
3282 {
3283 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
3284 return false;
3285
3286 if (dump_enabled_p ())
3287 dump_printf_loc (MSG_NOTE, vect_location,
3288 "***** Preferring vector mode %s to vector mode %s\n",
3289 GET_MODE_NAME (new_loop_vinfo->vector_mode),
3290 GET_MODE_NAME (old_loop_vinfo->vector_mode));
3291 return true;
3292 }
3293
3294 /* Analyze LOOP with VECTOR_MODES[MODE_I], and as an epilogue loop if
3295 MAIN_LOOP_VINFO is not NULL. Set AUTODETECTED_VECTOR_MODE if the current mode is VOIDmode and advance
3296 MODE_I to the next mode useful to analyze.
3297 Return the loop_vinfo on success and wrapped null on failure. */
3298
3299 static opt_loop_vec_info
3300 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
3301 const vect_loop_form_info *loop_form_info,
3302 loop_vec_info main_loop_vinfo,
3303 const vector_modes &vector_modes, unsigned &mode_i,
3304 machine_mode &autodetected_vector_mode,
3305 bool &fatal)
3306 {
3307 loop_vec_info loop_vinfo
3308 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3309
3310 machine_mode vector_mode = vector_modes[mode_i];
3311 loop_vinfo->vector_mode = vector_mode;
3312 unsigned int suggested_unroll_factor = 1;
3313 bool slp_done_for_suggested_uf = false;
3314
3315 /* Run the main analysis. */
3316 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal,
3317 &suggested_unroll_factor,
3318 slp_done_for_suggested_uf);
3319 if (dump_enabled_p ())
3320 dump_printf_loc (MSG_NOTE, vect_location,
3321 "***** Analysis %s with vector mode %s\n",
3322 res ? "succeeded" : " failed",
3323 GET_MODE_NAME (loop_vinfo->vector_mode));
3324
3325 if (res && !main_loop_vinfo && suggested_unroll_factor > 1)
3326 {
3327 if (dump_enabled_p ())
3328 dump_printf_loc (MSG_NOTE, vect_location,
3329 "***** Re-trying analysis for unrolling"
3330 " with unroll factor %d and slp %s.\n",
3331 suggested_unroll_factor,
3332 slp_done_for_suggested_uf ? "on" : "off");
3333 loop_vec_info unroll_vinfo
3334 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
3335 unroll_vinfo->vector_mode = vector_mode;
3336 unroll_vinfo->suggested_unroll_factor = suggested_unroll_factor;
3337 opt_result new_res = vect_analyze_loop_2 (unroll_vinfo, fatal, NULL,
3338 slp_done_for_suggested_uf);
3339 if (new_res)
3340 {
3341 delete loop_vinfo;
3342 loop_vinfo = unroll_vinfo;
3343 }
3344 else
3345 delete unroll_vinfo;
3346 }
3347
3348 /* Remember the autodetected vector mode. */
3349 if (vector_mode == VOIDmode)
3350 autodetected_vector_mode = loop_vinfo->vector_mode;
3351
3352 /* Advance mode_i, first skipping modes that would result in the
3353 same analysis result. */
3354 while (mode_i + 1 < vector_modes.length ()
3355 && vect_chooses_same_modes_p (loop_vinfo,
3356 vector_modes[mode_i + 1]))
3357 {
3358 if (dump_enabled_p ())
3359 dump_printf_loc (MSG_NOTE, vect_location,
3360 "***** The result for vector mode %s would"
3361 " be the same\n",
3362 GET_MODE_NAME (vector_modes[mode_i + 1]));
3363 mode_i += 1;
3364 }
3365 if (mode_i + 1 < vector_modes.length ()
3366 && VECTOR_MODE_P (autodetected_vector_mode)
3367 && (related_vector_mode (vector_modes[mode_i + 1],
3368 GET_MODE_INNER (autodetected_vector_mode))
3369 == autodetected_vector_mode)
3370 && (related_vector_mode (autodetected_vector_mode,
3371 GET_MODE_INNER (vector_modes[mode_i + 1]))
3372 == vector_modes[mode_i + 1]))
3373 {
3374 if (dump_enabled_p ())
3375 dump_printf_loc (MSG_NOTE, vect_location,
3376 "***** Skipping vector mode %s, which would"
3377 " repeat the analysis for %s\n",
3378 GET_MODE_NAME (vector_modes[mode_i + 1]),
3379 GET_MODE_NAME (autodetected_vector_mode));
3380 mode_i += 1;
3381 }
3382 mode_i++;
3383
3384 if (!res)
3385 {
3386 delete loop_vinfo;
3387 if (fatal)
3388 gcc_checking_assert (main_loop_vinfo == NULL);
3389 return opt_loop_vec_info::propagate_failure (res);
3390 }
3391
3392 return opt_loop_vec_info::success (loop_vinfo);
3393 }
3394
3395 /* Function vect_analyze_loop.
3396
3397 Apply a set of analyses on LOOP, and create a loop_vec_info struct
3398 for it. The different analyses will record information in the
3399 loop_vec_info struct. */
3400 opt_loop_vec_info
3401 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
3402 {
3403 DUMP_VECT_SCOPE ("analyze_loop_nest");
3404
3405 if (loop_outer (loop)
3406 && loop_vec_info_for_loop (loop_outer (loop))
3407 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
3408 return opt_loop_vec_info::failure_at (vect_location,
3409 "outer-loop already vectorized.\n");
3410
3411 if (!find_loop_nest (loop, &shared->loop_nest))
3412 return opt_loop_vec_info::failure_at
3413 (vect_location,
3414 "not vectorized: loop nest containing two or more consecutive inner"
3415 " loops cannot be vectorized\n");
3416
3417 /* Analyze the loop form. */
3418 vect_loop_form_info loop_form_info;
3419 opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
3420 if (!res)
3421 {
3422 if (dump_enabled_p ())
3423 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3424 "bad loop form.\n");
3425 return opt_loop_vec_info::propagate_failure (res);
3426 }
3427 if (!integer_onep (loop_form_info.assumptions))
3428 {
3429 /* We consider vectorizing this loop by versioning it under
3430 some assumptions. In order to do this, we need to clear
3431 existing information computed by the scev and niter analyzers. */
3432 scev_reset_htab ();
3433 free_numbers_of_iterations_estimates (loop);
3434 /* Also set a flag for this loop so that the following scev and niter
3435 analyses are done under the assumptions. */
3436 loop_constraint_set (loop, LOOP_C_FINITE);
3437 }
3438
3439 auto_vector_modes vector_modes;
3440 /* Autodetect first vector size we try. */
3441 vector_modes.safe_push (VOIDmode);
3442 unsigned int autovec_flags
3443 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
3444 loop->simdlen != 0);
3445 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
3446 && !unlimited_cost_model (loop));
3447 machine_mode autodetected_vector_mode = VOIDmode;
3448 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
3449 unsigned int mode_i = 0;
3450 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
3451
3452 /* Keep track of the VF for each mode. Initialize all to 0 which indicates
3453 a mode has not been analyzed. */
3454 auto_vec<poly_uint64, 8> cached_vf_per_mode;
3455 for (unsigned i = 0; i < vector_modes.length (); ++i)
3456 cached_vf_per_mode.safe_push (0);
3457
3458 /* First determine the main loop vectorization mode, either the first
3459 one that works, starting with auto-detecting the vector mode and then
3460 following the target's order of preference, or the one with the
3461 lowest cost if pick_lowest_cost_p. */
3462 while (1)
3463 {
3464 bool fatal;
3465 unsigned int last_mode_i = mode_i;
3466 /* Set cached VF to -1 prior to analysis, which indicates a mode has
3467 failed. */
3468 cached_vf_per_mode[last_mode_i] = -1;
3469 opt_loop_vec_info loop_vinfo
3470 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3471 NULL, vector_modes, mode_i,
3472 autodetected_vector_mode, fatal);
3473 if (fatal)
3474 break;
3475
3476 if (loop_vinfo)
3477 {
3478 /* Analysis has been successful so update the VF value. The
3479 VF should always be a multiple of unroll_factor and we want to
3480 capture the original VF here. */
3481 cached_vf_per_mode[last_mode_i]
3482 = exact_div (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
3483 loop_vinfo->suggested_unroll_factor);
3484 /* Once we hit the desired simdlen for the first time,
3485 discard any previous attempts. */
3486 if (simdlen
3487 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
3488 {
3489 delete first_loop_vinfo;
3490 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3491 simdlen = 0;
3492 }
3493 else if (pick_lowest_cost_p
3494 && first_loop_vinfo
3495 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
3496 {
3497 /* Pick loop_vinfo over first_loop_vinfo. */
3498 delete first_loop_vinfo;
3499 first_loop_vinfo = opt_loop_vec_info::success (NULL);
3500 }
3501 if (first_loop_vinfo == NULL)
3502 first_loop_vinfo = loop_vinfo;
3503 else
3504 {
3505 delete loop_vinfo;
3506 loop_vinfo = opt_loop_vec_info::success (NULL);
3507 }
3508
3509 /* Commit to first_loop_vinfo if we have no reason to try
3510 alternatives. */
3511 if (!simdlen && !pick_lowest_cost_p)
3512 break;
3513 }
3514 if (mode_i == vector_modes.length ()
3515 || autodetected_vector_mode == VOIDmode)
3516 break;
3517
3518 /* Try the next biggest vector size. */
3519 if (dump_enabled_p ())
3520 dump_printf_loc (MSG_NOTE, vect_location,
3521 "***** Re-trying analysis with vector mode %s\n",
3522 GET_MODE_NAME (vector_modes[mode_i]));
3523 }
3524 if (!first_loop_vinfo)
3525 return opt_loop_vec_info::propagate_failure (res);
3526
3527 if (dump_enabled_p ())
3528 dump_printf_loc (MSG_NOTE, vect_location,
3529 "***** Choosing vector mode %s\n",
3530 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3531
3532 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3533 enabled, SIMDUID is not set, it is the innermost loop and we have
3534 either already found the loop's SIMDLEN or there was no SIMDLEN to
3535 begin with.
3536 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3537 bool vect_epilogues = (!simdlen
3538 && loop->inner == NULL
3539 && param_vect_epilogues_nomask
3540 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3541 && !loop->simduid);
3542 if (!vect_epilogues)
3543 return first_loop_vinfo;
3544
3545 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3546 poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3547
3548 /* For epilogues start the analysis from the first mode. The motivation
3549 behind starting from the beginning comes from cases where the VECTOR_MODES
3550 array may contain length-agnostic and length-specific modes. Their
3551 ordering is not guaranteed, so we could end up picking a mode for the main
3552 loop that is after the epilogue's optimal mode. */
3553 vector_modes[0] = autodetected_vector_mode;
3554 mode_i = 0;
3555
3556 bool supports_partial_vectors =
3557 partial_vectors_supported_p () && param_vect_partial_vector_usage != 0;
3558 poly_uint64 first_vinfo_vf = LOOP_VINFO_VECT_FACTOR (first_loop_vinfo);
3559
3560 while (1)
3561 {
3562 /* If the target does not support partial vectors we can shorten the
3563 number of modes to analyze for the epilogue as we know we can't pick a
3564 mode that would lead to a VF at least as big as the
3565 FIRST_VINFO_VF. */
3566 if (!supports_partial_vectors
3567 && maybe_ge (cached_vf_per_mode[mode_i], first_vinfo_vf))
3568 {
3569 mode_i++;
3570 if (mode_i == vector_modes.length ())
3571 break;
3572 continue;
3573 }
3574
3575 if (dump_enabled_p ())
3576 dump_printf_loc (MSG_NOTE, vect_location,
3577 "***** Re-trying epilogue analysis with vector "
3578 "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3579
3580 bool fatal;
3581 opt_loop_vec_info loop_vinfo
3582 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3583 first_loop_vinfo,
3584 vector_modes, mode_i,
3585 autodetected_vector_mode, fatal);
3586 if (fatal)
3587 break;
3588
3589 if (loop_vinfo)
3590 {
3591 if (pick_lowest_cost_p)
3592 {
3593 /* Keep trying to roll back vectorization attempts while the
3594 loop_vec_infos they produced were worse than this one. */
3595 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3596 while (!vinfos.is_empty ()
3597 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3598 {
3599 gcc_assert (vect_epilogues);
3600 delete vinfos.pop ();
3601 }
3602 }
3603 /* For now only allow one epilogue loop. */
3604 if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3605 {
3606 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3607 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3608 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3609 || maybe_ne (lowest_th, 0U));
3610 /* Keep track of the known smallest versioning
3611 threshold. */
3612 if (ordered_p (lowest_th, th))
3613 lowest_th = ordered_min (lowest_th, th);
3614 }
3615 else
3616 {
3617 delete loop_vinfo;
3618 loop_vinfo = opt_loop_vec_info::success (NULL);
3619 }
3620
3621 /* For now only allow one epilogue loop, but allow
3622 pick_lowest_cost_p to replace it, so commit to the
3623 first epilogue if we have no reason to try alternatives. */
3624 if (!pick_lowest_cost_p)
3625 break;
3626 }
3627
3628 if (mode_i == vector_modes.length ())
3629 break;
3630
3631 }
3632
3633 if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3634 {
3635 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3636 if (dump_enabled_p ())
3637 dump_printf_loc (MSG_NOTE, vect_location,
3638 "***** Choosing epilogue vector mode %s\n",
3639 GET_MODE_NAME
3640 (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3641 }
3642
3643 return first_loop_vinfo;
3644 }
3645
3646 /* Return true if there is an in-order reduction function for CODE, storing
3647 it in *REDUC_FN if so. */
3648
3649 static bool
3650 fold_left_reduction_fn (code_helper code, internal_fn *reduc_fn)
3651 {
3652 if (code == PLUS_EXPR)
3653 {
3654 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3655 return true;
3656 }
3657 return false;
3658 }
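
/* Illustration only: a floating-point accumulation such as

     float s = 0.0f;
     for (int i = 0; i < n; i++)
       s += a[i];

   must preserve the original evaluation order unless -fassociative-math
   (implied by -ffast-math) is in effect, so it can only be vectorized
   using the in-order IFN_FOLD_LEFT_PLUS reduction selected above.  */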
3659
3660 /* Function reduction_fn_for_scalar_code
3661
3662 Input:
3663 CODE - tree_code of a reduction operation.
3664
3665 Output:
3666 REDUC_FN - the corresponding internal function to be used to reduce the
3667 vector of partial results into a single scalar result, or IFN_LAST
3668 if the operation is a supported reduction operation, but does not have
3669 such an internal function.
3670
3671 Return FALSE if CODE currently cannot be vectorized as a reduction. */
3672
3673 bool
3674 reduction_fn_for_scalar_code (code_helper code, internal_fn *reduc_fn)
3675 {
3676 if (code.is_tree_code ())
3677 switch (tree_code (code))
3678 {
3679 case MAX_EXPR:
3680 *reduc_fn = IFN_REDUC_MAX;
3681 return true;
3682
3683 case MIN_EXPR:
3684 *reduc_fn = IFN_REDUC_MIN;
3685 return true;
3686
3687 case PLUS_EXPR:
3688 *reduc_fn = IFN_REDUC_PLUS;
3689 return true;
3690
3691 case BIT_AND_EXPR:
3692 *reduc_fn = IFN_REDUC_AND;
3693 return true;
3694
3695 case BIT_IOR_EXPR:
3696 *reduc_fn = IFN_REDUC_IOR;
3697 return true;
3698
3699 case BIT_XOR_EXPR:
3700 *reduc_fn = IFN_REDUC_XOR;
3701 return true;
3702
3703 case MULT_EXPR:
3704 case MINUS_EXPR:
3705 *reduc_fn = IFN_LAST;
3706 return true;
3707
3708 default:
3709 return false;
3710 }
3711 else
3712 switch (combined_fn (code))
3713 {
3714 CASE_CFN_FMAX:
3715 *reduc_fn = IFN_REDUC_FMAX;
3716 return true;
3717
3718 CASE_CFN_FMIN:
3719 *reduc_fn = IFN_REDUC_FMIN;
3720 return true;
3721
3722 default:
3723 return false;
3724 }
3725 }
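
/* Sketch of how the mapping above is used, assuming a simple max
   reduction written in C as

     int m = INT_MIN;
     for (int i = 0; i < n; i++)
       m = m > a[i] ? m : a[i];

   The detected reduction code is MAX_EXPR, so the vector of partial
   maxima is reduced to a single scalar with IFN_REDUC_MAX.  For
   MULT_EXPR and MINUS_EXPR the function returns IFN_LAST above, meaning
   the epilogue has to reduce the partial results without a dedicated
   internal function.  */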
3726
3727 /* If there is a neutral value X such that a reduction would not be affected
3728 by the introduction of additional X elements, return that X, otherwise
3729 return null. CODE is the code of the reduction and SCALAR_TYPE is type
3730 of the scalar elements. If the reduction has just a single initial value
3731 then INITIAL_VALUE is that value, otherwise it is null. */
3732
3733 tree
3734 neutral_op_for_reduction (tree scalar_type, code_helper code,
3735 tree initial_value)
3736 {
3737 if (code.is_tree_code ())
3738 switch (tree_code (code))
3739 {
3740 case WIDEN_SUM_EXPR:
3741 case DOT_PROD_EXPR:
3742 case SAD_EXPR:
3743 case PLUS_EXPR:
3744 case MINUS_EXPR:
3745 case BIT_IOR_EXPR:
3746 case BIT_XOR_EXPR:
3747 return build_zero_cst (scalar_type);
3748
3749 case MULT_EXPR:
3750 return build_one_cst (scalar_type);
3751
3752 case BIT_AND_EXPR:
3753 return build_all_ones_cst (scalar_type);
3754
3755 case MAX_EXPR:
3756 case MIN_EXPR:
3757 return initial_value;
3758
3759 default:
3760 return NULL_TREE;
3761 }
3762 else
3763 switch (combined_fn (code))
3764 {
3765 CASE_CFN_FMIN:
3766 CASE_CFN_FMAX:
3767 return initial_value;
3768
3769 default:
3770 return NULL_TREE;
3771 }
3772 }
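
/* Example of why the neutral value matters, assuming a hypothetical
   product reduction:

     int p = 1;
     for (int i = 0; i < n; i++)
       p *= a[i];

   Padding the final vector with the MULT_EXPR neutral value 1 leaves
   the product unchanged, just as padding a sum with 0 or a bitwise AND
   with all-ones does.  For MIN/MAX the initial value itself is the only
   safe filler, which is why INITIAL_VALUE is returned above.  */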
3773
3774 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3775 STMT is printed with a message MSG. */
3776
3777 static void
3778 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3779 {
3780 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3781 }
3782
3783 /* Return true if we need an in-order reduction for operation CODE
3784 on type TYPE. */
3786
3787 bool
3788 needs_fold_left_reduction_p (tree type, code_helper code)
3789 {
3790 /* CHECKME: check for !flag_finite_math_only too? */
3791 if (SCALAR_FLOAT_TYPE_P (type))
3792 {
3793 if (code.is_tree_code ())
3794 switch (tree_code (code))
3795 {
3796 case MIN_EXPR:
3797 case MAX_EXPR:
3798 return false;
3799
3800 default:
3801 return !flag_associative_math;
3802 }
3803 else
3804 switch (combined_fn (code))
3805 {
3806 CASE_CFN_FMIN:
3807 CASE_CFN_FMAX:
3808 return false;
3809
3810 default:
3811 return !flag_associative_math;
3812 }
3813 }
3814
3815 if (INTEGRAL_TYPE_P (type))
3816 return (!code.is_tree_code ()
3817 || !operation_no_trapping_overflow (type, tree_code (code)));
3818
3819 if (SAT_FIXED_POINT_TYPE_P (type))
3820 return true;
3821
3822 return false;
3823 }
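
/* Illustrative cases for the checks above, not an exhaustive list: a
   float sum needs an in-order reduction unless -fassociative-math is
   given; an unsigned integer sum

     unsigned int s = 0;
     for (int i = 0; i < n; i++)
       s += a[i];

   never does, because unsigned overflow wraps; and a saturating
   fixed-point accumulation always does, since reassociating it can
   change where saturation happens.  */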
3824
3825 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
3826 has a handled computation expression. Store the main reduction
3827 operation in *CODE. */
3828
3829 static bool
3830 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3831 tree loop_arg, code_helper *code,
3832 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3833 {
3834 auto_bitmap visited;
3835 tree lookfor = PHI_RESULT (phi);
3836 ssa_op_iter curri;
3837 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3838 while (USE_FROM_PTR (curr) != loop_arg)
3839 curr = op_iter_next_use (&curri);
3840 curri.i = curri.numops;
3841 do
3842 {
3843 path.safe_push (std::make_pair (curri, curr));
3844 tree use = USE_FROM_PTR (curr);
3845 if (use == lookfor)
3846 break;
3847 gimple *def = SSA_NAME_DEF_STMT (use);
3848 if (gimple_nop_p (def)
3849 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3850 {
3851 pop:
3852 do
3853 {
3854 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3855 curri = x.first;
3856 curr = x.second;
3857 do
3858 curr = op_iter_next_use (&curri);
3859 /* Skip already visited or non-SSA operands (from iterating
3860 over PHI args). */
3861 while (curr != NULL_USE_OPERAND_P
3862 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3863 || ! bitmap_set_bit (visited,
3864 SSA_NAME_VERSION
3865 (USE_FROM_PTR (curr)))));
3866 }
3867 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3868 if (curr == NULL_USE_OPERAND_P)
3869 break;
3870 }
3871 else
3872 {
3873 if (gimple_code (def) == GIMPLE_PHI)
3874 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3875 else
3876 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3877 while (curr != NULL_USE_OPERAND_P
3878 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3879 || ! bitmap_set_bit (visited,
3880 SSA_NAME_VERSION
3881 (USE_FROM_PTR (curr)))))
3882 curr = op_iter_next_use (&curri);
3883 if (curr == NULL_USE_OPERAND_P)
3884 goto pop;
3885 }
3886 }
3887 while (1);
3888 if (dump_file && (dump_flags & TDF_DETAILS))
3889 {
3890 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3891 unsigned i;
3892 std::pair<ssa_op_iter, use_operand_p> *x;
3893 FOR_EACH_VEC_ELT (path, i, x)
3894 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3895 dump_printf (MSG_NOTE, "\n");
3896 }
3897
3898 /* Check whether the reduction path detected is valid. */
3899 bool fail = path.length () == 0;
3900 bool neg = false;
3901 int sign = -1;
3902 *code = ERROR_MARK;
3903 for (unsigned i = 1; i < path.length (); ++i)
3904 {
3905 gimple *use_stmt = USE_STMT (path[i].second);
3906 gimple_match_op op;
3907 if (!gimple_extract_op (use_stmt, &op))
3908 {
3909 fail = true;
3910 break;
3911 }
3912 unsigned int opi = op.num_ops;
3913 if (gassign *assign = dyn_cast<gassign *> (use_stmt))
3914 {
3915 /* The following makes sure we can compute the operand index
3916 easily, plus it mostly disallows chaining via COND_EXPR condition
3917 operands. */
3918 for (opi = 0; opi < op.num_ops; ++opi)
3919 if (gimple_assign_rhs1_ptr (assign) + opi == path[i].second->use)
3920 break;
3921 }
3922 else if (gcall *call = dyn_cast<gcall *> (use_stmt))
3923 {
3924 for (opi = 0; opi < op.num_ops; ++opi)
3925 if (gimple_call_arg_ptr (call, opi) == path[i].second->use)
3926 break;
3927 }
3928 if (opi == op.num_ops)
3929 {
3930 fail = true;
3931 break;
3932 }
3933 op.code = canonicalize_code (op.code, op.type);
3934 if (op.code == MINUS_EXPR)
3935 {
3936 op.code = PLUS_EXPR;
3937 /* Track whether we negate the reduction value each iteration. */
3938 if (op.ops[1] == op.ops[opi])
3939 neg = ! neg;
3940 }
3941 if (CONVERT_EXPR_CODE_P (op.code)
3942 && tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
3943 ;
3944 else if (*code == ERROR_MARK)
3945 {
3946 *code = op.code;
3947 sign = TYPE_SIGN (op.type);
3948 }
3949 else if (op.code != *code)
3950 {
3951 fail = true;
3952 break;
3953 }
3954 else if ((op.code == MIN_EXPR
3955 || op.code == MAX_EXPR)
3956 && sign != TYPE_SIGN (op.type))
3957 {
3958 fail = true;
3959 break;
3960 }
3961 /* Check there's only a single stmt the op is used on. For the
3962 non-value-changing tail and the last stmt allow out-of-loop uses.
3963 ??? We could relax this and handle arbitrary live stmts by
3964 forcing a scalar epilogue for example. */
3965 imm_use_iterator imm_iter;
3966 gimple *op_use_stmt;
3967 unsigned cnt = 0;
3968 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op.ops[opi])
3969 if (!is_gimple_debug (op_use_stmt)
3970 && (*code != ERROR_MARK
3971 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3972 {
3973 /* We want to allow x + x but not x < 1 ? x : 2. */
3974 if (is_gimple_assign (op_use_stmt)
3975 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3976 {
3977 use_operand_p use_p;
3978 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3979 cnt++;
3980 }
3981 else
3982 cnt++;
3983 }
3984 if (cnt != 1)
3985 {
3986 fail = true;
3987 break;
3988 }
3989 }
3990 return ! fail && ! neg && *code != ERROR_MARK;
3991 }
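
/* A sketch of what the path check above accepts and rejects, using
   hypothetical loops:

     for (int i = 0; i < n; i++)
       s = s - a[i];    // MINUS_EXPR is canonicalized to PLUS_EXPR; OK

     for (int i = 0; i < n; i++)
       s = a[i] - s;    // negates the reduction value each iteration,
                        // so NEG stays set and the path is rejected

   Mixing different operations along the path (say an add in one stmt
   and a max in another) also fails, because *CODE must stay
   consistent.  */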
3992
3993 bool
3994 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3995 tree loop_arg, enum tree_code code)
3996 {
3997 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3998 code_helper code_;
3999 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
4000 && code_ == code);
4001 }
4002
4003
4004
4005 /* Function vect_is_simple_reduction
4006
4007 (1) Detect a cross-iteration def-use cycle that represents a simple
4008 reduction computation. We look for the following pattern:
4009
4010 loop_header:
4011 a1 = phi < a0, a2 >
4012 a3 = ...
4013 a2 = operation (a3, a1)
4014
4015 or
4016
4017 a3 = ...
4018 loop_header:
4019 a1 = phi < a0, a2 >
4020 a2 = operation (a3, a1)
4021
4022 such that:
4023 1. operation is commutative and associative and it is safe to
4024 change the order of the computation
4025 2. no uses for a2 in the loop (a2 is used out of the loop)
4026 3. no uses of a1 in the loop besides the reduction operation
4027 4. no uses of a1 outside the loop.
4028
4029 Conditions 1,4 are tested here.
4030 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
4031
4032 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
4033 nested cycles.
4034
4035 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
4036 reductions:
4037
4038 a1 = phi < a0, a2 >
4039 inner loop (def of a3)
4040 a2 = phi < a3 >
4041
4042 (4) Detect condition expressions, ie:
4043 for (int i = 0; i < N; i++)
4044 if (a[i] < val)
4045 ret_val = a[i];
4046
4047 */
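
/* As a concrete illustration of pattern (1) above, the C loop

     int sum = 0;
     for (int i = 0; i < n; i++)
       sum += a[i];

   yields a header PHI sum_1 = PHI <sum_0, sum_2> and a latch definition
   sum_2 = sum_1 + a[i], which is exactly the cross-iteration def-use
   cycle the function below looks for.  (This is an example only, not
   part of the detection logic.)  */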
4048
4049 static stmt_vec_info
4050 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
4051 bool *double_reduc, bool *reduc_chain_p, bool slp)
4052 {
4053 gphi *phi = as_a <gphi *> (phi_info->stmt);
4054 gimple *phi_use_stmt = NULL;
4055 imm_use_iterator imm_iter;
4056 use_operand_p use_p;
4057
4058 *double_reduc = false;
4059 *reduc_chain_p = false;
4060 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
4061
4062 tree phi_name = PHI_RESULT (phi);
4063 /* ??? If there are no uses of the PHI result the inner loop reduction
4064 won't be detected as a possible double reduction by vectorizable_reduction
4065 because that tries to walk the PHI arg from the preheader edge which
4066 can be constant. See PR60382. */
4067 if (has_zero_uses (phi_name))
4068 return NULL;
4069 class loop *loop = (gimple_bb (phi))->loop_father;
4070 unsigned nphi_def_loop_uses = 0;
4071 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
4072 {
4073 gimple *use_stmt = USE_STMT (use_p);
4074 if (is_gimple_debug (use_stmt))
4075 continue;
4076
4077 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4078 {
4079 if (dump_enabled_p ())
4080 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4081 "intermediate value used outside loop.\n");
4082
4083 return NULL;
4084 }
4085
4086 nphi_def_loop_uses++;
4087 phi_use_stmt = use_stmt;
4088 }
4089
4090 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
4091 if (TREE_CODE (latch_def) != SSA_NAME)
4092 {
4093 if (dump_enabled_p ())
4094 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4095 "reduction: not ssa_name: %T\n", latch_def);
4096 return NULL;
4097 }
4098
4099 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
4100 if (!def_stmt_info
4101 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
4102 return NULL;
4103
4104 bool nested_in_vect_loop
4105 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
4106 unsigned nlatch_def_loop_uses = 0;
4107 auto_vec<gphi *, 3> lcphis;
4108 bool inner_loop_of_double_reduc = false;
4109 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
4110 {
4111 gimple *use_stmt = USE_STMT (use_p);
4112 if (is_gimple_debug (use_stmt))
4113 continue;
4114 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
4115 nlatch_def_loop_uses++;
4116 else
4117 {
4118 /* We can have more than one loop-closed PHI. */
4119 lcphis.safe_push (as_a <gphi *> (use_stmt));
4120 if (nested_in_vect_loop
4121 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
4122 == vect_double_reduction_def))
4123 inner_loop_of_double_reduc = true;
4124 }
4125 }
4126
4127 /* If we are vectorizing an inner reduction, we execute it in the
4128 original order only in the case that we are not dealing with a
4129 double reduction. */
4130 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
4131 {
4132 if (dump_enabled_p ())
4133 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
4134 "detected nested cycle: ");
4135 return def_stmt_info;
4136 }
4137
4138 /* When the inner loop of a double reduction ends up with more than
4139 one loop-closed PHI we have failed to classify alternate such
4140 PHIs as double reduction, leading to wrong code. See PR103237. */
4141 if (inner_loop_of_double_reduc && lcphis.length () != 1)
4142 {
4143 if (dump_enabled_p ())
4144 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4145 "unhandle double reduction\n");
4146 return NULL;
4147 }
4148
4149 /* If this isn't a nested cycle or if the nested cycle reduction value
4150 is used outside of the inner loop we cannot handle uses of the reduction
4151 value. */
4152 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
4153 {
4154 if (dump_enabled_p ())
4155 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4156 "reduction used in loop.\n");
4157 return NULL;
4158 }
4159
4160 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
4161 defined in the inner loop. */
4162 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
4163 {
4164 tree op1 = PHI_ARG_DEF (def_stmt, 0);
4165 if (gimple_phi_num_args (def_stmt) != 1
4166 || TREE_CODE (op1) != SSA_NAME)
4167 {
4168 if (dump_enabled_p ())
4169 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4170 "unsupported phi node definition.\n");
4171
4172 return NULL;
4173 }
4174
4175 /* Verify there is an inner cycle composed of the PHI phi_use_stmt
4176 and the latch definition op1. */
4177 gimple *def1 = SSA_NAME_DEF_STMT (op1);
4178 if (gimple_bb (def1)
4179 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4180 && loop->inner
4181 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
4182 && (is_gimple_assign (def1) || is_gimple_call (def1))
4183 && is_a <gphi *> (phi_use_stmt)
4184 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt))
4185 && (op1 == PHI_ARG_DEF_FROM_EDGE (phi_use_stmt,
4186 loop_latch_edge (loop->inner))))
4187 {
4188 if (dump_enabled_p ())
4189 report_vect_op (MSG_NOTE, def_stmt,
4190 "detected double reduction: ");
4191
4192 *double_reduc = true;
4193 return def_stmt_info;
4194 }
4195
4196 return NULL;
4197 }
4198
4199 /* Look for the expression computing latch_def from the loop PHI result. */
4200 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
4201 code_helper code;
4202 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
4203 path))
4204 {
4205 STMT_VINFO_REDUC_CODE (phi_info) = code;
4206 if (code == COND_EXPR && !nested_in_vect_loop)
4207 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
4208
4209 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
4210 reduction chain for which the additional restriction is that
4211 all operations in the chain are the same. */
4212 auto_vec<stmt_vec_info, 8> reduc_chain;
4213 unsigned i;
4214 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
4215 for (i = path.length () - 1; i >= 1; --i)
4216 {
4217 gimple *stmt = USE_STMT (path[i].second);
4218 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
4219 gimple_match_op op;
4220 if (!gimple_extract_op (stmt, &op))
4221 gcc_unreachable ();
4222 if (gassign *assign = dyn_cast<gassign *> (stmt))
4223 STMT_VINFO_REDUC_IDX (stmt_info)
4224 = path[i].second->use - gimple_assign_rhs1_ptr (assign);
4225 else
4226 {
4227 gcall *call = as_a<gcall *> (stmt);
4228 STMT_VINFO_REDUC_IDX (stmt_info)
4229 = path[i].second->use - gimple_call_arg_ptr (call, 0);
4230 }
4231 bool leading_conversion = (CONVERT_EXPR_CODE_P (op.code)
4232 && (i == 1 || i == path.length () - 1));
4233 if ((op.code != code && !leading_conversion)
4234 /* We can only handle the final value in epilogue
4235 generation for reduction chains. */
4236 || (i != 1 && !has_single_use (gimple_get_lhs (stmt))))
4237 is_slp_reduc = false;
4238 /* For reduction chains we support trailing/leading
4239 conversions. We do not store those in the actual chain. */
4240 if (leading_conversion)
4241 continue;
4242 reduc_chain.safe_push (stmt_info);
4243 }
4244 if (slp && is_slp_reduc && reduc_chain.length () > 1)
4245 {
4246 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
4247 {
4248 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
4249 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
4250 }
4251 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
4252 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
4253
4254 /* Save the chain for further analysis in SLP detection. */
4255 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
4256 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
4257
4258 *reduc_chain_p = true;
4259 if (dump_enabled_p ())
4260 dump_printf_loc (MSG_NOTE, vect_location,
4261 "reduction: detected reduction chain\n");
4262 }
4263 else if (dump_enabled_p ())
4264 dump_printf_loc (MSG_NOTE, vect_location,
4265 "reduction: detected reduction\n");
4266
4267 return def_stmt_info;
4268 }
4269
4270 if (dump_enabled_p ())
4271 dump_printf_loc (MSG_NOTE, vect_location,
4272 "reduction: unknown pattern\n");
4273
4274 return NULL;
4275 }
4276
4277 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
4278 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
4279 or -1 if not known. */
4280
4281 static int
4282 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
4283 {
4284 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4285 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
4286 {
4287 if (dump_enabled_p ())
4288 dump_printf_loc (MSG_NOTE, vect_location,
4289 "cost model: epilogue peel iters set to vf/2 "
4290 "because loop iterations are unknown .\n");
4291 return assumed_vf / 2;
4292 }
4293 else
4294 {
4295 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
4296 peel_iters_prologue = MIN (niters, peel_iters_prologue);
4297 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
4298 /* If we need to peel for gaps but no peeling is otherwise required,
4299 we have to peel VF iterations. */
4300 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
4301 peel_iters_epilogue = assumed_vf;
4302 return peel_iters_epilogue;
4303 }
4304 }
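
/* Worked example with made-up numbers: for a known NITERS of 100, an
   assumed VF of 8 and a prologue peel of 3, the epilogue peels
   (100 - 3) % 8 == 1 iteration; if the loop additionally needed peeling
   for gaps and that remainder had been 0, a full VF (8 iterations)
   would be peeled instead.  */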
4305
4306 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
4307 int
4308 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
4309 int *peel_iters_epilogue,
4310 stmt_vector_for_cost *scalar_cost_vec,
4311 stmt_vector_for_cost *prologue_cost_vec,
4312 stmt_vector_for_cost *epilogue_cost_vec)
4313 {
4314 int retval = 0;
4315
4316 *peel_iters_epilogue
4317 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
4318
4319 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
4320 {
4321 /* If peeled iterations are known but the number of scalar loop
4322 iterations is unknown, count a taken branch per peeled loop. */
4323 if (peel_iters_prologue > 0)
4324 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
4325 vect_prologue);
4326 if (*peel_iters_epilogue > 0)
4327 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
4328 vect_epilogue);
4329 }
4330
4331 stmt_info_for_cost *si;
4332 int j;
4333 if (peel_iters_prologue)
4334 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4335 retval += record_stmt_cost (prologue_cost_vec,
4336 si->count * peel_iters_prologue,
4337 si->kind, si->stmt_info, si->misalign,
4338 vect_prologue);
4339 if (*peel_iters_epilogue)
4340 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
4341 retval += record_stmt_cost (epilogue_cost_vec,
4342 si->count * *peel_iters_epilogue,
4343 si->kind, si->stmt_info, si->misalign,
4344 vect_epilogue);
4345
4346 return retval;
4347 }
4348
4349 /* Function vect_estimate_min_profitable_iters
4350
4351 Return the number of iterations required for the vector version of the
4352 loop to be profitable relative to the cost of the scalar version of the
4353 loop.
4354
4355 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
4356 of iterations for vectorization. -1 value means loop vectorization
4357 is not profitable. This returned value may be used for dynamic
4358 profitability check.
4359
4360 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
4361 for static check against estimated number of iterations. */
4362
4363 static void
4364 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
4365 int *ret_min_profitable_niters,
4366 int *ret_min_profitable_estimate,
4367 unsigned *suggested_unroll_factor)
4368 {
4369 int min_profitable_iters;
4370 int min_profitable_estimate;
4371 int peel_iters_prologue;
4372 int peel_iters_epilogue;
4373 unsigned vec_inside_cost = 0;
4374 int vec_outside_cost = 0;
4375 unsigned vec_prologue_cost = 0;
4376 unsigned vec_epilogue_cost = 0;
4377 int scalar_single_iter_cost = 0;
4378 int scalar_outside_cost = 0;
4379 int assumed_vf = vect_vf_for_cost (loop_vinfo);
4380 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
4381 vector_costs *target_cost_data = loop_vinfo->vector_costs;
4382
4383 /* Cost model disabled. */
4384 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
4385 {
4386 if (dump_enabled_p ())
4387 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
4388 *ret_min_profitable_niters = 0;
4389 *ret_min_profitable_estimate = 0;
4390 return;
4391 }
4392
4393 /* Requires loop versioning tests to handle misalignment. */
4394 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
4395 {
4396 /* FIXME: Make cost depend on complexity of individual check. */
4397 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
4398 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4399 if (dump_enabled_p ())
4400 dump_printf (MSG_NOTE,
4401 "cost model: Adding cost of checks for loop "
4402 "versioning to treat misalignment.\n");
4403 }
4404
4405 /* Requires loop versioning with alias checks. */
4406 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
4407 {
4408 /* FIXME: Make cost depend on complexity of individual check. */
4409 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
4410 (void) add_stmt_cost (target_cost_data, len, scalar_stmt, vect_prologue);
4411 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
4412 if (len)
4413 /* Count LEN - 1 ANDs and LEN comparisons. */
4414 (void) add_stmt_cost (target_cost_data, len * 2 - 1,
4415 scalar_stmt, vect_prologue);
4416 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
4417 if (len)
4418 {
4419 /* Count LEN - 1 ANDs and LEN comparisons. */
4420 unsigned int nstmts = len * 2 - 1;
4421 /* +1 for each bias that needs adding. */
4422 for (unsigned int i = 0; i < len; ++i)
4423 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
4424 nstmts += 1;
4425 (void) add_stmt_cost (target_cost_data, nstmts,
4426 scalar_stmt, vect_prologue);
4427 }
4428 if (dump_enabled_p ())
4429 dump_printf (MSG_NOTE,
4430 "cost model: Adding cost of checks for loop "
4431 "versioning aliasing.\n");
4432 }
4433
4434 /* Requires loop versioning with niter checks. */
4435 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
4436 {
4437 /* FIXME: Make cost depend on complexity of individual check. */
4438 (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
4439 NULL, NULL, NULL_TREE, 0, vect_prologue);
4440 if (dump_enabled_p ())
4441 dump_printf (MSG_NOTE,
4442 "cost model: Adding cost of checks for loop "
4443 "versioning niters.\n");
4444 }
4445
4446 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4447 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4448 vect_prologue);
4449
4450 /* Count statements in the scalar loop. Use this as the scalar cost for a
4451 single iteration for now.
4452
4453 TODO: Add outer loop support.
4454
4455 TODO: Consider assigning different costs to different scalar
4456 statements. */
4457
4458 scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
4459
4460 /* Add additional cost for the peeled instructions in prologue and epilogue
4461 loop. (For fully-masked loops there will be no peeling.)
4462
4463 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
4464 at compile-time - we assume it's vf/2 (the worst would be vf-1).
4465
4466 TODO: Build an expression that represents peel_iters for prologue and
4467 epilogue to be used in a run-time test. */
4468
4469 bool prologue_need_br_taken_cost = false;
4470 bool prologue_need_br_not_taken_cost = false;
4471
4472 /* Calculate peel_iters_prologue. */
4473 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
4474 peel_iters_prologue = 0;
4475 else if (npeel < 0)
4476 {
4477 peel_iters_prologue = assumed_vf / 2;
4478 if (dump_enabled_p ())
4479 dump_printf (MSG_NOTE, "cost model: "
4480 "prologue peel iters set to vf/2.\n");
4481
4482 /* If peeled iterations are unknown, count a taken branch and a not taken
4483 branch per peeled loop. Even if scalar loop iterations are known,
4484 vector iterations are not known since peeled prologue iterations are
4485 not known. Hence guards remain the same. */
4486 prologue_need_br_taken_cost = true;
4487 prologue_need_br_not_taken_cost = true;
4488 }
4489 else
4490 {
4491 peel_iters_prologue = npeel;
4492 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
4493 /* If peeled iterations are known but the number of scalar loop
4494 iterations is unknown, count a taken branch per peeled loop. */
4495 prologue_need_br_taken_cost = true;
4496 }
4497
4498 bool epilogue_need_br_taken_cost = false;
4499 bool epilogue_need_br_not_taken_cost = false;
4500
4501 /* Calculate peel_iters_epilogue. */
4502 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4503 /* We need to peel exactly one iteration for gaps. */
4504 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
4505 else if (npeel < 0)
4506 {
4507 /* If peeling for alignment is unknown, the loop bound of the main loop
4508 becomes unknown. */
4509 peel_iters_epilogue = assumed_vf / 2;
4510 if (dump_enabled_p ())
4511 dump_printf (MSG_NOTE, "cost model: "
4512 "epilogue peel iters set to vf/2 because "
4513 "peeling for alignment is unknown.\n");
4514
4515 /* See the same reason above in peel_iters_prologue calculation. */
4516 epilogue_need_br_taken_cost = true;
4517 epilogue_need_br_not_taken_cost = true;
4518 }
4519 else
4520 {
4521 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
4522 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
4523 /* If peeled iterations are known but the number of scalar loop
4524 iterations is unknown, count a taken branch per peeled loop. */
4525 epilogue_need_br_taken_cost = true;
4526 }
4527
4528 stmt_info_for_cost *si;
4529 int j;
4530 /* Add costs associated with peel_iters_prologue. */
4531 if (peel_iters_prologue)
4532 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4533 {
4534 (void) add_stmt_cost (target_cost_data,
4535 si->count * peel_iters_prologue, si->kind,
4536 si->stmt_info, si->node, si->vectype,
4537 si->misalign, vect_prologue);
4538 }
4539
4540 /* Add costs associated with peel_iters_epilogue. */
4541 if (peel_iters_epilogue)
4542 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
4543 {
4544 (void) add_stmt_cost (target_cost_data,
4545 si->count * peel_iters_epilogue, si->kind,
4546 si->stmt_info, si->node, si->vectype,
4547 si->misalign, vect_epilogue);
4548 }
4549
4550 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
4551
4552 if (prologue_need_br_taken_cost)
4553 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4554 vect_prologue);
4555
4556 if (prologue_need_br_not_taken_cost)
4557 (void) add_stmt_cost (target_cost_data, 1,
4558 cond_branch_not_taken, vect_prologue);
4559
4560 if (epilogue_need_br_taken_cost)
4561 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
4562 vect_epilogue);
4563
4564 if (epilogue_need_br_not_taken_cost)
4565 (void) add_stmt_cost (target_cost_data, 1,
4566 cond_branch_not_taken, vect_epilogue);
4567
4568 /* Take care of special costs for rgroup controls of partial vectors. */
4569 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4570 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4571 == vect_partial_vectors_avx512))
4572 {
4573 /* Calculate how many masks we need to generate. */
4574 unsigned int num_masks = 0;
4575 bool need_saturation = false;
4576 for (auto rgm : LOOP_VINFO_MASKS (loop_vinfo).rgc_vec)
4577 if (rgm.type)
4578 {
4579 unsigned nvectors = rgm.factor;
4580 num_masks += nvectors;
4581 if (TYPE_PRECISION (TREE_TYPE (rgm.compare_type))
4582 < TYPE_PRECISION (LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo)))
4583 need_saturation = true;
4584 }
4585
4586 /* ??? The target isn't able to identify the costs below as
4587 producing masks so it cannot penalize cases where we'd run
4588 out of mask registers, for example. */
4589
4590 /* ??? We are also failing to account for smaller vector masks
4591 we generate by splitting larger masks in vect_get_loop_mask. */
4592
4593 /* In the worst case, we need to generate each mask in the prologue
4594 and in the loop body. We need one splat per group and one
4595 compare per mask.
4596
4597 Sometimes the prologue mask will fold to a constant,
4598 so the actual prologue cost might be smaller. However, it's
4599 simpler and safer to use the worst-case cost; if this ends up
4600 being the tie-breaker between vectorizing or not, then it's
4601 probably better not to vectorize. */
4602 (void) add_stmt_cost (target_cost_data,
4603 num_masks
4604 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4605 vector_stmt, NULL, NULL, NULL_TREE, 0,
4606 vect_prologue);
4607 (void) add_stmt_cost (target_cost_data,
4608 num_masks
4609 + LOOP_VINFO_MASKS (loop_vinfo).rgc_vec.length (),
4610 vector_stmt, NULL, NULL, NULL_TREE, 0, vect_body);
4611
4612 /* When we need saturation we need it both in the prologue and
4613 the epilogue. */
4614 if (need_saturation)
4615 {
4616 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4617 NULL, NULL, NULL_TREE, 0, vect_prologue);
4618 (void) add_stmt_cost (target_cost_data, 1, scalar_stmt,
4619 NULL, NULL, NULL_TREE, 0, vect_body);
4620 }
4621 }
4622 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4623 && (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
4624 == vect_partial_vectors_while_ult))
4625 {
4626 /* Calculate how many masks we need to generate. */
4627 unsigned int num_masks = 0;
4628 rgroup_controls *rgm;
4629 unsigned int num_vectors_m1;
4630 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo).rgc_vec,
4631 num_vectors_m1, rgm)
4632 if (rgm->type)
4633 num_masks += num_vectors_m1 + 1;
4634 gcc_assert (num_masks > 0);
4635
4636 /* In the worst case, we need to generate each mask in the prologue
4637 and in the loop body. One of the loop body mask instructions
4638 replaces the comparison in the scalar loop, and since we don't
4639 count the scalar comparison against the scalar body, we shouldn't
4640 count that vector instruction against the vector body either.
4641
4642 Sometimes we can use unpacks instead of generating prologue
4643 masks and sometimes the prologue mask will fold to a constant,
4644 so the actual prologue cost might be smaller. However, it's
4645 simpler and safer to use the worst-case cost; if this ends up
4646 being the tie-breaker between vectorizing or not, then it's
4647 probably better not to vectorize. */
4648 (void) add_stmt_cost (target_cost_data, num_masks,
4649 vector_stmt, NULL, NULL, NULL_TREE, 0,
4650 vect_prologue);
4651 (void) add_stmt_cost (target_cost_data, num_masks - 1,
4652 vector_stmt, NULL, NULL, NULL_TREE, 0,
4653 vect_body);
4654 }
4655 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4656 {
4657 /* Referring to the functions vect_set_loop_condition_partial_vectors
4658 and vect_set_loop_controls_directly, we need to generate each
4659 length in the prologue and in the loop body if required. Although
4660 there are some possible optimizations, we consider the worst case
4661 here. */
4662
4663 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4664 signed char partial_load_store_bias
4665 = LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo);
4666 bool need_iterate_p
4667 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4668 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4669
4670 /* Calculate how many statements to be added. */
4671 unsigned int prologue_stmts = 0;
4672 unsigned int body_stmts = 0;
4673
4674 rgroup_controls *rgc;
4675 unsigned int num_vectors_m1;
4676 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4677 if (rgc->type)
4678 {
4679 /* May need one SHIFT for nitems_total computation. */
4680 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4681 if (nitems != 1 && !niters_known_p)
4682 prologue_stmts += 1;
4683
4684 /* May need one MAX and one MINUS for wrap around. */
4685 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4686 prologue_stmts += 2;
4687
4688 /* Need one MAX and one MINUS for each batch limit except for
4689 the first one. */
4690 prologue_stmts += num_vectors_m1 * 2;
4691
4692 unsigned int num_vectors = num_vectors_m1 + 1;
4693
4694 /* Need to set up lengths in prologue, only one MIN required
4695 for each since start index is zero. */
4696 prologue_stmts += num_vectors;
4697
4698 /* If we have a non-zero partial load bias, we need one PLUS
4699 to adjust the load length. */
4700 if (partial_load_store_bias != 0)
4701 body_stmts += 1;
4702
4703 /* Each may need two MINs and one MINUS to update lengths in body
4704 for next iteration. */
4705 if (need_iterate_p)
4706 body_stmts += 3 * num_vectors;
4707 }
4708
4709 (void) add_stmt_cost (target_cost_data, prologue_stmts,
4710 scalar_stmt, vect_prologue);
4711 (void) add_stmt_cost (target_cost_data, body_stmts,
4712 scalar_stmt, vect_body);
4713 }
4714
4715 /* FORNOW: The scalar outside cost is incremented in one of the
4716 following ways:
4717
4718 1. The vectorizer checks for alignment and aliasing and generates
4719 a condition that allows dynamic vectorization. A cost model
4720 check is ANDED with the versioning condition. Hence scalar code
4721 path now has the added cost of the versioning check.
4722
4723 if (cost > th & versioning_check)
4724 jmp to vector code
4725
4726 Hence run-time scalar is incremented by not-taken branch cost.
4727
4728 2. The vectorizer then checks if a prologue is required. If the
4729 cost model check was not done before during versioning, it has to
4730 be done before the prologue check.
4731
4732 if (cost <= th)
4733 prologue = scalar_iters
4734 if (prologue == 0)
4735 jmp to vector code
4736 else
4737 execute prologue
4738 if (prologue == num_iters)
4739 go to exit
4740
4741 Hence the run-time scalar cost is incremented by a taken branch,
4742 plus a not-taken branch, plus a taken branch cost.
4743
4744 3. The vectorizer then checks if an epilogue is required. If the
4745 cost model check was not done before during prologue check, it
4746 has to be done with the epilogue check.
4747
4748 if (prologue == 0)
4749 jmp to vector code
4750 else
4751 execute prologue
4752 if (prologue == num_iters)
4753 go to exit
4754 vector code:
4755 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4756 jmp to epilogue
4757
4758 Hence the run-time scalar cost should be incremented by 2 taken
4759 branches.
4760
4761 TODO: The back end may reorder the BBs differently and reverse
4762 conditions/branch directions. Change the estimates below to
4763 something more reasonable. */
4764
4765 /* If the number of iterations is known and we do not do versioning, we can
4766 decide whether to vectorize at compile time. Hence the scalar version
4767 does not carry cost model guard costs. */
4768 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4769 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4770 {
4771 /* Cost model check occurs at versioning. */
4772 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4773 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4774 else
4775 {
4776 /* Cost model check occurs at prologue generation. */
4777 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4778 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4779 + vect_get_stmt_cost (cond_branch_not_taken);
4780 /* Cost model check occurs at epilogue generation. */
4781 else
4782 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4783 }
4784 }
4785
4786 /* Complete the target-specific cost calculations. */
4787 finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
4788 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost,
4789 suggested_unroll_factor);
4790
4791 if (suggested_unroll_factor && *suggested_unroll_factor > 1
4792 && LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) != MAX_VECTORIZATION_FACTOR
4793 && !known_le (LOOP_VINFO_VECT_FACTOR (loop_vinfo) *
4794 *suggested_unroll_factor,
4795 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo)))
4796 {
4797 if (dump_enabled_p ())
4798 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4799 "can't unroll as unrolled vectorization factor larger"
4800 " than maximum vectorization factor: "
4801 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
4802 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo));
4803 *suggested_unroll_factor = 1;
4804 }
4805
4806 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4807
4808 if (dump_enabled_p ())
4809 {
4810 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4811 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4812 vec_inside_cost);
4813 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4814 vec_prologue_cost);
4815 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4816 vec_epilogue_cost);
4817 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4818 scalar_single_iter_cost);
4819 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4820 scalar_outside_cost);
4821 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4822 vec_outside_cost);
4823 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4824 peel_iters_prologue);
4825 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4826 peel_iters_epilogue);
4827 }
4828
4829 /* Calculate number of iterations required to make the vector version
4830 profitable, relative to the loop bodies only. The following condition
4831 must hold true:
4832 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4833 where
4834 SIC = scalar iteration cost, VIC = vector iteration cost,
4835 VOC = vector outside cost, VF = vectorization factor,
4836 NPEEL = prologue iterations + epilogue iterations,
4837 SOC = scalar outside cost for run time cost model check. */
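
   /* Worked example with purely hypothetical costs, treating the division
      as exact for simplicity: SIC = 4, VIC = 12, VF = 4, NPEEL = 2,
      VOC = 30, SOC = 6.  The condition becomes

	4 * niters + 6 > 12 * ((niters - 2) / 4) + 30
		       = 3 * niters + 24

      which holds for niters > 18, so under this model the vector version
      only pays off once the loop runs at least 19 scalar iterations.  */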
4838
4839 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4840 - vec_inside_cost);
4841 if (saving_per_viter <= 0)
4842 {
4843 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4844 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4845 "vectorization did not happen for a simd loop");
4846
4847 if (dump_enabled_p ())
4848 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4849 "cost model: the vector iteration cost = %d "
4850 "divided by the scalar iteration cost = %d "
4851 "is greater or equal to the vectorization factor = %d"
4852 ".\n",
4853 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4854 *ret_min_profitable_niters = -1;
4855 *ret_min_profitable_estimate = -1;
4856 return;
4857 }
4858
4859 /* ??? The "if" arm is written to handle all cases; see below for what
4860 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4861 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4862 {
4863 /* Rewriting the condition above in terms of the number of
4864 vector iterations (vniters) rather than the number of
4865 scalar iterations (niters) gives:
4866
4867 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4868
4869 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4870
4871 For integer N, X and Y when X > 0:
4872
4873 N * X > Y <==> N >= (Y /[floor] X) + 1. */
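/* Small worked instance with invented numbers: if SIC = 1, VIC = 2 and
   VF = 4 then X = SIC * VF - VIC = 2, and with VOC = 10, NPEEL = 0 and
   SOC = 0 the right-hand side is Y = 10.  The identity above gives
   vniters >= 10 / 2 + 1 = 6, matching the computation of
   min_vec_niters below: 6 * 2 = 12 > 10, while 5 * 2 = 10 is not.  */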
4874 int outside_overhead = (vec_outside_cost
4875 - scalar_single_iter_cost * peel_iters_prologue
4876 - scalar_single_iter_cost * peel_iters_epilogue
4877 - scalar_outside_cost);
4878 /* We're only interested in cases that require at least one
4879 vector iteration. */
4880 int min_vec_niters = 1;
4881 if (outside_overhead > 0)
4882 min_vec_niters = outside_overhead / saving_per_viter + 1;
4883
4884 if (dump_enabled_p ())
4885 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4886 min_vec_niters);
4887
4888 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4889 {
4890 /* Now that we know the minimum number of vector iterations,
4891 find the minimum niters for which the scalar cost is larger:
4892
4893 SIC * niters > VIC * vniters + VOC - SOC
4894
4895 We know that the minimum niters is no more than
4896 vniters * VF + NPEEL, but it might be (and often is) less
4897 than that if a partial vector iteration is cheaper than the
4898 equivalent scalar code. */
4899 int threshold = (vec_inside_cost * min_vec_niters
4900 + vec_outside_cost
4901 - scalar_outside_cost);
4902 if (threshold <= 0)
4903 min_profitable_iters = 1;
4904 else
4905 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4906 }
4907 else
4908 /* Convert the number of vector iterations into a number of
4909 scalar iterations. */
4910 min_profitable_iters = (min_vec_niters * assumed_vf
4911 + peel_iters_prologue
4912 + peel_iters_epilogue);
4913 }
4914 else
4915 {
4916 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4917 * assumed_vf
4918 - vec_inside_cost * peel_iters_prologue
4919 - vec_inside_cost * peel_iters_epilogue);
4920 if (min_profitable_iters <= 0)
4921 min_profitable_iters = 0;
4922 else
4923 {
4924 min_profitable_iters /= saving_per_viter;
4925
4926 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4927 <= (((int) vec_inside_cost * min_profitable_iters)
4928 + (((int) vec_outside_cost - scalar_outside_cost)
4929 * assumed_vf)))
4930 min_profitable_iters++;
4931 }
4932 }
4933
4934 if (dump_enabled_p ())
4935 dump_printf (MSG_NOTE,
4936 " Calculated minimum iters for profitability: %d\n",
4937 min_profitable_iters);
4938
4939 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4940 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4941 /* We want the vectorized loop to execute at least once. */
4942 min_profitable_iters = assumed_vf + peel_iters_prologue;
4943 else if (min_profitable_iters < peel_iters_prologue)
4944 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4945 vectorized loop executes at least once. */
4946 min_profitable_iters = peel_iters_prologue;
4947
4948 if (dump_enabled_p ())
4949 dump_printf_loc (MSG_NOTE, vect_location,
4950 " Runtime profitability threshold = %d\n",
4951 min_profitable_iters);
4952
4953 *ret_min_profitable_niters = min_profitable_iters;
4954
4955 /* Calculate number of iterations required to make the vector version
4956 profitable, relative to the loop bodies only.
4957
4958 The non-vectorized variant costs SIC * niters and it must win over the vector
4959 variant for the expected loop trip count. The following condition must hold true:
4960 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4961
4962 if (vec_outside_cost <= 0)
4963 min_profitable_estimate = 0;
4964 /* ??? This "else if" arm is written to handle all cases; see below for
4965 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4966 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4967 {
4968 /* This is a repeat of the code above, but with + SOC rather
4969 than - SOC. */
4970 int outside_overhead = (vec_outside_cost
4971 - scalar_single_iter_cost * peel_iters_prologue
4972 - scalar_single_iter_cost * peel_iters_epilogue
4973 + scalar_outside_cost);
4974 int min_vec_niters = 1;
4975 if (outside_overhead > 0)
4976 min_vec_niters = outside_overhead / saving_per_viter + 1;
4977
4978 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4979 {
4980 int threshold = (vec_inside_cost * min_vec_niters
4981 + vec_outside_cost
4982 + scalar_outside_cost);
4983 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4984 }
4985 else
4986 min_profitable_estimate = (min_vec_niters * assumed_vf
4987 + peel_iters_prologue
4988 + peel_iters_epilogue);
4989 }
4990 else
4991 {
4992 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4993 * assumed_vf
4994 - vec_inside_cost * peel_iters_prologue
4995 - vec_inside_cost * peel_iters_epilogue)
4996 / ((scalar_single_iter_cost * assumed_vf)
4997 - vec_inside_cost);
4998 }
4999 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
5000 if (dump_enabled_p ())
5001 dump_printf_loc (MSG_NOTE, vect_location,
5002 " Static estimate profitability threshold = %d\n",
5003 min_profitable_estimate);
5004
5005 *ret_min_profitable_estimate = min_profitable_estimate;
5006 }
5007
5008 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
5009 vector elements (not bits) for a vector with NELT elements. */
5010 static void
5011 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
5012 vec_perm_builder *sel)
5013 {
5014 /* The encoding is a single stepped pattern. Any wrap-around is handled
5015 by vec_perm_indices. */
5016 sel->new_vector (nelt, 1, 3);
5017 for (unsigned int i = 0; i < 3; i++)
5018 sel->quick_push (i + offset);
5019 }
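/* Illustrative example: OFFSET = 2 and NELT = 8 encode the stepped
   series {2, 3, 4}, which vec_perm_indices expands to the full
   selector {2, 3, 4, 5, 6, 7, 8, 9}; indices 8 and 9 refer to the
   second vector operand of the permutation, so the net effect is a
   shift down by two elements.  */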
5020
5021 /* Checks whether the target supports whole-vector shifts for vectors of mode
5022 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
5023 it supports vec_perm_const with masks for all necessary shift amounts. */
5024 static bool
5025 have_whole_vector_shift (machine_mode mode)
5026 {
5027 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
5028 return true;
5029
5030 /* Variable-length vectors should be handled via the optab. */
5031 unsigned int nelt;
5032 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
5033 return false;
5034
5035 vec_perm_builder sel;
5036 vec_perm_indices indices;
5037 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
5038 {
5039 calc_vec_perm_mask_for_shift (i, nelt, &sel);
5040 indices.new_vector (sel, 2, nelt);
5041 if (!can_vec_perm_const_p (mode, mode, indices, false))
5042 return false;
5043 }
5044 return true;
5045 }
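/* For example, for a fixed-width mode with eight elements the loop
   above checks shift amounts 4, 2 and 1, i.e. the halving offsets a
   shift-based final reduction would need.  */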
5046
5047 /* Return true if (a) STMT_INFO is a DOT_PROD_EXPR reduction whose
5048 multiplication operands have differing signs and (b) we intend
5049 to emulate the operation using a series of signed DOT_PROD_EXPRs.
5050 See vect_emulate_mixed_dot_prod for the actual sequence used. */
5051
5052 static bool
5053 vect_is_emulated_mixed_dot_prod (loop_vec_info loop_vinfo,
5054 stmt_vec_info stmt_info)
5055 {
5056 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
5057 if (!assign || gimple_assign_rhs_code (assign) != DOT_PROD_EXPR)
5058 return false;
5059
5060 tree rhs1 = gimple_assign_rhs1 (assign);
5061 tree rhs2 = gimple_assign_rhs2 (assign);
5062 if (TYPE_SIGN (TREE_TYPE (rhs1)) == TYPE_SIGN (TREE_TYPE (rhs2)))
5063 return false;
5064
5065 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5066 gcc_assert (reduc_info->is_reduc_info);
5067 return !directly_supported_p (DOT_PROD_EXPR,
5068 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info),
5069 optab_vector_mixed_sign);
5070 }
5071
5072 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
5073 functions. Design better to avoid maintenance issues. */
5074
5075 /* Function vect_model_reduction_cost.
5076
5077 Models cost for a reduction operation, including the vector ops
5078 generated within the strip-mine loop in some cases, the initial
5079 definition before the loop, and the epilogue code that must be generated. */
5080
5081 static void
5082 vect_model_reduction_cost (loop_vec_info loop_vinfo,
5083 stmt_vec_info stmt_info, internal_fn reduc_fn,
5084 vect_reduction_type reduction_type,
5085 int ncopies, stmt_vector_for_cost *cost_vec)
5086 {
5087 int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
5088 tree vectype;
5089 machine_mode mode;
5090 class loop *loop = NULL;
5091
5092 if (loop_vinfo)
5093 loop = LOOP_VINFO_LOOP (loop_vinfo);
5094
5095 /* Condition reductions generate two reductions in the loop. */
5096 if (reduction_type == COND_REDUCTION)
5097 ncopies *= 2;
5098
5099 vectype = STMT_VINFO_VECTYPE (stmt_info);
5100 mode = TYPE_MODE (vectype);
5101 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5102
5103 gimple_match_op op;
5104 if (!gimple_extract_op (orig_stmt_info->stmt, &op))
5105 gcc_unreachable ();
5106
5107 bool emulated_mixed_dot_prod
5108 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
5109 if (reduction_type == EXTRACT_LAST_REDUCTION)
5110 /* No extra instructions are needed in the prologue. The loop body
5111 operations are costed in vectorizable_condition. */
5112 inside_cost = 0;
5113 else if (reduction_type == FOLD_LEFT_REDUCTION)
5114 {
5115 /* No extra instructions needed in the prologue. */
5116 prologue_cost = 0;
5117
5118 if (reduc_fn != IFN_LAST)
5119 /* Count one reduction-like operation per vector. */
5120 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
5121 stmt_info, 0, vect_body);
5122 else
5123 {
5124 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
5125 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
5126 inside_cost = record_stmt_cost (cost_vec, nelements,
5127 vec_to_scalar, stmt_info, 0,
5128 vect_body);
5129 inside_cost += record_stmt_cost (cost_vec, nelements,
5130 scalar_stmt, stmt_info, 0,
5131 vect_body);
5132 }
5133 }
5134 else
5135 {
5136 /* Add in the cost of the initial definitions. */
5137 int prologue_stmts;
5138 if (reduction_type == COND_REDUCTION)
5139 /* For cond reductions we have four vectors: initial index, step,
5140 initial result of the data reduction, initial value of the index
5141 reduction. */
5142 prologue_stmts = 4;
5143 else if (emulated_mixed_dot_prod)
5144 /* We need the initial reduction value and two invariants:
5145 one that contains the minimum signed value and one that
5146 contains half of its negative. */
5147 prologue_stmts = 3;
5148 else
5149 prologue_stmts = 1;
5150 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
5151 scalar_to_vec, stmt_info, 0,
5152 vect_prologue);
5153 }
5154
5155 /* Determine cost of epilogue code.
5156
5157 We have a reduction operator that will reduce the vector in one statement.
5158 Also requires scalar extract. */
5159
5160 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
5161 {
5162 if (reduc_fn != IFN_LAST)
5163 {
5164 if (reduction_type == COND_REDUCTION)
5165 {
5166 /* An EQ stmt and a COND_EXPR stmt. */
5167 epilogue_cost += record_stmt_cost (cost_vec, 2,
5168 vector_stmt, stmt_info, 0,
5169 vect_epilogue);
5170 /* Reduction of the max index and a reduction of the found
5171 values. */
5172 epilogue_cost += record_stmt_cost (cost_vec, 2,
5173 vec_to_scalar, stmt_info, 0,
5174 vect_epilogue);
5175 /* A broadcast of the max value. */
5176 epilogue_cost += record_stmt_cost (cost_vec, 1,
5177 scalar_to_vec, stmt_info, 0,
5178 vect_epilogue);
5179 }
5180 else
5181 {
5182 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
5183 stmt_info, 0, vect_epilogue);
5184 epilogue_cost += record_stmt_cost (cost_vec, 1,
5185 vec_to_scalar, stmt_info, 0,
5186 vect_epilogue);
5187 }
5188 }
5189 else if (reduction_type == COND_REDUCTION)
5190 {
5191 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
5192 /* Extraction of scalar elements. */
5193 epilogue_cost += record_stmt_cost (cost_vec,
5194 2 * estimated_nunits,
5195 vec_to_scalar, stmt_info, 0,
5196 vect_epilogue);
5197 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
5198 epilogue_cost += record_stmt_cost (cost_vec,
5199 2 * estimated_nunits - 3,
5200 scalar_stmt, stmt_info, 0,
5201 vect_epilogue);
5202 }
5203 else if (reduction_type == EXTRACT_LAST_REDUCTION
5204 || reduction_type == FOLD_LEFT_REDUCTION)
5205 /* No extra instructions are needed in the epilogue. */
5206 ;
5207 else
5208 {
5209 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5210 tree bitsize = TYPE_SIZE (op.type);
5211 int element_bitsize = tree_to_uhwi (bitsize);
5212 int nelements = vec_size_in_bits / element_bitsize;
5213
5214 if (op.code == COND_EXPR)
5215 op.code = MAX_EXPR;
5216
5217 /* We have a whole vector shift available. */
5218 if (VECTOR_MODE_P (mode)
5219 && directly_supported_p (op.code, vectype)
5220 && have_whole_vector_shift (mode))
5221 {
5222 /* Final reduction via vector shifts and the reduction operator.
5223 Also requires scalar extract. */
5224 epilogue_cost += record_stmt_cost (cost_vec,
5225 exact_log2 (nelements) * 2,
5226 vector_stmt, stmt_info, 0,
5227 vect_epilogue);
5228 epilogue_cost += record_stmt_cost (cost_vec, 1,
5229 vec_to_scalar, stmt_info, 0,
5230 vect_epilogue);
5231 }
5232 else
5233 /* Use extracts and reduction op for final reduction. For N
5234 elements, we have N extracts and N-1 reduction ops. */
5235 epilogue_cost += record_stmt_cost (cost_vec,
5236 nelements + nelements - 1,
5237 vector_stmt, stmt_info, 0,
5238 vect_epilogue);
5239 }
5240 }
5241
5242 if (dump_enabled_p ())
5243 dump_printf (MSG_NOTE,
5244 "vect_model_reduction_cost: inside_cost = %d, "
5245 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
5246 prologue_cost, epilogue_cost);
5247 }
5248
5249 /* SEQ is a sequence of instructions that initialize the reduction
5250 described by REDUC_INFO. Emit them in the appropriate place. */
5251
5252 static void
5253 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
5254 stmt_vec_info reduc_info, gimple *seq)
5255 {
5256 if (reduc_info->reused_accumulator)
5257 {
5258 /* When reusing an accumulator from the main loop, we only need
5259 initialization instructions if the main loop can be skipped.
5260 In that case, emit the initialization instructions at the end
5261 of the guard block that does the skip. */
5262 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5263 gcc_assert (skip_edge);
5264 gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
5265 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
5266 }
5267 else
5268 {
5269 /* The normal case: emit the initialization instructions on the
5270 preheader edge. */
5271 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5272 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
5273 }
5274 }
5275
5276 /* Function get_initial_def_for_reduction
5277
5278 Input:
5279 REDUC_INFO - the info_for_reduction
5280 INIT_VAL - the initial value of the reduction variable
5281 NEUTRAL_OP - a value that has no effect on the reduction, as per
5282 neutral_op_for_reduction
5283
5284 Output:
5285 Return a vector variable, initialized according to the operation that
5286 STMT_VINFO performs. This vector will be used as the initial value
5287 of the vector of partial results.
5288
5289 The value we need is a vector in which element 0 has value INIT_VAL
5290 and every other element has value NEUTRAL_OP. */
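/* For example, for a sum reduction with INIT_VAL 5 and NEUTRAL_OP 0 on
   a four-lane vector this builds {5, 0, 0, 0}; for a MIN or MAX
   reduction the neutral value is the initial value itself, so the
   operand_equal_p case below produces a simple splat.  */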
5291
5292 static tree
5293 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
5294 stmt_vec_info reduc_info,
5295 tree init_val, tree neutral_op)
5296 {
5297 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5298 tree scalar_type = TREE_TYPE (init_val);
5299 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
5300 tree init_def;
5301 gimple_seq stmts = NULL;
5302
5303 gcc_assert (vectype);
5304
5305 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
5306 || SCALAR_FLOAT_TYPE_P (scalar_type));
5307
5308 gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
5309 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
5310
5311 if (operand_equal_p (init_val, neutral_op))
5312 {
5313 /* If both elements are equal then the vector described above is
5314 just a splat. */
5315 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5316 init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
5317 }
5318 else
5319 {
5320 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
5321 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
5322 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
5323 {
5324 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
5325 element 0. */
5326 init_def = gimple_build_vector_from_val (&stmts, vectype,
5327 neutral_op);
5328 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
5329 vectype, init_def, init_val);
5330 }
5331 else
5332 {
5333 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
5334 tree_vector_builder elts (vectype, 1, 2);
5335 elts.quick_push (init_val);
5336 elts.quick_push (neutral_op);
5337 init_def = gimple_build_vector (&stmts, &elts);
5338 }
5339 }
5340
5341 if (stmts)
5342 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
5343 return init_def;
5344 }
5345
5346 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
5347 which performs a reduction involving GROUP_SIZE scalar statements.
5348 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
5349 is nonnull, introducing extra elements of that value will not change the
5350 result. */
5351
5352 static void
5353 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
5354 stmt_vec_info reduc_info,
5355 vec<tree> *vec_oprnds,
5356 unsigned int number_of_vectors,
5357 unsigned int group_size, tree neutral_op)
5358 {
5359 vec<tree> &initial_values = reduc_info->reduc_initial_values;
5360 unsigned HOST_WIDE_INT nunits;
5361 unsigned j, number_of_places_left_in_vector;
5362 tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
5363 unsigned int i;
5364
5365 gcc_assert (group_size == initial_values.length () || neutral_op);
5366
5367 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
5368 created vectors. It is greater than 1 if unrolling is performed.
5369
5370 For example, we have two scalar operands, s1 and s2 (e.g., group of
5371 strided accesses of size two), while NUNITS is four (i.e., four scalars
5372 of this type can be packed in a vector). The output vector will contain
5373 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
5374 will be 2).
5375
5376 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
5377 vectors containing the operands.
5378
5379 For example, NUNITS is four as before, and the group size is 8
5380 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
5381 {s5, s6, s7, s8}. */
5382
5383 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
5384 nunits = group_size;
5385
5386 number_of_places_left_in_vector = nunits;
5387 bool constant_p = true;
5388 tree_vector_builder elts (vector_type, nunits, 1);
5389 elts.quick_grow (nunits);
5390 gimple_seq ctor_seq = NULL;
5391 for (j = 0; j < nunits * number_of_vectors; ++j)
5392 {
5393 tree op;
5394 i = j % group_size;
5395
5396 /* Get the def before the loop. In a reduction chain we have only
5397 one initial value; otherwise we have as many as there are PHIs in the group. */
5398 if (i >= initial_values.length () || (j > i && neutral_op))
5399 op = neutral_op;
5400 else
5401 op = initial_values[i];
5402
5403 /* Create 'vect_ = {op0,op1,...,opn}'. */
5404 number_of_places_left_in_vector--;
5405 elts[nunits - number_of_places_left_in_vector - 1] = op;
5406 if (!CONSTANT_CLASS_P (op))
5407 constant_p = false;
5408
5409 if (number_of_places_left_in_vector == 0)
5410 {
5411 tree init;
5412 if (constant_p && !neutral_op
5413 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
5414 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
5415 /* Build the vector directly from ELTS. */
5416 init = gimple_build_vector (&ctor_seq, &elts);
5417 else if (neutral_op)
5418 {
5419 /* Build a vector of the neutral value and shift the
5420 other elements into place. */
5421 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
5422 neutral_op);
5423 int k = nunits;
5424 while (k > 0 && elts[k - 1] == neutral_op)
5425 k -= 1;
5426 while (k > 0)
5427 {
5428 k -= 1;
5429 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
5430 vector_type, init, elts[k]);
5431 }
5432 }
5433 else
5434 {
5435 /* First time round, duplicate ELTS to fill the
5436 required number of vectors. */
5437 duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
5438 elts, number_of_vectors, *vec_oprnds);
5439 break;
5440 }
5441 vec_oprnds->quick_push (init);
5442
5443 number_of_places_left_in_vector = nunits;
5444 elts.new_vector (vector_type, nunits, 1);
5445 elts.quick_grow (nunits);
5446 constant_p = true;
5447 }
5448 }
5449 if (ctor_seq != NULL)
5450 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
5451 }
5452
5453 /* For a statement STMT_INFO taking part in a reduction operation return
5454 the stmt_vec_info the meta information is stored on. */
5455
5456 stmt_vec_info
5457 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
5458 {
5459 stmt_info = vect_orig_stmt (stmt_info);
5460 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
5461 if (!is_a <gphi *> (stmt_info->stmt)
5462 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
5463 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5464 gphi *phi = as_a <gphi *> (stmt_info->stmt);
5465 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5466 {
5467 if (gimple_phi_num_args (phi) == 1)
5468 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
5469 }
5470 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5471 {
5472 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
5473 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
5474 stmt_info = info;
5475 }
5476 return stmt_info;
5477 }
5478
5479 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
5480 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
5481 return false. */
5482
5483 static bool
5484 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
5485 stmt_vec_info reduc_info)
5486 {
5487 loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
5488 if (!main_loop_vinfo)
5489 return false;
5490
5491 if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
5492 return false;
5493
5494 unsigned int num_phis = reduc_info->reduc_initial_values.length ();
5495 auto_vec<tree, 16> main_loop_results (num_phis);
5496 auto_vec<tree, 16> initial_values (num_phis);
5497 if (edge main_loop_edge = loop_vinfo->main_loop_edge)
5498 {
5499 /* The epilogue loop can be entered either from the main loop or
5500 from an earlier guard block. */
5501 edge skip_edge = loop_vinfo->skip_main_loop_edge;
5502 for (tree incoming_value : reduc_info->reduc_initial_values)
5503 {
5504 /* Look for:
5505
5506 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
5507 INITIAL_VALUE(guard block)>. */
5508 gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
5509
5510 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
5511 gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
5512
5513 tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
5514 tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
5515
5516 main_loop_results.quick_push (from_main_loop);
5517 initial_values.quick_push (from_skip);
5518 }
5519 }
5520 else
5521 /* The main loop dominates the epilogue loop. */
5522 main_loop_results.splice (reduc_info->reduc_initial_values);
5523
5524 /* See if the main loop has the kind of accumulator we need. */
5525 vect_reusable_accumulator *accumulator
5526 = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
5527 if (!accumulator
5528 || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
5529 || !std::equal (main_loop_results.begin (), main_loop_results.end (),
5530 accumulator->reduc_info->reduc_scalar_results.begin ()))
5531 return false;
5532
5533 /* Handle the case where we can reduce wider vectors to narrower ones. */
5534 tree vectype = STMT_VINFO_VECTYPE (reduc_info);
5535 tree old_vectype = TREE_TYPE (accumulator->reduc_input);
5536 unsigned HOST_WIDE_INT m;
5537 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
5538 TYPE_VECTOR_SUBPARTS (vectype), &m))
5539 return false;
5540 /* Check the intermediate vector types and operations are available. */
5541 tree prev_vectype = old_vectype;
5542 poly_uint64 intermediate_nunits = TYPE_VECTOR_SUBPARTS (old_vectype);
5543 while (known_gt (intermediate_nunits, TYPE_VECTOR_SUBPARTS (vectype)))
5544 {
5545 intermediate_nunits = exact_div (intermediate_nunits, 2);
5546 tree intermediate_vectype = get_related_vectype_for_scalar_type
5547 (TYPE_MODE (vectype), TREE_TYPE (vectype), intermediate_nunits);
5548 if (!intermediate_vectype
5549 || !directly_supported_p (STMT_VINFO_REDUC_CODE (reduc_info),
5550 intermediate_vectype)
5551 || !can_vec_extract (TYPE_MODE (prev_vectype),
5552 TYPE_MODE (intermediate_vectype)))
5553 return false;
5554 prev_vectype = intermediate_vectype;
5555 }
5556
5557 /* Non-SLP reductions might apply an adjustment after the reduction
5558 operation, in order to simplify the initialization of the accumulator.
5559 If the epilogue loop carries on from where the main loop left off,
5560 it should apply the same adjustment to the final reduction result.
5561
5562 If the epilogue loop can also be entered directly (rather than via
5563 the main loop), we need to be able to handle that case in the same way,
5564 with the same adjustment. (In principle we could add a PHI node
5565 to select the correct adjustment, but in practice that shouldn't be
5566 necessary.) */
5567 tree main_adjustment
5568 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
5569 if (loop_vinfo->main_loop_edge && main_adjustment)
5570 {
5571 gcc_assert (num_phis == 1);
5572 tree initial_value = initial_values[0];
5573 /* Check that we can use INITIAL_VALUE as the adjustment and
5574 initialize the accumulator with a neutral value instead. */
5575 if (!operand_equal_p (initial_value, main_adjustment))
5576 return false;
5577 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5578 initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
5579 code, initial_value);
5580 }
5581 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
5582 reduc_info->reduc_initial_values.truncate (0);
5583 reduc_info->reduc_initial_values.splice (initial_values);
5584 reduc_info->reused_accumulator = accumulator;
5585 return true;
5586 }
5587
5588 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
5589 CODE, emitting any new stmts into SEQ. Returns a vector def of VECTYPE. */
5590
5591 static tree
5592 vect_create_partial_epilog (tree vec_def, tree vectype, code_helper code,
5593 gimple_seq *seq)
5594 {
5595 unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
5596 unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5597 tree stype = TREE_TYPE (vectype);
5598 tree new_temp = vec_def;
5599 while (nunits > nunits1)
5600 {
5601 nunits /= 2;
5602 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5603 stype, nunits);
5604 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5605
5606 /* The target has to make sure we support lowpart/highpart
5607 extraction, either via direct vector extract or through
5608 integer mode punning. */
5609 tree dst1, dst2;
5610 gimple *epilog_stmt;
5611 if (convert_optab_handler (vec_extract_optab,
5612 TYPE_MODE (TREE_TYPE (new_temp)),
5613 TYPE_MODE (vectype1))
5614 != CODE_FOR_nothing)
5615 {
5616 /* Extract sub-vectors directly once vec_extract becomes
5617 a conversion optab. */
5618 dst1 = make_ssa_name (vectype1);
5619 epilog_stmt
5620 = gimple_build_assign (dst1, BIT_FIELD_REF,
5621 build3 (BIT_FIELD_REF, vectype1,
5622 new_temp, TYPE_SIZE (vectype1),
5623 bitsize_int (0)));
5624 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5625 dst2 = make_ssa_name (vectype1);
5626 epilog_stmt
5627 = gimple_build_assign (dst2, BIT_FIELD_REF,
5628 build3 (BIT_FIELD_REF, vectype1,
5629 new_temp, TYPE_SIZE (vectype1),
5630 bitsize_int (bitsize)));
5631 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5632 }
5633 else
5634 {
5635 /* Extract via punning to appropriately sized integer mode
5636 vector. */
5637 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5638 tree etype = build_vector_type (eltype, 2);
5639 gcc_assert (convert_optab_handler (vec_extract_optab,
5640 TYPE_MODE (etype),
5641 TYPE_MODE (eltype))
5642 != CODE_FOR_nothing);
5643 tree tem = make_ssa_name (etype);
5644 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5645 build1 (VIEW_CONVERT_EXPR,
5646 etype, new_temp));
5647 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5648 new_temp = tem;
5649 tem = make_ssa_name (eltype);
5650 epilog_stmt
5651 = gimple_build_assign (tem, BIT_FIELD_REF,
5652 build3 (BIT_FIELD_REF, eltype,
5653 new_temp, TYPE_SIZE (eltype),
5654 bitsize_int (0)));
5655 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5656 dst1 = make_ssa_name (vectype1);
5657 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5658 build1 (VIEW_CONVERT_EXPR,
5659 vectype1, tem));
5660 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5661 tem = make_ssa_name (eltype);
5662 epilog_stmt
5663 = gimple_build_assign (tem, BIT_FIELD_REF,
5664 build3 (BIT_FIELD_REF, eltype,
5665 new_temp, TYPE_SIZE (eltype),
5666 bitsize_int (bitsize)));
5667 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5668 dst2 = make_ssa_name (vectype1);
5669 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5670 build1 (VIEW_CONVERT_EXPR,
5671 vectype1, tem));
5672 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
5673 }
5674
5675 new_temp = gimple_build (seq, code, vectype1, dst1, dst2);
5676 }
5677
5678 return new_temp;
5679 }
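/* Hypothetical example: reducing an eight-element vector of partial
   results to a four-element VECTYPE takes one iteration of the loop
   above, which extracts the low and high halves into DST1 and DST2 and
   combines them with CODE (an addition for a sum reduction), halving
   NUNITS from 8 to 4.  */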
5680
5681 /* Function vect_create_epilog_for_reduction
5682
5683 Create code at the loop-epilog to finalize the result of a reduction
5684 computation.
5685
5686 STMT_INFO is the scalar reduction stmt that is being vectorized.
5687 SLP_NODE is an SLP node containing a group of reduction statements. The
5688 first one in this group is STMT_INFO.
5689 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
5690 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
5691 (counting from 0)
5692
5693 This function:
5694 1. Completes the reduction def-use cycles.
5695 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
5696 by calling the function specified by REDUC_FN if available, or by
5697 other means (whole-vector shifts or a scalar loop).
5698 The function also creates a new phi node at the loop exit to preserve
5699 loop-closed form, as illustrated below.
5700
5701 The flow at the entry to this function:
5702
5703 loop:
5704 vec_def = phi <vec_init, null> # REDUCTION_PHI
5705 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5706 s_loop = scalar_stmt # (scalar) STMT_INFO
5707 loop_exit:
5708 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5709 use <s_out0>
5710 use <s_out0>
5711
5712 The above is transformed by this function into:
5713
5714 loop:
5715 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5716 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5717 s_loop = scalar_stmt # (scalar) STMT_INFO
5718 loop_exit:
5719 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5720 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5721 v_out2 = reduce <v_out1>
5722 s_out3 = extract_field <v_out2, 0>
5723 s_out4 = adjust_result <s_out3>
5724 use <s_out4>
5725 use <s_out4>
5726 */
5727
5728 static void
5729 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5730 stmt_vec_info stmt_info,
5731 slp_tree slp_node,
5732 slp_instance slp_node_instance)
5733 {
5734 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5735 gcc_assert (reduc_info->is_reduc_info);
5736 /* For double reductions we need to get at the inner loop reduction
5737 stmt which has the meta info attached. Our stmt_info is that of the
5738 loop-closed PHI of the inner loop which we remember as
5739 def for the reduction PHI generation. */
5740 bool double_reduc = false;
5741 stmt_vec_info rdef_info = stmt_info;
5742 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5743 {
5744 gcc_assert (!slp_node);
5745 double_reduc = true;
5746 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5747 (stmt_info->stmt, 0));
5748 stmt_info = vect_stmt_to_vectorize (stmt_info);
5749 }
5750 gphi *reduc_def_stmt
5751 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5752 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
5753 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5754 tree vectype;
5755 machine_mode mode;
5756 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5757 basic_block exit_bb;
5758 tree scalar_dest;
5759 tree scalar_type;
5760 gimple *new_phi = NULL, *phi;
5761 gimple_stmt_iterator exit_gsi;
5762 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5763 gimple *epilog_stmt = NULL;
5764 gimple *exit_phi;
5765 tree bitsize;
5766 tree def;
5767 tree orig_name, scalar_result;
5768 imm_use_iterator imm_iter, phi_imm_iter;
5769 use_operand_p use_p, phi_use_p;
5770 gimple *use_stmt;
5771 auto_vec<tree> reduc_inputs;
5772 int j, i;
5773 vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5774 unsigned int group_size = 1, k;
5775 auto_vec<gimple *> phis;
5776 /* SLP reduction without reduction chain, e.g.,
5777 # a1 = phi <a2, a0>
5778 # b1 = phi <b2, b0>
5779 a2 = operation (a1)
5780 b2 = operation (b1) */
5781 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5782 bool direct_slp_reduc;
5783 tree induction_index = NULL_TREE;
5784
5785 if (slp_node)
5786 group_size = SLP_TREE_LANES (slp_node);
5787
5788 if (nested_in_vect_loop_p (loop, stmt_info))
5789 {
5790 outer_loop = loop;
5791 loop = loop->inner;
5792 gcc_assert (!slp_node && double_reduc);
5793 }
5794
5795 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5796 gcc_assert (vectype);
5797 mode = TYPE_MODE (vectype);
5798
5799 tree induc_val = NULL_TREE;
5800 tree adjustment_def = NULL;
5801 if (slp_node)
5802 ;
5803 else
5804 {
5805 /* Optimize: for induction condition reduction, if we can't use zero
5806 for induc_val, use initial_def. */
5807 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5808 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5809 else if (double_reduc)
5810 ;
5811 else
5812 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5813 }
5814
5815 stmt_vec_info single_live_out_stmt[] = { stmt_info };
5816 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5817 if (slp_reduc)
5818 /* All statements produce live-out values. */
5819 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5820 else if (slp_node)
5821 {
5822 /* The last statement in the reduction chain produces the live-out
5823 value. Note SLP optimization can shuffle scalar stmts to
5824 optimize permutations so we have to search for the last stmt. */
5825 for (k = 0; k < group_size; ++k)
5826 if (!REDUC_GROUP_NEXT_ELEMENT (SLP_TREE_SCALAR_STMTS (slp_node)[k]))
5827 {
5828 single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5829 break;
5830 }
5831 }
5832
5833 unsigned vec_num;
5834 int ncopies;
5835 if (slp_node)
5836 {
5837 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
5838 ncopies = 1;
5839 }
5840 else
5841 {
5842 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5843 vec_num = 1;
5844 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5845 }
5846
5847 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5848 which is updated with the current index of the loop for every match of
5849 the original loop's cond_expr (VEC_STMT). This results in a vector
5850 containing the last time the condition passed for that vector lane.
5851 The first match will be a 1 to allow 0 to be used for non-matching
5852 indexes. If there are no matches at all then the vector will be all
5853 zeroes.
5854
5855 PR92772: This algorithm is broken for architectures that support
5856 masked vectors, but do not provide fold_extract_last. */
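/* Illustrative example with made-up values: with VF = 4, if the
   condition last held at loop indexes 5, never, 7 and 4 in the four
   lanes, INDEX_COND_EXPR ends up as {5, 0, 7, 4}; the epilogue code
   further down reduces that vector with a MAX to obtain 7 and selects
   the data value recorded in the corresponding lane.  */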
5857 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
5858 {
5859 auto_vec<std::pair<tree, bool>, 2> ccompares;
5860 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
5861 cond_info = vect_stmt_to_vectorize (cond_info);
5862 while (cond_info != reduc_info)
5863 {
5864 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5865 {
5866 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
5867 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5868 ccompares.safe_push
5869 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
5870 STMT_VINFO_REDUC_IDX (cond_info) == 2));
5871 }
5872 cond_info
5873 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
5874 1 + STMT_VINFO_REDUC_IDX
5875 (cond_info)));
5876 cond_info = vect_stmt_to_vectorize (cond_info);
5877 }
5878 gcc_assert (ccompares.length () != 0);
5879
5880 tree indx_before_incr, indx_after_incr;
5881 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5882 int scalar_precision
5883 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5884 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5885 tree cr_index_vector_type = get_related_vectype_for_scalar_type
5886 (TYPE_MODE (vectype), cr_index_scalar_type,
5887 TYPE_VECTOR_SUBPARTS (vectype));
5888
5889 /* First we create a simple vector induction variable which starts
5890 with the values {1,2,3,...} (SERIES_VECT) and increments by the
5891 vector size (STEP). */
5892
5893 /* Create a {1,2,3,...} vector. */
5894 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5895
5896 /* Create a vector of the step value. */
5897 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5898 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5899
5900 /* Create an induction variable. */
5901 gimple_stmt_iterator incr_gsi;
5902 bool insert_after;
5903 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5904 create_iv (series_vect, PLUS_EXPR, vec_step, NULL_TREE, loop, &incr_gsi,
5905 insert_after, &indx_before_incr, &indx_after_incr);
5906
5907 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5908 filled with zeros (VEC_ZERO). */
5909
5910 /* Create a vector of 0s. */
5911 tree zero = build_zero_cst (cr_index_scalar_type);
5912 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5913
5914 /* Create a vector phi node. */
5915 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5916 new_phi = create_phi_node (new_phi_tree, loop->header);
5917 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5918 loop_preheader_edge (loop), UNKNOWN_LOCATION);
5919
5920 /* Now take the condition from the loop's original cond_exprs
5921 and produce a new cond_exprs (INDEX_COND_EXPR) which for
5922 every match uses values from the induction variable
5923 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5924 (NEW_PHI_TREE).
5925 Finally, we update the phi (NEW_PHI_TREE) to take the value of
5926 the new cond_expr (INDEX_COND_EXPR). */
5927 gimple_seq stmts = NULL;
5928 for (int i = ccompares.length () - 1; i != -1; --i)
5929 {
5930 tree ccompare = ccompares[i].first;
5931 if (ccompares[i].second)
5932 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5933 cr_index_vector_type,
5934 ccompare,
5935 indx_before_incr, new_phi_tree);
5936 else
5937 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5938 cr_index_vector_type,
5939 ccompare,
5940 new_phi_tree, indx_before_incr);
5941 }
5942 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5943
5944 /* Update the phi with the vec cond. */
5945 induction_index = new_phi_tree;
5946 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5947 loop_latch_edge (loop), UNKNOWN_LOCATION);
5948 }
5949
5950 /* 2. Create epilog code.
5951 The reduction epilog code operates across the elements of the vector
5952 of partial results computed by the vectorized loop.
5953 The reduction epilog code consists of:
5954
5955 step 1: compute the scalar result in a vector (v_out2)
5956 step 2: extract the scalar result (s_out3) from the vector (v_out2)
5957 step 3: adjust the scalar result (s_out3) if needed.
5958
5959 Step 1 can be accomplished using one of the following three schemes:
5960 (scheme 1) using reduc_fn, if available.
5961 (scheme 2) using whole-vector shifts, if available.
5962 (scheme 3) using a scalar loop. In this case steps 1+2 above are
5963 combined.
5964
5965 The overall epilog code looks like this:
5966
5967 s_out0 = phi <s_loop> # original EXIT_PHI
5968 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5969 v_out2 = reduce <v_out1> # step 1
5970 s_out3 = extract_field <v_out2, 0> # step 2
5971 s_out4 = adjust_result <s_out3> # step 3
5972
5973 (step 3 is optional, and steps 1 and 2 may be combined).
5974 Lastly, the uses of s_out0 are replaced by s_out4. */
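/* Sketch of scheme 2 for an invented four-element vector {a, b, c, d}
   and a sum reduction: shift by two elements and add, giving
   {a+c, b+d, ...}; shift the result by one element and add, giving
   {a+b+c+d, ...}; then extract element 0.  That is log2(4) = 2
   shift/add pairs followed by a single scalar extract, which is what
   vect_model_reduction_cost charges for this scheme.  */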
5975
5976
5977 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5978 v_out1 = phi <VECT_DEF>
5979 Store them in NEW_PHIS. */
5980 if (double_reduc)
5981 loop = outer_loop;
5982 exit_bb = single_exit (loop)->dest;
5983 exit_gsi = gsi_after_labels (exit_bb);
5984 reduc_inputs.create (slp_node ? vec_num : ncopies);
5985 for (unsigned i = 0; i < vec_num; i++)
5986 {
5987 gimple_seq stmts = NULL;
5988 if (slp_node)
5989 def = vect_get_slp_vect_def (slp_node, i);
5990 else
5991 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
5992 for (j = 0; j < ncopies; j++)
5993 {
5994 tree new_def = copy_ssa_name (def);
5995 phi = create_phi_node (new_def, exit_bb);
5996 if (j)
5997 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
5998 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5999 new_def = gimple_convert (&stmts, vectype, new_def);
6000 reduc_inputs.quick_push (new_def);
6001 }
6002 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6003 }
6004
6005 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
6006 (i.e. when reduc_fn is not available) and in the final adjustment
6007 code (if needed). Also get the original scalar reduction variable as
6008 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
6009 represents a reduction pattern), the tree-code and scalar-def are
6010 taken from the original stmt that the pattern-stmt (STMT) replaces.
6011 Otherwise (it is a regular reduction) - the tree-code and scalar-def
6012 are taken from STMT. */
6013
6014 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
6015 if (orig_stmt_info != stmt_info)
6016 {
6017 /* Reduction pattern */
6018 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6019 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
6020 }
6021
6022 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
6023 scalar_type = TREE_TYPE (scalar_dest);
6024 scalar_results.truncate (0);
6025 scalar_results.reserve_exact (group_size);
6026 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
6027 bitsize = TYPE_SIZE (scalar_type);
6028
6029 /* True if we should implement SLP_REDUC using native reduction operations
6030 instead of scalar operations. */
6031 direct_slp_reduc = (reduc_fn != IFN_LAST
6032 && slp_reduc
6033 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
6034
6035 /* In case of reduction chain, e.g.,
6036 # a1 = phi <a3, a0>
6037 a2 = operation (a1)
6038 a3 = operation (a2),
6039
6040 we may end up with more than one vector result. Here we reduce them
6041 to one vector.
6042
6043 The same is true for a SLP reduction, e.g.,
6044 # a1 = phi <a2, a0>
6045 # b1 = phi <b2, b0>
6046 a2 = operation (a1)
6047 b2 = operation (b1),
6048
6049 where we can end up with more than one vector as well. We can
6050 easily accumulate vectors when the number of vector elements is
6051 a multiple of the SLP group size.
6052
6053 The same is true if we couldn't use a single def-use cycle. */
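/* For instance, if a reduction chain produced two partial-result
   vectors v0 and v1 for a sum, the code below folds them into a single
   input, v0 + v1 (built with CODE), before the scalar epilogue is
   generated.  */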
6054 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6055 || direct_slp_reduc
6056 || (slp_reduc
6057 && constant_multiple_p (TYPE_VECTOR_SUBPARTS (vectype), group_size))
6058 || ncopies > 1)
6059 {
6060 gimple_seq stmts = NULL;
6061 tree single_input = reduc_inputs[0];
6062 for (k = 1; k < reduc_inputs.length (); k++)
6063 single_input = gimple_build (&stmts, code, vectype,
6064 single_input, reduc_inputs[k]);
6065 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6066
6067 reduc_inputs.truncate (0);
6068 reduc_inputs.safe_push (single_input);
6069 }
6070
6071 tree orig_reduc_input = reduc_inputs[0];
6072
6073 /* If this loop is an epilogue loop that can be skipped after the
6074 main loop, we can only share a reduction operation between the
6075 main loop and the epilogue if we put it at the target of the
6076 skip edge.
6077
6078 We can still reuse accumulators if this check fails. Doing so has
6079 the minor(?) benefit of making the epilogue loop's scalar result
6080 independent of the main loop's scalar result. */
6081 bool unify_with_main_loop_p = false;
6082 if (reduc_info->reused_accumulator
6083 && loop_vinfo->skip_this_loop_edge
6084 && single_succ_p (exit_bb)
6085 && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
6086 {
6087 unify_with_main_loop_p = true;
6088
6089 basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
6090 reduc_inputs[0] = make_ssa_name (vectype);
6091 gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
6092 add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
6093 UNKNOWN_LOCATION);
6094 add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
6095 loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
6096 exit_gsi = gsi_after_labels (reduc_block);
6097 }
6098
6099 /* Shouldn't be used beyond this point. */
6100 exit_bb = nullptr;
6101
6102 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6103 && reduc_fn != IFN_LAST)
6104 {
6105 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
6106 various data values where the condition matched and another vector
6107 (INDUCTION_INDEX) containing all the indexes of those matches. We
6108 need to extract the last matching index (which will be the index with
6109 highest value) and use this to index into the data vector.
6110 For the case where there were no matches, the data vector will contain
6111 all default values and the index vector will be all zeros. */
6112
6113 /* Get various versions of the type of the vector of indexes. */
6114 tree index_vec_type = TREE_TYPE (induction_index);
6115 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
6116 tree index_scalar_type = TREE_TYPE (index_vec_type);
6117 tree index_vec_cmp_type = truth_type_for (index_vec_type);
6118
6119 /* Get an unsigned integer version of the type of the data vector. */
6120 int scalar_precision
6121 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6122 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
6123 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
6124 vectype);
6125
6126 /* First we need to create a vector (ZERO_VEC) of zeros and another
6127 vector (MAX_INDEX_VEC) filled with the last matching index, which we
6128 can create using a MAX reduction and then expanding.
6129 In the case where the loop never made any matches, the max index will
6130 be zero. */
6131
6132 /* Vector of {0, 0, 0,...}. */
6133 tree zero_vec = build_zero_cst (vectype);
6134
6135 /* Find maximum value from the vector of found indexes. */
6136 tree max_index = make_ssa_name (index_scalar_type);
6137 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6138 1, induction_index);
6139 gimple_call_set_lhs (max_index_stmt, max_index);
6140 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
6141
6142 /* Vector of {max_index, max_index, max_index,...}. */
6143 tree max_index_vec = make_ssa_name (index_vec_type);
6144 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
6145 max_index);
6146 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
6147 max_index_vec_rhs);
6148 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
6149
6150 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
6151 with the vector (INDUCTION_INDEX) of found indexes, choosing values
6152 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
6153 otherwise. Only one value should match, resulting in a vector
6154 (VEC_COND) with one data value and the rest zeros.
6155 In the case where the loop never made any matches, every index will
6156 match, resulting in a vector with all data values (which will all be
6157 the default value). */
6158
6159 /* Compare the max index vector to the vector of found indexes to find
6160 the position of the max value. */
6161 tree vec_compare = make_ssa_name (index_vec_cmp_type);
6162 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
6163 induction_index,
6164 max_index_vec);
6165 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
6166
6167 /* Use the compare to choose either values from the data vector or
6168 zero. */
6169 tree vec_cond = make_ssa_name (vectype);
6170 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
6171 vec_compare,
6172 reduc_inputs[0],
6173 zero_vec);
6174 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
6175
6176 /* Finally we need to extract the data value from the vector (VEC_COND)
6177 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
6178 reduction, but because this doesn't exist, we can use a MAX reduction
6179 instead. The data value might be signed or a float so we need to cast
6180 it first.
6181 In the case where the loop never made any matches, the data values are
6182 all identical, and so will reduce down correctly. */
6183
6184 /* Make the matched data values unsigned. */
6185 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
6186 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
6187 vec_cond);
6188 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
6189 VIEW_CONVERT_EXPR,
6190 vec_cond_cast_rhs);
6191 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
6192
6193 /* Reduce down to a scalar value. */
6194 tree data_reduc = make_ssa_name (scalar_type_unsigned);
6195 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
6196 1, vec_cond_cast);
6197 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
6198 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
6199
6200 /* Convert the reduced value back to the result type and set as the
6201 result. */
6202 gimple_seq stmts = NULL;
6203 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
6204 data_reduc);
6205 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6206 scalar_results.safe_push (new_temp);
6207 }
6208 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
6209 && reduc_fn == IFN_LAST)
6210 {
6211 /* Condition reduction without supported IFN_REDUC_MAX. Generate
6212 idx = 0;
6213 idx_val = induction_index[0];
6214 val = data_reduc[0];
6215 for (idx = 0, val = init, i = 0; i < nelts; ++i)
6216 if (induction_index[i] > idx_val)
6217 val = data_reduc[i], idx_val = induction_index[i];
6218 return val; */
6219
6220 tree data_eltype = TREE_TYPE (vectype);
6221 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
6222 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
6223 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
6224 /* Enforced by vectorizable_reduction, which ensures we have target
6225 support before allowing a conditional reduction on variable-length
6226 vectors. */
6227 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
6228 tree idx_val = NULL_TREE, val = NULL_TREE;
6229 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
6230 {
6231 tree old_idx_val = idx_val;
6232 tree old_val = val;
6233 idx_val = make_ssa_name (idx_eltype);
6234 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
6235 build3 (BIT_FIELD_REF, idx_eltype,
6236 induction_index,
6237 bitsize_int (el_size),
6238 bitsize_int (off)));
6239 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6240 val = make_ssa_name (data_eltype);
6241 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
6242 build3 (BIT_FIELD_REF,
6243 data_eltype,
6244 reduc_inputs[0],
6245 bitsize_int (el_size),
6246 bitsize_int (off)));
6247 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6248 if (off != 0)
6249 {
6250 tree new_idx_val = idx_val;
6251 if (off != v_size - el_size)
6252 {
6253 new_idx_val = make_ssa_name (idx_eltype);
6254 epilog_stmt = gimple_build_assign (new_idx_val,
6255 MAX_EXPR, idx_val,
6256 old_idx_val);
6257 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6258 }
6259 tree cond = make_ssa_name (boolean_type_node);
6260 epilog_stmt = gimple_build_assign (cond, GT_EXPR,
6261 idx_val, old_idx_val);
6262 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6263 tree new_val = make_ssa_name (data_eltype);
6264 epilog_stmt = gimple_build_assign (new_val, COND_EXPR,
6265 cond, val, old_val);
6266 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6267 idx_val = new_idx_val;
6268 val = new_val;
6269 }
6270 }
6271 /* Convert the reduced value back to the result type and set as the
6272 result. */
6273 gimple_seq stmts = NULL;
6274 val = gimple_convert (&stmts, scalar_type, val);
6275 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6276 scalar_results.safe_push (val);
6277 }
6278
6279 /* 2.3 Create the reduction code, using one of the three schemes described
6280 above. In SLP we simply need to extract all the elements from the
6281 vector (without reducing them), so we use scalar shifts. */
6282 else if (reduc_fn != IFN_LAST && !slp_reduc)
6283 {
6284 tree tmp;
6285 tree vec_elem_type;
6286
6287 /* Case 1: Create:
6288 v_out2 = reduc_expr <v_out1> */
6289
6290 if (dump_enabled_p ())
6291 dump_printf_loc (MSG_NOTE, vect_location,
6292 "Reduce using direct vector reduction.\n");
6293
6294 gimple_seq stmts = NULL;
6295 vec_elem_type = TREE_TYPE (vectype);
6296 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
6297 vec_elem_type, reduc_inputs[0]);
6298 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6299 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6300
6301 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6302 && induc_val)
6303 {
6304 /* Earlier we set the initial value to be a vector of induc_val
6305 values. Check the result and if it is induc_val then replace
6306 with the original initial value, unless induc_val is
6307 the same as initial_def already. */
6308 tree zcompare = make_ssa_name (boolean_type_node);
6309 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR,
6310 new_temp, induc_val);
6311 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6312 tree initial_def = reduc_info->reduc_initial_values[0];
6313 tmp = make_ssa_name (new_scalar_dest);
6314 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6315 initial_def, new_temp);
6316 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6317 new_temp = tmp;
6318 }
6319
6320 scalar_results.safe_push (new_temp);
6321 }
6322 else if (direct_slp_reduc)
6323 {
6324 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
6325 with the elements for other SLP statements replaced with the
6326 neutral value. We can then do a normal reduction on each vector. */
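/* Hypothetical illustration: with GROUP_SIZE = 2 and a four-lane input
   {a0, b0, a1, b1}, the loop below builds {a0, I, a1, I} for the first
   result and {I, b0, I, b1} for the second, where I is the neutral or
   initial value, and then reduces each of those vectors normally.  */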
6327
6328 /* Enforced by vectorizable_reduction. */
6329 gcc_assert (reduc_inputs.length () == 1);
6330 gcc_assert (pow2p_hwi (group_size));
6331
6332 gimple_seq seq = NULL;
6333
6334 /* Build a vector {0, 1, 2, ...}, with the same number of elements
6335 and the same element size as VECTYPE. */
6336 tree index = build_index_vector (vectype, 0, 1);
6337 tree index_type = TREE_TYPE (index);
6338 tree index_elt_type = TREE_TYPE (index_type);
6339 tree mask_type = truth_type_for (index_type);
6340
6341 /* Create a vector that, for each element, identifies which of
6342 the REDUC_GROUP_SIZE results should use it. */
6343 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
6344 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
6345 build_vector_from_val (index_type, index_mask));
6346
6347 /* Get a neutral vector value. This is simply a splat of the neutral
6348 scalar value if we have one, otherwise the initial scalar value
6349 is itself a neutral value. */
6350 tree vector_identity = NULL_TREE;
6351 tree neutral_op = NULL_TREE;
6352 if (slp_node)
6353 {
6354 tree initial_value = NULL_TREE;
6355 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6356 initial_value = reduc_info->reduc_initial_values[0];
6357 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
6358 initial_value);
6359 }
6360 if (neutral_op)
6361 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6362 neutral_op);
6363 for (unsigned int i = 0; i < group_size; ++i)
6364 {
6365 /* If there's no universal neutral value, we can use the
6366 initial scalar value from the original PHI. This is used
6367 for MIN and MAX reduction, for example. */
6368 if (!neutral_op)
6369 {
6370 tree scalar_value = reduc_info->reduc_initial_values[i];
6371 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
6372 scalar_value);
6373 vector_identity = gimple_build_vector_from_val (&seq, vectype,
6374 scalar_value);
6375 }
6376
6377 /* Calculate the equivalent of:
6378
6379 sel[j] = (index[j] == i);
6380
6381 which selects the elements of REDUC_INPUTS[0] that should
6382 be included in the result. */
6383 tree compare_val = build_int_cst (index_elt_type, i);
6384 compare_val = build_vector_from_val (index_type, compare_val);
6385 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
6386 index, compare_val);
6387
6388 /* Calculate the equivalent of:
6389
6390 vec = seq ? reduc_inputs[0] : vector_identity;
6391
6392 VEC is now suitable for a full vector reduction. */
6393 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
6394 sel, reduc_inputs[0], vector_identity);
6395
6396 /* Do the reduction and convert it to the appropriate type. */
6397 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
6398 TREE_TYPE (vectype), vec);
6399 scalar = gimple_convert (&seq, scalar_type, scalar);
6400 scalar_results.safe_push (scalar);
6401 }
6402 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
6403 }
6404 else
6405 {
6406 bool reduce_with_shift;
6407 tree vec_temp;
6408
6409 gcc_assert (slp_reduc || reduc_inputs.length () == 1);
6410
6411 /* See if the target wants to do the final (shift) reduction
6412 in a vector mode of smaller size and first reduce upper/lower
6413 halves against each other. */
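 /* Purely illustrative example (the actual choice is up to the target hook):
 if REDUC_INPUTS[0] is a 256-bit V8SI and split_reduction prefers a 128-bit
 V4SI, vect_create_partial_epilog below first combines the two V4SI halves
 with CODE, so the shift reduction only has to handle four elements instead
 of eight. */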
6414 enum machine_mode mode1 = mode;
6415 tree stype = TREE_TYPE (vectype);
6416 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
6417 unsigned nunits1 = nunits;
6418 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
6419 && reduc_inputs.length () == 1)
6420 {
6421 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6422 /* For SLP reductions we have to make sure lanes match up, but
6423 since we're doing an individual-element final reduction, reducing
6424 the vector width here is even more important.
6425 ??? We can also separate lanes with permutes; for the common
6426 case of a power-of-two group size, odd/even extracts would work. */
6427 if (slp_reduc && nunits != nunits1)
6428 {
6429 nunits1 = least_common_multiple (nunits1, group_size);
6430 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
6431 }
6432 }
6433 if (!slp_reduc
6434 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
6435 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
6436
6437 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
6438 stype, nunits1);
6439 reduce_with_shift = have_whole_vector_shift (mode1);
6440 if (!VECTOR_MODE_P (mode1)
6441 || !directly_supported_p (code, vectype1))
6442 reduce_with_shift = false;
6443
6444 /* First reduce the vector to the desired vector size on which we
6445 should do the shift reduction, by combining upper and lower halves. */
6446 gimple_seq stmts = NULL;
6447 new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
6448 code, &stmts);
6449 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6450 reduc_inputs[0] = new_temp;
6451
6452 if (reduce_with_shift && !slp_reduc)
6453 {
6454 int element_bitsize = tree_to_uhwi (bitsize);
6455 /* Enforced by vectorizable_reduction, which disallows SLP reductions
6456 for variable-length vectors and also requires direct target support
6457 for loop reductions. */
6458 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6459 int nelements = vec_size_in_bits / element_bitsize;
6460 vec_perm_builder sel;
6461 vec_perm_indices indices;
6462
6463 int elt_offset;
6464
6465 tree zero_vec = build_zero_cst (vectype1);
6466 /* Case 2: Create:
6467 for (offset = nelements/2; offset >= 1; offset/=2)
6468 {
6469 Create: va' = vec_shift <va, offset>
6470 Create: va = vop <va, va'>
6471 } */
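 /* A worked instance of the scheme above (illustrative only, shown for
 PLUS with nelements == 4 and input v): the first iteration combines v
 with a copy shifted by two elements, giving partial sums such as
 { v[0]+v[2], v[1]+v[3], ... }; the second iteration shifts by one and
 leaves the full sum v[0]+v[1]+v[2]+v[3] in element 0, which is what the
 BIT_FIELD_REF extraction below reads. */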
6472
6473 tree rhs;
6474
6475 if (dump_enabled_p ())
6476 dump_printf_loc (MSG_NOTE, vect_location,
6477 "Reduce using vector shifts\n");
6478
6479 gimple_seq stmts = NULL;
6480 new_temp = gimple_convert (&stmts, vectype1, new_temp);
6481 for (elt_offset = nelements / 2;
6482 elt_offset >= 1;
6483 elt_offset /= 2)
6484 {
6485 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
6486 indices.new_vector (sel, 2, nelements);
6487 tree mask = vect_gen_perm_mask_any (vectype1, indices);
6488 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
6489 new_temp, zero_vec, mask);
6490 new_temp = gimple_build (&stmts, code,
6491 vectype1, new_name, new_temp);
6492 }
6493 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6494
6495 /* 2.4 Extract the final scalar result. Create:
6496 s_out3 = extract_field <v_out2, bitpos> */
6497
6498 if (dump_enabled_p ())
6499 dump_printf_loc (MSG_NOTE, vect_location,
6500 "extract scalar result\n");
6501
6502 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
6503 bitsize, bitsize_zero_node);
6504 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
6505 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
6506 gimple_assign_set_lhs (epilog_stmt, new_temp);
6507 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6508 scalar_results.safe_push (new_temp);
6509 }
6510 else
6511 {
6512 /* Case 3: Create:
6513 s = extract_field <v_out2, 0>
6514 for (offset = element_size;
6515 offset < vector_size;
6516 offset += element_size;)
6517 {
6518 Create: s' = extract_field <v_out2, offset>
6519 Create: s = op <s, s'> // For non SLP cases
6520 } */
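 /* For example (illustrative only), with a four-element v_out2 and a
 non-SLP PLUS reduction the loop below emits
 s = v_out2[0]; s = s + v_out2[1]; s = s + v_out2[2]; s = s + v_out2[3];
 whereas for SLP each extracted element is pushed into SCALAR_RESULTS
 without being combined. */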
6521
6522 if (dump_enabled_p ())
6523 dump_printf_loc (MSG_NOTE, vect_location,
6524 "Reduce using scalar code.\n");
6525
6526 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
6527 int element_bitsize = tree_to_uhwi (bitsize);
6528 tree compute_type = TREE_TYPE (vectype);
6529 gimple_seq stmts = NULL;
6530 FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
6531 {
6532 int bit_offset;
6533 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
6534 vec_temp, bitsize, bitsize_zero_node);
6535
6536 /* In SLP we don't need to apply the reduction operation, so we just
6537 collect the s' values in SCALAR_RESULTS. */
6538 if (slp_reduc)
6539 scalar_results.safe_push (new_temp);
6540
6541 for (bit_offset = element_bitsize;
6542 bit_offset < vec_size_in_bits;
6543 bit_offset += element_bitsize)
6544 {
6545 tree bitpos = bitsize_int (bit_offset);
6546 new_name = gimple_build (&stmts, BIT_FIELD_REF,
6547 compute_type, vec_temp,
6548 bitsize, bitpos);
6549 if (slp_reduc)
6550 {
6551 /* In SLP we don't need to apply the reduction operation, so
6552 we just collect the s' values in SCALAR_RESULTS. */
6553 new_temp = new_name;
6554 scalar_results.safe_push (new_name);
6555 }
6556 else
6557 new_temp = gimple_build (&stmts, code, compute_type,
6558 new_name, new_temp);
6559 }
6560 }
6561
6562 /* The only case where we need to reduce scalar results in SLP is
6563 unrolling. If the size of SCALAR_RESULTS is greater than
6564 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
6565 REDUC_GROUP_SIZE. */
6566 if (slp_reduc)
6567 {
6568 tree res, first_res, new_res;
6569
6570 /* Reduce multiple scalar results in case of SLP unrolling. */
6571 for (j = group_size; scalar_results.iterate (j, &res);
6572 j++)
6573 {
6574 first_res = scalar_results[j % group_size];
6575 new_res = gimple_build (&stmts, code, compute_type,
6576 first_res, res);
6577 scalar_results[j % group_size] = new_res;
6578 }
6579 scalar_results.truncate (group_size);
6580 for (k = 0; k < group_size; k++)
6581 scalar_results[k] = gimple_convert (&stmts, scalar_type,
6582 scalar_results[k]);
6583 }
6584 else
6585 {
6586 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
6587 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6588 scalar_results.safe_push (new_temp);
6589 }
6590
6591 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6592 }
6593
6594 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6595 && induc_val)
6596 {
6597 /* Earlier we set the initial value to be a vector of induc_val
6598 values. Check the result and if it is induc_val then replace
6599 it with the original initial value, unless induc_val is
6600 the same as initial_def already. */
6601 tree zcompare = make_ssa_name (boolean_type_node);
6602 epilog_stmt = gimple_build_assign (zcompare, EQ_EXPR, new_temp,
6603 induc_val);
6604 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6605 tree initial_def = reduc_info->reduc_initial_values[0];
6606 tree tmp = make_ssa_name (new_scalar_dest);
6607 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
6608 initial_def, new_temp);
6609 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
6610 scalar_results[0] = tmp;
6611 }
6612 }
6613
6614 /* 2.5 Adjust the final result by the initial value of the reduction
6615 variable. (When such adjustment is not needed, then
6616 'adjustment_def' is zero). For example, if code is PLUS we create:
6617 new_temp = loop_exit_def + adjustment_def */
6618
6619 if (adjustment_def)
6620 {
6621 gcc_assert (!slp_reduc);
6622 gimple_seq stmts = NULL;
6623 if (double_reduc)
6624 {
6625 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
6626 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
6627 new_temp = gimple_build (&stmts, code, vectype,
6628 reduc_inputs[0], adjustment_def);
6629 }
6630 else
6631 {
6632 new_temp = scalar_results[0];
6633 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
6634 adjustment_def = gimple_convert (&stmts, TREE_TYPE (vectype),
6635 adjustment_def);
6636 new_temp = gimple_convert (&stmts, TREE_TYPE (vectype), new_temp);
6637 new_temp = gimple_build (&stmts, code, TREE_TYPE (vectype),
6638 new_temp, adjustment_def);
6639 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
6640 }
6641
6642 epilog_stmt = gimple_seq_last_stmt (stmts);
6643 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
6644 scalar_results[0] = new_temp;
6645 }
6646
6647 /* Record this operation if it could be reused by the epilogue loop. */
6648 if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION
6649 && reduc_inputs.length () == 1)
6650 loop_vinfo->reusable_accumulators.put (scalar_results[0],
6651 { orig_reduc_input, reduc_info });
6652
6653 if (double_reduc)
6654 loop = outer_loop;
6655
6656 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
6657 phis with new adjusted scalar results, i.e., replace use <s_out0>
6658 with use <s_out4>.
6659
6660 Transform:
6661 loop_exit:
6662 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6663 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6664 v_out2 = reduce <v_out1>
6665 s_out3 = extract_field <v_out2, 0>
6666 s_out4 = adjust_result <s_out3>
6667 use <s_out0>
6668 use <s_out0>
6669
6670 into:
6671
6672 loop_exit:
6673 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
6674 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
6675 v_out2 = reduce <v_out1>
6676 s_out3 = extract_field <v_out2, 0>
6677 s_out4 = adjust_result <s_out3>
6678 use <s_out4>
6679 use <s_out4> */
6680
6681 gcc_assert (live_out_stmts.size () == scalar_results.length ());
6682 for (k = 0; k < live_out_stmts.size (); k++)
6683 {
6684 stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
6685 scalar_dest = gimple_get_lhs (scalar_stmt_info->stmt);
6686
6687 phis.create (3);
6688 /* Find the loop-closed-use at the loop exit of the original scalar
6689 result. (The reduction result is expected to have two immediate uses,
6690 one at the latch block, and one at the loop exit). For double
6691 reductions we are looking for exit phis of the outer loop. */
6692 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6693 {
6694 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6695 {
6696 if (!is_gimple_debug (USE_STMT (use_p)))
6697 phis.safe_push (USE_STMT (use_p));
6698 }
6699 else
6700 {
6701 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6702 {
6703 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6704
6705 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6706 {
6707 if (!flow_bb_inside_loop_p (loop,
6708 gimple_bb (USE_STMT (phi_use_p)))
6709 && !is_gimple_debug (USE_STMT (phi_use_p)))
6710 phis.safe_push (USE_STMT (phi_use_p));
6711 }
6712 }
6713 }
6714 }
6715
6716 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6717 {
6718 /* Replace the uses: */
6719 orig_name = PHI_RESULT (exit_phi);
6720
6721 /* Look for a single use at the target of the skip edge. */
6722 if (unify_with_main_loop_p)
6723 {
6724 use_operand_p use_p;
6725 gimple *user;
6726 if (!single_imm_use (orig_name, &use_p, &user))
6727 gcc_unreachable ();
6728 orig_name = gimple_get_lhs (user);
6729 }
6730
6731 scalar_result = scalar_results[k];
6732 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6733 {
6734 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6735 SET_USE (use_p, scalar_result);
6736 update_stmt (use_stmt);
6737 }
6738 }
6739
6740 phis.release ();
6741 }
6742 }
6743
6744 /* Return a vector of type VECTYPE that is equal to the vector select
6745 operation "MASK ? VEC : IDENTITY". Insert the select statements
6746 before GSI. */
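/* For instance (illustrative only): with MASK = { 1, 0, 1, 0 },
 VEC = { a, b, c, d } and IDENTITY = { 0, 0, 0, 0 } the returned
 SSA name holds { a, 0, c, 0 }. */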
6747
6748 static tree
6749 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6750 tree vec, tree identity)
6751 {
6752 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6753 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6754 mask, vec, identity);
6755 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6756 return cond;
6757 }
6758
6759 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6760 order, starting with LHS. Insert the extraction statements before GSI and
6761 associate the new scalar SSA names with variable SCALAR_DEST.
6762 Return the SSA name for the result. */
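/* Illustrative expansion (not from a testcase): for a four-element
 VECTOR_RHS v this emits one BIT_FIELD_REF extraction per element and
 computes
 (((LHS code v[0]) code v[1]) code v[2]) code v[3]
 preserving the original left-to-right association. */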
6763
6764 static tree
6765 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6766 tree_code code, tree lhs, tree vector_rhs)
6767 {
6768 tree vectype = TREE_TYPE (vector_rhs);
6769 tree scalar_type = TREE_TYPE (vectype);
6770 tree bitsize = TYPE_SIZE (scalar_type);
6771 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6772 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6773
6774 for (unsigned HOST_WIDE_INT bit_offset = 0;
6775 bit_offset < vec_size_in_bits;
6776 bit_offset += element_bitsize)
6777 {
6778 tree bitpos = bitsize_int (bit_offset);
6779 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6780 bitsize, bitpos);
6781
6782 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6783 rhs = make_ssa_name (scalar_dest, stmt);
6784 gimple_assign_set_lhs (stmt, rhs);
6785 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6786
6787 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6788 tree new_name = make_ssa_name (scalar_dest, stmt);
6789 gimple_assign_set_lhs (stmt, new_name);
6790 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6791 lhs = new_name;
6792 }
6793 return lhs;
6794 }
6795
6796 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6797 type of the vector input. */
6798
6799 static internal_fn
6800 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6801 {
6802 internal_fn mask_reduc_fn;
6803
6804 switch (reduc_fn)
6805 {
6806 case IFN_FOLD_LEFT_PLUS:
6807 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6808 break;
6809
6810 default:
6811 return IFN_LAST;
6812 }
6813
6814 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6815 OPTIMIZE_FOR_SPEED))
6816 return mask_reduc_fn;
6817 return IFN_LAST;
6818 }
6819
6820 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6821 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6822 statement. CODE is the operation performed by STMT_INFO and OPS are
6823 its scalar operands. REDUC_INDEX is the index of the operand in
6824 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6825 implements in-order reduction, or IFN_LAST if we should open-code it.
6826 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6827 that should be used to control the operation in a fully-masked loop. */
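/* A rough sketch of the generated code (illustrative only): for an in-order
 float addition in a fully-masked loop with a direct IFN_MASK_FOLD_LEFT_PLUS,
 each vector iteration becomes
 reduc_var = MASK_FOLD_LEFT_PLUS (reduc_var, vec_def, loop_mask);
 without a masked variant the vector operand is first merged with a zero
 identity under the mask, and when no direct reduction function exists at
 all it is expanded element by element via vect_expand_fold_left, which
 keeps the sequential order. */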
6828
6829 static bool
6830 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6831 stmt_vec_info stmt_info,
6832 gimple_stmt_iterator *gsi,
6833 gimple **vec_stmt, slp_tree slp_node,
6834 gimple *reduc_def_stmt,
6835 tree_code code, internal_fn reduc_fn,
6836 tree ops[3], tree vectype_in,
6837 int reduc_index, vec_loop_masks *masks)
6838 {
6839 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6840 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6841 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6842
6843 int ncopies;
6844 if (slp_node)
6845 ncopies = 1;
6846 else
6847 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6848
6849 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6850 gcc_assert (ncopies == 1);
6851 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6852
6853 if (slp_node)
6854 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6855 TYPE_VECTOR_SUBPARTS (vectype_in)));
6856
6857 tree op0 = ops[1 - reduc_index];
6858
6859 int group_size = 1;
6860 stmt_vec_info scalar_dest_def_info;
6861 auto_vec<tree> vec_oprnds0;
6862 if (slp_node)
6863 {
6864 auto_vec<vec<tree> > vec_defs (2);
6865 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6866 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6867 vec_defs[0].release ();
6868 vec_defs[1].release ();
6869 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6870 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6871 }
6872 else
6873 {
6874 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6875 op0, &vec_oprnds0);
6876 scalar_dest_def_info = stmt_info;
6877 }
6878
6879 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6880 tree scalar_type = TREE_TYPE (scalar_dest);
6881 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6882
6883 int vec_num = vec_oprnds0.length ();
6884 gcc_assert (vec_num == 1 || slp_node);
6885 tree vec_elem_type = TREE_TYPE (vectype_out);
6886 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6887
6888 tree vector_identity = NULL_TREE;
6889 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6890 vector_identity = build_zero_cst (vectype_out);
6891
6892 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6893 int i;
6894 tree def0;
6895 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6896 {
6897 gimple *new_stmt;
6898 tree mask = NULL_TREE;
6899 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6900 mask = vect_get_loop_mask (loop_vinfo, gsi, masks, vec_num, vectype_in, i);
6901
6902 /* Handle MINUS by adding the negative. */
6903 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6904 {
6905 tree negated = make_ssa_name (vectype_out);
6906 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6907 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6908 def0 = negated;
6909 }
6910
6911 if (mask && mask_reduc_fn == IFN_LAST)
6912 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6913 vector_identity);
6914
6915 /* On the first iteration the input is simply the scalar phi
6916 result, and for subsequent iterations it is the output of
6917 the preceding operation. */
6918 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6919 {
6920 if (mask && mask_reduc_fn != IFN_LAST)
6921 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6922 def0, mask);
6923 else
6924 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6925 def0);
6926 /* For chained SLP reductions the output of the previous reduction
6927 operation serves as the input of the next. For the final statement
6928 the output cannot be a temporary - we reuse the original
6929 scalar destination of the last statement. */
6930 if (i != vec_num - 1)
6931 {
6932 gimple_set_lhs (new_stmt, scalar_dest_var);
6933 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6934 gimple_set_lhs (new_stmt, reduc_var);
6935 }
6936 }
6937 else
6938 {
6939 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6940 reduc_var, def0);
6941 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6942 /* Remove the statement, so that we can use the same code paths
6943 as for statements that we've just created. */
6944 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6945 gsi_remove (&tmp_gsi, true);
6946 }
6947
6948 if (i == vec_num - 1)
6949 {
6950 gimple_set_lhs (new_stmt, scalar_dest);
6951 vect_finish_replace_stmt (loop_vinfo,
6952 scalar_dest_def_info,
6953 new_stmt);
6954 }
6955 else
6956 vect_finish_stmt_generation (loop_vinfo,
6957 scalar_dest_def_info,
6958 new_stmt, gsi);
6959
6960 if (slp_node)
6961 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6962 else
6963 {
6964 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6965 *vec_stmt = new_stmt;
6966 }
6967 }
6968
6969 return true;
6970 }
6971
6972 /* Function is_nonwrapping_integer_induction.
6973
6974 Check if STMT_VINFO (which is part of loop LOOP) is an induction that
6975 both increments and does not cause overflow. */
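/* A worked example (illustrative only): for an unsigned 16-bit IV with
 base 0 and step 1, a loop that executes at most 60000 times yields a
 maximum value of 60000, which fits in 16 bits and is accepted; a bound
 of 70000 would need 17 bits and the function returns false. Types with
 undefined overflow are accepted without computing the bound. */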
6976
6977 static bool
6978 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6979 {
6980 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6981 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6982 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6983 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6984 widest_int ni, max_loop_value, lhs_max;
6985 wi::overflow_type overflow = wi::OVF_NONE;
6986
6987 /* Make sure the loop is integer based. */
6988 if (TREE_CODE (base) != INTEGER_CST
6989 || TREE_CODE (step) != INTEGER_CST)
6990 return false;
6991
6992 /* Check that the max size of the loop will not wrap. */
6993
6994 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6995 return true;
6996
6997 if (! max_stmt_executions (loop, &ni))
6998 return false;
6999
7000 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
7001 &overflow);
7002 if (overflow)
7003 return false;
7004
7005 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
7006 TYPE_SIGN (lhs_type), &overflow);
7007 if (overflow)
7008 return false;
7009
7010 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
7011 <= TYPE_PRECISION (lhs_type));
7012 }
7013
7014 /* Check if masking can be supported by inserting a conditional expression.
7015 CODE is the code for the operation. COND_FN is the conditional internal
7016 function, if it exists. VECTYPE_IN is the type of the vector input. */
7017 static bool
7018 use_mask_by_cond_expr_p (code_helper code, internal_fn cond_fn,
7019 tree vectype_in)
7020 {
7021 if (cond_fn != IFN_LAST
7022 && direct_internal_fn_supported_p (cond_fn, vectype_in,
7023 OPTIMIZE_FOR_SPEED))
7024 return false;
7025
7026 if (code.is_tree_code ())
7027 switch (tree_code (code))
7028 {
7029 case DOT_PROD_EXPR:
7030 case SAD_EXPR:
7031 return true;
7032
7033 default:
7034 break;
7035 }
7036 return false;
7037 }
7038
7039 /* Insert a conditional expression to enable masked vectorization. CODE is the
7040 code for the operation. VOP is the array of operands. MASK is the loop
7041 mask. GSI is a statement iterator used to place the new conditional
7042 expression. */
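/* For example (illustrative only): for DOT_PROD_EXPR <a, b, acc> the
 second operand becomes MASK ? b : 0, so inactive lanes add nothing to
 the accumulator; for SAD_EXPR <a, b, acc> it becomes MASK ? b : a, so
 inactive lanes contribute |a - a| == 0. */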
7043 static void
7044 build_vect_cond_expr (code_helper code, tree vop[3], tree mask,
7045 gimple_stmt_iterator *gsi)
7046 {
7047 switch (tree_code (code))
7048 {
7049 case DOT_PROD_EXPR:
7050 {
7051 tree vectype = TREE_TYPE (vop[1]);
7052 tree zero = build_zero_cst (vectype);
7053 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7054 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7055 mask, vop[1], zero);
7056 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7057 vop[1] = masked_op1;
7058 break;
7059 }
7060
7061 case SAD_EXPR:
7062 {
7063 tree vectype = TREE_TYPE (vop[1]);
7064 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
7065 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
7066 mask, vop[1], vop[0]);
7067 gsi_insert_before (gsi, select, GSI_SAME_STMT);
7068 vop[1] = masked_op1;
7069 break;
7070 }
7071
7072 default:
7073 gcc_unreachable ();
7074 }
7075 }
7076
7077 /* Function vectorizable_reduction.
7078
7079 Check if STMT_INFO performs a reduction operation that can be vectorized.
7080 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
7081 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
7082 Return true if STMT_INFO is vectorizable in this way.
7083
7084 This function also handles reduction idioms (patterns) that have been
7085 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
7086 may be of this form:
7087 X = pattern_expr (arg0, arg1, ..., X)
7088 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
7089 sequence that had been detected and replaced by the pattern-stmt
7090 (STMT_INFO).
7091
7092 This function also handles reduction of condition expressions, for example:
7093 for (int i = 0; i < N; i++)
7094 if (a[i] < value)
7095 last = a[i];
7096 This is handled by vectorising the loop and creating an additional vector
7097 containing the loop indexes for which "a[i] < value" was true. In the
7098 function epilogue this is reduced to a single max value and then used to
7099 index into the vector of results.
7100
7101 In some cases of reduction patterns, the type of the reduction variable X is
7102 different than the type of the other arguments of STMT_INFO.
7103 In such cases, the vectype that is used when transforming STMT_INFO into
7104 a vector stmt is different than the vectype that is used to determine the
7105 vectorization factor, because it consists of a different number of elements
7106 than the actual number of elements that are being operated upon in parallel.
7107
7108 For example, consider an accumulation of shorts into an int accumulator.
7109 On some targets it's possible to vectorize this pattern operating on 8
7110 shorts at a time (hence, the vectype for purposes of determining the
7111 vectorization factor should be V8HI); on the other hand, the vectype that
7112 is used to create the vector form is actually V4SI (the type of the result).
7113
7114 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
7115 indicates what is the actual level of parallelism (V8HI in the example), so
7116 that the right vectorization factor would be derived. This vectype
7117 corresponds to the type of arguments to the reduction stmt, and should *NOT*
7118 be used to create the vectorized stmt. The right vectype for the vectorized
7119 stmt is obtained from the type of the result X:
7120 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7121
7122 This means that, contrary to "regular" reductions (or "regular" stmts in
7123 general), the following equation:
7124 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
7125 does *NOT* necessarily hold for reduction patterns. */
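/* A concrete sketch of the shorts-into-int example above (illustrative,
 not from any particular testcase):

 short a[N]; int sum = 0;
 for (int i = 0; i < N; i++)
 sum += a[i]; <-- recognized as sum = WIDEN_SUM_EXPR <a[i], sum>

 Here STMT_VINFO_VECTYPE of the pattern stmt is V8HI, since eight shorts
 per vector iteration determine the vectorization factor, while the
 vectorized statement itself is created with the V4SI type derived from
 get_vectype_for_scalar_type (vinfo, TREE_TYPE (sum)). */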
7126
7127 bool
7128 vectorizable_reduction (loop_vec_info loop_vinfo,
7129 stmt_vec_info stmt_info, slp_tree slp_node,
7130 slp_instance slp_node_instance,
7131 stmt_vector_for_cost *cost_vec)
7132 {
7133 tree vectype_in = NULL_TREE;
7134 tree vectype_op[3] = { NULL_TREE, NULL_TREE, NULL_TREE };
7135 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7136 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
7137 stmt_vec_info cond_stmt_vinfo = NULL;
7138 int i;
7139 int ncopies;
7140 bool single_defuse_cycle = false;
7141 bool nested_cycle = false;
7142 bool double_reduc = false;
7143 int vec_num;
7144 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
7145 tree cond_reduc_val = NULL_TREE;
7146
7147 /* Make sure it was already recognized as a reduction computation. */
7148 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
7149 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
7150 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
7151 return false;
7152
7153 /* The stmt we store reduction analysis meta on. */
7154 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7155 reduc_info->is_reduc_info = true;
7156
7157 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7158 {
7159 if (is_a <gphi *> (stmt_info->stmt))
7160 {
7161 if (slp_node)
7162 {
7163 /* We eventually need to set a vector type on invariant
7164 arguments. */
7165 unsigned j;
7166 slp_tree child;
7167 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
7168 if (!vect_maybe_update_slp_op_vectype
7169 (child, SLP_TREE_VECTYPE (slp_node)))
7170 {
7171 if (dump_enabled_p ())
7172 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7173 "incompatible vector types for "
7174 "invariants\n");
7175 return false;
7176 }
7177 }
7178 /* Analysis for double-reduction is done on the outer
7179 loop PHI, nested cycles have no further restrictions. */
7180 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
7181 }
7182 else
7183 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7184 return true;
7185 }
7186
7187 stmt_vec_info orig_stmt_of_analysis = stmt_info;
7188 stmt_vec_info phi_info = stmt_info;
7189 if (!is_a <gphi *> (stmt_info->stmt))
7190 {
7191 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7192 return true;
7193 }
7194 if (slp_node)
7195 {
7196 slp_node_instance->reduc_phis = slp_node;
7197 /* ??? We're leaving slp_node to point to the PHIs; we only
7198 need it to get at the number of vector stmts, which wasn't
7199 yet initialized for the instance root. */
7200 }
7201 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7202 {
7203 use_operand_p use_p;
7204 gimple *use_stmt;
7205 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
7206 &use_p, &use_stmt);
7207 gcc_assert (res);
7208 phi_info = loop_vinfo->lookup_stmt (use_stmt);
7209 }
7210
7211 /* PHIs should not participate in patterns. */
7212 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7213 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7214
7215 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
7216 and compute the reduction chain length. Discover the real
7217 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
7218 tree reduc_def
7219 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
7220 loop_latch_edge
7221 (gimple_bb (reduc_def_phi)->loop_father));
7222 unsigned reduc_chain_length = 0;
7223 bool only_slp_reduc_chain = true;
7224 stmt_info = NULL;
7225 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
7226 while (reduc_def != PHI_RESULT (reduc_def_phi))
7227 {
7228 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
7229 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
7230 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
7231 {
7232 if (dump_enabled_p ())
7233 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7234 "reduction chain broken by patterns.\n");
7235 return false;
7236 }
7237 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
7238 only_slp_reduc_chain = false;
7239 /* For epilogue generation live members of the chain need
7240 to point back to the PHI via their original stmt for
7241 info_for_reduction to work. For SLP we need to look at
7242 all lanes here - even though we only will vectorize from
7243 the SLP node with live lane zero the other live lanes also
7244 need to be identified as part of a reduction to be able
7245 to skip code generation for them. */
7246 if (slp_for_stmt_info)
7247 {
7248 for (auto s : SLP_TREE_SCALAR_STMTS (slp_for_stmt_info))
7249 if (STMT_VINFO_LIVE_P (s))
7250 STMT_VINFO_REDUC_DEF (vect_orig_stmt (s)) = phi_info;
7251 }
7252 else if (STMT_VINFO_LIVE_P (vdef))
7253 STMT_VINFO_REDUC_DEF (def) = phi_info;
7254 gimple_match_op op;
7255 if (!gimple_extract_op (vdef->stmt, &op))
7256 {
7257 if (dump_enabled_p ())
7258 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7259 "reduction chain includes unsupported"
7260 " statement type.\n");
7261 return false;
7262 }
7263 if (CONVERT_EXPR_CODE_P (op.code))
7264 {
7265 if (!tree_nop_conversion_p (op.type, TREE_TYPE (op.ops[0])))
7266 {
7267 if (dump_enabled_p ())
7268 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7269 "conversion in the reduction chain.\n");
7270 return false;
7271 }
7272 }
7273 else if (!stmt_info)
7274 /* First non-conversion stmt. */
7275 stmt_info = vdef;
7276 reduc_def = op.ops[STMT_VINFO_REDUC_IDX (vdef)];
7277 reduc_chain_length++;
7278 if (!stmt_info && slp_node)
7279 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
7280 }
7281 /* PHIs should not participate in patterns. */
7282 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
7283
7284 if (nested_in_vect_loop_p (loop, stmt_info))
7285 {
7286 loop = loop->inner;
7287 nested_cycle = true;
7288 }
7289
7290 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
7291 element. */
7292 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7293 {
7294 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
7295 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7296 }
7297 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7298 gcc_assert (slp_node
7299 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
7300
7301 /* 1. Is vectorizable reduction? */
7302 /* Not supportable if the reduction variable is used in the loop, unless
7303 it's a reduction chain. */
7304 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
7305 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7306 return false;
7307
7308 /* Reductions that are not used even in an enclosing outer-loop,
7309 are expected to be "live" (used out of the loop). */
7310 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
7311 && !STMT_VINFO_LIVE_P (stmt_info))
7312 return false;
7313
7314 /* 2. Has this been recognized as a reduction pattern?
7315
7316 Check if STMT represents a pattern that has been recognized
7317 in earlier analysis stages. For stmts that represent a pattern,
7318 the STMT_VINFO_RELATED_STMT field records the last stmt in
7319 the original sequence that constitutes the pattern. */
7320
7321 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
7322 if (orig_stmt_info)
7323 {
7324 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
7325 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
7326 }
7327
7328 /* 3. Check the operands of the operation. The first operands are defined
7329 inside the loop body. The last operand is the reduction variable,
7330 which is defined by the loop-header-phi. */
7331
7332 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7333 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
7334 gimple_match_op op;
7335 if (!gimple_extract_op (stmt_info->stmt, &op))
7336 gcc_unreachable ();
7337 bool lane_reduc_code_p = (op.code == DOT_PROD_EXPR
7338 || op.code == WIDEN_SUM_EXPR
7339 || op.code == SAD_EXPR);
7340
7341 if (!POINTER_TYPE_P (op.type) && !INTEGRAL_TYPE_P (op.type)
7342 && !SCALAR_FLOAT_TYPE_P (op.type))
7343 return false;
7344
7345 /* Do not try to vectorize bit-precision reductions. */
7346 if (!type_has_mode_precision_p (op.type))
7347 return false;
7348
7349 /* For lane-reducing ops we're reducing the number of reduction PHIs,
7350 which means the only use of the PHI may be in the lane-reducing operation. */
7351 if (lane_reduc_code_p
7352 && reduc_chain_length != 1
7353 && !only_slp_reduc_chain)
7354 {
7355 if (dump_enabled_p ())
7356 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7357 "lane-reducing reduction with extra stmts.\n");
7358 return false;
7359 }
7360
7361 /* All uses but the last are expected to be defined in the loop.
7362 The last use is the reduction variable. In case of nested cycle this
7363 assumption is not true: we use reduc_index to record the index of the
7364 reduction variable. */
7365 slp_tree *slp_op = XALLOCAVEC (slp_tree, op.num_ops);
7366 /* We need to skip an extra operand for COND_EXPRs with embedded
7367 comparison. */
7368 unsigned opno_adjust = 0;
7369 if (op.code == COND_EXPR && COMPARISON_CLASS_P (op.ops[0]))
7370 opno_adjust = 1;
7371 for (i = 0; i < (int) op.num_ops; i++)
7372 {
7373 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
7374 if (i == 0 && op.code == COND_EXPR)
7375 continue;
7376
7377 stmt_vec_info def_stmt_info;
7378 enum vect_def_type dt;
7379 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
7380 i + opno_adjust, &op.ops[i], &slp_op[i], &dt,
7381 &vectype_op[i], &def_stmt_info))
7382 {
7383 if (dump_enabled_p ())
7384 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7385 "use not simple.\n");
7386 return false;
7387 }
7388 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
7389 continue;
7390
7391 /* There should be only one cycle def in the stmt, the one
7392 leading to reduc_def. */
7393 if (VECTORIZABLE_CYCLE_DEF (dt))
7394 return false;
7395
7396 if (!vectype_op[i])
7397 vectype_op[i]
7398 = get_vectype_for_scalar_type (loop_vinfo,
7399 TREE_TYPE (op.ops[i]), slp_op[i]);
7400
7401 /* To properly compute ncopies we are interested in the widest
7402 non-reduction input type in case we're looking at a widening
7403 accumulation that we later handle in vect_transform_reduction. */
7404 if (lane_reduc_code_p
7405 && vectype_op[i]
7406 && (!vectype_in
7407 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
7408 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_op[i]))))))
7409 vectype_in = vectype_op[i];
7410
7411 if (op.code == COND_EXPR)
7412 {
7413 /* Record how the non-reduction-def value of COND_EXPR is defined. */
7414 if (dt == vect_constant_def)
7415 {
7416 cond_reduc_dt = dt;
7417 cond_reduc_val = op.ops[i];
7418 }
7419 if (dt == vect_induction_def
7420 && def_stmt_info
7421 && is_nonwrapping_integer_induction (def_stmt_info, loop))
7422 {
7423 cond_reduc_dt = dt;
7424 cond_stmt_vinfo = def_stmt_info;
7425 }
7426 }
7427 }
7428 if (!vectype_in)
7429 vectype_in = STMT_VINFO_VECTYPE (phi_info);
7430 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
7431
7432 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
7433 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
7434 /* If we have a condition reduction, see if we can simplify it further. */
7435 if (v_reduc_type == COND_REDUCTION)
7436 {
7437 if (slp_node)
7438 return false;
7439
7440 /* When the reduction value is used in the condition itself, fail. */
7441 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
7442 {
7443 if (dump_enabled_p ())
7444 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7445 "condition depends on previous iteration\n");
7446 return false;
7447 }
7448
7449 if (reduc_chain_length == 1
7450 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
7451 vectype_in, OPTIMIZE_FOR_SPEED))
7452 {
7453 if (dump_enabled_p ())
7454 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7455 "optimizing condition reduction with"
7456 " FOLD_EXTRACT_LAST.\n");
7457 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
7458 }
7459 else if (cond_reduc_dt == vect_induction_def)
7460 {
7461 tree base
7462 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
7463 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
7464
7465 gcc_assert (TREE_CODE (base) == INTEGER_CST
7466 && TREE_CODE (step) == INTEGER_CST);
7467 cond_reduc_val = NULL_TREE;
7468 enum tree_code cond_reduc_op_code = ERROR_MARK;
7469 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
7470 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
7471 ;
7472 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
7473 above base; punt if base is the minimum value of the type for
7474 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
7475 else if (tree_int_cst_sgn (step) == -1)
7476 {
7477 cond_reduc_op_code = MIN_EXPR;
7478 if (tree_int_cst_sgn (base) == -1)
7479 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7480 else if (tree_int_cst_lt (base,
7481 TYPE_MAX_VALUE (TREE_TYPE (base))))
7482 cond_reduc_val
7483 = int_const_binop (PLUS_EXPR, base, integer_one_node);
7484 }
7485 else
7486 {
7487 cond_reduc_op_code = MAX_EXPR;
7488 if (tree_int_cst_sgn (base) == 1)
7489 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
7490 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
7491 base))
7492 cond_reduc_val
7493 = int_const_binop (MINUS_EXPR, base, integer_one_node);
7494 }
7495 if (cond_reduc_val)
7496 {
7497 if (dump_enabled_p ())
7498 dump_printf_loc (MSG_NOTE, vect_location,
7499 "condition expression based on "
7500 "integer induction.\n");
7501 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
7502 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
7503 = cond_reduc_val;
7504 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
7505 }
7506 }
7507 else if (cond_reduc_dt == vect_constant_def)
7508 {
7509 enum vect_def_type cond_initial_dt;
7510 tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
7511 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
7512 if (cond_initial_dt == vect_constant_def
7513 && types_compatible_p (TREE_TYPE (cond_initial_val),
7514 TREE_TYPE (cond_reduc_val)))
7515 {
7516 tree e = fold_binary (LE_EXPR, boolean_type_node,
7517 cond_initial_val, cond_reduc_val);
7518 if (e && (integer_onep (e) || integer_zerop (e)))
7519 {
7520 if (dump_enabled_p ())
7521 dump_printf_loc (MSG_NOTE, vect_location,
7522 "condition expression based on "
7523 "compile time constant.\n");
7524 /* Record reduction code at analysis stage. */
7525 STMT_VINFO_REDUC_CODE (reduc_info)
7526 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
7527 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
7528 }
7529 }
7530 }
7531 }
7532
7533 if (STMT_VINFO_LIVE_P (phi_info))
7534 return false;
7535
7536 if (slp_node)
7537 ncopies = 1;
7538 else
7539 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7540
7541 gcc_assert (ncopies >= 1);
7542
7543 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
7544
7545 if (nested_cycle)
7546 {
7547 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
7548 == vect_double_reduction_def);
7549 double_reduc = true;
7550 }
7551
7552 /* 4.2. Check support for the epilog operation.
7553
7554 If STMT represents a reduction pattern, then the type of the
7555 reduction variable may be different than the type of the rest
7556 of the arguments. For example, consider the case of accumulation
7557 of shorts into an int accumulator; The original code:
7558 S1: int_a = (int) short_a;
7559 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7560
7561 was replaced with:
7562 STMT: int_acc = widen_sum <short_a, int_acc>
7563
7564 This means that:
7565 1. The tree-code that is used to create the vector operation in the
7566 epilog code (that reduces the partial results) is not the
7567 tree-code of STMT, but is rather the tree-code of the original
7568 stmt from the pattern that STMT is replacing. I.e, in the example
7569 above we want to use 'widen_sum' in the loop, but 'plus' in the
7570 epilog.
7571 2. The type (mode) we use to check available target support
7572 for the vector operation to be created in the *epilog*, is
7573 determined by the type of the reduction variable (in the example
7574 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
7575 However the type (mode) we use to check available target support
7576 for the vector operation to be created *inside the loop*, is
7577 determined by the type of the other arguments to STMT (in the
7578 example we'd check this: optab_handler (widen_sum_optab,
7579 vect_short_mode)).
7580
7581 This is contrary to "regular" reductions, in which the types of all
7582 the arguments are the same as the type of the reduction variable.
7583 For "regular" reductions we can therefore use the same vector type
7584 (and also the same tree-code) when generating the epilog code and
7585 when generating the code inside the loop. */
7586
7587 code_helper orig_code = STMT_VINFO_REDUC_CODE (phi_info);
7588 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
7589
7590 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7591 if (reduction_type == TREE_CODE_REDUCTION)
7592 {
7593 /* Check whether it's ok to change the order of the computation.
7594 Generally, when vectorizing a reduction we change the order of the
7595 computation. This may change the behavior of the program in some
7596 cases, so we need to check that this is ok. One exception is when
7597 vectorizing an outer-loop: the inner-loop is executed sequentially,
7598 and therefore vectorizing reductions in the inner-loop during
7599 outer-loop vectorization is safe. Likewise when we are vectorizing
7600 a series of reductions using SLP and the VF is one, the reductions
7601 are performed in scalar order. */
7602 if (slp_node
7603 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7604 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
7605 ;
7606 else if (needs_fold_left_reduction_p (op.type, orig_code))
7607 {
7608 /* When vectorizing a reduction chain w/o SLP the reduction PHI
7609 is not directly used in stmt. */
7610 if (!only_slp_reduc_chain
7611 && reduc_chain_length != 1)
7612 {
7613 if (dump_enabled_p ())
7614 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7615 "in-order reduction chain without SLP.\n");
7616 return false;
7617 }
7618 STMT_VINFO_REDUC_TYPE (reduc_info)
7619 = reduction_type = FOLD_LEFT_REDUCTION;
7620 }
7621 else if (!commutative_binary_op_p (orig_code, op.type)
7622 || !associative_binary_op_p (orig_code, op.type))
7623 {
7624 if (dump_enabled_p ())
7625 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7626 "reduction: not commutative/associative");
7627 return false;
7628 }
7629 }
7630
7631 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7632 && ncopies > 1)
7633 {
7634 if (dump_enabled_p ())
7635 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7636 "multiple types in double reduction or condition "
7637 "reduction or fold-left reduction.\n");
7638 return false;
7639 }
7640
7641 internal_fn reduc_fn = IFN_LAST;
7642 if (reduction_type == TREE_CODE_REDUCTION
7643 || reduction_type == FOLD_LEFT_REDUCTION
7644 || reduction_type == INTEGER_INDUC_COND_REDUCTION
7645 || reduction_type == CONST_COND_REDUCTION)
7646 {
7647 if (reduction_type == FOLD_LEFT_REDUCTION
7648 ? fold_left_reduction_fn (orig_code, &reduc_fn)
7649 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7650 {
7651 if (reduc_fn != IFN_LAST
7652 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7653 OPTIMIZE_FOR_SPEED))
7654 {
7655 if (dump_enabled_p ())
7656 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7657 "reduc op not supported by target.\n");
7658
7659 reduc_fn = IFN_LAST;
7660 }
7661 }
7662 else
7663 {
7664 if (!nested_cycle || double_reduc)
7665 {
7666 if (dump_enabled_p ())
7667 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7668 "no reduc code for scalar code.\n");
7669
7670 return false;
7671 }
7672 }
7673 }
7674 else if (reduction_type == COND_REDUCTION)
7675 {
7676 int scalar_precision
7677 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (op.type));
7678 cr_index_scalar_type = make_unsigned_type (scalar_precision);
7679 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
7680 vectype_out);
7681
7682 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7683 OPTIMIZE_FOR_SPEED))
7684 reduc_fn = IFN_REDUC_MAX;
7685 }
7686 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
7687
7688 if (reduction_type != EXTRACT_LAST_REDUCTION
7689 && (!nested_cycle || double_reduc)
7690 && reduc_fn == IFN_LAST
7691 && !nunits_out.is_constant ())
7692 {
7693 if (dump_enabled_p ())
7694 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7695 "missing target support for reduction on"
7696 " variable-length vectors.\n");
7697 return false;
7698 }
7699
7700 /* For SLP reductions, see if there is a neutral value we can use. */
7701 tree neutral_op = NULL_TREE;
7702 if (slp_node)
7703 {
7704 tree initial_value = NULL_TREE;
7705 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
7706 initial_value = vect_phi_initial_value (reduc_def_phi);
7707 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7708 orig_code, initial_value);
7709 }
7710
7711 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7712 {
7713 /* We can't support in-order reductions of code such as this:
7714
7715 for (int i = 0; i < n1; ++i)
7716 for (int j = 0; j < n2; ++j)
7717 l += a[j];
7718
7719 since GCC effectively transforms the loop when vectorizing:
7720
7721 for (int i = 0; i < n1 / VF; ++i)
7722 for (int j = 0; j < n2; ++j)
7723 for (int k = 0; k < VF; ++k)
7724 l += a[j];
7725
7726 which is a reassociation of the original operation. */
7727 if (dump_enabled_p ())
7728 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7729 "in-order double reduction not supported.\n");
7730
7731 return false;
7732 }
7733
7734 if (reduction_type == FOLD_LEFT_REDUCTION
7735 && slp_node
7736 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7737 {
7738 /* We cannot use in-order reductions in this case because there is
7739 an implicit reassociation of the operations involved. */
7740 if (dump_enabled_p ())
7741 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7742 "in-order unchained SLP reductions not supported.\n");
7743 return false;
7744 }
7745
7746 /* For double reductions, and for SLP reductions with a neutral value,
7747 we construct a variable-length initial vector by loading a vector
7748 full of the neutral value and then shift-and-inserting the start
7749 values into the low-numbered elements. */
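 /* For instance (illustrative only): for a PLUS reduction with a single
 start value INIT and neutral value 0, the initial vector is a splat of 0
 with INIT shifted into the lowest element, i.e. conceptually
 { INIT, 0, 0, ... } for a vector of unknown length. */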
7750 if ((double_reduc || neutral_op)
7751 && !nunits_out.is_constant ()
7752 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7753 vectype_out, OPTIMIZE_FOR_SPEED))
7754 {
7755 if (dump_enabled_p ())
7756 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7757 "reduction on variable-length vectors requires"
7758 " target support for a vector-shift-and-insert"
7759 " operation.\n");
7760 return false;
7761 }
7762
7763 /* Check extra constraints for variable-length unchained SLP reductions. */
7764 if (slp_node
7765 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7766 && !nunits_out.is_constant ())
7767 {
7768 /* We checked above that we could build the initial vector when
7769 there's a neutral element value. Check here for the case in
7770 which each SLP statement has its own initial value and in which
7771 that value needs to be repeated for every instance of the
7772 statement within the initial vector. */
7773 unsigned int group_size = SLP_TREE_LANES (slp_node);
7774 if (!neutral_op
7775 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7776 TREE_TYPE (vectype_out)))
7777 {
7778 if (dump_enabled_p ())
7779 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7780 "unsupported form of SLP reduction for"
7781 " variable-length vectors: cannot build"
7782 " initial vector.\n");
7783 return false;
7784 }
7785 /* The epilogue code relies on the number of elements being a multiple
7786 of the group size. The duplicate-and-interleave approach to setting
7787 up the initial vector does too. */
7788 if (!multiple_p (nunits_out, group_size))
7789 {
7790 if (dump_enabled_p ())
7791 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7792 "unsupported form of SLP reduction for"
7793 " variable-length vectors: the vector size"
7794 " is not a multiple of the number of results.\n");
7795 return false;
7796 }
7797 }
7798
7799 if (reduction_type == COND_REDUCTION)
7800 {
7801 widest_int ni;
7802
7803 if (! max_loop_iterations (loop, &ni))
7804 {
7805 if (dump_enabled_p ())
7806 dump_printf_loc (MSG_NOTE, vect_location,
7807 "loop count not known, cannot create cond "
7808 "reduction.\n");
7809 return false;
7810 }
7811 /* Convert backedges to iterations. */
7812 ni += 1;
7813
7814 /* The additional index will be the same type as the condition. Check
7815 that the loop iteration count fits into this type less one (because
7816 we'll use up the zero slot for when there are no matches). */
7817 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7818 if (wi::geu_p (ni, wi::to_widest (max_index)))
7819 {
7820 if (dump_enabled_p ())
7821 dump_printf_loc (MSG_NOTE, vect_location,
7822 "loop size is greater than data size.\n");
7823 return false;
7824 }
7825 }
7826
7827 /* In case the vectorization factor (VF) is bigger than the number
7828 of elements that we can fit in a vectype (nunits), we have to generate
7829 more than one vector stmt - i.e - we need to "unroll" the
7830 vector stmt by a factor VF/nunits. For more details see documentation
7831 in vectorizable_operation. */
7832
7833 /* If the reduction is used in an outer loop we need to generate
7834 VF intermediate results, like so (e.g. for ncopies=2):
7835 r0 = phi (init, r0)
7836 r1 = phi (init, r1)
7837 r0 = x0 + r0;
7838 r1 = x1 + r1;
7839 (i.e. we generate VF results in 2 registers).
7840 In this case we have a separate def-use cycle for each copy, and therefore
7841 for each copy we get the vector def for the reduction variable from the
7842 respective phi node created for this copy.
7843
7844 Otherwise (the reduction is unused in the loop nest), we can combine
7845 together intermediate results, like so (e.g. for ncopies=2):
7846 r = phi (init, r)
7847 r = x0 + r;
7848 r = x1 + r;
7849 (i.e. we generate VF/2 results in a single register).
7850 In this case for each copy we get the vector def for the reduction variable
7851 from the vectorized reduction operation generated in the previous iteration.
7852
7853 This only works when we see both the reduction PHI and its only consumer
7854 in vectorizable_reduction and there are no intermediate stmts
7855 participating. When unrolling we want each unrolled iteration to have its
7856 own reduction accumulator since one of the main goals of unrolling a
7857 reduction is to reduce the aggregate loop-carried latency. */
7858 if (ncopies > 1
7859 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7860 && reduc_chain_length == 1
7861 && loop_vinfo->suggested_unroll_factor == 1)
7862 single_defuse_cycle = true;
7863
7864 if (single_defuse_cycle || lane_reduc_code_p)
7865 {
7866 gcc_assert (op.code != COND_EXPR);
7867
7868 /* 4. Supportable by target? */
7869 bool ok = true;
7870
7871 /* 4.1. check support for the operation in the loop
7872
7873 This isn't necessary for the lane reduction codes, since they
7874 can only be produced by pattern matching, and it's up to the
7875 pattern matcher to test for support. The main reason for
7876 specifically skipping this step is to avoid rechecking whether
7877 mixed-sign dot-products can be implemented using signed
7878 dot-products. */
7879 machine_mode vec_mode = TYPE_MODE (vectype_in);
7880 if (!lane_reduc_code_p
7881 && !directly_supported_p (op.code, vectype_in, optab_vector))
7882 {
7883 if (dump_enabled_p ())
7884 dump_printf (MSG_NOTE, "op not supported by target.\n");
7885 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7886 || !vect_can_vectorize_without_simd_p (op.code))
7887 ok = false;
7888 else
7889 if (dump_enabled_p ())
7890 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7891 }
7892
7893 if (vect_emulated_vector_p (vectype_in)
7894 && !vect_can_vectorize_without_simd_p (op.code))
7895 {
7896 if (dump_enabled_p ())
7897 dump_printf (MSG_NOTE, "using word mode not possible.\n");
7898 return false;
7899 }
7900
7901 /* Lane-reducing operations have to go through vect_transform_reduction.
7902 For the other cases try without the single cycle optimization. */
7903 if (!ok)
7904 {
7905 if (lane_reduc_code_p)
7906 return false;
7907 else
7908 single_defuse_cycle = false;
7909 }
7910 }
7911 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7912
7913 /* If the reduction stmt is one of the patterns that have lane
7914 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
7915 if ((ncopies > 1 && ! single_defuse_cycle)
7916 && lane_reduc_code_p)
7917 {
7918 if (dump_enabled_p ())
7919 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7920 "multi def-use cycle not possible for lane-reducing "
7921 "reduction operation\n");
7922 return false;
7923 }
7924
7925 if (slp_node
7926 && !(!single_defuse_cycle
7927 && !lane_reduc_code_p
7928 && reduction_type != FOLD_LEFT_REDUCTION))
7929 for (i = 0; i < (int) op.num_ops; i++)
7930 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_op[i]))
7931 {
7932 if (dump_enabled_p ())
7933 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7934 "incompatible vector types for invariants\n");
7935 return false;
7936 }
7937
7938 if (slp_node)
7939 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7940 else
7941 vec_num = 1;
7942
7943 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7944 reduction_type, ncopies, cost_vec);
7945 /* Cost the reduction op inside the loop if transformed via
7946 vect_transform_reduction. Otherwise this is costed by the
7947 separate vectorizable_* routines. */
7948 if (single_defuse_cycle || lane_reduc_code_p)
7949 {
7950 int factor = 1;
7951 if (vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info))
7952 /* Three dot-products and a subtraction. */
7953 factor = 4;
7954 record_stmt_cost (cost_vec, ncopies * factor, vector_stmt,
7955 stmt_info, 0, vect_body);
7956 }
7957
7958 if (dump_enabled_p ()
7959 && reduction_type == FOLD_LEFT_REDUCTION)
7960 dump_printf_loc (MSG_NOTE, vect_location,
7961 "using an in-order (fold-left) reduction.\n");
7962 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7963 /* All but single defuse-cycle optimized, lane-reducing and fold-left
7964 reductions go through their own vectorizable_* routines. */
7965 if (!single_defuse_cycle
7966 && !lane_reduc_code_p
7967 && reduction_type != FOLD_LEFT_REDUCTION)
7968 {
7969 stmt_vec_info tem
7970 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7971 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7972 {
7973 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7974 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7975 }
7976 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7977 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7978 }
7979 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7980 {
7981 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7982 internal_fn cond_fn = get_conditional_internal_fn (op.code, op.type);
7983
7984 if (reduction_type != FOLD_LEFT_REDUCTION
7985 && !use_mask_by_cond_expr_p (op.code, cond_fn, vectype_in)
7986 && (cond_fn == IFN_LAST
7987 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7988 OPTIMIZE_FOR_SPEED)))
7989 {
7990 if (dump_enabled_p ())
7991 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7992 "can't operate on partial vectors because"
7993 " no conditional operation is available.\n");
7994 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7995 }
7996 else if (reduction_type == FOLD_LEFT_REDUCTION
7997 && reduc_fn == IFN_LAST
7998 && !expand_vec_cond_expr_p (vectype_in,
7999 truth_type_for (vectype_in),
8000 SSA_NAME))
8001 {
8002 if (dump_enabled_p ())
8003 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8004 "can't operate on partial vectors because"
8005 " no conditional operation is available.\n");
8006 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8007 }
8008 else
8009 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
8010 vectype_in, NULL);
8011 }
8012 return true;
8013 }
8014
8015 /* STMT_INFO is a dot-product reduction whose multiplication operands
8016 have different signs. Emit a sequence to emulate the operation
8017 using a series of signed DOT_PROD_EXPRs and return the last
8018 statement generated. VEC_DEST is the result of the vector operation
8019 and VOP lists its inputs. */
8020
8021 static gassign *
8022 vect_emulate_mixed_dot_prod (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8023 gimple_stmt_iterator *gsi, tree vec_dest,
8024 tree vop[3])
8025 {
8026 tree wide_vectype = signed_type_for (TREE_TYPE (vec_dest));
8027 tree narrow_vectype = signed_type_for (TREE_TYPE (vop[0]));
8028 tree narrow_elttype = TREE_TYPE (narrow_vectype);
8029 gimple *new_stmt;
8030
8031 /* Make VOP[0] the unsigned operand and VOP[1] the signed operand. */
8032 if (!TYPE_UNSIGNED (TREE_TYPE (vop[0])))
8033 std::swap (vop[0], vop[1]);
8034
8035 /* Convert all inputs to signed types. */
8036 for (int i = 0; i < 3; ++i)
8037 if (TYPE_UNSIGNED (TREE_TYPE (vop[i])))
8038 {
8039 tree tmp = make_ssa_name (signed_type_for (TREE_TYPE (vop[i])));
8040 new_stmt = gimple_build_assign (tmp, NOP_EXPR, vop[i]);
8041 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8042 vop[i] = tmp;
8043 }
8044
8045 /* In the comments below we assume 8-bit inputs for simplicity,
8046 but the approach works for any full integer type. */
8047
8048 /* Create a vector of -128. */
8049 tree min_narrow_elttype = TYPE_MIN_VALUE (narrow_elttype);
8050 tree min_narrow = build_vector_from_val (narrow_vectype,
8051 min_narrow_elttype);
8052
8053 /* Create a vector of 64. */
8054 auto half_wi = wi::lrshift (wi::to_wide (min_narrow_elttype), 1);
8055 tree half_narrow = wide_int_to_tree (narrow_elttype, half_wi);
8056 half_narrow = build_vector_from_val (narrow_vectype, half_narrow);
8057
8058 /* Emit: SUB_RES = VOP[0] - 128. */
8059 tree sub_res = make_ssa_name (narrow_vectype);
8060 new_stmt = gimple_build_assign (sub_res, PLUS_EXPR, vop[0], min_narrow);
8061 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8062
8063 /* Emit:
8064
8065 STAGE1 = DOT_PROD_EXPR <VOP[1], 64, VOP[2]>;
8066 STAGE2 = DOT_PROD_EXPR <VOP[1], 64, STAGE1>;
8067 STAGE3 = DOT_PROD_EXPR <SUB_RES, -128, STAGE2>;
8068
8069 on the basis that x * y == (x - 128) * y + 64 * y + 64 * y
8070 Doing the two 64 * y steps first allows more time to compute x. */
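  /* As a numeric check, for an unsigned x = 200 and a signed y = 3:
     (200 - 128) * 3 + 64 * 3 + 64 * 3 = 216 + 192 + 192 = 600 = 200 * 3.  */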
8071 tree stage1 = make_ssa_name (wide_vectype);
8072 new_stmt = gimple_build_assign (stage1, DOT_PROD_EXPR,
8073 vop[1], half_narrow, vop[2]);
8074 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8075
8076 tree stage2 = make_ssa_name (wide_vectype);
8077 new_stmt = gimple_build_assign (stage2, DOT_PROD_EXPR,
8078 vop[1], half_narrow, stage1);
8079 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8080
8081 tree stage3 = make_ssa_name (wide_vectype);
8082 new_stmt = gimple_build_assign (stage3, DOT_PROD_EXPR,
8083 sub_res, vop[1], stage2);
8084 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8085
8086 /* Convert STAGE3 to the reduction type. */
8087 return gimple_build_assign (vec_dest, CONVERT_EXPR, stage3);
8088 }
8089
8090 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
8091 value. */
8092
8093 bool
8094 vect_transform_reduction (loop_vec_info loop_vinfo,
8095 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
8096 gimple **vec_stmt, slp_tree slp_node)
8097 {
8098 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8099 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8100 int i;
8101 int ncopies;
8102 int vec_num;
8103
8104 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8105 gcc_assert (reduc_info->is_reduc_info);
8106
8107 if (nested_in_vect_loop_p (loop, stmt_info))
8108 {
8109 loop = loop->inner;
8110 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
8111 }
8112
8113 gimple_match_op op;
8114 if (!gimple_extract_op (stmt_info->stmt, &op))
8115 gcc_unreachable ();
8116
8117 /* All uses but the last are expected to be defined in the loop.
8118 The last use is the reduction variable. In case of nested cycle this
8119 assumption is not true: we use reduc_index to record the index of the
8120 reduction variable. */
8121 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
8122 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
8123 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
8124 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8125
8126 if (slp_node)
8127 {
8128 ncopies = 1;
8129 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8130 }
8131 else
8132 {
8133 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8134 vec_num = 1;
8135 }
8136
8137 code_helper code = canonicalize_code (op.code, op.type);
8138 internal_fn cond_fn = get_conditional_internal_fn (code, op.type);
8139 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
8140 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
8141
8142 /* Transform. */
8143 tree new_temp = NULL_TREE;
8144 auto_vec<tree> vec_oprnds0;
8145 auto_vec<tree> vec_oprnds1;
8146 auto_vec<tree> vec_oprnds2;
8147 tree def0;
8148
8149 if (dump_enabled_p ())
8150 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
8151
8152 /* FORNOW: Multiple types are not supported for condition. */
8153 if (code == COND_EXPR)
8154 gcc_assert (ncopies == 1);
8155
8156 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8157
8158 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
8159 if (reduction_type == FOLD_LEFT_REDUCTION)
8160 {
8161 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
8162 gcc_assert (code.is_tree_code ());
8163 return vectorize_fold_left_reduction
8164 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi,
8165 tree_code (code), reduc_fn, op.ops, vectype_in, reduc_index, masks);
8166 }
8167
8168 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
8169 gcc_assert (single_defuse_cycle
8170 || code == DOT_PROD_EXPR
8171 || code == WIDEN_SUM_EXPR
8172 || code == SAD_EXPR);
8173
8174 /* Create the destination vector */
8175 tree scalar_dest = gimple_get_lhs (stmt_info->stmt);
8176 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
8177
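  /* For a single def-use cycle the accumulator operand is skipped here;
     one vector def is created for it from the reduction PHI below, and the
     result of each copy is fed back as that operand of the next copy in
     the transformation loop that follows.  */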
8178 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
8179 single_defuse_cycle && reduc_index == 0
8180 ? NULL_TREE : op.ops[0], &vec_oprnds0,
8181 single_defuse_cycle && reduc_index == 1
8182 ? NULL_TREE : op.ops[1], &vec_oprnds1,
8183 op.num_ops == 3
8184 && !(single_defuse_cycle && reduc_index == 2)
8185 ? op.ops[2] : NULL_TREE, &vec_oprnds2);
8186 if (single_defuse_cycle)
8187 {
8188 gcc_assert (!slp_node);
8189 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8190 op.ops[reduc_index],
8191 reduc_index == 0 ? &vec_oprnds0
8192 : (reduc_index == 1 ? &vec_oprnds1
8193 : &vec_oprnds2));
8194 }
8195
8196 bool emulated_mixed_dot_prod
8197 = vect_is_emulated_mixed_dot_prod (loop_vinfo, stmt_info);
8198 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
8199 {
8200 gimple *new_stmt;
8201 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
8202 if (masked_loop_p && !mask_by_cond_expr)
8203 {
8204 /* No conditional ifns have been defined for dot-product yet. */
8205 gcc_assert (code != DOT_PROD_EXPR);
8206
8207 /* Make sure that the reduction accumulator is vop[0]. */
8208 if (reduc_index == 1)
8209 {
8210 gcc_assert (commutative_binary_op_p (code, op.type));
8211 std::swap (vop[0], vop[1]);
8212 }
8213 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8214 vec_num * ncopies, vectype_in, i);
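	  /* The accumulator VOP[0] is also used as the else value, so
	     inactive lanes pass it through unchanged.  */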
8215 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
8216 vop[0], vop[1], vop[0]);
8217 new_temp = make_ssa_name (vec_dest, call);
8218 gimple_call_set_lhs (call, new_temp);
8219 gimple_call_set_nothrow (call, true);
8220 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
8221 new_stmt = call;
8222 }
8223 else
8224 {
8225 if (op.num_ops == 3)
8226 vop[2] = vec_oprnds2[i];
8227
8228 if (masked_loop_p && mask_by_cond_expr)
8229 {
8230 tree mask = vect_get_loop_mask (loop_vinfo, gsi, masks,
8231 vec_num * ncopies, vectype_in, i);
8232 build_vect_cond_expr (code, vop, mask, gsi);
8233 }
8234
8235 if (emulated_mixed_dot_prod)
8236 new_stmt = vect_emulate_mixed_dot_prod (loop_vinfo, stmt_info, gsi,
8237 vec_dest, vop);
8238 else if (code.is_internal_fn ())
8239 new_stmt = gimple_build_call_internal (internal_fn (code),
8240 op.num_ops,
8241 vop[0], vop[1], vop[2]);
8242 else
8243 new_stmt = gimple_build_assign (vec_dest, tree_code (op.code),
8244 vop[0], vop[1], vop[2]);
8245 new_temp = make_ssa_name (vec_dest, new_stmt);
8246 gimple_set_lhs (new_stmt, new_temp);
8247 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
8248 }
8249
8250 if (slp_node)
8251 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
8252 else if (single_defuse_cycle
8253 && i < ncopies - 1)
8254 {
8255 if (reduc_index == 0)
8256 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
8257 else if (reduc_index == 1)
8258 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
8259 else if (reduc_index == 2)
8260 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
8261 }
8262 else
8263 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8264 }
8265
8266 if (!slp_node)
8267 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8268
8269 return true;
8270 }
8271
8272 /* Transform phase of a cycle PHI. */
8273
8274 bool
8275 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
8276 stmt_vec_info stmt_info, gimple **vec_stmt,
8277 slp_tree slp_node, slp_instance slp_node_instance)
8278 {
8279 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
8280 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8281 int i;
8282 int ncopies;
8283 int j;
8284 bool nested_cycle = false;
8285 int vec_num;
8286
8287 if (nested_in_vect_loop_p (loop, stmt_info))
8288 {
8289 loop = loop->inner;
8290 nested_cycle = true;
8291 }
8292
8293 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
8294 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
8295 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8296 gcc_assert (reduc_info->is_reduc_info);
8297
8298 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
8299 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
8300 /* Leave the scalar phi in place. */
8301 return true;
8302
8303 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
8304 /* For a nested cycle we do not fill the above. */
8305 if (!vectype_in)
8306 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
8307 gcc_assert (vectype_in);
8308
8309 if (slp_node)
8310 {
8311 /* The size vect_schedule_slp_instance computes is off for us. */
8312 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
8313 * SLP_TREE_LANES (slp_node), vectype_in);
8314 ncopies = 1;
8315 }
8316 else
8317 {
8318 vec_num = 1;
8319 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
8320 }
8321
8322 /* Check whether we should use a single PHI node and accumulate
8323 vectors to one before the backedge. */
8324 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
8325 ncopies = 1;
8326
8327 /* Create the destination vector */
8328 gphi *phi = as_a <gphi *> (stmt_info->stmt);
8329 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
8330 vectype_out);
8331
8332 /* Get the loop-entry arguments. */
8333 tree vec_initial_def = NULL_TREE;
8334 auto_vec<tree> vec_initial_defs;
8335 if (slp_node)
8336 {
8337 vec_initial_defs.reserve (vec_num);
8338 if (nested_cycle)
8339 {
8340 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
8341 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
8342 &vec_initial_defs);
8343 }
8344 else
8345 {
8346 gcc_assert (slp_node == slp_node_instance->reduc_phis);
8347 vec<tree> &initial_values = reduc_info->reduc_initial_values;
8348 vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
8349
8350 unsigned int num_phis = stmts.length ();
8351 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
8352 num_phis = 1;
8353 initial_values.reserve (num_phis);
8354 for (unsigned int i = 0; i < num_phis; ++i)
8355 {
8356 gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
8357 initial_values.quick_push (vect_phi_initial_value (this_phi));
8358 }
8359 if (vec_num == 1)
8360 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8361 if (!initial_values.is_empty ())
8362 {
8363 tree initial_value
8364 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
8365 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8366 tree neutral_op
8367 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
8368 code, initial_value);
8369 get_initial_defs_for_reduction (loop_vinfo, reduc_info,
8370 &vec_initial_defs, vec_num,
8371 stmts.length (), neutral_op);
8372 }
8373 }
8374 }
8375 else
8376 {
8377 /* Get at the scalar def before the loop, that defines the initial
8378 value of the reduction variable. */
8379 tree initial_def = vect_phi_initial_value (phi);
8380 reduc_info->reduc_initial_values.safe_push (initial_def);
8381 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
8382 and we can't use zero for induc_val, use initial_def. Similarly
8383 for REDUC_MIN and initial_def larger than the base. */
8384 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
8385 {
8386 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
8387 if (TREE_CODE (initial_def) == INTEGER_CST
8388 && !integer_zerop (induc_val)
8389 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
8390 && tree_int_cst_lt (initial_def, induc_val))
8391 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
8392 && tree_int_cst_lt (induc_val, initial_def))))
8393 {
8394 induc_val = initial_def;
8395 /* Communicate that we used the initial_def to epilogue
8396 generation. */
8397 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
8398 }
8399 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
8400 }
8401 else if (nested_cycle)
8402 {
8403 /* Do not use an adjustment def as that case is not supported
8404 correctly if ncopies is not one. */
8405 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
8406 ncopies, initial_def,
8407 &vec_initial_defs);
8408 }
8409 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
8410 || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
8411 /* Fill the initial vector with the initial scalar value. */
8412 vec_initial_def
8413 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
8414 initial_def, initial_def);
8415 else
8416 {
8417 if (ncopies == 1)
8418 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
8419 if (!reduc_info->reduc_initial_values.is_empty ())
8420 {
8421 initial_def = reduc_info->reduc_initial_values[0];
8422 code_helper code = STMT_VINFO_REDUC_CODE (reduc_info);
8423 tree neutral_op
8424 = neutral_op_for_reduction (TREE_TYPE (initial_def),
8425 code, initial_def);
8426 gcc_assert (neutral_op);
8427 /* Try to simplify the vector initialization by applying an
8428 adjustment after the reduction has been performed. */
8429 if (!reduc_info->reused_accumulator
8430 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8431 && !operand_equal_p (neutral_op, initial_def))
8432 {
8433 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
8434 = initial_def;
8435 initial_def = neutral_op;
8436 }
8437 vec_initial_def
8438 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
8439 initial_def, neutral_op);
8440 }
8441 }
8442 }
8443
8444 if (vec_initial_def)
8445 {
8446 vec_initial_defs.create (ncopies);
8447 for (i = 0; i < ncopies; ++i)
8448 vec_initial_defs.quick_push (vec_initial_def);
8449 }
8450
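  /* If we are reusing an accumulator from the main loop while vectorizing
     its epilogue, adapt it to this loop's vector type: reduce it to the
     required number of lanes and convert its mode or sign as needed before
     using it as (part of) the initial value.  */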
8451 if (auto *accumulator = reduc_info->reused_accumulator)
8452 {
8453 tree def = accumulator->reduc_input;
8454 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8455 {
8456 unsigned int nreduc;
8457 bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
8458 (TREE_TYPE (def)),
8459 TYPE_VECTOR_SUBPARTS (vectype_out),
8460 &nreduc);
8461 gcc_assert (res);
8462 gimple_seq stmts = NULL;
8463 /* Reduce the single vector to a smaller one. */
8464 if (nreduc != 1)
8465 {
8466 /* Perform the reduction in the appropriate type. */
8467 tree rvectype = vectype_out;
8468 if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
8469 TREE_TYPE (TREE_TYPE (def))))
8470 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
8471 TYPE_VECTOR_SUBPARTS
8472 (vectype_out));
8473 def = vect_create_partial_epilog (def, rvectype,
8474 STMT_VINFO_REDUC_CODE
8475 (reduc_info),
8476 &stmts);
8477 }
8478 /* The epilogue loop might use a different vector mode, like
8479 VNx2DI vs. V2DI. */
8480 if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
8481 {
8482 tree reduc_type = build_vector_type_for_mode
8483 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
8484 def = gimple_convert (&stmts, reduc_type, def);
8485 }
8486 /* Adjust the input so we pick up the partially reduced value
8487 for the skip edge in vect_create_epilog_for_reduction. */
8488 accumulator->reduc_input = def;
8489 /* And the reduction could be carried out using a different sign. */
8490 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
8491 def = gimple_convert (&stmts, vectype_out, def);
8492 if (loop_vinfo->main_loop_edge)
8493 {
8494 /* While we'd like to insert on the edge this will split
8495 blocks and disturb bookkeeping, we also will eventually
8496 need this on the skip edge. Rely on sinking to
8497 fixup optimal placement and insert in the pred. */
8498 gimple_stmt_iterator gsi
8499 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
8500 /* Insert before a cond that eventually skips the
8501 epilogue. */
8502 if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
8503 gsi_prev (&gsi);
8504 gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
8505 }
8506 else
8507 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
8508 stmts);
8509 }
8510 if (loop_vinfo->main_loop_edge)
8511 vec_initial_defs[0]
8512 = vect_get_main_loop_result (loop_vinfo, def,
8513 vec_initial_defs[0]);
8514 else
8515 vec_initial_defs.safe_push (def);
8516 }
8517
8518 /* Generate the reduction PHIs upfront. */
8519 for (i = 0; i < vec_num; i++)
8520 {
8521 tree vec_init_def = vec_initial_defs[i];
8522 for (j = 0; j < ncopies; j++)
8523 {
8524 /* Create the reduction-phi that defines the reduction
8525 operand. */
8526 gphi *new_phi = create_phi_node (vec_dest, loop->header);
8527
8528 /* Set the loop-entry arg of the reduction-phi. */
8529 if (j != 0 && nested_cycle)
8530 vec_init_def = vec_initial_defs[j];
8531 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
8532 UNKNOWN_LOCATION);
8533
8534 /* The loop-latch arg is set in epilogue processing. */
8535
8536 if (slp_node)
8537 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
8538 else
8539 {
8540 if (j == 0)
8541 *vec_stmt = new_phi;
8542 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8543 }
8544 }
8545 }
8546
8547 return true;
8548 }
8549
8550 /* Vectorizes LC PHIs. */
8551
8552 bool
8553 vectorizable_lc_phi (loop_vec_info loop_vinfo,
8554 stmt_vec_info stmt_info, gimple **vec_stmt,
8555 slp_tree slp_node)
8556 {
8557 if (!loop_vinfo
8558 || !is_a <gphi *> (stmt_info->stmt)
8559 || gimple_phi_num_args (stmt_info->stmt) != 1)
8560 return false;
8561
8562 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
8563 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
8564 return false;
8565
8566 if (!vec_stmt) /* transformation not required. */
8567 {
8568 /* Deal with copies from externs or constants that are disguised as
8569 loop-closed PHI nodes (PR97886). */
8570 if (slp_node
8571 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
8572 SLP_TREE_VECTYPE (slp_node)))
8573 {
8574 if (dump_enabled_p ())
8575 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8576 "incompatible vector types for invariants\n");
8577 return false;
8578 }
8579 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
8580 return true;
8581 }
8582
8583 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8584 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8585 basic_block bb = gimple_bb (stmt_info->stmt);
8586 edge e = single_pred_edge (bb);
8587 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8588 auto_vec<tree> vec_oprnds;
8589 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
8590 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
8591 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
8592 for (unsigned i = 0; i < vec_oprnds.length (); i++)
8593 {
8594 /* Create the vectorized LC PHI node. */
8595 gphi *new_phi = create_phi_node (vec_dest, bb);
8596 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
8597 if (slp_node)
8598 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
8599 else
8600 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
8601 }
8602 if (!slp_node)
8603 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8604
8605 return true;
8606 }
8607
8608 /* Vectorizes PHIs. */
8609
8610 bool
8611 vectorizable_phi (vec_info *,
8612 stmt_vec_info stmt_info, gimple **vec_stmt,
8613 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
8614 {
8615 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
8616 return false;
8617
8618 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
8619 return false;
8620
8621 tree vectype = SLP_TREE_VECTYPE (slp_node);
8622
8623 if (!vec_stmt) /* transformation not required. */
8624 {
8625 slp_tree child;
8626 unsigned i;
8627 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
8628 if (!child)
8629 {
8630 if (dump_enabled_p ())
8631 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8632 "PHI node with unvectorized backedge def\n");
8633 return false;
8634 }
8635 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
8636 {
8637 if (dump_enabled_p ())
8638 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8639 "incompatible vector types for invariants\n");
8640 return false;
8641 }
8642 else if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8643 && !useless_type_conversion_p (vectype,
8644 SLP_TREE_VECTYPE (child)))
8645 {
8646 /* With bools we can have mask and non-mask precision vectors
8647 or different non-mask precisions. While pattern recog is
8648 supposed to guarantee consistency here, bugs in it can cause
8649 mismatches (PR103489 and PR103800 for example).
8650 Deal with them here instead of ICEing later. */
8651 if (dump_enabled_p ())
8652 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8653 "incompatible vector type setup from "
8654 "bool pattern detection\n");
8655 return false;
8656 }
8657
8658 /* For single-argument PHIs assume coalescing which means zero cost
8659 for the scalar and the vector PHIs. This avoids artificially
8660 favoring the vector path (but may pessimize it in some cases). */
8661 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
8662 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8663 vector_stmt, stmt_info, vectype, 0, vect_body);
8664 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
8665 return true;
8666 }
8667
8668 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
8669 basic_block bb = gimple_bb (stmt_info->stmt);
8670 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
8671 auto_vec<gphi *> new_phis;
8672 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
8673 {
8674 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
8675
8676 /* Skip not yet vectorized defs. */
8677 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
8678 && SLP_TREE_VEC_STMTS (child).is_empty ())
8679 continue;
8680
8681 auto_vec<tree> vec_oprnds;
8682 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
8683 if (!new_phis.exists ())
8684 {
8685 new_phis.create (vec_oprnds.length ());
8686 for (unsigned j = 0; j < vec_oprnds.length (); j++)
8687 {
8688 /* Create the vectorized LC PHI node. */
8689 new_phis.quick_push (create_phi_node (vec_dest, bb));
8690 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]);
8691 }
8692 }
8693 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
8694 for (unsigned j = 0; j < vec_oprnds.length (); j++)
8695 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
8696 }
8697 /* We should have at least one already vectorized child. */
8698 gcc_assert (new_phis.exists ());
8699
8700 return true;
8701 }
8702
8703 /* Vectorizes first order recurrences. An overview of the transformation
8704 is described below. Suppose we have the following loop.
8705
8706 int t = 0;
8707 for (int i = 0; i < n; ++i)
8708 {
8709 b[i] = a[i] - t;
8710 t = a[i];
8711 }
8712
8713 There is a first-order recurrence on 't'. For this loop, the scalar IR
8714 looks (simplified) like:
8715
8716 scalar.preheader:
8717 init = 0;
8718
8719 scalar.body:
8720 i = PHI <0(scalar.preheader), i+1(scalar.body)>
8721 _2 = PHI <init(scalar.preheader), _1(scalar.body)>
8722 _1 = a[i]
8723 b[i] = _1 - _2
8724 if (i < n) goto scalar.body
8725
8726 In this example, _2 is a recurrence because its value depends on the
8727 previous iteration. We vectorize this as (VF = 4)
8728
8729 vector.preheader:
8730 vect_init = vect_cst(..., ..., ..., 0)
8731
8732 vector.body
8733 i = PHI <0(vector.preheader), i+4(vector.body)>
8734 vect_1 = PHI <vect_init(vector.preheader), v2(vector.body)>
8735 vect_2 = a[i, i+1, i+2, i+3];
8736 vect_3 = vec_perm (vect_1, vect_2, { 3, 4, 5, 6 })
8737 b[i, i+1, i+2, i+3] = vect_2 - vect_3
8738 if (..) goto vector.body
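
   Here vect_3 holds the 't' values for lanes i..i+3, i.e.
   [a[i-1], a[i], a[i+1], a[i+2]]; in the first vector iteration its
   first lane comes from the last element of vect_init, which is 0.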
8739
8740 In this function, vectorizable_recurr, we code generate both the
8741 vector PHI node and the permute since those together compute the
8742 vectorized value of the scalar PHI. We do not yet have the
8743 backedge value to fill in there nor into the vec_perm. Those
8744 are filled in maybe_set_vectorized_backedge_value and
8745 vect_schedule_scc.
8746
8747 TODO: Since the scalar loop does not have a use of the recurrence
8748 outside of the loop the natural way to implement peeling via
8749 vectorizing the live value doesn't work. For now peeling of loops
8750 with a recurrence is not implemented. For SLP the supported cases
8751 are restricted to those requiring a single vector recurrence PHI. */
8752
8753 bool
8754 vectorizable_recurr (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8755 gimple **vec_stmt, slp_tree slp_node,
8756 stmt_vector_for_cost *cost_vec)
8757 {
8758 if (!loop_vinfo || !is_a<gphi *> (stmt_info->stmt))
8759 return false;
8760
8761 gphi *phi = as_a<gphi *> (stmt_info->stmt);
8762
8763 /* So far we only support first-order recurrence auto-vectorization. */
8764 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_first_order_recurrence)
8765 return false;
8766
8767 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8768 unsigned ncopies;
8769 if (slp_node)
8770 ncopies = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8771 else
8772 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8773 poly_int64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8774 unsigned dist = slp_node ? SLP_TREE_LANES (slp_node) : 1;
8775 /* We need to be able to make progress with a single vector. */
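  /* E.g. with four lanes per vector a two-lane SLP group is still handled
     (2 * 2 == 4), while a three-lane group is rejected.  */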
8776 if (maybe_gt (dist * 2, nunits))
8777 {
8778 if (dump_enabled_p ())
8779 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8780 "first order recurrence exceeds half of "
8781 "a vector\n");
8782 return false;
8783 }
8784
8785 /* First-order recurrence autovectorization needs to handle permutation
8786 with indices = [nunits-1, nunits, nunits+1, ...]. */
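  /* E.g. for nunits = 4 and dist = 2 (a two-lane SLP group) the selector
     is { 2, 3, 4, 5 }: the last two lanes of the previous vector followed
     by the first two lanes of the current one.  */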
8787 vec_perm_builder sel (nunits, 1, 3);
8788 for (int i = 0; i < 3; ++i)
8789 sel.quick_push (nunits - dist + i);
8790 vec_perm_indices indices (sel, 2, nunits);
8791
8792 if (!vec_stmt) /* transformation not required. */
8793 {
8794 if (!can_vec_perm_const_p (TYPE_MODE (vectype), TYPE_MODE (vectype),
8795 indices))
8796 return false;
8797
8798 if (slp_node)
8799 {
8800 /* We eventually need to set a vector type on invariant
8801 arguments. */
8802 unsigned j;
8803 slp_tree child;
8804 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
8805 if (!vect_maybe_update_slp_op_vectype
8806 (child, SLP_TREE_VECTYPE (slp_node)))
8807 {
8808 if (dump_enabled_p ())
8809 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8810 "incompatible vector types for "
8811 "invariants\n");
8812 return false;
8813 }
8814 }
8815 /* The recurrence costs the initialization vector and one permute
8816 for each copy. */
8817 unsigned prologue_cost = record_stmt_cost (cost_vec, 1, scalar_to_vec,
8818 stmt_info, 0, vect_prologue);
8819 unsigned inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8820 stmt_info, 0, vect_body);
8821 if (dump_enabled_p ())
8822 dump_printf_loc (MSG_NOTE, vect_location,
8823 "vectorizable_recurr: inside_cost = %d, "
8824 "prologue_cost = %d .\n", inside_cost,
8825 prologue_cost);
8826
8827 STMT_VINFO_TYPE (stmt_info) = recurr_info_type;
8828 return true;
8829 }
8830
8831 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
8832 basic_block bb = gimple_bb (phi);
8833 tree preheader = PHI_ARG_DEF_FROM_EDGE (phi, pe);
8834 if (!useless_type_conversion_p (TREE_TYPE (vectype), TREE_TYPE (preheader)))
8835 {
8836 gimple_seq stmts = NULL;
8837 preheader = gimple_convert (&stmts, TREE_TYPE (vectype), preheader);
8838 gsi_insert_seq_on_edge_immediate (pe, stmts);
8839 }
8840 tree vec_init = build_vector_from_val (vectype, preheader);
8841 vec_init = vect_init_vector (loop_vinfo, stmt_info, vec_init, vectype, NULL);
8842
8843 /* Create the vectorized first-order PHI node. */
8844 tree vec_dest = vect_get_new_vect_var (vectype,
8845 vect_simple_var, "vec_recur_");
8846 gphi *new_phi = create_phi_node (vec_dest, bb);
8847 add_phi_arg (new_phi, vec_init, pe, UNKNOWN_LOCATION);
8848
8849 /* Insert the shuffles for the first-order recurrence autovectorization:
8850 result = VEC_PERM <vec_recur, vect_1, index[nunits-1, nunits, ...]>. */
8851 tree perm = vect_gen_perm_mask_checked (vectype, indices);
8852
8853 /* Insert the required permute after the latch definition. The
8854 second and later operands are tentative and will be updated when we have
8855 vectorized the latch definition. */
8856 edge le = loop_latch_edge (LOOP_VINFO_LOOP (loop_vinfo));
8857 gimple *latch_def = SSA_NAME_DEF_STMT (PHI_ARG_DEF_FROM_EDGE (phi, le));
8858 gimple_stmt_iterator gsi2 = gsi_for_stmt (latch_def);
8859 gsi_next (&gsi2);
8860
8861 for (unsigned i = 0; i < ncopies; ++i)
8862 {
8863 vec_dest = make_ssa_name (vectype);
8864 gassign *vperm
8865 = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
8866 i == 0 ? gimple_phi_result (new_phi) : NULL,
8867 NULL, perm);
8868 vect_finish_stmt_generation (loop_vinfo, stmt_info, vperm, &gsi2);
8869
8870 if (slp_node)
8871 SLP_TREE_VEC_STMTS (slp_node).quick_push (vperm);
8872 else
8873 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (vperm);
8874 }
8875
8876 if (!slp_node)
8877 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
8878 return true;
8879 }
8880
8881 /* Return true if VECTYPE represents a vector that requires lowering
8882 by the vector lowering pass. */
8883
8884 bool
8885 vect_emulated_vector_p (tree vectype)
8886 {
8887 return (!VECTOR_MODE_P (TYPE_MODE (vectype))
8888 && (!VECTOR_BOOLEAN_TYPE_P (vectype)
8889 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
8890 }
8891
8892 /* Return true if we can emulate CODE on an integer mode representation
8893 of a vector. */
8894
8895 bool
8896 vect_can_vectorize_without_simd_p (tree_code code)
8897 {
8898 switch (code)
8899 {
8900 case PLUS_EXPR:
8901 case MINUS_EXPR:
8902 case NEGATE_EXPR:
8903 case BIT_AND_EXPR:
8904 case BIT_IOR_EXPR:
8905 case BIT_XOR_EXPR:
8906 case BIT_NOT_EXPR:
8907 return true;
8908
8909 default:
8910 return false;
8911 }
8912 }
8913
8914 /* Likewise, but taking a code_helper. */
8915
8916 bool
8917 vect_can_vectorize_without_simd_p (code_helper code)
8918 {
8919 return (code.is_tree_code ()
8920 && vect_can_vectorize_without_simd_p (tree_code (code)));
8921 }
8922
8923 /* Create vector init for vectorized iv. */
8924 static tree
8925 vect_create_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
8926 tree step_expr, poly_uint64 nunits,
8927 tree vectype,
8928 enum vect_induction_op_type induction_type)
8929 {
8930 unsigned HOST_WIDE_INT const_nunits;
8931 tree vec_shift, vec_init, new_name;
8932 unsigned i;
8933 tree itype = TREE_TYPE (vectype);
8934
8935 /* iv_loop is the loop to be vectorized. Create the vector of initial
8936 values, e.g. [X, X>>S, X>>2*S, ...] for shifts, [X, -X, X, -X, ...] for negation and [X, X*S, X*S^2, ...] for multiplication (S = step_expr, X = init_expr). */
8937 new_name = gimple_convert (stmts, itype, init_expr);
8938 switch (induction_type)
8939 {
8940 case vect_step_op_shr:
8941 case vect_step_op_shl:
8942 /* Build the initial values by shifting the splatted init by [0, S, 2*S, ...]. */
8943 vec_init = gimple_build_vector_from_val (stmts,
8944 vectype,
8945 new_name);
8946 vec_shift = gimple_build (stmts, VEC_SERIES_EXPR, vectype,
8947 build_zero_cst (itype), step_expr);
8948 vec_init = gimple_build (stmts,
8949 (induction_type == vect_step_op_shr
8950 ? RSHIFT_EXPR : LSHIFT_EXPR),
8951 vectype, vec_init, vec_shift);
8952 break;
8953
8954 case vect_step_op_neg:
8955 {
8956 vec_init = gimple_build_vector_from_val (stmts,
8957 vectype,
8958 new_name);
8959 tree vec_neg = gimple_build (stmts, NEGATE_EXPR,
8960 vectype, vec_init);
8961 /* The encoding has 2 interleaved stepped patterns. */
8962 vec_perm_builder sel (nunits, 2, 3);
8963 sel.quick_grow (6);
8964 for (i = 0; i < 3; i++)
8965 {
8966 sel[2 * i] = i;
8967 sel[2 * i + 1] = i + nunits;
8968 }
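	/* E.g. with nunits = 4 the selector is { 0, 4, 1, 5 }, interleaving
	   vec_init and vec_neg into [X, -X, X, -X].  */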
8969 vec_perm_indices indices (sel, 2, nunits);
8970 /* Don't use vect_gen_perm_mask_checked since can_vec_perm_const_p may
8971 fail when vec_init is a const vector. In that situation vec_perm is not
8972 really needed. */
8973 tree perm_mask_even
8974 = vect_gen_perm_mask_any (vectype, indices);
8975 vec_init = gimple_build (stmts, VEC_PERM_EXPR,
8976 vectype,
8977 vec_init, vec_neg,
8978 perm_mask_even);
8979 }
8980 break;
8981
8982 case vect_step_op_mul:
8983 {
8984 /* Use an unsigned multiplication to avoid undefined signed integer overflow. */
8985 gcc_assert (nunits.is_constant (&const_nunits));
8986 tree utype = unsigned_type_for (itype);
8987 tree uvectype = build_vector_type (utype,
8988 TYPE_VECTOR_SUBPARTS (vectype));
8989 new_name = gimple_convert (stmts, utype, new_name);
8990 vec_init = gimple_build_vector_from_val (stmts,
8991 uvectype,
8992 new_name);
8993 tree_vector_builder elts (uvectype, const_nunits, 1);
8994 tree elt_step = build_one_cst (utype);
8995
8996 elts.quick_push (elt_step);
8997 for (i = 1; i < const_nunits; i++)
8998 {
8999 /* Create: elt_step_i = elt_step_{i-1} * step_expr, i.e. pow (step_expr, i). */
9000 elt_step = gimple_build (stmts, MULT_EXPR,
9001 utype, elt_step, step_expr);
9002 elts.quick_push (elt_step);
9003 }
9004 /* Create a vector from [pow (step, 0), pow (step, 1), ...,
9005 pow (step, nunits-1)]. */
9006 tree vec_mul = gimple_build_vector (stmts, &elts);
9007 vec_init = gimple_build (stmts, MULT_EXPR, uvectype,
9008 vec_init, vec_mul);
9009 vec_init = gimple_convert (stmts, vectype, vec_init);
9010 }
9011 break;
9012
9013 default:
9014 gcc_unreachable ();
9015 }
9016
9017 return vec_init;
9018 }
9019
9020 /* Peel init_expr by skip_niter for induction_type. */
9021 tree
9022 vect_peel_nonlinear_iv_init (gimple_seq* stmts, tree init_expr,
9023 tree skip_niters, tree step_expr,
9024 enum vect_induction_op_type induction_type)
9025 {
9026 gcc_assert (TREE_CODE (skip_niters) == INTEGER_CST);
9027 tree type = TREE_TYPE (init_expr);
9028 unsigned prec = TYPE_PRECISION (type);
9029 switch (induction_type)
9030 {
9031 case vect_step_op_neg:
9032 if (TREE_INT_CST_LOW (skip_niters) % 2)
9033 init_expr = gimple_build (stmts, NEGATE_EXPR, type, init_expr);
9034 /* else no change. */
9035 break;
9036
9037 case vect_step_op_shr:
9038 case vect_step_op_shl:
9039 skip_niters = gimple_convert (stmts, type, skip_niters);
9040 step_expr = gimple_build (stmts, MULT_EXPR, type, step_expr, skip_niters);
9041 /* When the shift amount >= precision we need to avoid undefined behavior.
9042 The original loop has none, and according to the semantics
9043 init_expr should be 0 for lshr and ashl, and init_expr >> (prec - 1) for ashr. */
9044 if (!tree_fits_uhwi_p (step_expr)
9045 || tree_to_uhwi (step_expr) >= prec)
9046 {
9047 if (induction_type == vect_step_op_shl
9048 || TYPE_UNSIGNED (type))
9049 init_expr = build_zero_cst (type);
9050 else
9051 init_expr = gimple_build (stmts, RSHIFT_EXPR, type,
9052 init_expr,
9053 wide_int_to_tree (type, prec - 1));
9054 }
9055 else
9056 init_expr = gimple_build (stmts, (induction_type == vect_step_op_shr
9057 ? RSHIFT_EXPR : LSHIFT_EXPR),
9058 type, init_expr, step_expr);
9059 break;
9060
9061 case vect_step_op_mul:
9062 {
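	/* Multiply init_expr by pow (step_expr, skip_niters), computed in
	   the unsigned type to avoid undefined overflow.  */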
9063 tree utype = unsigned_type_for (type);
9064 init_expr = gimple_convert (stmts, utype, init_expr);
9065 unsigned skipn = TREE_INT_CST_LOW (skip_niters);
9066 wide_int begin = wi::to_wide (step_expr);
9067 for (unsigned i = 0; i != skipn - 1; i++)
9068 begin = wi::mul (begin, wi::to_wide (step_expr));
9069 tree mult_expr = wide_int_to_tree (utype, begin);
9070 init_expr = gimple_build (stmts, MULT_EXPR, utype, init_expr, mult_expr);
9071 init_expr = gimple_convert (stmts, type, init_expr);
9072 }
9073 break;
9074
9075 default:
9076 gcc_unreachable ();
9077 }
9078
9079 return init_expr;
9080 }
9081
9082 /* Create vector step for vectorized iv. */
9083 static tree
9084 vect_create_nonlinear_iv_step (gimple_seq* stmts, tree step_expr,
9085 poly_uint64 vf,
9086 enum vect_induction_op_type induction_type)
9087 {
9088 tree expr = build_int_cst (TREE_TYPE (step_expr), vf);
9089 tree new_name = NULL;
9090 /* Step should be pow (step, vf) for mult induction. */
9091 if (induction_type == vect_step_op_mul)
9092 {
9093 gcc_assert (vf.is_constant ());
9094 wide_int begin = wi::to_wide (step_expr);
9095
9096 for (unsigned i = 0; i != vf.to_constant () - 1; i++)
9097 begin = wi::mul (begin, wi::to_wide (step_expr));
9098
9099 new_name = wide_int_to_tree (TREE_TYPE (step_expr), begin);
9100 }
9101 else if (induction_type == vect_step_op_neg)
9102 /* Do nothing. */
9103 ;
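  /* For the shift cases each vector iteration advances every lane by VF
     scalar iterations, so the combined shift amount is VF * step_expr.  */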
9104 else
9105 new_name = gimple_build (stmts, MULT_EXPR, TREE_TYPE (step_expr),
9106 expr, step_expr);
9107 return new_name;
9108 }
9109
9110 static tree
9111 vect_create_nonlinear_iv_vec_step (loop_vec_info loop_vinfo,
9112 stmt_vec_info stmt_info,
9113 tree new_name, tree vectype,
9114 enum vect_induction_op_type induction_type)
9115 {
9116 /* No step is needed for neg induction. */
9117 if (induction_type == vect_step_op_neg)
9118 return NULL;
9119
9120 tree t = unshare_expr (new_name);
9121 gcc_assert (CONSTANT_CLASS_P (new_name)
9122 || TREE_CODE (new_name) == SSA_NAME);
9123 tree new_vec = build_vector_from_val (vectype, t);
9124 tree vec_step = vect_init_vector (loop_vinfo, stmt_info,
9125 new_vec, vectype, NULL);
9126 return vec_step;
9127 }
9128
9129 /* Update the vectorized iv INDUC_DEF with VEC_STEP according to INDUCTION_TYPE. */
9130 static tree
9131 vect_update_nonlinear_iv (gimple_seq* stmts, tree vectype,
9132 tree induc_def, tree vec_step,
9133 enum vect_induction_op_type induction_type)
9134 {
9135 tree vec_def = induc_def;
9136 switch (induction_type)
9137 {
9138 case vect_step_op_mul:
9139 {
9140 /* Use an unsigned multiplication to avoid undefined signed integer overflow. */
9141 tree uvectype
9142 = build_vector_type (unsigned_type_for (TREE_TYPE (vectype)),
9143 TYPE_VECTOR_SUBPARTS (vectype));
9144 vec_def = gimple_convert (stmts, uvectype, vec_def);
9145 vec_step = gimple_convert (stmts, uvectype, vec_step);
9146 vec_def = gimple_build (stmts, MULT_EXPR, uvectype,
9147 vec_def, vec_step);
9148 vec_def = gimple_convert (stmts, vectype, vec_def);
9149 }
9150 break;
9151
9152 case vect_step_op_shr:
9153 vec_def = gimple_build (stmts, RSHIFT_EXPR, vectype,
9154 vec_def, vec_step);
9155 break;
9156
9157 case vect_step_op_shl:
9158 vec_def = gimple_build (stmts, LSHIFT_EXPR, vectype,
9159 vec_def, vec_step);
9160 break;
9161 case vect_step_op_neg:
9162 vec_def = induc_def;
9163 /* Do nothing. */
9164 break;
9165 default:
9166 gcc_unreachable ();
9167 }
9168
9169 return vec_def;
9170
9171 }
9172
9173 /* Function vectorizable_nonlinear_induction
9174
9175 Check if STMT_INFO performs a nonlinear induction computation that can be
9176 vectorized. If VEC_STMT is also passed, vectorize the induction PHI: create
9177 a vectorized phi to replace it, put it in VEC_STMT, and add it to the same
9178 basic block.
9179 Return true if STMT_INFO is vectorizable in this way. */
9180
9181 static bool
9182 vectorizable_nonlinear_induction (loop_vec_info loop_vinfo,
9183 stmt_vec_info stmt_info,
9184 gimple **vec_stmt, slp_tree slp_node,
9185 stmt_vector_for_cost *cost_vec)
9186 {
9187 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9188 unsigned ncopies;
9189 bool nested_in_vect_loop = false;
9190 class loop *iv_loop;
9191 tree vec_def;
9192 edge pe = loop_preheader_edge (loop);
9193 basic_block new_bb;
9194 tree vec_init, vec_step;
9195 tree new_name;
9196 gimple *new_stmt;
9197 gphi *induction_phi;
9198 tree induc_def, vec_dest;
9199 tree init_expr, step_expr;
9200 tree niters_skip;
9201 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9202 unsigned i;
9203 gimple_stmt_iterator si;
9204
9205 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9206
9207 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9208 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9209 enum vect_induction_op_type induction_type
9210 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9211
9212 gcc_assert (induction_type > vect_step_op_add);
9213
9214 if (slp_node)
9215 ncopies = 1;
9216 else
9217 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9218 gcc_assert (ncopies >= 1);
9219
9220 /* FORNOW. Only handle nonlinear induction in the same loop. */
9221 if (nested_in_vect_loop_p (loop, stmt_info))
9222 {
9223 if (dump_enabled_p ())
9224 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9225 "nonlinear induction in nested loop.\n");
9226 return false;
9227 }
9228
9229 iv_loop = loop;
9230 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9231
9232 /* TODO: Support slp for nonlinear iv. There should be separate vector iv
9233 update for each iv and a permutation to generate the wanted vector iv. */
9234 if (slp_node)
9235 {
9236 if (dump_enabled_p ())
9237 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9238 "SLP induction not supported for nonlinear"
9239 " induction.\n");
9240 return false;
9241 }
9242
9243 if (!INTEGRAL_TYPE_P (TREE_TYPE (vectype)))
9244 {
9245 if (dump_enabled_p ())
9246 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9247 "floating point nonlinear induction vectorization"
9248 " not supported.\n");
9249 return false;
9250 }
9251
9252 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9253 init_expr = vect_phi_initial_value (phi);
9254 gcc_assert (step_expr != NULL_TREE && init_expr != NULL
9255 && TREE_CODE (step_expr) == INTEGER_CST);
9256 /* step_expr should be aligned with init_expr,
9257 i.e. for uint64 a >> 1 the step is int, but a vector<uint64> shift is used. */
9258 step_expr = fold_convert (TREE_TYPE (vectype), step_expr);
9259
9260 if (TREE_CODE (init_expr) == INTEGER_CST)
9261 init_expr = fold_convert (TREE_TYPE (vectype), init_expr);
9262 else
9263 gcc_assert (tree_nop_conversion_p (TREE_TYPE (vectype),
9264 TREE_TYPE (init_expr)));
9265
9266 switch (induction_type)
9267 {
9268 case vect_step_op_neg:
9269 if (TREE_CODE (init_expr) != INTEGER_CST
9270 && TREE_CODE (init_expr) != REAL_CST)
9271 {
9272 /* Check for backend support of NEGATE_EXPR and vec_perm. */
9273 if (!directly_supported_p (NEGATE_EXPR, vectype))
9274 return false;
9275
9276 /* The encoding has 2 interleaved stepped patterns. */
9277 vec_perm_builder sel (nunits, 2, 3);
9278 machine_mode mode = TYPE_MODE (vectype);
9279 sel.quick_grow (6);
9280 for (i = 0; i < 3; i++)
9281 {
9282 sel[i * 2] = i;
9283 sel[i * 2 + 1] = i + nunits;
9284 }
9285 vec_perm_indices indices (sel, 2, nunits);
9286 if (!can_vec_perm_const_p (mode, mode, indices))
9287 return false;
9288 }
9289 break;
9290
9291 case vect_step_op_mul:
9292 {
9293 /* Check for backend support of MULT_EXPR. */
9294 if (!directly_supported_p (MULT_EXPR, vectype))
9295 return false;
9296
9297 /* ?? How to construct the vector step for variable-length vectors:
9298 [ 1, step, pow (step, 2), pow (step, 3), ... ]. */
9299 if (!vf.is_constant ())
9300 return false;
9301 }
9302 break;
9303
9304 case vect_step_op_shr:
9305 /* Check for backend support of RSHIFT_EXPR. */
9306 if (!directly_supported_p (RSHIFT_EXPR, vectype, optab_vector))
9307 return false;
9308
9309 /* Don't shift by more than the type precision to avoid undefined behavior. */
9310 if (!tree_fits_uhwi_p (step_expr)
9311 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9312 TYPE_PRECISION (TREE_TYPE (init_expr))))
9313 return false;
9314 break;
9315
9316 case vect_step_op_shl:
9317 /* Check for backend support of LSHIFT_EXPR. */
9318 if (!directly_supported_p (LSHIFT_EXPR, vectype, optab_vector))
9319 return false;
9320
9321 /* Don't shift by more than the type precision to avoid undefined behavior. */
9322 if (!tree_fits_uhwi_p (step_expr)
9323 || maybe_ge (nunits * tree_to_uhwi (step_expr),
9324 TYPE_PRECISION (TREE_TYPE (init_expr))))
9325 return false;
9326
9327 break;
9328
9329 default:
9330 gcc_unreachable ();
9331 }
9332
9333 if (!vec_stmt) /* transformation not required. */
9334 {
9335 unsigned inside_cost = 0, prologue_cost = 0;
9336 /* loop cost for vec_loop. Neg induction doesn't have any
9337 inside_cost. */
9338 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9339 stmt_info, 0, vect_body);
9340
9341 /* Neg induction generates no update statement inside the loop
9342 body, so it has no inside_cost. */
9343 if (induction_type == vect_step_op_neg)
9344 inside_cost = 0;
9345
9346 /* prologue cost for vec_init and vec_step. */
9347 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9348 stmt_info, 0, vect_prologue);
9349
9350 if (dump_enabled_p ())
9351 dump_printf_loc (MSG_NOTE, vect_location,
9352 "vect_model_induction_cost: inside_cost = %d, "
9353 "prologue_cost = %d. \n", inside_cost,
9354 prologue_cost);
9355
9356 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9357 DUMP_VECT_SCOPE ("vectorizable_nonlinear_induction");
9358 return true;
9359 }
9360
9361 /* Transform. */
9362
9363 /* Compute a vector variable, initialized with the first VF values of
9364 the induction variable. E.g., for an iv with IV_PHI='X' and
9365 evolution S, for a vector of 4 units, we want to compute:
9366 [X, X + S, X + 2*S, X + 3*S]. */
9367
9368 if (dump_enabled_p ())
9369 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9370
9371 pe = loop_preheader_edge (iv_loop);
9372 /* Find the first insertion point in the BB. */
9373 basic_block bb = gimple_bb (phi);
9374 si = gsi_after_labels (bb);
9375
9376 gimple_seq stmts = NULL;
9377
9378 niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9379 /* If we are using the loop mask to "peel" for alignment then we need
9380 to adjust the start value here. */
9381 if (niters_skip != NULL_TREE)
9382 init_expr = vect_peel_nonlinear_iv_init (&stmts, init_expr, niters_skip,
9383 step_expr, induction_type);
9384
9385 vec_init = vect_create_nonlinear_iv_init (&stmts, init_expr,
9386 step_expr, nunits, vectype,
9387 induction_type);
9388 if (stmts)
9389 {
9390 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9391 gcc_assert (!new_bb);
9392 }
9393
9394 stmts = NULL;
9395 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9396 vf, induction_type);
9397 if (stmts)
9398 {
9399 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9400 gcc_assert (!new_bb);
9401 }
9402
9403 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9404 new_name, vectype,
9405 induction_type);
9406 /* Create the following def-use cycle:
9407 loop prolog:
9408 vec_init = ...
9409 vec_step = ...
9410 loop:
9411 vec_iv = PHI <vec_init, vec_loop>
9412 ...
9413 STMT
9414 ...
9415 vec_loop = vec_iv + vec_step; */
9416
9417 /* Create the induction-phi that defines the induction-operand. */
9418 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
9419 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9420 induc_def = PHI_RESULT (induction_phi);
9421
9422 /* Create the iv update inside the loop. */
9423 stmts = NULL;
9424 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9425 induc_def, vec_step,
9426 induction_type);
9427
9428 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9429 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9430
9431 /* Set the arguments of the phi node: */
9432 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9433 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9434 UNKNOWN_LOCATION);
9435
9436 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
9437 *vec_stmt = induction_phi;
9438
9439 /* In case that vectorization factor (VF) is bigger than the number
9440 of elements that we can fit in a vectype (nunits), we have to generate
9441 more than one vector stmt, i.e. we need to "unroll" the
9442 vector stmt by a factor VF/nunits. For more details see documentation
9443 in vectorizable_operation. */
9444
9445 if (ncopies > 1)
9446 {
9447 stmts = NULL;
9448 /* FORNOW. This restriction should be relaxed. */
9449 gcc_assert (!nested_in_vect_loop);
9450
9451 new_name = vect_create_nonlinear_iv_step (&stmts, step_expr,
9452 nunits, induction_type);
9453
9454 vec_step = vect_create_nonlinear_iv_vec_step (loop_vinfo, stmt_info,
9455 new_name, vectype,
9456 induction_type);
9457 vec_def = induc_def;
9458 for (i = 1; i < ncopies; i++)
9459 {
9460 /* vec_i = vec_prev + vec_step. */
9461 stmts = NULL;
9462 vec_def = vect_update_nonlinear_iv (&stmts, vectype,
9463 vec_def, vec_step,
9464 induction_type);
9465 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9466 new_stmt = SSA_NAME_DEF_STMT (vec_def);
9467 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
9468 }
9469 }
9470
9471 if (dump_enabled_p ())
9472 dump_printf_loc (MSG_NOTE, vect_location,
9473 "transform induction: created def-use cycle: %G%G",
9474 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
9475
9476 return true;
9477 }
9478
9479 /* Function vectorizable_induction
9480
9481 Check if STMT_INFO performs an induction computation that can be vectorized.
9482 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
9483 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
9484 Return true if STMT_INFO is vectorizable in this way. */
9485
9486 bool
9487 vectorizable_induction (loop_vec_info loop_vinfo,
9488 stmt_vec_info stmt_info,
9489 gimple **vec_stmt, slp_tree slp_node,
9490 stmt_vector_for_cost *cost_vec)
9491 {
9492 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9493 unsigned ncopies;
9494 bool nested_in_vect_loop = false;
9495 class loop *iv_loop;
9496 tree vec_def;
9497 edge pe = loop_preheader_edge (loop);
9498 basic_block new_bb;
9499 tree new_vec, vec_init, vec_step, t;
9500 tree new_name;
9501 gimple *new_stmt;
9502 gphi *induction_phi;
9503 tree induc_def, vec_dest;
9504 tree init_expr, step_expr;
9505 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9506 unsigned i;
9507 tree expr;
9508 gimple_stmt_iterator si;
9509 enum vect_induction_op_type induction_type
9510 = STMT_VINFO_LOOP_PHI_EVOLUTION_TYPE (stmt_info);
9511
9512 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
9513 if (!phi)
9514 return false;
9515
9516 if (!STMT_VINFO_RELEVANT_P (stmt_info))
9517 return false;
9518
9519 /* Make sure it was recognized as induction computation. */
9520 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
9521 return false;
9522
9523 /* Handle nonlinear induction in a separate place. */
9524 if (induction_type != vect_step_op_add)
9525 return vectorizable_nonlinear_induction (loop_vinfo, stmt_info,
9526 vec_stmt, slp_node, cost_vec);
9527
9528 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
9529 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
9530
9531 if (slp_node)
9532 ncopies = 1;
9533 else
9534 ncopies = vect_get_num_copies (loop_vinfo, vectype);
9535 gcc_assert (ncopies >= 1);
9536
9537 /* FORNOW. These restrictions should be relaxed. */
9538 if (nested_in_vect_loop_p (loop, stmt_info))
9539 {
9540 imm_use_iterator imm_iter;
9541 use_operand_p use_p;
9542 gimple *exit_phi;
9543 edge latch_e;
9544 tree loop_arg;
9545
9546 if (ncopies > 1)
9547 {
9548 if (dump_enabled_p ())
9549 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9550 "multiple types in nested loop.\n");
9551 return false;
9552 }
9553
9554 exit_phi = NULL;
9555 latch_e = loop_latch_edge (loop->inner);
9556 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
9557 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
9558 {
9559 gimple *use_stmt = USE_STMT (use_p);
9560 if (is_gimple_debug (use_stmt))
9561 continue;
9562
9563 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
9564 {
9565 exit_phi = use_stmt;
9566 break;
9567 }
9568 }
9569 if (exit_phi)
9570 {
9571 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
9572 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
9573 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
9574 {
9575 if (dump_enabled_p ())
9576 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9577 "inner-loop induction only used outside "
9578 "of the outer vectorized loop.\n");
9579 return false;
9580 }
9581 }
9582
9583 nested_in_vect_loop = true;
9584 iv_loop = loop->inner;
9585 }
9586 else
9587 iv_loop = loop;
9588 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
9589
9590 if (slp_node && !nunits.is_constant ())
9591 {
9592 /* The current SLP code creates the step value element-by-element. */
9593 if (dump_enabled_p ())
9594 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9595 "SLP induction not supported for variable-length"
9596 " vectors.\n");
9597 return false;
9598 }
9599
9600 if (FLOAT_TYPE_P (vectype) && !param_vect_induction_float)
9601 {
9602 if (dump_enabled_p ())
9603 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9604 "floating point induction vectorization disabled\n");
9605 return false;
9606 }
9607
9608 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
9609 gcc_assert (step_expr != NULL_TREE);
9610 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
9611
9612 /* Check for backend support of PLUS/MINUS_EXPR. */
9613 if (!directly_supported_p (PLUS_EXPR, step_vectype)
9614 || !directly_supported_p (MINUS_EXPR, step_vectype))
9615 return false;
9616
9617 if (!vec_stmt) /* transformation not required. */
9618 {
9619 unsigned inside_cost = 0, prologue_cost = 0;
9620 if (slp_node)
9621 {
9622 /* We eventually need to set a vector type on invariant
9623 arguments. */
9624 unsigned j;
9625 slp_tree child;
9626 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
9627 if (!vect_maybe_update_slp_op_vectype
9628 (child, SLP_TREE_VECTYPE (slp_node)))
9629 {
9630 if (dump_enabled_p ())
9631 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
9632 "incompatible vector types for "
9633 "invariants\n");
9634 return false;
9635 }
9636 /* loop cost for vec_loop. */
9637 inside_cost
9638 = record_stmt_cost (cost_vec,
9639 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
9640 vector_stmt, stmt_info, 0, vect_body);
9641 /* prologue cost for vec_init (if not nested) and step. */
9642 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
9643 scalar_to_vec,
9644 stmt_info, 0, vect_prologue);
9645 }
9646 else /* if (!slp_node) */
9647 {
9648 /* loop cost for vec_loop. */
9649 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
9650 stmt_info, 0, vect_body);
9651 /* prologue cost for vec_init and vec_step. */
9652 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
9653 stmt_info, 0, vect_prologue);
9654 }
9655 if (dump_enabled_p ())
9656 dump_printf_loc (MSG_NOTE, vect_location,
9657 "vect_model_induction_cost: inside_cost = %d, "
9658 "prologue_cost = %d .\n", inside_cost,
9659 prologue_cost);
9660
9661 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
9662 DUMP_VECT_SCOPE ("vectorizable_induction");
9663 return true;
9664 }
9665
9666 /* Transform. */
9667
9668 /* Compute a vector variable, initialized with the first VF values of
9669 the induction variable. E.g., for an iv with IV_PHI='X' and
9670 evolution S, for a vector of 4 units, we want to compute:
9671 [X, X + S, X + 2*S, X + 3*S]. */
9672
9673 if (dump_enabled_p ())
9674 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
9675
9676 pe = loop_preheader_edge (iv_loop);
9677 /* Find the first insertion point in the BB. */
9678 basic_block bb = gimple_bb (phi);
9679 si = gsi_after_labels (bb);
9680
9681 /* For SLP induction we have to generate several IVs as for example
9682 with group size 3 we need
9683 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
9684 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
9685 if (slp_node)
9686 {
9687 /* Enforced above. */
9688 unsigned int const_nunits = nunits.to_constant ();
9689
9690 /* The initial values are vectorized, but any lanes > group_size
9691 need adjustment. */
9692 slp_tree init_node
9693 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
9694
9695 /* Gather steps. Since we do not vectorize inductions as
9696 cycles we have to reconstruct the step from SCEV data. */
9697 unsigned group_size = SLP_TREE_LANES (slp_node);
9698 tree *steps = XALLOCAVEC (tree, group_size);
9699 tree *inits = XALLOCAVEC (tree, group_size);
9700 stmt_vec_info phi_info;
9701 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
9702 {
9703 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
9704 if (!init_node)
9705 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
9706 pe->dest_idx);
9707 }
9708
9709 /* Now generate the IVs. */
9710 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
9711 gcc_assert ((const_nunits * nvects) % group_size == 0);
9712 unsigned nivs;
9713 if (nested_in_vect_loop)
9714 nivs = nvects;
9715 else
9716 {
9717 /* Compute the number of distinct IVs we need. First reduce
9718 group_size if it is a multiple of const_nunits so we get
9719 one IV for a group_size of 4 but const_nunits 2. */
9720 unsigned group_sizep = group_size;
9721 if (group_sizep % const_nunits == 0)
9722 group_sizep = group_sizep / const_nunits;
9723 nivs = least_common_multiple (group_sizep,
9724 const_nunits) / const_nunits;
9725 }
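/* Illustrative example: group_size 6 with const_nunits 4 gives
   nivs = least_common_multiple (6, 4) / 4 = 3 distinct IVs, while
   group_size 4 with const_nunits 2 reduces to a single IV.  */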
9726 tree stept = TREE_TYPE (step_vectype);
9727 tree lupdate_mul = NULL_TREE;
9728 if (!nested_in_vect_loop)
9729 {
9730 /* The number of iterations covered in one vector iteration. */
9731 unsigned lup_mul = (nvects * const_nunits) / group_size;
9732 lupdate_mul
9733 = build_vector_from_val (step_vectype,
9734 SCALAR_FLOAT_TYPE_P (stept)
9735 ? build_real_from_wide (stept, lup_mul,
9736 UNSIGNED)
9737 : build_int_cstu (stept, lup_mul));
9738 }
9739 tree peel_mul = NULL_TREE;
9740 gimple_seq init_stmts = NULL;
9741 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
9742 {
9743 if (SCALAR_FLOAT_TYPE_P (stept))
9744 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
9745 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9746 else
9747 peel_mul = gimple_convert (&init_stmts, stept,
9748 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
9749 peel_mul = gimple_build_vector_from_val (&init_stmts,
9750 step_vectype, peel_mul);
9751 }
9752 unsigned ivn;
9753 auto_vec<tree> vec_steps;
9754 for (ivn = 0; ivn < nivs; ++ivn)
9755 {
9756 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
9757 tree_vector_builder init_elts (vectype, const_nunits, 1);
9758 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
9759 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
9760 {
9761 /* The scalar steps of the IVs. */
9762 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
9763 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
9764 step_elts.quick_push (elt);
9765 if (!init_node)
9766 {
9767 /* The scalar inits of the IVs if not vectorized. */
9768 elt = inits[(ivn*const_nunits + eltn) % group_size];
9769 if (!useless_type_conversion_p (TREE_TYPE (vectype),
9770 TREE_TYPE (elt)))
9771 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
9772 TREE_TYPE (vectype), elt);
9773 init_elts.quick_push (elt);
9774 }
9775 /* The number of steps to add to the initial values. */
9776 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
9777 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
9778 ? build_real_from_wide (stept,
9779 mul_elt, UNSIGNED)
9780 : build_int_cstu (stept, mul_elt));
9781 }
9782 vec_step = gimple_build_vector (&init_stmts, &step_elts);
9783 vec_steps.safe_push (vec_step);
9784 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
9785 if (peel_mul)
9786 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9787 step_mul, peel_mul);
9788 if (!init_node)
9789 vec_init = gimple_build_vector (&init_stmts, &init_elts);
9790
9791 /* Create the induction-phi that defines the induction-operand. */
9792 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
9793 "vec_iv_");
9794 induction_phi = create_phi_node (vec_dest, iv_loop->header);
9795 induc_def = PHI_RESULT (induction_phi);
9796
9797 /* Create the iv update inside the loop */
9798 tree up = vec_step;
9799 if (lupdate_mul)
9800 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9801 vec_step, lupdate_mul);
9802 gimple_seq stmts = NULL;
9803 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
9804 vec_def = gimple_build (&stmts,
9805 PLUS_EXPR, step_vectype, vec_def, up);
9806 vec_def = gimple_convert (&stmts, vectype, vec_def);
9807 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9808 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
9809 UNKNOWN_LOCATION);
9810
9811 if (init_node)
9812 vec_init = vect_get_slp_vect_def (init_node, ivn);
9813 if (!nested_in_vect_loop
9814 && !integer_zerop (step_mul))
9815 {
9816 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
9817 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9818 vec_step, step_mul);
9819 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
9820 vec_def, up);
9821 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
9822 }
9823
9824 /* Set the arguments of the phi node: */
9825 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
9826
9827 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
9828 }
9829 if (!nested_in_vect_loop)
9830 {
9831 /* Fill up to the number of vectors we need for the whole group. */
9832 nivs = least_common_multiple (group_size,
9833 const_nunits) / const_nunits;
9834 vec_steps.reserve (nivs-ivn);
9835 for (; ivn < nivs; ++ivn)
9836 {
9837 SLP_TREE_VEC_STMTS (slp_node)
9838 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
9839 vec_steps.quick_push (vec_steps[0]);
9840 }
9841 }
9842
9843 /* Re-use IVs when we can. We are generating further vector
9844 stmts by adding VF' * stride to the IVs generated above. */
9845 if (ivn < nvects)
9846 {
9847 unsigned vfp
9848 = least_common_multiple (group_size, const_nunits) / group_size;
9849 tree lupdate_mul
9850 = build_vector_from_val (step_vectype,
9851 SCALAR_FLOAT_TYPE_P (stept)
9852 ? build_real_from_wide (stept,
9853 vfp, UNSIGNED)
9854 : build_int_cstu (stept, vfp));
9855 for (; ivn < nvects; ++ivn)
9856 {
9857 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
9858 tree def = gimple_get_lhs (iv);
9859 if (ivn < 2*nivs)
9860 vec_steps[ivn - nivs]
9861 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
9862 vec_steps[ivn - nivs], lupdate_mul);
9863 gimple_seq stmts = NULL;
9864 def = gimple_convert (&stmts, step_vectype, def);
9865 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
9866 def, vec_steps[ivn % nivs]);
9867 def = gimple_convert (&stmts, vectype, def);
9868 if (gimple_code (iv) == GIMPLE_PHI)
9869 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
9870 else
9871 {
9872 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
9873 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
9874 }
9875 SLP_TREE_VEC_STMTS (slp_node)
9876 .quick_push (SSA_NAME_DEF_STMT (def));
9877 }
9878 }
9879
9880 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
9881 gcc_assert (!new_bb);
9882
9883 return true;
9884 }
9885
9886 init_expr = vect_phi_initial_value (phi);
9887
9888 gimple_seq stmts = NULL;
9889 if (!nested_in_vect_loop)
9890 {
9891 /* Convert the initial value to the IV update type. */
9892 tree new_type = TREE_TYPE (step_expr);
9893 init_expr = gimple_convert (&stmts, new_type, init_expr);
9894
9895 /* If we are using the loop mask to "peel" for alignment then we need
9896 to adjust the start value here. */
9897 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9898 if (skip_niters != NULL_TREE)
9899 {
9900 if (FLOAT_TYPE_P (vectype))
9901 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
9902 skip_niters);
9903 else
9904 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
9905 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
9906 skip_niters, step_expr);
9907 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
9908 init_expr, skip_step);
9909 }
9910 }
9911
9912 if (stmts)
9913 {
9914 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9915 gcc_assert (!new_bb);
9916 }
9917
9918 /* Create the vector that holds the initial_value of the induction. */
9919 if (nested_in_vect_loop)
9920 {
9921 /* iv_loop is nested in the loop to be vectorized. init_expr had already
9922 been created during vectorization of previous stmts. We obtain it
9923 from the STMT_VINFO_VEC_STMT of the defining stmt. */
9924 auto_vec<tree> vec_inits;
9925 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
9926 init_expr, &vec_inits);
9927 vec_init = vec_inits[0];
9928 /* If the initial value is not of proper type, convert it. */
9929 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
9930 {
9931 new_stmt
9932 = gimple_build_assign (vect_get_new_ssa_name (vectype,
9933 vect_simple_var,
9934 "vec_iv_"),
9935 VIEW_CONVERT_EXPR,
9936 build1 (VIEW_CONVERT_EXPR, vectype,
9937 vec_init));
9938 vec_init = gimple_assign_lhs (new_stmt);
9939 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
9940 new_stmt);
9941 gcc_assert (!new_bb);
9942 }
9943 }
9944 else
9945 {
9946 /* iv_loop is the loop to be vectorized. Create:
9947 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
9948 stmts = NULL;
9949 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
9950
9951 unsigned HOST_WIDE_INT const_nunits;
9952 if (nunits.is_constant (&const_nunits))
9953 {
9954 tree_vector_builder elts (step_vectype, const_nunits, 1);
9955 elts.quick_push (new_name);
9956 for (i = 1; i < const_nunits; i++)
9957 {
9958 /* Create: new_name_i = new_name + step_expr */
9959 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
9960 new_name, step_expr);
9961 elts.quick_push (new_name);
9962 }
9963 /* Create a vector from [new_name_0, new_name_1, ...,
9964 new_name_nunits-1] */
9965 vec_init = gimple_build_vector (&stmts, &elts);
9966 }
9967 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
9968 /* Build the initial value directly from a VEC_SERIES_EXPR. */
9969 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
9970 new_name, step_expr);
9971 else
9972 {
9973 /* Build:
9974 [base, base, base, ...]
9975 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
9976 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
9977 gcc_assert (flag_associative_math);
9978 tree index = build_index_vector (step_vectype, 0, 1);
9979 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
9980 new_name);
9981 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
9982 step_expr);
9983 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
9984 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
9985 vec_init, step_vec);
9986 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
9987 vec_init, base_vec);
9988 }
9989 vec_init = gimple_convert (&stmts, vectype, vec_init);
9990
9991 if (stmts)
9992 {
9993 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
9994 gcc_assert (!new_bb);
9995 }
9996 }
9997
9998
9999 /* Create the vector that holds the step of the induction. */
10000 if (nested_in_vect_loop)
10001 /* iv_loop is nested in the loop to be vectorized. Generate:
10002 vec_step = [S, S, S, S] */
10003 new_name = step_expr;
10004 else
10005 {
10006 /* iv_loop is the loop to be vectorized. Generate:
10007 vec_step = [VF*S, VF*S, VF*S, VF*S] */
10008 gimple_seq seq = NULL;
10009 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10010 {
10011 expr = build_int_cst (integer_type_node, vf);
10012 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10013 }
10014 else
10015 expr = build_int_cst (TREE_TYPE (step_expr), vf);
10016 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10017 expr, step_expr);
10018 if (seq)
10019 {
10020 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10021 gcc_assert (!new_bb);
10022 }
10023 }
10024
10025 t = unshare_expr (new_name);
10026 gcc_assert (CONSTANT_CLASS_P (new_name)
10027 || TREE_CODE (new_name) == SSA_NAME);
10028 new_vec = build_vector_from_val (step_vectype, t);
10029 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10030 new_vec, step_vectype, NULL);
10031
10032
10033 /* Create the following def-use cycle:
10034 loop prolog:
10035 vec_init = ...
10036 vec_step = ...
10037 loop:
10038 vec_iv = PHI <vec_init, vec_loop>
10039 ...
10040 STMT
10041 ...
10042 vec_loop = vec_iv + vec_step; */
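/* As an illustration, for a scalar IV with init X = 0, step S = 1 and
   VF = 4 this yields vec_init = {0, 1, 2, 3} and vec_step = {4, 4, 4, 4},
   so each vector iteration advances every lane by VF scalar steps.  */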
10043
10044 /* Create the induction-phi that defines the induction-operand. */
10045 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
10046 induction_phi = create_phi_node (vec_dest, iv_loop->header);
10047 induc_def = PHI_RESULT (induction_phi);
10048
10049 /* Create the iv update inside the loop */
10050 stmts = NULL;
10051 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
10052 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
10053 vec_def = gimple_convert (&stmts, vectype, vec_def);
10054 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10055 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10056
10057 /* Set the arguments of the phi node: */
10058 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
10059 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10060 UNKNOWN_LOCATION);
10061
10062 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
10063 *vec_stmt = induction_phi;
10064
 10065 /* In case the vectorization factor (VF) is bigger than the number
 10066 of elements that we can fit in a vectype (nunits), we have to generate
 10067 more than one vector stmt - i.e., we need to "unroll" the
 10068 vector stmt by a factor of VF/nunits. For more details see the
 10069 documentation in vectorizable_operation. */
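/* Illustrative example: with VF = 8 and nunits = 4 we get ncopies = 2;
   the first add below produces the second copy of the IV and the final
   add produces the value fed back to the induction PHI over the latch
   edge.  */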
10070
10071 if (ncopies > 1)
10072 {
10073 gimple_seq seq = NULL;
10074 /* FORNOW. This restriction should be relaxed. */
10075 gcc_assert (!nested_in_vect_loop);
10076
10077 /* Create the vector that holds the step of the induction. */
10078 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
10079 {
10080 expr = build_int_cst (integer_type_node, nunits);
10081 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
10082 }
10083 else
10084 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
10085 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
10086 expr, step_expr);
10087 if (seq)
10088 {
10089 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
10090 gcc_assert (!new_bb);
10091 }
10092
10093 t = unshare_expr (new_name);
10094 gcc_assert (CONSTANT_CLASS_P (new_name)
10095 || TREE_CODE (new_name) == SSA_NAME);
10096 new_vec = build_vector_from_val (step_vectype, t);
10097 vec_step = vect_init_vector (loop_vinfo, stmt_info,
10098 new_vec, step_vectype, NULL);
10099
10100 vec_def = induc_def;
10101 for (i = 1; i < ncopies + 1; i++)
10102 {
10103 /* vec_i = vec_prev + vec_step */
10104 gimple_seq stmts = NULL;
10105 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
10106 vec_def = gimple_build (&stmts,
10107 PLUS_EXPR, step_vectype, vec_def, vec_step);
10108 vec_def = gimple_convert (&stmts, vectype, vec_def);
10109
10110 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10111 if (i < ncopies)
10112 {
10113 new_stmt = SSA_NAME_DEF_STMT (vec_def);
10114 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
10115 }
10116 else
10117 {
10118 /* vec_1 = vec_iv + (VF/n * S)
10119 vec_2 = vec_1 + (VF/n * S)
10120 ...
10121 vec_n = vec_prev + (VF/n * S) = vec_iv + VF * S = vec_loop
10122
10123 vec_n is used as vec_loop to save the large step register and
10124 related operations. */
10125 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
10126 UNKNOWN_LOCATION);
10127 }
10128 }
10129 }
10130
10131 if (dump_enabled_p ())
10132 dump_printf_loc (MSG_NOTE, vect_location,
10133 "transform induction: created def-use cycle: %G%G",
10134 (gimple *) induction_phi, SSA_NAME_DEF_STMT (vec_def));
10135
10136 return true;
10137 }
10138
10139 /* Function vectorizable_live_operation.
10140
10141 STMT_INFO computes a value that is used outside the loop. Check if
10142 it can be supported. */
10143
10144 bool
10145 vectorizable_live_operation (vec_info *vinfo,
10146 stmt_vec_info stmt_info,
10147 gimple_stmt_iterator *gsi,
10148 slp_tree slp_node, slp_instance slp_node_instance,
10149 int slp_index, bool vec_stmt_p,
10150 stmt_vector_for_cost *cost_vec)
10151 {
10152 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
10153 imm_use_iterator imm_iter;
10154 tree lhs, lhs_type, bitsize;
10155 tree vectype = (slp_node
10156 ? SLP_TREE_VECTYPE (slp_node)
10157 : STMT_VINFO_VECTYPE (stmt_info));
10158 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
10159 int ncopies;
10160 gimple *use_stmt;
10161 auto_vec<tree> vec_oprnds;
10162 int vec_entry = 0;
10163 poly_uint64 vec_index = 0;
10164
10165 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
10166
10167 /* If a stmt of a reduction is live, vectorize it via
10168 vect_create_epilog_for_reduction. vectorizable_reduction assessed
10169 validity so just trigger the transform here. */
10170 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
10171 {
10172 if (!vec_stmt_p)
10173 return true;
10174 if (slp_node)
10175 {
10176 /* For reduction chains the meta-info is attached to
10177 the group leader. */
10178 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
10179 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
10180 /* For SLP reductions we vectorize the epilogue for
10181 all involved stmts together. */
10182 else if (slp_index != 0)
10183 return true;
10184 }
10185 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
10186 gcc_assert (reduc_info->is_reduc_info);
10187 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
10188 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
10189 return true;
10190 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
10191 slp_node_instance);
10192 return true;
10193 }
10194
10195 /* If STMT is not relevant and it is a simple assignment and its inputs are
10196 invariant then it can remain in place, unvectorized. The original last
10197 scalar value that it computes will be used. */
10198 if (!STMT_VINFO_RELEVANT_P (stmt_info))
10199 {
10200 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
10201 if (dump_enabled_p ())
10202 dump_printf_loc (MSG_NOTE, vect_location,
10203 "statement is simple and uses invariant. Leaving in "
10204 "place.\n");
10205 return true;
10206 }
10207
10208 if (slp_node)
10209 ncopies = 1;
10210 else
10211 ncopies = vect_get_num_copies (loop_vinfo, vectype);
10212
10213 if (slp_node)
10214 {
10215 gcc_assert (slp_index >= 0);
10216
10217 /* Get the last occurrence of the scalar index from the concatenation of
10218 all the slp vectors. Calculate which slp vector it is and the index
10219 within. */
10220 int num_scalar = SLP_TREE_LANES (slp_node);
10221 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
10222 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
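/* For example (illustrative), with num_vec = 2, nunits = 4 and
   num_scalar = 3, lane slp_index 0 occurs last at pos = 8 - 3 + 0 = 5,
   i.e. in vector 1, lane 1.  */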
10223
10224 /* Calculate which vector contains the result, and which lane of
10225 that vector we need. */
10226 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
10227 {
10228 if (dump_enabled_p ())
10229 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10230 "Cannot determine which vector holds the"
10231 " final result.\n");
10232 return false;
10233 }
10234 }
10235
10236 if (!vec_stmt_p)
10237 {
10238 /* No transformation required. */
10239 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
10240 {
10241 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
10242 OPTIMIZE_FOR_SPEED))
10243 {
10244 if (dump_enabled_p ())
10245 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10246 "can't operate on partial vectors "
10247 "because the target doesn't support extract "
10248 "last reduction.\n");
10249 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10250 }
10251 else if (slp_node)
10252 {
10253 if (dump_enabled_p ())
10254 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10255 "can't operate on partial vectors "
10256 "because an SLP statement is live after "
10257 "the loop.\n");
10258 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10259 }
10260 else if (ncopies > 1)
10261 {
10262 if (dump_enabled_p ())
10263 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10264 "can't operate on partial vectors "
10265 "because ncopies is greater than 1.\n");
10266 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
10267 }
10268 else
10269 {
10270 gcc_assert (ncopies == 1 && !slp_node);
10271 vect_record_loop_mask (loop_vinfo,
10272 &LOOP_VINFO_MASKS (loop_vinfo),
10273 1, vectype, NULL);
10274 }
10275 }
10276 /* ??? Enable for loop costing as well. */
10277 if (!loop_vinfo)
10278 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
10279 0, vect_epilogue);
10280 return true;
10281 }
10282
10283 /* Use the lhs of the original scalar statement. */
10284 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
10285 if (dump_enabled_p ())
10286 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
10287 "stmt %G", stmt);
10288
10289 lhs = gimple_get_lhs (stmt);
10290 lhs_type = TREE_TYPE (lhs);
10291
10292 bitsize = vector_element_bits_tree (vectype);
10293
10294 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
10295 tree vec_lhs, bitstart;
10296 gimple *vec_stmt;
10297 if (slp_node)
10298 {
10299 gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
10300
10301 /* Get the correct slp vectorized stmt. */
10302 vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
10303 vec_lhs = gimple_get_lhs (vec_stmt);
10304
10305 /* Get entry to use. */
10306 bitstart = bitsize_int (vec_index);
10307 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
10308 }
10309 else
10310 {
10311 /* For multiple copies, get the last copy. */
10312 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
10313 vec_lhs = gimple_get_lhs (vec_stmt);
10314
10315 /* Get the last lane in the vector. */
10316 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
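/* For a V4SI vector, for example, this computes bitstart
   = 32 * 3 = 96, the bit offset of the last lane.  */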
10317 }
10318
10319 if (loop_vinfo)
10320 {
 10321 /* To ensure that VEC_LHS for the lane extraction stmts satisfies the
 10322 loop-closed PHI requirement, insert one phi node for it. It looks like:
10323 loop;
10324 BB:
10325 # lhs' = PHI <lhs>
10326 ==>
10327 loop;
10328 BB:
10329 # vec_lhs' = PHI <vec_lhs>
10330 new_tree = lane_extract <vec_lhs', ...>;
10331 lhs' = new_tree; */
10332
10333 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10334 basic_block exit_bb = single_exit (loop)->dest;
10335 gcc_assert (single_pred_p (exit_bb));
10336
10337 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
10338 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
10339 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
10340
10341 gimple_seq stmts = NULL;
10342 tree new_tree;
10343 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
10344 {
10345 /* Emit:
10346
10347 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
10348
10349 where VEC_LHS is the vectorized live-out result and MASK is
10350 the loop mask for the final iteration. */
10351 gcc_assert (ncopies == 1 && !slp_node);
10352 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
10353 tree mask = vect_get_loop_mask (loop_vinfo, gsi,
10354 &LOOP_VINFO_MASKS (loop_vinfo),
10355 1, vectype, 0);
10356 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
10357 mask, vec_lhs_phi);
10358
10359 /* Convert the extracted vector element to the scalar type. */
10360 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
10361 }
10362 else
10363 {
10364 tree bftype = TREE_TYPE (vectype);
10365 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10366 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10367 new_tree = build3 (BIT_FIELD_REF, bftype,
10368 vec_lhs_phi, bitsize, bitstart);
10369 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10370 &stmts, true, NULL_TREE);
10371 }
10372
10373 if (stmts)
10374 {
10375 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
10376 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
10377
10378 /* Remove existing phi from lhs and create one copy from new_tree. */
10379 tree lhs_phi = NULL_TREE;
10380 gimple_stmt_iterator gsi;
10381 for (gsi = gsi_start_phis (exit_bb);
10382 !gsi_end_p (gsi); gsi_next (&gsi))
10383 {
10384 gimple *phi = gsi_stmt (gsi);
10385 if ((gimple_phi_arg_def (phi, 0) == lhs))
10386 {
10387 remove_phi_node (&gsi, false);
10388 lhs_phi = gimple_phi_result (phi);
10389 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
10390 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
10391 break;
10392 }
10393 }
10394 }
10395
10396 /* Replace use of lhs with newly computed result. If the use stmt is a
 10397 single arg PHI, just replace all uses of the PHI result. This is necessary
 10398 because the lcssa PHI defining lhs may come before the newly inserted stmt. */
10399 use_operand_p use_p;
10400 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10401 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
10402 && !is_gimple_debug (use_stmt))
10403 {
10404 if (gimple_code (use_stmt) == GIMPLE_PHI
10405 && gimple_phi_num_args (use_stmt) == 1)
10406 {
10407 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
10408 }
10409 else
10410 {
10411 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10412 SET_USE (use_p, new_tree);
10413 }
10414 update_stmt (use_stmt);
10415 }
10416 }
10417 else
10418 {
10419 /* For basic-block vectorization simply insert the lane-extraction. */
10420 tree bftype = TREE_TYPE (vectype);
10421 if (VECTOR_BOOLEAN_TYPE_P (vectype))
10422 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
10423 tree new_tree = build3 (BIT_FIELD_REF, bftype,
10424 vec_lhs, bitsize, bitstart);
10425 gimple_seq stmts = NULL;
10426 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
10427 &stmts, true, NULL_TREE);
10428 if (TREE_CODE (new_tree) == SSA_NAME
10429 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
10430 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
10431 if (is_a <gphi *> (vec_stmt))
10432 {
10433 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
10434 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
10435 }
10436 else
10437 {
10438 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
10439 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
10440 }
10441
10442 /* Replace use of lhs with newly computed result. If the use stmt is a
 10443 single arg PHI, just replace all uses of the PHI result. This is necessary
 10444 because the lcssa PHI defining lhs may come before the newly inserted stmt. */
10445 use_operand_p use_p;
10446 stmt_vec_info use_stmt_info;
10447 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
10448 if (!is_gimple_debug (use_stmt)
10449 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
10450 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
10451 {
10452 /* ??? This can happen when the live lane ends up being
10453 used in a vector construction code-generated by an
10454 external SLP node (and code-generation for that already
10455 happened). See gcc.dg/vect/bb-slp-47.c.
10456 Doing this is what would happen if that vector CTOR
10457 were not code-generated yet so it is not too bad.
10458 ??? In fact we'd likely want to avoid this situation
10459 in the first place. */
10460 if (TREE_CODE (new_tree) == SSA_NAME
10461 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10462 && gimple_code (use_stmt) != GIMPLE_PHI
10463 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
10464 use_stmt))
10465 {
10466 enum tree_code code = gimple_assign_rhs_code (use_stmt);
10467 gcc_checking_assert (code == SSA_NAME
10468 || code == CONSTRUCTOR
10469 || code == VIEW_CONVERT_EXPR
10470 || CONVERT_EXPR_CODE_P (code));
10471 if (dump_enabled_p ())
10472 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10473 "Using original scalar computation for "
10474 "live lane because use preceeds vector "
10475 "def\n");
10476 continue;
10477 }
10478 /* ??? It can also happen that we end up pulling a def into
10479 a loop where replacing out-of-loop uses would require
10480 a new LC SSA PHI node. Retain the original scalar in
10481 those cases as well. PR98064. */
10482 if (TREE_CODE (new_tree) == SSA_NAME
10483 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
10484 && (gimple_bb (use_stmt)->loop_father
10485 != gimple_bb (vec_stmt)->loop_father)
10486 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
10487 gimple_bb (use_stmt)->loop_father))
10488 {
10489 if (dump_enabled_p ())
10490 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
10491 "Using original scalar computation for "
10492 "live lane because there is an out-of-loop "
10493 "definition for it\n");
10494 continue;
10495 }
10496 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
10497 SET_USE (use_p, new_tree);
10498 update_stmt (use_stmt);
10499 }
10500 }
10501
10502 return true;
10503 }
10504
10505 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
10506
10507 static void
10508 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
10509 {
10510 ssa_op_iter op_iter;
10511 imm_use_iterator imm_iter;
10512 def_operand_p def_p;
10513 gimple *ustmt;
10514
10515 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
10516 {
10517 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
10518 {
10519 basic_block bb;
10520
10521 if (!is_gimple_debug (ustmt))
10522 continue;
10523
10524 bb = gimple_bb (ustmt);
10525
10526 if (!flow_bb_inside_loop_p (loop, bb))
10527 {
10528 if (gimple_debug_bind_p (ustmt))
10529 {
10530 if (dump_enabled_p ())
10531 dump_printf_loc (MSG_NOTE, vect_location,
10532 "killing debug use\n");
10533
10534 gimple_debug_bind_reset_value (ustmt);
10535 update_stmt (ustmt);
10536 }
10537 else
10538 gcc_unreachable ();
10539 }
10540 }
10541 }
10542 }
10543
10544 /* Given loop represented by LOOP_VINFO, return true if computation of
10545 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
10546 otherwise. */
10547
10548 static bool
10549 loop_niters_no_overflow (loop_vec_info loop_vinfo)
10550 {
10551 /* Constant case. */
10552 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
10553 {
10554 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
10555 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
10556
10557 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
10558 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
10559 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
10560 return true;
10561 }
10562
10563 widest_int max;
10564 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10565 /* Check the upper bound of loop niters. */
10566 if (get_max_loop_iterations (loop, &max))
10567 {
10568 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
10569 signop sgn = TYPE_SIGN (type);
10570 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
10571 if (max < type_max)
10572 return true;
10573 }
10574 return false;
10575 }
10576
10577 /* Return a mask type with half the number of elements as OLD_TYPE,
10578 given that it should have mode NEW_MODE. */
10579
10580 tree
10581 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
10582 {
10583 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
10584 return build_truth_vector_type_for_mode (nunits, new_mode);
10585 }
10586
10587 /* Return a mask type with twice as many elements as OLD_TYPE,
10588 given that it should have mode NEW_MODE. */
10589
10590 tree
10591 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
10592 {
10593 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
10594 return build_truth_vector_type_for_mode (nunits, new_mode);
10595 }
10596
10597 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
10598 contain a sequence of NVECTORS masks that each control a vector of type
10599 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
10600 these vector masks with the vector version of SCALAR_MASK. */
10601
10602 void
10603 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
10604 unsigned int nvectors, tree vectype, tree scalar_mask)
10605 {
10606 gcc_assert (nvectors != 0);
10607
10608 if (scalar_mask)
10609 {
10610 scalar_cond_masked_key cond (scalar_mask, nvectors);
10611 loop_vinfo->scalar_cond_masked_set.add (cond);
10612 }
10613
10614 masks->mask_set.add (std::make_pair (vectype, nvectors));
10615 }
10616
10617 /* Given a complete set of masks MASKS, extract mask number INDEX
10618 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10619 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
10620
10621 See the comment above vec_loop_masks for more details about the mask
10622 arrangement. */
10623
10624 tree
10625 vect_get_loop_mask (loop_vec_info loop_vinfo,
10626 gimple_stmt_iterator *gsi, vec_loop_masks *masks,
10627 unsigned int nvectors, tree vectype, unsigned int index)
10628 {
10629 if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10630 == vect_partial_vectors_while_ult)
10631 {
10632 rgroup_controls *rgm = &(masks->rgc_vec)[nvectors - 1];
10633 tree mask_type = rgm->type;
10634
10635 /* Populate the rgroup's mask array, if this is the first time we've
10636 used it. */
10637 if (rgm->controls.is_empty ())
10638 {
10639 rgm->controls.safe_grow_cleared (nvectors, true);
10640 for (unsigned int i = 0; i < nvectors; ++i)
10641 {
10642 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
10643 /* Provide a dummy definition until the real one is available. */
10644 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10645 rgm->controls[i] = mask;
10646 }
10647 }
10648
10649 tree mask = rgm->controls[index];
10650 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
10651 TYPE_VECTOR_SUBPARTS (vectype)))
10652 {
10653 /* A loop mask for data type X can be reused for data type Y
10654 if X has N times more elements than Y and if Y's elements
10655 are N times bigger than X's. In this case each sequence
10656 of N elements in the loop mask will be all-zero or all-one.
10657 We can then view-convert the mask so that each sequence of
10658 N elements is replaced by a single element. */
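/* For instance, a 16-element mask computed for V16QI data can be reused
   for V8HI data: each adjacent pair of mask elements is known to be
   equal, and the view-convert keeps one element per pair.  */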
10659 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
10660 TYPE_VECTOR_SUBPARTS (vectype)));
10661 gimple_seq seq = NULL;
10662 mask_type = truth_type_for (vectype);
10663 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
10664 if (seq)
10665 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10666 }
10667 return mask;
10668 }
10669 else if (LOOP_VINFO_PARTIAL_VECTORS_STYLE (loop_vinfo)
10670 == vect_partial_vectors_avx512)
10671 {
10672 /* The number of scalars per iteration and the number of vectors are
10673 both compile-time constants. */
10674 unsigned int nscalars_per_iter
10675 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10676 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10677
10678 rgroup_controls *rgm = &masks->rgc_vec[nscalars_per_iter - 1];
10679
10680 /* The stored nV is dependent on the mask type produced. */
10681 gcc_assert (exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10682 TYPE_VECTOR_SUBPARTS (rgm->type)).to_constant ()
10683 == rgm->factor);
10684 nvectors = rgm->factor;
10685
10686 /* Populate the rgroup's mask array, if this is the first time we've
10687 used it. */
10688 if (rgm->controls.is_empty ())
10689 {
10690 rgm->controls.safe_grow_cleared (nvectors, true);
10691 for (unsigned int i = 0; i < nvectors; ++i)
10692 {
10693 tree mask = make_temp_ssa_name (rgm->type, NULL, "loop_mask");
10694 /* Provide a dummy definition until the real one is available. */
10695 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
10696 rgm->controls[i] = mask;
10697 }
10698 }
10699 if (known_eq (TYPE_VECTOR_SUBPARTS (rgm->type),
10700 TYPE_VECTOR_SUBPARTS (vectype)))
10701 return rgm->controls[index];
10702
 10703 /* Split the vector if needed. Since we are dealing with integer mode
 10704 masks with AVX512 we can operate on the integer representation,
 10705 performing the split with a whole-vector shift. */
10706 unsigned HOST_WIDE_INT factor;
10707 bool ok = constant_multiple_p (TYPE_VECTOR_SUBPARTS (rgm->type),
10708 TYPE_VECTOR_SUBPARTS (vectype), &factor);
10709 gcc_assert (ok);
10710 gcc_assert (GET_MODE_CLASS (TYPE_MODE (rgm->type)) == MODE_INT);
10711 tree mask_type = truth_type_for (vectype);
10712 gcc_assert (GET_MODE_CLASS (TYPE_MODE (mask_type)) == MODE_INT);
10713 unsigned vi = index / factor;
10714 unsigned vpart = index % factor;
10715 tree vec = rgm->controls[vi];
10716 gimple_seq seq = NULL;
10717 vec = gimple_build (&seq, VIEW_CONVERT_EXPR,
10718 lang_hooks.types.type_for_mode
10719 (TYPE_MODE (rgm->type), 1), vec);
10720 /* For integer mode masks simply shift the right bits into position. */
10721 if (vpart != 0)
10722 vec = gimple_build (&seq, RSHIFT_EXPR, TREE_TYPE (vec), vec,
10723 build_int_cst (integer_type_node,
10724 (TYPE_VECTOR_SUBPARTS (vectype)
10725 * vpart)));
10726 vec = gimple_convert (&seq, lang_hooks.types.type_for_mode
10727 (TYPE_MODE (mask_type), 1), vec);
10728 vec = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, vec);
10729 if (seq)
10730 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10731 return vec;
10732 }
10733 else
10734 gcc_unreachable ();
10735 }
10736
10737 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
10738 lengths for controlling an operation on VECTYPE. The operation splits
10739 each element of VECTYPE into FACTOR separate subelements, measuring the
10740 length as a number of these subelements. */
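/* For example (illustrative), an access on V4SI that has to be emulated
   as a V16QI access would use FACTOR 4 and measure the length in
   byte-sized subelements.  */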
10741
10742 void
10743 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
10744 unsigned int nvectors, tree vectype, unsigned int factor)
10745 {
10746 gcc_assert (nvectors != 0);
10747 if (lens->length () < nvectors)
10748 lens->safe_grow_cleared (nvectors, true);
10749 rgroup_controls *rgl = &(*lens)[nvectors - 1];
10750
 10751 /* The number of scalars per iteration, the bytes each scalar occupies
 10752 and the number of vectors are all compile-time constants. */
10753 unsigned int nscalars_per_iter
10754 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
10755 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
10756
10757 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
10758 {
10759 /* For now, we only support cases in which all loads and stores fall back
10760 to VnQI or none do. */
10761 gcc_assert (!rgl->max_nscalars_per_iter
10762 || (rgl->factor == 1 && factor == 1)
10763 || (rgl->max_nscalars_per_iter * rgl->factor
10764 == nscalars_per_iter * factor));
10765 rgl->max_nscalars_per_iter = nscalars_per_iter;
10766 rgl->type = vectype;
10767 rgl->factor = factor;
10768 }
10769 }
10770
10771 /* Given a complete set of lengths LENS, extract length number INDEX
10772 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
10773 where 0 <= INDEX < NVECTORS. Return a value that contains FACTOR
 10774 multiplied by the number of elements that should be processed.
10775 Insert any set-up statements before GSI. */
10776
10777 tree
10778 vect_get_loop_len (loop_vec_info loop_vinfo, gimple_stmt_iterator *gsi,
10779 vec_loop_lens *lens, unsigned int nvectors, tree vectype,
10780 unsigned int index, unsigned int factor)
10781 {
10782 rgroup_controls *rgl = &(*lens)[nvectors - 1];
10783 bool use_bias_adjusted_len =
10784 LOOP_VINFO_PARTIAL_LOAD_STORE_BIAS (loop_vinfo) != 0;
10785
10786 /* Populate the rgroup's len array, if this is the first time we've
10787 used it. */
10788 if (rgl->controls.is_empty ())
10789 {
10790 rgl->controls.safe_grow_cleared (nvectors, true);
10791 for (unsigned int i = 0; i < nvectors; ++i)
10792 {
10793 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10794 gcc_assert (len_type != NULL_TREE);
10795
10796 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
10797
10798 /* Provide a dummy definition until the real one is available. */
10799 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
10800 rgl->controls[i] = len;
10801
10802 if (use_bias_adjusted_len)
10803 {
10804 gcc_assert (i == 0);
10805 tree adjusted_len =
10806 make_temp_ssa_name (len_type, NULL, "adjusted_loop_len");
10807 SSA_NAME_DEF_STMT (adjusted_len) = gimple_build_nop ();
10808 rgl->bias_adjusted_ctrl = adjusted_len;
10809 }
10810 }
10811 }
10812
10813 if (use_bias_adjusted_len)
10814 return rgl->bias_adjusted_ctrl;
10815
10816 tree loop_len = rgl->controls[index];
10817 if (rgl->factor == 1 && factor == 1)
10818 {
10819 poly_int64 nunits1 = TYPE_VECTOR_SUBPARTS (rgl->type);
10820 poly_int64 nunits2 = TYPE_VECTOR_SUBPARTS (vectype);
10821 if (maybe_ne (nunits1, nunits2))
10822 {
10823 /* A loop len for data type X can be reused for data type Y
10824 if X has N times more elements than Y and if Y's elements
10825 are N times bigger than X's. */
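/* E.g. a length computed for V16QI can be reused for V8HI by dividing
   it by 2, since each V8HI element covers two V16QI subelements.  */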
10826 gcc_assert (multiple_p (nunits1, nunits2));
10827 factor = exact_div (nunits1, nunits2).to_constant ();
10828 tree iv_type = LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo);
10829 gimple_seq seq = NULL;
10830 loop_len = gimple_build (&seq, RDIV_EXPR, iv_type, loop_len,
10831 build_int_cst (iv_type, factor));
10832 if (seq)
10833 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
10834 }
10835 }
10836 return loop_len;
10837 }
10838
10839 /* Scale profiling counters by estimation for LOOP which is vectorized
10840 by factor VF.
10841 If FLAT is true, the loop we started with had unrealistically flat
10842 profile. */
10843
10844 static void
10845 scale_profile_for_vect_loop (class loop *loop, unsigned vf, bool flat)
10846 {
10847 /* For flat profiles do not scale down proportionally by VF and only
10848 cap by known iteration count bounds. */
10849 if (flat)
10850 {
10851 if (dump_file && (dump_flags & TDF_DETAILS))
10852 fprintf (dump_file,
10853 "Vectorized loop profile seems flat; not scaling iteration "
10854 "count down by the vectorization factor %i\n", vf);
10855 scale_loop_profile (loop, profile_probability::always (),
10856 get_likely_max_loop_iterations_int (loop));
10857 return;
10858 }
 10859 /* The loop body executes VF times fewer iterations and the exit probability increases VF times. */
10860 edge exit_e = single_exit (loop);
10861 profile_count entry_count = loop_preheader_edge (loop)->count ();
10862
 10863 /* If we have an unreliable loop profile, avoid dropping the entry
 10864 count below the header count. This can happen when the loop
 10865 has an unrealistically low trip count. */
10866 while (vf > 1
10867 && loop->header->count > entry_count
10868 && loop->header->count < entry_count * vf)
10869 {
10870 if (dump_file && (dump_flags & TDF_DETAILS))
10871 fprintf (dump_file,
10872 "Vectorization factor %i seems too large for profile "
10873 "prevoiusly believed to be consistent; reducing.\n", vf);
10874 vf /= 2;
10875 }
10876
10877 if (entry_count.nonzero_p ())
10878 set_edge_probability_and_rescale_others
10879 (exit_e,
10880 entry_count.probability_in (loop->header->count / vf));
 10881 /* Avoid producing a very large exit probability when we do not have a
 10882 sensible profile. */
10883 else if (exit_e->probability < profile_probability::always () / (vf * 2))
10884 set_edge_probability_and_rescale_others (exit_e, exit_e->probability * vf);
10885 loop->latch->count = single_pred_edge (loop->latch)->count ();
10886
10887 scale_loop_profile (loop, profile_probability::always () / vf,
10888 get_likely_max_loop_iterations_int (loop));
10889 }
10890
10891 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
10892 latch edge values originally defined by it. */
10893
10894 static void
10895 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
10896 stmt_vec_info def_stmt_info)
10897 {
10898 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
10899 if (!def || TREE_CODE (def) != SSA_NAME)
10900 return;
10901 stmt_vec_info phi_info;
10902 imm_use_iterator iter;
10903 use_operand_p use_p;
10904 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
10905 {
10906 gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p));
10907 if (!phi)
10908 continue;
10909 if (!(gimple_bb (phi)->loop_father->header == gimple_bb (phi)
10910 && (phi_info = loop_vinfo->lookup_stmt (phi))
10911 && STMT_VINFO_RELEVANT_P (phi_info)))
10912 continue;
10913 loop_p loop = gimple_bb (phi)->loop_father;
10914 edge e = loop_latch_edge (loop);
10915 if (PHI_ARG_DEF_FROM_EDGE (phi, e) != def)
10916 continue;
10917
10918 if (VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
10919 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
10920 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
10921 {
10922 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
10923 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
10924 gcc_assert (phi_defs.length () == latch_defs.length ());
10925 for (unsigned i = 0; i < phi_defs.length (); ++i)
10926 add_phi_arg (as_a <gphi *> (phi_defs[i]),
10927 gimple_get_lhs (latch_defs[i]), e,
10928 gimple_phi_arg_location (phi, e->dest_idx));
10929 }
10930 else if (STMT_VINFO_DEF_TYPE (phi_info) == vect_first_order_recurrence)
10931 {
10932 /* For first order recurrences we have to update both uses of
10933 the latch definition, the one in the PHI node and the one
10934 in the generated VEC_PERM_EXPR. */
10935 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
10936 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
10937 gcc_assert (phi_defs.length () == latch_defs.length ());
10938 tree phidef = gimple_assign_rhs1 (phi_defs[0]);
10939 gphi *vphi = as_a <gphi *> (SSA_NAME_DEF_STMT (phidef));
10940 for (unsigned i = 0; i < phi_defs.length (); ++i)
10941 {
10942 gassign *perm = as_a <gassign *> (phi_defs[i]);
10943 if (i > 0)
10944 gimple_assign_set_rhs1 (perm, gimple_get_lhs (latch_defs[i-1]));
10945 gimple_assign_set_rhs2 (perm, gimple_get_lhs (latch_defs[i]));
10946 update_stmt (perm);
10947 }
10948 add_phi_arg (vphi, gimple_get_lhs (latch_defs.last ()), e,
10949 gimple_phi_arg_location (phi, e->dest_idx));
10950 }
10951 }
10952 }
10953
10954 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
10955 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
10956 stmt_vec_info. */
10957
10958 static bool
10959 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
10960 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
10961 {
10962 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10963 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
10964
10965 if (dump_enabled_p ())
10966 dump_printf_loc (MSG_NOTE, vect_location,
10967 "------>vectorizing statement: %G", stmt_info->stmt);
10968
10969 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
10970 vect_loop_kill_debug_uses (loop, stmt_info);
10971
10972 if (!STMT_VINFO_RELEVANT_P (stmt_info)
10973 && !STMT_VINFO_LIVE_P (stmt_info))
10974 return false;
10975
10976 if (STMT_VINFO_VECTYPE (stmt_info))
10977 {
10978 poly_uint64 nunits
10979 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
10980 if (!STMT_SLP_TYPE (stmt_info)
10981 && maybe_ne (nunits, vf)
10982 && dump_enabled_p ())
 10983 /* For SLP, VF is set according to the unrolling factor, and not
 10984 to the vector size, hence for SLP this print is not valid. */
10985 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
10986 }
10987
10988 /* Pure SLP statements have already been vectorized. We still need
10989 to apply loop vectorization to hybrid SLP statements. */
10990 if (PURE_SLP_STMT (stmt_info))
10991 return false;
10992
10993 if (dump_enabled_p ())
10994 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
10995
10996 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
10997 *seen_store = stmt_info;
10998
10999 return true;
11000 }
11001
11002 /* Helper function to pass to simplify_replace_tree to enable replacing tree's
11003 in the hash_map with its corresponding values. */
11004
11005 static tree
11006 find_in_mapping (tree t, void *context)
11007 {
11008 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
11009
11010 tree *value = mapping->get (t);
11011 return value ? *value : t;
11012 }
11013
11014 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
11015 original loop that has now been vectorized.
11016
11017 The inits of the data_references need to be advanced with the number of
11018 iterations of the main loop. This has been computed in vect_do_peeling and
 11019 is stored in the parameter ADVANCE. We first restore the data_references'
 11020 initial offsets with the values recorded in ORIG_DRS_INIT.
11021
11022 Since the loop_vec_info of this EPILOGUE was constructed for the original
11023 loop, its stmt_vec_infos all point to the original statements. These need
11024 to be updated to point to their corresponding copies as well as the SSA_NAMES
11025 in their PATTERN_DEF_SEQs and RELATED_STMTs.
11026
11027 The data_reference's connections also need to be updated. Their
11028 corresponding dr_vec_info need to be reconnected to the EPILOGUE's
11029 stmt_vec_infos, their statements need to point to their corresponding copy,
11030 if they are gather loads or scatter stores then their reference needs to be
11031 updated to point to its corresponding copy and finally we set
11032 'base_misaligned' to false as we have already peeled for alignment in the
11033 prologue of the main loop. */
11034
11035 static void
11036 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
11037 {
11038 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
11039 auto_vec<gimple *> stmt_worklist;
11040 hash_map<tree,tree> mapping;
11041 gimple *orig_stmt, *new_stmt;
11042 gimple_stmt_iterator epilogue_gsi;
11043 gphi_iterator epilogue_phi_gsi;
11044 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
11045 basic_block *epilogue_bbs = get_loop_body (epilogue);
11046 unsigned i;
11047
11048 free (LOOP_VINFO_BBS (epilogue_vinfo));
11049 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
11050
11051 /* Advance data_reference's with the number of iterations of the previous
11052 loop and its prologue. */
11053 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
11054
11055
11056 /* The EPILOGUE loop is a copy of the original loop so they share the same
11057 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
 11058 point to the copied statements. We also create a mapping of all LHSs in
 11059 the original loop and all the LHSs in the EPILOGUE and create worklists to
 11060 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
11061 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
11062 {
11063 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
11064 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
11065 {
11066 new_stmt = epilogue_phi_gsi.phi ();
11067
11068 gcc_assert (gimple_uid (new_stmt) > 0);
11069 stmt_vinfo
11070 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11071
11072 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11073 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11074
11075 mapping.put (gimple_phi_result (orig_stmt),
11076 gimple_phi_result (new_stmt));
11077 /* PHI nodes can not have patterns or related statements. */
11078 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
11079 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
11080 }
11081
11082 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
11083 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
11084 {
11085 new_stmt = gsi_stmt (epilogue_gsi);
11086 if (is_gimple_debug (new_stmt))
11087 continue;
11088
11089 gcc_assert (gimple_uid (new_stmt) > 0);
11090 stmt_vinfo
11091 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
11092
11093 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
11094 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
11095
11096 if (tree old_lhs = gimple_get_lhs (orig_stmt))
11097 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
11098
11099 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
11100 {
11101 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
11102 for (gimple_stmt_iterator gsi = gsi_start (seq);
11103 !gsi_end_p (gsi); gsi_next (&gsi))
11104 stmt_worklist.safe_push (gsi_stmt (gsi));
11105 }
11106
11107 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
11108 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
11109 {
11110 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
11111 stmt_worklist.safe_push (stmt);
11112 /* Set BB such that the assert in
11113 'get_initial_def_for_reduction' is able to determine that
11114 the BB of the related stmt is inside this loop. */
11115 gimple_set_bb (stmt,
11116 gimple_bb (new_stmt));
11117 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
11118 gcc_assert (related_vinfo == NULL
11119 || related_vinfo == stmt_vinfo);
11120 }
11121 }
11122 }
11123
11124 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
11125 using the original main loop and thus need to be updated to refer to the
11126 cloned variables used in the epilogue. */
11127 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
11128 {
11129 gimple *stmt = stmt_worklist[i];
11130 tree *new_op;
11131
11132 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
11133 {
11134 tree op = gimple_op (stmt, j);
11135 if ((new_op = mapping.get(op)))
11136 gimple_set_op (stmt, j, *new_op);
11137 else
11138 {
11139 /* PR92429: The last argument of simplify_replace_tree disables
11140 folding when replacing arguments. This is required as
11141 otherwise you might end up with different statements than the
11142 ones analyzed in vect_analyze_loop, leading to different
11143 vectorization. */
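/* For example (hypothetical): with folding enabled, replacing an operand
   could simplify a statement shaped like _5 = _7 + 0 into a plain copy,
   which would no longer match the statement shape that the analysis phase
   recorded.  */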
11144 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
11145 &find_in_mapping, &mapping, false);
11146 gimple_set_op (stmt, j, op);
11147 }
11148 }
11149 }
11150
11151 struct data_reference *dr;
11152 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
11153 FOR_EACH_VEC_ELT (datarefs, i, dr)
11154 {
11155 orig_stmt = DR_STMT (dr);
11156 gcc_assert (gimple_uid (orig_stmt) > 0);
11157 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
11158 /* Data references for gather loads and scatter stores do not use the
11159 updated offset we set using ADVANCE. Instead we have to make sure the
11160 references in the data reference point to the corresponding copy of
11161 the original in the epilogue. */
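/* For instance (illustrative names): a gather whose DR_REF is a[idx_7],
   with idx_7 defined in the main loop, needs idx_7 replaced by its epilogue
   copy, say idx_17, which is what the MAPPING-based simplify_replace_tree
   calls below do.  */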
11162 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
11163 == VMAT_GATHER_SCATTER)
11164 {
11165 DR_REF (dr)
11166 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
11167 &find_in_mapping, &mapping);
11168 DR_BASE_ADDRESS (dr)
11169 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
11170 &find_in_mapping, &mapping);
11171 }
11172 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
11173 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
11174 /* The vector size of the epilogue is smaller than that of the main loop
11175 so the alignment requirement is either the same or lower. This means
11176 the dr will by definition be aligned. */
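/* For example (hypothetical modes): if the main loop used V16QI vectors and
   the access is 16-byte aligned there, a V8QI epilogue only needs 8-byte
   alignment, which that access already satisfies.  */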
11177 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
11178 }
11179
11180 epilogue_vinfo->shared->datarefs_copy.release ();
11181 epilogue_vinfo->shared->save_datarefs ();
11182 }
11183
11184 /* Function vect_transform_loop.
11185
11186 The analysis phase has determined that the loop is vectorizable.
11187 Vectorize the loop - create vectorized stmts to replace the scalar
11188 stmts in the loop, and update the loop exit condition.
11189 Returns the scalar epilogue loop, if any. */
11190
11191 class loop *
11192 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
11193 {
11194 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11195 class loop *epilogue = NULL;
11196 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
11197 int nbbs = loop->num_nodes;
11198 int i;
11199 tree niters_vector = NULL_TREE;
11200 tree step_vector = NULL_TREE;
11201 tree niters_vector_mult_vf = NULL_TREE;
11202 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11203 unsigned int lowest_vf = constant_lower_bound (vf);
11204 gimple *stmt;
11205 bool check_profitability = false;
11206 unsigned int th;
11207 bool flat = maybe_flat_loop_profile (loop);
11208
11209 DUMP_VECT_SCOPE ("vec_transform_loop");
11210
11211 loop_vinfo->shared->check_datarefs ();
11212
11213 /* Use the more conservative vectorization threshold. If the number
11214 of iterations is constant, assume the cost check has been performed
11215 by our caller. If the threshold makes all loops profitable that
11216 run at least the (estimated) vectorization factor number of times,
11217 checking is pointless, too. */
11218 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
11219 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
11220 {
11221 if (dump_enabled_p ())
11222 dump_printf_loc (MSG_NOTE, vect_location,
11223 "Profitability threshold is %d loop iterations.\n",
11224 th);
11225 check_profitability = true;
11226 }
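/* Sketch of the effect (assumed behaviour, simplified): when
   CHECK_PROFITABILITY is true, the peeling/versioning code below guards the
   vectorized path with a runtime test roughly of the form
     if (niters >= th)
       <vectorized loop>
     else
       <scalar loop>
   so loops with too few iterations keep running the scalar code.  */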
11227
11228 /* Make sure there exists a single-predecessor exit bb. Do this before
11229 versioning. */
11230 edge e = single_exit (loop);
11231 if (! single_pred_p (e->dest))
11232 {
11233 split_loop_exit_edge (e, true);
11234 if (dump_enabled_p ())
11235 dump_printf (MSG_NOTE, "split exit edge\n");
11236 }
11237
11238 /* Version the loop first, if required, so the profitability check
11239 comes first. */
11240
11241 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
11242 {
11243 class loop *sloop
11244 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
11245 sloop->force_vectorize = false;
11246 check_profitability = false;
11247 }
11248
11249 /* Make sure there exists a single-predecessor exit bb also on the
11250 scalar loop copy. Do this after versioning but before peeling
11251 so the CFG structure is fine for both the scalar and the if-converted
11252 loop, making slpeel_duplicate_current_defs_from_edges face matched
11253 loop-closed PHI nodes on the exit. */
11254 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
11255 {
11256 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
11257 if (! single_pred_p (e->dest))
11258 {
11259 split_loop_exit_edge (e, true);
11260 if (dump_enabled_p ())
11261 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
11262 }
11263 }
11264
11265 tree niters = vect_build_loop_niters (loop_vinfo);
11266 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
11267 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
11268 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
11269 tree advance;
11270 drs_init_vec orig_drs_init;
11271
11272 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
11273 &step_vector, &niters_vector_mult_vf, th,
11274 check_profitability, niters_no_overflow,
11275 &advance);
11276 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
11277 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
11278 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
11279 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
11280
11281 if (niters_vector == NULL_TREE)
11282 {
11283 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
11284 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
11285 && known_eq (lowest_vf, vf))
11286 {
11287 niters_vector
11288 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
11289 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
11290 step_vector = build_one_cst (TREE_TYPE (niters));
11291 }
11292 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11293 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
11294 &step_vector, niters_no_overflow);
11295 else
11296 /* vect_do_peeling subtracted the number of peeled prologue
11297 iterations from LOOP_VINFO_NITERS. */
11298 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
11299 &niters_vector, &step_vector,
11300 niters_no_overflow);
11301 }
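/* Worked example (hypothetical values): with LOOP_VINFO_INT_NITERS = 17 and
   a constant VF of 8, NITERS_VECTOR becomes 17 / 8 = 2 and STEP_VECTOR is 1,
   i.e. two vector iterations covering 16 scalar iterations, with the
   remaining scalar iteration left to the epilogue.  */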
11302
11303 /* 1) Make sure the loop header has exactly two entries
11304 2) Make sure we have a preheader basic block. */
11305
11306 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
11307
11308 split_edge (loop_preheader_edge (loop));
11309
11310 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
11311 /* This will deal with any possible peeling. */
11312 vect_prepare_for_masked_peels (loop_vinfo);
11313
11314 /* Schedule the SLP instances first, then handle loop vectorization
11315 below. */
11316 if (!loop_vinfo->slp_instances.is_empty ())
11317 {
11318 DUMP_VECT_SCOPE ("scheduling SLP instances");
11319 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
11320 }
11321
11322 /* FORNOW: the vectorizer supports only loops whose body consists
11323 of one basic block (header + empty latch). When the vectorizer
11324 supports more involved loop forms, the order in which the BBs are
11325 traversed needs to be reconsidered. */
11326
11327 for (i = 0; i < nbbs; i++)
11328 {
11329 basic_block bb = bbs[i];
11330 stmt_vec_info stmt_info;
11331
11332 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11333 gsi_next (&si))
11334 {
11335 gphi *phi = si.phi ();
11336 if (dump_enabled_p ())
11337 dump_printf_loc (MSG_NOTE, vect_location,
11338 "------>vectorizing phi: %G", (gimple *) phi);
11339 stmt_info = loop_vinfo->lookup_stmt (phi);
11340 if (!stmt_info)
11341 continue;
11342
11343 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
11344 vect_loop_kill_debug_uses (loop, stmt_info);
11345
11346 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11347 && !STMT_VINFO_LIVE_P (stmt_info))
11348 continue;
11349
11350 if (STMT_VINFO_VECTYPE (stmt_info)
11351 && (maybe_ne
11352 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
11353 && dump_enabled_p ())
11354 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
11355
11356 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
11357 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
11358 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
11359 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
11360 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence
11361 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
11362 && ! PURE_SLP_STMT (stmt_info))
11363 {
11364 if (dump_enabled_p ())
11365 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
11366 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
11367 }
11368 }
11369
11370 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
11371 gsi_next (&si))
11372 {
11373 gphi *phi = si.phi ();
11374 stmt_info = loop_vinfo->lookup_stmt (phi);
11375 if (!stmt_info)
11376 continue;
11377
11378 if (!STMT_VINFO_RELEVANT_P (stmt_info)
11379 && !STMT_VINFO_LIVE_P (stmt_info))
11380 continue;
11381
11382 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
11383 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
11384 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
11385 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
11386 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
11387 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_first_order_recurrence)
11388 && ! PURE_SLP_STMT (stmt_info))
11389 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
11390 }
11391
11392 for (gimple_stmt_iterator si = gsi_start_bb (bb);
11393 !gsi_end_p (si);)
11394 {
11395 stmt = gsi_stmt (si);
11396 /* During vectorization remove existing clobber stmts. */
11397 if (gimple_clobber_p (stmt))
11398 {
11399 unlink_stmt_vdef (stmt);
11400 gsi_remove (&si, true);
11401 release_defs (stmt);
11402 }
11403 else
11404 {
11405 /* Ignore vector stmts created in the outer loop. */
11406 stmt_info = loop_vinfo->lookup_stmt (stmt);
11407
11408 /* vector stmts created in the outer-loop during vectorization of
11409 stmts in an inner-loop may not have a stmt_info, and do not
11410 need to be vectorized. */
11411 stmt_vec_info seen_store = NULL;
11412 if (stmt_info)
11413 {
11414 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
11415 {
11416 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
11417 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
11418 !gsi_end_p (subsi); gsi_next (&subsi))
11419 {
11420 stmt_vec_info pat_stmt_info
11421 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
11422 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
11423 &si, &seen_store);
11424 }
11425 stmt_vec_info pat_stmt_info
11426 = STMT_VINFO_RELATED_STMT (stmt_info);
11427 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
11428 &si, &seen_store))
11429 maybe_set_vectorized_backedge_value (loop_vinfo,
11430 pat_stmt_info);
11431 }
11432 else
11433 {
11434 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
11435 &seen_store))
11436 maybe_set_vectorized_backedge_value (loop_vinfo,
11437 stmt_info);
11438 }
11439 }
11440 gsi_next (&si);
11441 if (seen_store)
11442 {
11443 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
11444 /* Interleaving. The vectorization of the
11445 interleaving chain was completed -
11446 free all the stores in the chain. */
11447 vect_remove_stores (loop_vinfo,
11448 DR_GROUP_FIRST_ELEMENT (seen_store));
11449 else
11450 /* Free the attached stmt_vec_info and remove the stmt. */
11451 loop_vinfo->remove_stmt (stmt_info);
11452 }
11453 }
11454 }
11455
11456 /* Stub out scalar statements that must not survive vectorization.
11457 Doing this here helps with grouped statements, or statements that
11458 are involved in patterns. */
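/* Illustrative sketch (hypothetical GIMPLE): a left-over scalar
     _7 = .MASK_LOAD (p_3, 8B, mask_5);
   is replaced below by
     _7 = 0;
   and a scalar conditional call such as
     _9 = .COND_ADD (mask_5, _1, _2, else_4);
   is replaced by
     _9 = else_4;
   so that no scalar internal-function call survives vectorization.  */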
11459 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
11460 !gsi_end_p (gsi); gsi_next (&gsi))
11461 {
11462 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
11463 if (!call || !gimple_call_internal_p (call))
11464 continue;
11465 internal_fn ifn = gimple_call_internal_fn (call);
11466 if (ifn == IFN_MASK_LOAD)
11467 {
11468 tree lhs = gimple_get_lhs (call);
11469 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11470 {
11471 tree zero = build_zero_cst (TREE_TYPE (lhs));
11472 gimple *new_stmt = gimple_build_assign (lhs, zero);
11473 gsi_replace (&gsi, new_stmt, true);
11474 }
11475 }
11476 else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
11477 {
11478 tree lhs = gimple_get_lhs (call);
11479 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11480 {
11481 tree else_arg
11482 = gimple_call_arg (call, gimple_call_num_args (call) - 1);
11483 gimple *new_stmt = gimple_build_assign (lhs, else_arg);
11484 gsi_replace (&gsi, new_stmt, true);
11485 }
11486 }
11487 }
11488 } /* BBs in loop */
11489
11490 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
11491 a zero NITERS becomes a nonzero NITERS_VECTOR. */
11492 if (integer_onep (step_vector))
11493 niters_no_overflow = true;
11494 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
11495 niters_vector_mult_vf, !niters_no_overflow);
11496
11497 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
11498
11499 /* True if the final iteration might not handle a full vector's
11500 worth of scalar iterations. */
11501 bool final_iter_may_be_partial
11502 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
11503 /* The minimum number of iterations performed by the epilogue. This
11504 is 1 when peeling for gaps because we always need a final scalar
11505 iteration. */
11506 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
11507 /* +1 to convert latch counts to loop iteration counts,
11508 -min_epilogue_iters to remove iterations that cannot be performed
11509 by the vector code. */
11510 int bias_for_lowest = 1 - min_epilogue_iters;
11511 int bias_for_assumed = bias_for_lowest;
11512 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
11513 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
11514 {
11515 /* When the amount of peeling is known at compile time, the first
11516 iteration will have exactly alignment_npeels active elements.
11517 In the worst case it will have at least one. */
11518 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
11519 bias_for_lowest += lowest_vf - min_first_active;
11520 bias_for_assumed += assumed_vf - min_first_active;
11521 }
11522 /* In these calculations the "- 1" converts loop iteration counts
11523 back to latch counts. */
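/* Worked example (hypothetical numbers): with a latch bound of 99 (100
   scalar iterations), LOWEST_VF = 8, no peeling for gaps, no partial vectors
   and no alignment-peeling adjustment, BIAS_FOR_LOWEST is 1 and the new
   bound is
     udiv_floor (99 + 1, 8) - 1 = 11
   i.e. at most 12 vector iterations covering 96 scalar iterations, with the
   rest handled by the epilogue.  */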
11524 if (loop->any_upper_bound)
11525 {
11526 loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
11527 loop->nb_iterations_upper_bound
11528 = (final_iter_may_be_partial
11529 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
11530 lowest_vf) - 1
11531 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
11532 lowest_vf) - 1);
11533 if (main_vinfo
11534 /* Both peeling for alignment and peeling for gaps can end up
11535 with the scalar epilogue running for more than VF-1 iterations. */
11536 && !main_vinfo->peeling_for_alignment
11537 && !main_vinfo->peeling_for_gaps)
11538 {
11539 unsigned int bound;
11540 poly_uint64 main_iters
11541 = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
11542 LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
11543 main_iters
11544 = upper_bound (main_iters,
11545 LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
11546 if (can_div_away_from_zero_p (main_iters,
11547 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
11548 &bound))
11549 loop->nb_iterations_upper_bound
11550 = wi::umin ((widest_int) (bound - 1),
11551 loop->nb_iterations_upper_bound);
11552 }
11553 }
11554 if (loop->any_likely_upper_bound)
11555 loop->nb_iterations_likely_upper_bound
11556 = (final_iter_may_be_partial
11557 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
11558 + bias_for_lowest, lowest_vf) - 1
11559 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
11560 + bias_for_lowest, lowest_vf) - 1);
11561 if (loop->any_estimate)
11562 loop->nb_iterations_estimate
11563 = (final_iter_may_be_partial
11564 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
11565 assumed_vf) - 1
11566 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
11567 assumed_vf) - 1);
11568 scale_profile_for_vect_loop (loop, assumed_vf, flat);
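/* Assumed behaviour of the helper above: the loop body profile is scaled
   down by roughly ASSUMED_VF (e.g. a header count of 1000 with
   ASSUMED_VF = 4 becomes about 250), while the FLAT flag computed from
   maybe_flat_loop_profile lets it special-case loops without a meaningful
   iteration-count profile.  */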
11569
11570 if (dump_enabled_p ())
11571 {
11572 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
11573 {
11574 dump_printf_loc (MSG_NOTE, vect_location,
11575 "LOOP VECTORIZED\n");
11576 if (loop->inner)
11577 dump_printf_loc (MSG_NOTE, vect_location,
11578 "OUTER LOOP VECTORIZED\n");
11579 dump_printf (MSG_NOTE, "\n");
11580 }
11581 else
11582 dump_printf_loc (MSG_NOTE, vect_location,
11583 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
11584 GET_MODE_NAME (loop_vinfo->vector_mode));
11585 }
11586
11587 /* Loops vectorized with a variable factor won't benefit from
11588 unrolling/peeling. */
11589 if (!vf.is_constant ())
11590 {
11591 loop->unroll = 1;
11592 if (dump_enabled_p ())
11593 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
11594 " variable-length vectorization factor\n");
11595 }
11596 /* Free SLP instances here because otherwise stmt reference counting
11597 won't work. */
11598 slp_instance instance;
11599 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
11600 vect_free_slp_instance (instance);
11601 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
11602 /* Clear the safelen field since its value is invalid after vectorization:
11603 the vectorized loop can have loop-carried dependencies. */
11604 loop->safelen = 0;
11605
11606 if (epilogue)
11607 {
11608 update_epilogue_loop_vinfo (epilogue, advance);
11609
11610 epilogue->simduid = loop->simduid;
11611 epilogue->force_vectorize = loop->force_vectorize;
11612 epilogue->dont_vectorize = false;
11613 }
11614
11615 return epilogue;
11616 }
11617
11618 /* The code below performs a simple optimization - it reverts
11619 if-conversion for masked stores: if the mask of a store is zero,
11620 skip the store and, if possible, the producers of the stored values too.
11621 For example,
11622 for (i=0; i<n; i++)
11623 if (c[i])
11624 {
11625 p1[i] += 1;
11626 p2[i] = p3[i] +2;
11627 }
11628 this transformation will produce the following semi-hammock:
11629
11630 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
11631 {
11632 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
11633 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
11634 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
11635 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
11636 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
11637 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
11638 }
11639 */
11640
11641 void
11642 optimize_mask_stores (class loop *loop)
11643 {
11644 basic_block *bbs = get_loop_body (loop);
11645 unsigned nbbs = loop->num_nodes;
11646 unsigned i;
11647 basic_block bb;
11648 class loop *bb_loop;
11649 gimple_stmt_iterator gsi;
11650 gimple *stmt;
11651 auto_vec<gimple *> worklist;
11652 auto_purge_vect_location sentinel;
11653
11654 vect_location = find_loop_location (loop);
11655 /* Pick up all masked stores in loop if any. */
11656 for (i = 0; i < nbbs; i++)
11657 {
11658 bb = bbs[i];
11659 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
11660 gsi_next (&gsi))
11661 {
11662 stmt = gsi_stmt (gsi);
11663 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
11664 worklist.safe_push (stmt);
11665 }
11666 }
11667
11668 free (bbs);
11669 if (worklist.is_empty ())
11670 return;
11671
11672 /* Loop has masked stores. */
11673 while (!worklist.is_empty ())
11674 {
11675 gimple *last, *last_store;
11676 edge e, efalse;
11677 tree mask;
11678 basic_block store_bb, join_bb;
11679 gimple_stmt_iterator gsi_to;
11680 tree vdef, new_vdef;
11681 gphi *phi;
11682 tree vectype;
11683 tree zero;
11684
11685 last = worklist.pop ();
11686 mask = gimple_call_arg (last, 2);
11687 bb = gimple_bb (last);
11688 /* Create then_bb and if-then structure in CFG, then_bb belongs to
11689 the same loop as if_bb. It could be different from LOOP when a
11690 two-level loop nest is vectorized and the mask_store belongs to the
11691 inner one. */
11692 e = split_block (bb, last);
11693 bb_loop = bb->loop_father;
11694 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
11695 join_bb = e->dest;
11696 store_bb = create_empty_bb (bb);
11697 add_bb_to_loop (store_bb, bb_loop);
11698 e->flags = EDGE_TRUE_VALUE;
11699 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
11700 /* Put STORE_BB on the unlikely path. */
11701 efalse->probability = profile_probability::unlikely ();
11702 e->probability = efalse->probability.invert ();
11703 store_bb->count = efalse->count ();
11704 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
11705 if (dom_info_available_p (CDI_DOMINATORS))
11706 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
11707 if (dump_enabled_p ())
11708 dump_printf_loc (MSG_NOTE, vect_location,
11709 "Create new block %d to sink mask stores.",
11710 store_bb->index);
11711 /* Create vector comparison with boolean result. */
11712 vectype = TREE_TYPE (mask);
11713 zero = build_zero_cst (vectype);
11714 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
11715 gsi = gsi_last_bb (bb);
11716 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
11717 /* Create new PHI node for vdef of the last masked store:
11718 .MEM_2 = VDEF <.MEM_1>
11719 will be converted to
11720 .MEM.3 = VDEF <.MEM_1>
11721 and new PHI node will be created in join bb
11722 .MEM_2 = PHI <.MEM_1, .MEM_3>
11723 */
11724 vdef = gimple_vdef (last);
11725 new_vdef = make_ssa_name (gimple_vop (cfun), last);
11726 gimple_set_vdef (last, new_vdef);
11727 phi = create_phi_node (vdef, join_bb);
11728 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
11729
11730 /* Put all masked stores with the same mask to STORE_BB if possible. */
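/* For instance (illustrative GIMPLE, reusing the names from the example
   above): two stores guarded by the same mask,
     MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
     MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
   both end up in STORE_BB, which is only entered when the mask is not
   all-zero.  */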
11731 while (true)
11732 {
11733 gimple_stmt_iterator gsi_from;
11734 gimple *stmt1 = NULL;
11735
11736 /* Move masked store to STORE_BB. */
11737 last_store = last;
11738 gsi = gsi_for_stmt (last);
11739 gsi_from = gsi;
11740 /* Shift GSI to the previous stmt for further traversal. */
11741 gsi_prev (&gsi);
11742 gsi_to = gsi_start_bb (store_bb);
11743 gsi_move_before (&gsi_from, &gsi_to);
11744 /* Set up GSI_TO at the start of the now non-empty block. */
11745 gsi_to = gsi_start_bb (store_bb);
11746 if (dump_enabled_p ())
11747 dump_printf_loc (MSG_NOTE, vect_location,
11748 "Move stmt to created bb\n%G", last);
11749 /* Move all stored value producers if possible. */
11750 while (!gsi_end_p (gsi))
11751 {
11752 tree lhs;
11753 imm_use_iterator imm_iter;
11754 use_operand_p use_p;
11755 bool res;
11756
11757 /* Skip debug statements. */
11758 if (is_gimple_debug (gsi_stmt (gsi)))
11759 {
11760 gsi_prev (&gsi);
11761 continue;
11762 }
11763 stmt1 = gsi_stmt (gsi);
11764 /* Do not consider statements writing to memory or having
11765 a volatile operand. */
11766 if (gimple_vdef (stmt1)
11767 || gimple_has_volatile_ops (stmt1))
11768 break;
11769 gsi_from = gsi;
11770 gsi_prev (&gsi);
11771 lhs = gimple_get_lhs (stmt1);
11772 if (!lhs)
11773 break;
11774
11775 /* LHS of vectorized stmt must be SSA_NAME. */
11776 if (TREE_CODE (lhs) != SSA_NAME)
11777 break;
11778
11779 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
11780 {
11781 /* Remove dead scalar statement. */
11782 if (has_zero_uses (lhs))
11783 {
11784 gsi_remove (&gsi_from, true);
11785 continue;
11786 }
11787 }
11788
11789 /* Check that LHS does not have uses outside of STORE_BB. */
11790 res = true;
11791 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
11792 {
11793 gimple *use_stmt;
11794 use_stmt = USE_STMT (use_p);
11795 if (is_gimple_debug (use_stmt))
11796 continue;
11797 if (gimple_bb (use_stmt) != store_bb)
11798 {
11799 res = false;
11800 break;
11801 }
11802 }
11803 if (!res)
11804 break;
11805
11806 if (gimple_vuse (stmt1)
11807 && gimple_vuse (stmt1) != gimple_vuse (last_store))
11808 break;
11809
11810 /* Can move STMT1 to STORE_BB. */
11811 if (dump_enabled_p ())
11812 dump_printf_loc (MSG_NOTE, vect_location,
11813 "Move stmt to created bb\n%G", stmt1);
11814 gsi_move_before (&gsi_from, &gsi_to);
11815 /* Shift GSI_TO for further insertion. */
11816 gsi_prev (&gsi_to);
11817 }
11818 /* Put other masked stores with the same mask to STORE_BB. */
11819 if (worklist.is_empty ()
11820 || gimple_call_arg (worklist.last (), 2) != mask
11821 || worklist.last () != stmt1)
11822 break;
11823 last = worklist.pop ();
11824 }
11825 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
11826 }
11827 }
11828
11829 /* Decide whether it is possible to use a zero-based induction variable
11830 when vectorizing LOOP_VINFO with partial vectors. If it is, return
11831 the value that the induction variable must be able to hold in order
11832 to ensure that the rgroups eventually have no active vector elements.
11833 Return -1 otherwise. */
11834
11835 widest_int
11836 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
11837 {
11838 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
11839 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
11840 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
11841
11842 /* Calculate the value that the induction variable must be able
11843 to hit in order to ensure that we end the loop with an all-false mask.
11844 This involves adding the maximum number of inactive trailing scalar
11845 iterations. */
11846 widest_int iv_limit = -1;
11847 if (max_loop_iterations (loop, &iv_limit))
11848 {
11849 if (niters_skip)
11850 {
11851 /* Add the maximum number of skipped iterations to the
11852 maximum iteration count. */
11853 if (TREE_CODE (niters_skip) == INTEGER_CST)
11854 iv_limit += wi::to_widest (niters_skip);
11855 else
11856 iv_limit += max_vf - 1;
11857 }
11858 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
11859 /* Make a conservatively-correct assumption. */
11860 iv_limit += max_vf - 1;
11861
11862 /* IV_LIMIT is the maximum number of latch iterations, which is also
11863 the maximum in-range IV value. Round this value down to the previous
11864 vector alignment boundary and then add an extra full iteration. */
11865 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
11866 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
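/* Worked example (hypothetical values): with a maximum latch count of 1000,
   a constant VF of 16 and MAX_VF = 16 this gives
     (1000 & -16) + 16 = 992 + 16 = 1008
   as the value the IV must be able to hold.  */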
11867 }
11868 return iv_limit;
11869 }
11870
11871 /* For the given rgroup_controls RGC, check whether an induction variable
11872 would ever hit a value that produces a set of all-false masks or zero
11873 lengths before wrapping around. Return true if it's possible to wrap
11874 around before hitting the desirable value, otherwise return false. */
11875
11876 bool
11877 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
11878 {
11879 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
11880
11881 if (iv_limit == -1)
11882 return true;
11883
11884 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
11885 unsigned int compare_precision = TYPE_PRECISION (compare_type);
11886 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
11887
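/* Illustrative check (hypothetical values): with IV_LIMIT = 1008 and
   NITEMS = 2 the product 2016 needs 11 bits, so a 32-bit compare type
   cannot wrap and we return false; a compare type whose precision is below
   the required minimum would make us return true.  */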
11888 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
11889 return true;
11890
11891 return false;
11892 }