1 /* Loop Vectorization
2 Copyright (C) 2003-2021 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #define INCLUDE_ALGORITHM
23 #include "config.h"
24 #include "system.h"
25 #include "coretypes.h"
26 #include "backend.h"
27 #include "target.h"
28 #include "rtl.h"
29 #include "tree.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "tree-pass.h"
33 #include "ssa.h"
34 #include "optabs-tree.h"
35 #include "diagnostic-core.h"
36 #include "fold-const.h"
37 #include "stor-layout.h"
38 #include "cfganal.h"
39 #include "gimplify.h"
40 #include "gimple-iterator.h"
41 #include "gimplify-me.h"
42 #include "tree-ssa-loop-ivopts.h"
43 #include "tree-ssa-loop-manip.h"
44 #include "tree-ssa-loop-niter.h"
45 #include "tree-ssa-loop.h"
46 #include "cfgloop.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57
58 /* Loop Vectorization Pass.
59
60 This pass tries to vectorize loops.
61
62 For example, the vectorizer transforms the following simple loop:
63
64 short a[N]; short b[N]; short c[N]; int i;
65
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
68 }
69
  70   as if it were manually vectorized by rewriting the source code into:
71
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
76
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
82 }
83
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
  92     vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
95
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
101
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
106
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
117
118 For example, say stmt S1 was vectorized into stmt VS1:
119
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
123
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
128
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
133
 134     Operands that are not SSA_NAMEs are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
136
137 Target modeling:
138 =================
 139     Currently the only target-specific information that is used is the
 140     size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 141     Targets that can support different sizes of vectors will, for now, need
 142     to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
143 flexibility will be added in the future.
144
 145     Since we only vectorize operations whose vector form can be
 146     expressed using existing tree codes, to verify that an operation is
 147     supported, the vectorizer checks the relevant optab at the relevant
 148     machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
151
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
154 */
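
/* As a sketch only, tying the "Target modeling" notes above to the
   introductory example: the V8HI addition in that example can be
   vectorized only if

     optab_handler (add_optab, V8HImode) != CODE_FOR_nothing

   i.e. only if the target provides the corresponding addv8hi3 insn
   pattern; otherwise the statement is left scalar.  */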
155
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
157 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
158 bool *, bool *);
159
160 /* Subroutine of vect_determine_vf_for_stmt that handles only one
161 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
162 may already be set for general statements (not just data refs). */
163
164 static opt_result
165 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
166 bool vectype_maybe_set_p,
167 poly_uint64 *vf)
168 {
169 gimple *stmt = stmt_info->stmt;
170
171 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
172 && !STMT_VINFO_LIVE_P (stmt_info))
173 || gimple_clobber_p (stmt))
174 {
175 if (dump_enabled_p ())
176 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
177 return opt_result::success ();
178 }
179
180 tree stmt_vectype, nunits_vectype;
181 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
182 &stmt_vectype,
183 &nunits_vectype);
184 if (!res)
185 return res;
186
187 if (stmt_vectype)
188 {
189 if (STMT_VINFO_VECTYPE (stmt_info))
 190 	/* The only case when a vectype has already been set is for stmts
191 that contain a data ref, or for "pattern-stmts" (stmts generated
192 by the vectorizer to represent/replace a certain idiom). */
193 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
194 || vectype_maybe_set_p)
195 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
196 else
197 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
198 }
199
200 if (nunits_vectype)
201 vect_update_max_nunits (vf, nunits_vectype);
202
203 return opt_result::success ();
204 }
205
206 /* Subroutine of vect_determine_vectorization_factor. Set the vector
207 types of STMT_INFO and all attached pattern statements and update
208 the vectorization factor VF accordingly. Return true on success
209 or false if something prevented vectorization. */
210
211 static opt_result
212 vect_determine_vf_for_stmt (vec_info *vinfo,
213 stmt_vec_info stmt_info, poly_uint64 *vf)
214 {
215 if (dump_enabled_p ())
216 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
217 stmt_info->stmt);
218 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
219 if (!res)
220 return res;
221
222 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
223 && STMT_VINFO_RELATED_STMT (stmt_info))
224 {
225 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
226 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
227
228 /* If a pattern statement has def stmts, analyze them too. */
229 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
230 !gsi_end_p (si); gsi_next (&si))
231 {
232 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
233 if (dump_enabled_p ())
234 dump_printf_loc (MSG_NOTE, vect_location,
235 "==> examining pattern def stmt: %G",
236 def_stmt_info->stmt);
237 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
238 if (!res)
239 return res;
240 }
241
242 if (dump_enabled_p ())
243 dump_printf_loc (MSG_NOTE, vect_location,
244 "==> examining pattern statement: %G",
245 stmt_info->stmt);
246 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
247 if (!res)
248 return res;
249 }
250
251 return opt_result::success ();
252 }
253
254 /* Function vect_determine_vectorization_factor
255
256 Determine the vectorization factor (VF). VF is the number of data elements
257 that are operated upon in parallel in a single iteration of the vectorized
 258     loop.  For example, when vectorizing a loop that operates on 4-byte elements
 259     on a target with a 16-byte vector size (VS), the VF is set to 4, since 4
260 elements can fit in a single vector register.
261
262 We currently support vectorization of loops in which all types operated upon
263 are of the same size. Therefore this function currently sets VF according to
264 the size of the types operated upon, and fails if there are multiple sizes
265 in the loop.
266
267 VF is also the factor by which the loop iterations are strip-mined, e.g.:
268 original loop:
269 for (i=0; i<N; i++){
270 a[i] = b[i] + c[i];
271 }
272
273 vectorized loop:
274 for (i=0; i<N; i+=VF){
275 a[i:VF] = b[i:VF] + c[i:VF];
276 }
277 */
278
279 static opt_result
280 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
281 {
282 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
283 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
284 unsigned nbbs = loop->num_nodes;
285 poly_uint64 vectorization_factor = 1;
286 tree scalar_type = NULL_TREE;
287 gphi *phi;
288 tree vectype;
289 stmt_vec_info stmt_info;
290 unsigned i;
291
292 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
293
294 for (i = 0; i < nbbs; i++)
295 {
296 basic_block bb = bbs[i];
297
298 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
299 gsi_next (&si))
300 {
301 phi = si.phi ();
302 stmt_info = loop_vinfo->lookup_stmt (phi);
303 if (dump_enabled_p ())
304 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
305 phi);
306
307 gcc_assert (stmt_info);
308
309 if (STMT_VINFO_RELEVANT_P (stmt_info)
310 || STMT_VINFO_LIVE_P (stmt_info))
311 {
312 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
313 scalar_type = TREE_TYPE (PHI_RESULT (phi));
314
315 if (dump_enabled_p ())
316 dump_printf_loc (MSG_NOTE, vect_location,
317 "get vectype for scalar type: %T\n",
318 scalar_type);
319
320 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
321 if (!vectype)
322 return opt_result::failure_at (phi,
323 "not vectorized: unsupported "
324 "data-type %T\n",
325 scalar_type);
326 STMT_VINFO_VECTYPE (stmt_info) = vectype;
327
328 if (dump_enabled_p ())
329 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
330 vectype);
331
332 if (dump_enabled_p ())
333 {
334 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
335 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
336 dump_printf (MSG_NOTE, "\n");
337 }
338
339 vect_update_max_nunits (&vectorization_factor, vectype);
340 }
341 }
342
343 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
344 gsi_next (&si))
345 {
346 if (is_gimple_debug (gsi_stmt (si)))
347 continue;
348 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
349 opt_result res
350 = vect_determine_vf_for_stmt (loop_vinfo,
351 stmt_info, &vectorization_factor);
352 if (!res)
353 return res;
354 }
355 }
356
 357   /* TODO: Analyze cost.  Decide if worthwhile to vectorize.  */
358 if (dump_enabled_p ())
359 {
360 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
361 dump_dec (MSG_NOTE, vectorization_factor);
362 dump_printf (MSG_NOTE, "\n");
363 }
364
365 if (known_le (vectorization_factor, 1U))
366 return opt_result::failure_at (vect_location,
367 "not vectorized: unsupported data-type\n");
368 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
369 return opt_result::success ();
370 }
371
372
373 /* Function vect_is_simple_iv_evolution.
374
 375    FORNOW: A simple evolution of an induction variable in the loop is
376 considered a polynomial evolution. */
377
378 static bool
379 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
380 tree * step)
381 {
382 tree init_expr;
383 tree step_expr;
384 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
385 basic_block bb;
386
387 /* When there is no evolution in this loop, the evolution function
388 is not "simple". */
389 if (evolution_part == NULL_TREE)
390 return false;
391
392 /* When the evolution is a polynomial of degree >= 2
393 the evolution function is not "simple". */
394 if (tree_is_chrec (evolution_part))
395 return false;
396
397 step_expr = evolution_part;
398 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
399
400 if (dump_enabled_p ())
401 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
402 step_expr, init_expr);
403
404 *init = init_expr;
405 *step = step_expr;
406
407 if (TREE_CODE (step_expr) != INTEGER_CST
408 && (TREE_CODE (step_expr) != SSA_NAME
409 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
410 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
411 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
412 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
413 || !flag_associative_math)))
414 && (TREE_CODE (step_expr) != REAL_CST
415 || !flag_associative_math))
416 {
417 if (dump_enabled_p ())
418 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
419 "step unknown.\n");
420 return false;
421 }
422
423 return true;
424 }
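
/* A minimal illustration, not taken from any particular testcase, of what
   vect_is_simple_iv_evolution accepts: for an induction such as

     for (i = start; i < n; i += 4)

   scev describes i by the chrec {start, +, 4}_loop, so INIT is "start"
   and STEP is 4.  A chrec of degree two such as {0, +, {0, +, 1}_loop}_loop,
   whose step itself evolves, is rejected as not "simple".  */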
425
426 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
427 what we are assuming is a double reduction. For example, given
428 a structure like this:
429
430 outer1:
431 x_1 = PHI <x_4(outer2), ...>;
432 ...
433
434 inner:
435 x_2 = PHI <x_1(outer1), ...>;
436 ...
437 x_3 = ...;
438 ...
439
440 outer2:
441 x_4 = PHI <x_3(inner)>;
442 ...
443
444 outer loop analysis would treat x_1 as a double reduction phi and
445 this function would then return true for x_2. */
446
447 static bool
448 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
449 {
450 use_operand_p use_p;
451 ssa_op_iter op_iter;
452 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
453 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
454 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
455 return true;
456 return false;
457 }
458
459 /* Function vect_analyze_scalar_cycles_1.
460
461 Examine the cross iteration def-use cycles of scalar variables
462 in LOOP. LOOP_VINFO represents the loop that is now being
463 considered for vectorization (can be LOOP, or an outer-loop
464 enclosing LOOP). */
465
466 static void
467 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
468 {
469 basic_block bb = loop->header;
470 tree init, step;
471 auto_vec<stmt_vec_info, 64> worklist;
472 gphi_iterator gsi;
473 bool double_reduc, reduc_chain;
474
475 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
476
477 /* First - identify all inductions. Reduction detection assumes that all the
 478      inductions have been identified; therefore, this order must not be
479 changed. */
480 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
481 {
482 gphi *phi = gsi.phi ();
483 tree access_fn = NULL;
484 tree def = PHI_RESULT (phi);
485 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
486
487 if (dump_enabled_p ())
488 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
489
 490 	  /* Skip virtual phis.  The data dependences that are associated with
491 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
492 if (virtual_operand_p (def))
493 continue;
494
495 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
496
497 /* Analyze the evolution function. */
498 access_fn = analyze_scalar_evolution (loop, def);
499 if (access_fn)
500 {
501 STRIP_NOPS (access_fn);
502 if (dump_enabled_p ())
503 dump_printf_loc (MSG_NOTE, vect_location,
504 "Access function of PHI: %T\n", access_fn);
505 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
506 = initial_condition_in_loop_num (access_fn, loop->num);
507 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
508 = evolution_part_in_loop_num (access_fn, loop->num);
509 }
510
511 if (!access_fn
512 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
513 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
514 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
515 && TREE_CODE (step) != INTEGER_CST))
516 {
517 worklist.safe_push (stmt_vinfo);
518 continue;
519 }
520
521 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
522 != NULL_TREE);
523 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
524
525 if (dump_enabled_p ())
526 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
527 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
528 }
529
530
531 /* Second - identify all reductions and nested cycles. */
532 while (worklist.length () > 0)
533 {
534 stmt_vec_info stmt_vinfo = worklist.pop ();
535 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
536 tree def = PHI_RESULT (phi);
537
538 if (dump_enabled_p ())
539 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
540
541 gcc_assert (!virtual_operand_p (def)
542 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
543
544 stmt_vec_info reduc_stmt_info
545 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
546 &reduc_chain);
547 if (reduc_stmt_info)
548 {
549 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
550 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
551 if (double_reduc)
552 {
553 if (dump_enabled_p ())
554 dump_printf_loc (MSG_NOTE, vect_location,
555 "Detected double reduction.\n");
556
557 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
558 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
559 }
560 else
561 {
562 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
563 {
564 if (dump_enabled_p ())
565 dump_printf_loc (MSG_NOTE, vect_location,
566 "Detected vectorizable nested cycle.\n");
567
568 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
569 }
570 else
571 {
572 if (dump_enabled_p ())
573 dump_printf_loc (MSG_NOTE, vect_location,
574 "Detected reduction.\n");
575
576 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
577 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
578 /* Store the reduction cycles for possible vectorization in
 579 		 loop-aware SLP if it was not detected as a reduction
580 chain. */
581 if (! reduc_chain)
582 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
583 (reduc_stmt_info);
584 }
585 }
586 }
587 else
588 if (dump_enabled_p ())
589 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
590 "Unknown def-use cycle pattern.\n");
591 }
592 }
593
594
595 /* Function vect_analyze_scalar_cycles.
596
597 Examine the cross iteration def-use cycles of scalar variables, by
598 analyzing the loop-header PHIs of scalar variables. Classify each
599 cycle as one of the following: invariant, induction, reduction, unknown.
 600    We do that for the loop represented by LOOP_VINFO, and also for its
 601    inner-loop, if it exists.
602 Examples for scalar cycles:
603
604 Example1: reduction:
605
606 loop1:
607 for (i=0; i<N; i++)
608 sum += a[i];
609
610 Example2: induction:
611
612 loop2:
613 for (i=0; i<N; i++)
614 a[i] = i; */
615
616 static void
617 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
618 {
619 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
620
621 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
622
623 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 624      Reductions in such an inner-loop therefore have different properties than
625 the reductions in the nest that gets vectorized:
626 1. When vectorized, they are executed in the same order as in the original
627 scalar loop, so we can't change the order of computation when
628 vectorizing them.
629 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
630 current checks are too strict. */
631
632 if (loop->inner)
633 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
634 }
635
636 /* Transfer group and reduction information from STMT_INFO to its
637 pattern stmt. */
638
639 static void
640 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
641 {
642 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
643 stmt_vec_info stmtp;
644 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
645 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
646 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
647 do
648 {
649 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
650 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
651 == STMT_VINFO_DEF_TYPE (stmt_info));
652 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
653 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
654 if (stmt_info)
655 REDUC_GROUP_NEXT_ELEMENT (stmtp)
656 = STMT_VINFO_RELATED_STMT (stmt_info);
657 }
658 while (stmt_info);
659 }
660
661 /* Fixup scalar cycles that now have their stmts detected as patterns. */
662
663 static void
664 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
665 {
666 stmt_vec_info first;
667 unsigned i;
668
669 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
670 {
671 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
672 while (next)
673 {
674 if ((STMT_VINFO_IN_PATTERN_P (next)
675 != STMT_VINFO_IN_PATTERN_P (first))
676 || STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (next)) == -1)
677 break;
678 next = REDUC_GROUP_NEXT_ELEMENT (next);
679 }
 680       /* If all reduction chain members are well-formed patterns, adjust
681 the group to group the pattern stmts instead. */
682 if (! next
683 && STMT_VINFO_REDUC_IDX (vect_stmt_to_vectorize (first)) != -1)
684 {
685 if (STMT_VINFO_IN_PATTERN_P (first))
686 {
687 vect_fixup_reduc_chain (first);
688 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
689 = STMT_VINFO_RELATED_STMT (first);
690 }
691 }
 692       /* If not all stmts in the chain are patterns, or if we failed
 693 	 to update STMT_VINFO_REDUC_IDX, dissolve the chain and handle
 694 	 it as a regular reduction instead.  */
695 else
696 {
697 stmt_vec_info vinfo = first;
698 stmt_vec_info last = NULL;
699 while (vinfo)
700 {
701 next = REDUC_GROUP_NEXT_ELEMENT (vinfo);
702 REDUC_GROUP_FIRST_ELEMENT (vinfo) = NULL;
703 REDUC_GROUP_NEXT_ELEMENT (vinfo) = NULL;
704 last = vinfo;
705 vinfo = next;
706 }
707 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize (first))
708 = vect_internal_def;
709 loop_vinfo->reductions.safe_push (vect_stmt_to_vectorize (last));
710 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).unordered_remove (i);
711 --i;
712 }
713 }
714 }
715
716 /* Function vect_get_loop_niters.
717
 718    Determine the number of iterations the loop executes and place it
719 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
720 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
721 niter information holds in ASSUMPTIONS.
722
723 Return the loop exit condition. */
724
725
726 static gcond *
727 vect_get_loop_niters (class loop *loop, tree *assumptions,
728 tree *number_of_iterations, tree *number_of_iterationsm1)
729 {
730 edge exit = single_exit (loop);
731 class tree_niter_desc niter_desc;
732 tree niter_assumptions, niter, may_be_zero;
733 gcond *cond = get_loop_exit_condition (loop);
734
735 *assumptions = boolean_true_node;
736 *number_of_iterationsm1 = chrec_dont_know;
737 *number_of_iterations = chrec_dont_know;
738 DUMP_VECT_SCOPE ("get_loop_niters");
739
740 if (!exit)
741 return cond;
742
743 may_be_zero = NULL_TREE;
744 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
745 || chrec_contains_undetermined (niter_desc.niter))
746 return cond;
747
748 niter_assumptions = niter_desc.assumptions;
749 may_be_zero = niter_desc.may_be_zero;
750 niter = niter_desc.niter;
751
752 if (may_be_zero && integer_zerop (may_be_zero))
753 may_be_zero = NULL_TREE;
754
755 if (may_be_zero)
756 {
757 if (COMPARISON_CLASS_P (may_be_zero))
758 {
 759 	  /* Try to combine may_be_zero with assumptions; this can simplify
 760 	     the computation of the niter expression.  */
761 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
762 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
763 niter_assumptions,
764 fold_build1 (TRUTH_NOT_EXPR,
765 boolean_type_node,
766 may_be_zero));
767 else
768 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
769 build_int_cst (TREE_TYPE (niter), 0),
770 rewrite_to_non_trapping_overflow (niter));
771
772 may_be_zero = NULL_TREE;
773 }
774 else if (integer_nonzerop (may_be_zero))
775 {
776 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
777 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
778 return cond;
779 }
780 else
781 return cond;
782 }
783
784 *assumptions = niter_assumptions;
785 *number_of_iterationsm1 = niter;
786
787 /* We want the number of loop header executions which is the number
788 of latch executions plus one.
789 ??? For UINT_MAX latch executions this number overflows to zero
790 for loops like do { n++; } while (n != 0); */
791 if (niter && !chrec_contains_undetermined (niter))
792 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
793 build_int_cst (TREE_TYPE (niter), 1));
794 *number_of_iterations = niter;
795
796 return cond;
797 }
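
/* A rough example of the distinction computed above, assuming a simple
   counted loop: for

     for (i = 0; i < n; i++)

   with n known to be non-zero, the latch runs n - 1 times, so
   NUMBER_OF_ITERATIONSM1 is n - 1, while NUMBER_OF_ITERATIONS (the number
   of header executions) is n, matching the "+ 1" adjustment just above.  */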
798
799 /* Function bb_in_loop_p
800
801 Used as predicate for dfs order traversal of the loop bbs. */
802
803 static bool
804 bb_in_loop_p (const_basic_block bb, const void *data)
805 {
806 const class loop *const loop = (const class loop *)data;
807 if (flow_bb_inside_loop_p (loop, bb))
808 return true;
809 return false;
810 }
811
812
813 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
814 stmt_vec_info structs for all the stmts in LOOP_IN. */
815
816 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
817 : vec_info (vec_info::loop, shared),
818 loop (loop_in),
819 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
820 num_itersm1 (NULL_TREE),
821 num_iters (NULL_TREE),
822 num_iters_unchanged (NULL_TREE),
823 num_iters_assumptions (NULL_TREE),
824 vector_costs (nullptr),
825 scalar_costs (nullptr),
826 th (0),
827 versioning_threshold (0),
828 vectorization_factor (0),
829 main_loop_edge (nullptr),
830 skip_main_loop_edge (nullptr),
831 skip_this_loop_edge (nullptr),
832 reusable_accumulators (),
833 max_vectorization_factor (0),
834 mask_skip_niters (NULL_TREE),
835 rgroup_compare_type (NULL_TREE),
836 simd_if_cond (NULL_TREE),
837 unaligned_dr (NULL),
838 peeling_for_alignment (0),
839 ptr_mask (0),
840 ivexpr_map (NULL),
841 scan_map (NULL),
842 slp_unrolling_factor (1),
843 inner_loop_cost_factor (param_vect_inner_loop_cost_factor),
844 vectorizable (false),
845 can_use_partial_vectors_p (param_vect_partial_vector_usage != 0),
846 using_partial_vectors_p (false),
847 epil_using_partial_vectors_p (false),
848 peeling_for_gaps (false),
849 peeling_for_niter (false),
850 no_data_dependencies (false),
851 has_mask_store (false),
852 scalar_loop_scaling (profile_probability::uninitialized ()),
853 scalar_loop (NULL),
854 orig_loop_info (NULL)
855 {
856 /* CHECKME: We want to visit all BBs before their successors (except for
857 latch blocks, for which this assertion wouldn't hold). In the simple
 858      case of the loop forms we allow, a dfs order of the BBs would be the same
859 as reversed postorder traversal, so we are safe. */
860
861 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
862 bbs, loop->num_nodes, loop);
863 gcc_assert (nbbs == loop->num_nodes);
864
865 for (unsigned int i = 0; i < nbbs; i++)
866 {
867 basic_block bb = bbs[i];
868 gimple_stmt_iterator si;
869
870 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
871 {
872 gimple *phi = gsi_stmt (si);
873 gimple_set_uid (phi, 0);
874 add_stmt (phi);
875 }
876
877 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
878 {
879 gimple *stmt = gsi_stmt (si);
880 gimple_set_uid (stmt, 0);
881 if (is_gimple_debug (stmt))
882 continue;
883 add_stmt (stmt);
 884 	  /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
 885 	     third argument is the #pragma omp simd if (x) condition: when 0, the
 886 	     loop shouldn't be vectorized; when a non-zero constant, it should be
 887 	     vectorized normally; otherwise the loop is versioned, with the
 888 	     vectorized copy used if the condition is non-zero at runtime.  */
889 if (loop_in->simduid
890 && is_gimple_call (stmt)
891 && gimple_call_internal_p (stmt)
892 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
893 && gimple_call_num_args (stmt) >= 3
894 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
895 && (loop_in->simduid
896 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
897 {
898 tree arg = gimple_call_arg (stmt, 2);
899 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
900 simd_if_cond = arg;
901 else
902 gcc_assert (integer_nonzerop (arg));
903 }
904 }
905 }
906
907 epilogue_vinfos.create (6);
908 }
909
910 /* Free all levels of rgroup CONTROLS. */
911
912 void
913 release_vec_loop_controls (vec<rgroup_controls> *controls)
914 {
915 rgroup_controls *rgc;
916 unsigned int i;
917 FOR_EACH_VEC_ELT (*controls, i, rgc)
918 rgc->controls.release ();
919 controls->release ();
920 }
921
922 /* Free all memory used by the _loop_vec_info, as well as all the
923 stmt_vec_info structs of all the stmts in the loop. */
924
925 _loop_vec_info::~_loop_vec_info ()
926 {
927 free (bbs);
928
929 release_vec_loop_controls (&masks);
930 release_vec_loop_controls (&lens);
931 delete ivexpr_map;
932 delete scan_map;
933 epilogue_vinfos.release ();
934 delete scalar_costs;
935 delete vector_costs;
936
 937   /* When we release an epilogue vinfo that we do not intend to use,
 938      avoid clearing AUX of the main loop, which should continue to
 939      point to the main loop vinfo; otherwise we'll leak that.  */
940 if (loop->aux == this)
941 loop->aux = NULL;
942 }
943
944 /* Return an invariant or register for EXPR and emit necessary
945 computations in the LOOP_VINFO loop preheader. */
946
947 tree
948 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
949 {
950 if (is_gimple_reg (expr)
951 || is_gimple_min_invariant (expr))
952 return expr;
953
954 if (! loop_vinfo->ivexpr_map)
955 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
956 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
957 if (! cached)
958 {
959 gimple_seq stmts = NULL;
960 cached = force_gimple_operand (unshare_expr (expr),
961 &stmts, true, NULL_TREE);
962 if (stmts)
963 {
964 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
965 gsi_insert_seq_on_edge_immediate (e, stmts);
966 }
967 }
968 return cached;
969 }
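
/* For instance (a hypothetical caller), gimplifying the expression n_1 + 7
   here twice yields the same cached SSA name, with the computation emitted
   only once on the preheader edge, so repeated IV-related expressions are
   not re-materialized for every use.  */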
970
971 /* Return true if we can use CMP_TYPE as the comparison type to produce
972 all masks required to mask LOOP_VINFO. */
973
974 static bool
975 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
976 {
977 rgroup_controls *rgm;
978 unsigned int i;
979 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
980 if (rgm->type != NULL_TREE
981 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
982 cmp_type, rgm->type,
983 OPTIMIZE_FOR_SPEED))
984 return false;
985 return true;
986 }
987
988 /* Calculate the maximum number of scalars per iteration for every
989 rgroup in LOOP_VINFO. */
990
991 static unsigned int
992 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
993 {
994 unsigned int res = 1;
995 unsigned int i;
996 rgroup_controls *rgm;
997 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
998 res = MAX (res, rgm->max_nscalars_per_iter);
999 return res;
1000 }
1001
1002 /* Calculate the minimum precision necessary to represent:
1003
1004 MAX_NITERS * FACTOR
1005
1006 as an unsigned integer, where MAX_NITERS is the maximum number of
1007 loop header iterations for the original scalar form of LOOP_VINFO. */
1008
1009 static unsigned
1010 vect_min_prec_for_max_niters (loop_vec_info loop_vinfo, unsigned int factor)
1011 {
1012 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1013
1014 /* Get the maximum number of iterations that is representable
1015 in the counter type. */
1016 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1017 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1018
1019 /* Get a more refined estimate for the number of iterations. */
1020 widest_int max_back_edges;
1021 if (max_loop_iterations (loop, &max_back_edges))
1022 max_ni = wi::smin (max_ni, max_back_edges + 1);
1023
1024 /* Work out how many bits we need to represent the limit. */
1025 return wi::min_precision (max_ni * factor, UNSIGNED);
1026 }
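
/* A worked example of the computation above, with numbers chosen purely
   for illustration: if the niter analysis bounds the loop at 1000 header
   iterations and FACTOR is 4, the product is 4000, and
   wi::min_precision (4000, UNSIGNED) is 12 because 2^11 <= 4000 < 2^12,
   so a 12-bit (or wider) unsigned counter is sufficient.  */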
1027
1028 /* True if the loop needs peeling or partial vectors when vectorized. */
1029
1030 static bool
1031 vect_need_peeling_or_partial_vectors_p (loop_vec_info loop_vinfo)
1032 {
1033 unsigned HOST_WIDE_INT const_vf;
1034 HOST_WIDE_INT max_niter
1035 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1036
1037 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1038 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1039 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1040 (loop_vinfo));
1041
1042 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1043 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1044 {
1045 /* Work out the (constant) number of iterations that need to be
1046 peeled for reasons other than niters. */
1047 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1048 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1049 peel_niter += 1;
1050 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1051 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1052 return true;
1053 }
1054 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1055 /* ??? When peeling for gaps but not alignment, we could
1056 try to check whether the (variable) niters is known to be
1057 VF * N + 1. That's something of a niche case though. */
1058 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1059 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1060 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1061 < (unsigned) exact_log2 (const_vf))
1062 /* In case of versioning, check if the maximum number of
1063 iterations is greater than th. If they are identical,
1064 the epilogue is unnecessary. */
1065 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1066 || ((unsigned HOST_WIDE_INT) max_niter
1067 > (th / const_vf) * const_vf))))
1068 return true;
1069
1070 return false;
1071 }
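
/* A small numeric illustration with made-up values: for a known niter of
   100, no peeling for alignment or gaps, and a constant VF of 8, 100 is
   not a multiple of 8, so the function returns true (an epilogue or
   partial vectors will be needed); with a niter of 96 it returns false.  */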
1072
1073 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1074 whether we can actually generate the masks required. Return true if so,
1075 storing the type of the scalar IV in LOOP_VINFO_RGROUP_COMPARE_TYPE. */
1076
1077 static bool
1078 vect_verify_full_masking (loop_vec_info loop_vinfo)
1079 {
1080 unsigned int min_ni_width;
1081 unsigned int max_nscalars_per_iter
1082 = vect_get_max_nscalars_per_iter (loop_vinfo);
1083
1084 /* Use a normal loop if there are no statements that need masking.
1085 This only happens in rare degenerate cases: it means that the loop
1086 has no loads, no stores, and no live-out values. */
1087 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1088 return false;
1089
1090 /* Work out how many bits we need to represent the limit. */
1091 min_ni_width
1092 = vect_min_prec_for_max_niters (loop_vinfo, max_nscalars_per_iter);
1093
1094 /* Find a scalar mode for which WHILE_ULT is supported. */
1095 opt_scalar_int_mode cmp_mode_iter;
1096 tree cmp_type = NULL_TREE;
1097 tree iv_type = NULL_TREE;
1098 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
1099 unsigned int iv_precision = UINT_MAX;
1100
1101 if (iv_limit != -1)
1102 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1103 UNSIGNED);
1104
1105 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1106 {
1107 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1108 if (cmp_bits >= min_ni_width
1109 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1110 {
1111 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1112 if (this_type
1113 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1114 {
1115 /* Although we could stop as soon as we find a valid mode,
1116 there are at least two reasons why that's not always the
1117 best choice:
1118
1119 - An IV that's Pmode or wider is more likely to be reusable
1120 in address calculations than an IV that's narrower than
1121 Pmode.
1122
1123 - Doing the comparison in IV_PRECISION or wider allows
1124 a natural 0-based IV, whereas using a narrower comparison
1125 type requires mitigations against wrap-around.
1126
1127 Conversely, if the IV limit is variable, doing the comparison
1128 in a wider type than the original type can introduce
1129 unnecessary extensions, so picking the widest valid mode
1130 is not always a good choice either.
1131
1132 Here we prefer the first IV type that's Pmode or wider,
1133 and the first comparison type that's IV_PRECISION or wider.
1134 (The comparison type must be no wider than the IV type,
1135 to avoid extensions in the vector loop.)
1136
1137 ??? We might want to try continuing beyond Pmode for ILP32
1138 targets if CMP_BITS < IV_PRECISION. */
1139 iv_type = this_type;
1140 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1141 cmp_type = this_type;
1142 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1143 break;
1144 }
1145 }
1146 }
1147
1148 if (!cmp_type)
1149 return false;
1150
1151 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = cmp_type;
1152 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1153 return true;
1154 }
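
/* Sketch, under the assumptions checked above, of what the chosen types
   end up controlling: with full masking the vector loop computes
   something like

     mask = .WHILE_ULT (index, limit)

   for each rgroup, with the operands in CMP_TYPE; hence CMP_TYPE must be
   wide enough for MAX_NITERS * max_nscalars_per_iter, and each rgroup's
   mask type must be producible from CMP_TYPE via IFN_WHILE_ULT.  */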
1155
1156 /* Check whether we can use vector accesses with length based on precision
1157    comparison.  So far, to keep it simple, we only allow the case that the
1158    precision of the target-supported length is larger than the precision
1159    required by the loop niters.  */
1160
1161 static bool
1162 vect_verify_loop_lens (loop_vec_info loop_vinfo)
1163 {
1164 if (LOOP_VINFO_LENS (loop_vinfo).is_empty ())
1165 return false;
1166
1167 unsigned int max_nitems_per_iter = 1;
1168 unsigned int i;
1169 rgroup_controls *rgl;
1170 /* Find the maximum number of items per iteration for every rgroup. */
1171 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), i, rgl)
1172 {
1173 unsigned nitems_per_iter = rgl->max_nscalars_per_iter * rgl->factor;
1174 max_nitems_per_iter = MAX (max_nitems_per_iter, nitems_per_iter);
1175 }
1176
1177 /* Work out how many bits we need to represent the length limit. */
1178 unsigned int min_ni_prec
1179 = vect_min_prec_for_max_niters (loop_vinfo, max_nitems_per_iter);
1180
1181   /* Now use the maximum of the precisions below for one suitable IV type:
1182 - the IV's natural precision
1183 - the precision needed to hold: the maximum number of scalar
1184 iterations multiplied by the scale factor (min_ni_prec above)
1185 - the Pmode precision
1186
1187 If min_ni_prec is less than the precision of the current niters,
1188      we prefer to still use the niters type.  Prefer to use Pmode or
1189      wider IVs to avoid narrow conversions.  */
1190
1191 unsigned int ni_prec
1192 = TYPE_PRECISION (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)));
1193 min_ni_prec = MAX (min_ni_prec, ni_prec);
1194 min_ni_prec = MAX (min_ni_prec, GET_MODE_BITSIZE (Pmode));
1195
1196 tree iv_type = NULL_TREE;
1197 opt_scalar_int_mode tmode_iter;
1198 FOR_EACH_MODE_IN_CLASS (tmode_iter, MODE_INT)
1199 {
1200 scalar_mode tmode = tmode_iter.require ();
1201 unsigned int tbits = GET_MODE_BITSIZE (tmode);
1202
1203 /* ??? Do we really want to construct one IV whose precision exceeds
1204 BITS_PER_WORD? */
1205 if (tbits > BITS_PER_WORD)
1206 break;
1207
1208 /* Find the first available standard integral type. */
1209 if (tbits >= min_ni_prec && targetm.scalar_mode_supported_p (tmode))
1210 {
1211 iv_type = build_nonstandard_integer_type (tbits, true);
1212 break;
1213 }
1214 }
1215
1216 if (!iv_type)
1217 {
1218 if (dump_enabled_p ())
1219 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1220 "can't vectorize with length-based partial vectors"
1221 " because there is no suitable iv type.\n");
1222 return false;
1223 }
1224
1225 LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo) = iv_type;
1226 LOOP_VINFO_RGROUP_IV_TYPE (loop_vinfo) = iv_type;
1227
1228 return true;
1229 }
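
/* An illustration, assuming a 64-bit target with invented numbers: if
   min_ni_prec starts out as 12 bits, it is raised first to the niters
   precision (say 32) and then to the Pmode precision (64), and the mode
   walk above picks the first standard integer mode of at least that many
   bits that does not exceed BITS_PER_WORD, typically a 64-bit unsigned
   IV type.  */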
1230
1231 /* Calculate the cost of one scalar iteration of the loop. */
1232 static void
1233 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1234 {
1235 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1236 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1237 int nbbs = loop->num_nodes, factor;
1238 int innerloop_iters, i;
1239
1240 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1241
1242 /* Gather costs for statements in the scalar loop. */
1243
1244 /* FORNOW. */
1245 innerloop_iters = 1;
1246 if (loop->inner)
1247 innerloop_iters = LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo);
1248
1249 for (i = 0; i < nbbs; i++)
1250 {
1251 gimple_stmt_iterator si;
1252 basic_block bb = bbs[i];
1253
1254 if (bb->loop_father == loop->inner)
1255 factor = innerloop_iters;
1256 else
1257 factor = 1;
1258
1259 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1260 {
1261 gimple *stmt = gsi_stmt (si);
1262 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1263
1264 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1265 continue;
1266
1267 /* Skip stmts that are not vectorized inside the loop. */
1268 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1269 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1270 && (!STMT_VINFO_LIVE_P (vstmt_info)
1271 || !VECTORIZABLE_CYCLE_DEF
1272 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1273 continue;
1274
1275 vect_cost_for_stmt kind;
1276 if (STMT_VINFO_DATA_REF (stmt_info))
1277 {
1278 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1279 kind = scalar_load;
1280 else
1281 kind = scalar_store;
1282 }
1283 else if (vect_nop_conversion_p (stmt_info))
1284 continue;
1285 else
1286 kind = scalar_stmt;
1287
1288 /* We are using vect_prologue here to avoid scaling twice
1289 by the inner loop factor. */
1290 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1291 factor, kind, stmt_info, 0, vect_prologue);
1292 }
1293 }
1294
1295 /* Now accumulate cost. */
1296 loop_vinfo->scalar_costs = init_cost (loop_vinfo, true);
1297 stmt_info_for_cost *si;
1298 int j;
1299 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1300 j, si)
1301 (void) add_stmt_cost (loop_vinfo->scalar_costs, si->count,
1302 si->kind, si->stmt_info, si->vectype,
1303 si->misalign, si->where);
1304 loop_vinfo->scalar_costs->finish_cost (nullptr);
1305 }
1306
1307
1308 /* Function vect_analyze_loop_form.
1309
1310 Verify that certain CFG restrictions hold, including:
1311 - the loop has a pre-header
1312 - the loop has a single entry and exit
1313 - the loop exit condition is simple enough
1314    - the number of iterations can be analyzed, i.e., a countable loop.  The
1315 niter could be analyzed under some assumptions. */
1316
1317 opt_result
1318 vect_analyze_loop_form (class loop *loop, vect_loop_form_info *info)
1319 {
1320 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1321
1322 /* Different restrictions apply when we are considering an inner-most loop,
1323 vs. an outer (nested) loop.
1324 (FORNOW. May want to relax some of these restrictions in the future). */
1325
1326 info->inner_loop_cond = NULL;
1327 if (!loop->inner)
1328 {
1329 /* Inner-most loop. We currently require that the number of BBs is
1330 exactly 2 (the header and latch). Vectorizable inner-most loops
1331 look like this:
1332
1333 (pre-header)
1334 |
1335 header <--------+
1336 | | |
1337 | +--> latch --+
1338 |
1339 (exit-bb) */
1340
1341 if (loop->num_nodes != 2)
1342 return opt_result::failure_at (vect_location,
1343 "not vectorized:"
1344 " control flow in loop.\n");
1345
1346 if (empty_block_p (loop->header))
1347 return opt_result::failure_at (vect_location,
1348 "not vectorized: empty loop.\n");
1349 }
1350 else
1351 {
1352 class loop *innerloop = loop->inner;
1353 edge entryedge;
1354
1355 /* Nested loop. We currently require that the loop is doubly-nested,
1356 contains a single inner loop, and the number of BBs is exactly 5.
1357 Vectorizable outer-loops look like this:
1358
1359 (pre-header)
1360 |
1361 header <---+
1362 | |
1363 inner-loop |
1364 | |
1365 tail ------+
1366 |
1367 (exit-bb)
1368
1369 The inner-loop has the properties expected of inner-most loops
1370 as described above. */
1371
1372 if ((loop->inner)->inner || (loop->inner)->next)
1373 return opt_result::failure_at (vect_location,
1374 "not vectorized:"
1375 " multiple nested loops.\n");
1376
1377 if (loop->num_nodes != 5)
1378 return opt_result::failure_at (vect_location,
1379 "not vectorized:"
1380 " control flow in loop.\n");
1381
1382 entryedge = loop_preheader_edge (innerloop);
1383 if (entryedge->src != loop->header
1384 || !single_exit (innerloop)
1385 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1386 return opt_result::failure_at (vect_location,
1387 "not vectorized:"
1388 " unsupported outerloop form.\n");
1389
1390 /* Analyze the inner-loop. */
1391 vect_loop_form_info inner;
1392 opt_result res = vect_analyze_loop_form (loop->inner, &inner);
1393 if (!res)
1394 {
1395 if (dump_enabled_p ())
1396 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1397 "not vectorized: Bad inner loop.\n");
1398 return res;
1399 }
1400
1401       /* We don't support analyzing the niter under assumptions for the
1402 	 inner loop.  */
1403 if (!integer_onep (inner.assumptions))
1404 return opt_result::failure_at (vect_location,
1405 "not vectorized: Bad inner loop.\n");
1406
1407 if (!expr_invariant_in_loop_p (loop, inner.number_of_iterations))
1408 return opt_result::failure_at (vect_location,
1409 "not vectorized: inner-loop count not"
1410 " invariant.\n");
1411
1412 if (dump_enabled_p ())
1413 dump_printf_loc (MSG_NOTE, vect_location,
1414 "Considering outer-loop vectorization.\n");
1415 info->inner_loop_cond = inner.loop_cond;
1416 }
1417
1418 if (!single_exit (loop))
1419 return opt_result::failure_at (vect_location,
1420 "not vectorized: multiple exits.\n");
1421 if (EDGE_COUNT (loop->header->preds) != 2)
1422 return opt_result::failure_at (vect_location,
1423 "not vectorized:"
1424 " too many incoming edges.\n");
1425
1426   /* We assume that the loop exit condition is at the end of the loop, i.e.,
1427 that the loop is represented as a do-while (with a proper if-guard
1428 before the loop if needed), where the loop header contains all the
1429 executable statements, and the latch is empty. */
1430 if (!empty_block_p (loop->latch)
1431 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1432 return opt_result::failure_at (vect_location,
1433 "not vectorized: latch block not empty.\n");
1434
1435 /* Make sure the exit is not abnormal. */
1436 edge e = single_exit (loop);
1437 if (e->flags & EDGE_ABNORMAL)
1438 return opt_result::failure_at (vect_location,
1439 "not vectorized:"
1440 " abnormal loop exit edge.\n");
1441
1442 info->loop_cond
1443 = vect_get_loop_niters (loop, &info->assumptions,
1444 &info->number_of_iterations,
1445 &info->number_of_iterationsm1);
1446 if (!info->loop_cond)
1447 return opt_result::failure_at
1448 (vect_location,
1449 "not vectorized: complicated exit condition.\n");
1450
1451 if (integer_zerop (info->assumptions)
1452 || !info->number_of_iterations
1453 || chrec_contains_undetermined (info->number_of_iterations))
1454 return opt_result::failure_at
1455 (info->loop_cond,
1456 "not vectorized: number of iterations cannot be computed.\n");
1457
1458 if (integer_zerop (info->number_of_iterations))
1459 return opt_result::failure_at
1460 (info->loop_cond,
1461 "not vectorized: number of iterations = 0.\n");
1462
1463 if (!(tree_fits_shwi_p (info->number_of_iterations)
1464 && tree_to_shwi (info->number_of_iterations) > 0))
1465 {
1466 if (dump_enabled_p ())
1467 {
1468 dump_printf_loc (MSG_NOTE, vect_location,
1469 "Symbolic number of iterations is ");
1470 dump_generic_expr (MSG_NOTE, TDF_DETAILS, info->number_of_iterations);
1471 dump_printf (MSG_NOTE, "\n");
1472 }
1473 }
1474
1475 return opt_result::success ();
1476 }
1477
1478 /* Create a loop_vec_info for LOOP with SHARED and the
1479 vect_analyze_loop_form result. */
1480
1481 loop_vec_info
1482 vect_create_loop_vinfo (class loop *loop, vec_info_shared *shared,
1483 const vect_loop_form_info *info,
1484 loop_vec_info main_loop_info)
1485 {
1486 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1487 LOOP_VINFO_NITERSM1 (loop_vinfo) = info->number_of_iterationsm1;
1488 LOOP_VINFO_NITERS (loop_vinfo) = info->number_of_iterations;
1489 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = info->number_of_iterations;
1490 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = main_loop_info;
1491 /* Also record the assumptions for versioning. */
1492 if (!integer_onep (info->assumptions) && !main_loop_info)
1493 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = info->assumptions;
1494
1495 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (info->loop_cond);
1496 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1497 if (info->inner_loop_cond)
1498 {
1499 stmt_vec_info inner_loop_cond_info
1500 = loop_vinfo->lookup_stmt (info->inner_loop_cond);
1501 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1502 /* If we have an estimate on the number of iterations of the inner
1503 	 loop, use that to limit the scale for costing; otherwise use
1504 --param vect-inner-loop-cost-factor literally. */
1505 widest_int nit;
1506 if (estimated_stmt_executions (loop->inner, &nit))
1507 LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo)
1508 = wi::smin (nit, param_vect_inner_loop_cost_factor).to_uhwi ();
1509 }
1510
1511 return loop_vinfo;
1512 }
1513
1514
1515
1516 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1517    statements, update the vectorization factor.  */
1518
1519 static void
1520 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1521 {
1522 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1523 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1524 int nbbs = loop->num_nodes;
1525 poly_uint64 vectorization_factor;
1526 int i;
1527
1528 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1529
1530 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1531 gcc_assert (known_ne (vectorization_factor, 0U));
1532
1533   /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1534      vectorization factor of the loop is the unrolling factor required by
1535      the SLP instances.  If that unrolling factor is 1, we say that we
1536      perform pure SLP on the loop - cross-iteration parallelism is not
1537      exploited.  */
1538 bool only_slp_in_loop = true;
1539 for (i = 0; i < nbbs; i++)
1540 {
1541 basic_block bb = bbs[i];
1542 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1543 gsi_next (&si))
1544 {
1545 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1546 if (!stmt_info)
1547 continue;
1548 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1549 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1550 && !PURE_SLP_STMT (stmt_info))
1551 /* STMT needs both SLP and loop-based vectorization. */
1552 only_slp_in_loop = false;
1553 }
1554 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1555 gsi_next (&si))
1556 {
1557 if (is_gimple_debug (gsi_stmt (si)))
1558 continue;
1559 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1560 stmt_info = vect_stmt_to_vectorize (stmt_info);
1561 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1562 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1563 && !PURE_SLP_STMT (stmt_info))
1564 /* STMT needs both SLP and loop-based vectorization. */
1565 only_slp_in_loop = false;
1566 }
1567 }
1568
1569 if (only_slp_in_loop)
1570 {
1571 if (dump_enabled_p ())
1572 dump_printf_loc (MSG_NOTE, vect_location,
1573 "Loop contains only SLP stmts\n");
1574 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1575 }
1576 else
1577 {
1578 if (dump_enabled_p ())
1579 dump_printf_loc (MSG_NOTE, vect_location,
1580 "Loop contains SLP and non-SLP stmts\n");
1581 /* Both the vectorization factor and unroll factor have the form
1582 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1583 so they must have a common multiple. */
1584 vectorization_factor
1585 = force_common_multiple (vectorization_factor,
1586 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1587 }
1588
1589 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1590 if (dump_enabled_p ())
1591 {
1592 dump_printf_loc (MSG_NOTE, vect_location,
1593 "Updating vectorization factor to ");
1594 dump_dec (MSG_NOTE, vectorization_factor);
1595 dump_printf (MSG_NOTE, ".\n");
1596 }
1597 }
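
/* A numeric sketch of the combination above, with illustrative values
   only: if the loop-based vectorization factor is 4 and the SLP unrolling
   factor is 2, force_common_multiple yields 4; with factors 4 and 8 the
   updated vectorization factor becomes 8.  */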
1598
1599 /* Return true if STMT_INFO describes a double reduction phi and if
1600 the other phi in the reduction is also relevant for vectorization.
1601 This rejects cases such as:
1602
1603 outer1:
1604 x_1 = PHI <x_3(outer2), ...>;
1605 ...
1606
1607 inner:
1608 x_2 = ...;
1609 ...
1610
1611 outer2:
1612 x_3 = PHI <x_2(inner)>;
1613
1614 if nothing in x_2 or elsewhere makes x_1 relevant. */
1615
1616 static bool
1617 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1618 {
1619 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1620 return false;
1621
1622 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1623 }
1624
1625 /* Function vect_analyze_loop_operations.
1626
1627 Scan the loop stmts and make sure they are all vectorizable. */
1628
1629 static opt_result
1630 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1631 {
1632 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1633 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1634 int nbbs = loop->num_nodes;
1635 int i;
1636 stmt_vec_info stmt_info;
1637 bool need_to_vectorize = false;
1638 bool ok;
1639
1640 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1641
1642 auto_vec<stmt_info_for_cost> cost_vec;
1643
1644 for (i = 0; i < nbbs; i++)
1645 {
1646 basic_block bb = bbs[i];
1647
1648 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1649 gsi_next (&si))
1650 {
1651 gphi *phi = si.phi ();
1652 ok = true;
1653
1654 stmt_info = loop_vinfo->lookup_stmt (phi);
1655 if (dump_enabled_p ())
1656 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1657 if (virtual_operand_p (gimple_phi_result (phi)))
1658 continue;
1659
1660 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1661 (i.e., a phi in the tail of the outer-loop). */
1662 if (! is_loop_header_bb_p (bb))
1663 {
1664 	      /* FORNOW: we currently don't support the case that these phis
1665 		 are not used in the outer-loop (unless it is a double reduction,
1666 		 i.e., this phi is vect_reduction_def), because this case
1667 		 requires actually doing something here.  */
1668 if (STMT_VINFO_LIVE_P (stmt_info)
1669 && !vect_active_double_reduction_p (stmt_info))
1670 return opt_result::failure_at (phi,
1671 "Unsupported loop-closed phi"
1672 " in outer-loop.\n");
1673
1674 /* If PHI is used in the outer loop, we check that its operand
1675 is defined in the inner loop. */
1676 if (STMT_VINFO_RELEVANT_P (stmt_info))
1677 {
1678 tree phi_op;
1679
1680 if (gimple_phi_num_args (phi) != 1)
1681 return opt_result::failure_at (phi, "unsupported phi");
1682
1683 phi_op = PHI_ARG_DEF (phi, 0);
1684 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1685 if (!op_def_info)
1686 return opt_result::failure_at (phi, "unsupported phi\n");
1687
1688 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1689 && (STMT_VINFO_RELEVANT (op_def_info)
1690 != vect_used_in_outer_by_reduction))
1691 return opt_result::failure_at (phi, "unsupported phi\n");
1692
1693 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1694 || (STMT_VINFO_DEF_TYPE (stmt_info)
1695 == vect_double_reduction_def))
1696 && !vectorizable_lc_phi (loop_vinfo,
1697 stmt_info, NULL, NULL))
1698 return opt_result::failure_at (phi, "unsupported phi\n");
1699 }
1700
1701 continue;
1702 }
1703
1704 gcc_assert (stmt_info);
1705
1706 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1707 || STMT_VINFO_LIVE_P (stmt_info))
1708 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1709 /* A scalar-dependence cycle that we don't support. */
1710 return opt_result::failure_at (phi,
1711 "not vectorized:"
1712 " scalar dependence cycle.\n");
1713
1714 if (STMT_VINFO_RELEVANT_P (stmt_info))
1715 {
1716 need_to_vectorize = true;
1717 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1718 && ! PURE_SLP_STMT (stmt_info))
1719 ok = vectorizable_induction (loop_vinfo,
1720 stmt_info, NULL, NULL,
1721 &cost_vec);
1722 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1723 || (STMT_VINFO_DEF_TYPE (stmt_info)
1724 == vect_double_reduction_def)
1725 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1726 && ! PURE_SLP_STMT (stmt_info))
1727 ok = vectorizable_reduction (loop_vinfo,
1728 stmt_info, NULL, NULL, &cost_vec);
1729 }
1730
1731 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1732 if (ok
1733 && STMT_VINFO_LIVE_P (stmt_info)
1734 && !PURE_SLP_STMT (stmt_info))
1735 ok = vectorizable_live_operation (loop_vinfo,
1736 stmt_info, NULL, NULL, NULL,
1737 -1, false, &cost_vec);
1738
1739 if (!ok)
1740 return opt_result::failure_at (phi,
1741 "not vectorized: relevant phi not "
1742 "supported: %G",
1743 static_cast <gimple *> (phi));
1744 }
1745
1746 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1747 gsi_next (&si))
1748 {
1749 gimple *stmt = gsi_stmt (si);
1750 if (!gimple_clobber_p (stmt)
1751 && !is_gimple_debug (stmt))
1752 {
1753 opt_result res
1754 = vect_analyze_stmt (loop_vinfo,
1755 loop_vinfo->lookup_stmt (stmt),
1756 &need_to_vectorize,
1757 NULL, NULL, &cost_vec);
1758 if (!res)
1759 return res;
1760 }
1761 }
1762 } /* bbs */
1763
1764 add_stmt_costs (loop_vinfo->vector_costs, &cost_vec);
1765
1766 /* All operations in the loop are either irrelevant (they deal with
1767 loop control, or are dead), or are only used outside the loop and
1768 can be moved out of it (e.g. invariants, inductions). The loop can
1769 be optimized away by scalar optimizations, so we're better off not
1770 touching it. */
1771 if (!need_to_vectorize)
1772 {
1773 if (dump_enabled_p ())
1774 dump_printf_loc (MSG_NOTE, vect_location,
1775 "All the computation can be taken out of the loop.\n");
1776 return opt_result::failure_at
1777 (vect_location,
1778 "not vectorized: redundant loop. no profit to vectorize.\n");
1779 }
1780
1781 return opt_result::success ();
1782 }
1783
1784 /* Return true if we know that the iteration count is smaller than the
1785 vectorization factor. Return false if it isn't, or if we can't be sure
1786 either way. */
1787
1788 static bool
1789 vect_known_niters_smaller_than_vf (loop_vec_info loop_vinfo)
1790 {
1791 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1792
1793 HOST_WIDE_INT max_niter;
1794 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1795 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1796 else
1797 max_niter = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1798
1799 if (max_niter != -1 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1800 return true;
1801
1802 return false;
1803 }
1804
1805 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1806 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1807 definitely no, or -1 if it's worth retrying. */
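/* A purely illustrative example with hypothetical numbers: if the cost
   model reports min_profitable_iters = 12 while
   param_min_vect_loop_bound * assumed_vf = 32, the threshold computed below
   is MAX (32, 12) = 32, so a loop known to execute only 20 iterations is
   rejected with a return value of 0.  */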
1808
1809 static int
1810 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1811 {
1812 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1813 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1814
1815 /* Only loops that can handle partially-populated vectors can have iteration
1816 counts less than the vectorization factor. */
1817 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
1818 {
1819 if (vect_known_niters_smaller_than_vf (loop_vinfo))
1820 {
1821 if (dump_enabled_p ())
1822 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1823 "not vectorized: iteration count smaller than "
1824 "vectorization factor.\n");
1825 return 0;
1826 }
1827 }
1828
1829 /* If using the "very cheap" model, reject cases in which we'd keep
1830 a copy of the scalar code (even if we might be able to vectorize it). */
1831 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1832 && (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1833 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1834 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
1835 {
1836 if (dump_enabled_p ())
1837 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1838 "some scalar iterations would need to be peeled\n");
1839 return 0;
1840 }
1841
1842 int min_profitable_iters, min_profitable_estimate;
1843 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1844 &min_profitable_estimate);
1845
1846 if (min_profitable_iters < 0)
1847 {
1848 if (dump_enabled_p ())
1849 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1850 "not vectorized: vectorization not profitable.\n");
1851 if (dump_enabled_p ())
1852 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1853 "not vectorized: vector version will never be "
1854 "profitable.\n");
1855 return -1;
1856 }
1857
1858 int min_scalar_loop_bound = (param_min_vect_loop_bound
1859 * assumed_vf);
1860
1861 /* Use the cost model only if it is more conservative than user specified
1862 threshold. */
1863 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1864 min_profitable_iters);
1865
1866 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1867
1868 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1869 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1870 {
1871 if (dump_enabled_p ())
1872 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1873 "not vectorized: vectorization not profitable.\n");
1874 if (dump_enabled_p ())
1875 dump_printf_loc (MSG_NOTE, vect_location,
1876 "not vectorized: iteration count smaller than user "
1877 "specified loop bound parameter or minimum profitable "
1878 "iterations (whichever is more conservative).\n");
1879 return 0;
1880 }
1881
1882 /* The static profitability threshold min_profitable_estimate includes
1883 the cost of having to check at runtime whether the scalar loop
1884 should be used instead. If it turns out that we don't need or want
1885 such a check, the threshold we should use for the static estimate
1886 is simply the point at which the vector loop becomes more profitable
1887 than the scalar loop. */
1888 if (min_profitable_estimate > min_profitable_iters
1889 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1890 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1891 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1892 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1893 {
1894 if (dump_enabled_p ())
1895 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1896 " choice between the scalar and vector loops\n");
1897 min_profitable_estimate = min_profitable_iters;
1898 }
1899
1900 /* If the vector loop needs multiple iterations to be beneficial then
1901 things are probably too close to call, and the conservative thing
1902 would be to stick with the scalar code. */
1903 if (loop_cost_model (loop) == VECT_COST_MODEL_VERY_CHEAP
1904 && min_profitable_estimate > (int) vect_vf_for_cost (loop_vinfo))
1905 {
1906 if (dump_enabled_p ())
1907 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1908 "one iteration of the vector loop would be"
1909 " more expensive than the equivalent number of"
1910 " iterations of the scalar loop\n");
1911 return 0;
1912 }
1913
1914 HOST_WIDE_INT estimated_niter;
1915
1916 /* If we are vectorizing an epilogue then we know the maximum number of
1917 scalar iterations it will cover is at least one lower than the
1918 vectorization factor of the main loop. */
1919 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1920 estimated_niter
1921 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1922 else
1923 {
1924 estimated_niter = estimated_stmt_executions_int (loop);
1925 if (estimated_niter == -1)
1926 estimated_niter = likely_max_stmt_executions_int (loop);
1927 }
1928 if (estimated_niter != -1
1929 && ((unsigned HOST_WIDE_INT) estimated_niter
1930 < MAX (th, (unsigned) min_profitable_estimate)))
1931 {
1932 if (dump_enabled_p ())
1933 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1934 "not vectorized: estimated iteration count too "
1935 "small.\n");
1936 if (dump_enabled_p ())
1937 dump_printf_loc (MSG_NOTE, vect_location,
1938 "not vectorized: estimated iteration count smaller "
1939 "than specified loop bound parameter or minimum "
1940 "profitable iterations (whichever is more "
1941 "conservative).\n");
1942 return -1;
1943 }
1944
1945 return 1;
1946 }
1947
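/* Walk the statements in the basic blocks BBS of LOOP, counting them in
   *N_STMTS and collecting their data references into DATAREFS.  Fail if a
   statement contains a data reference that cannot be analyzed, except for
   calls in safelen loops to functions with SIMD clones that have no data
   references in the call itself, which are skipped.  Also fail fatally once
   the number of data references exceeds
   param_loop_max_datarefs_for_datadeps.  (This summary paraphrases the
   behaviour of the code below.)  */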
1948 static opt_result
1949 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1950 vec<data_reference_p> *datarefs,
1951 unsigned int *n_stmts)
1952 {
1953 *n_stmts = 0;
1954 for (unsigned i = 0; i < loop->num_nodes; i++)
1955 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1956 !gsi_end_p (gsi); gsi_next (&gsi))
1957 {
1958 gimple *stmt = gsi_stmt (gsi);
1959 if (is_gimple_debug (stmt))
1960 continue;
1961 ++(*n_stmts);
1962 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs,
1963 NULL, 0);
1964 if (!res)
1965 {
1966 if (is_gimple_call (stmt) && loop->safelen)
1967 {
1968 tree fndecl = gimple_call_fndecl (stmt), op;
1969 if (fndecl != NULL_TREE)
1970 {
1971 cgraph_node *node = cgraph_node::get (fndecl);
1972 if (node != NULL && node->simd_clones != NULL)
1973 {
1974 unsigned int j, n = gimple_call_num_args (stmt);
1975 for (j = 0; j < n; j++)
1976 {
1977 op = gimple_call_arg (stmt, j);
1978 if (DECL_P (op)
1979 || (REFERENCE_CLASS_P (op)
1980 && get_base_address (op)))
1981 break;
1982 }
1983 op = gimple_call_lhs (stmt);
1984 /* Ignore #pragma omp declare simd functions
1985 if they don't have data references in the
1986 call stmt itself. */
1987 if (j == n
1988 && !(op
1989 && (DECL_P (op)
1990 || (REFERENCE_CLASS_P (op)
1991 && get_base_address (op)))))
1992 continue;
1993 }
1994 }
1995 }
1996 return res;
1997 }
1998 /* If dependence analysis will give up due to the limit on the
1999 number of datarefs, stop here and fail fatally. */
2000 if (datarefs->length ()
2001 > (unsigned)param_loop_max_datarefs_for_datadeps)
2002 return opt_result::failure_at (stmt, "exceeded param "
2003 "loop-max-datarefs-for-datadeps\n");
2004 }
2005 return opt_result::success ();
2006 }
2007
2008 /* Look for SLP-only access groups and turn each individual access into its own
2009 group. */
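/* Illustrative example with a hypothetical group: a two-element store group
   writing a[2*i] and a[2*i+1] that is usable only under SLP is split below
   into two singleton groups, each with DR_GROUP_SIZE 1 and, for non-strided
   accesses, DR_GROUP_GAP of group_size - 1 = 1, with the alignment
   information duplicated onto each new group leader.  */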
2010 static void
2011 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
2012 {
2013 unsigned int i;
2014 struct data_reference *dr;
2015
2016 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
2017
2018 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (loop_vinfo);
2019 FOR_EACH_VEC_ELT (datarefs, i, dr)
2020 {
2021 gcc_assert (DR_REF (dr));
2022 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
2023
2024 /* Check whether the access is part of an interleaving chain. */
2025 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
2026 {
2027 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
2028 dr_vec_info *dr_info = STMT_VINFO_DR_INFO (first_element);
2029 unsigned int group_size = DR_GROUP_SIZE (first_element);
2030
2031 /* Check for SLP-only groups. */
2032 if (!STMT_SLP_TYPE (stmt_info)
2033 && STMT_VINFO_SLP_VECT_ONLY (first_element))
2034 {
2035 /* Dissolve the group. */
2036 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
2037
2038 stmt_vec_info vinfo = first_element;
2039 while (vinfo)
2040 {
2041 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
2042 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
2043 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
2044 DR_GROUP_SIZE (vinfo) = 1;
2045 if (STMT_VINFO_STRIDED_P (first_element))
2046 DR_GROUP_GAP (vinfo) = 0;
2047 else
2048 DR_GROUP_GAP (vinfo) = group_size - 1;
2049 /* Duplicate and adjust the alignment info; it needs to
2050 be present on each group leader, see dr_misalignment. */
2051 if (vinfo != first_element)
2052 {
2053 dr_vec_info *dr_info2 = STMT_VINFO_DR_INFO (vinfo);
2054 dr_info2->target_alignment = dr_info->target_alignment;
2055 int misalignment = dr_info->misalignment;
2056 if (misalignment != DR_MISALIGNMENT_UNKNOWN)
2057 {
2058 HOST_WIDE_INT diff
2059 = (TREE_INT_CST_LOW (DR_INIT (dr_info2->dr))
2060 - TREE_INT_CST_LOW (DR_INIT (dr_info->dr)));
2061 unsigned HOST_WIDE_INT align_c
2062 = dr_info->target_alignment.to_constant ();
2063 misalignment = (misalignment + diff) % align_c;
2064 }
2065 dr_info2->misalignment = misalignment;
2066 }
2067 vinfo = next;
2068 }
2069 }
2070 }
2071 }
2072 }
2073
2074 /* Determine if operating on full vectors for LOOP_VINFO might leave
2075 some scalar iterations still to do. If so, decide how we should
2076 handle those scalar iterations. The possibilities are:
2077
2078 (1) Make LOOP_VINFO operate on partial vectors instead of full vectors.
2079 In this case:
2080
2081 LOOP_VINFO_USING_PARTIAL_VECTORS_P == true
2082 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2083 LOOP_VINFO_PEELING_FOR_NITER == false
2084
2085 (2) Make LOOP_VINFO operate on full vectors and use an epilogue loop
2086 to handle the remaining scalar iterations. In this case:
2087
2088 LOOP_VINFO_USING_PARTIAL_VECTORS_P == false
2089 LOOP_VINFO_PEELING_FOR_NITER == true
2090
2091 There are two choices:
2092
2093 (2a) Consider vectorizing the epilogue loop at the same VF as the
2094 main loop, but using partial vectors instead of full vectors.
2095 In this case:
2096
2097 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == true
2098
2099 (2b) Consider vectorizing the epilogue loop at lower VFs only.
2100 In this case:
2101
2102 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P == false
2103
2104 When FOR_EPILOGUE_P is true, make this determination based on the
2105 assumption that LOOP_VINFO is an epilogue loop, otherwise make it
2106 based on the assumption that LOOP_VINFO is the main loop. The caller
2107 has made sure that the number of iterations is set appropriately for
2108 this value of FOR_EPILOGUE_P. */
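/* A worked example with hypothetical numbers: for 1000 scalar iterations
   and a vectorization factor of 16, option (1) executes 63 vector
   iterations with the last one operating on a partial vector, whereas
   option (2) executes 62 full-vector iterations and leaves
   1000 - 62 * 16 = 8 scalar iterations for the epilogue, handled as in
   (2a) or (2b).  */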
2109
2110 opt_result
2111 vect_determine_partial_vectors_and_peeling (loop_vec_info loop_vinfo,
2112 bool for_epilogue_p)
2113 {
2114 /* Determine whether there would be any scalar iterations left over. */
2115 bool need_peeling_or_partial_vectors_p
2116 = vect_need_peeling_or_partial_vectors_p (loop_vinfo);
2117
2118 /* Decide whether to vectorize the loop with partial vectors. */
2119 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2120 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = false;
2121 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2122 && need_peeling_or_partial_vectors_p)
2123 {
2124 /* For partial-vector-usage=1, try to push the handling of partial
2125 vectors to the epilogue, with the main loop continuing to operate
2126 on full vectors.
2127
2128 ??? We could then end up failing to use partial vectors if we
2129 decide to peel iterations into a prologue, and if the main loop
2130 then ends up processing fewer than VF iterations. */
2131 if (param_vect_partial_vector_usage == 1
2132 && !LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2133 && !vect_known_niters_smaller_than_vf (loop_vinfo))
2134 LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2135 else
2136 LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo) = true;
2137 }
2138
2139 if (dump_enabled_p ())
2140 {
2141 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2142 dump_printf_loc (MSG_NOTE, vect_location,
2143 "operating on partial vectors%s.\n",
2144 for_epilogue_p ? " for epilogue loop" : "");
2145 else
2146 dump_printf_loc (MSG_NOTE, vect_location,
2147 "operating only on full vectors%s.\n",
2148 for_epilogue_p ? " for epilogue loop" : "");
2149 }
2150
2151 if (for_epilogue_p)
2152 {
2153 loop_vec_info orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2154 gcc_assert (orig_loop_vinfo);
2155 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2156 gcc_assert (known_lt (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2157 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)));
2158 }
2159
2160 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2161 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2162 {
2163 /* Check that the loop processes at least one full vector. */
2164 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2165 tree scalar_niters = LOOP_VINFO_NITERS (loop_vinfo);
2166 if (known_lt (wi::to_widest (scalar_niters), vf))
2167 return opt_result::failure_at (vect_location,
2168 "loop does not have enough iterations"
2169 " to support vectorization.\n");
2170
2171 /* If we need to peel an extra epilogue iteration to handle data
2172 accesses with gaps, check that there are enough scalar iterations
2173 available.
2174
2175 The check above is redundant with this one when peeling for gaps,
2176 but the distinction is useful for diagnostics. */
2177 tree scalar_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
2178 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2179 && known_lt (wi::to_widest (scalar_nitersm1), vf))
2180 return opt_result::failure_at (vect_location,
2181 "loop does not have enough iterations"
2182 " to support peeling for gaps.\n");
2183 }
2184
2185 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
2186 = (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
2187 && need_peeling_or_partial_vectors_p);
2188
2189 return opt_result::success ();
2190 }
2191
2192 /* Function vect_analyze_loop_2.
2193
2194 Apply a set of analyses on the loop represented by LOOP_VINFO.
2195 The different analyses will record their results in the
2196 loop_vec_info struct. */
2197 static opt_result
2198 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
2199 {
2200 opt_result ok = opt_result::success ();
2201 int res;
2202 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2203 poly_uint64 min_vf = 2;
2204 loop_vec_info orig_loop_vinfo = NULL;
2205
2206 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
2207 loop_vec_info of the first vectorized loop. */
2208 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2209 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
2210 else
2211 orig_loop_vinfo = loop_vinfo;
2212 gcc_assert (orig_loop_vinfo);
2213
2214 /* The first group of checks is independent of the vector size. */
2215 fatal = true;
2216
2217 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
2218 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
2219 return opt_result::failure_at (vect_location,
2220 "not vectorized: simd if(0)\n");
2221
2222 /* Find all data references in the loop (which correspond to vdefs/vuses)
2223 and analyze their evolution in the loop. */
2224
2225 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2226
2227 /* Gather the data references and count stmts in the loop. */
2228 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
2229 {
2230 opt_result res
2231 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
2232 &LOOP_VINFO_DATAREFS (loop_vinfo),
2233 &LOOP_VINFO_N_STMTS (loop_vinfo));
2234 if (!res)
2235 {
2236 if (dump_enabled_p ())
2237 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2238 "not vectorized: loop contains function "
2239 "calls or data references that cannot "
2240 "be analyzed\n");
2241 return res;
2242 }
2243 loop_vinfo->shared->save_datarefs ();
2244 }
2245 else
2246 loop_vinfo->shared->check_datarefs ();
2247
2248 /* Analyze the data references and also adjust the minimal
2249 vectorization factor according to the loads and stores. */
2250
2251 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
2252 if (!ok)
2253 {
2254 if (dump_enabled_p ())
2255 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2256 "bad data references.\n");
2257 return ok;
2258 }
2259
2260 /* Classify all cross-iteration scalar data-flow cycles.
2261 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2262 vect_analyze_scalar_cycles (loop_vinfo);
2263
2264 vect_pattern_recog (loop_vinfo);
2265
2266 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2267
2268 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2269 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2270
2271 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
2272 if (!ok)
2273 {
2274 if (dump_enabled_p ())
2275 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2276 "bad data access.\n");
2277 return ok;
2278 }
2279
2280 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2281
2282 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
2283 if (!ok)
2284 {
2285 if (dump_enabled_p ())
2286 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2287 "unexpected pattern.\n");
2288 return ok;
2289 }
2290
2291 /* The rest of the analysis below depends on the vector size, so failures from here on are not necessarily fatal. */
2292 fatal = false;
2293
2294 /* Analyze data dependences between the data-refs in the loop
2295 and adjust the maximum vectorization factor according to
2296 the dependences.
2297 FORNOW: fail at the first data dependence that we encounter. */
2298
2299 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2300 if (!ok)
2301 {
2302 if (dump_enabled_p ())
2303 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2304 "bad data dependence.\n");
2305 return ok;
2306 }
2307 if (max_vf != MAX_VECTORIZATION_FACTOR
2308 && maybe_lt (max_vf, min_vf))
2309 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2310 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2311
2312 ok = vect_determine_vectorization_factor (loop_vinfo);
2313 if (!ok)
2314 {
2315 if (dump_enabled_p ())
2316 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2317 "can't determine vectorization factor.\n");
2318 return ok;
2319 }
2320 if (max_vf != MAX_VECTORIZATION_FACTOR
2321 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2322 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2323
2324 /* Compute the scalar iteration cost. */
2325 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2326
2327 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2328
2329 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2330 ok = vect_analyze_slp (loop_vinfo, LOOP_VINFO_N_STMTS (loop_vinfo));
2331 if (!ok)
2332 return ok;
2333
2334 /* If there are any SLP instances mark them as pure_slp. */
2335 bool slp = vect_make_slp_decision (loop_vinfo);
2336 if (slp)
2337 {
2338 /* Find stmts that need to be both vectorized and SLPed. */
2339 vect_detect_hybrid_slp (loop_vinfo);
2340
2341 /* Update the vectorization factor based on the SLP decision. */
2342 vect_update_vf_for_slp (loop_vinfo);
2343
2344 /* Optimize the SLP graph with the vectorization factor fixed. */
2345 vect_optimize_slp (loop_vinfo);
2346
2347 /* Gather the loads reachable from the SLP graph entries. */
2348 vect_gather_slp_loads (loop_vinfo);
2349 }
2350
2351 bool saved_can_use_partial_vectors_p
2352 = LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo);
2353
2354 /* We don't expect to have to roll back to anything other than an empty
2355 set of rgroups. */
2356 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2357
2358 /* This is the point where we can re-start analysis with SLP forced off. */
2359 start_over:
2360
2361 /* Now the vectorization factor is final. */
2362 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2363 gcc_assert (known_ne (vectorization_factor, 0U));
2364
2365 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2366 {
2367 dump_printf_loc (MSG_NOTE, vect_location,
2368 "vectorization_factor = ");
2369 dump_dec (MSG_NOTE, vectorization_factor);
2370 dump_printf (MSG_NOTE, ", niters = %wd\n",
2371 LOOP_VINFO_INT_NITERS (loop_vinfo));
2372 }
2373
2374 loop_vinfo->vector_costs = init_cost (loop_vinfo, false);
2375
2376 /* Analyze the alignment of the data-refs in the loop.
2377 Fail if a data reference is found that cannot be vectorized. */
2378
2379 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2380 if (!ok)
2381 {
2382 if (dump_enabled_p ())
2383 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2384 "bad data alignment.\n");
2385 return ok;
2386 }
2387
2388 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2389 It is important to call pruning after vect_analyze_data_ref_accesses,
2390 since we use grouping information gathered by interleaving analysis. */
2391 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2392 if (!ok)
2393 return ok;
2394
2395 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2396 vectorization, since we do not want to add extra peeling or
2397 add versioning for alignment. */
2398 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2399 /* This pass will decide on using loop versioning and/or loop peeling in
2400 order to enhance the alignment of data references in the loop. */
2401 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2402 if (!ok)
2403 return ok;
2404
2405 if (slp)
2406 {
2407 /* Analyze operations in the SLP instances. Note this may
2408 remove unsupported SLP instances, which makes the above
2409 SLP kind detection invalid. */
2410 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2411 vect_slp_analyze_operations (loop_vinfo);
2412 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2413 {
2414 ok = opt_result::failure_at (vect_location,
2415 "unsupported SLP instances\n");
2416 goto again;
2417 }
2418
2419 /* Check whether any load in ALL SLP instances is possibly permuted. */
2420 slp_tree load_node, slp_root;
2421 unsigned i, x;
2422 slp_instance instance;
2423 bool can_use_lanes = true;
2424 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), x, instance)
2425 {
2426 slp_root = SLP_INSTANCE_TREE (instance);
2427 int group_size = SLP_TREE_LANES (slp_root);
2428 tree vectype = SLP_TREE_VECTYPE (slp_root);
2429 bool loads_permuted = false;
2430 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2431 {
2432 if (!SLP_TREE_LOAD_PERMUTATION (load_node).exists ())
2433 continue;
2434 unsigned j;
2435 stmt_vec_info load_info;
2436 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (load_node), j, load_info)
2437 if (SLP_TREE_LOAD_PERMUTATION (load_node)[j] != j)
2438 {
2439 loads_permuted = true;
2440 break;
2441 }
2442 }
2443
2444 /* If the loads and stores can be handled with load/store-lane
2445 instructions record it and move on to the next instance. */
2446 if (loads_permuted
2447 && SLP_INSTANCE_KIND (instance) == slp_inst_kind_store
2448 && vect_store_lanes_supported (vectype, group_size, false))
2449 {
2450 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), i, load_node)
2451 {
2452 stmt_vec_info stmt_vinfo = DR_GROUP_FIRST_ELEMENT
2453 (SLP_TREE_SCALAR_STMTS (load_node)[0]);
2454 /* Use SLP for strided accesses (or if we can't
2455 load-lanes). */
2456 if (STMT_VINFO_STRIDED_P (stmt_vinfo)
2457 || ! vect_load_lanes_supported
2458 (STMT_VINFO_VECTYPE (stmt_vinfo),
2459 DR_GROUP_SIZE (stmt_vinfo), false))
2460 break;
2461 }
2462
2463 can_use_lanes
2464 = can_use_lanes && i == SLP_INSTANCE_LOADS (instance).length ();
2465
2466 if (can_use_lanes && dump_enabled_p ())
2467 dump_printf_loc (MSG_NOTE, vect_location,
2468 "SLP instance %p can use load/store-lanes\n",
2469 instance);
2470 }
2471 else
2472 {
2473 can_use_lanes = false;
2474 break;
2475 }
2476 }
2477
2478 /* If all SLP instances can use load/store-lanes, abort SLP and try again
2479 with SLP disabled. */
2480 if (can_use_lanes)
2481 {
2482 ok = opt_result::failure_at (vect_location,
2483 "Built SLP cancelled: can use "
2484 "load/store-lanes\n");
2485 if (dump_enabled_p ())
2486 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2487 "Built SLP cancelled: all SLP instances support "
2488 "load/store-lanes\n");
2489 goto again;
2490 }
2491 }
2492
2493 /* Dissolve SLP-only groups. */
2494 vect_dissolve_slp_only_groups (loop_vinfo);
2495
2496 /* Scan all the remaining operations in the loop that are not subject
2497 to SLP and make sure they are vectorizable. */
2498 ok = vect_analyze_loop_operations (loop_vinfo);
2499 if (!ok)
2500 {
2501 if (dump_enabled_p ())
2502 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2503 "bad operation or unsupported loop bound.\n");
2504 return ok;
2505 }
2506
2507 /* For now, we don't expect to mix both masking and length approaches for one
2508 loop; disable partial vectors if both are recorded. */
2509 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2510 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ()
2511 && !LOOP_VINFO_LENS (loop_vinfo).is_empty ())
2512 {
2513 if (dump_enabled_p ())
2514 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2515 "can't vectorize a loop with partial vectors"
2516 " because we don't expect to mix different"
2517 " approaches with partial vectors for the"
2518 " same loop.\n");
2519 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2520 }
2521
2522 /* If we still have the option of using partial vectors,
2523 check whether we can generate the necessary loop controls. */
2524 if (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2525 && !vect_verify_full_masking (loop_vinfo)
2526 && !vect_verify_loop_lens (loop_vinfo))
2527 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
2528
2529 /* If we're vectorizing an epilogue loop, the vectorized loop either needs
2530 to be able to handle fewer than VF scalars, or needs to have a lower VF
2531 than the main loop. */
2532 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2533 && !LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2534 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2535 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2536 return opt_result::failure_at (vect_location,
2537 "Vectorization factor too high for"
2538 " epilogue loop.\n");
2539
2540 /* Decide whether this loop_vinfo should use partial vectors or peeling,
2541 assuming that the loop will be used as a main loop. We will redo
2542 this analysis later if we instead decide to use the loop as an
2543 epilogue loop. */
2544 ok = vect_determine_partial_vectors_and_peeling (loop_vinfo, false);
2545 if (!ok)
2546 return ok;
2547
2548 /* Check the costings of the loop make vectorizing worthwhile. */
2549 res = vect_analyze_loop_costing (loop_vinfo);
2550 if (res < 0)
2551 {
2552 ok = opt_result::failure_at (vect_location,
2553 "Loop costings may not be worthwhile.\n");
2554 goto again;
2555 }
2556 if (!res)
2557 return opt_result::failure_at (vect_location,
2558 "Loop costings not worthwhile.\n");
2559
2560 /* If an epilogue loop is required make sure we can create one. */
2561 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2562 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2563 {
2564 if (dump_enabled_p ())
2565 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2566 if (!vect_can_advance_ivs_p (loop_vinfo)
2567 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2568 single_exit (LOOP_VINFO_LOOP
2569 (loop_vinfo))))
2570 {
2571 ok = opt_result::failure_at (vect_location,
2572 "not vectorized: can't create required "
2573 "epilog loop\n");
2574 goto again;
2575 }
2576 }
2577
2578 /* During peeling, we need to check that the number of loop iterations is
2579 enough for both the peeled prolog loop and the vector loop. This check
2580 can be merged with the threshold check of loop versioning, so
2581 increase the threshold for this case if necessary.
2582
2583 If we are analyzing an epilogue we still want to check what its
2584 versioning threshold would be. If we decide to vectorize the epilogues we
2585 will want to use the lowest versioning threshold of all epilogues and main
2586 loop. This will enable us to enter a vectorized epilogue even when
2587 versioning the loop. We can't simply check whether the epilogue requires
2588 versioning though since we may have skipped some versioning checks when
2589 analyzing the epilogue. For instance, checks for alias versioning will be
2590 skipped when dealing with epilogues as we assume we already checked them
2591 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
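/* As a hypothetical illustration of the computation below: with a
   vectorization factor of 8, a known prolog peeling amount of 3 iterations,
   peeling for gaps, no partial vectors and no loop masks used for
   alignment, niters_th becomes 3 + 8 + 1 = 12 before being combined with
   the cost-model threshold TH.  */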
2592 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2593 {
2594 poly_uint64 niters_th = 0;
2595 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2596
2597 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2598 {
2599 /* Niters for peeled prolog loop. */
2600 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2601 {
2602 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2603 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2604 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2605 }
2606 else
2607 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2608 }
2609
2610 /* Niters for at least one iteration of vectorized loop. */
2611 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
2612 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2613 /* One additional iteration because of peeling for gap. */
2614 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2615 niters_th += 1;
2616
2617 /* Use the same condition as vect_transform_loop to decide when to use
2618 the cost to determine a versioning threshold. */
2619 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2620 && ordered_p (th, niters_th))
2621 niters_th = ordered_max (poly_uint64 (th), niters_th);
2622
2623 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2624 }
2625
2626 gcc_assert (known_eq (vectorization_factor,
2627 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2628
2629 /* Ok to vectorize! */
2630 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2631 return opt_result::success ();
2632
2633 again:
2634 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2635 gcc_assert (!ok);
2636
2637 /* Try again with SLP forced off, but if we didn't do any SLP there is
2638 no point in re-trying. */
2639 if (!slp)
2640 return ok;
2641
2642 /* If there are reduction chains re-trying will fail anyway. */
2643 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2644 return ok;
2645
2646 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2647 via interleaving or lane instructions. */
2648 slp_instance instance;
2649 slp_tree node;
2650 unsigned i, j;
2651 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2652 {
2653 stmt_vec_info vinfo;
2654 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2655 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2656 continue;
2657 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2658 unsigned int size = DR_GROUP_SIZE (vinfo);
2659 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2660 if (! vect_store_lanes_supported (vectype, size, false)
2661 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2662 && ! vect_grouped_store_supported (vectype, size))
2663 return opt_result::failure_at (vinfo->stmt,
2664 "unsupported grouped store\n");
2665 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2666 {
2667 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2668 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2669 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2670 size = DR_GROUP_SIZE (vinfo);
2671 vectype = STMT_VINFO_VECTYPE (vinfo);
2672 if (! vect_load_lanes_supported (vectype, size, false)
2673 && ! vect_grouped_load_supported (vectype, single_element_p,
2674 size))
2675 return opt_result::failure_at (vinfo->stmt,
2676 "unsupported grouped load\n");
2677 }
2678 }
2679
2680 if (dump_enabled_p ())
2681 dump_printf_loc (MSG_NOTE, vect_location,
2682 "re-trying with SLP disabled\n");
2683
2684 /* Roll back state appropriately. No SLP this time. */
2685 slp = false;
2686 /* Restore vectorization factor as it were without SLP. */
2687 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2688 /* Free the SLP instances. */
2689 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2690 vect_free_slp_instance (instance);
2691 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2692 /* Reset SLP type to loop_vect on all stmts. */
2693 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2694 {
2695 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2696 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2697 !gsi_end_p (si); gsi_next (&si))
2698 {
2699 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2700 STMT_SLP_TYPE (stmt_info) = loop_vect;
2701 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2702 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2703 {
2704 /* vectorizable_reduction adjusts reduction stmt def-types,
2705 restore them to that of the PHI. */
2706 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2707 = STMT_VINFO_DEF_TYPE (stmt_info);
2708 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2709 (STMT_VINFO_REDUC_DEF (stmt_info)))
2710 = STMT_VINFO_DEF_TYPE (stmt_info);
2711 }
2712 }
2713 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2714 !gsi_end_p (si); gsi_next (&si))
2715 {
2716 if (is_gimple_debug (gsi_stmt (si)))
2717 continue;
2718 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2719 STMT_SLP_TYPE (stmt_info) = loop_vect;
2720 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2721 {
2722 stmt_vec_info pattern_stmt_info
2723 = STMT_VINFO_RELATED_STMT (stmt_info);
2724 if (STMT_VINFO_SLP_VECT_ONLY_PATTERN (pattern_stmt_info))
2725 STMT_VINFO_IN_PATTERN_P (stmt_info) = false;
2726
2727 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2728 STMT_SLP_TYPE (pattern_stmt_info) = loop_vect;
2729 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2730 !gsi_end_p (pi); gsi_next (&pi))
2731 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2732 = loop_vect;
2733 }
2734 }
2735 }
2736 /* Free optimized alias test DDRS. */
2737 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2738 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2739 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2740 /* Reset target cost data. */
2741 delete loop_vinfo->vector_costs;
2742 loop_vinfo->vector_costs = nullptr;
2743 /* Reset accumulated rgroup information. */
2744 release_vec_loop_controls (&LOOP_VINFO_MASKS (loop_vinfo));
2745 release_vec_loop_controls (&LOOP_VINFO_LENS (loop_vinfo));
2746 /* Reset assorted flags. */
2747 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2748 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2749 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2750 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2751 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo)
2752 = saved_can_use_partial_vectors_p;
2753
2754 goto start_over;
2755 }
2756
2757 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2758 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2759 OLD_LOOP_VINFO is better unless something specifically indicates
2760 otherwise.
2761
2762 Note that this deliberately isn't a partial order. */
2763
2764 static bool
2765 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2766 loop_vec_info old_loop_vinfo)
2767 {
2768 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2769 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2770
2771 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2772 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2773
2774 /* Always prefer a VF of loop->simdlen over any other VF. */
2775 if (loop->simdlen)
2776 {
2777 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2778 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2779 if (new_simdlen_p != old_simdlen_p)
2780 return new_simdlen_p;
2781 }
2782
2783 const auto *old_costs = old_loop_vinfo->vector_costs;
2784 const auto *new_costs = new_loop_vinfo->vector_costs;
2785 if (loop_vec_info main_loop = LOOP_VINFO_ORIG_LOOP_INFO (old_loop_vinfo))
2786 return new_costs->better_epilogue_loop_than_p (old_costs, main_loop);
2787
2788 return new_costs->better_main_loop_than_p (old_costs);
2789 }
2790
2791 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2792 true if we should. */
2793
2794 static bool
2795 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2796 loop_vec_info old_loop_vinfo)
2797 {
2798 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2799 return false;
2800
2801 if (dump_enabled_p ())
2802 dump_printf_loc (MSG_NOTE, vect_location,
2803 "***** Preferring vector mode %s to vector mode %s\n",
2804 GET_MODE_NAME (new_loop_vinfo->vector_mode),
2805 GET_MODE_NAME (old_loop_vinfo->vector_mode));
2806 return true;
2807 }
2808
2809 /* Analyze LOOP with VECTOR_MODES[MODE_I] and as epilogue if MAIN_LOOP_VINFO is
2810 not NULL. If VECTOR_MODES[MODE_I] is VOIDmode, set AUTODETECTED_VECTOR_MODE
2811 to the mode chosen by the analysis, and advance MODE_I to the next mode useful to analyze.
2812 Return the loop_vinfo on success and wrapped null on failure. */
2813
2814 static opt_loop_vec_info
2815 vect_analyze_loop_1 (class loop *loop, vec_info_shared *shared,
2816 const vect_loop_form_info *loop_form_info,
2817 loop_vec_info main_loop_vinfo,
2818 const vector_modes &vector_modes, unsigned &mode_i,
2819 machine_mode &autodetected_vector_mode,
2820 bool &fatal)
2821 {
2822 loop_vec_info loop_vinfo
2823 = vect_create_loop_vinfo (loop, shared, loop_form_info, main_loop_vinfo);
2824
2825 machine_mode vector_mode = vector_modes[mode_i];
2826 loop_vinfo->vector_mode = vector_mode;
2827
2828 /* Run the main analysis. */
2829 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal);
2830 if (dump_enabled_p ())
2831 dump_printf_loc (MSG_NOTE, vect_location,
2832 "***** Analysis %s with vector mode %s\n",
2833 res ? "succeeded" : " failed",
2834 GET_MODE_NAME (loop_vinfo->vector_mode));
2835
2836 /* Remember the autodetected vector mode. */
2837 if (vector_mode == VOIDmode)
2838 autodetected_vector_mode = loop_vinfo->vector_mode;
2839
2840 /* Advance mode_i, first skipping modes that would result in the
2841 same analysis result. */
2842 while (mode_i + 1 < vector_modes.length ()
2843 && vect_chooses_same_modes_p (loop_vinfo,
2844 vector_modes[mode_i + 1]))
2845 {
2846 if (dump_enabled_p ())
2847 dump_printf_loc (MSG_NOTE, vect_location,
2848 "***** The result for vector mode %s would"
2849 " be the same\n",
2850 GET_MODE_NAME (vector_modes[mode_i + 1]));
2851 mode_i += 1;
2852 }
2853 if (mode_i + 1 < vector_modes.length ()
2854 && VECTOR_MODE_P (autodetected_vector_mode)
2855 && (related_vector_mode (vector_modes[mode_i + 1],
2856 GET_MODE_INNER (autodetected_vector_mode))
2857 == autodetected_vector_mode)
2858 && (related_vector_mode (autodetected_vector_mode,
2859 GET_MODE_INNER (vector_modes[mode_i + 1]))
2860 == vector_modes[mode_i + 1]))
2861 {
2862 if (dump_enabled_p ())
2863 dump_printf_loc (MSG_NOTE, vect_location,
2864 "***** Skipping vector mode %s, which would"
2865 " repeat the analysis for %s\n",
2866 GET_MODE_NAME (vector_modes[mode_i + 1]),
2867 GET_MODE_NAME (autodetected_vector_mode));
2868 mode_i += 1;
2869 }
2870 mode_i++;
2871
2872 if (!res)
2873 {
2874 delete loop_vinfo;
2875 if (fatal)
2876 gcc_checking_assert (main_loop_vinfo == NULL);
2877 return opt_loop_vec_info::propagate_failure (res);
2878 }
2879
2880 return opt_loop_vec_info::success (loop_vinfo);
2881 }
2882
2883 /* Function vect_analyze_loop.
2884
2885 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2886 for it. The different analyses will record information in the
2887 loop_vec_info struct. */
2888 opt_loop_vec_info
2889 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2890 {
2891 DUMP_VECT_SCOPE ("analyze_loop_nest");
2892
2893 if (loop_outer (loop)
2894 && loop_vec_info_for_loop (loop_outer (loop))
2895 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2896 return opt_loop_vec_info::failure_at (vect_location,
2897 "outer-loop already vectorized.\n");
2898
2899 if (!find_loop_nest (loop, &shared->loop_nest))
2900 return opt_loop_vec_info::failure_at
2901 (vect_location,
2902 "not vectorized: loop nest containing two or more consecutive inner"
2903 " loops cannot be vectorized\n");
2904
2905 /* Analyze the loop form. */
2906 vect_loop_form_info loop_form_info;
2907 opt_result res = vect_analyze_loop_form (loop, &loop_form_info);
2908 if (!res)
2909 {
2910 if (dump_enabled_p ())
2911 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2912 "bad loop form.\n");
2913 return opt_loop_vec_info::propagate_failure (res);
2914 }
2915 if (!integer_onep (loop_form_info.assumptions))
2916 {
2917 /* We consider to vectorize this loop by versioning it under
2918 some assumptions. In order to do this, we need to clear
2919 existing information computed by scev and niter analyzer. */
2920 scev_reset_htab ();
2921 free_numbers_of_iterations_estimates (loop);
2922 /* Also set flag for this loop so that following scev and niter
2923 analysis are done under the assumptions. */
2924 loop_constraint_set (loop, LOOP_C_FINITE);
2925 }
2926
2927 auto_vector_modes vector_modes;
2928 /* Autodetect first vector size we try. */
2929 vector_modes.safe_push (VOIDmode);
2930 unsigned int autovec_flags
2931 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2932 loop->simdlen != 0);
2933 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2934 && !unlimited_cost_model (loop));
2935 machine_mode autodetected_vector_mode = VOIDmode;
2936 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2937 unsigned int mode_i = 0;
2938 unsigned int first_loop_i = 0;
2939 unsigned int first_loop_next_i = 0;
2940 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2941
2942 /* First determine the main loop vectorization mode, either the first
2943 one that works, starting with auto-detecting the vector mode and then
2944 following the targets order of preference, or the one with the
2945 lowest cost if pick_lowest_cost_p. */
2946 while (1)
2947 {
2948 unsigned int loop_vinfo_i = mode_i;
2949 bool fatal;
2950 opt_loop_vec_info loop_vinfo
2951 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
2952 NULL, vector_modes, mode_i,
2953 autodetected_vector_mode, fatal);
2954 if (fatal)
2955 break;
2956
2957 if (loop_vinfo)
2958 {
2959 /* Once we hit the desired simdlen for the first time,
2960 discard any previous attempts. */
2961 if (simdlen
2962 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
2963 {
2964 delete first_loop_vinfo;
2965 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2966 simdlen = 0;
2967 }
2968 else if (pick_lowest_cost_p
2969 && first_loop_vinfo
2970 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
2971 {
2972 /* Pick loop_vinfo over first_loop_vinfo. */
2973 delete first_loop_vinfo;
2974 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2975 }
2976 if (first_loop_vinfo == NULL)
2977 {
2978 first_loop_vinfo = loop_vinfo;
2979 first_loop_i = loop_vinfo_i;
2980 first_loop_next_i = mode_i;
2981 }
2982 else
2983 {
2984 delete loop_vinfo;
2985 loop_vinfo = opt_loop_vec_info::success (NULL);
2986 }
2987
2988 /* Commit to first_loop_vinfo if we have no reason to try
2989 alternatives. */
2990 if (!simdlen && !pick_lowest_cost_p)
2991 break;
2992 }
2993 if (mode_i == vector_modes.length ()
2994 || autodetected_vector_mode == VOIDmode)
2995 break;
2996
2997 /* Try the next biggest vector size. */
2998 if (dump_enabled_p ())
2999 dump_printf_loc (MSG_NOTE, vect_location,
3000 "***** Re-trying analysis with vector mode %s\n",
3001 GET_MODE_NAME (vector_modes[mode_i]));
3002 }
3003 if (!first_loop_vinfo)
3004 return opt_loop_vec_info::propagate_failure (res);
3005
3006 if (dump_enabled_p ())
3007 dump_printf_loc (MSG_NOTE, vect_location,
3008 "***** Choosing vector mode %s\n",
3009 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3010
3011 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
3012 enabled, SIMDUID is not set, it is the innermost loop and we have
3013 either already found the loop's SIMDLEN or there was no SIMDLEN to
3014 begin with.
3015 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
3016 bool vect_epilogues = (!simdlen
3017 && loop->inner == NULL
3018 && param_vect_epilogues_nomask
3019 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
3020 && !loop->simduid);
3021 if (!vect_epilogues)
3022 return first_loop_vinfo;
3023
3024 /* Now analyze first_loop_vinfo for epilogue vectorization. */
3025 poly_uint64 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
3026
3027 /* Handle the case where the original loop can use partial
3028 vectorization, but we only want to adopt it for the epilogue.
3029 The retry should use the same mode as the original. */
3030 if (LOOP_VINFO_EPIL_USING_PARTIAL_VECTORS_P (first_loop_vinfo))
3031 {
3032 gcc_assert (LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (first_loop_vinfo)
3033 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (first_loop_vinfo));
3034 if (dump_enabled_p ())
3035 dump_printf_loc (MSG_NOTE, vect_location,
3036 "***** Re-trying analysis with same vector mode"
3037 " %s for epilogue with partial vectors.\n",
3038 GET_MODE_NAME (first_loop_vinfo->vector_mode));
3039 mode_i = first_loop_i;
3040 }
3041 else
3042 {
3043 mode_i = first_loop_next_i;
3044 if (mode_i == vector_modes.length ())
3045 return first_loop_vinfo;
3046 }
3047
3048 /* ??? If first_loop_vinfo was using VOIDmode then we probably
3049 want to instead search for the corresponding mode in vector_modes[]. */
3050
3051 while (1)
3052 {
3053 bool fatal;
3054 opt_loop_vec_info loop_vinfo
3055 = vect_analyze_loop_1 (loop, shared, &loop_form_info,
3056 first_loop_vinfo,
3057 vector_modes, mode_i,
3058 autodetected_vector_mode, fatal);
3059 if (fatal)
3060 break;
3061
3062 if (loop_vinfo)
3063 {
3064 if (pick_lowest_cost_p)
3065 {
3066 /* Keep trying to roll back vectorization attempts while the
3067 loop_vec_infos they produced were worse than this one. */
3068 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
3069 while (!vinfos.is_empty ()
3070 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
3071 {
3072 gcc_assert (vect_epilogues);
3073 delete vinfos.pop ();
3074 }
3075 }
3076 /* For now only allow one epilogue loop. */
3077 if (first_loop_vinfo->epilogue_vinfos.is_empty ())
3078 {
3079 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
3080 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
3081 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
3082 || maybe_ne (lowest_th, 0U));
3083 /* Keep track of the known smallest versioning
3084 threshold. */
3085 if (ordered_p (lowest_th, th))
3086 lowest_th = ordered_min (lowest_th, th);
3087 }
3088 else
3089 {
3090 delete loop_vinfo;
3091 loop_vinfo = opt_loop_vec_info::success (NULL);
3092 }
3093
3094 /* For now only allow one epilogue loop, but allow
3095 pick_lowest_cost_p to replace it, so commit to the
3096 first epilogue if we have no reason to try alternatives. */
3097 if (!pick_lowest_cost_p)
3098 break;
3099 }
3100
3101 if (mode_i == vector_modes.length ())
3102 break;
3103
3104 /* Try the next biggest vector size. */
3105 if (dump_enabled_p ())
3106 dump_printf_loc (MSG_NOTE, vect_location,
3107 "***** Re-trying epilogue analysis with vector "
3108 "mode %s\n", GET_MODE_NAME (vector_modes[mode_i]));
3109 }
3110
3111 if (!first_loop_vinfo->epilogue_vinfos.is_empty ())
3112 {
3113 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
3114 if (dump_enabled_p ())
3115 dump_printf_loc (MSG_NOTE, vect_location,
3116 "***** Choosing epilogue vector mode %s\n",
3117 GET_MODE_NAME
3118 (first_loop_vinfo->epilogue_vinfos[0]->vector_mode));
3119 }
3120
3121 return first_loop_vinfo;
3122 }
3123
3124 /* Return true if there is an in-order reduction function for CODE, storing
3125 it in *REDUC_FN if so. */
3126
3127 static bool
3128 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
3129 {
3130 switch (code)
3131 {
3132 case PLUS_EXPR:
3133 *reduc_fn = IFN_FOLD_LEFT_PLUS;
3134 return true;
3135
3136 default:
3137 return false;
3138 }
3139 }
3140
3141 /* Function reduction_fn_for_scalar_code
3142
3143 Input:
3144 CODE - tree_code of a reduction operation.
3145
3146 Output:
3147 REDUC_FN - the corresponding internal function to be used to reduce the
3148 vector of partial results into a single scalar result, or IFN_LAST
3149 if the operation is a supported reduction operation, but does not have
3150 such an internal function.
3151
3152 Return FALSE if CODE currently cannot be vectorized as a reduction. */
3153
3154 bool
3155 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
3156 {
3157 switch (code)
3158 {
3159 case MAX_EXPR:
3160 *reduc_fn = IFN_REDUC_MAX;
3161 return true;
3162
3163 case MIN_EXPR:
3164 *reduc_fn = IFN_REDUC_MIN;
3165 return true;
3166
3167 case PLUS_EXPR:
3168 *reduc_fn = IFN_REDUC_PLUS;
3169 return true;
3170
3171 case BIT_AND_EXPR:
3172 *reduc_fn = IFN_REDUC_AND;
3173 return true;
3174
3175 case BIT_IOR_EXPR:
3176 *reduc_fn = IFN_REDUC_IOR;
3177 return true;
3178
3179 case BIT_XOR_EXPR:
3180 *reduc_fn = IFN_REDUC_XOR;
3181 return true;
3182
3183 case MULT_EXPR:
3184 case MINUS_EXPR:
3185 *reduc_fn = IFN_LAST;
3186 return true;
3187
3188 default:
3189 return false;
3190 }
3191 }
3192
3193 /* If there is a neutral value X such that a reduction would not be affected
3194 by the introduction of additional X elements, return that X, otherwise
3195 return null. CODE is the code of the reduction and SCALAR_TYPE is type
3196 of the scalar elements. If the reduction has just a single initial value
3197 then INITIAL_VALUE is that value, otherwise it is null. */
3198
3199 tree
3200 neutral_op_for_reduction (tree scalar_type, tree_code code, tree initial_value)
3201 {
3202 switch (code)
3203 {
3204 case WIDEN_SUM_EXPR:
3205 case DOT_PROD_EXPR:
3206 case SAD_EXPR:
3207 case PLUS_EXPR:
3208 case MINUS_EXPR:
3209 case BIT_IOR_EXPR:
3210 case BIT_XOR_EXPR:
3211 return build_zero_cst (scalar_type);
3212
3213 case MULT_EXPR:
3214 return build_one_cst (scalar_type);
3215
3216 case BIT_AND_EXPR:
3217 return build_all_ones_cst (scalar_type);
3218
3219 case MAX_EXPR:
3220 case MIN_EXPR:
3221 return initial_value;
3222
3223 default:
3224 return NULL_TREE;
3225 }
3226 }
3227
3228 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
3229 STMT is printed with a message MSG. */
3230
3231 static void
3232 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
3233 {
3234 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
3235 }
3236
3237 /* Return true if we need an in-order reduction for operation CODE
3238 on type TYPE. */
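/* Illustrative examples: a float summation compiled without
   -fassociative-math must be reduced in order (fold-left) to preserve the
   rounding behaviour, whereas float MIN_EXPR/MAX_EXPR reductions and
   wrapping unsigned integer additions do not need an in-order reduction.  */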
3240
3241 bool
3242 needs_fold_left_reduction_p (tree type, tree_code code)
3243 {
3244 /* CHECKME: check for !flag_finite_math_only too? */
3245 if (SCALAR_FLOAT_TYPE_P (type))
3246 switch (code)
3247 {
3248 case MIN_EXPR:
3249 case MAX_EXPR:
3250 return false;
3251
3252 default:
3253 return !flag_associative_math;
3254 }
3255
3256 if (INTEGRAL_TYPE_P (type))
3257 {
3258 if (!operation_no_trapping_overflow (type, code))
3259 return true;
3260 return false;
3261 }
3262
3263 if (SAT_FIXED_POINT_TYPE_P (type))
3264 return true;
3265
3266 return false;
3267 }
3268
3269 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3270 has a handled computation expression. Store the main reduction
3271 operation in *CODE. */
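/* An illustrative case: for a PHI whose latch value is defined by
   sum_3 = sum_1 + a_2, with sum_1 the PHI result, the detected path is
   sum_1 -> sum_3 and *CODE is set to PLUS_EXPR; a path that mixes different
   operation codes, or whose intermediate values are used in more than one
   statement, is rejected.  */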
3272
3273 static bool
3274 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3275 tree loop_arg, enum tree_code *code,
3276 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
3277 {
3278 auto_bitmap visited;
3279 tree lookfor = PHI_RESULT (phi);
3280 ssa_op_iter curri;
3281 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3282 while (USE_FROM_PTR (curr) != loop_arg)
3283 curr = op_iter_next_use (&curri);
3284 curri.i = curri.numops;
3285 do
3286 {
3287 path.safe_push (std::make_pair (curri, curr));
3288 tree use = USE_FROM_PTR (curr);
3289 if (use == lookfor)
3290 break;
3291 gimple *def = SSA_NAME_DEF_STMT (use);
3292 if (gimple_nop_p (def)
3293 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3294 {
3295 pop:
3296 do
3297 {
3298 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3299 curri = x.first;
3300 curr = x.second;
3301 do
3302 curr = op_iter_next_use (&curri);
3303 /* Skip already visited or non-SSA operands (from iterating
3304 over PHI args). */
3305 while (curr != NULL_USE_OPERAND_P
3306 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3307 || ! bitmap_set_bit (visited,
3308 SSA_NAME_VERSION
3309 (USE_FROM_PTR (curr)))));
3310 }
3311 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3312 if (curr == NULL_USE_OPERAND_P)
3313 break;
3314 }
3315 else
3316 {
3317 if (gimple_code (def) == GIMPLE_PHI)
3318 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3319 else
3320 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3321 while (curr != NULL_USE_OPERAND_P
3322 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3323 || ! bitmap_set_bit (visited,
3324 SSA_NAME_VERSION
3325 (USE_FROM_PTR (curr)))))
3326 curr = op_iter_next_use (&curri);
3327 if (curr == NULL_USE_OPERAND_P)
3328 goto pop;
3329 }
3330 }
3331 while (1);
3332 if (dump_file && (dump_flags & TDF_DETAILS))
3333 {
3334 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3335 unsigned i;
3336 std::pair<ssa_op_iter, use_operand_p> *x;
3337 FOR_EACH_VEC_ELT (path, i, x)
3338 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
3339 dump_printf (MSG_NOTE, "\n");
3340 }
3341
3342 /* Check whether the reduction path detected is valid. */
3343 bool fail = path.length () == 0;
3344 bool neg = false;
3345 int sign = -1;
3346 *code = ERROR_MARK;
3347 for (unsigned i = 1; i < path.length (); ++i)
3348 {
3349 gimple *use_stmt = USE_STMT (path[i].second);
3350 tree op = USE_FROM_PTR (path[i].second);
3351 if (! is_gimple_assign (use_stmt)
3352 /* The following makes sure we can compute the operand index
3353 easily; it also mostly disallows chaining via COND_EXPR
3354 condition operands. */
3355 || (gimple_assign_rhs1_ptr (use_stmt) != path[i].second->use
3356 && (gimple_num_ops (use_stmt) <= 2
3357 || gimple_assign_rhs2_ptr (use_stmt) != path[i].second->use)
3358 && (gimple_num_ops (use_stmt) <= 3
3359 || gimple_assign_rhs3_ptr (use_stmt) != path[i].second->use)))
3360 {
3361 fail = true;
3362 break;
3363 }
3364 tree_code use_code = gimple_assign_rhs_code (use_stmt);
3365 if (use_code == MINUS_EXPR)
3366 {
3367 use_code = PLUS_EXPR;
3368 /* Track whether we negate the reduction value each iteration. */
3369 if (gimple_assign_rhs2 (use_stmt) == op)
3370 neg = ! neg;
3371 }
3372 if (CONVERT_EXPR_CODE_P (use_code)
3373 && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)),
3374 TREE_TYPE (gimple_assign_rhs1 (use_stmt))))
3375 ;
3376 else if (*code == ERROR_MARK)
3377 {
3378 *code = use_code;
3379 sign = TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt)));
3380 }
3381 else if (use_code != *code)
3382 {
3383 fail = true;
3384 break;
3385 }
3386 else if ((use_code == MIN_EXPR
3387 || use_code == MAX_EXPR)
3388 && sign != TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt))))
3389 {
3390 fail = true;
3391 break;
3392 }
3393 /* Check there's only a single stmt the op is used on. For the
3394 non-value-changing tail and the last stmt, allow out-of-loop uses.
3395 ??? We could relax this and handle arbitrary live stmts by
3396 forcing a scalar epilogue for example. */
3397 imm_use_iterator imm_iter;
3398 gimple *op_use_stmt;
3399 unsigned cnt = 0;
3400 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
3401 if (!is_gimple_debug (op_use_stmt)
3402 && (*code != ERROR_MARK
3403 || flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt))))
3404 {
3405 /* We want to allow x + x but not x < 1 ? x : 2. */
3406 if (is_gimple_assign (op_use_stmt)
3407 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3408 {
3409 use_operand_p use_p;
3410 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3411 cnt++;
3412 }
3413 else
3414 cnt++;
3415 }
3416 if (cnt != 1)
3417 {
3418 fail = true;
3419 break;
3420 }
3421 }
3422 return ! fail && ! neg && *code != ERROR_MARK;
3423 }
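/* For illustration (hypothetical GIMPLE): a path such as
     sum_2 = sum_1 + a_5;  sum_3 = sum_2 + b_6;
   uses PLUS_EXPR throughout and is accepted, whereas
     sum_2 = sum_1 + a_5;  sum_3 = sum_2 * b_6;
   mixes PLUS_EXPR and MULT_EXPR and is rejected above via the
   use_code != *code check.  */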
3424
3425 bool
3426 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3427 tree loop_arg, enum tree_code code)
3428 {
3429 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3430 enum tree_code code_;
3431 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3432 && code_ == code);
3433 }
3434
3435
3436
3437 /* Function vect_is_simple_reduction
3438
3439 (1) Detect a cross-iteration def-use cycle that represents a simple
3440 reduction computation. We look for the following pattern:
3441
3442 loop_header:
3443 a1 = phi < a0, a2 >
3444 a3 = ...
3445 a2 = operation (a3, a1)
3446
3447 or
3448
3449 a3 = ...
3450 loop_header:
3451 a1 = phi < a0, a2 >
3452 a2 = operation (a3, a1)
3453
3454 such that:
3455 1. operation is commutative and associative and it is safe to
3456 change the order of the computation
3457 2. no uses for a2 in the loop (a2 is used out of the loop)
3458 3. no uses of a1 in the loop besides the reduction operation
3459 4. no uses of a1 outside the loop.
3460
3461 Conditions 1,4 are tested here.
3462 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3463
3464 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3465 nested cycles.
3466
3467 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3468 reductions:
3469
3470 a1 = phi < a0, a2 >
3471 inner loop (def of a3)
3472 a2 = phi < a3 >
3473
3474 (4) Detect condition expressions, i.e.:
3475 for (int i = 0; i < N; i++)
3476 if (a[i] < val)
3477 ret_val = a[i];
3478
3479 */
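/* As a concrete (hypothetical) instance of pattern (1): for the scalar loop

     for (i = 0; i < n; i++)
       sum = sum + a[i];

   the loop header contains  sum_1 = PHI <sum_0 (preheader), sum_2 (latch)>
   and the body contains  sum_2 = sum_1 + tmp_3,  where tmp_3 loads a[i];
   sum_2 feeds back into the PHI and is otherwise only used after the
   loop.  */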
3480
3481 static stmt_vec_info
3482 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3483 bool *double_reduc, bool *reduc_chain_p)
3484 {
3485 gphi *phi = as_a <gphi *> (phi_info->stmt);
3486 gimple *phi_use_stmt = NULL;
3487 imm_use_iterator imm_iter;
3488 use_operand_p use_p;
3489
3490 *double_reduc = false;
3491 *reduc_chain_p = false;
3492 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3493
3494 tree phi_name = PHI_RESULT (phi);
3495 /* ??? If there are no uses of the PHI result the inner loop reduction
3496 won't be detected as possibly double-reduction by vectorizable_reduction
3497 because that tries to walk the PHI arg from the preheader edge which
3498 can be constant. See PR60382. */
3499 if (has_zero_uses (phi_name))
3500 return NULL;
3501 class loop *loop = (gimple_bb (phi))->loop_father;
3502 unsigned nphi_def_loop_uses = 0;
3503 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3504 {
3505 gimple *use_stmt = USE_STMT (use_p);
3506 if (is_gimple_debug (use_stmt))
3507 continue;
3508
3509 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3510 {
3511 if (dump_enabled_p ())
3512 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3513 "intermediate value used outside loop.\n");
3514
3515 return NULL;
3516 }
3517
3518 nphi_def_loop_uses++;
3519 phi_use_stmt = use_stmt;
3520 }
3521
3522 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3523 if (TREE_CODE (latch_def) != SSA_NAME)
3524 {
3525 if (dump_enabled_p ())
3526 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3527 "reduction: not ssa_name: %T\n", latch_def);
3528 return NULL;
3529 }
3530
3531 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3532 if (!def_stmt_info
3533 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3534 return NULL;
3535
3536 bool nested_in_vect_loop
3537 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3538 unsigned nlatch_def_loop_uses = 0;
3539 auto_vec<gphi *, 3> lcphis;
3540 bool inner_loop_of_double_reduc = false;
3541 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3542 {
3543 gimple *use_stmt = USE_STMT (use_p);
3544 if (is_gimple_debug (use_stmt))
3545 continue;
3546 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3547 nlatch_def_loop_uses++;
3548 else
3549 {
3550 /* We can have more than one loop-closed PHI. */
3551 lcphis.safe_push (as_a <gphi *> (use_stmt));
3552 if (nested_in_vect_loop
3553 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3554 == vect_double_reduction_def))
3555 inner_loop_of_double_reduc = true;
3556 }
3557 }
3558
3559 /* If we are vectorizing an inner reduction, we execute it in the
3560 original order only when we are not dealing with a
3561 double reduction. */
3562 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3563 {
3564 if (dump_enabled_p ())
3565 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3566 "detected nested cycle: ");
3567 return def_stmt_info;
3568 }
3569
3570 /* When the inner loop of a double reduction ends up with more than
3571 one loop-closed PHI we have failed to classify alternate such
3572 PHIs as double reduction, leading to wrong code. See PR103237. */
3573 if (inner_loop_of_double_reduc && lcphis.length () != 1)
3574 {
3575 if (dump_enabled_p ())
3576 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3577 "unhandle double reduction\n");
3578 return NULL;
3579 }
3580
3581 /* If this isn't a nested cycle or if the nested cycle reduction value
3582 is used outside of the inner loop we cannot handle uses of the reduction
3583 value. */
3584 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3585 {
3586 if (dump_enabled_p ())
3587 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3588 "reduction used in loop.\n");
3589 return NULL;
3590 }
3591
3592 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3593 defined in the inner loop. */
3594 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3595 {
3596 tree op1 = PHI_ARG_DEF (def_stmt, 0);
3597 if (gimple_phi_num_args (def_stmt) != 1
3598 || TREE_CODE (op1) != SSA_NAME)
3599 {
3600 if (dump_enabled_p ())
3601 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3602 "unsupported phi node definition.\n");
3603
3604 return NULL;
3605 }
3606
3607 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3608 if (gimple_bb (def1)
3609 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3610 && loop->inner
3611 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3612 && is_gimple_assign (def1)
3613 && is_a <gphi *> (phi_use_stmt)
3614 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3615 {
3616 if (dump_enabled_p ())
3617 report_vect_op (MSG_NOTE, def_stmt,
3618 "detected double reduction: ");
3619
3620 *double_reduc = true;
3621 return def_stmt_info;
3622 }
3623
3624 return NULL;
3625 }
3626
3627 /* Look for the expression computing latch_def from the loop PHI result. */
3628 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3629 enum tree_code code;
3630 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3631 path))
3632 {
3633 STMT_VINFO_REDUC_CODE (phi_info) = code;
3634 if (code == COND_EXPR && !nested_in_vect_loop)
3635 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3636
3637 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3638 reduction chain for which the additional restriction is that
3639 all operations in the chain are the same. */
3640 auto_vec<stmt_vec_info, 8> reduc_chain;
3641 unsigned i;
3642 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3643 for (i = path.length () - 1; i >= 1; --i)
3644 {
3645 gimple *stmt = USE_STMT (path[i].second);
3646 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3647 STMT_VINFO_REDUC_IDX (stmt_info)
3648 = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
3649 enum tree_code stmt_code = gimple_assign_rhs_code (stmt);
3650 bool leading_conversion = (CONVERT_EXPR_CODE_P (stmt_code)
3651 && (i == 1 || i == path.length () - 1));
3652 if ((stmt_code != code && !leading_conversion)
3653 /* We can only handle the final value in epilogue
3654 generation for reduction chains. */
3655 || (i != 1 && !has_single_use (gimple_assign_lhs (stmt))))
3656 is_slp_reduc = false;
3657 /* For reduction chains we support trailing/leading
3658 conversions. We do not store those in the actual chain. */
3659 if (leading_conversion)
3660 continue;
3661 reduc_chain.safe_push (stmt_info);
3662 }
3663 if (is_slp_reduc && reduc_chain.length () > 1)
3664 {
3665 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3666 {
3667 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3668 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3669 }
3670 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3671 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3672
3673 /* Save the chain for further analysis in SLP detection. */
3674 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3675 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3676
3677 *reduc_chain_p = true;
3678 if (dump_enabled_p ())
3679 dump_printf_loc (MSG_NOTE, vect_location,
3680 "reduction: detected reduction chain\n");
3681 }
3682 else if (dump_enabled_p ())
3683 dump_printf_loc (MSG_NOTE, vect_location,
3684 "reduction: detected reduction\n");
3685
3686 return def_stmt_info;
3687 }
3688
3689 if (dump_enabled_p ())
3690 dump_printf_loc (MSG_NOTE, vect_location,
3691 "reduction: unknown pattern\n");
3692
3693 return NULL;
3694 }
3695
3696 /* Estimate the number of peeled epilogue iterations for LOOP_VINFO.
3697 PEEL_ITERS_PROLOGUE is the number of peeled prologue iterations,
3698 or -1 if not known. */
3699
3700 static int
3701 vect_get_peel_iters_epilogue (loop_vec_info loop_vinfo, int peel_iters_prologue)
3702 {
3703 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3704 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) || peel_iters_prologue == -1)
3705 {
3706 if (dump_enabled_p ())
3707 dump_printf_loc (MSG_NOTE, vect_location,
3708 "cost model: epilogue peel iters set to vf/2 "
3709 "because loop iterations are unknown .\n");
3710 return assumed_vf / 2;
3711 }
3712 else
3713 {
3714 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3715 peel_iters_prologue = MIN (niters, peel_iters_prologue);
3716 int peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3717 /* If we need to peel for gaps, but no epilogue peeling is otherwise
3718 required, we have to peel VF iterations. */
3719 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !peel_iters_epilogue)
3720 peel_iters_epilogue = assumed_vf;
3721 return peel_iters_epilogue;
3722 }
3723 }
3724
3725 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3726 int
3727 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3728 int *peel_iters_epilogue,
3729 stmt_vector_for_cost *scalar_cost_vec,
3730 stmt_vector_for_cost *prologue_cost_vec,
3731 stmt_vector_for_cost *epilogue_cost_vec)
3732 {
3733 int retval = 0;
3734
3735 *peel_iters_epilogue
3736 = vect_get_peel_iters_epilogue (loop_vinfo, peel_iters_prologue);
3737
3738 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3739 {
3740 /* If peeled iterations are known but the number of scalar loop
3741 iterations is unknown, count a taken branch per peeled loop. */
3742 if (peel_iters_prologue > 0)
3743 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3744 NULL, NULL_TREE, 0, vect_prologue);
3745 if (*peel_iters_epilogue > 0)
3746 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3747 NULL, NULL_TREE, 0, vect_epilogue);
3748 }
3749
3750 stmt_info_for_cost *si;
3751 int j;
3752 if (peel_iters_prologue)
3753 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3754 retval += record_stmt_cost (prologue_cost_vec,
3755 si->count * peel_iters_prologue,
3756 si->kind, si->stmt_info, si->misalign,
3757 vect_prologue);
3758 if (*peel_iters_epilogue)
3759 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3760 retval += record_stmt_cost (epilogue_cost_vec,
3761 si->count * *peel_iters_epilogue,
3762 si->kind, si->stmt_info, si->misalign,
3763 vect_epilogue);
3764
3765 return retval;
3766 }
3767
3768 /* Function vect_estimate_min_profitable_iters
3769
3770 Return the number of iterations required for the vector version of the
3771 loop to be profitable relative to the cost of the scalar version of the
3772 loop.
3773
3774 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3775 of iterations for vectorization. A value of -1 means loop vectorization
3776 is not profitable. This returned value may be used for a dynamic
3777 profitability check.
3778
3779 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3780 for static check against estimated number of iterations. */
3781
3782 static void
3783 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3784 int *ret_min_profitable_niters,
3785 int *ret_min_profitable_estimate)
3786 {
3787 int min_profitable_iters;
3788 int min_profitable_estimate;
3789 int peel_iters_prologue;
3790 int peel_iters_epilogue;
3791 unsigned vec_inside_cost = 0;
3792 int vec_outside_cost = 0;
3793 unsigned vec_prologue_cost = 0;
3794 unsigned vec_epilogue_cost = 0;
3795 int scalar_single_iter_cost = 0;
3796 int scalar_outside_cost = 0;
3797 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3798 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3799 vector_costs *target_cost_data = loop_vinfo->vector_costs;
3800
3801 /* Cost model disabled. */
3802 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3803 {
3804 if (dump_enabled_p ())
3805 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3806 *ret_min_profitable_niters = 0;
3807 *ret_min_profitable_estimate = 0;
3808 return;
3809 }
3810
3811 /* Requires loop versioning tests to handle misalignment. */
3812 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3813 {
3814 /* FIXME: Make cost depend on complexity of individual check. */
3815 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3816 (void) add_stmt_cost (target_cost_data, len, vector_stmt,
3817 NULL, NULL_TREE, 0, vect_prologue);
3818 if (dump_enabled_p ())
3819 dump_printf (MSG_NOTE,
3820 "cost model: Adding cost of checks for loop "
3821 "versioning to treat misalignment.\n");
3822 }
3823
3824 /* Requires loop versioning with alias checks. */
3825 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3826 {
3827 /* FIXME: Make cost depend on complexity of individual check. */
3828 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3829 (void) add_stmt_cost (target_cost_data, len, vector_stmt,
3830 NULL, NULL_TREE, 0, vect_prologue);
3831 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3832 if (len)
3833 /* Count LEN - 1 ANDs and LEN comparisons. */
3834 (void) add_stmt_cost (target_cost_data, len * 2 - 1,
3835 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3836 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3837 if (len)
3838 {
3839 /* Count LEN - 1 ANDs and LEN comparisons. */
3840 unsigned int nstmts = len * 2 - 1;
3841 /* +1 for each bias that needs adding. */
3842 for (unsigned int i = 0; i < len; ++i)
3843 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3844 nstmts += 1;
3845 (void) add_stmt_cost (target_cost_data, nstmts,
3846 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3847 }
3848 if (dump_enabled_p ())
3849 dump_printf (MSG_NOTE,
3850 "cost model: Adding cost of checks for loop "
3851 "versioning aliasing.\n");
3852 }
3853
3854 /* Requires loop versioning with niter checks. */
3855 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3856 {
3857 /* FIXME: Make cost depend on complexity of individual check. */
3858 (void) add_stmt_cost (target_cost_data, 1, vector_stmt,
3859 NULL, NULL_TREE, 0, vect_prologue);
3860 if (dump_enabled_p ())
3861 dump_printf (MSG_NOTE,
3862 "cost model: Adding cost of checks for loop "
3863 "versioning niters.\n");
3864 }
3865
3866 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3867 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3868 NULL, NULL_TREE, 0, vect_prologue);
3869
3870 /* Count statements in scalar loop. Using this as scalar cost for a single
3871 iteration for now.
3872
3873 TODO: Add outer loop support.
3874
3875 TODO: Consider assigning different costs to different scalar
3876 statements. */
3877
3878 scalar_single_iter_cost = loop_vinfo->scalar_costs->total_cost ();
3879
3880 /* Add additional cost for the peeled instructions in prologue and epilogue
3881 loop. (For fully-masked loops there will be no peeling.)
3882
3883 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3884 at compile time, we assume it's vf/2 (the worst would be vf-1).
3885
3886 TODO: Build an expression that represents peel_iters for prologue and
3887 epilogue to be used in a run-time test. */
3888
3889 bool prologue_need_br_taken_cost = false;
3890 bool prologue_need_br_not_taken_cost = false;
3891
3892 /* Calculate peel_iters_prologue. */
3893 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
3894 peel_iters_prologue = 0;
3895 else if (npeel < 0)
3896 {
3897 peel_iters_prologue = assumed_vf / 2;
3898 if (dump_enabled_p ())
3899 dump_printf (MSG_NOTE, "cost model: "
3900 "prologue peel iters set to vf/2.\n");
3901
3902 /* If peeled iterations are unknown, count a taken branch and a not taken
3903 branch per peeled loop. Even if scalar loop iterations are known,
3904 vector iterations are not known since peeled prologue iterations are
3905 not known. Hence guards remain the same. */
3906 prologue_need_br_taken_cost = true;
3907 prologue_need_br_not_taken_cost = true;
3908 }
3909 else
3910 {
3911 peel_iters_prologue = npeel;
3912 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_prologue > 0)
3913 /* If peeled iterations are known but the number of scalar loop
3914 iterations is unknown, count a taken branch per peeled loop. */
3915 prologue_need_br_taken_cost = true;
3916 }
3917
3918 bool epilogue_need_br_taken_cost = false;
3919 bool epilogue_need_br_not_taken_cost = false;
3920
3921 /* Calculate peel_iters_epilogue. */
3922 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
3923 /* We need to peel exactly one iteration for gaps. */
3924 peel_iters_epilogue = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
3925 else if (npeel < 0)
3926 {
3927 /* If peeling for alignment is unknown, loop bound of main loop
3928 becomes unknown. */
3929 peel_iters_epilogue = assumed_vf / 2;
3930 if (dump_enabled_p ())
3931 dump_printf (MSG_NOTE, "cost model: "
3932 "epilogue peel iters set to vf/2 because "
3933 "peeling for alignment is unknown.\n");
3934
3935 /* See the same reason above in peel_iters_prologue calculation. */
3936 epilogue_need_br_taken_cost = true;
3937 epilogue_need_br_not_taken_cost = true;
3938 }
3939 else
3940 {
3941 peel_iters_epilogue = vect_get_peel_iters_epilogue (loop_vinfo, npeel);
3942 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && peel_iters_epilogue > 0)
3943 /* If peeled iterations are known but the number of scalar loop
3944 iterations is unknown, count a taken branch per peeled loop. */
3945 epilogue_need_br_taken_cost = true;
3946 }
3947
3948 stmt_info_for_cost *si;
3949 int j;
3950 /* Add costs associated with peel_iters_prologue. */
3951 if (peel_iters_prologue)
3952 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3953 {
3954 (void) add_stmt_cost (target_cost_data,
3955 si->count * peel_iters_prologue, si->kind,
3956 si->stmt_info, si->vectype, si->misalign,
3957 vect_prologue);
3958 }
3959
3960 /* Add costs associated with peel_iters_epilogue. */
3961 if (peel_iters_epilogue)
3962 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3963 {
3964 (void) add_stmt_cost (target_cost_data,
3965 si->count * peel_iters_epilogue, si->kind,
3966 si->stmt_info, si->vectype, si->misalign,
3967 vect_epilogue);
3968 }
3969
3970 /* Add possible cond_branch_taken/cond_branch_not_taken cost. */
3971
3972 if (prologue_need_br_taken_cost)
3973 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3974 NULL, NULL_TREE, 0, vect_prologue);
3975
3976 if (prologue_need_br_not_taken_cost)
3977 (void) add_stmt_cost (target_cost_data, 1,
3978 cond_branch_not_taken, NULL, NULL_TREE, 0,
3979 vect_prologue);
3980
3981 if (epilogue_need_br_taken_cost)
3982 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3983 NULL, NULL_TREE, 0, vect_epilogue);
3984
3985 if (epilogue_need_br_not_taken_cost)
3986 (void) add_stmt_cost (target_cost_data, 1,
3987 cond_branch_not_taken, NULL, NULL_TREE, 0,
3988 vect_epilogue);
3989
3990 /* Take care of special costs for rgroup controls of partial vectors. */
3991 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3992 {
3993 /* Calculate how many masks we need to generate. */
3994 unsigned int num_masks = 0;
3995 rgroup_controls *rgm;
3996 unsigned int num_vectors_m1;
3997 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
3998 if (rgm->type)
3999 num_masks += num_vectors_m1 + 1;
4000 gcc_assert (num_masks > 0);
4001
4002 /* In the worst case, we need to generate each mask in the prologue
4003 and in the loop body. One of the loop body mask instructions
4004 replaces the comparison in the scalar loop, and since we don't
4005 count the scalar comparison against the scalar body, we shouldn't
4006 count that vector instruction against the vector body either.
4007
4008 Sometimes we can use unpacks instead of generating prologue
4009 masks and sometimes the prologue mask will fold to a constant,
4010 so the actual prologue cost might be smaller. However, it's
4011 simpler and safer to use the worst-case cost; if this ends up
4012 being the tie-breaker between vectorizing or not, then it's
4013 probably better not to vectorize. */
4014 (void) add_stmt_cost (target_cost_data, num_masks,
4015 vector_stmt, NULL, NULL_TREE, 0, vect_prologue);
4016 (void) add_stmt_cost (target_cost_data, num_masks - 1,
4017 vector_stmt, NULL, NULL_TREE, 0, vect_body);
4018 }
4019 else if (LOOP_VINFO_FULLY_WITH_LENGTH_P (loop_vinfo))
4020 {
4021 /* Referring to the functions vect_set_loop_condition_partial_vectors
4022 and vect_set_loop_controls_directly, we need to generate each
4023 length in the prologue and in the loop body if required. Although
4024 there are some possible optimizations, we consider the worst case
4025 here. */
4026
4027 bool niters_known_p = LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo);
4028 bool need_iterate_p
4029 = (!LOOP_VINFO_EPILOGUE_P (loop_vinfo)
4030 && !vect_known_niters_smaller_than_vf (loop_vinfo));
4031
4032 /* Calculate how many statements to be added. */
4033 unsigned int prologue_stmts = 0;
4034 unsigned int body_stmts = 0;
4035
4036 rgroup_controls *rgc;
4037 unsigned int num_vectors_m1;
4038 FOR_EACH_VEC_ELT (LOOP_VINFO_LENS (loop_vinfo), num_vectors_m1, rgc)
4039 if (rgc->type)
4040 {
4041 /* May need one SHIFT for nitems_total computation. */
4042 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
4043 if (nitems != 1 && !niters_known_p)
4044 prologue_stmts += 1;
4045
4046 /* May need one MAX and one MINUS for wrap around. */
4047 if (vect_rgroup_iv_might_wrap_p (loop_vinfo, rgc))
4048 prologue_stmts += 2;
4049
4050 /* Need one MAX and one MINUS for each batch limit except for
4051 the 1st one. */
4052 prologue_stmts += num_vectors_m1 * 2;
4053
4054 unsigned int num_vectors = num_vectors_m1 + 1;
4055
4056 /* Need to set up lengths in prologue, only one MIN required
4057 for each since start index is zero. */
4058 prologue_stmts += num_vectors;
4059
4060 /* Each may need two MINs and one MINUS to update lengths in body
4061 for next iteration. */
4062 if (need_iterate_p)
4063 body_stmts += 3 * num_vectors;
4064 }
4065
4066 (void) add_stmt_cost (target_cost_data, prologue_stmts,
4067 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
4068 (void) add_stmt_cost (target_cost_data, body_stmts,
4069 scalar_stmt, NULL, NULL_TREE, 0, vect_body);
4070 }
4071
4072 /* FORNOW: The scalar outside cost is incremented in one of the
4073 following ways:
4074
4075 1. The vectorizer checks for alignment and aliasing and generates
4076 a condition that allows dynamic vectorization. A cost model
4077 check is ANDED with the versioning condition. Hence the scalar code
4078 path now has the added cost of the versioning check.
4079
4080 if (cost > th & versioning_check)
4081 jmp to vector code
4082
4083 Hence the run-time scalar cost is incremented by a not-taken branch cost.
4084
4085 2. The vectorizer then checks if a prologue is required. If the
4086 cost model check was not done before during versioning, it has to
4087 be done before the prologue check.
4088
4089 if (cost <= th)
4090 prologue = scalar_iters
4091 if (prologue == 0)
4092 jmp to vector code
4093 else
4094 execute prologue
4095 if (prologue == num_iters)
4096 go to exit
4097
4098 Hence the run-time scalar cost is incremented by a taken branch,
4099 plus a not-taken branch, plus a taken branch cost.
4100
4101 3. The vectorizer then checks if an epilogue is required. If the
4102 cost model check was not done before during prologue check, it
4103 has to be done with the epilogue check.
4104
4105 if (prologue == 0)
4106 jmp to vector code
4107 else
4108 execute prologue
4109 if (prologue == num_iters)
4110 go to exit
4111 vector code:
4112 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
4113 jmp to epilogue
4114
4115 Hence the run-time scalar cost should be incremented by 2 taken
4116 branches.
4117
4118 TODO: The back end may reorder the BBS's differently and reverse
4119 conditions/branch directions. Change the estimates below to
4120 something more reasonable. */
4121
4122 /* If the number of iterations is known and we do not do versioning, we can
4123 decide whether to vectorize at compile time. Hence the scalar version
4124 does not carry cost model guard costs. */
4125 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
4126 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
4127 {
4128 /* Cost model check occurs at versioning. */
4129 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
4130 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
4131 else
4132 {
4133 /* Cost model check occurs at prologue generation. */
4134 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
4135 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
4136 + vect_get_stmt_cost (cond_branch_not_taken);
4137 /* Cost model check occurs at epilogue generation. */
4138 else
4139 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
4140 }
4141 }
4142
4143 /* Complete the target-specific cost calculations. */
4144 finish_cost (loop_vinfo->vector_costs, loop_vinfo->scalar_costs,
4145 &vec_prologue_cost, &vec_inside_cost, &vec_epilogue_cost);
4146
4147 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
4148
4149 if (dump_enabled_p ())
4150 {
4151 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
4152 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
4153 vec_inside_cost);
4154 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
4155 vec_prologue_cost);
4156 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
4157 vec_epilogue_cost);
4158 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
4159 scalar_single_iter_cost);
4160 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
4161 scalar_outside_cost);
4162 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
4163 vec_outside_cost);
4164 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
4165 peel_iters_prologue);
4166 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4167 peel_iters_epilogue);
4168 }
4169
4170 /* Calculate number of iterations required to make the vector version
4171 profitable, relative to the loop bodies only. The following condition
4172 must hold true:
4173 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
4174 where
4175 SIC = scalar iteration cost, VIC = vector iteration cost,
4176 VOC = vector outside cost, VF = vectorization factor,
4177 NPEEL = prologue iterations + epilogue iterations,
4178 SOC = scalar outside cost for run time cost model check. */
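  /* A worked example with purely hypothetical costs: SIC = 4, VIC = 6,
     VF = 4, VOC = 20 and SOC = NPEEL = 0 give a per-vector-iteration
     saving of SIC * VF - VIC = 10.  At niters = 8 both sides of the
     condition are equal (32), so the first strictly profitable value is
     niters = 9, which is what the (non-partial-vector) computation below
     arrives at.  */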
4179
4180 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
4181 - vec_inside_cost);
4182 if (saving_per_viter <= 0)
4183 {
4184 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4185 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
4186 "vectorization did not happen for a simd loop");
4187
4188 if (dump_enabled_p ())
4189 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4190 "cost model: the vector iteration cost = %d "
4191 "divided by the scalar iteration cost = %d "
4192 "is greater or equal to the vectorization factor = %d"
4193 ".\n",
4194 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4195 *ret_min_profitable_niters = -1;
4196 *ret_min_profitable_estimate = -1;
4197 return;
4198 }
4199
4200 /* ??? The "if" arm is written to handle all cases; see below for what
4201 we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4202 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4203 {
4204 /* Rewriting the condition above in terms of the number of
4205 vector iterations (vniters) rather than the number of
4206 scalar iterations (niters) gives:
4207
4208 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
4209
4210 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
4211
4212 For integer N, X and Y when X > 0:
4213
4214 N * X > Y <==> N >= (Y /[floor] X) + 1. */
4215 int outside_overhead = (vec_outside_cost
4216 - scalar_single_iter_cost * peel_iters_prologue
4217 - scalar_single_iter_cost * peel_iters_epilogue
4218 - scalar_outside_cost);
4219 /* We're only interested in cases that require at least one
4220 vector iteration. */
4221 int min_vec_niters = 1;
4222 if (outside_overhead > 0)
4223 min_vec_niters = outside_overhead / saving_per_viter + 1;
4224
4225 if (dump_enabled_p ())
4226 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
4227 min_vec_niters);
4228
4229 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4230 {
4231 /* Now that we know the minimum number of vector iterations,
4232 find the minimum niters for which the scalar cost is larger:
4233
4234 SIC * niters > VIC * vniters + VOC - SOC
4235
4236 We know that the minimum niters is no more than
4237 vniters * VF + NPEEL, but it might be (and often is) less
4238 than that if a partial vector iteration is cheaper than the
4239 equivalent scalar code. */
4240 int threshold = (vec_inside_cost * min_vec_niters
4241 + vec_outside_cost
4242 - scalar_outside_cost);
4243 if (threshold <= 0)
4244 min_profitable_iters = 1;
4245 else
4246 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
4247 }
4248 else
4249 /* Convert the number of vector iterations into a number of
4250 scalar iterations. */
4251 min_profitable_iters = (min_vec_niters * assumed_vf
4252 + peel_iters_prologue
4253 + peel_iters_epilogue);
4254 }
4255 else
4256 {
4257 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4258 * assumed_vf
4259 - vec_inside_cost * peel_iters_prologue
4260 - vec_inside_cost * peel_iters_epilogue);
4261 if (min_profitable_iters <= 0)
4262 min_profitable_iters = 0;
4263 else
4264 {
4265 min_profitable_iters /= saving_per_viter;
4266
4267 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4268 <= (((int) vec_inside_cost * min_profitable_iters)
4269 + (((int) vec_outside_cost - scalar_outside_cost)
4270 * assumed_vf)))
4271 min_profitable_iters++;
4272 }
4273 }
4274
4275 if (dump_enabled_p ())
4276 dump_printf (MSG_NOTE,
4277 " Calculated minimum iters for profitability: %d\n",
4278 min_profitable_iters);
4279
4280 if (!LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
4281 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4282 /* We want the vectorized loop to execute at least once. */
4283 min_profitable_iters = assumed_vf + peel_iters_prologue;
4284 else if (min_profitable_iters < peel_iters_prologue)
4285 /* For LOOP_VINFO_USING_PARTIAL_VECTORS_P, we need to ensure the
4286 vectorized loop executes at least once. */
4287 min_profitable_iters = peel_iters_prologue;
4288
4289 if (dump_enabled_p ())
4290 dump_printf_loc (MSG_NOTE, vect_location,
4291 " Runtime profitability threshold = %d\n",
4292 min_profitable_iters);
4293
4294 *ret_min_profitable_niters = min_profitable_iters;
4295
4296 /* Calculate number of iterations required to make the vector version
4297 profitable, relative to the loop bodies only.
4298
4299 The non-vectorized variant is SIC * niters and it must win over the vector
4300 variant on the expected loop trip count. The following condition must hold true:
4301 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
4302
4303 if (vec_outside_cost <= 0)
4304 min_profitable_estimate = 0;
4305 /* ??? This "else if" arm is written to handle all cases; see below for
4306 what we would do for !LOOP_VINFO_USING_PARTIAL_VECTORS_P. */
4307 else if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4308 {
4309 /* This is a repeat of the code above, but with + SOC rather
4310 than - SOC. */
4311 int outside_overhead = (vec_outside_cost
4312 - scalar_single_iter_cost * peel_iters_prologue
4313 - scalar_single_iter_cost * peel_iters_epilogue
4314 + scalar_outside_cost);
4315 int min_vec_niters = 1;
4316 if (outside_overhead > 0)
4317 min_vec_niters = outside_overhead / saving_per_viter + 1;
4318
4319 if (LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
4320 {
4321 int threshold = (vec_inside_cost * min_vec_niters
4322 + vec_outside_cost
4323 + scalar_outside_cost);
4324 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
4325 }
4326 else
4327 min_profitable_estimate = (min_vec_niters * assumed_vf
4328 + peel_iters_prologue
4329 + peel_iters_epilogue);
4330 }
4331 else
4332 {
4333 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4334 * assumed_vf
4335 - vec_inside_cost * peel_iters_prologue
4336 - vec_inside_cost * peel_iters_epilogue)
4337 / ((scalar_single_iter_cost * assumed_vf)
4338 - vec_inside_cost);
4339 }
4340 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4341 if (dump_enabled_p ())
4342 dump_printf_loc (MSG_NOTE, vect_location,
4343 " Static estimate profitability threshold = %d\n",
4344 min_profitable_estimate);
4345
4346 *ret_min_profitable_estimate = min_profitable_estimate;
4347 }
4348
4349 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4350 vector elements (not bits) for a vector with NELT elements. */
4351 static void
4352 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4353 vec_perm_builder *sel)
4354 {
4355 /* The encoding is a single stepped pattern. Any wrap-around is handled
4356 by vec_perm_indices. */
4357 sel->new_vector (nelt, 1, 3);
4358 for (unsigned int i = 0; i < 3; i++)
4359 sel->quick_push (i + offset);
4360 }
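/* For example (hypothetical parameters), OFFSET = 2 and NELT = 8 encode the
   stepped pattern { 2, 3, 4, ... }, which vec_perm_indices extends to the
   selection { 2, 3, ..., 9 } over the two concatenated input vectors: the
   first input shifted down by two elements, with the tail taken from the
   second input.  */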
4361
4362 /* Checks whether the target supports whole-vector shifts for vectors of mode
4363 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4364 it supports vec_perm_const with masks for all necessary shift amounts. */
4365 static bool
4366 have_whole_vector_shift (machine_mode mode)
4367 {
4368 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4369 return true;
4370
4371 /* Variable-length vectors should be handled via the optab. */
4372 unsigned int nelt;
4373 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4374 return false;
4375
4376 vec_perm_builder sel;
4377 vec_perm_indices indices;
4378 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4379 {
4380 calc_vec_perm_mask_for_shift (i, nelt, &sel);
4381 indices.new_vector (sel, 2, nelt);
4382 if (!can_vec_perm_const_p (mode, indices, false))
4383 return false;
4384 }
4385 return true;
4386 }
4387
4388 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4389 functions. Design better to avoid maintenance issues. */
4390
4391 /* Function vect_model_reduction_cost.
4392
4393 Models cost for a reduction operation, including the vector ops
4394 generated within the strip-mine loop in some cases, the initial
4395 definition before the loop, and the epilogue code that must be generated. */
4396
4397 static void
4398 vect_model_reduction_cost (loop_vec_info loop_vinfo,
4399 stmt_vec_info stmt_info, internal_fn reduc_fn,
4400 vect_reduction_type reduction_type,
4401 int ncopies, stmt_vector_for_cost *cost_vec)
4402 {
4403 int prologue_cost = 0, epilogue_cost = 0, inside_cost = 0;
4404 enum tree_code code;
4405 optab optab;
4406 tree vectype;
4407 machine_mode mode;
4408 class loop *loop = NULL;
4409
4410 if (loop_vinfo)
4411 loop = LOOP_VINFO_LOOP (loop_vinfo);
4412
4413 /* Condition reductions generate two reductions in the loop. */
4414 if (reduction_type == COND_REDUCTION)
4415 ncopies *= 2;
4416
4417 vectype = STMT_VINFO_VECTYPE (stmt_info);
4418 mode = TYPE_MODE (vectype);
4419 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4420
4421 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4422
4423 if (reduction_type == EXTRACT_LAST_REDUCTION)
4424 /* No extra instructions are needed in the prologue. The loop body
4425 operations are costed in vectorizable_condition. */
4426 inside_cost = 0;
4427 else if (reduction_type == FOLD_LEFT_REDUCTION)
4428 {
4429 /* No extra instructions needed in the prologue. */
4430 prologue_cost = 0;
4431
4432 if (reduc_fn != IFN_LAST)
4433 /* Count one reduction-like operation per vector. */
4434 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4435 stmt_info, 0, vect_body);
4436 else
4437 {
4438 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4439 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4440 inside_cost = record_stmt_cost (cost_vec, nelements,
4441 vec_to_scalar, stmt_info, 0,
4442 vect_body);
4443 inside_cost += record_stmt_cost (cost_vec, nelements,
4444 scalar_stmt, stmt_info, 0,
4445 vect_body);
4446 }
4447 }
4448 else
4449 {
4450 /* Add in cost for initial definition.
4451 For cond reduction we have four vectors: initial index, step,
4452 initial result of the data reduction, initial value of the index
4453 reduction. */
4454 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4455 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4456 scalar_to_vec, stmt_info, 0,
4457 vect_prologue);
4458 }
4459
4460 /* Determine cost of epilogue code.
4461
4462 We have a reduction operator that will reduce the vector in one statement.
4463 Also requires scalar extract. */
4464
4465 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4466 {
4467 if (reduc_fn != IFN_LAST)
4468 {
4469 if (reduction_type == COND_REDUCTION)
4470 {
4471 /* An EQ stmt and a COND_EXPR stmt. */
4472 epilogue_cost += record_stmt_cost (cost_vec, 2,
4473 vector_stmt, stmt_info, 0,
4474 vect_epilogue);
4475 /* Reduction of the max index and a reduction of the found
4476 values. */
4477 epilogue_cost += record_stmt_cost (cost_vec, 2,
4478 vec_to_scalar, stmt_info, 0,
4479 vect_epilogue);
4480 /* A broadcast of the max value. */
4481 epilogue_cost += record_stmt_cost (cost_vec, 1,
4482 scalar_to_vec, stmt_info, 0,
4483 vect_epilogue);
4484 }
4485 else
4486 {
4487 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4488 stmt_info, 0, vect_epilogue);
4489 epilogue_cost += record_stmt_cost (cost_vec, 1,
4490 vec_to_scalar, stmt_info, 0,
4491 vect_epilogue);
4492 }
4493 }
4494 else if (reduction_type == COND_REDUCTION)
4495 {
4496 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4497 /* Extraction of scalar elements. */
4498 epilogue_cost += record_stmt_cost (cost_vec,
4499 2 * estimated_nunits,
4500 vec_to_scalar, stmt_info, 0,
4501 vect_epilogue);
4502 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4503 epilogue_cost += record_stmt_cost (cost_vec,
4504 2 * estimated_nunits - 3,
4505 scalar_stmt, stmt_info, 0,
4506 vect_epilogue);
4507 }
4508 else if (reduction_type == EXTRACT_LAST_REDUCTION
4509 || reduction_type == FOLD_LEFT_REDUCTION)
4510 /* No extra instructions needed in the epilogue. */
4511 ;
4512 else
4513 {
4514 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4515 tree bitsize =
4516 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4517 int element_bitsize = tree_to_uhwi (bitsize);
4518 int nelements = vec_size_in_bits / element_bitsize;
4519
4520 if (code == COND_EXPR)
4521 code = MAX_EXPR;
4522
4523 optab = optab_for_tree_code (code, vectype, optab_default);
4524
4525 /* We have a whole vector shift available. */
4526 if (optab != unknown_optab
4527 && VECTOR_MODE_P (mode)
4528 && optab_handler (optab, mode) != CODE_FOR_nothing
4529 && have_whole_vector_shift (mode))
4530 {
4531 /* Final reduction via vector shifts and the reduction operator.
4532 Also requires scalar extract. */
4533 epilogue_cost += record_stmt_cost (cost_vec,
4534 exact_log2 (nelements) * 2,
4535 vector_stmt, stmt_info, 0,
4536 vect_epilogue);
4537 epilogue_cost += record_stmt_cost (cost_vec, 1,
4538 vec_to_scalar, stmt_info, 0,
4539 vect_epilogue);
4540 }
4541 else
4542 /* Use extracts and reduction op for final reduction. For N
4543 elements, we have N extracts and N-1 reduction ops. */
4544 epilogue_cost += record_stmt_cost (cost_vec,
4545 nelements + nelements - 1,
4546 vector_stmt, stmt_info, 0,
4547 vect_epilogue);
4548 }
4549 }
4550
4551 if (dump_enabled_p ())
4552 dump_printf (MSG_NOTE,
4553 "vect_model_reduction_cost: inside_cost = %d, "
4554 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4555 prologue_cost, epilogue_cost);
4556 }
4557
4558 /* SEQ is a sequence of instructions that initialize the reduction
4559 described by REDUC_INFO. Emit them in the appropriate place. */
4560
4561 static void
4562 vect_emit_reduction_init_stmts (loop_vec_info loop_vinfo,
4563 stmt_vec_info reduc_info, gimple *seq)
4564 {
4565 if (reduc_info->reused_accumulator)
4566 {
4567 /* When reusing an accumulator from the main loop, we only need
4568 initialization instructions if the main loop can be skipped.
4569 In that case, emit the initialization instructions at the end
4570 of the guard block that does the skip. */
4571 edge skip_edge = loop_vinfo->skip_main_loop_edge;
4572 gcc_assert (skip_edge);
4573 gimple_stmt_iterator gsi = gsi_last_bb (skip_edge->src);
4574 gsi_insert_seq_before (&gsi, seq, GSI_SAME_STMT);
4575 }
4576 else
4577 {
4578 /* The normal case: emit the initialization instructions on the
4579 preheader edge. */
4580 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4581 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), seq);
4582 }
4583 }
4584
4585 /* Function get_initial_def_for_reduction
4586
4587 Input:
4588 REDUC_INFO - the info_for_reduction
4589 INIT_VAL - the initial value of the reduction variable
4590 NEUTRAL_OP - a value that has no effect on the reduction, as per
4591 neutral_op_for_reduction
4592
4593 Output:
4594 Return a vector variable, initialized according to the reduction
4595 described by REDUC_INFO. This vector will be used as the initial value
4596 of the vector of partial results.
4597
4598 The value we need is a vector in which element 0 has value INIT_VAL
4599 and every other element has value NEUTRAL_OP. */
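/* For example (hypothetical values): a PLUS_EXPR reduction of ints with
   INIT_VAL 5 and a four-element vector type yields { 5, 0, 0, 0 }, while a
   MIN_EXPR reduction, whose neutral value is the initial value itself,
   yields the splat { 5, 5, 5, 5 }.  */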
4600
4601 static tree
4602 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4603 stmt_vec_info reduc_info,
4604 tree init_val, tree neutral_op)
4605 {
4606 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4607 tree scalar_type = TREE_TYPE (init_val);
4608 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4609 tree init_def;
4610 gimple_seq stmts = NULL;
4611
4612 gcc_assert (vectype);
4613
4614 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4615 || SCALAR_FLOAT_TYPE_P (scalar_type));
4616
4617 gcc_assert (nested_in_vect_loop_p (loop, reduc_info)
4618 || loop == (gimple_bb (reduc_info->stmt))->loop_father);
4619
4620 if (operand_equal_p (init_val, neutral_op))
4621 {
4622 /* If both elements are equal then the vector described above is
4623 just a splat. */
4624 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4625 init_def = gimple_build_vector_from_val (&stmts, vectype, neutral_op);
4626 }
4627 else
4628 {
4629 neutral_op = gimple_convert (&stmts, TREE_TYPE (vectype), neutral_op);
4630 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4631 if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4632 {
4633 /* Construct a splat of NEUTRAL_OP and insert INIT_VAL into
4634 element 0. */
4635 init_def = gimple_build_vector_from_val (&stmts, vectype,
4636 neutral_op);
4637 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4638 vectype, init_def, init_val);
4639 }
4640 else
4641 {
4642 /* Build {INIT_VAL, NEUTRAL_OP, NEUTRAL_OP, ...}. */
4643 tree_vector_builder elts (vectype, 1, 2);
4644 elts.quick_push (init_val);
4645 elts.quick_push (neutral_op);
4646 init_def = gimple_build_vector (&stmts, &elts);
4647 }
4648 }
4649
4650 if (stmts)
4651 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, stmts);
4652 return init_def;
4653 }
4654
4655 /* Get at the initial defs for the reduction PHIs for REDUC_INFO,
4656 which performs a reduction involving GROUP_SIZE scalar statements.
4657 NUMBER_OF_VECTORS is the number of vector defs to create. If NEUTRAL_OP
4658 is nonnull, introducing extra elements of that value will not change the
4659 result. */
4660
4661 static void
4662 get_initial_defs_for_reduction (loop_vec_info loop_vinfo,
4663 stmt_vec_info reduc_info,
4664 vec<tree> *vec_oprnds,
4665 unsigned int number_of_vectors,
4666 unsigned int group_size, tree neutral_op)
4667 {
4668 vec<tree> &initial_values = reduc_info->reduc_initial_values;
4669 unsigned HOST_WIDE_INT nunits;
4670 unsigned j, number_of_places_left_in_vector;
4671 tree vector_type = STMT_VINFO_VECTYPE (reduc_info);
4672 unsigned int i;
4673
4674 gcc_assert (group_size == initial_values.length () || neutral_op);
4675
4676 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4677 created vectors. It is greater than 1 if unrolling is performed.
4678
4679 For example, we have two scalar operands, s1 and s2 (e.g., group of
4680 strided accesses of size two), while NUNITS is four (i.e., four scalars
4681 of this type can be packed in a vector). The output vector will contain
4682 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4683 will be 2).
4684
4685 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4686 vectors containing the operands.
4687
4688 For example, NUNITS is four as before, and the group size is 8
4689 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4690 {s5, s6, s7, s8}. */
4691
4692 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4693 nunits = group_size;
4694
4695 number_of_places_left_in_vector = nunits;
4696 bool constant_p = true;
4697 tree_vector_builder elts (vector_type, nunits, 1);
4698 elts.quick_grow (nunits);
4699 gimple_seq ctor_seq = NULL;
4700 for (j = 0; j < nunits * number_of_vectors; ++j)
4701 {
4702 tree op;
4703 i = j % group_size;
4704
4705 /* Get the def before the loop. In a reduction chain we have only
4706 one initial value. Otherwise we have as many as there are PHIs in the group. */
4707 if (i >= initial_values.length () || (j > i && neutral_op))
4708 op = neutral_op;
4709 else
4710 op = initial_values[i];
4711
4712 /* Create 'vect_ = {op0,op1,...,opn}'. */
4713 number_of_places_left_in_vector--;
4714 elts[nunits - number_of_places_left_in_vector - 1] = op;
4715 if (!CONSTANT_CLASS_P (op))
4716 constant_p = false;
4717
4718 if (number_of_places_left_in_vector == 0)
4719 {
4720 tree init;
4721 if (constant_p && !neutral_op
4722 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4723 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4724 /* Build the vector directly from ELTS. */
4725 init = gimple_build_vector (&ctor_seq, &elts);
4726 else if (neutral_op)
4727 {
4728 /* Build a vector of the neutral value and shift the
4729 other elements into place. */
4730 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4731 neutral_op);
4732 int k = nunits;
4733 while (k > 0 && elts[k - 1] == neutral_op)
4734 k -= 1;
4735 while (k > 0)
4736 {
4737 k -= 1;
4738 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4739 vector_type, init, elts[k]);
4740 }
4741 }
4742 else
4743 {
4744 /* First time round, duplicate ELTS to fill the
4745 required number of vectors. */
4746 duplicate_and_interleave (loop_vinfo, &ctor_seq, vector_type,
4747 elts, number_of_vectors, *vec_oprnds);
4748 break;
4749 }
4750 vec_oprnds->quick_push (init);
4751
4752 number_of_places_left_in_vector = nunits;
4753 elts.new_vector (vector_type, nunits, 1);
4754 elts.quick_grow (nunits);
4755 constant_p = true;
4756 }
4757 }
4758 if (ctor_seq != NULL)
4759 vect_emit_reduction_init_stmts (loop_vinfo, reduc_info, ctor_seq);
4760 }
4761
4762 /* For a statement STMT_INFO taking part in a reduction operation return
4763 the stmt_vec_info the meta information is stored on. */
4764
4765 stmt_vec_info
4766 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
4767 {
4768 stmt_info = vect_orig_stmt (stmt_info);
4769 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4770 if (!is_a <gphi *> (stmt_info->stmt)
4771 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
4772 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4773 gphi *phi = as_a <gphi *> (stmt_info->stmt);
4774 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4775 {
4776 if (gimple_phi_num_args (phi) == 1)
4777 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4778 }
4779 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4780 {
4781 stmt_vec_info info = vinfo->lookup_def (vect_phi_initial_value (phi));
4782 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4783 stmt_info = info;
4784 }
4785 return stmt_info;
4786 }
4787
4788 /* See if LOOP_VINFO is an epilogue loop whose main loop had a reduction that
4789 REDUC_INFO can build on. Adjust REDUC_INFO and return true if so, otherwise
4790 return false. */
4791
4792 static bool
4793 vect_find_reusable_accumulator (loop_vec_info loop_vinfo,
4794 stmt_vec_info reduc_info)
4795 {
4796 loop_vec_info main_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
4797 if (!main_loop_vinfo)
4798 return false;
4799
4800 if (STMT_VINFO_REDUC_TYPE (reduc_info) != TREE_CODE_REDUCTION)
4801 return false;
4802
4803 unsigned int num_phis = reduc_info->reduc_initial_values.length ();
4804 auto_vec<tree, 16> main_loop_results (num_phis);
4805 auto_vec<tree, 16> initial_values (num_phis);
4806 if (edge main_loop_edge = loop_vinfo->main_loop_edge)
4807 {
4808 /* The epilogue loop can be entered either from the main loop or
4809 from an earlier guard block. */
4810 edge skip_edge = loop_vinfo->skip_main_loop_edge;
4811 for (tree incoming_value : reduc_info->reduc_initial_values)
4812 {
4813 /* Look for:
4814
4815 INCOMING_VALUE = phi<MAIN_LOOP_RESULT(main loop),
4816 INITIAL_VALUE(guard block)>. */
4817 gcc_assert (TREE_CODE (incoming_value) == SSA_NAME);
4818
4819 gphi *phi = as_a <gphi *> (SSA_NAME_DEF_STMT (incoming_value));
4820 gcc_assert (gimple_bb (phi) == main_loop_edge->dest);
4821
4822 tree from_main_loop = PHI_ARG_DEF_FROM_EDGE (phi, main_loop_edge);
4823 tree from_skip = PHI_ARG_DEF_FROM_EDGE (phi, skip_edge);
4824
4825 main_loop_results.quick_push (from_main_loop);
4826 initial_values.quick_push (from_skip);
4827 }
4828 }
4829 else
4830 /* The main loop dominates the epilogue loop. */
4831 main_loop_results.splice (reduc_info->reduc_initial_values);
4832
4833 /* See if the main loop has the kind of accumulator we need. */
4834 vect_reusable_accumulator *accumulator
4835 = main_loop_vinfo->reusable_accumulators.get (main_loop_results[0]);
4836 if (!accumulator
4837 || num_phis != accumulator->reduc_info->reduc_scalar_results.length ()
4838 || !std::equal (main_loop_results.begin (), main_loop_results.end (),
4839 accumulator->reduc_info->reduc_scalar_results.begin ()))
4840 return false;
4841
4842 /* Handle the case where we can reduce wider vectors to narrower ones. */
4843 tree vectype = STMT_VINFO_VECTYPE (reduc_info);
4844 tree old_vectype = TREE_TYPE (accumulator->reduc_input);
4845 if (!constant_multiple_p (TYPE_VECTOR_SUBPARTS (old_vectype),
4846 TYPE_VECTOR_SUBPARTS (vectype)))
4847 return false;
4848
4849 /* Non-SLP reductions might apply an adjustment after the reduction
4850 operation, in order to simplify the initialization of the accumulator.
4851 If the epilogue loop carries on from where the main loop left off,
4852 it should apply the same adjustment to the final reduction result.
4853
4854 If the epilogue loop can also be entered directly (rather than via
4855 the main loop), we need to be able to handle that case in the same way,
4856 with the same adjustment. (In principle we could add a PHI node
4857 to select the correct adjustment, but in practice that shouldn't be
4858 necessary.) */
4859 tree main_adjustment
4860 = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (accumulator->reduc_info);
4861 if (loop_vinfo->main_loop_edge && main_adjustment)
4862 {
4863 gcc_assert (num_phis == 1);
4864 tree initial_value = initial_values[0];
4865 /* Check that we can use INITIAL_VALUE as the adjustment and
4866 initialize the accumulator with a neutral value instead. */
4867 if (!operand_equal_p (initial_value, main_adjustment))
4868 return false;
4869 tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
4870 initial_values[0] = neutral_op_for_reduction (TREE_TYPE (initial_value),
4871 code, initial_value);
4872 }
4873 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = main_adjustment;
4874 reduc_info->reduc_initial_values.truncate (0);
4875 reduc_info->reduc_initial_values.splice (initial_values);
4876 reduc_info->reused_accumulator = accumulator;
4877 return true;
4878 }
4879
4880 /* Reduce the vector VEC_DEF down to VECTYPE with reduction operation
4881 CODE, emitting the stmts into SEQ. Returns a vector def of VECTYPE. */
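/* For example (illustrative), reducing a V8SI {a0, ..., a7} down to V4SI
   with CODE = PLUS_EXPR performs one halving step:
     dst1 = {a0, a1, a2, a3}     (low half)
     dst2 = {a4, a5, a6, a7}     (high half)
     new_temp = {a0+a4, a1+a5, a2+a6, a3+a7}
   and repeats the step while the result is still wider than VECTYPE.  */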
4882
4883 static tree
4884 vect_create_partial_epilog (tree vec_def, tree vectype, enum tree_code code,
4885 gimple_seq *seq)
4886 {
4887 unsigned nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (vec_def)).to_constant ();
4888 unsigned nunits1 = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
4889 tree stype = TREE_TYPE (vectype);
4890 tree new_temp = vec_def;
4891 while (nunits > nunits1)
4892 {
4893 nunits /= 2;
4894 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
4895 stype, nunits);
4896 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
4897
4898 /* The target has to make sure we support lowpart/highpart
4899 extraction, either via direct vector extract or through
4900 an integer mode punning. */
4901 tree dst1, dst2;
4902 gimple *epilog_stmt;
4903 if (convert_optab_handler (vec_extract_optab,
4904 TYPE_MODE (TREE_TYPE (new_temp)),
4905 TYPE_MODE (vectype1))
4906 != CODE_FOR_nothing)
4907 {
4908 /* Extract sub-vectors directly once vec_extract becomes
4909 a conversion optab. */
4910 dst1 = make_ssa_name (vectype1);
4911 epilog_stmt
4912 = gimple_build_assign (dst1, BIT_FIELD_REF,
4913 build3 (BIT_FIELD_REF, vectype1,
4914 new_temp, TYPE_SIZE (vectype1),
4915 bitsize_int (0)));
4916 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
4917 dst2 = make_ssa_name (vectype1);
4918 epilog_stmt
4919 = gimple_build_assign (dst2, BIT_FIELD_REF,
4920 build3 (BIT_FIELD_REF, vectype1,
4921 new_temp, TYPE_SIZE (vectype1),
4922 bitsize_int (bitsize)));
4923 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
4924 }
4925 else
4926 {
4927 /* Extract via punning to an appropriately sized integer mode
4928 vector. */
4929 tree eltype = build_nonstandard_integer_type (bitsize, 1);
4930 tree etype = build_vector_type (eltype, 2);
4931 gcc_assert (convert_optab_handler (vec_extract_optab,
4932 TYPE_MODE (etype),
4933 TYPE_MODE (eltype))
4934 != CODE_FOR_nothing);
4935 tree tem = make_ssa_name (etype);
4936 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
4937 build1 (VIEW_CONVERT_EXPR,
4938 etype, new_temp));
4939 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
4940 new_temp = tem;
4941 tem = make_ssa_name (eltype);
4942 epilog_stmt
4943 = gimple_build_assign (tem, BIT_FIELD_REF,
4944 build3 (BIT_FIELD_REF, eltype,
4945 new_temp, TYPE_SIZE (eltype),
4946 bitsize_int (0)));
4947 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
4948 dst1 = make_ssa_name (vectype1);
4949 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
4950 build1 (VIEW_CONVERT_EXPR,
4951 vectype1, tem));
4952 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
4953 tem = make_ssa_name (eltype);
4954 epilog_stmt
4955 = gimple_build_assign (tem, BIT_FIELD_REF,
4956 build3 (BIT_FIELD_REF, eltype,
4957 new_temp, TYPE_SIZE (eltype),
4958 bitsize_int (bitsize)));
4959 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
4960 dst2 = make_ssa_name (vectype1);
4961 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
4962 build1 (VIEW_CONVERT_EXPR,
4963 vectype1, tem));
4964 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
4965 }
4966
4967 new_temp = make_ssa_name (vectype1);
4968 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
4969 gimple_seq_add_stmt_without_update (seq, epilog_stmt);
4970 }
4971
4972 return new_temp;
4973 }
4974
4975 /* Function vect_create_epilog_for_reduction
4976
4977 Create code at the loop-epilog to finalize the result of a reduction
4978 computation.
4979
4980 STMT_INFO is the scalar reduction stmt that is being vectorized.
4981 SLP_NODE is an SLP node containing a group of reduction statements. The
4982 first one in this group is STMT_INFO.
4983 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
4984 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
4985 (counting from 0)
4986
4987 This function:
4988 1. Completes the reduction def-use cycles.
4989 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4990 by calling the function specified by REDUC_FN if available, or by
4991 other means (whole-vector shifts or a scalar loop).
4992 The function also creates a new phi node at the loop exit to preserve
4993 loop-closed form, as illustrated below.
4994
4995 The flow at the entry to this function:
4996
4997 loop:
4998 vec_def = phi <vec_init, null> # REDUCTION_PHI
4999 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5000 s_loop = scalar_stmt # (scalar) STMT_INFO
5001 loop_exit:
5002 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5003 use <s_out0>
5004 use <s_out0>
5005
5006 The above is transformed by this function into:
5007
5008 loop:
5009 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
5010 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
5011 s_loop = scalar_stmt # (scalar) STMT_INFO
5012 loop_exit:
5013 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5014 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5015 v_out2 = reduce <v_out1>
5016 s_out3 = extract_field <v_out2, 0>
5017 s_out4 = adjust_result <s_out3>
5018 use <s_out4>
5019 use <s_out4>
5020 */
5021
5022 static void
5023 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
5024 stmt_vec_info stmt_info,
5025 slp_tree slp_node,
5026 slp_instance slp_node_instance)
5027 {
5028 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5029 gcc_assert (reduc_info->is_reduc_info);
5030 /* For double reductions we need to get at the inner loop reduction
5031 stmt which has the meta info attached. Our stmt_info is that of the
5032 loop-closed PHI of the inner loop which we remember as
5033 def for the reduction PHI generation. */
5034 bool double_reduc = false;
5035 stmt_vec_info rdef_info = stmt_info;
5036 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5037 {
5038 gcc_assert (!slp_node);
5039 double_reduc = true;
5040 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
5041 (stmt_info->stmt, 0));
5042 stmt_info = vect_stmt_to_vectorize (stmt_info);
5043 }
5044 gphi *reduc_def_stmt
5045 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
5046 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
5047 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
5048 tree vectype;
5049 machine_mode mode;
5050 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
5051 basic_block exit_bb;
5052 tree scalar_dest;
5053 tree scalar_type;
5054 gimple *new_phi = NULL, *phi;
5055 gimple_stmt_iterator exit_gsi;
5056 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
5057 gimple *epilog_stmt = NULL;
5058 gimple *exit_phi;
5059 tree bitsize;
5060 tree def;
5061 tree orig_name, scalar_result;
5062 imm_use_iterator imm_iter, phi_imm_iter;
5063 use_operand_p use_p, phi_use_p;
5064 gimple *use_stmt;
5065 auto_vec<tree> reduc_inputs;
5066 int j, i;
5067 vec<tree> &scalar_results = reduc_info->reduc_scalar_results;
5068 unsigned int group_size = 1, k;
5069 auto_vec<gimple *> phis;
5070 /* SLP reduction without reduction chain, e.g.,
5071 # a1 = phi <a2, a0>
5072 # b1 = phi <b2, b0>
5073 a2 = operation (a1)
5074 b2 = operation (b1) */
5075 bool slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
5076 bool direct_slp_reduc;
5077 tree induction_index = NULL_TREE;
5078
5079 if (slp_node)
5080 group_size = SLP_TREE_LANES (slp_node);
5081
5082 if (nested_in_vect_loop_p (loop, stmt_info))
5083 {
5084 outer_loop = loop;
5085 loop = loop->inner;
5086 gcc_assert (!slp_node && double_reduc);
5087 }
5088
5089 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
5090 gcc_assert (vectype);
5091 mode = TYPE_MODE (vectype);
5092
5093 tree induc_val = NULL_TREE;
5094 tree adjustment_def = NULL;
5095 if (slp_node)
5096 ;
5097 else
5098 {
5099 /* Optimize: for induction condition reduction, if we can't use zero
5100 for induc_val, use initial_def. */
5101 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5102 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
5103 else if (double_reduc)
5104 ;
5105 else
5106 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
5107 }
5108
5109 stmt_vec_info single_live_out_stmt[] = { stmt_info };
5110 array_slice<const stmt_vec_info> live_out_stmts = single_live_out_stmt;
5111 if (slp_reduc)
5112 /* All statements produce live-out values. */
5113 live_out_stmts = SLP_TREE_SCALAR_STMTS (slp_node);
5114 else if (slp_node)
5115 /* The last statement in the reduction chain produces the live-out
5116 value. */
5117 single_live_out_stmt[0] = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5118
5119 unsigned vec_num;
5120 int ncopies;
5121 if (slp_node)
5122 {
5123 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
5124 ncopies = 1;
5125 }
5126 else
5127 {
5128 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
5129 vec_num = 1;
5130 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
5131 }
5132
5133 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
5134 which is updated with the current index of the loop for every match of
5135 the original loop's cond_expr (VEC_STMT). This results in a vector
5136 containing the last time the condition passed for that vector lane.
5137 The first match will be a 1 to allow 0 to be used for non-matching
5138 indexes. If there are no matches at all then the vector will be all
5139 zeroes.
5140
5141 PR92772: This algorithm is broken for architectures that support
5142 masked vectors, but do not provide fold_extract_last. */
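  /* Illustrative example (not from the source): for
       if (a[i] < val) last = a[i];
     with a vectorization factor of 4, SERIES_VECT starts as {1, 2, 3, 4}
     and STEP is 4.  After the loop each lane of the index vector holds
     the 1-based scalar iteration number of the last time that lane's
     condition matched, or 0 if it never matched.  */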
5143 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
5144 {
5145 auto_vec<std::pair<tree, bool>, 2> ccompares;
5146 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
5147 cond_info = vect_stmt_to_vectorize (cond_info);
5148 while (cond_info != reduc_info)
5149 {
5150 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
5151 {
5152 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
5153 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
5154 ccompares.safe_push
5155 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
5156 STMT_VINFO_REDUC_IDX (cond_info) == 2));
5157 }
5158 cond_info
5159 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
5160 1 + STMT_VINFO_REDUC_IDX
5161 (cond_info)));
5162 cond_info = vect_stmt_to_vectorize (cond_info);
5163 }
5164 gcc_assert (ccompares.length () != 0);
5165
5166 tree indx_before_incr, indx_after_incr;
5167 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
5168 int scalar_precision
5169 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
5170 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
5171 tree cr_index_vector_type = get_related_vectype_for_scalar_type
5172 (TYPE_MODE (vectype), cr_index_scalar_type,
5173 TYPE_VECTOR_SUBPARTS (vectype));
5174
5175 /* First we create a simple vector induction variable which starts
5176 with the values {1,2,3,...} (SERIES_VECT) and increments by the
5177 vector size (STEP). */
5178
5179 /* Create a {1,2,3,...} vector. */
5180 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
5181
5182 /* Create a vector of the step value. */
5183 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
5184 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
5185
5186 /* Create an induction variable. */
5187 gimple_stmt_iterator incr_gsi;
5188 bool insert_after;
5189 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
5190 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
5191 insert_after, &indx_before_incr, &indx_after_incr);
5192
5193 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
5194 filled with zeros (VEC_ZERO). */
5195
5196 /* Create a vector of 0s. */
5197 tree zero = build_zero_cst (cr_index_scalar_type);
5198 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
5199
5200 /* Create a vector phi node. */
5201 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
5202 new_phi = create_phi_node (new_phi_tree, loop->header);
5203 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
5204 loop_preheader_edge (loop), UNKNOWN_LOCATION);
5205
5206 /* Now take the condition from the loop's original cond_exprs
5207 and produce a new cond_expr (INDEX_COND_EXPR) which for
5208 every match uses values from the induction variable
5209 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5210 (NEW_PHI_TREE).
5211 Finally, we update the phi (NEW_PHI_TREE) to take the value of
5212 the new cond_expr (INDEX_COND_EXPR). */
5213 gimple_seq stmts = NULL;
5214 for (int i = ccompares.length () - 1; i != -1; --i)
5215 {
5216 tree ccompare = ccompares[i].first;
5217 if (ccompares[i].second)
5218 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5219 cr_index_vector_type,
5220 ccompare,
5221 indx_before_incr, new_phi_tree);
5222 else
5223 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
5224 cr_index_vector_type,
5225 ccompare,
5226 new_phi_tree, indx_before_incr);
5227 }
5228 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
5229
5230 /* Update the phi with the vec cond. */
5231 induction_index = new_phi_tree;
5232 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5233 loop_latch_edge (loop), UNKNOWN_LOCATION);
5234 }
5235
5236 /* 2. Create epilog code.
5237 The reduction epilog code operates across the elements of the vector
5238 of partial results computed by the vectorized loop.
5239 The reduction epilog code consists of:
5240
5241 step 1: compute the scalar result in a vector (v_out2)
5242 step 2: extract the scalar result (s_out3) from the vector (v_out2)
5243 step 3: adjust the scalar result (s_out3) if needed.
5244
5245 Step 1 can be accomplished using one of the following three schemes:
5246 (scheme 1) using reduc_fn, if available.
5247 (scheme 2) using whole-vector shifts, if available.
5248 (scheme 3) using a scalar loop. In this case steps 1+2 above are
5249 combined.
5250
5251 The overall epilog code looks like this:
5252
5253 s_out0 = phi <s_loop> # original EXIT_PHI
5254 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5255 v_out2 = reduce <v_out1> # step 1
5256 s_out3 = extract_field <v_out2, 0> # step 2
5257 s_out4 = adjust_result <s_out3> # step 3
5258
5259 (step 3 is optional, and steps 1 and 2 may be combined).
5260 Lastly, the uses of s_out0 are replaced by s_out4. */
5261
5262
5263 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5264 v_out1 = phi <VECT_DEF>
5265 Store them in NEW_PHIS. */
5266 if (double_reduc)
5267 loop = outer_loop;
5268 exit_bb = single_exit (loop)->dest;
5269 exit_gsi = gsi_after_labels (exit_bb);
5270 reduc_inputs.create (slp_node ? vec_num : ncopies);
5271 for (unsigned i = 0; i < vec_num; i++)
5272 {
5273 gimple_seq stmts = NULL;
5274 if (slp_node)
5275 def = vect_get_slp_vect_def (slp_node, i);
5276 else
5277 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
5278 for (j = 0; j < ncopies; j++)
5279 {
5280 tree new_def = copy_ssa_name (def);
5281 phi = create_phi_node (new_def, exit_bb);
5282 if (j)
5283 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
5284 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5285 new_def = gimple_convert (&stmts, vectype, new_def);
5286 reduc_inputs.quick_push (new_def);
5287 }
5288 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5289 }
5290
5291 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5292 (i.e. when reduc_fn is not available) and in the final adjustment
5293 code (if needed). Also get the original scalar reduction variable as
5294 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
5295 represents a reduction pattern), the tree-code and scalar-def are
5296 taken from the original stmt that the pattern-stmt (STMT) replaces.
5297 Otherwise (it is a regular reduction) - the tree-code and scalar-def
5298 are taken from STMT. */
5299
5300 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
5301 if (orig_stmt_info != stmt_info)
5302 {
5303 /* Reduction pattern */
5304 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5305 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
5306 }
5307
5308 scalar_dest = gimple_get_lhs (orig_stmt_info->stmt);
5309 scalar_type = TREE_TYPE (scalar_dest);
5310 scalar_results.create (group_size);
5311 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5312 bitsize = TYPE_SIZE (scalar_type);
5313
5314 /* True if we should implement SLP_REDUC using native reduction operations
5315 instead of scalar operations. */
5316 direct_slp_reduc = (reduc_fn != IFN_LAST
5317 && slp_reduc
5318 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5319
5320 /* In case of reduction chain, e.g.,
5321 # a1 = phi <a3, a0>
5322 a2 = operation (a1)
5323 a3 = operation (a2),
5324
5325 we may end up with more than one vector result. Here we reduce them
5326 to one vector.
5327
5328 The same is true if we couldn't use a single def-use cycle. */
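  /* For instance (illustrative), with ncopies == 2 and CODE = PLUS_EXPR,
     REDUC_INPUTS {v0, v1} are first combined into the single vector
     v0 + v1, which the epilogue code below then reduces to a scalar.  */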
5329 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info)
5330 || direct_slp_reduc
5331 || ncopies > 1)
5332 {
5333 gimple_seq stmts = NULL;
5334 tree single_input = reduc_inputs[0];
5335 for (k = 1; k < reduc_inputs.length (); k++)
5336 single_input = gimple_build (&stmts, code, vectype,
5337 single_input, reduc_inputs[k]);
5338 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5339
5340 reduc_inputs.truncate (0);
5341 reduc_inputs.safe_push (single_input);
5342 }
5343
5344 tree orig_reduc_input = reduc_inputs[0];
5345
5346 /* If this loop is an epilogue loop that can be skipped after the
5347 main loop, we can only share a reduction operation between the
5348 main loop and the epilogue if we put it at the target of the
5349 skip edge.
5350
5351 We can still reuse accumulators if this check fails. Doing so has
5352 the minor(?) benefit of making the epilogue loop's scalar result
5353 independent of the main loop's scalar result. */
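  /* Illustrative sketch: the epilogue loop's exit block falls through to
     the block targeted by SKIP_THIS_LOOP_EDGE.  The new PHI created there
     selects the epilogue's vector result when the epilogue ran and the
     main loop's accumulator when it was skipped, so the final reduction
     code below is emitted only once, at that join block.  */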
5354 bool unify_with_main_loop_p = false;
5355 if (reduc_info->reused_accumulator
5356 && loop_vinfo->skip_this_loop_edge
5357 && single_succ_p (exit_bb)
5358 && single_succ (exit_bb) == loop_vinfo->skip_this_loop_edge->dest)
5359 {
5360 unify_with_main_loop_p = true;
5361
5362 basic_block reduc_block = loop_vinfo->skip_this_loop_edge->dest;
5363 reduc_inputs[0] = make_ssa_name (vectype);
5364 gphi *new_phi = create_phi_node (reduc_inputs[0], reduc_block);
5365 add_phi_arg (new_phi, orig_reduc_input, single_succ_edge (exit_bb),
5366 UNKNOWN_LOCATION);
5367 add_phi_arg (new_phi, reduc_info->reused_accumulator->reduc_input,
5368 loop_vinfo->skip_this_loop_edge, UNKNOWN_LOCATION);
5369 exit_gsi = gsi_after_labels (reduc_block);
5370 }
5371
5372 /* Shouldn't be used beyond this point. */
5373 exit_bb = nullptr;
5374
5375 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5376 && reduc_fn != IFN_LAST)
5377 {
5378 /* For condition reductions, we have a vector (REDUC_INPUTS 0) containing
5379 various data values where the condition matched and another vector
5380 (INDUCTION_INDEX) containing all the indexes of those matches. We
5381 need to extract the last matching index (which will be the index with
5382 highest value) and use this to index into the data vector.
5383 For the case where there were no matches, the data vector will contain
5384 all default values and the index vector will be all zeros. */
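  /* Worked example (illustrative): with REDUC_INPUTS[0] = {0, 7, 0, 9}
     and INDUCTION_INDEX = {0, 2, 0, 4}, IFN_REDUC_MAX yields max index 4,
     the comparison selects lane 3, the VEC_COND produces {0, 0, 0, 9} and
     the final unsigned MAX reduction extracts the value 9.  */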
5385
5386 /* Get various versions of the type of the vector of indexes. */
5387 tree index_vec_type = TREE_TYPE (induction_index);
5388 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5389 tree index_scalar_type = TREE_TYPE (index_vec_type);
5390 tree index_vec_cmp_type = truth_type_for (index_vec_type);
5391
5392 /* Get an unsigned integer version of the type of the data vector. */
5393 int scalar_precision
5394 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5395 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5396 tree vectype_unsigned = get_same_sized_vectype (scalar_type_unsigned,
5397 vectype);
5398
5399 /* First we need to create a vector (ZERO_VEC) of zeros and another
5400 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5401 can create using a MAX reduction and then expanding.
5402 In the case where the loop never made any matches, the max index will
5403 be zero. */
5404
5405 /* Vector of {0, 0, 0,...}. */
5406 tree zero_vec = build_zero_cst (vectype);
5407
5408 /* Find maximum value from the vector of found indexes. */
5409 tree max_index = make_ssa_name (index_scalar_type);
5410 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5411 1, induction_index);
5412 gimple_call_set_lhs (max_index_stmt, max_index);
5413 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5414
5415 /* Vector of {max_index, max_index, max_index,...}. */
5416 tree max_index_vec = make_ssa_name (index_vec_type);
5417 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5418 max_index);
5419 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5420 max_index_vec_rhs);
5421 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5422
5423 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5424 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5425 from the data vector (REDUC_INPUTS 0) for matches, 0 (ZERO_VEC)
5426 otherwise. Only one value should match, resulting in a vector
5427 (VEC_COND) with one data value and the rest zeros.
5428 In the case where the loop never made any matches, every index will
5429 match, resulting in a vector with all data values (which will all be
5430 the default value). */
5431
5432 /* Compare the max index vector to the vector of found indexes to find
5433 the position of the max value. */
5434 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5435 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5436 induction_index,
5437 max_index_vec);
5438 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5439
5440 /* Use the compare to choose either values from the data vector or
5441 zero. */
5442 tree vec_cond = make_ssa_name (vectype);
5443 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5444 vec_compare,
5445 reduc_inputs[0],
5446 zero_vec);
5447 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5448
5449 /* Finally we need to extract the data value from the vector (VEC_COND)
5450 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5451 reduction, but because this doesn't exist, we can use a MAX reduction
5452 instead. The data value might be signed or a float so we need to cast
5453 it first.
5454 In the case where the loop never made any matches, the data values are
5455 all identical, and so will reduce down correctly. */
5456
5457 /* Make the matched data values unsigned. */
5458 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5459 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5460 vec_cond);
5461 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5462 VIEW_CONVERT_EXPR,
5463 vec_cond_cast_rhs);
5464 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5465
5466 /* Reduce down to a scalar value. */
5467 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5468 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5469 1, vec_cond_cast);
5470 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5471 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5472
5473 /* Convert the reduced value back to the result type and set as the
5474 result. */
5475 gimple_seq stmts = NULL;
5476 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5477 data_reduc);
5478 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5479 scalar_results.safe_push (new_temp);
5480 }
5481 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
5482 && reduc_fn == IFN_LAST)
5483 {
5484 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5485 idx = 0;
5486 idx_val = induction_index[0];
5487 val = data_reduc[0];
5488 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5489 if (induction_index[i] > idx_val)
5490 val = data_reduc[i], idx_val = induction_index[i];
5491 return val; */
5492
5493 tree data_eltype = TREE_TYPE (vectype);
5494 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5495 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5496 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5497 /* Enforced by vectorizable_reduction, which ensures we have target
5498 support before allowing a conditional reduction on variable-length
5499 vectors. */
5500 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5501 tree idx_val = NULL_TREE, val = NULL_TREE;
5502 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5503 {
5504 tree old_idx_val = idx_val;
5505 tree old_val = val;
5506 idx_val = make_ssa_name (idx_eltype);
5507 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5508 build3 (BIT_FIELD_REF, idx_eltype,
5509 induction_index,
5510 bitsize_int (el_size),
5511 bitsize_int (off)));
5512 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5513 val = make_ssa_name (data_eltype);
5514 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5515 build3 (BIT_FIELD_REF,
5516 data_eltype,
5517 reduc_inputs[0],
5518 bitsize_int (el_size),
5519 bitsize_int (off)));
5520 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5521 if (off != 0)
5522 {
5523 tree new_idx_val = idx_val;
5524 if (off != v_size - el_size)
5525 {
5526 new_idx_val = make_ssa_name (idx_eltype);
5527 epilog_stmt = gimple_build_assign (new_idx_val,
5528 MAX_EXPR, idx_val,
5529 old_idx_val);
5530 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5531 }
5532 tree new_val = make_ssa_name (data_eltype);
5533 epilog_stmt = gimple_build_assign (new_val,
5534 COND_EXPR,
5535 build2 (GT_EXPR,
5536 boolean_type_node,
5537 idx_val,
5538 old_idx_val),
5539 val, old_val);
5540 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5541 idx_val = new_idx_val;
5542 val = new_val;
5543 }
5544 }
5545 /* Convert the reduced value back to the result type and set as the
5546 result. */
5547 gimple_seq stmts = NULL;
5548 val = gimple_convert (&stmts, scalar_type, val);
5549 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5550 scalar_results.safe_push (val);
5551 }
5552
5553 /* 2.3 Create the reduction code, using one of the three schemes described
5554 above. In SLP we simply need to extract all the elements from the
5555 vector (without reducing them), so we use scalar shifts. */
5556 else if (reduc_fn != IFN_LAST && !slp_reduc)
5557 {
5558 tree tmp;
5559 tree vec_elem_type;
5560
5561 /* Case 1: Create:
5562 v_out2 = reduc_expr <v_out1> */
5563
5564 if (dump_enabled_p ())
5565 dump_printf_loc (MSG_NOTE, vect_location,
5566 "Reduce using direct vector reduction.\n");
5567
5568 gimple_seq stmts = NULL;
5569 vec_elem_type = TREE_TYPE (vectype);
5570 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5571 vec_elem_type, reduc_inputs[0]);
5572 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5573 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5574
5575 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5576 && induc_val)
5577 {
5578 /* Earlier we set the initial value to be a vector of INDUC_VAL
5579 values. Check the result and if it is INDUC_VAL then replace
5580 it with the original initial value, unless INDUC_VAL is
5581 already the same as INITIAL_DEF. */
5582 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5583 induc_val);
5584 tree initial_def = reduc_info->reduc_initial_values[0];
5585
5586 tmp = make_ssa_name (new_scalar_dest);
5587 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5588 initial_def, new_temp);
5589 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5590 new_temp = tmp;
5591 }
5592
5593 scalar_results.safe_push (new_temp);
5594 }
5595 else if (direct_slp_reduc)
5596 {
5597 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5598 with the elements for other SLP statements replaced with the
5599 neutral value. We can then do a normal reduction on each vector. */
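  /* For example (illustrative), an SLP group of two sum reductions whose
     lanes are interleaved as {a0, b0, a1, b1} uses the masked index
     vector {0, 1, 0, 1}: result 0 selects {a0, 0, a1, 0} and result 1
     selects {0, b0, 0, b1} (0 being the neutral value for a sum), and
     each selected vector is then reduced with a normal REDUC_FN call.  */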
5600
5601 /* Enforced by vectorizable_reduction. */
5602 gcc_assert (reduc_inputs.length () == 1);
5603 gcc_assert (pow2p_hwi (group_size));
5604
5605 gimple_seq seq = NULL;
5606
5607 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5608 and the same element size as VECTYPE. */
5609 tree index = build_index_vector (vectype, 0, 1);
5610 tree index_type = TREE_TYPE (index);
5611 tree index_elt_type = TREE_TYPE (index_type);
5612 tree mask_type = truth_type_for (index_type);
5613
5614 /* Create a vector that, for each element, identifies which of
5615 the REDUC_GROUP_SIZE results should use it. */
5616 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5617 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5618 build_vector_from_val (index_type, index_mask));
5619
5620 /* Get a neutral vector value. This is simply a splat of the neutral
5621 scalar value if we have one, otherwise the initial scalar value
5622 is itself a neutral value. */
5623 tree vector_identity = NULL_TREE;
5624 tree neutral_op = NULL_TREE;
5625 if (slp_node)
5626 {
5627 tree initial_value = NULL_TREE;
5628 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5629 initial_value = reduc_info->reduc_initial_values[0];
5630 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype), code,
5631 initial_value);
5632 }
5633 if (neutral_op)
5634 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5635 neutral_op);
5636 for (unsigned int i = 0; i < group_size; ++i)
5637 {
5638 /* If there's no universal neutral value, we can use the
5639 initial scalar value from the original PHI. This is used
5640 for MIN and MAX reduction, for example. */
5641 if (!neutral_op)
5642 {
5643 tree scalar_value = reduc_info->reduc_initial_values[i];
5644 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5645 scalar_value);
5646 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5647 scalar_value);
5648 }
5649
5650 /* Calculate the equivalent of:
5651
5652 sel[j] = (index[j] == i);
5653
5654 which selects the elements of REDUC_INPUTS[0] that should
5655 be included in the result. */
5656 tree compare_val = build_int_cst (index_elt_type, i);
5657 compare_val = build_vector_from_val (index_type, compare_val);
5658 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5659 index, compare_val);
5660
5661 /* Calculate the equivalent of:
5662
5663 vec = sel ? reduc_inputs[0] : vector_identity;
5664
5665 VEC is now suitable for a full vector reduction. */
5666 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5667 sel, reduc_inputs[0], vector_identity);
5668
5669 /* Do the reduction and convert it to the appropriate type. */
5670 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5671 TREE_TYPE (vectype), vec);
5672 scalar = gimple_convert (&seq, scalar_type, scalar);
5673 scalar_results.safe_push (scalar);
5674 }
5675 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5676 }
5677 else
5678 {
5679 bool reduce_with_shift;
5680 tree vec_temp;
5681
5682 gcc_assert (slp_reduc || reduc_inputs.length () == 1);
5683
5684 /* See if the target wants to do the final (shift) reduction
5685 in a vector mode of smaller size and first reduce upper/lower
5686 halves against each other. */
5687 enum machine_mode mode1 = mode;
5688 tree stype = TREE_TYPE (vectype);
5689 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5690 unsigned nunits1 = nunits;
5691 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5692 && reduc_inputs.length () == 1)
5693 {
5694 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5695 /* For SLP reductions we have to make sure the lanes match up, but
5696 since we're doing an individual-element final reduction, reducing
5697 the vector width here is even more important.
5698 ??? We can also separate lanes with permutes; for the common
5699 case of a power-of-two group size, odd/even extracts would work. */
5700 if (slp_reduc && nunits != nunits1)
5701 {
5702 nunits1 = least_common_multiple (nunits1, group_size);
5703 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5704 }
5705 }
5706 if (!slp_reduc
5707 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5708 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5709
5710 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5711 stype, nunits1);
5712 reduce_with_shift = have_whole_vector_shift (mode1);
5713 if (!VECTOR_MODE_P (mode1))
5714 reduce_with_shift = false;
5715 else
5716 {
5717 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5718 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5719 reduce_with_shift = false;
5720 }
5721
5722 /* First reduce the vector to the vector size we should do the
5723 shift reduction on, by combining the upper and lower halves. */
5724 gimple_seq stmts = NULL;
5725 new_temp = vect_create_partial_epilog (reduc_inputs[0], vectype1,
5726 code, &stmts);
5727 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5728 reduc_inputs[0] = new_temp;
5729
5730 if (reduce_with_shift && !slp_reduc)
5731 {
5732 int element_bitsize = tree_to_uhwi (bitsize);
5733 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5734 for variable-length vectors and also requires direct target support
5735 for loop reductions. */
5736 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5737 int nelements = vec_size_in_bits / element_bitsize;
5738 vec_perm_builder sel;
5739 vec_perm_indices indices;
5740
5741 int elt_offset;
5742
5743 tree zero_vec = build_zero_cst (vectype1);
5744 /* Case 2: Create:
5745 for (offset = nelements/2; offset >= 1; offset/=2)
5746 {
5747 Create: va' = vec_shift <va, offset>
5748 Create: va = vop <va, va'>
5749 } */
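	  /* E.g. (illustrative) for a V4SI sum {a0, a1, a2, a3}:
	       shift by 2: {a2, a3, 0, 0}, add -> {a0+a2, a1+a3, _, _}
	       shift by 1: {a1+a3, _, _, _}, add -> {a0+a1+a2+a3, _, _, _}
	     leaving the full sum in element 0, which step 2.4 extracts.  */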
5750
5751 tree rhs;
5752
5753 if (dump_enabled_p ())
5754 dump_printf_loc (MSG_NOTE, vect_location,
5755 "Reduce using vector shifts\n");
5756
5757 gimple_seq stmts = NULL;
5758 new_temp = gimple_convert (&stmts, vectype1, new_temp);
5759 for (elt_offset = nelements / 2;
5760 elt_offset >= 1;
5761 elt_offset /= 2)
5762 {
5763 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5764 indices.new_vector (sel, 2, nelements);
5765 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5766 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5767 new_temp, zero_vec, mask);
5768 new_temp = gimple_build (&stmts, code,
5769 vectype1, new_name, new_temp);
5770 }
5771 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5772
5773 /* 2.4 Extract the final scalar result. Create:
5774 s_out3 = extract_field <v_out2, bitpos> */
5775
5776 if (dump_enabled_p ())
5777 dump_printf_loc (MSG_NOTE, vect_location,
5778 "extract scalar result\n");
5779
5780 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5781 bitsize, bitsize_zero_node);
5782 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5783 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5784 gimple_assign_set_lhs (epilog_stmt, new_temp);
5785 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5786 scalar_results.safe_push (new_temp);
5787 }
5788 else
5789 {
5790 /* Case 3: Create:
5791 s = extract_field <v_out2, 0>
5792 for (offset = element_size;
5793 offset < vector_size;
5794 offset += element_size;)
5795 {
5796 Create: s' = extract_field <v_out2, offset>
5797 Create: s = op <s, s'> // For non SLP cases
5798 } */
5799
5800 if (dump_enabled_p ())
5801 dump_printf_loc (MSG_NOTE, vect_location,
5802 "Reduce using scalar code.\n");
5803
5804 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5805 int element_bitsize = tree_to_uhwi (bitsize);
5806 tree compute_type = TREE_TYPE (vectype);
5807 gimple_seq stmts = NULL;
5808 FOR_EACH_VEC_ELT (reduc_inputs, i, vec_temp)
5809 {
5810 int bit_offset;
5811 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5812 vec_temp, bitsize, bitsize_zero_node);
5813
5814 /* In SLP we don't need to apply the reduction operation, so we
5815 just collect the s' values in SCALAR_RESULTS. */
5816 if (slp_reduc)
5817 scalar_results.safe_push (new_temp);
5818
5819 for (bit_offset = element_bitsize;
5820 bit_offset < vec_size_in_bits;
5821 bit_offset += element_bitsize)
5822 {
5823 tree bitpos = bitsize_int (bit_offset);
5824 new_name = gimple_build (&stmts, BIT_FIELD_REF,
5825 compute_type, vec_temp,
5826 bitsize, bitpos);
5827 if (slp_reduc)
5828 {
5829 /* In SLP we don't need to apply the reduction operation, so
5830 we just collect the s' values in SCALAR_RESULTS. */
5831 new_temp = new_name;
5832 scalar_results.safe_push (new_name);
5833 }
5834 else
5835 new_temp = gimple_build (&stmts, code, compute_type,
5836 new_name, new_temp);
5837 }
5838 }
5839
5840 /* The only case where we need to reduce scalar results in SLP is
5841 unrolling. If the size of SCALAR_RESULTS is greater than
5842 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5843 REDUC_GROUP_SIZE. */
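	  /* E.g. (illustrative) with REDUC_GROUP_SIZE == 2 and
	     SCALAR_RESULTS == {a0, b0, a1, b1} after unrolling, the loop
	     below computes scalar_results[0] = a0 op a1 and
	     scalar_results[1] = b0 op b1 before truncating to two results.  */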
5844 if (slp_reduc)
5845 {
5846 tree res, first_res, new_res;
5847
5848 /* Reduce multiple scalar results in case of SLP unrolling. */
5849 for (j = group_size; scalar_results.iterate (j, &res);
5850 j++)
5851 {
5852 first_res = scalar_results[j % group_size];
5853 new_res = gimple_build (&stmts, code, compute_type,
5854 first_res, res);
5855 scalar_results[j % group_size] = new_res;
5856 }
5857 scalar_results.truncate (group_size);
5858 for (k = 0; k < group_size; k++)
5859 scalar_results[k] = gimple_convert (&stmts, scalar_type,
5860 scalar_results[k]);
5861 }
5862 else
5863 {
5864 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5865 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5866 scalar_results.safe_push (new_temp);
5867 }
5868
5869 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5870 }
5871
5872 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5873 && induc_val)
5874 {
5875 /* Earlier we set the initial value to be a vector of INDUC_VAL
5876 values. Check the result and if it is INDUC_VAL then replace
5877 it with the original initial value, unless INDUC_VAL is
5878 already the same as INITIAL_DEF. */
5879 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5880 induc_val);
5881 tree initial_def = reduc_info->reduc_initial_values[0];
5882
5883 tree tmp = make_ssa_name (new_scalar_dest);
5884 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5885 initial_def, new_temp);
5886 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5887 scalar_results[0] = tmp;
5888 }
5889 }
5890
5891 /* 2.5 Adjust the final result by the initial value of the reduction
5892 variable. (When such adjustment is not needed, then
5893 'adjustment_def' is zero). For example, if code is PLUS we create:
5894 new_temp = loop_exit_def + adjustment_def */
5895
5896 if (adjustment_def)
5897 {
5898 gcc_assert (!slp_reduc);
5899 gimple_seq stmts = NULL;
5900 if (double_reduc)
5901 {
5902 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
5903 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
5904 new_temp = gimple_build (&stmts, code, vectype,
5905 reduc_inputs[0], adjustment_def);
5906 }
5907 else
5908 {
5909 new_temp = scalar_results[0];
5910 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5911 adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
5912 new_temp = gimple_build (&stmts, code, scalar_type,
5913 new_temp, adjustment_def);
5914 }
5915
5916 epilog_stmt = gimple_seq_last_stmt (stmts);
5917 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5918 scalar_results[0] = new_temp;
5919 }
5920
5921 /* Record this operation if it could be reused by the epilogue loop. */
5922 if (STMT_VINFO_REDUC_TYPE (reduc_info) == TREE_CODE_REDUCTION)
5923 loop_vinfo->reusable_accumulators.put (scalar_results[0],
5924 { orig_reduc_input, reduc_info });
5925
5926 if (double_reduc)
5927 loop = outer_loop;
5928
5929 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5930 phis with new adjusted scalar results, i.e., replace use <s_out0>
5931 with use <s_out4>.
5932
5933 Transform:
5934 loop_exit:
5935 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5936 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5937 v_out2 = reduce <v_out1>
5938 s_out3 = extract_field <v_out2, 0>
5939 s_out4 = adjust_result <s_out3>
5940 use <s_out0>
5941 use <s_out0>
5942
5943 into:
5944
5945 loop_exit:
5946 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5947 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5948 v_out2 = reduce <v_out1>
5949 s_out3 = extract_field <v_out2, 0>
5950 s_out4 = adjust_result <s_out3>
5951 use <s_out4>
5952 use <s_out4> */
5953
5954 gcc_assert (live_out_stmts.size () == scalar_results.length ());
5955 for (k = 0; k < live_out_stmts.size (); k++)
5956 {
5957 stmt_vec_info scalar_stmt_info = vect_orig_stmt (live_out_stmts[k]);
5958 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5959
5960 phis.create (3);
5961 /* Find the loop-closed-use at the loop exit of the original scalar
5962 result. (The reduction result is expected to have two immediate uses,
5963 one at the latch block, and one at the loop exit). For double
5964 reductions we are looking for exit phis of the outer loop. */
5965 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5966 {
5967 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5968 {
5969 if (!is_gimple_debug (USE_STMT (use_p)))
5970 phis.safe_push (USE_STMT (use_p));
5971 }
5972 else
5973 {
5974 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5975 {
5976 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5977
5978 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5979 {
5980 if (!flow_bb_inside_loop_p (loop,
5981 gimple_bb (USE_STMT (phi_use_p)))
5982 && !is_gimple_debug (USE_STMT (phi_use_p)))
5983 phis.safe_push (USE_STMT (phi_use_p));
5984 }
5985 }
5986 }
5987 }
5988
5989 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5990 {
5991 /* Replace the uses: */
5992 orig_name = PHI_RESULT (exit_phi);
5993
5994 /* Look for a single use at the target of the skip edge. */
5995 if (unify_with_main_loop_p)
5996 {
5997 use_operand_p use_p;
5998 gimple *user;
5999 if (!single_imm_use (orig_name, &use_p, &user))
6000 gcc_unreachable ();
6001 orig_name = gimple_get_lhs (user);
6002 }
6003
6004 scalar_result = scalar_results[k];
6005 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6006 {
6007 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6008 SET_USE (use_p, scalar_result);
6009 update_stmt (use_stmt);
6010 }
6011 }
6012
6013 phis.release ();
6014 }
6015 }
6016
6017 /* Return a vector of type VECTYPE that is equal to the vector select
6018 operation "MASK ? VEC : IDENTITY". Insert the select statements
6019 before GSI. */
6020
6021 static tree
6022 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6023 tree vec, tree identity)
6024 {
6025 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6026 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6027 mask, vec, identity);
6028 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6029 return cond;
6030 }
6031
6032 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6033 order, starting with LHS. Insert the extraction statements before GSI and
6034 associate the new scalar SSA names with variable SCALAR_DEST.
6035 Return the SSA name for the result. */
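/* For example (illustrative), with a V4SF VECTOR_RHS {v0, v1, v2, v3} and
   CODE = PLUS_EXPR this expands to the strictly ordered scalar sequence
   ((((LHS + v0) + v1) + v2) + v3), one BIT_FIELD_REF extraction and one
   scalar operation per element.  */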
6036
6037 static tree
6038 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6039 tree_code code, tree lhs, tree vector_rhs)
6040 {
6041 tree vectype = TREE_TYPE (vector_rhs);
6042 tree scalar_type = TREE_TYPE (vectype);
6043 tree bitsize = TYPE_SIZE (scalar_type);
6044 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6045 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6046
6047 for (unsigned HOST_WIDE_INT bit_offset = 0;
6048 bit_offset < vec_size_in_bits;
6049 bit_offset += element_bitsize)
6050 {
6051 tree bitpos = bitsize_int (bit_offset);
6052 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6053 bitsize, bitpos);
6054
6055 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6056 rhs = make_ssa_name (scalar_dest, stmt);
6057 gimple_assign_set_lhs (stmt, rhs);
6058 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6059
6060 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6061 tree new_name = make_ssa_name (scalar_dest, stmt);
6062 gimple_assign_set_lhs (stmt, new_name);
6063 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6064 lhs = new_name;
6065 }
6066 return lhs;
6067 }
6068
6069 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
6070 type of the vector input. */
6071
6072 static internal_fn
6073 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
6074 {
6075 internal_fn mask_reduc_fn;
6076
6077 switch (reduc_fn)
6078 {
6079 case IFN_FOLD_LEFT_PLUS:
6080 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
6081 break;
6082
6083 default:
6084 return IFN_LAST;
6085 }
6086
6087 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
6088 OPTIMIZE_FOR_SPEED))
6089 return mask_reduc_fn;
6090 return IFN_LAST;
6091 }
6092
6093 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
6094 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6095 statement. CODE is the operation performed by STMT_INFO and OPS are
6096 its scalar operands. REDUC_INDEX is the index of the operand in
6097 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6098 implements in-order reduction, or IFN_LAST if we should open-code it.
6099 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6100 that should be used to control the operation in a fully-masked loop. */
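/* A sketch of the code this emits (illustrative), for an in-order float
   sum when IFN_FOLD_LEFT_PLUS is available:

     red_1 = .FOLD_LEFT_PLUS (red_0, vec_def_0);
     red_2 = .FOLD_LEFT_PLUS (red_1, vec_def_1);
     ...

   where each call folds the vector elements into the accumulator strictly
   from left to right, preserving the scalar FP evaluation order.  */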
6101
6102 static bool
6103 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
6104 stmt_vec_info stmt_info,
6105 gimple_stmt_iterator *gsi,
6106 gimple **vec_stmt, slp_tree slp_node,
6107 gimple *reduc_def_stmt,
6108 tree_code code, internal_fn reduc_fn,
6109 tree ops[3], tree vectype_in,
6110 int reduc_index, vec_loop_masks *masks)
6111 {
6112 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6113 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6114 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
6115
6116 int ncopies;
6117 if (slp_node)
6118 ncopies = 1;
6119 else
6120 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6121
6122 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
6123 gcc_assert (ncopies == 1);
6124 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6125
6126 if (slp_node)
6127 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6128 TYPE_VECTOR_SUBPARTS (vectype_in)));
6129
6130 tree op0 = ops[1 - reduc_index];
6131
6132 int group_size = 1;
6133 stmt_vec_info scalar_dest_def_info;
6134 auto_vec<tree> vec_oprnds0;
6135 if (slp_node)
6136 {
6137 auto_vec<vec<tree> > vec_defs (2);
6138 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6139 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
6140 vec_defs[0].release ();
6141 vec_defs[1].release ();
6142 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6143 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6144 }
6145 else
6146 {
6147 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6148 op0, &vec_oprnds0);
6149 scalar_dest_def_info = stmt_info;
6150 }
6151
6152 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6153 tree scalar_type = TREE_TYPE (scalar_dest);
6154 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6155
6156 int vec_num = vec_oprnds0.length ();
6157 gcc_assert (vec_num == 1 || slp_node);
6158 tree vec_elem_type = TREE_TYPE (vectype_out);
6159 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6160
6161 tree vector_identity = NULL_TREE;
6162 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6163 vector_identity = build_zero_cst (vectype_out);
6164
6165 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6166 int i;
6167 tree def0;
6168 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6169 {
6170 gimple *new_stmt;
6171 tree mask = NULL_TREE;
6172 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6173 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6174
6175 /* Handle MINUS by adding the negative. */
6176 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6177 {
6178 tree negated = make_ssa_name (vectype_out);
6179 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6180 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6181 def0 = negated;
6182 }
6183
6184 if (mask && mask_reduc_fn == IFN_LAST)
6185 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6186 vector_identity);
6187
6188 /* On the first iteration the input is simply the scalar phi
6189 result, and for subsequent iterations it is the output of
6190 the preceding operation. */
6191 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6192 {
6193 if (mask && mask_reduc_fn != IFN_LAST)
6194 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6195 def0, mask);
6196 else
6197 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6198 def0);
6199 /* For chained SLP reductions the output of the previous reduction
6200 operation serves as the input of the next. For the final statement
6201 the output cannot be a temporary - we reuse the original
6202 scalar destination of the last statement. */
6203 if (i != vec_num - 1)
6204 {
6205 gimple_set_lhs (new_stmt, scalar_dest_var);
6206 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6207 gimple_set_lhs (new_stmt, reduc_var);
6208 }
6209 }
6210 else
6211 {
6212 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6213 reduc_var, def0);
6214 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6215 /* Remove the statement, so that we can use the same code paths
6216 as for statements that we've just created. */
6217 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6218 gsi_remove (&tmp_gsi, true);
6219 }
6220
6221 if (i == vec_num - 1)
6222 {
6223 gimple_set_lhs (new_stmt, scalar_dest);
6224 vect_finish_replace_stmt (loop_vinfo,
6225 scalar_dest_def_info,
6226 new_stmt);
6227 }
6228 else
6229 vect_finish_stmt_generation (loop_vinfo,
6230 scalar_dest_def_info,
6231 new_stmt, gsi);
6232
6233 if (slp_node)
6234 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6235 else
6236 {
6237 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6238 *vec_stmt = new_stmt;
6239 }
6240 }
6241
6242 return true;
6243 }
6244
6245 /* Function is_nonwrapping_integer_induction.
6246
6247 Check if STMT_VINFO (which is part of loop LOOP) both increments and
6248 does not cause overflow. */
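/* For example (illustrative), an unsigned short IV with base 0 and step 4
   in a loop that may execute 20000 times would have to reach 80000, which
   needs more than 16 bits, so the check below returns false.  */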
6249
6250 static bool
6251 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
6252 {
6253 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6254 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6255 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6256 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6257 widest_int ni, max_loop_value, lhs_max;
6258 wi::overflow_type overflow = wi::OVF_NONE;
6259
6260 /* Make sure the loop is integer based. */
6261 if (TREE_CODE (base) != INTEGER_CST
6262 || TREE_CODE (step) != INTEGER_CST)
6263 return false;
6264
6265 /* Check that the max size of the loop will not wrap. */
6266
6267 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6268 return true;
6269
6270 if (! max_stmt_executions (loop, &ni))
6271 return false;
6272
6273 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6274 &overflow);
6275 if (overflow)
6276 return false;
6277
6278 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6279 TYPE_SIGN (lhs_type), &overflow);
6280 if (overflow)
6281 return false;
6282
6283 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6284 <= TYPE_PRECISION (lhs_type));
6285 }
6286
6287 /* Check if masking can be supported by inserting a conditional expression.
6288 CODE is the code for the operation. COND_FN is the conditional internal
6289 function, if it exists. VECTYPE_IN is the type of the vector input. */
6290 static bool
6291 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
6292 tree vectype_in)
6293 {
6294 if (cond_fn != IFN_LAST
6295 && direct_internal_fn_supported_p (cond_fn, vectype_in,
6296 OPTIMIZE_FOR_SPEED))
6297 return false;
6298
6299 switch (code)
6300 {
6301 case DOT_PROD_EXPR:
6302 case SAD_EXPR:
6303 return true;
6304
6305 default:
6306 return false;
6307 }
6308 }
6309
6310 /* Insert a conditional expression to enable masked vectorization. CODE is the
6311 code for the operation. VOP is the array of operands. MASK is the loop
6312 mask. GSI is a statement iterator used to place the new conditional
6313 expression. */
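/* For example (illustrative), with loop mask M the selects built below
   give DOT_PROD_EXPR <a, M ? b : 0, acc>, whose inactive lanes contribute
   a * 0 == 0, and SAD_EXPR <a, M ? b : a, acc>, whose inactive lanes
   contribute |a - a| == 0.  */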
6314 static void
6315 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
6316 gimple_stmt_iterator *gsi)
6317 {
6318 switch (code)
6319 {
6320 case DOT_PROD_EXPR:
6321 {
6322 tree vectype = TREE_TYPE (vop[1]);
6323 tree zero = build_zero_cst (vectype);
6324 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6325 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6326 mask, vop[1], zero);
6327 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6328 vop[1] = masked_op1;
6329 break;
6330 }
6331
6332 case SAD_EXPR:
6333 {
6334 tree vectype = TREE_TYPE (vop[1]);
6335 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6336 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6337 mask, vop[1], vop[0]);
6338 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6339 vop[1] = masked_op1;
6340 break;
6341 }
6342
6343 default:
6344 gcc_unreachable ();
6345 }
6346 }
6347
6348 /* Function vectorizable_reduction.
6349
6350 Check if STMT_INFO performs a reduction operation that can be vectorized.
6351 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6352 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6353 Return true if STMT_INFO is vectorizable in this way.
6354
6355 This function also handles reduction idioms (patterns) that have been
6356 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6357 may be of this form:
6358 X = pattern_expr (arg0, arg1, ..., X)
6359 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6360 sequence that had been detected and replaced by the pattern-stmt
6361 (STMT_INFO).
6362
6363 This function also handles reduction of condition expressions, for example:
6364 for (int i = 0; i < N; i++)
6365 if (a[i] < value)
6366 last = a[i];
6367 This is handled by vectorizing the loop and creating an additional vector
6368 containing the loop indexes for which "a[i] < value" was true. In the
6369 function epilogue this is reduced to a single max value and then used to
6370 index into the vector of results.
6371
6372 In some cases of reduction patterns, the type of the reduction variable X is
6373 different than the type of the other arguments of STMT_INFO.
6374 In such cases, the vectype that is used when transforming STMT_INFO into
6375 a vector stmt is different than the vectype that is used to determine the
6376 vectorization factor, because it consists of a different number of elements
6377 than the actual number of elements that are being operated upon in parallel.
6378
6379 For example, consider an accumulation of shorts into an int accumulator.
6380 On some targets it's possible to vectorize this pattern operating on 8
6381 shorts at a time (hence, the vectype for purposes of determining the
6382 vectorization factor should be V8HI); on the other hand, the vectype that
6383 is used to create the vector form is actually V4SI (the type of the result).
6384
6385 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6386 indicates what is the actual level of parallelism (V8HI in the example), so
6387 that the right vectorization factor would be derived. This vectype
6388 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6389 be used to create the vectorized stmt. The right vectype for the vectorized
6390 stmt is obtained from the type of the result X:
6391 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6392
6393 This means that, contrary to "regular" reductions (or "regular" stmts in
6394 general), the following equation:
6395 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
6396 does *NOT* necessarily hold for reduction patterns. */
6397
6398 bool
6399 vectorizable_reduction (loop_vec_info loop_vinfo,
6400 stmt_vec_info stmt_info, slp_tree slp_node,
6401 slp_instance slp_node_instance,
6402 stmt_vector_for_cost *cost_vec)
6403 {
6404 tree scalar_dest;
6405 tree vectype_in = NULL_TREE;
6406 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6407 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
6408 stmt_vec_info cond_stmt_vinfo = NULL;
6409 tree scalar_type;
6410 int i;
6411 int ncopies;
6412 bool single_defuse_cycle = false;
6413 bool nested_cycle = false;
6414 bool double_reduc = false;
6415 int vec_num;
6416 tree tem;
6417 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6418 tree cond_reduc_val = NULL_TREE;
6419
6420 /* Make sure it was already recognized as a reduction computation. */
6421 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6422 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6423 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6424 return false;
6425
6426 /* The stmt we store reduction analysis meta on. */
6427 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6428 reduc_info->is_reduc_info = true;
6429
6430 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6431 {
6432 if (is_a <gphi *> (stmt_info->stmt))
6433 {
6434 if (slp_node)
6435 {
6436 /* We eventually need to set a vector type on invariant
6437 arguments. */
6438 unsigned j;
6439 slp_tree child;
6440 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
6441 if (!vect_maybe_update_slp_op_vectype
6442 (child, SLP_TREE_VECTYPE (slp_node)))
6443 {
6444 if (dump_enabled_p ())
6445 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6446 "incompatible vector types for "
6447 "invariants\n");
6448 return false;
6449 }
6450 }
6451 /* Analysis for double-reduction is done on the outer
6452 loop PHI, nested cycles have no further restrictions. */
6453 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6454 }
6455 else
6456 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6457 return true;
6458 }
6459
6460 stmt_vec_info orig_stmt_of_analysis = stmt_info;
6461 stmt_vec_info phi_info = stmt_info;
6462 if (!is_a <gphi *> (stmt_info->stmt))
6463 {
6464 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6465 return true;
6466 }
6467 if (slp_node)
6468 {
6469 slp_node_instance->reduc_phis = slp_node;
6470 /* ??? We're leaving slp_node to point to the PHIs; we only
6471 need it to get at the number of vector stmts which wasn't
6472 yet initialized for the instance root. */
6473 }
6474 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6475 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
6476 else
6477 {
6478 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info)
6479 == vect_double_reduction_def);
6480 use_operand_p use_p;
6481 gimple *use_stmt;
6482 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6483 &use_p, &use_stmt);
6484 gcc_assert (res);
6485 phi_info = loop_vinfo->lookup_stmt (use_stmt);
6486 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6487 }
6488
6489 /* PHIs should not participate in patterns. */
6490 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6491 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6492
6493 /* Verify that following REDUC_IDX from the latch def leads us back to the PHI
6494 and compute the reduction chain length. Discover the real
6495 reduction operation stmt on the way (stmt_info and slp_for_stmt_info). */
6496 tree reduc_def
6497 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6498 loop_latch_edge
6499 (gimple_bb (reduc_def_phi)->loop_father));
6500 unsigned reduc_chain_length = 0;
6501 bool only_slp_reduc_chain = true;
6502 stmt_info = NULL;
6503 slp_tree slp_for_stmt_info = slp_node ? slp_node_instance->root : NULL;
6504 while (reduc_def != PHI_RESULT (reduc_def_phi))
6505 {
6506 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6507 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6508 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6509 {
6510 if (dump_enabled_p ())
6511 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6512 "reduction chain broken by patterns.\n");
6513 return false;
6514 }
6515 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6516 only_slp_reduc_chain = false;
6517 /* ??? For epilogue generation live members of the chain need
6518 to point back to the PHI via their original stmt for
6519 info_for_reduction to work. */
6520 if (STMT_VINFO_LIVE_P (vdef))
6521 STMT_VINFO_REDUC_DEF (def) = phi_info;
6522 gassign *assign = dyn_cast <gassign *> (vdef->stmt);
6523 if (!assign)
6524 {
6525 if (dump_enabled_p ())
6526 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6527 "reduction chain includes calls.\n");
6528 return false;
6529 }
6530 if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
6531 {
6532 if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (assign)),
6533 TREE_TYPE (gimple_assign_rhs1 (assign))))
6534 {
6535 if (dump_enabled_p ())
6536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6537 "conversion in the reduction chain.\n");
6538 return false;
6539 }
6540 }
6541 else if (!stmt_info)
6542 /* First non-conversion stmt. */
6543 stmt_info = vdef;
6544 reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
6545 reduc_chain_length++;
6546 if (!stmt_info && slp_node)
6547 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6548 }
6549 /* PHIs should not participate in patterns. */
6550 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6551
6552 if (nested_in_vect_loop_p (loop, stmt_info))
6553 {
6554 loop = loop->inner;
6555 nested_cycle = true;
6556 }
6557
6558 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6559 element. */
6560 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6561 {
6562 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6563 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6564 }
6565 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6566 gcc_assert (slp_node
6567 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6568
6569 /* 1. Is vectorizable reduction? */
6570 /* Not supportable if the reduction variable is used in the loop, unless
6571 it's a reduction chain. */
6572 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6573 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6574 return false;
6575
6576 /* Reductions that are not used even in an enclosing outer-loop
6577 are expected to be "live" (used out of the loop). */
6578 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6579 && !STMT_VINFO_LIVE_P (stmt_info))
6580 return false;
6581
6582 /* 2. Has this been recognized as a reduction pattern?
6583
6584 Check if STMT represents a pattern that has been recognized
6585 in earlier analysis stages. For stmts that represent a pattern,
6586 the STMT_VINFO_RELATED_STMT field records the last stmt in
6587 the original sequence that constitutes the pattern. */
6588
6589 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6590 if (orig_stmt_info)
6591 {
6592 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6593 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6594 }
6595
6596 /* 3. Check the operands of the operation. The first operands are defined
6597 inside the loop body. The last operand is the reduction variable,
6598 which is defined by the loop-header-phi. */
6599
6600 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6601 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6602 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6603 enum tree_code code = gimple_assign_rhs_code (stmt);
6604 bool lane_reduc_code_p
6605 = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR);
6606 int op_type = TREE_CODE_LENGTH (code);
6607 enum optab_subtype optab_query_kind = optab_vector;
6608 if (code == DOT_PROD_EXPR
6609 && TYPE_SIGN (TREE_TYPE (gimple_assign_rhs1 (stmt)))
6610 != TYPE_SIGN (TREE_TYPE (gimple_assign_rhs2 (stmt))))
6611 optab_query_kind = optab_vector_mixed_sign;
6612
6613
6614 scalar_dest = gimple_assign_lhs (stmt);
6615 scalar_type = TREE_TYPE (scalar_dest);
6616 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6617 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6618 return false;
6619
6620 /* Do not try to vectorize bit-precision reductions. */
6621 if (!type_has_mode_precision_p (scalar_type))
6622 return false;
6623
6624 /* For lane-reducing ops we're reducing the number of reduction PHIs, so
6625 the only use of the reduction PHI may be in the lane-reducing operation. */
6626 if (lane_reduc_code_p
6627 && reduc_chain_length != 1
6628 && !only_slp_reduc_chain)
6629 {
6630 if (dump_enabled_p ())
6631 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6632 "lane-reducing reduction with extra stmts.\n");
6633 return false;
6634 }
6635
6636 /* All uses but the last are expected to be defined in the loop.
6637 The last use is the reduction variable. In case of nested cycle this
6638 assumption is not true: we use reduc_index to record the index of the
6639 reduction variable. */
6640 slp_tree *slp_op = XALLOCAVEC (slp_tree, op_type);
6641 /* We need to skip an extra operand for COND_EXPRs with embedded
6642 comparison. */
6643 unsigned opno_adjust = 0;
6644 if (code == COND_EXPR
6645 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
6646 opno_adjust = 1;
6647 for (i = 0; i < op_type; i++)
6648 {
6649 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6650 if (i == 0 && code == COND_EXPR)
6651 continue;
6652
6653 stmt_vec_info def_stmt_info;
6654 enum vect_def_type dt;
6655 tree op;
6656 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
6657 i + opno_adjust, &op, &slp_op[i], &dt, &tem,
6658 &def_stmt_info))
6659 {
6660 if (dump_enabled_p ())
6661 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6662 "use not simple.\n");
6663 return false;
6664 }
6665 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6666 continue;
6667
6668 /* There should be only one cycle def in the stmt, the one
6669 leading to reduc_def. */
6670 if (VECTORIZABLE_CYCLE_DEF (dt))
6671 return false;
6672
6673 /* To properly compute ncopies we are interested in the widest
6674 non-reduction input type in case we're looking at a widening
6675 accumulation that we later handle in vect_transform_reduction. */
6676 if (lane_reduc_code_p
6677 && tem
6678 && (!vectype_in
6679 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6680 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6681 vectype_in = tem;
6682
6683 if (code == COND_EXPR)
6684 {
6685 /* Record how the non-reduction-def value of COND_EXPR is defined. */
6686 if (dt == vect_constant_def)
6687 {
6688 cond_reduc_dt = dt;
6689 cond_reduc_val = op;
6690 }
6691 if (dt == vect_induction_def
6692 && def_stmt_info
6693 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6694 {
6695 cond_reduc_dt = dt;
6696 cond_stmt_vinfo = def_stmt_info;
6697 }
6698 }
6699 }
6700 if (!vectype_in)
6701 vectype_in = STMT_VINFO_VECTYPE (phi_info);
6702 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6703
6704 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6705 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
6706 /* If we have a condition reduction, see if we can simplify it further. */
6707 if (v_reduc_type == COND_REDUCTION)
6708 {
6709 if (slp_node)
6710 return false;
6711
6712 /* If the condition itself uses the reduction value, fail. */
6713 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6714 {
6715 if (dump_enabled_p ())
6716 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6717 "condition depends on previous iteration\n");
6718 return false;
6719 }
6720
6721 if (reduc_chain_length == 1
6722 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6723 vectype_in, OPTIMIZE_FOR_SPEED))
6724 {
6725 if (dump_enabled_p ())
6726 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6727 "optimizing condition reduction with"
6728 " FOLD_EXTRACT_LAST.\n");
6729 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6730 }
6731 else if (cond_reduc_dt == vect_induction_def)
6732 {
6733 tree base
6734 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6735 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6736
6737 gcc_assert (TREE_CODE (base) == INTEGER_CST
6738 && TREE_CODE (step) == INTEGER_CST);
6739 cond_reduc_val = NULL_TREE;
6740 enum tree_code cond_reduc_op_code = ERROR_MARK;
6741 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6742 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6743 ;
6744 /* Find a suitable value: below base for MAX_EXPR, above base for
6745 MIN_EXPR; for now punt if base is the type's minimum value (for
6746 MAX_EXPR) or its maximum value (for MIN_EXPR). */
6747 else if (tree_int_cst_sgn (step) == -1)
6748 {
6749 cond_reduc_op_code = MIN_EXPR;
6750 if (tree_int_cst_sgn (base) == -1)
6751 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6752 else if (tree_int_cst_lt (base,
6753 TYPE_MAX_VALUE (TREE_TYPE (base))))
6754 cond_reduc_val
6755 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6756 }
6757 else
6758 {
6759 cond_reduc_op_code = MAX_EXPR;
6760 if (tree_int_cst_sgn (base) == 1)
6761 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6762 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6763 base))
6764 cond_reduc_val
6765 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6766 }
6767 if (cond_reduc_val)
6768 {
6769 if (dump_enabled_p ())
6770 dump_printf_loc (MSG_NOTE, vect_location,
6771 "condition expression based on "
6772 "integer induction.\n");
6773 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6774 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6775 = cond_reduc_val;
6776 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6777 }
6778 }
6779 else if (cond_reduc_dt == vect_constant_def)
6780 {
6781 enum vect_def_type cond_initial_dt;
6782 tree cond_initial_val = vect_phi_initial_value (reduc_def_phi);
6783 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6784 if (cond_initial_dt == vect_constant_def
6785 && types_compatible_p (TREE_TYPE (cond_initial_val),
6786 TREE_TYPE (cond_reduc_val)))
6787 {
6788 tree e = fold_binary (LE_EXPR, boolean_type_node,
6789 cond_initial_val, cond_reduc_val);
6790 if (e && (integer_onep (e) || integer_zerop (e)))
6791 {
6792 if (dump_enabled_p ())
6793 dump_printf_loc (MSG_NOTE, vect_location,
6794 "condition expression based on "
6795 "compile time constant.\n");
6796 /* Record reduction code at analysis stage. */
6797 STMT_VINFO_REDUC_CODE (reduc_info)
6798 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6799 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6800 }
6801 }
6802 }
6803 }
6804
6805 if (STMT_VINFO_LIVE_P (phi_info))
6806 return false;
6807
6808 if (slp_node)
6809 ncopies = 1;
6810 else
6811 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6812
6813 gcc_assert (ncopies >= 1);
6814
6815 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6816
6817 if (nested_cycle)
6818 {
6819 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6820 == vect_double_reduction_def);
6821 double_reduc = true;
6822 }
6823
6824 /* 4.2. Check support for the epilog operation.
6825
6826 If STMT represents a reduction pattern, then the type of the
6827 reduction variable may be different than the type of the rest
6828 of the arguments. For example, consider the case of accumulation
6829 of shorts into an int accumulator; the original code:
6830 S1: int_a = (int) short_a;
6831 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6832
6833 was replaced with:
6834 STMT: int_acc = widen_sum <short_a, int_acc>
6835
6836 This means that:
6837 1. The tree-code that is used to create the vector operation in the
6838 epilog code (that reduces the partial results) is not the
6839 tree-code of STMT, but is rather the tree-code of the original
6840 stmt from the pattern that STMT is replacing. I.e, in the example
6841 above we want to use 'widen_sum' in the loop, but 'plus' in the
6842 epilog.
6843 2. The type (mode) we use to check available target support
6844 for the vector operation to be created in the *epilog*, is
6845 determined by the type of the reduction variable (in the example
6846 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6847 However the type (mode) we use to check available target support
6848 for the vector operation to be created *inside the loop*, is
6849 determined by the type of the other arguments to STMT (in the
6850 example we'd check this: optab_handler (widen_sum_optab,
6851 vect_short_mode)).
6852
6853 This is contrary to "regular" reductions, in which the types of all
6854 the arguments are the same as the type of the reduction variable.
6855 For "regular" reductions we can therefore use the same vector type
6856 (and also the same tree-code) when generating the epilog code and
6857 when generating the code inside the loop. */
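/* A hedged sketch of the shapes this leads to for the widen_sum example
(vector type names again only illustrative):

loop body: vacc_v4si = WIDEN_SUM <vshort_v8hi, vacc_v4si>;
epilogue: sum = <reduce vacc_v4si with PLUS, e.g. via IFN_REDUC_PLUS>;

i.e. the loop statement is checked against widening-sum support for the
V8HI input type, the epilogue against plain addition/reduction support
for the V4SI result type. */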
6858
6859 enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info);
6860 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6861
6862 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6863 if (reduction_type == TREE_CODE_REDUCTION)
6864 {
6865 /* Check whether it's ok to change the order of the computation.
6866 Generally, when vectorizing a reduction we change the order of the
6867 computation. This may change the behavior of the program in some
6868 cases, so we need to check that this is ok. One exception is when
6869 vectorizing an outer-loop: the inner-loop is executed sequentially,
6870 and therefore vectorizing reductions in the inner-loop during
6871 outer-loop vectorization is safe. Likewise when we are vectorizing
6872 a series of reductions using SLP and the VF is one, the reductions
6873 are performed in scalar order. */
6874 if (slp_node
6875 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6876 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), 1u))
6877 ;
6878 else if (needs_fold_left_reduction_p (scalar_type, orig_code))
6879 {
6880 /* When vectorizing a reduction chain w/o SLP the reduction PHI
6881 is not directly used in stmt. */
6882 if (!only_slp_reduc_chain
6883 && reduc_chain_length != 1)
6884 {
6885 if (dump_enabled_p ())
6886 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6887 "in-order reduction chain without SLP.\n");
6888 return false;
6889 }
6890 STMT_VINFO_REDUC_TYPE (reduc_info)
6891 = reduction_type = FOLD_LEFT_REDUCTION;
6892 }
6893 else if (!commutative_tree_code (orig_code)
6894 || !associative_tree_code (orig_code))
6895 {
6896 if (dump_enabled_p ())
6897 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6898 "reduction: not commutative/associative");
6899 return false;
6900 }
6901 }
6902
6903 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6904 && ncopies > 1)
6905 {
6906 if (dump_enabled_p ())
6907 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6908 "multiple types in double reduction or condition "
6909 "reduction or fold-left reduction.\n");
6910 return false;
6911 }
6912
6913 internal_fn reduc_fn = IFN_LAST;
6914 if (reduction_type == TREE_CODE_REDUCTION
6915 || reduction_type == FOLD_LEFT_REDUCTION
6916 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6917 || reduction_type == CONST_COND_REDUCTION)
6918 {
6919 if (reduction_type == FOLD_LEFT_REDUCTION
6920 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6921 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6922 {
6923 if (reduc_fn != IFN_LAST
6924 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6925 OPTIMIZE_FOR_SPEED))
6926 {
6927 if (dump_enabled_p ())
6928 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6929 "reduc op not supported by target.\n");
6930
6931 reduc_fn = IFN_LAST;
6932 }
6933 }
6934 else
6935 {
6936 if (!nested_cycle || double_reduc)
6937 {
6938 if (dump_enabled_p ())
6939 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6940 "no reduc code for scalar code.\n");
6941
6942 return false;
6943 }
6944 }
6945 }
6946 else if (reduction_type == COND_REDUCTION)
6947 {
6948 int scalar_precision
6949 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6950 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6951 cr_index_vector_type = get_same_sized_vectype (cr_index_scalar_type,
6952 vectype_out);
6953
6954 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6955 OPTIMIZE_FOR_SPEED))
6956 reduc_fn = IFN_REDUC_MAX;
6957 }
6958 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
6959
6960 if (reduction_type != EXTRACT_LAST_REDUCTION
6961 && (!nested_cycle || double_reduc)
6962 && reduc_fn == IFN_LAST
6963 && !nunits_out.is_constant ())
6964 {
6965 if (dump_enabled_p ())
6966 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6967 "missing target support for reduction on"
6968 " variable-length vectors.\n");
6969 return false;
6970 }
6971
6972 /* For SLP reductions, see if there is a neutral value we can use. */
6973 tree neutral_op = NULL_TREE;
6974 if (slp_node)
6975 {
6976 tree initial_value = NULL_TREE;
6977 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL)
6978 initial_value = vect_phi_initial_value (reduc_def_phi);
6979 neutral_op = neutral_op_for_reduction (TREE_TYPE (vectype_out),
6980 orig_code, initial_value);
6981 }
6982
6983 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6984 {
6985 /* We can't support in-order reductions of code such as this:
6986
6987 for (int i = 0; i < n1; ++i)
6988 for (int j = 0; j < n2; ++j)
6989 l += a[j];
6990
6991 since GCC effectively transforms the loop when vectorizing:
6992
6993 for (int i = 0; i < n1 / VF; ++i)
6994 for (int j = 0; j < n2; ++j)
6995 for (int k = 0; k < VF; ++k)
6996 l += a[j];
6997
6998 which is a reassociation of the original operation. */
6999 if (dump_enabled_p ())
7000 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7001 "in-order double reduction not supported.\n");
7002
7003 return false;
7004 }
7005
7006 if (reduction_type == FOLD_LEFT_REDUCTION
7007 && slp_node
7008 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7009 {
7010 /* We cannot use in-order reductions in this case because there is
7011 an implicit reassociation of the operations involved. */
7012 if (dump_enabled_p ())
7013 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7014 "in-order unchained SLP reductions not supported.\n");
7015 return false;
7016 }
7017
7018 /* For double reductions, and for SLP reductions with a neutral value,
7019 we construct a variable-length initial vector by loading a vector
7020 full of the neutral value and then shift-and-inserting the start
7021 values into the low-numbered elements. */
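/* A sketch for a PLUS reduction with start value s and neutral value 0:
the initial vector is built as VEC_SHL_INSERT ({0, ..., 0}, s), giving
{s, 0, ..., 0}, hence the IFN_VEC_SHL_INSERT requirement checked below. */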
7022 if ((double_reduc || neutral_op)
7023 && !nunits_out.is_constant ()
7024 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7025 vectype_out, OPTIMIZE_FOR_SPEED))
7026 {
7027 if (dump_enabled_p ())
7028 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7029 "reduction on variable-length vectors requires"
7030 " target support for a vector-shift-and-insert"
7031 " operation.\n");
7032 return false;
7033 }
7034
7035 /* Check extra constraints for variable-length unchained SLP reductions. */
7036 if (STMT_SLP_TYPE (stmt_info)
7037 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7038 && !nunits_out.is_constant ())
7039 {
7040 /* We checked above that we could build the initial vector when
7041 there's a neutral element value. Check here for the case in
7042 which each SLP statement has its own initial value and in which
7043 that value needs to be repeated for every instance of the
7044 statement within the initial vector. */
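/* For instance (a sketch): a group of two MAX reductions with start values
a and b gets no neutral value here, so the initial vector has to look like
{ a, b, a, b, ... }; repeating the start values is harmless for MAX, and
building such a vector requires can_duplicate_and_interleave_p to hold. */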
7045 unsigned int group_size = SLP_TREE_LANES (slp_node);
7046 if (!neutral_op
7047 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
7048 TREE_TYPE (vectype_out)))
7049 {
7050 if (dump_enabled_p ())
7051 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7052 "unsupported form of SLP reduction for"
7053 " variable-length vectors: cannot build"
7054 " initial vector.\n");
7055 return false;
7056 }
7057 /* The epilogue code relies on the number of elements being a multiple
7058 of the group size. The duplicate-and-interleave approach to setting
7059 up the initial vector does too. */
7060 if (!multiple_p (nunits_out, group_size))
7061 {
7062 if (dump_enabled_p ())
7063 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7064 "unsupported form of SLP reduction for"
7065 " variable-length vectors: the vector size"
7066 " is not a multiple of the number of results.\n");
7067 return false;
7068 }
7069 }
7070
7071 if (reduction_type == COND_REDUCTION)
7072 {
7073 widest_int ni;
7074
7075 if (! max_loop_iterations (loop, &ni))
7076 {
7077 if (dump_enabled_p ())
7078 dump_printf_loc (MSG_NOTE, vect_location,
7079 "loop count not known, cannot create cond "
7080 "reduction.\n");
7081 return false;
7082 }
7083 /* Convert backedges to iterations. */
7084 ni += 1;
7085
7086 /* The additional index will be the same type as the condition. Check
7087 that the loop iteration count fits into this type less one (the zero
7088 slot is reserved for the case in which no lanes match). */
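/* For example, with a 16-bit condition type the index type is a 16-bit
unsigned type whose maximum value is 65535, so loops that may run for
65535 or more iterations are rejected here. */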
7089 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7090 if (wi::geu_p (ni, wi::to_widest (max_index)))
7091 {
7092 if (dump_enabled_p ())
7093 dump_printf_loc (MSG_NOTE, vect_location,
7094 "loop size is greater than data size.\n");
7095 return false;
7096 }
7097 }
7098
7099 /* In case the vectorization factor (VF) is bigger than the number
7100 of elements that we can fit in a vectype (nunits), we have to generate
7101 more than one vector stmt, i.e. we need to "unroll" the
7102 vector stmt by a factor VF/nunits. For more details see documentation
7103 in vectorizable_operation. */
7104
7105 /* If the reduction is used in an outer loop we need to generate
7106 VF intermediate results, like so (e.g. for ncopies=2):
7107 r0 = phi (init, r0)
7108 r1 = phi (init, r1)
7109 r0 = x0 + r0;
7110 r1 = x1 + r1;
7111 (i.e. we generate VF results in 2 registers).
7112 In this case we have a separate def-use cycle for each copy, and therefore
7113 for each copy we get the vector def for the reduction variable from the
7114 respective phi node created for this copy.
7115
7116 Otherwise (the reduction is unused in the loop nest), we can combine
7117 together intermediate results, like so (e.g. for ncopies=2):
7118 r = phi (init, r)
7119 r = x0 + r;
7120 r = x1 + r;
7121 (i.e. we generate VF/2 results in a single register).
7122 In this case for each copy we get the vector def for the reduction variable
7123 from the vectorized reduction operation generated in the previous iteration.
7124
7125 This only works when we see both the reduction PHI and its only consumer
7126 in vectorizable_reduction and there are no intermediate stmts
7127 participating. */
7128 if (ncopies > 1
7129 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7130 && reduc_chain_length == 1)
7131 single_defuse_cycle = true;
7132
7133 if (single_defuse_cycle || lane_reduc_code_p)
7134 {
7135 gcc_assert (code != COND_EXPR);
7136
7137 /* 4. Supportable by target? */
7138 bool ok = true;
7139
7140 /* 4.1. check support for the operation in the loop */
7141 optab optab = optab_for_tree_code (code, vectype_in, optab_query_kind);
7142 if (!optab)
7143 {
7144 if (dump_enabled_p ())
7145 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7146 "no optab.\n");
7147 ok = false;
7148 }
7149
7150 machine_mode vec_mode = TYPE_MODE (vectype_in);
7151 if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing)
7152 {
7153 if (dump_enabled_p ())
7154 dump_printf (MSG_NOTE, "op not supported by target.\n");
7155 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
7156 || !vect_can_vectorize_without_simd_p (code))
7157 ok = false;
7158 else
7159 if (dump_enabled_p ())
7160 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
7161 }
7162
7163 if (vect_emulated_vector_p (vectype_in)
7164 && !vect_can_vectorize_without_simd_p (code))
7165 {
7166 if (dump_enabled_p ())
7167 dump_printf (MSG_NOTE, "using word mode not possible.\n");
7168 return false;
7169 }
7170
7171 /* lane-reducing operations have to go through vect_transform_reduction.
7172 For the other cases try without the single cycle optimization. */
7173 if (!ok)
7174 {
7175 if (lane_reduc_code_p)
7176 return false;
7177 else
7178 single_defuse_cycle = false;
7179 }
7180 }
7181 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
7182
7183 /* If the reduction stmt is one of the patterns that have lane
7184 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
7185 if ((ncopies > 1 && ! single_defuse_cycle)
7186 && lane_reduc_code_p)
7187 {
7188 if (dump_enabled_p ())
7189 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7190 "multi def-use cycle not possible for lane-reducing "
7191 "reduction operation\n");
7192 return false;
7193 }
7194
7195 if (slp_node
7196 && !(!single_defuse_cycle
7197 && code != DOT_PROD_EXPR
7198 && code != WIDEN_SUM_EXPR
7199 && code != SAD_EXPR
7200 && reduction_type != FOLD_LEFT_REDUCTION))
7201 for (i = 0; i < op_type; i++)
7202 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_in))
7203 {
7204 if (dump_enabled_p ())
7205 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7206 "incompatible vector types for invariants\n");
7207 return false;
7208 }
7209
7210 if (slp_node)
7211 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7212 else
7213 vec_num = 1;
7214
7215 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
7216 reduction_type, ncopies, cost_vec);
7217 /* Cost the reduction op inside the loop if transformed via
7218 vect_transform_reduction. Otherwise this is costed by the
7219 separate vectorizable_* routines. */
7220 if (single_defuse_cycle
7221 || code == DOT_PROD_EXPR
7222 || code == WIDEN_SUM_EXPR
7223 || code == SAD_EXPR)
7224 record_stmt_cost (cost_vec, ncopies, vector_stmt, stmt_info, 0, vect_body);
7225
7226 if (dump_enabled_p ()
7227 && reduction_type == FOLD_LEFT_REDUCTION)
7228 dump_printf_loc (MSG_NOTE, vect_location,
7229 "using an in-order (fold-left) reduction.\n");
7230 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
7231 /* All but single defuse-cycle optimized, lane-reducing and fold-left
7232 reductions go through their own vectorizable_* routines. */
7233 if (!single_defuse_cycle
7234 && code != DOT_PROD_EXPR
7235 && code != WIDEN_SUM_EXPR
7236 && code != SAD_EXPR
7237 && reduction_type != FOLD_LEFT_REDUCTION)
7238 {
7239 stmt_vec_info tem
7240 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
7241 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
7242 {
7243 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
7244 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
7245 }
7246 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
7247 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
7248 }
7249 else if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
7250 {
7251 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7252 internal_fn cond_fn = get_conditional_internal_fn (code);
7253
7254 if (reduction_type != FOLD_LEFT_REDUCTION
7255 && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
7256 && (cond_fn == IFN_LAST
7257 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7258 OPTIMIZE_FOR_SPEED)))
7259 {
7260 if (dump_enabled_p ())
7261 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7262 "can't operate on partial vectors because"
7263 " no conditional operation is available.\n");
7264 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7265 }
7266 else if (reduction_type == FOLD_LEFT_REDUCTION
7267 && reduc_fn == IFN_LAST
7268 && !expand_vec_cond_expr_p (vectype_in,
7269 truth_type_for (vectype_in),
7270 SSA_NAME))
7271 {
7272 if (dump_enabled_p ())
7273 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7274 "can't operate on partial vectors because"
7275 " no conditional operation is available.\n");
7276 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
7277 }
7278 else
7279 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7280 vectype_in, NULL);
7281 }
7282 return true;
7283 }
7284
7285 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
7286 value. */
7287
7288 bool
7289 vect_transform_reduction (loop_vec_info loop_vinfo,
7290 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
7291 gimple **vec_stmt, slp_tree slp_node)
7292 {
7293 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7294 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7295 int i;
7296 int ncopies;
7297 int vec_num;
7298
7299 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7300 gcc_assert (reduc_info->is_reduc_info);
7301
7302 if (nested_in_vect_loop_p (loop, stmt_info))
7303 {
7304 loop = loop->inner;
7305 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
7306 }
7307
7308 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
7309 enum tree_code code = gimple_assign_rhs_code (stmt);
7310 int op_type = TREE_CODE_LENGTH (code);
7311
7312 /* Flatten RHS. */
7313 tree ops[3];
7314 switch (get_gimple_rhs_class (code))
7315 {
7316 case GIMPLE_TERNARY_RHS:
7317 ops[2] = gimple_assign_rhs3 (stmt);
7318 /* Fall thru. */
7319 case GIMPLE_BINARY_RHS:
7320 ops[0] = gimple_assign_rhs1 (stmt);
7321 ops[1] = gimple_assign_rhs2 (stmt);
7322 break;
7323 default:
7324 gcc_unreachable ();
7325 }
7326
7327 /* All uses but the last are expected to be defined in the loop.
7328 The last use is the reduction variable. In case of nested cycle this
7329 assumption is not true: we use reduc_index to record the index of the
7330 reduction variable. */
7331 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
7332 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
7333 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
7334 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7335
7336 if (slp_node)
7337 {
7338 ncopies = 1;
7339 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7340 }
7341 else
7342 {
7343 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7344 vec_num = 1;
7345 }
7346
7347 internal_fn cond_fn = get_conditional_internal_fn (code);
7348 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7349 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7350
7351 /* Transform. */
7352 tree new_temp = NULL_TREE;
7353 auto_vec<tree> vec_oprnds0;
7354 auto_vec<tree> vec_oprnds1;
7355 auto_vec<tree> vec_oprnds2;
7356 tree def0;
7357
7358 if (dump_enabled_p ())
7359 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7360
7361 /* FORNOW: Multiple types are not supported for condition. */
7362 if (code == COND_EXPR)
7363 gcc_assert (ncopies == 1);
7364
7365 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7366
7367 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
7368 if (reduction_type == FOLD_LEFT_REDUCTION)
7369 {
7370 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
7371 return vectorize_fold_left_reduction
7372 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7373 reduc_fn, ops, vectype_in, reduc_index, masks);
7374 }
7375
7376 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
7377 gcc_assert (single_defuse_cycle
7378 || code == DOT_PROD_EXPR
7379 || code == WIDEN_SUM_EXPR
7380 || code == SAD_EXPR);
7381
7382 /* Create the destination vector */
7383 tree scalar_dest = gimple_assign_lhs (stmt);
7384 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7385
7386 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
7387 single_defuse_cycle && reduc_index == 0
7388 ? NULL_TREE : ops[0], &vec_oprnds0,
7389 single_defuse_cycle && reduc_index == 1
7390 ? NULL_TREE : ops[1], &vec_oprnds1,
7391 op_type == ternary_op
7392 && !(single_defuse_cycle && reduc_index == 2)
7393 ? ops[2] : NULL_TREE, &vec_oprnds2);
7394 if (single_defuse_cycle)
7395 {
7396 gcc_assert (!slp_node);
7397 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7398 ops[reduc_index],
7399 reduc_index == 0 ? &vec_oprnds0
7400 : (reduc_index == 1 ? &vec_oprnds1
7401 : &vec_oprnds2));
7402 }
7403
7404 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7405 {
7406 gimple *new_stmt;
7407 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7408 if (masked_loop_p && !mask_by_cond_expr)
7409 {
7410 /* Make sure that the reduction accumulator is vop[0]. */
7411 if (reduc_index == 1)
7412 {
7413 gcc_assert (commutative_tree_code (code));
7414 std::swap (vop[0], vop[1]);
7415 }
7416 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7417 vectype_in, i);
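/* Emit the conditional form, e.g. COND_ADD (mask, vop[0], vop[1], vop[0]);
inactive lanes simply pass the accumulator vop[0] through unchanged. */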
7418 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7419 vop[0], vop[1], vop[0]);
7420 new_temp = make_ssa_name (vec_dest, call);
7421 gimple_call_set_lhs (call, new_temp);
7422 gimple_call_set_nothrow (call, true);
7423 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
7424 new_stmt = call;
7425 }
7426 else
7427 {
7428 if (op_type == ternary_op)
7429 vop[2] = vec_oprnds2[i];
7430
7431 if (masked_loop_p && mask_by_cond_expr)
7432 {
7433 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7434 vectype_in, i);
7435 build_vect_cond_expr (code, vop, mask, gsi);
7436 }
7437
7438 new_stmt = gimple_build_assign (vec_dest, code,
7439 vop[0], vop[1], vop[2]);
7440 new_temp = make_ssa_name (vec_dest, new_stmt);
7441 gimple_assign_set_lhs (new_stmt, new_temp);
7442 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
7443 }
7444
7445 if (slp_node)
7446 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7447 else if (single_defuse_cycle
7448 && i < ncopies - 1)
7449 {
7450 if (reduc_index == 0)
7451 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
7452 else if (reduc_index == 1)
7453 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
7454 else if (reduc_index == 2)
7455 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
7456 }
7457 else
7458 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7459 }
7460
7461 if (!slp_node)
7462 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7463
7464 return true;
7465 }
7466
7467 /* Transform phase of a cycle PHI. */
7468
7469 bool
7470 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7471 stmt_vec_info stmt_info, gimple **vec_stmt,
7472 slp_tree slp_node, slp_instance slp_node_instance)
7473 {
7474 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7475 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7476 int i;
7477 int ncopies;
7478 int j;
7479 bool nested_cycle = false;
7480 int vec_num;
7481
7482 if (nested_in_vect_loop_p (loop, stmt_info))
7483 {
7484 loop = loop->inner;
7485 nested_cycle = true;
7486 }
7487
7488 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7489 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7490 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7491 gcc_assert (reduc_info->is_reduc_info);
7492
7493 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7494 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7495 /* Leave the scalar phi in place. */
7496 return true;
7497
7498 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7499 /* For a nested cycle we do not fill the above. */
7500 if (!vectype_in)
7501 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7502 gcc_assert (vectype_in);
7503
7504 if (slp_node)
7505 {
7506 /* The size vect_schedule_slp_instance computes is off for us. */
7507 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7508 * SLP_TREE_LANES (slp_node), vectype_in);
7509 ncopies = 1;
7510 }
7511 else
7512 {
7513 vec_num = 1;
7514 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7515 }
7516
7517 /* Check whether we should use a single PHI node and accumulate
7518 vectors to one before the backedge. */
7519 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7520 ncopies = 1;
7521
7522 /* Create the destination vector */
7523 gphi *phi = as_a <gphi *> (stmt_info->stmt);
7524 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7525 vectype_out);
7526
7527 /* Get the loop-entry arguments. */
7528 tree vec_initial_def = NULL_TREE;
7529 auto_vec<tree> vec_initial_defs;
7530 if (slp_node)
7531 {
7532 vec_initial_defs.reserve (vec_num);
7533 if (nested_cycle)
7534 {
7535 unsigned phi_idx = loop_preheader_edge (loop)->dest_idx;
7536 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[phi_idx],
7537 &vec_initial_defs);
7538 }
7539 else
7540 {
7541 gcc_assert (slp_node == slp_node_instance->reduc_phis);
7542 vec<tree> &initial_values = reduc_info->reduc_initial_values;
7543 vec<stmt_vec_info> &stmts = SLP_TREE_SCALAR_STMTS (slp_node);
7544
7545 unsigned int num_phis = stmts.length ();
7546 if (REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
7547 num_phis = 1;
7548 initial_values.reserve (num_phis);
7549 for (unsigned int i = 0; i < num_phis; ++i)
7550 {
7551 gphi *this_phi = as_a<gphi *> (stmts[i]->stmt);
7552 initial_values.quick_push (vect_phi_initial_value (this_phi));
7553 }
7554 if (vec_num == 1)
7555 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
7556 if (!initial_values.is_empty ())
7557 {
7558 tree initial_value
7559 = (num_phis == 1 ? initial_values[0] : NULL_TREE);
7560 tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7561 tree neutral_op
7562 = neutral_op_for_reduction (TREE_TYPE (vectype_out),
7563 code, initial_value);
7564 get_initial_defs_for_reduction (loop_vinfo, reduc_info,
7565 &vec_initial_defs, vec_num,
7566 stmts.length (), neutral_op);
7567 }
7568 }
7569 }
7570 else
7571 {
7572 /* Get at the scalar def before the loop that defines the initial
7573 value of the reduction variable. */
7574 tree initial_def = vect_phi_initial_value (phi);
7575 reduc_info->reduc_initial_values.safe_push (initial_def);
7576 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7577 and we can't use zero for induc_val, use initial_def. Similarly
7578 for REDUC_MIN and initial_def larger than the base. */
7579 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7580 {
7581 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7582 if (TREE_CODE (initial_def) == INTEGER_CST
7583 && !integer_zerop (induc_val)
7584 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7585 && tree_int_cst_lt (initial_def, induc_val))
7586 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7587 && tree_int_cst_lt (induc_val, initial_def))))
7588 {
7589 induc_val = initial_def;
7590 /* Communicate that we used the initial_def to epilogue
7591 generation. */
7592 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7593 }
7594 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7595 }
7596 else if (nested_cycle)
7597 {
7598 /* Do not use an adjustment def as that case is not supported
7599 correctly if ncopies is not one. */
7600 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
7601 ncopies, initial_def,
7602 &vec_initial_defs);
7603 }
7604 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == CONST_COND_REDUCTION
7605 || STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
7606 /* Fill the initial vector with the initial scalar value. */
7607 vec_initial_def
7608 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info,
7609 initial_def, initial_def);
7610 else
7611 {
7612 if (ncopies == 1)
7613 vect_find_reusable_accumulator (loop_vinfo, reduc_info);
7614 if (!reduc_info->reduc_initial_values.is_empty ())
7615 {
7616 initial_def = reduc_info->reduc_initial_values[0];
7617 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7618 tree neutral_op
7619 = neutral_op_for_reduction (TREE_TYPE (initial_def),
7620 code, initial_def);
7621 gcc_assert (neutral_op);
7622 /* Try to simplify the vector initialization by applying an
7623 adjustment after the reduction has been performed. */
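/* E.g. a sum reduction starting at 10 can use a {0, ..., 0} vector
accumulator, with 10 added back to the reduced scalar in the epilogue. */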
7624 if (!reduc_info->reused_accumulator
7625 && STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
7626 && !operand_equal_p (neutral_op, initial_def))
7627 {
7628 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info)
7629 = initial_def;
7630 initial_def = neutral_op;
7631 }
7632 vec_initial_def
7633 = get_initial_def_for_reduction (loop_vinfo, reduc_info,
7634 initial_def, neutral_op);
7635 }
7636 }
7637 }
7638
7639 if (vec_initial_def)
7640 {
7641 vec_initial_defs.create (ncopies);
7642 for (i = 0; i < ncopies; ++i)
7643 vec_initial_defs.quick_push (vec_initial_def);
7644 }
7645
7646 if (auto *accumulator = reduc_info->reused_accumulator)
7647 {
7648 tree def = accumulator->reduc_input;
7649 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
7650 {
7651 unsigned int nreduc;
7652 bool res = constant_multiple_p (TYPE_VECTOR_SUBPARTS
7653 (TREE_TYPE (def)),
7654 TYPE_VECTOR_SUBPARTS (vectype_out),
7655 &nreduc);
7656 gcc_assert (res);
7657 gimple_seq stmts = NULL;
7658 /* Reduce the single vector to a smaller one. */
7659 if (nreduc != 1)
7660 {
7661 /* Perform the reduction in the appropriate type. */
7662 tree rvectype = vectype_out;
7663 if (!useless_type_conversion_p (TREE_TYPE (vectype_out),
7664 TREE_TYPE (TREE_TYPE (def))))
7665 rvectype = build_vector_type (TREE_TYPE (TREE_TYPE (def)),
7666 TYPE_VECTOR_SUBPARTS
7667 (vectype_out));
7668 def = vect_create_partial_epilog (def, rvectype,
7669 STMT_VINFO_REDUC_CODE
7670 (reduc_info),
7671 &stmts);
7672 }
7673 /* The epilogue loop might use a different vector mode, like
7674 VNx2DI vs. V2DI. */
7675 if (TYPE_MODE (vectype_out) != TYPE_MODE (TREE_TYPE (def)))
7676 {
7677 tree reduc_type = build_vector_type_for_mode
7678 (TREE_TYPE (TREE_TYPE (def)), TYPE_MODE (vectype_out));
7679 def = gimple_convert (&stmts, reduc_type, def);
7680 }
7681 /* Adjust the input so we pick up the partially reduced value
7682 for the skip edge in vect_create_epilog_for_reduction. */
7683 accumulator->reduc_input = def;
7684 /* And the reduction could be carried out using a different sign. */
7685 if (!useless_type_conversion_p (vectype_out, TREE_TYPE (def)))
7686 def = gimple_convert (&stmts, vectype_out, def);
7687 if (loop_vinfo->main_loop_edge)
7688 {
7689 /* While we'd like to insert on the edge this will split
7690 blocks and disturb bookkeeping, we also will eventually
7691 need this on the skip edge. Rely on sinking to
7692 fix up optimal placement and insert in the pred. */
7693 gimple_stmt_iterator gsi
7694 = gsi_last_bb (loop_vinfo->main_loop_edge->src);
7695 /* Insert before a cond that eventually skips the
7696 epilogue. */
7697 if (!gsi_end_p (gsi) && stmt_ends_bb_p (gsi_stmt (gsi)))
7698 gsi_prev (&gsi);
7699 gsi_insert_seq_after (&gsi, stmts, GSI_CONTINUE_LINKING);
7700 }
7701 else
7702 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop),
7703 stmts);
7704 }
7705 if (loop_vinfo->main_loop_edge)
7706 vec_initial_defs[0]
7707 = vect_get_main_loop_result (loop_vinfo, def,
7708 vec_initial_defs[0]);
7709 else
7710 vec_initial_defs.safe_push (def);
7711 }
7712
7713 /* Generate the reduction PHIs upfront. */
7714 for (i = 0; i < vec_num; i++)
7715 {
7716 tree vec_init_def = vec_initial_defs[i];
7717 for (j = 0; j < ncopies; j++)
7718 {
7719 /* Create the reduction-phi that defines the reduction
7720 operand. */
7721 gphi *new_phi = create_phi_node (vec_dest, loop->header);
7722
7723 /* Set the loop-entry arg of the reduction-phi. */
7724 if (j != 0 && nested_cycle)
7725 vec_init_def = vec_initial_defs[j];
7726 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7727 UNKNOWN_LOCATION);
7728
7729 /* The loop-latch arg is set in epilogue processing. */
7730
7731 if (slp_node)
7732 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7733 else
7734 {
7735 if (j == 0)
7736 *vec_stmt = new_phi;
7737 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7738 }
7739 }
7740 }
7741
7742 return true;
7743 }
7744
7745 /* Vectorizes LC PHIs. */
7746
7747 bool
7748 vectorizable_lc_phi (loop_vec_info loop_vinfo,
7749 stmt_vec_info stmt_info, gimple **vec_stmt,
7750 slp_tree slp_node)
7751 {
7752 if (!loop_vinfo
7753 || !is_a <gphi *> (stmt_info->stmt)
7754 || gimple_phi_num_args (stmt_info->stmt) != 1)
7755 return false;
7756
7757 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7758 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7759 return false;
7760
7761 if (!vec_stmt) /* transformation not required. */
7762 {
7763 /* Deal with copies from externs or constants that are disguised as
7764 loop-closed PHI nodes (PR97886). */
7765 if (slp_node
7766 && !vect_maybe_update_slp_op_vectype (SLP_TREE_CHILDREN (slp_node)[0],
7767 SLP_TREE_VECTYPE (slp_node)))
7768 {
7769 if (dump_enabled_p ())
7770 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7771 "incompatible vector types for invariants\n");
7772 return false;
7773 }
7774 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7775 return true;
7776 }
7777
7778 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7779 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7780 basic_block bb = gimple_bb (stmt_info->stmt);
7781 edge e = single_pred_edge (bb);
7782 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7783 auto_vec<tree> vec_oprnds;
7784 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
7785 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
7786 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
7787 for (unsigned i = 0; i < vec_oprnds.length (); i++)
7788 {
7789 /* Create the vectorized LC PHI node. */
7790 gphi *new_phi = create_phi_node (vec_dest, bb);
7791 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7792 if (slp_node)
7793 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7794 else
7795 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7796 }
7797 if (!slp_node)
7798 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7799
7800 return true;
7801 }
7802
7803 /* Vectorizes PHIs. */
7804
7805 bool
7806 vectorizable_phi (vec_info *,
7807 stmt_vec_info stmt_info, gimple **vec_stmt,
7808 slp_tree slp_node, stmt_vector_for_cost *cost_vec)
7809 {
7810 if (!is_a <gphi *> (stmt_info->stmt) || !slp_node)
7811 return false;
7812
7813 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def)
7814 return false;
7815
7816 tree vectype = SLP_TREE_VECTYPE (slp_node);
7817
7818 if (!vec_stmt) /* transformation not required. */
7819 {
7820 slp_tree child;
7821 unsigned i;
7822 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), i, child)
7823 if (!child)
7824 {
7825 if (dump_enabled_p ())
7826 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7827 "PHI node with unvectorized backedge def\n");
7828 return false;
7829 }
7830 else if (!vect_maybe_update_slp_op_vectype (child, vectype))
7831 {
7832 if (dump_enabled_p ())
7833 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7834 "incompatible vector types for invariants\n");
7835 return false;
7836 }
7837 /* For single-argument PHIs assume coalescing which means zero cost
7838 for the scalar and the vector PHIs. This avoids artificially
7839 favoring the vector path (but may pessimize it in some cases). */
7840 if (gimple_phi_num_args (as_a <gphi *> (stmt_info->stmt)) > 1)
7841 record_stmt_cost (cost_vec, SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
7842 vector_stmt, stmt_info, vectype, 0, vect_body);
7843 STMT_VINFO_TYPE (stmt_info) = phi_info_type;
7844 return true;
7845 }
7846
7847 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7848 basic_block bb = gimple_bb (stmt_info->stmt);
7849 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7850 auto_vec<gphi *> new_phis;
7851 for (unsigned i = 0; i < gimple_phi_num_args (stmt_info->stmt); ++i)
7852 {
7853 slp_tree child = SLP_TREE_CHILDREN (slp_node)[i];
7854
7855 /* Skip not yet vectorized defs. */
7856 if (SLP_TREE_DEF_TYPE (child) == vect_internal_def
7857 && SLP_TREE_VEC_STMTS (child).is_empty ())
7858 continue;
7859
7860 auto_vec<tree> vec_oprnds;
7861 vect_get_slp_defs (SLP_TREE_CHILDREN (slp_node)[i], &vec_oprnds);
7862 if (!new_phis.exists ())
7863 {
7864 new_phis.create (vec_oprnds.length ());
7865 for (unsigned j = 0; j < vec_oprnds.length (); j++)
7866 {
7867 /* Create the vectorized PHI node. */
7868 new_phis.quick_push (create_phi_node (vec_dest, bb));
7869 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phis[j]);
7870 }
7871 }
7872 edge e = gimple_phi_arg_edge (as_a <gphi *> (stmt_info->stmt), i);
7873 for (unsigned j = 0; j < vec_oprnds.length (); j++)
7874 add_phi_arg (new_phis[j], vec_oprnds[j], e, UNKNOWN_LOCATION);
7875 }
7876 /* We should have at least one already vectorized child. */
7877 gcc_assert (new_phis.exists ());
7878
7879 return true;
7880 }
7881
7882 /* Return true if VECTYPE represents a vector that requires lowering
7883 by the vector lowering pass. */
7884
7885 bool
7886 vect_emulated_vector_p (tree vectype)
7887 {
7888 return (!VECTOR_MODE_P (TYPE_MODE (vectype))
7889 && (!VECTOR_BOOLEAN_TYPE_P (vectype)
7890 || TYPE_PRECISION (TREE_TYPE (vectype)) != 1));
7891 }
7892
7893 /* Return true if we can emulate CODE on an integer mode representation
7894 of a vector. */
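/* For example, a BIT_AND_EXPR on a vector that only has an integer mode can
typically be carried out directly as a bitwise AND of the whole word, while
codes like PLUS_EXPR are left for the generic vector lowering pass to
emulate on the integer representation. */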
7895
7896 bool
7897 vect_can_vectorize_without_simd_p (tree_code code)
7898 {
7899 switch (code)
7900 {
7901 case PLUS_EXPR:
7902 case MINUS_EXPR:
7903 case NEGATE_EXPR:
7904 case BIT_AND_EXPR:
7905 case BIT_IOR_EXPR:
7906 case BIT_XOR_EXPR:
7907 case BIT_NOT_EXPR:
7908 return true;
7909
7910 default:
7911 return false;
7912 }
7913 }
7914
7915 /* Function vectorizable_induction
7916
7917 Check if STMT_INFO performs an induction computation that can be vectorized.
7918 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7919 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7920 Return true if STMT_INFO is vectorizable in this way. */
7921
7922 bool
7923 vectorizable_induction (loop_vec_info loop_vinfo,
7924 stmt_vec_info stmt_info,
7925 gimple **vec_stmt, slp_tree slp_node,
7926 stmt_vector_for_cost *cost_vec)
7927 {
7928 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7929 unsigned ncopies;
7930 bool nested_in_vect_loop = false;
7931 class loop *iv_loop;
7932 tree vec_def;
7933 edge pe = loop_preheader_edge (loop);
7934 basic_block new_bb;
7935 tree new_vec, vec_init, vec_step, t;
7936 tree new_name;
7937 gimple *new_stmt;
7938 gphi *induction_phi;
7939 tree induc_def, vec_dest;
7940 tree init_expr, step_expr;
7941 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7942 unsigned i;
7943 tree expr;
7944 gimple_stmt_iterator si;
7945
7946 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7947 if (!phi)
7948 return false;
7949
7950 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7951 return false;
7952
7953 /* Make sure it was recognized as induction computation. */
7954 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7955 return false;
7956
7957 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7958 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7959
7960 if (slp_node)
7961 ncopies = 1;
7962 else
7963 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7964 gcc_assert (ncopies >= 1);
7965
7966 /* FORNOW. These restrictions should be relaxed. */
7967 if (nested_in_vect_loop_p (loop, stmt_info))
7968 {
7969 imm_use_iterator imm_iter;
7970 use_operand_p use_p;
7971 gimple *exit_phi;
7972 edge latch_e;
7973 tree loop_arg;
7974
7975 if (ncopies > 1)
7976 {
7977 if (dump_enabled_p ())
7978 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7979 "multiple types in nested loop.\n");
7980 return false;
7981 }
7982
7983 exit_phi = NULL;
7984 latch_e = loop_latch_edge (loop->inner);
7985 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7986 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7987 {
7988 gimple *use_stmt = USE_STMT (use_p);
7989 if (is_gimple_debug (use_stmt))
7990 continue;
7991
7992 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7993 {
7994 exit_phi = use_stmt;
7995 break;
7996 }
7997 }
7998 if (exit_phi)
7999 {
8000 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
8001 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
8002 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
8003 {
8004 if (dump_enabled_p ())
8005 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8006 "inner-loop induction only used outside "
8007 "of the outer vectorized loop.\n");
8008 return false;
8009 }
8010 }
8011
8012 nested_in_vect_loop = true;
8013 iv_loop = loop->inner;
8014 }
8015 else
8016 iv_loop = loop;
8017 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
8018
8019 if (slp_node && !nunits.is_constant ())
8020 {
8021 /* The current SLP code creates the step value element-by-element. */
8022 if (dump_enabled_p ())
8023 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8024 "SLP induction not supported for variable-length"
8025 " vectors.\n");
8026 return false;
8027 }
8028
8029 if (!vec_stmt) /* transformation not required. */
8030 {
8031 unsigned inside_cost = 0, prologue_cost = 0;
8032 if (slp_node)
8033 {
8034 /* We eventually need to set a vector type on invariant
8035 arguments. */
8036 unsigned j;
8037 slp_tree child;
8038 FOR_EACH_VEC_ELT (SLP_TREE_CHILDREN (slp_node), j, child)
8039 if (!vect_maybe_update_slp_op_vectype
8040 (child, SLP_TREE_VECTYPE (slp_node)))
8041 {
8042 if (dump_enabled_p ())
8043 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8044 "incompatible vector types for "
8045 "invariants\n");
8046 return false;
8047 }
8048 /* loop cost for vec_loop. */
8049 inside_cost
8050 = record_stmt_cost (cost_vec,
8051 SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node),
8052 vector_stmt, stmt_info, 0, vect_body);
8053 /* prologue cost for vec_init (if not nested) and step. */
8054 prologue_cost = record_stmt_cost (cost_vec, 1 + !nested_in_vect_loop,
8055 scalar_to_vec,
8056 stmt_info, 0, vect_prologue);
8057 }
8058 else /* if (!slp_node) */
8059 {
8060 /* loop cost for vec_loop. */
8061 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
8062 stmt_info, 0, vect_body);
8063 /* prologue cost for vec_init and vec_step. */
8064 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
8065 stmt_info, 0, vect_prologue);
8066 }
8067 if (dump_enabled_p ())
8068 dump_printf_loc (MSG_NOTE, vect_location,
8069 "vect_model_induction_cost: inside_cost = %d, "
8070 "prologue_cost = %d .\n", inside_cost,
8071 prologue_cost);
8072
8073 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
8074 DUMP_VECT_SCOPE ("vectorizable_induction");
8075 return true;
8076 }
8077
8078 /* Transform. */
8079
8080 /* Compute a vector variable, initialized with the first VF values of
8081 the induction variable. E.g., for an iv with IV_PHI='X' and
8082 evolution S, for a vector of 4 units, we want to compute:
8083 [X, X + S, X + 2*S, X + 3*S]. */
8084
8085 if (dump_enabled_p ())
8086 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
8087
8088 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
8089 gcc_assert (step_expr != NULL_TREE);
8090 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
8091
8092 pe = loop_preheader_edge (iv_loop);
8093 /* Find the first insertion point in the BB. */
8094 basic_block bb = gimple_bb (phi);
8095 si = gsi_after_labels (bb);
8096
8097 /* For SLP induction we have to generate several IVs as for example
8098 with group size 3 we need
8099 [i0, i1, i2, i0 + S0] [i1 + S1, i2 + S2, i0 + 2*S0, i1 + 2*S1]
8100 [i2 + 2*S2, i0 + 3*S0, i1 + 3*S1, i2 + 3*S2]. */
8101 if (slp_node)
8102 {
8103 /* Enforced above. */
8104 unsigned int const_nunits = nunits.to_constant ();
8105
8106 /* The initial values are vectorized, but any lanes > group_size
8107 need adjustment. */
8108 slp_tree init_node
8109 = SLP_TREE_CHILDREN (slp_node)[pe->dest_idx];
8110
8111 /* Gather steps. Since we do not vectorize inductions as
8112 cycles we have to reconstruct the step from SCEV data. */
8113 unsigned group_size = SLP_TREE_LANES (slp_node);
8114 tree *steps = XALLOCAVEC (tree, group_size);
8115 tree *inits = XALLOCAVEC (tree, group_size);
8116 stmt_vec_info phi_info;
8117 FOR_EACH_VEC_ELT (SLP_TREE_SCALAR_STMTS (slp_node), i, phi_info)
8118 {
8119 steps[i] = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
8120 if (!init_node)
8121 inits[i] = gimple_phi_arg_def (as_a<gphi *> (phi_info->stmt),
8122 pe->dest_idx);
8123 }
8124
8125 /* Now generate the IVs. */
8126 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8127 gcc_assert ((const_nunits * nvects) % group_size == 0);
8128 unsigned nivs;
8129 if (nested_in_vect_loop)
8130 nivs = nvects;
8131 else
8132 {
8133 /* Compute the number of distinct IVs we need. First reduce
8134 group_size if it is a multiple of const_nunits so we get
8135 one IV for a group_size of 4 but const_nunits 2. */
8136 unsigned group_sizep = group_size;
8137 if (group_sizep % const_nunits == 0)
8138 group_sizep = group_sizep / const_nunits;
8139 nivs = least_common_multiple (group_sizep,
8140 const_nunits) / const_nunits;
8141 }
8142 tree stept = TREE_TYPE (step_vectype);
8143 tree lupdate_mul = NULL_TREE;
8144 if (!nested_in_vect_loop)
8145 {
8146 /* The number of iterations covered in one vector iteration. */
8147 unsigned lup_mul = (nvects * const_nunits) / group_size;
8148 lupdate_mul
8149 = build_vector_from_val (step_vectype,
8150 SCALAR_FLOAT_TYPE_P (stept)
8151 ? build_real_from_wide (stept, lup_mul,
8152 UNSIGNED)
8153 : build_int_cstu (stept, lup_mul));
8154 }
8155 tree peel_mul = NULL_TREE;
8156 gimple_seq init_stmts = NULL;
8157 if (LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo))
8158 {
8159 if (SCALAR_FLOAT_TYPE_P (stept))
8160 peel_mul = gimple_build (&init_stmts, FLOAT_EXPR, stept,
8161 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8162 else
8163 peel_mul = gimple_convert (&init_stmts, stept,
8164 LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo));
8165 peel_mul = gimple_build_vector_from_val (&init_stmts,
8166 step_vectype, peel_mul);
8167 }
8168 unsigned ivn;
8169 auto_vec<tree> vec_steps;
8170 for (ivn = 0; ivn < nivs; ++ivn)
8171 {
8172 tree_vector_builder step_elts (step_vectype, const_nunits, 1);
8173 tree_vector_builder init_elts (vectype, const_nunits, 1);
8174 tree_vector_builder mul_elts (step_vectype, const_nunits, 1);
8175 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
8176 {
8177 /* The scalar steps of the IVs. */
8178 tree elt = steps[(ivn*const_nunits + eltn) % group_size];
8179 elt = gimple_convert (&init_stmts, TREE_TYPE (step_vectype), elt);
8180 step_elts.quick_push (elt);
8181 if (!init_node)
8182 {
8183 /* The scalar inits of the IVs if not vectorized. */
8184 elt = inits[(ivn*const_nunits + eltn) % group_size];
8185 if (!useless_type_conversion_p (TREE_TYPE (vectype),
8186 TREE_TYPE (elt)))
8187 elt = gimple_build (&init_stmts, VIEW_CONVERT_EXPR,
8188 TREE_TYPE (vectype), elt);
8189 init_elts.quick_push (elt);
8190 }
8191 /* The number of steps to add to the initial values. */
8192 unsigned mul_elt = (ivn*const_nunits + eltn) / group_size;
8193 mul_elts.quick_push (SCALAR_FLOAT_TYPE_P (stept)
8194 ? build_real_from_wide (stept,
8195 mul_elt, UNSIGNED)
8196 : build_int_cstu (stept, mul_elt));
8197 }
8198 vec_step = gimple_build_vector (&init_stmts, &step_elts);
8199 vec_steps.safe_push (vec_step);
8200 tree step_mul = gimple_build_vector (&init_stmts, &mul_elts);
8201 if (peel_mul)
8202 step_mul = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8203 step_mul, peel_mul);
8204 if (!init_node)
8205 vec_init = gimple_build_vector (&init_stmts, &init_elts);
8206
8207 /* Create the induction-phi that defines the induction-operand. */
8208 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var,
8209 "vec_iv_");
8210 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8211 induc_def = PHI_RESULT (induction_phi);
8212
8213 /* Create the iv update inside the loop */
8214 tree up = vec_step;
8215 if (lupdate_mul)
8216 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8217 vec_step, lupdate_mul);
8218 gimple_seq stmts = NULL;
8219 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8220 vec_def = gimple_build (&stmts,
8221 PLUS_EXPR, step_vectype, vec_def, up);
8222 vec_def = gimple_convert (&stmts, vectype, vec_def);
8223 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8224 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8225 UNKNOWN_LOCATION);
8226
8227 if (init_node)
8228 vec_init = vect_get_slp_vect_def (init_node, ivn);
8229 if (!nested_in_vect_loop
8230 && !integer_zerop (step_mul))
8231 {
8232 vec_def = gimple_convert (&init_stmts, step_vectype, vec_init);
8233 up = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8234 vec_step, step_mul);
8235 vec_def = gimple_build (&init_stmts, PLUS_EXPR, step_vectype,
8236 vec_def, up);
8237 vec_init = gimple_convert (&init_stmts, vectype, vec_def);
8238 }
8239
8240 /* Set the arguments of the phi node: */
8241 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8242
8243 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
8244 }
8245 if (!nested_in_vect_loop)
8246 {
8247 /* Fill up to the number of vectors we need for the whole group. */
8248 nivs = least_common_multiple (group_size,
8249 const_nunits) / const_nunits;
8250 vec_steps.reserve (nivs-ivn);
8251 for (; ivn < nivs; ++ivn)
8252 {
8253 SLP_TREE_VEC_STMTS (slp_node)
8254 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
8255 vec_steps.quick_push (vec_steps[0]);
8256 }
8257 }
8258
8259 /* Re-use IVs when we can. We are generating further vector
8260 stmts by adding VF' * stride to the IVs generated above. */
8261 if (ivn < nvects)
8262 {
8263 unsigned vfp
8264 = least_common_multiple (group_size, const_nunits) / group_size;
8265 tree lupdate_mul
8266 = build_vector_from_val (step_vectype,
8267 SCALAR_FLOAT_TYPE_P (stept)
8268 ? build_real_from_wide (stept,
8269 vfp, UNSIGNED)
8270 : build_int_cstu (stept, vfp));
8271 for (; ivn < nvects; ++ivn)
8272 {
8273 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
8274 tree def = gimple_get_lhs (iv);
8275 if (ivn < 2*nivs)
8276 vec_steps[ivn - nivs]
8277 = gimple_build (&init_stmts, MULT_EXPR, step_vectype,
8278 vec_steps[ivn - nivs], lupdate_mul);
8279 gimple_seq stmts = NULL;
8280 def = gimple_convert (&stmts, step_vectype, def);
8281 def = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8282 def, vec_steps[ivn % nivs]);
8283 def = gimple_convert (&stmts, vectype, def);
8284 if (gimple_code (iv) == GIMPLE_PHI)
8285 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8286 else
8287 {
8288 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
8289 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
8290 }
8291 SLP_TREE_VEC_STMTS (slp_node)
8292 .quick_push (SSA_NAME_DEF_STMT (def));
8293 }
8294 }
8295
8296 new_bb = gsi_insert_seq_on_edge_immediate (pe, init_stmts);
8297 gcc_assert (!new_bb);
8298
8299 return true;
8300 }
8301
8302 init_expr = vect_phi_initial_value (phi);
8303
8304 gimple_seq stmts = NULL;
8305 if (!nested_in_vect_loop)
8306 {
8307 /* Convert the initial value to the IV update type. */
8308 tree new_type = TREE_TYPE (step_expr);
8309 init_expr = gimple_convert (&stmts, new_type, init_expr);
8310
8311 /* If we are using the loop mask to "peel" for alignment then we need
8312 to adjust the start value here. */
8313 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
8314 if (skip_niters != NULL_TREE)
8315 {
8316 if (FLOAT_TYPE_P (vectype))
8317 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
8318 skip_niters);
8319 else
8320 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
8321 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
8322 skip_niters, step_expr);
8323 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
8324 init_expr, skip_step);
8325 }
8326 }
8327
8328 if (stmts)
8329 {
8330 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8331 gcc_assert (!new_bb);
8332 }
8333
8334 /* Create the vector that holds the initial_value of the induction. */
8335 if (nested_in_vect_loop)
8336 {
8337 /* iv_loop is nested in the loop to be vectorized. init_expr had already
8338 been created during vectorization of previous stmts. We obtain it
8339 from the STMT_VINFO_VEC_STMT of the defining stmt. */
8340 auto_vec<tree> vec_inits;
8341 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
8342 init_expr, &vec_inits);
8343 vec_init = vec_inits[0];
8344 /* If the initial value is not of proper type, convert it. */
8345 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
8346 {
8347 new_stmt
8348 = gimple_build_assign (vect_get_new_ssa_name (vectype,
8349 vect_simple_var,
8350 "vec_iv_"),
8351 VIEW_CONVERT_EXPR,
8352 build1 (VIEW_CONVERT_EXPR, vectype,
8353 vec_init));
8354 vec_init = gimple_assign_lhs (new_stmt);
8355 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
8356 new_stmt);
8357 gcc_assert (!new_bb);
8358 }
8359 }
8360 else
8361 {
8362 /* iv_loop is the loop to be vectorized. Create:
8363 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
8364 stmts = NULL;
8365 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
8366
8367 unsigned HOST_WIDE_INT const_nunits;
8368 if (nunits.is_constant (&const_nunits))
8369 {
8370 tree_vector_builder elts (step_vectype, const_nunits, 1);
8371 elts.quick_push (new_name);
8372 for (i = 1; i < const_nunits; i++)
8373 {
8374 /* Create: new_name_i = new_name + step_expr */
8375 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
8376 new_name, step_expr);
8377 elts.quick_push (new_name);
8378 }
8379 /* Create a vector from [new_name_0, new_name_1, ...,
8380 new_name_nunits-1] */
8381 vec_init = gimple_build_vector (&stmts, &elts);
8382 }
8383 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
8384 /* Build the initial value directly from a VEC_SERIES_EXPR. */
8385 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
8386 new_name, step_expr);
8387 else
8388 {
8389 /* Build:
8390 [base, base, base, ...]
8391 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
8392 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
8393 gcc_assert (flag_associative_math);
8394 tree index = build_index_vector (step_vectype, 0, 1);
8395 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8396 new_name);
8397 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
8398 step_expr);
8399 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
8400 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
8401 vec_init, step_vec);
8402 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
8403 vec_init, base_vec);
8404 }
8405 vec_init = gimple_convert (&stmts, vectype, vec_init);
8406
8407 if (stmts)
8408 {
8409 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8410 gcc_assert (!new_bb);
8411 }
8412 }
8413
8414
8415 /* Create the vector that holds the step of the induction. */
8416 if (nested_in_vect_loop)
8417 /* iv_loop is nested in the loop to be vectorized. Generate:
8418 vec_step = [S, S, S, S] */
8419 new_name = step_expr;
8420 else
8421 {
8422 /* iv_loop is the loop to be vectorized. Generate:
8423 vec_step = [VF*S, VF*S, VF*S, VF*S] */
8424 gimple_seq seq = NULL;
8425 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8426 {
8427 expr = build_int_cst (integer_type_node, vf);
8428 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8429 }
8430 else
8431 expr = build_int_cst (TREE_TYPE (step_expr), vf);
8432 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8433 expr, step_expr);
8434 if (seq)
8435 {
8436 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8437 gcc_assert (!new_bb);
8438 }
8439 }
8440
8441 t = unshare_expr (new_name);
8442 gcc_assert (CONSTANT_CLASS_P (new_name)
8443 || TREE_CODE (new_name) == SSA_NAME);
8444 new_vec = build_vector_from_val (step_vectype, t);
8445 vec_step = vect_init_vector (loop_vinfo, stmt_info,
8446 new_vec, step_vectype, NULL);
8447
8448
8449 /* Create the following def-use cycle:
8450 loop prolog:
8451 vec_init = ...
8452 vec_step = ...
8453 loop:
8454 vec_iv = PHI <vec_init, vec_loop>
8455 ...
8456 STMT
8457 ...
8458 vec_loop = vec_iv + vec_step; */
8459
8460 /* Create the induction-phi that defines the induction-operand. */
8461 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
8462 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8463 induc_def = PHI_RESULT (induction_phi);
8464
8465 /* Create the iv update inside the loop */
8466 stmts = NULL;
8467 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
8468 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
8469 vec_def = gimple_convert (&stmts, vectype, vec_def);
8470 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8471 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8472
8473 /* Set the arguments of the phi node: */
8474 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8475 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8476 UNKNOWN_LOCATION);
8477
8478 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
8479 *vec_stmt = induction_phi;
8480
8481 /* In case that vectorization factor (VF) is bigger than the number
8482 of elements that we can fit in a vectype (nunits), we have to generate
8483 more than one vector stmt - i.e - we need to "unroll" the
8484 vector stmt by a factor VF/nunits. For more details see documentation
8485 in vectorizable_operation. */
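  /* E.g. (illustrative) with VF == 8 but only 4 elements per vector,
     ncopies == 2 and the second copy is obtained by adding
     [4*S, 4*S, 4*S, 4*S] to the first.  */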
8486
8487 if (ncopies > 1)
8488 {
8489 gimple_seq seq = NULL;
8490 /* FORNOW. This restriction should be relaxed. */
8491 gcc_assert (!nested_in_vect_loop);
8492
8493 /* Create the vector that holds the step of the induction. */
8494 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8495 {
8496 expr = build_int_cst (integer_type_node, nunits);
8497 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8498 }
8499 else
8500 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8501 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8502 expr, step_expr);
8503 if (seq)
8504 {
8505 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8506 gcc_assert (!new_bb);
8507 }
8508
8509 t = unshare_expr (new_name);
8510 gcc_assert (CONSTANT_CLASS_P (new_name)
8511 || TREE_CODE (new_name) == SSA_NAME);
8512 new_vec = build_vector_from_val (step_vectype, t);
8513 vec_step = vect_init_vector (loop_vinfo, stmt_info,
8514 new_vec, step_vectype, NULL);
8515
8516 vec_def = induc_def;
8517 for (i = 1; i < ncopies; i++)
8518 {
8519 /* vec_i = vec_prev + vec_step */
8520 gimple_seq stmts = NULL;
8521 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
8522 vec_def = gimple_build (&stmts,
8523 PLUS_EXPR, step_vectype, vec_def, vec_step);
8524 vec_def = gimple_convert (&stmts, vectype, vec_def);
8525
8526 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8527 new_stmt = SSA_NAME_DEF_STMT (vec_def);
8528 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
8529 }
8530 }
8531
8532 if (dump_enabled_p ())
8533 dump_printf_loc (MSG_NOTE, vect_location,
8534 "transform induction: created def-use cycle: %G%G",
8535 induction_phi, SSA_NAME_DEF_STMT (vec_def));
8536
8537 return true;
8538 }
8539
8540 /* Function vectorizable_live_operation.
8541
8542 STMT_INFO computes a value that is used outside the loop. Check if
8543 it can be supported. */
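/* For illustration (a hypothetical example): in

     for (i = 0; i < n; i++)
       last = a[i];
     use (last);

   'last' is live after the loop.  With a vectorization factor of 4 the
   value is extracted from the last lane of the final vector copy via a
   BIT_FIELD_REF, or via EXTRACT_LAST with the loop mask when the loop
   is fully masked.  */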
8544
8545 bool
8546 vectorizable_live_operation (vec_info *vinfo,
8547 stmt_vec_info stmt_info,
8548 gimple_stmt_iterator *gsi,
8549 slp_tree slp_node, slp_instance slp_node_instance,
8550 int slp_index, bool vec_stmt_p,
8551 stmt_vector_for_cost *cost_vec)
8552 {
8553 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
8554 imm_use_iterator imm_iter;
8555 tree lhs, lhs_type, bitsize;
8556 tree vectype = (slp_node
8557 ? SLP_TREE_VECTYPE (slp_node)
8558 : STMT_VINFO_VECTYPE (stmt_info));
8559 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8560 int ncopies;
8561 gimple *use_stmt;
8562 auto_vec<tree> vec_oprnds;
8563 int vec_entry = 0;
8564 poly_uint64 vec_index = 0;
8565
8566 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8567
8568 /* If a stmt of a reduction is live, vectorize it via
8569 vect_create_epilog_for_reduction. vectorizable_reduction assessed
8570 validity so just trigger the transform here. */
8571 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
8572 {
8573 if (!vec_stmt_p)
8574 return true;
8575 if (slp_node)
8576 {
8577 /* For reduction chains the meta-info is attached to
8578 the group leader. */
8579 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
8580 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
8581 /* For SLP reductions we vectorize the epilogue for
8582 all involved stmts together. */
8583 else if (slp_index != 0)
8584 return true;
8585 else
8586 /* For SLP reductions the meta-info is attached to
8587 the representative. */
8588 stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
8589 }
8590 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
8591 gcc_assert (reduc_info->is_reduc_info);
8592 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
8593 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
8594 return true;
8595 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
8596 slp_node_instance);
8597 return true;
8598 }
8599
8600 /* If STMT is not relevant and it is a simple assignment and its inputs are
8601 invariant then it can remain in place, unvectorized. The original last
8602 scalar value that it computes will be used. */
8603 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8604 {
8605 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
8606 if (dump_enabled_p ())
8607 dump_printf_loc (MSG_NOTE, vect_location,
8608 "statement is simple and uses invariant. Leaving in "
8609 "place.\n");
8610 return true;
8611 }
8612
8613 if (slp_node)
8614 ncopies = 1;
8615 else
8616 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8617
8618 if (slp_node)
8619 {
8620 gcc_assert (slp_index >= 0);
8621
8622 /* Get the last occurrence of the scalar index from the concatenation of
8623 all the slp vectors. Calculate which slp vector it is and the index
8624 within. */
8625 int num_scalar = SLP_TREE_LANES (slp_node);
8626 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8627 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
8628
8629 /* Calculate which vector contains the result, and which lane of
8630 that vector we need. */
8631 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8632 {
8633 if (dump_enabled_p ())
8634 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8635 "Cannot determine which vector holds the"
8636 " final result.\n");
8637 return false;
8638 }
8639 }
8640
8641 if (!vec_stmt_p)
8642 {
8643 /* No transformation required. */
8644 if (loop_vinfo && LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo))
8645 {
8646 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8647 OPTIMIZE_FOR_SPEED))
8648 {
8649 if (dump_enabled_p ())
8650 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8651 "can't operate on partial vectors "
8652 "because the target doesn't support extract "
8653 "last reduction.\n");
8654 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8655 }
8656 else if (slp_node)
8657 {
8658 if (dump_enabled_p ())
8659 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8660 "can't operate on partial vectors "
8661 "because an SLP statement is live after "
8662 "the loop.\n");
8663 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8664 }
8665 else if (ncopies > 1)
8666 {
8667 if (dump_enabled_p ())
8668 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8669 "can't operate on partial vectors "
8670 "because ncopies is greater than 1.\n");
8671 LOOP_VINFO_CAN_USE_PARTIAL_VECTORS_P (loop_vinfo) = false;
8672 }
8673 else
8674 {
8675 gcc_assert (ncopies == 1 && !slp_node);
8676 vect_record_loop_mask (loop_vinfo,
8677 &LOOP_VINFO_MASKS (loop_vinfo),
8678 1, vectype, NULL);
8679 }
8680 }
8681 /* ??? Enable for loop costing as well. */
8682 if (!loop_vinfo)
8683 record_stmt_cost (cost_vec, 1, vec_to_scalar, stmt_info, NULL_TREE,
8684 0, vect_epilogue);
8685 return true;
8686 }
8687
8688 /* Use the lhs of the original scalar statement. */
8689 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8690 if (dump_enabled_p ())
8691 dump_printf_loc (MSG_NOTE, vect_location, "extracting lane for live "
8692 "stmt %G", stmt);
8693
8694 lhs = gimple_get_lhs (stmt);
8695 lhs_type = TREE_TYPE (lhs);
8696
8697 bitsize = vector_element_bits_tree (vectype);
8698
8699 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8700 tree vec_lhs, bitstart;
8701 gimple *vec_stmt;
8702 if (slp_node)
8703 {
8704 gcc_assert (!loop_vinfo || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8705
8706 /* Get the correct slp vectorized stmt. */
8707 vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8708 vec_lhs = gimple_get_lhs (vec_stmt);
8709
8710 /* Get entry to use. */
8711 bitstart = bitsize_int (vec_index);
8712 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8713 }
8714 else
8715 {
8716 /* For multiple copies, get the last copy. */
8717 vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info).last ();
8718 vec_lhs = gimple_get_lhs (vec_stmt);
8719
8720 /* Get the last lane in the vector. */
8721 bitstart = int_const_binop (MULT_EXPR, bitsize, bitsize_int (nunits - 1));
8722 }
8723
8724 if (loop_vinfo)
8725 {
8726 /* To ensure that the VEC_LHS for the lane extraction stmts satisfies the
8727 loop-closed PHI requirement, insert one phi node for it. It looks like:
8728 loop;
8729 BB:
8730 # lhs' = PHI <lhs>
8731 ==>
8732 loop;
8733 BB:
8734 # vec_lhs' = PHI <vec_lhs>
8735 new_tree = lane_extract <vec_lhs', ...>;
8736 lhs' = new_tree; */
8737
8738 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8739 basic_block exit_bb = single_exit (loop)->dest;
8740 gcc_assert (single_pred_p (exit_bb));
8741
8742 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
8743 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
8744 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
8745
8746 gimple_seq stmts = NULL;
8747 tree new_tree;
8748 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8749 {
8750 /* Emit:
8751
8752 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8753
8754 where VEC_LHS is the vectorized live-out result and MASK is
8755 the loop mask for the final iteration. */
8756 gcc_assert (ncopies == 1 && !slp_node);
8757 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8758 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8759 1, vectype, 0);
8760 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
8761 mask, vec_lhs_phi);
8762
8763 /* Convert the extracted vector element to the scalar type. */
8764 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8765 }
8766 else
8767 {
8768 tree bftype = TREE_TYPE (vectype);
8769 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8770 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8771 new_tree = build3 (BIT_FIELD_REF, bftype,
8772 vec_lhs_phi, bitsize, bitstart);
8773 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8774 &stmts, true, NULL_TREE);
8775 }
8776
8777 if (stmts)
8778 {
8779 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
8780 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8781
8782 /* Remove existing phi from lhs and create one copy from new_tree. */
8783 tree lhs_phi = NULL_TREE;
8784 gimple_stmt_iterator gsi;
8785 for (gsi = gsi_start_phis (exit_bb);
8786 !gsi_end_p (gsi); gsi_next (&gsi))
8787 {
8788 gimple *phi = gsi_stmt (gsi);
8789 if ((gimple_phi_arg_def (phi, 0) == lhs))
8790 {
8791 remove_phi_node (&gsi, false);
8792 lhs_phi = gimple_phi_result (phi);
8793 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8794 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8795 break;
8796 }
8797 }
8798 }
8799
8800 /* Replace use of lhs with newly computed result. If the use stmt is a
8801 single arg PHI, just replace all uses of PHI result. It's necessary
8802 because lcssa PHI defining lhs may be before newly inserted stmt. */
8803 use_operand_p use_p;
8804 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8805 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8806 && !is_gimple_debug (use_stmt))
8807 {
8808 if (gimple_code (use_stmt) == GIMPLE_PHI
8809 && gimple_phi_num_args (use_stmt) == 1)
8810 {
8811 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8812 }
8813 else
8814 {
8815 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8816 SET_USE (use_p, new_tree);
8817 }
8818 update_stmt (use_stmt);
8819 }
8820 }
8821 else
8822 {
8823 /* For basic-block vectorization simply insert the lane-extraction. */
8824 tree bftype = TREE_TYPE (vectype);
8825 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8826 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8827 tree new_tree = build3 (BIT_FIELD_REF, bftype,
8828 vec_lhs, bitsize, bitstart);
8829 gimple_seq stmts = NULL;
8830 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8831 &stmts, true, NULL_TREE);
8832 if (TREE_CODE (new_tree) == SSA_NAME
8833 && SSA_NAME_OCCURS_IN_ABNORMAL_PHI (lhs))
8834 SSA_NAME_OCCURS_IN_ABNORMAL_PHI (new_tree) = 1;
8835 if (is_a <gphi *> (vec_stmt))
8836 {
8837 gimple_stmt_iterator si = gsi_after_labels (gimple_bb (vec_stmt));
8838 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
8839 }
8840 else
8841 {
8842 gimple_stmt_iterator si = gsi_for_stmt (vec_stmt);
8843 gsi_insert_seq_after (&si, stmts, GSI_SAME_STMT);
8844 }
8845
8846 /* Replace use of lhs with newly computed result. If the use stmt is a
8847 single arg PHI, just replace all uses of PHI result. It's necessary
8848 because lcssa PHI defining lhs may be before newly inserted stmt. */
8849 use_operand_p use_p;
8850 stmt_vec_info use_stmt_info;
8851 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8852 if (!is_gimple_debug (use_stmt)
8853 && (!(use_stmt_info = vinfo->lookup_stmt (use_stmt))
8854 || !PURE_SLP_STMT (vect_stmt_to_vectorize (use_stmt_info))))
8855 {
8856 /* ??? This can happen when the live lane ends up being
8857 used in a vector construction code-generated by an
8858 external SLP node (and code-generation for that already
8859 happened). See gcc.dg/vect/bb-slp-47.c.
8860 Doing this is what would happen if that vector CTOR
8861 were not code-generated yet so it is not too bad.
8862 ??? In fact we'd likely want to avoid this situation
8863 in the first place. */
8864 if (TREE_CODE (new_tree) == SSA_NAME
8865 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
8866 && gimple_code (use_stmt) != GIMPLE_PHI
8867 && !vect_stmt_dominates_stmt_p (SSA_NAME_DEF_STMT (new_tree),
8868 use_stmt))
8869 {
8870 enum tree_code code = gimple_assign_rhs_code (use_stmt);
8871 gcc_assert (code == CONSTRUCTOR
8872 || code == VIEW_CONVERT_EXPR
8873 || CONVERT_EXPR_CODE_P (code));
8874 if (dump_enabled_p ())
8875 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8876 "Using original scalar computation for "
8877 "live lane because use preceeds vector "
8878 "def\n");
8879 continue;
8880 }
8881 /* ??? It can also happen that we end up pulling a def into
8882 a loop where replacing out-of-loop uses would require
8883 a new LC SSA PHI node. Retain the original scalar in
8884 those cases as well. PR98064. */
8885 if (TREE_CODE (new_tree) == SSA_NAME
8886 && !SSA_NAME_IS_DEFAULT_DEF (new_tree)
8887 && (gimple_bb (use_stmt)->loop_father
8888 != gimple_bb (vec_stmt)->loop_father)
8889 && !flow_loop_nested_p (gimple_bb (vec_stmt)->loop_father,
8890 gimple_bb (use_stmt)->loop_father))
8891 {
8892 if (dump_enabled_p ())
8893 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8894 "Using original scalar computation for "
8895 "live lane because there is an out-of-loop "
8896 "definition for it\n");
8897 continue;
8898 }
8899 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8900 SET_USE (use_p, new_tree);
8901 update_stmt (use_stmt);
8902 }
8903 }
8904
8905 return true;
8906 }
8907
8908 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8909
8910 static void
8911 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
8912 {
8913 ssa_op_iter op_iter;
8914 imm_use_iterator imm_iter;
8915 def_operand_p def_p;
8916 gimple *ustmt;
8917
8918 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8919 {
8920 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8921 {
8922 basic_block bb;
8923
8924 if (!is_gimple_debug (ustmt))
8925 continue;
8926
8927 bb = gimple_bb (ustmt);
8928
8929 if (!flow_bb_inside_loop_p (loop, bb))
8930 {
8931 if (gimple_debug_bind_p (ustmt))
8932 {
8933 if (dump_enabled_p ())
8934 dump_printf_loc (MSG_NOTE, vect_location,
8935 "killing debug use\n");
8936
8937 gimple_debug_bind_reset_value (ustmt);
8938 update_stmt (ustmt);
8939 }
8940 else
8941 gcc_unreachable ();
8942 }
8943 }
8944 }
8945 }
8946
8947 /* Given loop represented by LOOP_VINFO, return true if computation of
8948 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8949 otherwise. */
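/* E.g. for a loop with a 32-bit IV that executes exactly 2^32 times,
   LOOP_VINFO_NITERSM1 is 0xffffffff while LOOP_VINFO_NITERS (= NITERSM1 + 1)
   wraps around to 0, so the computation overflows and this function
   returns false.  */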
8950
8951 static bool
8952 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8953 {
8954 /* Constant case. */
8955 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8956 {
8957 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8958 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8959
8960 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8961 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8962 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8963 return true;
8964 }
8965
8966 widest_int max;
8967 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8968 /* Check the upper bound of loop niters. */
8969 if (get_max_loop_iterations (loop, &max))
8970 {
8971 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8972 signop sgn = TYPE_SIGN (type);
8973 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8974 if (max < type_max)
8975 return true;
8976 }
8977 return false;
8978 }
8979
8980 /* Return a mask type with half the number of elements as OLD_TYPE,
8981 given that it should have mode NEW_MODE. */
8982
8983 tree
8984 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
8985 {
8986 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
8987 return build_truth_vector_type_for_mode (nunits, new_mode);
8988 }
8989
8990 /* Return a mask type with twice as many elements as OLD_TYPE,
8991 given that it should have mode NEW_MODE. */
8992
8993 tree
8994 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
8995 {
8996 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
8997 return build_truth_vector_type_for_mode (nunits, new_mode);
8998 }
8999
9000 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
9001 contain a sequence of NVECTORS masks that each control a vector of type
9002 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
9003 these vector masks with the vector version of SCALAR_MASK. */
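/* For example (illustrative): a conditional assignment guarded by
   b[i] < 0 that is if-converted and then fully masked records one mask
   per vector copy here, with SCALAR_MASK set to the b[i] < 0 condition
   so that later statements using the same condition can reuse the
   already-masked test.  */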
9004
9005 void
9006 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
9007 unsigned int nvectors, tree vectype, tree scalar_mask)
9008 {
9009 gcc_assert (nvectors != 0);
9010 if (masks->length () < nvectors)
9011 masks->safe_grow_cleared (nvectors, true);
9012 rgroup_controls *rgm = &(*masks)[nvectors - 1];
9013 /* The number of scalars per iteration and the number of vectors are
9014 both compile-time constants. */
9015 unsigned int nscalars_per_iter
9016 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
9017 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
9018
9019 if (scalar_mask)
9020 {
9021 scalar_cond_masked_key cond (scalar_mask, nvectors);
9022 loop_vinfo->scalar_cond_masked_set.add (cond);
9023 }
9024
9025 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
9026 {
9027 rgm->max_nscalars_per_iter = nscalars_per_iter;
9028 rgm->type = truth_type_for (vectype);
9029 rgm->factor = 1;
9030 }
9031 }
9032
9033 /* Given a complete set of masks MASKS, extract mask number INDEX
9034 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
9035 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
9036
9037 See the comment above vec_loop_masks for more details about the mask
9038 arrangement. */
9039
9040 tree
9041 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
9042 unsigned int nvectors, tree vectype, unsigned int index)
9043 {
9044 rgroup_controls *rgm = &(*masks)[nvectors - 1];
9045 tree mask_type = rgm->type;
9046
9047 /* Populate the rgroup's mask array, if this is the first time we've
9048 used it. */
9049 if (rgm->controls.is_empty ())
9050 {
9051 rgm->controls.safe_grow_cleared (nvectors, true);
9052 for (unsigned int i = 0; i < nvectors; ++i)
9053 {
9054 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
9055 /* Provide a dummy definition until the real one is available. */
9056 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
9057 rgm->controls[i] = mask;
9058 }
9059 }
9060
9061 tree mask = rgm->controls[index];
9062 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
9063 TYPE_VECTOR_SUBPARTS (vectype)))
9064 {
9065 /* A loop mask for data type X can be reused for data type Y
9066 if X has N times more elements than Y and if Y's elements
9067 are N times bigger than X's. In this case each sequence
9068 of N elements in the loop mask will be all-zero or all-one.
9069 We can then view-convert the mask so that each sequence of
9070 N elements is replaced by a single element. */
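 /* E.g. (illustrative) a mask created for a vector of 16 bytes can be
 view-converted into a mask for a vector of 4 ints: each group of 4
 byte-mask elements is known to be all-zero or all-one, so it collapses
 to a single int-mask element.  */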
9071 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
9072 TYPE_VECTOR_SUBPARTS (vectype)));
9073 gimple_seq seq = NULL;
9074 mask_type = truth_type_for (vectype);
9075 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
9076 if (seq)
9077 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
9078 }
9079 return mask;
9080 }
9081
9082 /* Record that LOOP_VINFO would need LENS to contain a sequence of NVECTORS
9083 lengths for controlling an operation on VECTYPE. The operation splits
9084 each element of VECTYPE into FACTOR separate subelements, measuring the
9085 length as a number of these subelements. */
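/* E.g. (illustrative) a load of a vector of 4 64-bit elements that is
   emulated through a byte-granular (VnQI) length control uses
   FACTOR == 8: a length of 32 subelements then covers all 4 original
   elements.  */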
9086
9087 void
9088 vect_record_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
9089 unsigned int nvectors, tree vectype, unsigned int factor)
9090 {
9091 gcc_assert (nvectors != 0);
9092 if (lens->length () < nvectors)
9093 lens->safe_grow_cleared (nvectors, true);
9094 rgroup_controls *rgl = &(*lens)[nvectors - 1];
9095
9096 /* The number of scalars per iteration, the bytes each scalar occupies
9097 and the number of vectors are all compile-time constants. */
9098 unsigned int nscalars_per_iter
9099 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
9100 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
9101
9102 if (rgl->max_nscalars_per_iter < nscalars_per_iter)
9103 {
9104 /* For now, we only support cases in which all loads and stores fall back
9105 to VnQI or none do. */
9106 gcc_assert (!rgl->max_nscalars_per_iter
9107 || (rgl->factor == 1 && factor == 1)
9108 || (rgl->max_nscalars_per_iter * rgl->factor
9109 == nscalars_per_iter * factor));
9110 rgl->max_nscalars_per_iter = nscalars_per_iter;
9111 rgl->type = vectype;
9112 rgl->factor = factor;
9113 }
9114 }
9115
9116 /* Given a complete set of length LENS, extract length number INDEX for an
9117 rgroup that operates on NVECTORS vectors, where 0 <= INDEX < NVECTORS. */
9118
9119 tree
9120 vect_get_loop_len (loop_vec_info loop_vinfo, vec_loop_lens *lens,
9121 unsigned int nvectors, unsigned int index)
9122 {
9123 rgroup_controls *rgl = &(*lens)[nvectors - 1];
9124
9125 /* Populate the rgroup's len array, if this is the first time we've
9126 used it. */
9127 if (rgl->controls.is_empty ())
9128 {
9129 rgl->controls.safe_grow_cleared (nvectors, true);
9130 for (unsigned int i = 0; i < nvectors; ++i)
9131 {
9132 tree len_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
9133 gcc_assert (len_type != NULL_TREE);
9134 tree len = make_temp_ssa_name (len_type, NULL, "loop_len");
9135
9136 /* Provide a dummy definition until the real one is available. */
9137 SSA_NAME_DEF_STMT (len) = gimple_build_nop ();
9138 rgl->controls[i] = len;
9139 }
9140 }
9141
9142 return rgl->controls[index];
9143 }
9144
9145 /* Scale profiling counters by estimation for LOOP which is vectorized
9146 by factor VF. */
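/* E.g. (illustrative) a loop body with an estimated 128 executions that
   is vectorized with VF == 4 is rescaled to an estimated 32 executions,
   and the exit edge probability becomes roughly 1/33.  */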
9147
9148 static void
9149 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
9150 {
9151 edge preheader = loop_preheader_edge (loop);
9152 /* Reduce loop iterations by the vectorization factor. */
9153 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
9154 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
9155
9156 if (freq_h.nonzero_p ())
9157 {
9158 profile_probability p;
9159
9160 /* Avoid dropping loop body profile counter to 0 because of zero count
9161 in loop's preheader. */
9162 if (!(freq_e == profile_count::zero ()))
9163 freq_e = freq_e.force_nonzero ();
9164 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
9165 scale_loop_frequencies (loop, p);
9166 }
9167
9168 edge exit_e = single_exit (loop);
9169 exit_e->probability = profile_probability::always ()
9170 .apply_scale (1, new_est_niter + 1);
9171
9172 edge exit_l = single_pred_edge (loop->latch);
9173 profile_probability prob = exit_l->probability;
9174 exit_l->probability = exit_e->probability.invert ();
9175 if (prob.initialized_p () && exit_l->probability.initialized_p ())
9176 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
9177 }
9178
9179 /* For a vectorized stmt DEF_STMT_INFO adjust all vectorized PHI
9180 latch edge values originally defined by it. */
9181
9182 static void
9183 maybe_set_vectorized_backedge_value (loop_vec_info loop_vinfo,
9184 stmt_vec_info def_stmt_info)
9185 {
9186 tree def = gimple_get_lhs (vect_orig_stmt (def_stmt_info)->stmt);
9187 if (!def || TREE_CODE (def) != SSA_NAME)
9188 return;
9189 stmt_vec_info phi_info;
9190 imm_use_iterator iter;
9191 use_operand_p use_p;
9192 FOR_EACH_IMM_USE_FAST (use_p, iter, def)
9193 if (gphi *phi = dyn_cast <gphi *> (USE_STMT (use_p)))
9194 if (gimple_bb (phi)->loop_father->header == gimple_bb (phi)
9195 && (phi_info = loop_vinfo->lookup_stmt (phi))
9196 && STMT_VINFO_RELEVANT_P (phi_info)
9197 && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (phi_info))
9198 && STMT_VINFO_REDUC_TYPE (phi_info) != FOLD_LEFT_REDUCTION
9199 && STMT_VINFO_REDUC_TYPE (phi_info) != EXTRACT_LAST_REDUCTION)
9200 {
9201 loop_p loop = gimple_bb (phi)->loop_father;
9202 edge e = loop_latch_edge (loop);
9203 if (PHI_ARG_DEF_FROM_EDGE (phi, e) == def)
9204 {
9205 vec<gimple *> &phi_defs = STMT_VINFO_VEC_STMTS (phi_info);
9206 vec<gimple *> &latch_defs = STMT_VINFO_VEC_STMTS (def_stmt_info);
9207 gcc_assert (phi_defs.length () == latch_defs.length ());
9208 for (unsigned i = 0; i < phi_defs.length (); ++i)
9209 add_phi_arg (as_a <gphi *> (phi_defs[i]),
9210 gimple_get_lhs (latch_defs[i]), e,
9211 gimple_phi_arg_location (phi, e->dest_idx));
9212 }
9213 }
9214 }
9215
9216 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
9217 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
9218 stmt_vec_info. */
9219
9220 static bool
9221 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
9222 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
9223 {
9224 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9225 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9226
9227 if (dump_enabled_p ())
9228 dump_printf_loc (MSG_NOTE, vect_location,
9229 "------>vectorizing statement: %G", stmt_info->stmt);
9230
9231 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9232 vect_loop_kill_debug_uses (loop, stmt_info);
9233
9234 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9235 && !STMT_VINFO_LIVE_P (stmt_info))
9236 return false;
9237
9238 if (STMT_VINFO_VECTYPE (stmt_info))
9239 {
9240 poly_uint64 nunits
9241 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
9242 if (!STMT_SLP_TYPE (stmt_info)
9243 && maybe_ne (nunits, vf)
9244 && dump_enabled_p ())
9245 /* For SLP VF is set according to unrolling factor, and not
9246 to vector size, hence for SLP this print is not valid. */
9247 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9248 }
9249
9250 /* Pure SLP statements have already been vectorized. We still need
9251 to apply loop vectorization to hybrid SLP statements. */
9252 if (PURE_SLP_STMT (stmt_info))
9253 return false;
9254
9255 if (dump_enabled_p ())
9256 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
9257
9258 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
9259 *seen_store = stmt_info;
9260
9261 return true;
9262 }
9263
9264 /* Helper function to pass to simplify_replace_tree to enable replacing trees
9265 in the hash_map with their corresponding values. */
9266
9267 static tree
9268 find_in_mapping (tree t, void *context)
9269 {
9270 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
9271
9272 tree *value = mapping->get (t);
9273 return value ? *value : t;
9274 }
9275
9276 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
9277 original loop that has now been vectorized.
9278
9279 The inits of the data_references need to be advanced with the number of
9280 iterations of the main loop. This has been computed in vect_do_peeling and
9281 is stored in parameter ADVANCE. We first restore the data_references
9282 initial offset with the values recorded in ORIG_DRS_INIT.
9283
9284 Since the loop_vec_info of this EPILOGUE was constructed for the original
9285 loop, its stmt_vec_infos all point to the original statements. These need
9286 to be updated to point to their corresponding copies as well as the SSA_NAMES
9287 in their PATTERN_DEF_SEQs and RELATED_STMTs.
9288
9289 The data_references' connections also need to be updated. Their
9290 corresponding dr_vec_infos need to be reconnected to the EPILOGUE's
9291 stmt_vec_infos, their statements need to point to their corresponding copy,
9292 if they are gather loads or scatter stores then their reference needs to be
9293 updated to point to its corresponding copy and finally we set
9294 'base_misaligned' to false as we have already peeled for alignment in the
9295 prologue of the main loop. */
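/* For example (illustrative, with hypothetical SSA names): if the main
   loop contains _5 = a[i_3] and its epilogue copy contains _15 = a[i_13],
   the LHS mapping records _5 -> _15 so that pattern definition sequences
   and related statements carried over from the main loop can be rewritten
   in terms of the epilogue's SSA names.  */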
9296
9297 static void
9298 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
9299 {
9300 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
9301 auto_vec<gimple *> stmt_worklist;
9302 hash_map<tree,tree> mapping;
9303 gimple *orig_stmt, *new_stmt;
9304 gimple_stmt_iterator epilogue_gsi;
9305 gphi_iterator epilogue_phi_gsi;
9306 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
9307 basic_block *epilogue_bbs = get_loop_body (epilogue);
9308 unsigned i;
9309
9310 free (LOOP_VINFO_BBS (epilogue_vinfo));
9311 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
9312
9313 /* Advance the data_references with the number of iterations of the previous
9314 loop and its prologue. */
9315 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
9316
9317
9318 /* The EPILOGUE loop is a copy of the original loop so they share the same
9319 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
9320 point to the copied statements. We also create a mapping of all LHS' in
9321 the original loop and all the LHS' in the EPILOGUE and create worklists to
9322 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
9323 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
9324 {
9325 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
9326 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
9327 {
9328 new_stmt = epilogue_phi_gsi.phi ();
9329
9330 gcc_assert (gimple_uid (new_stmt) > 0);
9331 stmt_vinfo
9332 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9333
9334 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9335 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9336
9337 mapping.put (gimple_phi_result (orig_stmt),
9338 gimple_phi_result (new_stmt));
9339 /* PHI nodes cannot have patterns or related statements. */
9340 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
9341 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
9342 }
9343
9344 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
9345 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
9346 {
9347 new_stmt = gsi_stmt (epilogue_gsi);
9348 if (is_gimple_debug (new_stmt))
9349 continue;
9350
9351 gcc_assert (gimple_uid (new_stmt) > 0);
9352 stmt_vinfo
9353 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
9354
9355 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
9356 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
9357
9358 if (tree old_lhs = gimple_get_lhs (orig_stmt))
9359 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
9360
9361 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
9362 {
9363 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
9364 for (gimple_stmt_iterator gsi = gsi_start (seq);
9365 !gsi_end_p (gsi); gsi_next (&gsi))
9366 stmt_worklist.safe_push (gsi_stmt (gsi));
9367 }
9368
9369 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
9370 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
9371 {
9372 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
9373 stmt_worklist.safe_push (stmt);
9374 /* Set BB such that the assert in
9375 'get_initial_def_for_reduction' is able to determine that
9376 the BB of the related stmt is inside this loop. */
9377 gimple_set_bb (stmt,
9378 gimple_bb (new_stmt));
9379 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
9380 gcc_assert (related_vinfo == NULL
9381 || related_vinfo == stmt_vinfo);
9382 }
9383 }
9384 }
9385
9386 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
9387 using the original main loop and thus need to be updated to refer to the
9388 cloned variables used in the epilogue. */
9389 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
9390 {
9391 gimple *stmt = stmt_worklist[i];
9392 tree *new_op;
9393
9394 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
9395 {
9396 tree op = gimple_op (stmt, j);
9397 if ((new_op = mapping.get(op)))
9398 gimple_set_op (stmt, j, *new_op);
9399 else
9400 {
9401 /* PR92429: The last argument of simplify_replace_tree disables
9402 folding when replacing arguments. This is required as
9403 otherwise you might end up with different statements than the
9404 ones analyzed in vect_loop_analyze, leading to different
9405 vectorization. */
9406 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
9407 &find_in_mapping, &mapping, false);
9408 gimple_set_op (stmt, j, op);
9409 }
9410 }
9411 }
9412
9413 struct data_reference *dr;
9414 vec<data_reference_p> datarefs = LOOP_VINFO_DATAREFS (epilogue_vinfo);
9415 FOR_EACH_VEC_ELT (datarefs, i, dr)
9416 {
9417 orig_stmt = DR_STMT (dr);
9418 gcc_assert (gimple_uid (orig_stmt) > 0);
9419 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
9420 /* Data references for gather loads and scatter stores do not use the
9421 updated offset we set using ADVANCE. Instead we have to make sure the
9422 reference in each data reference points to the corresponding copy of
9423 the original in the epilogue. */
9424 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
9425 == VMAT_GATHER_SCATTER)
9426 {
9427 DR_REF (dr)
9428 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
9429 &find_in_mapping, &mapping);
9430 DR_BASE_ADDRESS (dr)
9431 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
9432 &find_in_mapping, &mapping);
9433 }
9434 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
9435 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
9436 /* The vector size of the epilogue is smaller than that of the main loop
9437 so the alignment requirement is either the same or lower. This means
9438 the dr will by definition be aligned. */
9439 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
9440 }
9441
9442 epilogue_vinfo->shared->datarefs_copy.release ();
9443 epilogue_vinfo->shared->save_datarefs ();
9444 }
9445
9446 /* Function vect_transform_loop.
9447
9448 The analysis phase has determined that the loop is vectorizable.
9449 Vectorize the loop - create vectorized stmts to replace the scalar
9450 stmts in the loop, and update the loop exit condition.
9451 Returns scalar epilogue loop if any. */
9452
9453 class loop *
9454 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
9455 {
9456 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9457 class loop *epilogue = NULL;
9458 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
9459 int nbbs = loop->num_nodes;
9460 int i;
9461 tree niters_vector = NULL_TREE;
9462 tree step_vector = NULL_TREE;
9463 tree niters_vector_mult_vf = NULL_TREE;
9464 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9465 unsigned int lowest_vf = constant_lower_bound (vf);
9466 gimple *stmt;
9467 bool check_profitability = false;
9468 unsigned int th;
9469
9470 DUMP_VECT_SCOPE ("vec_transform_loop");
9471
9472 loop_vinfo->shared->check_datarefs ();
9473
9474 /* Use the more conservative vectorization threshold. If the number
9475 of iterations is constant assume the cost check has been performed
9476 by our caller. If the threshold makes all loops profitable that
9477 run at least the (estimated) vectorization factor number of times
9478 checking is pointless, too. */
9479 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
9480 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
9481 {
9482 if (dump_enabled_p ())
9483 dump_printf_loc (MSG_NOTE, vect_location,
9484 "Profitability threshold is %d loop iterations.\n",
9485 th);
9486 check_profitability = true;
9487 }
9488
9489 /* Make sure there exists a single-predecessor exit bb. Do this before
9490 versioning. */
9491 edge e = single_exit (loop);
9492 if (! single_pred_p (e->dest))
9493 {
9494 split_loop_exit_edge (e, true);
9495 if (dump_enabled_p ())
9496 dump_printf (MSG_NOTE, "split exit edge\n");
9497 }
9498
9499 /* Version the loop first, if required, so the profitability check
9500 comes first. */
9501
9502 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
9503 {
9504 class loop *sloop
9505 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
9506 sloop->force_vectorize = false;
9507 check_profitability = false;
9508 }
9509
9510 /* Make sure there exists a single-predecessor exit bb also on the
9511 scalar loop copy. Do this after versioning but before peeling
9512 so CFG structure is fine for both scalar and if-converted loop
9513 to make slpeel_duplicate_current_defs_from_edges face matched
9514 loop closed PHI nodes on the exit. */
9515 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9516 {
9517 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
9518 if (! single_pred_p (e->dest))
9519 {
9520 split_loop_exit_edge (e, true);
9521 if (dump_enabled_p ())
9522 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
9523 }
9524 }
9525
9526 tree niters = vect_build_loop_niters (loop_vinfo);
9527 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
9528 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
9529 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
9530 tree advance;
9531 drs_init_vec orig_drs_init;
9532
9533 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
9534 &step_vector, &niters_vector_mult_vf, th,
9535 check_profitability, niters_no_overflow,
9536 &advance);
9537
9538 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
9539 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
9540 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
9541 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
9542
9543 if (niters_vector == NULL_TREE)
9544 {
9545 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
9546 && !LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo)
9547 && known_eq (lowest_vf, vf))
9548 {
9549 niters_vector
9550 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
9551 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
9552 step_vector = build_one_cst (TREE_TYPE (niters));
9553 }
9554 else if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9555 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
9556 &step_vector, niters_no_overflow);
9557 else
9558 /* vect_do_peeling subtracted the number of peeled prologue
9559 iterations from LOOP_VINFO_NITERS. */
9560 vect_gen_vector_loop_niters (loop_vinfo, LOOP_VINFO_NITERS (loop_vinfo),
9561 &niters_vector, &step_vector,
9562 niters_no_overflow);
9563 }
9564
9565 /* 1) Make sure the loop header has exactly two entries
9566 2) Make sure we have a preheader basic block. */
9567
9568 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
9569
9570 split_edge (loop_preheader_edge (loop));
9571
9572 if (vect_use_loop_mask_for_alignment_p (loop_vinfo))
9573 /* This will deal with any possible peeling. */
9574 vect_prepare_for_masked_peels (loop_vinfo);
9575
9576 /* Schedule the SLP instances first, then handle loop vectorization
9577 below. */
9578 if (!loop_vinfo->slp_instances.is_empty ())
9579 {
9580 DUMP_VECT_SCOPE ("scheduling SLP instances");
9581 vect_schedule_slp (loop_vinfo, LOOP_VINFO_SLP_INSTANCES (loop_vinfo));
9582 }
9583
9584 /* FORNOW: the vectorizer supports only loops whose body consists
9585 of one basic block (header + empty latch). When the vectorizer
9586 supports more involved loop forms, the order in which the BBs are
9587 traversed needs to be reconsidered. */
9588
9589 for (i = 0; i < nbbs; i++)
9590 {
9591 basic_block bb = bbs[i];
9592 stmt_vec_info stmt_info;
9593
9594 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9595 gsi_next (&si))
9596 {
9597 gphi *phi = si.phi ();
9598 if (dump_enabled_p ())
9599 dump_printf_loc (MSG_NOTE, vect_location,
9600 "------>vectorizing phi: %G", phi);
9601 stmt_info = loop_vinfo->lookup_stmt (phi);
9602 if (!stmt_info)
9603 continue;
9604
9605 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
9606 vect_loop_kill_debug_uses (loop, stmt_info);
9607
9608 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9609 && !STMT_VINFO_LIVE_P (stmt_info))
9610 continue;
9611
9612 if (STMT_VINFO_VECTYPE (stmt_info)
9613 && (maybe_ne
9614 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
9615 && dump_enabled_p ())
9616 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
9617
9618 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9619 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9620 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9621 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9622 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9623 && ! PURE_SLP_STMT (stmt_info))
9624 {
9625 if (dump_enabled_p ())
9626 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
9627 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
9628 }
9629 }
9630
9631 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
9632 gsi_next (&si))
9633 {
9634 gphi *phi = si.phi ();
9635 stmt_info = loop_vinfo->lookup_stmt (phi);
9636 if (!stmt_info)
9637 continue;
9638
9639 if (!STMT_VINFO_RELEVANT_P (stmt_info)
9640 && !STMT_VINFO_LIVE_P (stmt_info))
9641 continue;
9642
9643 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
9644 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
9645 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
9646 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
9647 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
9648 && ! PURE_SLP_STMT (stmt_info))
9649 maybe_set_vectorized_backedge_value (loop_vinfo, stmt_info);
9650 }
9651
9652 for (gimple_stmt_iterator si = gsi_start_bb (bb);
9653 !gsi_end_p (si);)
9654 {
9655 stmt = gsi_stmt (si);
9656 /* During vectorization remove existing clobber stmts. */
9657 if (gimple_clobber_p (stmt))
9658 {
9659 unlink_stmt_vdef (stmt);
9660 gsi_remove (&si, true);
9661 release_defs (stmt);
9662 }
9663 else
9664 {
9665 /* Ignore vector stmts created in the outer loop. */
9666 stmt_info = loop_vinfo->lookup_stmt (stmt);
9667
9668 /* vector stmts created in the outer-loop during vectorization of
9669 stmts in an inner-loop may not have a stmt_info, and do not
9670 need to be vectorized. */
9671 stmt_vec_info seen_store = NULL;
9672 if (stmt_info)
9673 {
9674 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
9675 {
9676 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
9677 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
9678 !gsi_end_p (subsi); gsi_next (&subsi))
9679 {
9680 stmt_vec_info pat_stmt_info
9681 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
9682 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9683 &si, &seen_store);
9684 }
9685 stmt_vec_info pat_stmt_info
9686 = STMT_VINFO_RELATED_STMT (stmt_info);
9687 if (vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
9688 &si, &seen_store))
9689 maybe_set_vectorized_backedge_value (loop_vinfo,
9690 pat_stmt_info);
9691 }
9692 else
9693 {
9694 if (vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
9695 &seen_store))
9696 maybe_set_vectorized_backedge_value (loop_vinfo,
9697 stmt_info);
9698 }
9699 }
9700 gsi_next (&si);
9701 if (seen_store)
9702 {
9703 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
9704 /* Interleaving.  The vectorization of the
9705 interleaving chain was completed - free
9706 all the stores in the chain. */
9707 vect_remove_stores (loop_vinfo,
9708 DR_GROUP_FIRST_ELEMENT (seen_store));
9709 else
9710 /* Free the attached stmt_vec_info and remove the stmt. */
9711 loop_vinfo->remove_stmt (stmt_info);
9712 }
9713 }
9714 }
9715
9716 /* Stub out scalar statements that must not survive vectorization.
9717 Doing this here helps with grouped statements, or statements that
9718 are involved in patterns. */
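 /* A hedged sketch of the stubbing below (the SSA names are made up).
    A masked load whose lhs is scalar, e.g.
      x_1 = .MASK_LOAD (ptr_2, align, mask_3);
    is replaced by
      x_1 = 0;
    and a conditional internal function with a scalar lhs, e.g.
      y_4 = .COND_ADD (mask_3, a_5, b_6, else_7);
    is replaced by
      y_4 = else_7;
    so that no unvectorized masked operation survives the transform.  */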
9719 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
9720 !gsi_end_p (gsi); gsi_next (&gsi))
9721 {
9722 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
9723 if (!call || !gimple_call_internal_p (call))
9724 continue;
9725 internal_fn ifn = gimple_call_internal_fn (call);
9726 if (ifn == IFN_MASK_LOAD)
9727 {
9728 tree lhs = gimple_get_lhs (call);
9729 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9730 {
9731 tree zero = build_zero_cst (TREE_TYPE (lhs));
9732 gimple *new_stmt = gimple_build_assign (lhs, zero);
9733 gsi_replace (&gsi, new_stmt, true);
9734 }
9735 }
9736 else if (conditional_internal_fn_code (ifn) != ERROR_MARK)
9737 {
9738 tree lhs = gimple_get_lhs (call);
9739 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9740 {
9741 tree else_arg
9742 = gimple_call_arg (call, gimple_call_num_args (call) - 1);
9743 gimple *new_stmt = gimple_build_assign (lhs, else_arg);
9744 gsi_replace (&gsi, new_stmt, true);
9745 }
9746 }
9747 }
9748 } /* BBs in loop */
9749
9750 /* The vectorization factor is always > 1, so if we use an IV increment
9751 of 1, a zero NITERS becomes a nonzero NITERS_VECTOR. */
9752 if (integer_onep (step_vector))
9753 niters_no_overflow = true;
9754 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
9755 niters_vector_mult_vf, !niters_no_overflow);
9756
9757 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
9758 scale_profile_for_vect_loop (loop, assumed_vf);
9759
9760 /* True if the final iteration might not handle a full vector's
9761 worth of scalar iterations. */
9762 bool final_iter_may_be_partial
9763 = LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo);
9764 /* The minimum number of iterations performed by the epilogue. This
9765 is 1 when peeling for gaps because we always need a final scalar
9766 iteration. */
9767 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
9768 /* +1 to convert latch counts to loop iteration counts,
9769 -min_epilogue_iters to remove iterations that cannot be performed
9770 by the vector code. */
9771 int bias_for_lowest = 1 - min_epilogue_iters;
9772 int bias_for_assumed = bias_for_lowest;
9773 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
9774 if (alignment_npeels && LOOP_VINFO_USING_PARTIAL_VECTORS_P (loop_vinfo))
9775 {
9776 /* When the amount of peeling is known at compile time, the first
9777 iteration will have exactly alignment_npeels active elements.
9778 In the worst case it will have at least one. */
9779 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
9780 bias_for_lowest += lowest_vf - min_first_active;
9781 bias_for_assumed += assumed_vf - min_first_active;
9782 }
9783 /* In these calculations the "- 1" converts loop iteration counts
9784 back to latch counts. */
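 /* Hedged worked example with hypothetical numbers: given an upper bound
    of 99 latch iterations, min_epilogue_iters == 0 and lowest_vf == 8,
    bias_for_lowest is 1, so without partial vectors the bound becomes
    (99 + 1) / 8 - 1 == 11 latch iterations of the vector loop, i.e. at
    most 12 vector iterations.  */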
9785 if (loop->any_upper_bound)
9786 {
9787 loop_vec_info main_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
9788 loop->nb_iterations_upper_bound
9789 = (final_iter_may_be_partial
9790 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
9791 lowest_vf) - 1
9792 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
9793 lowest_vf) - 1);
9794 if (main_vinfo)
9795 {
9796 unsigned int bound;
9797 poly_uint64 main_iters
9798 = upper_bound (LOOP_VINFO_VECT_FACTOR (main_vinfo),
9799 LOOP_VINFO_COST_MODEL_THRESHOLD (main_vinfo));
9800 main_iters
9801 = upper_bound (main_iters,
9802 LOOP_VINFO_VERSIONING_THRESHOLD (main_vinfo));
9803 if (can_div_away_from_zero_p (main_iters,
9804 LOOP_VINFO_VECT_FACTOR (loop_vinfo),
9805 &bound))
9806 loop->nb_iterations_upper_bound
9807 = wi::umin ((widest_int) (bound - 1),
9808 loop->nb_iterations_upper_bound);
9809 }
9810 }
9811 if (loop->any_likely_upper_bound)
9812 loop->nb_iterations_likely_upper_bound
9813 = (final_iter_may_be_partial
9814 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
9815 + bias_for_lowest, lowest_vf) - 1
9816 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
9817 + bias_for_lowest, lowest_vf) - 1);
9818 if (loop->any_estimate)
9819 loop->nb_iterations_estimate
9820 = (final_iter_may_be_partial
9821 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
9822 assumed_vf) - 1
9823 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
9824 assumed_vf) - 1);
9825
9826 if (dump_enabled_p ())
9827 {
9828 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9829 {
9830 dump_printf_loc (MSG_NOTE, vect_location,
9831 "LOOP VECTORIZED\n");
9832 if (loop->inner)
9833 dump_printf_loc (MSG_NOTE, vect_location,
9834 "OUTER LOOP VECTORIZED\n");
9835 dump_printf (MSG_NOTE, "\n");
9836 }
9837 else
9838 dump_printf_loc (MSG_NOTE, vect_location,
9839 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
9840 GET_MODE_NAME (loop_vinfo->vector_mode));
9841 }
9842
9843 /* Loops vectorized with a variable factor won't benefit from
9844 unrolling/peeling. */
9845 if (!vf.is_constant ())
9846 {
9847 loop->unroll = 1;
9848 if (dump_enabled_p ())
9849 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
9850 " variable-length vectorization factor\n");
9851 }
9852 /* Free SLP instances here because otherwise stmt reference counting
9853 won't work. */
9854 slp_instance instance;
9855 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9856 vect_free_slp_instance (instance);
9857 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9858 /* Clear the safelen field since its value is invalid after vectorization:
9859 the vectorized loop can now have loop-carried dependencies. */
9860 loop->safelen = 0;
9861
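 /* If peeling created an epilogue loop, return it to the caller; clearing
    dont_vectorize below allows the epilogue to be considered for
    vectorization in turn, typically with a smaller vector mode (compare
    the "LOOP EPILOGUE VECTORIZED" dump above).  */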
9862 if (epilogue)
9863 {
9864 update_epilogue_loop_vinfo (epilogue, advance);
9865
9866 epilogue->simduid = loop->simduid;
9867 epilogue->force_vectorize = loop->force_vectorize;
9868 epilogue->dont_vectorize = false;
9869 }
9870
9871 return epilogue;
9872 }
9873
9874 /* The code below performs a simple optimization - it reverts
9875 if-conversion for masked stores: if the mask of a store is zero, skip
9876 the store and, where possible, the producers of the stored values too.
9877 For example,
9878 for (i=0; i<n; i++)
9879 if (c[i])
9880 {
9881 p1[i] += 1;
9882 p2[i] = p3[i] + 2;
9883 }
9884 this transformation will produce the following semi-hammock:
9885
9886 if (mask__ifc__42.18_165 != { 0, 0, 0, 0, 0, 0, 0, 0 })
9887 {
9888 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9889 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9890 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9891 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9892 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9893 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9894 }
9895 */
9896
9897 void
9898 optimize_mask_stores (class loop *loop)
9899 {
9900 basic_block *bbs = get_loop_body (loop);
9901 unsigned nbbs = loop->num_nodes;
9902 unsigned i;
9903 basic_block bb;
9904 class loop *bb_loop;
9905 gimple_stmt_iterator gsi;
9906 gimple *stmt;
9907 auto_vec<gimple *> worklist;
9908 auto_purge_vect_location sentinel;
9909
9910 vect_location = find_loop_location (loop);
9911 /* Pick up all masked stores in the loop, if any. */
9912 for (i = 0; i < nbbs; i++)
9913 {
9914 bb = bbs[i];
9915 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
9916 gsi_next (&gsi))
9917 {
9918 stmt = gsi_stmt (gsi);
9919 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
9920 worklist.safe_push (stmt);
9921 }
9922 }
9923
9924 free (bbs);
9925 if (worklist.is_empty ())
9926 return;
9927
9928 /* Loop has masked stores. */
9929 while (!worklist.is_empty ())
9930 {
9931 gimple *last, *last_store;
9932 edge e, efalse;
9933 tree mask;
9934 basic_block store_bb, join_bb;
9935 gimple_stmt_iterator gsi_to;
9936 tree vdef, new_vdef;
9937 gphi *phi;
9938 tree vectype;
9939 tree zero;
9940
9941 last = worklist.pop ();
9942 mask = gimple_call_arg (last, 2);
9943 bb = gimple_bb (last);
9944 /* Create then_bb and the if-then structure in the CFG; then_bb belongs
9945 to the same loop as if_bb.  That loop can differ from LOOP when a
9946 two-level loop nest is vectorized and the mask_store belongs to the
9947 inner loop. */
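 /* Roughly, the CFG built below is (a sketch, not an exact dump):
      bb:        if (mask == { 0, ... }) goto join_bb; else goto store_bb;
      store_bb:  receives the masked stores and any movable producers;
      join_bb:   joins the fall-through from store_bb and the true edge.  */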
9948 e = split_block (bb, last);
9949 bb_loop = bb->loop_father;
9950 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
9951 join_bb = e->dest;
9952 store_bb = create_empty_bb (bb);
9953 add_bb_to_loop (store_bb, bb_loop);
9954 e->flags = EDGE_TRUE_VALUE;
9955 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
9956 /* Give the edge into STORE_BB an unlikely probability. */
9957 efalse->probability = profile_probability::unlikely ();
9958 store_bb->count = efalse->count ();
9959 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
9960 if (dom_info_available_p (CDI_DOMINATORS))
9961 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
9962 if (dump_enabled_p ())
9963 dump_printf_loc (MSG_NOTE, vect_location,
9964 "Create new block %d to sink mask stores.",
9965 store_bb->index);
9966 /* Create vector comparison with boolean result. */
9967 vectype = TREE_TYPE (mask);
9968 zero = build_zero_cst (vectype);
9969 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
9970 gsi = gsi_last_bb (bb);
9971 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
9972 /* Create new PHI node for vdef of the last masked store:
9973 .MEM_2 = VDEF <.MEM_1>
9974 will be converted to
9975 .MEM.3 = VDEF <.MEM_1>
9976 and new PHI node will be created in join bb
9977 .MEM_2 = PHI <.MEM_1, .MEM_3>
9978 */
9979 vdef = gimple_vdef (last);
9980 new_vdef = make_ssa_name (gimple_vop (cfun), last);
9981 gimple_set_vdef (last, new_vdef);
9982 phi = create_phi_node (vdef, join_bb);
9983 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
9984
9985 /* Put all masked stores with the same mask to STORE_BB if possible. */
9986 while (true)
9987 {
9988 gimple_stmt_iterator gsi_from;
9989 gimple *stmt1 = NULL;
9990
9991 /* Move masked store to STORE_BB. */
9992 last_store = last;
9993 gsi = gsi_for_stmt (last);
9994 gsi_from = gsi;
9995 /* Shift GSI to the previous stmt for further traversal. */
9996 gsi_prev (&gsi);
9997 gsi_to = gsi_start_bb (store_bb);
9998 gsi_move_before (&gsi_from, &gsi_to);
9999 /* Set GSI_TO to the start of the now non-empty block. */
10000 gsi_to = gsi_start_bb (store_bb);
10001 if (dump_enabled_p ())
10002 dump_printf_loc (MSG_NOTE, vect_location,
10003 "Move stmt to created bb\n%G", last);
10004 /* Move all stored value producers if possible. */
10005 while (!gsi_end_p (gsi))
10006 {
10007 tree lhs;
10008 imm_use_iterator imm_iter;
10009 use_operand_p use_p;
10010 bool res;
10011
10012 /* Skip debug statements. */
10013 if (is_gimple_debug (gsi_stmt (gsi)))
10014 {
10015 gsi_prev (&gsi);
10016 continue;
10017 }
10018 stmt1 = gsi_stmt (gsi);
10019 /* Do not consider statements that write to memory or have
10020 a volatile operand. */
10021 if (gimple_vdef (stmt1)
10022 || gimple_has_volatile_ops (stmt1))
10023 break;
10024 gsi_from = gsi;
10025 gsi_prev (&gsi);
10026 lhs = gimple_get_lhs (stmt1);
10027 if (!lhs)
10028 break;
10029
10030 /* LHS of vectorized stmt must be SSA_NAME. */
10031 if (TREE_CODE (lhs) != SSA_NAME)
10032 break;
10033
10034 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
10035 {
10036 /* Remove dead scalar statement. */
10037 if (has_zero_uses (lhs))
10038 {
10039 gsi_remove (&gsi_from, true);
10040 continue;
10041 }
10042 }
10043
10044 /* Check that LHS does not have uses outside of STORE_BB. */
10045 res = true;
10046 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
10047 {
10048 gimple *use_stmt;
10049 use_stmt = USE_STMT (use_p);
10050 if (is_gimple_debug (use_stmt))
10051 continue;
10052 if (gimple_bb (use_stmt) != store_bb)
10053 {
10054 res = false;
10055 break;
10056 }
10057 }
10058 if (!res)
10059 break;
10060
10061 if (gimple_vuse (stmt1)
10062 && gimple_vuse (stmt1) != gimple_vuse (last_store))
10063 break;
10064
10065 /* Can move STMT1 to STORE_BB. */
10066 if (dump_enabled_p ())
10067 dump_printf_loc (MSG_NOTE, vect_location,
10068 "Move stmt to created bb\n%G", stmt1);
10069 gsi_move_before (&gsi_from, &gsi_to);
10070 /* Shift GSI_TO for further insertion. */
10071 gsi_prev (&gsi_to);
10072 }
10073 /* Put other masked stores with the same mask to STORE_BB. */
10074 if (worklist.is_empty ()
10075 || gimple_call_arg (worklist.last (), 2) != mask
10076 || worklist.last () != stmt1)
10077 break;
10078 last = worklist.pop ();
10079 }
10080 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
10081 }
10082 }
10083
10084 /* Decide whether it is possible to use a zero-based induction variable
10085 when vectorizing LOOP_VINFO with partial vectors. If it is, return
10086 the value that the induction variable must be able to hold in order
10087 to ensure that the rgroups eventually have no active vector elements.
10088 Return -1 otherwise. */
10089
10090 widest_int
10091 vect_iv_limit_for_partial_vectors (loop_vec_info loop_vinfo)
10092 {
10093 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
10094 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
10095 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
10096
10097 /* Calculate the value that the induction variable must be able
10098 to hit in order to ensure that we end the loop with an all-false mask.
10099 This involves adding the maximum number of inactive trailing scalar
10100 iterations. */
10101 widest_int iv_limit = -1;
10102 if (max_loop_iterations (loop, &iv_limit))
10103 {
10104 if (niters_skip)
10105 {
10106 /* Add the maximum number of skipped iterations to the
10107 maximum iteration count. */
10108 if (TREE_CODE (niters_skip) == INTEGER_CST)
10109 iv_limit += wi::to_widest (niters_skip);
10110 else
10111 iv_limit += max_vf - 1;
10112 }
10113 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
10114 /* Make a conservatively-correct assumption. */
10115 iv_limit += max_vf - 1;
10116
10117 /* IV_LIMIT is the maximum number of latch iterations, which is also
10118 the maximum in-range IV value. Round this value down to the previous
10119 vector alignment boundary and then add an extra full iteration. */
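 /* Hypothetical numbers, purely for illustration: with a constant VF of
    8, max_vf == 8 and iv_limit == 103, the statement below computes
    (103 & -8) + 8 == 96 + 8 == 104, i.e. the limit is rounded down to a
    multiple of the VF and one extra full vector iteration is added.  */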
10120 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
10121 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
10122 }
10123 return iv_limit;
10124 }
10125
10126 /* For the given rgroup_controls RGC, check whether an induction variable
10127 would ever hit a value that produces a set of all-false masks or zero
10128 lengths before wrapping around.  Return true if it is possible to wrap
10129 around before hitting the desired value, otherwise return false. */
10130
10131 bool
10132 vect_rgroup_iv_might_wrap_p (loop_vec_info loop_vinfo, rgroup_controls *rgc)
10133 {
10134 widest_int iv_limit = vect_iv_limit_for_partial_vectors (loop_vinfo);
10135
10136 if (iv_limit == -1)
10137 return true;
10138
10139 tree compare_type = LOOP_VINFO_RGROUP_COMPARE_TYPE (loop_vinfo);
10140 unsigned int compare_precision = TYPE_PRECISION (compare_type);
10141 unsigned nitems = rgc->max_nscalars_per_iter * rgc->factor;
10142
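 /* Illustrative numbers only: if iv_limit == 65536 and nitems == 4, the
    product 262144 needs 19 bits as an unsigned value, so with a 16-bit
    COMPARE_TYPE the IV could wrap before ever producing an all-false mask
    or zero length, and the function returns true.  */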
10143 if (wi::min_precision (iv_limit * nitems, UNSIGNED) > compare_precision)
10144 return true;
10145
10146 return false;
10147 }