gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "tree-scalar-evolution.h"
47 #include "tree-vectorizer.h"
48 #include "gimple-fold.h"
49 #include "cgraph.h"
50 #include "tree-cfg.h"
51 #include "tree-if-conv.h"
52 #include "internal-fn.h"
53 #include "tree-vector-builder.h"
54 #include "vec-perm-indices.h"
55 #include "tree-eh.h"
56
57 /* Loop Vectorization Pass.
58
59 This pass tries to vectorize loops.
60
61 For example, the vectorizer transforms the following simple loop:
62
63 short a[N]; short b[N]; short c[N]; int i;
64
65 for (i=0; i<N; i++){
66 a[i] = b[i] + c[i];
67 }
68
69 as if it had been manually vectorized by rewriting the source code into:
70
71 typedef int __attribute__((mode(V8HI))) v8hi;
72 short a[N]; short b[N]; short c[N]; int i;
73 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
74 v8hi va, vb, vc;
75
76 for (i=0; i<N/8; i++){
77 vb = pb[i];
78 vc = pc[i];
79 va = vb + vc;
80 pa[i] = va;
81 }
82
83 The main entry to this pass is vectorize_loops(), in which
84 the vectorizer applies a set of analyses on a given set of loops,
85 followed by the actual vectorization transformation for the loops that
86 had successfully passed the analysis phase.
87 Throughout this pass we make a distinction between two types of
88 data: scalars (which are represented by SSA_NAMES), and memory references
89 ("data-refs"). These two types of data require different handling both
90 during analysis and transformation. The types of data-refs that the
91 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
92 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
93 accesses are required to have a simple (consecutive) access pattern.
94
95 Analysis phase:
96 ===============
97 The driver for the analysis phase is vect_analyze_loop().
98 It applies a set of analyses, some of which rely on the scalar evolution
99 analyzer (scev) developed by Sebastian Pop.
100
101 During the analysis phase the vectorizer records some information
102 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
103 loop, as well as general information about the loop as a whole, which is
104 recorded in a "loop_vec_info" struct attached to each loop.
105
106 Transformation phase:
107 =====================
108 The loop transformation phase scans all the stmts in the loop, and
109 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
110 the loop that needs to be vectorized. It inserts the vector code sequence
111 just before the scalar stmt S, and records a pointer to the vector code
112 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
113 attached to S). This pointer will be used for the vectorization of following
114 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
115 otherwise, we rely on dead code elimination for removing it.
116
117 For example, say stmt S1 was vectorized into stmt VS1:
118
119 VS1: vb = px[i];
120 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
121 S2: a = b;
122
123 To vectorize stmt S2, the vectorizer first finds the stmt that defines
124 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
125 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
126 resulting sequence would be:
127
128 VS1: vb = px[i];
129 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
130 VS2: va = vb;
131 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
132
133 Operands that are not SSA_NAMEs are data-refs that appear in
134 load/store operations (like 'x[i]' in S1), and are handled differently.
135
136 Target modeling:
137 =================
138 Currently the only target specific information that is used is the
139 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
140 Targets that can support different sizes of vectors will, for now, need
141 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
142 flexibility will be added in the future.
143
144 Since we only vectorize operations whose vector form can be
145 expressed using existing tree codes, to verify that an operation is
146 supported, the vectorizer checks the relevant optab at the relevant
147 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
148 the value found is CODE_FOR_nothing, then there's no target support, and
149 we can't vectorize the stmt.
150
151 For additional information on this project see:
152 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
153 */
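/* As a concrete illustration of the support check described above (a
   sketch only, not code used by this pass; give_up () stands in for
   whatever failure path a caller would take):

     machine_mode vmode = V8HImode;
     if (optab_handler (add_optab, vmode) == CODE_FOR_nothing)
       give_up ();   // no V8HI addition on this target, cannot vectorize
*/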
154
155 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
156 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
157 bool *, bool *);
158
159 /* Subroutine of vect_determine_vf_for_stmt that handles only one
160 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
161 may already be set for general statements (not just data refs). */
162
163 static opt_result
164 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
165 bool vectype_maybe_set_p,
166 poly_uint64 *vf)
167 {
168 gimple *stmt = stmt_info->stmt;
169
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
173 {
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return opt_result::success ();
177 }
178
179 tree stmt_vectype, nunits_vectype;
180 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
181 &stmt_vectype,
182 &nunits_vectype);
183 if (!res)
184 return res;
185
186 if (stmt_vectype)
187 {
188 if (STMT_VINFO_VECTYPE (stmt_info))
189 /* The only case when a vectype had been already set is for stmts
190 that contain a data ref, or for "pattern-stmts" (stmts generated
191 by the vectorizer to represent/replace a certain idiom). */
192 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
193 || vectype_maybe_set_p)
194 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
195 else
196 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
197 }
198
199 if (nunits_vectype)
200 vect_update_max_nunits (vf, nunits_vectype);
201
202 return opt_result::success ();
203 }
204
205 /* Subroutine of vect_determine_vectorization_factor. Set the vector
206 types of STMT_INFO and all attached pattern statements and update
207 the vectorization factor VF accordingly. Return true on success
208 or false if something prevented vectorization. */
209
210 static opt_result
211 vect_determine_vf_for_stmt (vec_info *vinfo,
212 stmt_vec_info stmt_info, poly_uint64 *vf)
213 {
214 if (dump_enabled_p ())
215 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
216 stmt_info->stmt);
217 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
218 if (!res)
219 return res;
220
221 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
222 && STMT_VINFO_RELATED_STMT (stmt_info))
223 {
224 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
225 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
226
227 /* If a pattern statement has def stmts, analyze them too. */
228 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
229 !gsi_end_p (si); gsi_next (&si))
230 {
231 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
232 if (dump_enabled_p ())
233 dump_printf_loc (MSG_NOTE, vect_location,
234 "==> examining pattern def stmt: %G",
235 def_stmt_info->stmt);
236 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
237 if (!res)
238 return res;
239 }
240
241 if (dump_enabled_p ())
242 dump_printf_loc (MSG_NOTE, vect_location,
243 "==> examining pattern statement: %G",
244 stmt_info->stmt);
245 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
246 if (!res)
247 return res;
248 }
249
250 return opt_result::success ();
251 }
252
253 /* Function vect_determine_vectorization_factor
254
255 Determine the vectorization factor (VF). VF is the number of data elements
256 that are operated upon in parallel in a single iteration of the vectorized
257 loop. For example, when vectorizing a loop that operates on 4-byte elements,
258 on a target with a 16-byte vector size (VS), the VF is set to 4, since 4
259 elements can fit in a single vector register.
260
261 We currently support vectorization of loops in which all types operated upon
262 are of the same size. Therefore this function currently sets VF according to
263 the size of the types operated upon, and fails if there are multiple sizes
264 in the loop.
265
266 VF is also the factor by which the loop iterations are strip-mined, e.g.:
267 original loop:
268 for (i=0; i<N; i++){
269 a[i] = b[i] + c[i];
270 }
271
272 vectorized loop:
273 for (i=0; i<N; i+=VF){
274 a[i:VF] = b[i:VF] + c[i:VF];
275 }
276 */
277
278 static opt_result
279 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
280 {
281 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
282 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
283 unsigned nbbs = loop->num_nodes;
284 poly_uint64 vectorization_factor = 1;
285 tree scalar_type = NULL_TREE;
286 gphi *phi;
287 tree vectype;
288 stmt_vec_info stmt_info;
289 unsigned i;
290
291 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
292
293 for (i = 0; i < nbbs; i++)
294 {
295 basic_block bb = bbs[i];
296
297 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
298 gsi_next (&si))
299 {
300 phi = si.phi ();
301 stmt_info = loop_vinfo->lookup_stmt (phi);
302 if (dump_enabled_p ())
303 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
304 phi);
305
306 gcc_assert (stmt_info);
307
308 if (STMT_VINFO_RELEVANT_P (stmt_info)
309 || STMT_VINFO_LIVE_P (stmt_info))
310 {
311 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
312 scalar_type = TREE_TYPE (PHI_RESULT (phi));
313
314 if (dump_enabled_p ())
315 dump_printf_loc (MSG_NOTE, vect_location,
316 "get vectype for scalar type: %T\n",
317 scalar_type);
318
319 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
320 if (!vectype)
321 return opt_result::failure_at (phi,
322 "not vectorized: unsupported "
323 "data-type %T\n",
324 scalar_type);
325 STMT_VINFO_VECTYPE (stmt_info) = vectype;
326
327 if (dump_enabled_p ())
328 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
329 vectype);
330
331 if (dump_enabled_p ())
332 {
333 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
334 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
335 dump_printf (MSG_NOTE, "\n");
336 }
337
338 vect_update_max_nunits (&vectorization_factor, vectype);
339 }
340 }
341
342 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
343 gsi_next (&si))
344 {
345 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
346 opt_result res
347 = vect_determine_vf_for_stmt (loop_vinfo,
348 stmt_info, &vectorization_factor);
349 if (!res)
350 return res;
351 }
352 }
353
354 /* TODO: Analyze cost. Decide if worth while to vectorize. */
355 if (dump_enabled_p ())
356 {
357 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
358 dump_dec (MSG_NOTE, vectorization_factor);
359 dump_printf (MSG_NOTE, "\n");
360 }
361
362 if (known_le (vectorization_factor, 1U))
363 return opt_result::failure_at (vect_location,
364 "not vectorized: unsupported data-type\n");
365 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
366 return opt_result::success ();
367 }
368
369
370 /* Function vect_is_simple_iv_evolution.
371
372 FORNOW: A simple evolution of an induction variable in the loop is
373 considered a polynomial evolution. */
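/* For example (illustration only): for a canonical IV such as

     for (i = 0; i < n; i++)

   the scalar evolution of i is the chrec {0, +, 1}_loop, so *INIT is 0
   and *STEP is 1.  An evolution whose step is itself a chrec (degree
   >= 2) is rejected below.  */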
374
375 static bool
376 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
377 tree * step)
378 {
379 tree init_expr;
380 tree step_expr;
381 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
382 basic_block bb;
383
384 /* When there is no evolution in this loop, the evolution function
385 is not "simple". */
386 if (evolution_part == NULL_TREE)
387 return false;
388
389 /* When the evolution is a polynomial of degree >= 2,
390 the evolution function is not "simple". */
391 if (tree_is_chrec (evolution_part))
392 return false;
393
394 step_expr = evolution_part;
395 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
396
397 if (dump_enabled_p ())
398 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
399 step_expr, init_expr);
400
401 *init = init_expr;
402 *step = step_expr;
403
404 if (TREE_CODE (step_expr) != INTEGER_CST
405 && (TREE_CODE (step_expr) != SSA_NAME
406 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
407 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
408 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
409 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
410 || !flag_associative_math)))
411 && (TREE_CODE (step_expr) != REAL_CST
412 || !flag_associative_math))
413 {
414 if (dump_enabled_p ())
415 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
416 "step unknown.\n");
417 return false;
418 }
419
420 return true;
421 }
422
423 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
424 what we are assuming is a double reduction. For example, given
425 a structure like this:
426
427 outer1:
428 x_1 = PHI <x_4(outer2), ...>;
429 ...
430
431 inner:
432 x_2 = PHI <x_1(outer1), ...>;
433 ...
434 x_3 = ...;
435 ...
436
437 outer2:
438 x_4 = PHI <x_3(inner)>;
439 ...
440
441 outer loop analysis would treat x_1 as a double reduction phi and
442 this function would then return true for x_2. */
443
444 static bool
445 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
446 {
447 use_operand_p use_p;
448 ssa_op_iter op_iter;
449 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
450 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
451 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
452 return true;
453 return false;
454 }
455
456 /* Function vect_analyze_scalar_cycles_1.
457
458 Examine the cross iteration def-use cycles of scalar variables
459 in LOOP. LOOP_VINFO represents the loop that is now being
460 considered for vectorization (can be LOOP, or an outer-loop
461 enclosing LOOP). */
462
463 static void
464 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
465 {
466 basic_block bb = loop->header;
467 tree init, step;
468 auto_vec<stmt_vec_info, 64> worklist;
469 gphi_iterator gsi;
470 bool double_reduc, reduc_chain;
471
472 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
473
474 /* First - identify all inductions. Reduction detection assumes that all the
475 inductions have been identified, therefore, this order must not be
476 changed. */
477 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
478 {
479 gphi *phi = gsi.phi ();
480 tree access_fn = NULL;
481 tree def = PHI_RESULT (phi);
482 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
483
484 if (dump_enabled_p ())
485 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
486
487 /* Skip virtual phi's. The data dependences that are associated with
488 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
489 if (virtual_operand_p (def))
490 continue;
491
492 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
493
494 /* Analyze the evolution function. */
495 access_fn = analyze_scalar_evolution (loop, def);
496 if (access_fn)
497 {
498 STRIP_NOPS (access_fn);
499 if (dump_enabled_p ())
500 dump_printf_loc (MSG_NOTE, vect_location,
501 "Access function of PHI: %T\n", access_fn);
502 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
503 = initial_condition_in_loop_num (access_fn, loop->num);
504 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
505 = evolution_part_in_loop_num (access_fn, loop->num);
506 }
507
508 if (!access_fn
509 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
510 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
511 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
512 && TREE_CODE (step) != INTEGER_CST))
513 {
514 worklist.safe_push (stmt_vinfo);
515 continue;
516 }
517
518 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
519 != NULL_TREE);
520 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
521
522 if (dump_enabled_p ())
523 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
524 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
525 }
526
527
528 /* Second - identify all reductions and nested cycles. */
529 while (worklist.length () > 0)
530 {
531 stmt_vec_info stmt_vinfo = worklist.pop ();
532 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
533 tree def = PHI_RESULT (phi);
534
535 if (dump_enabled_p ())
536 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
537
538 gcc_assert (!virtual_operand_p (def)
539 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
540
541 stmt_vec_info reduc_stmt_info
542 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
543 &reduc_chain);
544 if (reduc_stmt_info)
545 {
546 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
547 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
548 if (double_reduc)
549 {
550 if (dump_enabled_p ())
551 dump_printf_loc (MSG_NOTE, vect_location,
552 "Detected double reduction.\n");
553
554 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
555 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
556 }
557 else
558 {
559 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
560 {
561 if (dump_enabled_p ())
562 dump_printf_loc (MSG_NOTE, vect_location,
563 "Detected vectorizable nested cycle.\n");
564
565 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
566 }
567 else
568 {
569 if (dump_enabled_p ())
570 dump_printf_loc (MSG_NOTE, vect_location,
571 "Detected reduction.\n");
572
573 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
574 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
575 /* Store the reduction cycles for possible vectorization in
576 loop-aware SLP if it was not detected as reduction
577 chain. */
578 if (! reduc_chain)
579 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
580 (reduc_stmt_info);
581 }
582 }
583 }
584 else
585 if (dump_enabled_p ())
586 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
587 "Unknown def-use cycle pattern.\n");
588 }
589 }
590
591
592 /* Function vect_analyze_scalar_cycles.
593
594 Examine the cross iteration def-use cycles of scalar variables, by
595 analyzing the loop-header PHIs of scalar variables. Classify each
596 cycle as one of the following: invariant, induction, reduction, unknown.
597 We do that for the loop represented by LOOP_VINFO, and also for its
598 inner-loop, if it exists.
599 Examples for scalar cycles:
600
601 Example1: reduction:
602
603 loop1:
604 for (i=0; i<N; i++)
605 sum += a[i];
606
607 Example2: induction:
608
609 loop2:
610 for (i=0; i<N; i++)
611 a[i] = i; */
612
613 static void
614 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
615 {
616 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
617
618 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
619
620 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
621 Reductions in such an inner-loop therefore have different properties than
622 the reductions in the nest that gets vectorized:
623 1. When vectorized, they are executed in the same order as in the original
624 scalar loop, so we can't change the order of computation when
625 vectorizing them.
626 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
627 current checks are too strict. */
628
629 if (loop->inner)
630 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
631 }
632
633 /* Transfer group and reduction information from STMT_INFO to its
634 pattern stmt. */
635
636 static void
637 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
638 {
639 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
640 stmt_vec_info stmtp;
641 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
642 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
643 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
644 do
645 {
646 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
647 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
648 == STMT_VINFO_DEF_TYPE (stmt_info));
649 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
650 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
651 if (stmt_info)
652 REDUC_GROUP_NEXT_ELEMENT (stmtp)
653 = STMT_VINFO_RELATED_STMT (stmt_info);
654 }
655 while (stmt_info);
656 }
657
658 /* Fixup scalar cycles that now have their stmts detected as patterns. */
659
660 static void
661 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
662 {
663 stmt_vec_info first;
664 unsigned i;
665
666 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
667 if (STMT_VINFO_IN_PATTERN_P (first))
668 {
669 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
670 while (next)
671 {
672 if (! STMT_VINFO_IN_PATTERN_P (next)
673 || STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (next)) == -1)
674 break;
675 next = REDUC_GROUP_NEXT_ELEMENT (next);
676 }
677 /* If not all stmts in the chain are patterns, or if we failed
678 to update STMT_VINFO_REDUC_IDX, try to handle the chain
679 without patterns. */
680 if (! next
681 && STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (first)) != -1)
682 {
683 vect_fixup_reduc_chain (first);
684 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
685 = STMT_VINFO_RELATED_STMT (first);
686 }
687 }
688 }
689
690 /* Function vect_get_loop_niters.
691
692 Determine how many times the loop is executed and place it
693 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
694 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
695 niter information holds in ASSUMPTIONS.
696
697 Return the loop exit condition. */
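/* For instance (sketch): for a simple counted loop

     for (i = 0; i < n; i++) ...

   with n >= 1, the latch executes n - 1 times, so NUMBER_OF_ITERATIONSM1
   is n - 1 and NUMBER_OF_ITERATIONS (the header executions) is n.  */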
698
699
700 static gcond *
701 vect_get_loop_niters (class loop *loop, tree *assumptions,
702 tree *number_of_iterations, tree *number_of_iterationsm1)
703 {
704 edge exit = single_exit (loop);
705 class tree_niter_desc niter_desc;
706 tree niter_assumptions, niter, may_be_zero;
707 gcond *cond = get_loop_exit_condition (loop);
708
709 *assumptions = boolean_true_node;
710 *number_of_iterationsm1 = chrec_dont_know;
711 *number_of_iterations = chrec_dont_know;
712 DUMP_VECT_SCOPE ("get_loop_niters");
713
714 if (!exit)
715 return cond;
716
717 may_be_zero = NULL_TREE;
718 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
719 || chrec_contains_undetermined (niter_desc.niter))
720 return cond;
721
722 niter_assumptions = niter_desc.assumptions;
723 may_be_zero = niter_desc.may_be_zero;
724 niter = niter_desc.niter;
725
726 if (may_be_zero && integer_zerop (may_be_zero))
727 may_be_zero = NULL_TREE;
728
729 if (may_be_zero)
730 {
731 if (COMPARISON_CLASS_P (may_be_zero))
732 {
733 /* Try to combine may_be_zero with assumptions, this can simplify
734 computation of niter expression. */
735 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
736 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
737 niter_assumptions,
738 fold_build1 (TRUTH_NOT_EXPR,
739 boolean_type_node,
740 may_be_zero));
741 else
742 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
743 build_int_cst (TREE_TYPE (niter), 0),
744 rewrite_to_non_trapping_overflow (niter));
745
746 may_be_zero = NULL_TREE;
747 }
748 else if (integer_nonzerop (may_be_zero))
749 {
750 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
751 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
752 return cond;
753 }
754 else
755 return cond;
756 }
757
758 *assumptions = niter_assumptions;
759 *number_of_iterationsm1 = niter;
760
761 /* We want the number of loop header executions which is the number
762 of latch executions plus one.
763 ??? For UINT_MAX latch executions this number overflows to zero
764 for loops like do { n++; } while (n != 0); */
765 if (niter && !chrec_contains_undetermined (niter))
766 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
767 build_int_cst (TREE_TYPE (niter), 1));
768 *number_of_iterations = niter;
769
770 return cond;
771 }
772
773 /* Function bb_in_loop_p
774
775 Used as predicate for dfs order traversal of the loop bbs. */
776
777 static bool
778 bb_in_loop_p (const_basic_block bb, const void *data)
779 {
780 const class loop *const loop = (const class loop *)data;
781 if (flow_bb_inside_loop_p (loop, bb))
782 return true;
783 return false;
784 }
785
786
787 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
788 stmt_vec_info structs for all the stmts in LOOP_IN. */
789
790 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
791 : vec_info (vec_info::loop, init_cost (loop_in), shared),
792 loop (loop_in),
793 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
794 num_itersm1 (NULL_TREE),
795 num_iters (NULL_TREE),
796 num_iters_unchanged (NULL_TREE),
797 num_iters_assumptions (NULL_TREE),
798 th (0),
799 versioning_threshold (0),
800 vectorization_factor (0),
801 max_vectorization_factor (0),
802 mask_skip_niters (NULL_TREE),
803 mask_compare_type (NULL_TREE),
804 simd_if_cond (NULL_TREE),
805 unaligned_dr (NULL),
806 peeling_for_alignment (0),
807 ptr_mask (0),
808 ivexpr_map (NULL),
809 scan_map (NULL),
810 slp_unrolling_factor (1),
811 single_scalar_iteration_cost (0),
812 vec_outside_cost (0),
813 vec_inside_cost (0),
814 vectorizable (false),
815 can_fully_mask_p (true),
816 fully_masked_p (false),
817 peeling_for_gaps (false),
818 peeling_for_niter (false),
819 no_data_dependencies (false),
820 has_mask_store (false),
821 scalar_loop_scaling (profile_probability::uninitialized ()),
822 scalar_loop (NULL),
823 orig_loop_info (NULL)
824 {
825 /* CHECKME: We want to visit all BBs before their successors (except for
826 latch blocks, for which this assertion wouldn't hold). In the simple
827 case of the loop forms we allow, a dfs order of the BBs would be the same
828 as reversed postorder traversal, so we are safe. */
829
830 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
831 bbs, loop->num_nodes, loop);
832 gcc_assert (nbbs == loop->num_nodes);
833
834 for (unsigned int i = 0; i < nbbs; i++)
835 {
836 basic_block bb = bbs[i];
837 gimple_stmt_iterator si;
838
839 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
840 {
841 gimple *phi = gsi_stmt (si);
842 gimple_set_uid (phi, 0);
843 add_stmt (phi);
844 }
845
846 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
847 {
848 gimple *stmt = gsi_stmt (si);
849 gimple_set_uid (stmt, 0);
850 add_stmt (stmt);
851 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
852 third argument is the #pragma omp simd if (x) condition: when it is 0,
853 the loop shouldn't be vectorized; when it is a non-zero constant, it
854 should be vectorized normally; otherwise the loop is versioned, and the
855 vectorized copy is used only if the condition is non-zero at runtime. */
856 if (loop_in->simduid
857 && is_gimple_call (stmt)
858 && gimple_call_internal_p (stmt)
859 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
860 && gimple_call_num_args (stmt) >= 3
861 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
862 && (loop_in->simduid
863 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
864 {
865 tree arg = gimple_call_arg (stmt, 2);
866 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
867 simd_if_cond = arg;
868 else
869 gcc_assert (integer_nonzerop (arg));
870 }
871 }
872 }
873
874 epilogue_vinfos.create (6);
875 }
876
877 /* Free all levels of MASKS. */
878
879 void
880 release_vec_loop_masks (vec_loop_masks *masks)
881 {
882 rgroup_masks *rgm;
883 unsigned int i;
884 FOR_EACH_VEC_ELT (*masks, i, rgm)
885 rgm->masks.release ();
886 masks->release ();
887 }
888
889 /* Free all memory used by the _loop_vec_info, as well as all the
890 stmt_vec_info structs of all the stmts in the loop. */
891
892 _loop_vec_info::~_loop_vec_info ()
893 {
894 free (bbs);
895
896 release_vec_loop_masks (&masks);
897 delete ivexpr_map;
898 delete scan_map;
899 epilogue_vinfos.release ();
900
901 loop->aux = NULL;
902 }
903
904 /* Return an invariant or register for EXPR and emit necessary
905 computations in the LOOP_VINFO loop preheader. */
906
907 tree
908 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
909 {
910 if (is_gimple_reg (expr)
911 || is_gimple_min_invariant (expr))
912 return expr;
913
914 if (! loop_vinfo->ivexpr_map)
915 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
916 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
917 if (! cached)
918 {
919 gimple_seq stmts = NULL;
920 cached = force_gimple_operand (unshare_expr (expr),
921 &stmts, true, NULL_TREE);
922 if (stmts)
923 {
924 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
925 gsi_insert_seq_on_edge_immediate (e, stmts);
926 }
927 }
928 return cached;
929 }
930
931 /* Return true if we can use CMP_TYPE as the comparison type to produce
932 all masks required to mask LOOP_VINFO. */
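/* (Informal reminder: each such mask is generated with an IFN_WHILE_ULT
   call, and WHILE_ULT (START, END, MASK) sets lane J of MASK iff
   START + J < END, which is why support is queried per mask type below.)  */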
933
934 static bool
935 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
936 {
937 rgroup_masks *rgm;
938 unsigned int i;
939 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
940 if (rgm->mask_type != NULL_TREE
941 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
942 cmp_type, rgm->mask_type,
943 OPTIMIZE_FOR_SPEED))
944 return false;
945 return true;
946 }
947
948 /* Calculate the maximum number of scalars per iteration for every
949 rgroup in LOOP_VINFO. */
950
951 static unsigned int
952 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
953 {
954 unsigned int res = 1;
955 unsigned int i;
956 rgroup_masks *rgm;
957 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
958 res = MAX (res, rgm->max_nscalars_per_iter);
959 return res;
960 }
961
962 /* Each statement in LOOP_VINFO can be masked where necessary. Check
963 whether we can actually generate the masks required. Return true if so,
964 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
965
966 static bool
967 vect_verify_full_masking (loop_vec_info loop_vinfo)
968 {
969 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
970 unsigned int min_ni_width;
971 unsigned int max_nscalars_per_iter
972 = vect_get_max_nscalars_per_iter (loop_vinfo);
973
974 /* Use a normal loop if there are no statements that need masking.
975 This only happens in rare degenerate cases: it means that the loop
976 has no loads, no stores, and no live-out values. */
977 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
978 return false;
979
980 /* Get the maximum number of iterations that is representable
981 in the counter type. */
982 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
983 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
984
985 /* Get a more refined estimate for the number of iterations. */
986 widest_int max_back_edges;
987 if (max_loop_iterations (loop, &max_back_edges))
988 max_ni = wi::smin (max_ni, max_back_edges + 1);
989
990 /* Account for rgroup masks, in which each bit is replicated N times. */
991 max_ni *= max_nscalars_per_iter;
992
993 /* Work out how many bits we need to represent the limit. */
994 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
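  /* Worked example (illustration only): with a 32-bit unsigned counter
     type MAX_NI starts as 2^32; if MAX_NSCALARS_PER_ITER is 2 the limit
     becomes 2^33 and MIN_NI_WIDTH is 34, so a comparison type wider than
     32 bits is required.  */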
995
996 /* Find a scalar mode for which WHILE_ULT is supported. */
997 opt_scalar_int_mode cmp_mode_iter;
998 tree cmp_type = NULL_TREE;
999 tree iv_type = NULL_TREE;
1000 widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
1001 unsigned int iv_precision = UINT_MAX;
1002
1003 if (iv_limit != -1)
1004 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1005 UNSIGNED);
1006
1007 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1008 {
1009 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1010 if (cmp_bits >= min_ni_width
1011 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1012 {
1013 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1014 if (this_type
1015 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1016 {
1017 /* Although we could stop as soon as we find a valid mode,
1018 there are at least two reasons why that's not always the
1019 best choice:
1020
1021 - An IV that's Pmode or wider is more likely to be reusable
1022 in address calculations than an IV that's narrower than
1023 Pmode.
1024
1025 - Doing the comparison in IV_PRECISION or wider allows
1026 a natural 0-based IV, whereas using a narrower comparison
1027 type requires mitigations against wrap-around.
1028
1029 Conversely, if the IV limit is variable, doing the comparison
1030 in a wider type than the original type can introduce
1031 unnecessary extensions, so picking the widest valid mode
1032 is not always a good choice either.
1033
1034 Here we prefer the first IV type that's Pmode or wider,
1035 and the first comparison type that's IV_PRECISION or wider.
1036 (The comparison type must be no wider than the IV type,
1037 to avoid extensions in the vector loop.)
1038
1039 ??? We might want to try continuing beyond Pmode for ILP32
1040 targets if CMP_BITS < IV_PRECISION. */
1041 iv_type = this_type;
1042 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1043 cmp_type = this_type;
1044 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1045 break;
1046 }
1047 }
1048 }
1049
1050 if (!cmp_type)
1051 return false;
1052
1053 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1054 LOOP_VINFO_MASK_IV_TYPE (loop_vinfo) = iv_type;
1055 return true;
1056 }
1057
1058 /* Calculate the cost of one scalar iteration of the loop. */
1059 static void
1060 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1061 {
1062 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1063 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1064 int nbbs = loop->num_nodes, factor;
1065 int innerloop_iters, i;
1066
1067 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1068
1069 /* Gather costs for statements in the scalar loop. */
1070
1071 /* FORNOW. */
1072 innerloop_iters = 1;
1073 if (loop->inner)
1074 innerloop_iters = 50; /* FIXME */
1075
1076 for (i = 0; i < nbbs; i++)
1077 {
1078 gimple_stmt_iterator si;
1079 basic_block bb = bbs[i];
1080
1081 if (bb->loop_father == loop->inner)
1082 factor = innerloop_iters;
1083 else
1084 factor = 1;
1085
1086 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1087 {
1088 gimple *stmt = gsi_stmt (si);
1089 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1090
1091 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1092 continue;
1093
1094 /* Skip stmts that are not vectorized inside the loop. */
1095 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1096 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1097 && (!STMT_VINFO_LIVE_P (vstmt_info)
1098 || !VECTORIZABLE_CYCLE_DEF
1099 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1100 continue;
1101
1102 vect_cost_for_stmt kind;
1103 if (STMT_VINFO_DATA_REF (stmt_info))
1104 {
1105 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1106 kind = scalar_load;
1107 else
1108 kind = scalar_store;
1109 }
1110 else if (vect_nop_conversion_p (stmt_info))
1111 continue;
1112 else
1113 kind = scalar_stmt;
1114
1115 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1116 factor, kind, stmt_info, 0, vect_prologue);
1117 }
1118 }
1119
1120 /* Now accumulate cost. */
1121 void *target_cost_data = init_cost (loop);
1122 stmt_info_for_cost *si;
1123 int j;
1124 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1125 j, si)
1126 (void) add_stmt_cost (loop_vinfo, target_cost_data, si->count,
1127 si->kind, si->stmt_info, si->vectype,
1128 si->misalign, vect_body);
1129 unsigned dummy, body_cost = 0;
1130 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1131 destroy_cost_data (target_cost_data);
1132 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1133 }
1134
1135
1136 /* Function vect_analyze_loop_form_1.
1137
1138 Verify that certain CFG restrictions hold, including:
1139 - the loop has a pre-header
1140 - the loop has a single entry and exit
1141 - the loop exit condition is simple enough
1142 - the number of iterations can be analyzed, i.e., a countable loop. The
1143 niter could be analyzed under some assumptions. */
1144
1145 opt_result
1146 vect_analyze_loop_form_1 (class loop *loop, gcond **loop_cond,
1147 tree *assumptions, tree *number_of_iterationsm1,
1148 tree *number_of_iterations, gcond **inner_loop_cond)
1149 {
1150 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1151
1152 /* Different restrictions apply when we are considering an inner-most loop,
1153 vs. an outer (nested) loop.
1154 (FORNOW. May want to relax some of these restrictions in the future). */
1155
1156 if (!loop->inner)
1157 {
1158 /* Inner-most loop. We currently require that the number of BBs is
1159 exactly 2 (the header and latch). Vectorizable inner-most loops
1160 look like this:
1161
1162 (pre-header)
1163 |
1164 header <--------+
1165 | | |
1166 | +--> latch --+
1167 |
1168 (exit-bb) */
1169
1170 if (loop->num_nodes != 2)
1171 return opt_result::failure_at (vect_location,
1172 "not vectorized:"
1173 " control flow in loop.\n");
1174
1175 if (empty_block_p (loop->header))
1176 return opt_result::failure_at (vect_location,
1177 "not vectorized: empty loop.\n");
1178 }
1179 else
1180 {
1181 class loop *innerloop = loop->inner;
1182 edge entryedge;
1183
1184 /* Nested loop. We currently require that the loop is doubly-nested,
1185 contains a single inner loop, and the number of BBs is exactly 5.
1186 Vectorizable outer-loops look like this:
1187
1188 (pre-header)
1189 |
1190 header <---+
1191 | |
1192 inner-loop |
1193 | |
1194 tail ------+
1195 |
1196 (exit-bb)
1197
1198 The inner-loop has the properties expected of inner-most loops
1199 as described above. */
1200
1201 if ((loop->inner)->inner || (loop->inner)->next)
1202 return opt_result::failure_at (vect_location,
1203 "not vectorized:"
1204 " multiple nested loops.\n");
1205
1206 if (loop->num_nodes != 5)
1207 return opt_result::failure_at (vect_location,
1208 "not vectorized:"
1209 " control flow in loop.\n");
1210
1211 entryedge = loop_preheader_edge (innerloop);
1212 if (entryedge->src != loop->header
1213 || !single_exit (innerloop)
1214 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1215 return opt_result::failure_at (vect_location,
1216 "not vectorized:"
1217 " unsupported outerloop form.\n");
1218
1219 /* Analyze the inner-loop. */
1220 tree inner_niterm1, inner_niter, inner_assumptions;
1221 opt_result res
1222 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1223 &inner_assumptions, &inner_niterm1,
1224 &inner_niter, NULL);
1225 if (!res)
1226 {
1227 if (dump_enabled_p ())
1228 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1229 "not vectorized: Bad inner loop.\n");
1230 return res;
1231 }
1232
1233 /* Don't support analyzing niter under assumptions for inner
1234 loop. */
1235 if (!integer_onep (inner_assumptions))
1236 return opt_result::failure_at (vect_location,
1237 "not vectorized: Bad inner loop.\n");
1238
1239 if (!expr_invariant_in_loop_p (loop, inner_niter))
1240 return opt_result::failure_at (vect_location,
1241 "not vectorized: inner-loop count not"
1242 " invariant.\n");
1243
1244 if (dump_enabled_p ())
1245 dump_printf_loc (MSG_NOTE, vect_location,
1246 "Considering outer-loop vectorization.\n");
1247 }
1248
1249 if (!single_exit (loop))
1250 return opt_result::failure_at (vect_location,
1251 "not vectorized: multiple exits.\n");
1252 if (EDGE_COUNT (loop->header->preds) != 2)
1253 return opt_result::failure_at (vect_location,
1254 "not vectorized:"
1255 " too many incoming edges.\n");
1256
1257 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1258 that the loop is represented as a do-while (with a proper if-guard
1259 before the loop if needed), where the loop header contains all the
1260 executable statements, and the latch is empty. */
1261 if (!empty_block_p (loop->latch)
1262 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1263 return opt_result::failure_at (vect_location,
1264 "not vectorized: latch block not empty.\n");
1265
1266 /* Make sure the exit is not abnormal. */
1267 edge e = single_exit (loop);
1268 if (e->flags & EDGE_ABNORMAL)
1269 return opt_result::failure_at (vect_location,
1270 "not vectorized:"
1271 " abnormal loop exit edge.\n");
1272
1273 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1274 number_of_iterationsm1);
1275 if (!*loop_cond)
1276 return opt_result::failure_at
1277 (vect_location,
1278 "not vectorized: complicated exit condition.\n");
1279
1280 if (integer_zerop (*assumptions)
1281 || !*number_of_iterations
1282 || chrec_contains_undetermined (*number_of_iterations))
1283 return opt_result::failure_at
1284 (*loop_cond,
1285 "not vectorized: number of iterations cannot be computed.\n");
1286
1287 if (integer_zerop (*number_of_iterations))
1288 return opt_result::failure_at
1289 (*loop_cond,
1290 "not vectorized: number of iterations = 0.\n");
1291
1292 return opt_result::success ();
1293 }
1294
1295 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1296
1297 opt_loop_vec_info
1298 vect_analyze_loop_form (class loop *loop, vec_info_shared *shared)
1299 {
1300 tree assumptions, number_of_iterations, number_of_iterationsm1;
1301 gcond *loop_cond, *inner_loop_cond = NULL;
1302
1303 opt_result res
1304 = vect_analyze_loop_form_1 (loop, &loop_cond,
1305 &assumptions, &number_of_iterationsm1,
1306 &number_of_iterations, &inner_loop_cond);
1307 if (!res)
1308 return opt_loop_vec_info::propagate_failure (res);
1309
1310 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1311 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1312 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1313 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1314 if (!integer_onep (assumptions))
1315 {
1316 /* We consider vectorizing this loop by versioning it under
1317 some assumptions. In order to do this, we need to clear
1318 existing information computed by scev and niter analyzer. */
1319 scev_reset_htab ();
1320 free_numbers_of_iterations_estimates (loop);
1321 /* Also set flag for this loop so that following scev and niter
1322 analysis are done under the assumptions. */
1323 loop_constraint_set (loop, LOOP_C_FINITE);
1324 /* Also record the assumptions for versioning. */
1325 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1326 }
1327
1328 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1329 {
1330 if (dump_enabled_p ())
1331 {
1332 dump_printf_loc (MSG_NOTE, vect_location,
1333 "Symbolic number of iterations is ");
1334 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1335 dump_printf (MSG_NOTE, "\n");
1336 }
1337 }
1338
1339 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1340 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1341 if (inner_loop_cond)
1342 {
1343 stmt_vec_info inner_loop_cond_info
1344 = loop_vinfo->lookup_stmt (inner_loop_cond);
1345 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1346 }
1347
1348 gcc_assert (!loop->aux);
1349 loop->aux = loop_vinfo;
1350 return opt_loop_vec_info::success (loop_vinfo);
1351 }
1352
1353
1354
1355 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1356 statements, update the vectorization factor. */
1357
1358 static void
1359 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1360 {
1361 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1362 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1363 int nbbs = loop->num_nodes;
1364 poly_uint64 vectorization_factor;
1365 int i;
1366
1367 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1368
1369 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1370 gcc_assert (known_ne (vectorization_factor, 0U));
1371
1372 /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1373 vectorization factor of the loop is the unrolling factor required by
1374 the SLP instances. If that unrolling factor is 1, we say that we
1375 perform pure SLP on the loop; cross-iteration parallelism is not
1376 exploited. */
1377 bool only_slp_in_loop = true;
1378 for (i = 0; i < nbbs; i++)
1379 {
1380 basic_block bb = bbs[i];
1381 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1382 gsi_next (&si))
1383 {
1384 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1385 if (!stmt_info)
1386 continue;
1387 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1388 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1389 && !PURE_SLP_STMT (stmt_info))
1390 /* STMT needs both SLP and loop-based vectorization. */
1391 only_slp_in_loop = false;
1392 }
1393 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1394 gsi_next (&si))
1395 {
1396 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1397 stmt_info = vect_stmt_to_vectorize (stmt_info);
1398 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1399 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1400 && !PURE_SLP_STMT (stmt_info))
1401 /* STMT needs both SLP and loop-based vectorization. */
1402 only_slp_in_loop = false;
1403 }
1404 }
1405
1406 if (only_slp_in_loop)
1407 {
1408 if (dump_enabled_p ())
1409 dump_printf_loc (MSG_NOTE, vect_location,
1410 "Loop contains only SLP stmts\n");
1411 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1412 }
1413 else
1414 {
1415 if (dump_enabled_p ())
1416 dump_printf_loc (MSG_NOTE, vect_location,
1417 "Loop contains SLP and non-SLP stmts\n");
1418 /* Both the vectorization factor and unroll factor have the form
1419 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1420 so they must have a common multiple. */
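      /* (For example, illustration only: a vectorization factor of 4 and
	 an SLP unrolling factor of 2 combine to 4, whereas factors of 2
	 and 3 would combine to 6.)  */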
1421 vectorization_factor
1422 = force_common_multiple (vectorization_factor,
1423 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1424 }
1425
1426 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1427 if (dump_enabled_p ())
1428 {
1429 dump_printf_loc (MSG_NOTE, vect_location,
1430 "Updating vectorization factor to ");
1431 dump_dec (MSG_NOTE, vectorization_factor);
1432 dump_printf (MSG_NOTE, ".\n");
1433 }
1434 }
1435
1436 /* Return true if STMT_INFO describes a double reduction phi and if
1437 the other phi in the reduction is also relevant for vectorization.
1438 This rejects cases such as:
1439
1440 outer1:
1441 x_1 = PHI <x_3(outer2), ...>;
1442 ...
1443
1444 inner:
1445 x_2 = ...;
1446 ...
1447
1448 outer2:
1449 x_3 = PHI <x_2(inner)>;
1450
1451 if nothing in x_2 or elsewhere makes x_1 relevant. */
1452
1453 static bool
1454 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1455 {
1456 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1457 return false;
1458
1459 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1460 }
1461
1462 /* Function vect_analyze_loop_operations.
1463
1464 Scan the loop stmts and make sure they are all vectorizable. */
1465
1466 static opt_result
1467 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1468 {
1469 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1470 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1471 int nbbs = loop->num_nodes;
1472 int i;
1473 stmt_vec_info stmt_info;
1474 bool need_to_vectorize = false;
1475 bool ok;
1476
1477 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1478
1479 auto_vec<stmt_info_for_cost> cost_vec;
1480
1481 for (i = 0; i < nbbs; i++)
1482 {
1483 basic_block bb = bbs[i];
1484
1485 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1486 gsi_next (&si))
1487 {
1488 gphi *phi = si.phi ();
1489 ok = true;
1490
1491 stmt_info = loop_vinfo->lookup_stmt (phi);
1492 if (dump_enabled_p ())
1493 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1494 if (virtual_operand_p (gimple_phi_result (phi)))
1495 continue;
1496
1497 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1498 (i.e., a phi in the tail of the outer-loop). */
1499 if (! is_loop_header_bb_p (bb))
1500 {
1501 /* FORNOW: we currently don't support the case where these phis
1502 are not used in the outer loop (unless it is a double reduction,
1503 i.e., this phi is vect_reduction_def), because that case would
1504 require actually doing something here. */
1505 if (STMT_VINFO_LIVE_P (stmt_info)
1506 && !vect_active_double_reduction_p (stmt_info))
1507 return opt_result::failure_at (phi,
1508 "Unsupported loop-closed phi"
1509 " in outer-loop.\n");
1510
1511 /* If PHI is used in the outer loop, we check that its operand
1512 is defined in the inner loop. */
1513 if (STMT_VINFO_RELEVANT_P (stmt_info))
1514 {
1515 tree phi_op;
1516
1517 if (gimple_phi_num_args (phi) != 1)
1518 return opt_result::failure_at (phi, "unsupported phi");
1519
1520 phi_op = PHI_ARG_DEF (phi, 0);
1521 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1522 if (!op_def_info)
1523 return opt_result::failure_at (phi, "unsupported phi\n");
1524
1525 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1526 && (STMT_VINFO_RELEVANT (op_def_info)
1527 != vect_used_in_outer_by_reduction))
1528 return opt_result::failure_at (phi, "unsupported phi\n");
1529
1530 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1531 || (STMT_VINFO_DEF_TYPE (stmt_info)
1532 == vect_double_reduction_def))
1533 && !vectorizable_lc_phi (loop_vinfo,
1534 stmt_info, NULL, NULL))
1535 return opt_result::failure_at (phi, "unsupported phi\n");
1536 }
1537
1538 continue;
1539 }
1540
1541 gcc_assert (stmt_info);
1542
1543 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1544 || STMT_VINFO_LIVE_P (stmt_info))
1545 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1546 /* A scalar-dependence cycle that we don't support. */
1547 return opt_result::failure_at (phi,
1548 "not vectorized:"
1549 " scalar dependence cycle.\n");
1550
1551 if (STMT_VINFO_RELEVANT_P (stmt_info))
1552 {
1553 need_to_vectorize = true;
1554 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1555 && ! PURE_SLP_STMT (stmt_info))
1556 ok = vectorizable_induction (loop_vinfo,
1557 stmt_info, NULL, NULL, NULL,
1558 &cost_vec);
1559 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1560 || (STMT_VINFO_DEF_TYPE (stmt_info)
1561 == vect_double_reduction_def)
1562 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1563 && ! PURE_SLP_STMT (stmt_info))
1564 ok = vectorizable_reduction (loop_vinfo,
1565 stmt_info, NULL, NULL, &cost_vec);
1566 }
1567
1568 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1569 if (ok
1570 && STMT_VINFO_LIVE_P (stmt_info)
1571 && !PURE_SLP_STMT (stmt_info))
1572 ok = vectorizable_live_operation (loop_vinfo,
1573 stmt_info, NULL, NULL, NULL,
1574 -1, false, &cost_vec);
1575
1576 if (!ok)
1577 return opt_result::failure_at (phi,
1578 "not vectorized: relevant phi not "
1579 "supported: %G",
1580 static_cast <gimple *> (phi));
1581 }
1582
1583 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1584 gsi_next (&si))
1585 {
1586 gimple *stmt = gsi_stmt (si);
1587 if (!gimple_clobber_p (stmt))
1588 {
1589 opt_result res
1590 = vect_analyze_stmt (loop_vinfo,
1591 loop_vinfo->lookup_stmt (stmt),
1592 &need_to_vectorize,
1593 NULL, NULL, &cost_vec);
1594 if (!res)
1595 return res;
1596 }
1597 }
1598 } /* bbs */
1599
1600 add_stmt_costs (loop_vinfo, loop_vinfo->target_cost_data, &cost_vec);
1601
1602 /* All operations in the loop are either irrelevant (they deal with loop
1603 control, or are dead), or only used outside the loop and can be moved
1604 out of the loop (e.g. invariants, inductions). The loop can be
1605 optimized away by scalar optimizations. We're better off not
1606 touching this loop. */
1607 if (!need_to_vectorize)
1608 {
1609 if (dump_enabled_p ())
1610 dump_printf_loc (MSG_NOTE, vect_location,
1611 "All the computation can be taken out of the loop.\n");
1612 return opt_result::failure_at
1613 (vect_location,
1614 "not vectorized: redundant loop. no profit to vectorize.\n");
1615 }
1616
1617 return opt_result::success ();
1618 }
1619
1620 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1621 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1622 definitely no, or -1 if it's worth retrying. */
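/* Worked example (sketch, with made-up numbers): if ASSUMED_VF is 4,
   param_min_vect_loop_bound is 3 and the cost model returns
   MIN_PROFITABLE_ITERS == 10, the threshold becomes MAX (3 * 4, 10) == 12,
   so a loop known to run only 8 iterations would be rejected (return 0).  */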
1623
1624 static int
1625 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1626 {
1627 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1628 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1629
1630 /* Only fully-masked loops can have iteration counts less than the
1631 vectorization factor. */
1632 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1633 {
1634 HOST_WIDE_INT max_niter;
1635
1636 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1637 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1638 else
1639 max_niter = max_stmt_executions_int (loop);
1640
1641 if (max_niter != -1
1642 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1643 {
1644 if (dump_enabled_p ())
1645 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1646 "not vectorized: iteration count smaller than "
1647 "vectorization factor.\n");
1648 return 0;
1649 }
1650 }
1651
1652 int min_profitable_iters, min_profitable_estimate;
1653 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1654 &min_profitable_estimate);
1655
1656 if (min_profitable_iters < 0)
1657 {
1658 if (dump_enabled_p ())
1659 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1660 "not vectorized: vectorization not profitable.\n");
1661 if (dump_enabled_p ())
1662 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1663 "not vectorized: vector version will never be "
1664 "profitable.\n");
1665 return -1;
1666 }
1667
1668 int min_scalar_loop_bound = (param_min_vect_loop_bound
1669 * assumed_vf);
1670
1671 /* Use the cost model only if it is more conservative than the user-specified
1672 threshold. */
1673 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1674 min_profitable_iters);
1675
1676 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1677
1678 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1679 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1680 {
1681 if (dump_enabled_p ())
1682 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1683 "not vectorized: vectorization not profitable.\n");
1684 if (dump_enabled_p ())
1685 dump_printf_loc (MSG_NOTE, vect_location,
1686 "not vectorized: iteration count smaller than user "
1687 "specified loop bound parameter or minimum profitable "
1688 "iterations (whichever is more conservative).\n");
1689 return 0;
1690 }
1691
1692 /* The static profitability threshold min_profitable_estimate includes
1693 the cost of having to check at runtime whether the scalar loop
1694 should be used instead. If it turns out that we don't need or want
1695 such a check, the threshold we should use for the static estimate
1696 is simply the point at which the vector loop becomes more profitable
1697 than the scalar loop. */
1698 if (min_profitable_estimate > min_profitable_iters
1699 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1700 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1701 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1702 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1703 {
1704 if (dump_enabled_p ())
1705 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1706 " choice between the scalar and vector loops\n");
1707 min_profitable_estimate = min_profitable_iters;
1708 }
1709
1710 HOST_WIDE_INT estimated_niter;
1711
1712 /* If we are vectorizing an epilogue then we know the maximum number of
1713 scalar iterations it will cover is at least one lower than the
1714 vectorization factor of the main loop. */
1715 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1716 estimated_niter
1717 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1718 else
1719 {
1720 estimated_niter = estimated_stmt_executions_int (loop);
1721 if (estimated_niter == -1)
1722 estimated_niter = likely_max_stmt_executions_int (loop);
1723 }
1724 if (estimated_niter != -1
1725 && ((unsigned HOST_WIDE_INT) estimated_niter
1726 < MAX (th, (unsigned) min_profitable_estimate)))
1727 {
1728 if (dump_enabled_p ())
1729 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1730 "not vectorized: estimated iteration count too "
1731 "small.\n");
1732 if (dump_enabled_p ())
1733 dump_printf_loc (MSG_NOTE, vect_location,
1734 "not vectorized: estimated iteration count smaller "
1735 "than specified loop bound parameter or minimum "
1736 "profitable iterations (whichever is more "
1737 "conservative).\n");
1738 return -1;
1739 }
1740
1741 return 1;
1742 }
1743
1744 static opt_result
1745 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1746 vec<data_reference_p> *datarefs,
1747 unsigned int *n_stmts)
1748 {
1749 *n_stmts = 0;
1750 for (unsigned i = 0; i < loop->num_nodes; i++)
1751 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1752 !gsi_end_p (gsi); gsi_next (&gsi))
1753 {
1754 gimple *stmt = gsi_stmt (gsi);
1755 if (is_gimple_debug (stmt))
1756 continue;
1757 ++(*n_stmts);
1758 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs);
1759 if (!res)
1760 {
1761 if (is_gimple_call (stmt) && loop->safelen)
1762 {
1763 tree fndecl = gimple_call_fndecl (stmt), op;
1764 if (fndecl != NULL_TREE)
1765 {
1766 cgraph_node *node = cgraph_node::get (fndecl);
1767 if (node != NULL && node->simd_clones != NULL)
1768 {
1769 unsigned int j, n = gimple_call_num_args (stmt);
1770 for (j = 0; j < n; j++)
1771 {
1772 op = gimple_call_arg (stmt, j);
1773 if (DECL_P (op)
1774 || (REFERENCE_CLASS_P (op)
1775 && get_base_address (op)))
1776 break;
1777 }
1778 op = gimple_call_lhs (stmt);
1779 /* Ignore #pragma omp declare simd functions
1780 if they don't have data references in the
1781 call stmt itself. */
1782 if (j == n
1783 && !(op
1784 && (DECL_P (op)
1785 || (REFERENCE_CLASS_P (op)
1786 && get_base_address (op)))))
1787 continue;
1788 }
1789 }
1790 }
1791 return res;
1792 }
1793 /* If dependence analysis will give up due to the limit on the
1794	 number of datarefs, stop here and fail fatally.  */
1795 if (datarefs->length ()
1796 > (unsigned)param_loop_max_datarefs_for_datadeps)
1797 return opt_result::failure_at (stmt, "exceeded param "
1798 "loop-max-datarefs-for-datadeps\n");
1799 }
1800 return opt_result::success ();
1801 }
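/* As an illustrative sketch (not from any actual testcase) of the
   safelen/simd-clone case handled above:

     #pragma omp declare simd
     extern int f (int);

     int a[N], b[N];
     #pragma omp simd
     for (int i = 0; i < N; i++)
       a[i] = f (b[i]);

   The call to f carries no data reference of its own (its argument and
   result are scalars), so with the nonzero safelen implied by the simd
   pragma and the SIMD clones of f available, the loop is not rejected
   merely because vect_find_stmt_data_reference fails on the call.  */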
1802
1803 /* Look for SLP-only access groups and turn each individual access into its own
1804 group. */
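/* As an illustration (sketch only): in a loop such as

     int a[2 * N], b[2 * N];
     for (int i = 0; i < N; i++)
       {
         a[2 * i] = b[2 * i] + 1;
         a[2 * i + 1] = b[2 * i + 1] + 2;
       }

   the two stores to A (and the two loads from B) each form an
   interleaved group of size two.  If such a group was marked as usable
   only with SLP and SLP ends up not being used, the function below
   splits it back into independent single-element accesses.  */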
1805 static void
1806 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
1807 {
1808 unsigned int i;
1809 struct data_reference *dr;
1810
1811 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
1812
1813 vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs;
1814 FOR_EACH_VEC_ELT (datarefs, i, dr)
1815 {
1816 gcc_assert (DR_REF (dr));
1817 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
1818
1819       /* Check if the access is part of an interleaving chain.  */
1820 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1821 {
1822 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
1823 unsigned int group_size = DR_GROUP_SIZE (first_element);
1824
1825 	  /* Check if this is an SLP-only group.  */
1826 if (!STMT_SLP_TYPE (stmt_info)
1827 && STMT_VINFO_SLP_VECT_ONLY (first_element))
1828 {
1829 /* Dissolve the group. */
1830 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
1831
1832 stmt_vec_info vinfo = first_element;
1833 while (vinfo)
1834 {
1835 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
1836 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
1837 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
1838 DR_GROUP_SIZE (vinfo) = 1;
1839 if (STMT_VINFO_STRIDED_P (first_element))
1840 DR_GROUP_GAP (vinfo) = 0;
1841 else
1842 DR_GROUP_GAP (vinfo) = group_size - 1;
1843 vinfo = next;
1844 }
1845 }
1846 }
1847 }
1848 }
1849
1850
1851 /* Decides whether we need to create an epilogue loop to handle
1852 remaining scalar iterations and sets PEELING_FOR_NITERS accordingly. */
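/* For example (illustrative numbers): with a vectorization factor of 4,
   no peeling for alignment or gaps, and a known iteration count of 10,
   10 is not a multiple of 4, so PEELING_FOR_NITER is set and an epilogue
   loop handles the remaining 2 scalar iterations.  */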
1853
1854 void
1855 determine_peel_for_niter (loop_vec_info loop_vinfo)
1856 {
1857 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
1858
1859 unsigned HOST_WIDE_INT const_vf;
1860 HOST_WIDE_INT max_niter
1861 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1862
1863 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1864 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1865 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1866 (loop_vinfo));
1867
1868 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1869 /* The main loop handles all iterations. */
1870 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
1871 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1872 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1873 {
1874 /* Work out the (constant) number of iterations that need to be
1875 peeled for reasons other than niters. */
1876 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1877 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1878 peel_niter += 1;
1879 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1880 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1881 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1882 }
1883 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1884 /* ??? When peeling for gaps but not alignment, we could
1885 try to check whether the (variable) niters is known to be
1886 VF * N + 1. That's something of a niche case though. */
1887 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1888 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1889 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1890 < (unsigned) exact_log2 (const_vf))
1891 /* In case of versioning, check if the maximum number of
1892 iterations is greater than th. If they are identical,
1893 the epilogue is unnecessary. */
1894 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1895 || ((unsigned HOST_WIDE_INT) max_niter
1896 > (th / const_vf) * const_vf))))
1897 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1898 }
1899
1900
1901 /* Function vect_analyze_loop_2.
1902
1903 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1904 for it. The different analyses will record information in the
1905 loop_vec_info struct. */
1906 static opt_result
1907 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1908 {
1909 opt_result ok = opt_result::success ();
1910 int res;
1911 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1912 poly_uint64 min_vf = 2;
1913 loop_vec_info orig_loop_vinfo = NULL;
1914
1915 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
1916 loop_vec_info of the first vectorized loop. */
1917 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1918 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1919 else
1920 orig_loop_vinfo = loop_vinfo;
1921 gcc_assert (orig_loop_vinfo);
1922
1923 /* The first group of checks is independent of the vector size. */
1924 fatal = true;
1925
1926 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
1927 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
1928 return opt_result::failure_at (vect_location,
1929 "not vectorized: simd if(0)\n");
1930
1931 /* Find all data references in the loop (which correspond to vdefs/vuses)
1932 and analyze their evolution in the loop. */
1933
1934 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1935
1936 /* Gather the data references and count stmts in the loop. */
1937 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1938 {
1939 opt_result res
1940 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1941 &LOOP_VINFO_DATAREFS (loop_vinfo),
1942 n_stmts);
1943 if (!res)
1944 {
1945 if (dump_enabled_p ())
1946 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1947 "not vectorized: loop contains function "
1948 "calls or data references that cannot "
1949 "be analyzed\n");
1950 return res;
1951 }
1952 loop_vinfo->shared->save_datarefs ();
1953 }
1954 else
1955 loop_vinfo->shared->check_datarefs ();
1956
1957 /* Analyze the data references and also adjust the minimal
1958 vectorization factor according to the loads and stores. */
1959
1960 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
1961 if (!ok)
1962 {
1963 if (dump_enabled_p ())
1964 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1965 "bad data references.\n");
1966 return ok;
1967 }
1968
1969 /* Classify all cross-iteration scalar data-flow cycles.
1970 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1971 vect_analyze_scalar_cycles (loop_vinfo);
1972
1973 vect_pattern_recog (loop_vinfo);
1974
1975 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1976
1977 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1978 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1979
1980 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1981 if (!ok)
1982 {
1983 if (dump_enabled_p ())
1984 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1985 "bad data access.\n");
1986 return ok;
1987 }
1988
1989 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1990
1991 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
1992 if (!ok)
1993 {
1994 if (dump_enabled_p ())
1995 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1996 "unexpected pattern.\n");
1997 return ok;
1998 }
1999
2000   /* The rest of the analysis below, in contrast, depends on the vector size in some way, so a failure is no longer necessarily fatal.  */
2001 fatal = false;
2002
2003 /* Analyze data dependences between the data-refs in the loop
2004 and adjust the maximum vectorization factor according to
2005 the dependences.
2006 FORNOW: fail at the first data dependence that we encounter. */
2007
2008 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2009 if (!ok)
2010 {
2011 if (dump_enabled_p ())
2012 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2013 "bad data dependence.\n");
2014 return ok;
2015 }
2016 if (max_vf != MAX_VECTORIZATION_FACTOR
2017 && maybe_lt (max_vf, min_vf))
2018 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2019 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2020
2021 ok = vect_determine_vectorization_factor (loop_vinfo);
2022 if (!ok)
2023 {
2024 if (dump_enabled_p ())
2025 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2026 "can't determine vectorization factor.\n");
2027 return ok;
2028 }
2029 if (max_vf != MAX_VECTORIZATION_FACTOR
2030 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2031 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2032
2033 /* Compute the scalar iteration cost. */
2034 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2035
2036 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2037
2038 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2039 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
2040 if (!ok)
2041 return ok;
2042
2043 /* If there are any SLP instances mark them as pure_slp. */
2044 bool slp = vect_make_slp_decision (loop_vinfo);
2045 if (slp)
2046 {
2047 /* Find stmts that need to be both vectorized and SLPed. */
2048 vect_detect_hybrid_slp (loop_vinfo);
2049
2050 /* Update the vectorization factor based on the SLP decision. */
2051 vect_update_vf_for_slp (loop_vinfo);
2052
2053 /* Optimize the SLP graph with the vectorization factor fixed. */
2054 vect_optimize_slp (loop_vinfo);
2055 }
2056
2057 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
2058
2059 /* We don't expect to have to roll back to anything other than an empty
2060 set of rgroups. */
2061 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2062
2063 /* This is the point where we can re-start analysis with SLP forced off. */
2064 start_over:
2065
2066 /* Now the vectorization factor is final. */
2067 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2068 gcc_assert (known_ne (vectorization_factor, 0U));
2069
2070 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2071 {
2072 dump_printf_loc (MSG_NOTE, vect_location,
2073 "vectorization_factor = ");
2074 dump_dec (MSG_NOTE, vectorization_factor);
2075 dump_printf (MSG_NOTE, ", niters = %wd\n",
2076 LOOP_VINFO_INT_NITERS (loop_vinfo));
2077 }
2078
2079 /* Analyze the alignment of the data-refs in the loop.
2080 Fail if a data reference is found that cannot be vectorized. */
2081
2082 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2083 if (!ok)
2084 {
2085 if (dump_enabled_p ())
2086 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2087 "bad data alignment.\n");
2088 return ok;
2089 }
2090
2091 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2092 It is important to call pruning after vect_analyze_data_ref_accesses,
2093 since we use grouping information gathered by interleaving analysis. */
2094 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2095 if (!ok)
2096 return ok;
2097
2098 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2099 vectorization, since we do not want to add extra peeling or
2100 add versioning for alignment. */
2101 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2102 /* This pass will decide on using loop versioning and/or loop peeling in
2103 order to enhance the alignment of data references in the loop. */
2104 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2105 else
2106 ok = vect_verify_datarefs_alignment (loop_vinfo);
2107 if (!ok)
2108 return ok;
2109
2110 if (slp)
2111 {
2112 /* Analyze operations in the SLP instances. Note this may
2113 remove unsupported SLP instances which makes the above
2114 SLP kind detection invalid. */
2115 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2116 vect_slp_analyze_operations (loop_vinfo);
2117 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2118 {
2119 ok = opt_result::failure_at (vect_location,
2120 "unsupported SLP instances\n");
2121 goto again;
2122 }
2123 }
2124
2125 /* Dissolve SLP-only groups. */
2126 vect_dissolve_slp_only_groups (loop_vinfo);
2127
2128 /* Scan all the remaining operations in the loop that are not subject
2129 to SLP and make sure they are vectorizable. */
2130 ok = vect_analyze_loop_operations (loop_vinfo);
2131 if (!ok)
2132 {
2133 if (dump_enabled_p ())
2134 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2135 "bad operation or unsupported loop bound.\n");
2136 return ok;
2137 }
2138
2139 /* Decide whether to use a fully-masked loop for this vectorization
2140 factor. */
2141 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2142 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2143 && vect_verify_full_masking (loop_vinfo));
2144 if (dump_enabled_p ())
2145 {
2146 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2147 dump_printf_loc (MSG_NOTE, vect_location,
2148 "using a fully-masked loop.\n");
2149 else
2150 dump_printf_loc (MSG_NOTE, vect_location,
2151 "not using a fully-masked loop.\n");
2152 }
2153
2154   /* If an epilogue loop is required because of data accesses with gaps,
2155      one additional iteration needs to be peeled.  Check if there are
2156      enough iterations for vectorization.  */
2157 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2158 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2159 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2160 {
2161 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2162 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2163
2164 if (known_lt (wi::to_widest (scalar_niters), vf))
2165 return opt_result::failure_at (vect_location,
2166 				       "loop does not have enough iterations"
2167 				       " to support peeling for gaps.\n");
2168 }
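  /* For instance (illustrative numbers): with VF == 4, peeling for gaps
     and exactly 4 known iterations (so NITERSM1 == 3), one iteration must
     be left for the epilogue because of the gap, leaving fewer than VF
     iterations for the vector body, so the check above rejects the loop;
     5 or more iterations would pass.  */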
2169
2170 /* If we're vectorizing an epilogue loop, we either need a fully-masked
2171 loop or a loop that has a lower VF than the main loop. */
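  /* (As an illustration with made-up figures: a main loop vectorized at
     VF 16 may be followed by an unmasked epilogue at VF 8 or 4, or by a
     fully-masked epilogue, but not by another unmasked VF-16 epilogue.)  */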
2172 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2173 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2174 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2175 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2176 return opt_result::failure_at (vect_location,
2177 "Vectorization factor too high for"
2178 " epilogue loop.\n");
2179
2180 /* Check the costings of the loop make vectorizing worthwhile. */
2181 res = vect_analyze_loop_costing (loop_vinfo);
2182 if (res < 0)
2183 {
2184 ok = opt_result::failure_at (vect_location,
2185 "Loop costings may not be worthwhile.\n");
2186 goto again;
2187 }
2188 if (!res)
2189 return opt_result::failure_at (vect_location,
2190 "Loop costings not worthwhile.\n");
2191
2192 determine_peel_for_niter (loop_vinfo);
2193 /* If an epilogue loop is required make sure we can create one. */
2194 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2195 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2196 {
2197 if (dump_enabled_p ())
2198 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2199 if (!vect_can_advance_ivs_p (loop_vinfo)
2200 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2201 single_exit (LOOP_VINFO_LOOP
2202 (loop_vinfo))))
2203 {
2204 ok = opt_result::failure_at (vect_location,
2205 "not vectorized: can't create required "
2206 "epilog loop\n");
2207 goto again;
2208 }
2209 }
2210
2211   /* During peeling, we need to check if the number of loop iterations is
2212      enough for both the peeled prolog loop and the vector loop.  This check
2213 can be merged along with threshold check of loop versioning, so
2214 increase threshold for this case if necessary.
2215
2216 If we are analyzing an epilogue we still want to check what its
2217 versioning threshold would be. If we decide to vectorize the epilogues we
2218 will want to use the lowest versioning threshold of all epilogues and main
2219 loop. This will enable us to enter a vectorized epilogue even when
2220 versioning the loop. We can't simply check whether the epilogue requires
2221 versioning though since we may have skipped some versioning checks when
2222 analyzing the epilogue. For instance, checks for alias versioning will be
2223 skipped when dealing with epilogues as we assume we already checked them
2224 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2225 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2226 {
2227 poly_uint64 niters_th = 0;
2228 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2229
2230 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2231 {
2232 /* Niters for peeled prolog loop. */
2233 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2234 {
2235 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2236 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2237 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2238 }
2239 else
2240 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2241 }
2242
2243 /* Niters for at least one iteration of vectorized loop. */
2244 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2245 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2246 /* One additional iteration because of peeling for gap. */
2247 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2248 niters_th += 1;
2249
2250 /* Use the same condition as vect_transform_loop to decide when to use
2251 the cost to determine a versioning threshold. */
2252 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2253 && ordered_p (th, niters_th))
2254 niters_th = ordered_max (poly_uint64 (th), niters_th);
2255
2256 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
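      /* A worked example with illustrative numbers: 3 iterations peeled
	 for alignment, an unmasked loop with VF == 8 and peeling for gaps
	 give niters_th = 3 + 8 + 1 = 12 before being combined with the
	 cost-model threshold TH above.  */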
2257 }
2258
2259 gcc_assert (known_eq (vectorization_factor,
2260 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2261
2262 /* Ok to vectorize! */
2263 return opt_result::success ();
2264
2265 again:
2266 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2267 gcc_assert (!ok);
2268
2269 /* Try again with SLP forced off but if we didn't do any SLP there is
2270 no point in re-trying. */
2271 if (!slp)
2272 return ok;
2273
2274 /* If there are reduction chains re-trying will fail anyway. */
2275 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2276 return ok;
2277
2278 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2279 via interleaving or lane instructions. */
2280 slp_instance instance;
2281 slp_tree node;
2282 unsigned i, j;
2283 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2284 {
2285 stmt_vec_info vinfo;
2286 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2287 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2288 continue;
2289 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2290 unsigned int size = DR_GROUP_SIZE (vinfo);
2291 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2292 if (! vect_store_lanes_supported (vectype, size, false)
2293 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2294 && ! vect_grouped_store_supported (vectype, size))
2295 return opt_result::failure_at (vinfo->stmt,
2296 "unsupported grouped store\n");
2297 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2298 {
2299 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2300 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2301 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2302 size = DR_GROUP_SIZE (vinfo);
2303 vectype = STMT_VINFO_VECTYPE (vinfo);
2304 if (! vect_load_lanes_supported (vectype, size, false)
2305 && ! vect_grouped_load_supported (vectype, single_element_p,
2306 size))
2307 return opt_result::failure_at (vinfo->stmt,
2308 "unsupported grouped load\n");
2309 }
2310 }
2311
2312 if (dump_enabled_p ())
2313 dump_printf_loc (MSG_NOTE, vect_location,
2314 "re-trying with SLP disabled\n");
2315
2316 /* Roll back state appropriately. No SLP this time. */
2317 slp = false;
2318   /* Restore the vectorization factor as it was without SLP.  */
2319 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2320 /* Free the SLP instances. */
2321 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2322 vect_free_slp_instance (instance, false);
2323 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2324 /* Reset SLP type to loop_vect on all stmts. */
2325 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2326 {
2327 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2328 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2329 !gsi_end_p (si); gsi_next (&si))
2330 {
2331 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2332 STMT_SLP_TYPE (stmt_info) = loop_vect;
2333 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2334 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2335 {
2336 	      /* vectorizable_reduction adjusts reduction stmt def-types;
2337 		 restore them to that of the PHI.  */
2338 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2339 = STMT_VINFO_DEF_TYPE (stmt_info);
2340 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2341 (STMT_VINFO_REDUC_DEF (stmt_info)))
2342 = STMT_VINFO_DEF_TYPE (stmt_info);
2343 }
2344 }
2345 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2346 !gsi_end_p (si); gsi_next (&si))
2347 {
2348 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2349 STMT_SLP_TYPE (stmt_info) = loop_vect;
2350 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2351 {
2352 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2353 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2354 STMT_SLP_TYPE (stmt_info) = loop_vect;
2355 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2356 !gsi_end_p (pi); gsi_next (&pi))
2357 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2358 = loop_vect;
2359 }
2360 }
2361 }
2362 /* Free optimized alias test DDRS. */
2363 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2364 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2365 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2366 /* Reset target cost data. */
2367 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2368 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2369 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2370 /* Reset accumulated rgroup information. */
2371 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2372 /* Reset assorted flags. */
2373 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2374 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2375 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2376 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2377 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2378
2379 goto start_over;
2380 }
2381
2382 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2383 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2384 OLD_LOOP_VINFO is better unless something specifically indicates
2385 otherwise.
2386
2387 Note that this deliberately isn't a partial order. */
2388
2389 static bool
2390 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2391 loop_vec_info old_loop_vinfo)
2392 {
2393 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2394 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2395
2396 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2397 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2398
2399 /* Always prefer a VF of loop->simdlen over any other VF. */
2400 if (loop->simdlen)
2401 {
2402 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2403 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2404 if (new_simdlen_p != old_simdlen_p)
2405 return new_simdlen_p;
2406 }
2407
2408 /* Limit the VFs to what is likely to be the maximum number of iterations,
2409 to handle cases in which at least one loop_vinfo is fully-masked. */
2410 HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop);
2411 if (estimated_max_niter != -1)
2412 {
2413 if (known_le (estimated_max_niter, new_vf))
2414 new_vf = estimated_max_niter;
2415 if (known_le (estimated_max_niter, old_vf))
2416 old_vf = estimated_max_niter;
2417 }
2418
2419 /* Check whether the (fractional) cost per scalar iteration is lower
2420 or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */
2421 poly_widest_int rel_new = (new_loop_vinfo->vec_inside_cost
2422 * poly_widest_int (old_vf));
2423 poly_widest_int rel_old = (old_loop_vinfo->vec_inside_cost
2424 * poly_widest_int (new_vf));
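  /* Cross-multiplying avoids dividing by the (possibly non-constant) VFs:
     new_cost / new_vf < old_cost / old_vf is equivalent to
     new_cost * old_vf < old_cost * new_vf.  E.g. (hypothetical numbers)
     an inside cost of 20 at VF 8 against a cost of 12 at VF 4 gives
     20 * 4 = 80 < 12 * 8 = 96, so the VF-8 candidate has the lower cost
     per scalar iteration (2.5 vs. 3).  */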
2425 if (maybe_lt (rel_old, rel_new))
2426 {
2427 /* When old_loop_vinfo uses a variable vectorization factor,
2428 we know that it has a lower cost for at least one runtime VF.
2429 However, we don't know how likely that VF is.
2430
2431 One option would be to compare the costs for the estimated VFs.
2432 	 The problem is that this can put too much pressure on the cost
2433 model. E.g. if the estimated VF is also the lowest possible VF,
2434 and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
2435 for the estimated VF, we'd then choose new_loop_vinfo even
2436 though (a) new_loop_vinfo might not actually be better than
2437 old_loop_vinfo for that VF and (b) it would be significantly
2438 worse at larger VFs.
2439
2440 Here we go for a hacky compromise: pick new_loop_vinfo if it is
2441 no more expensive than old_loop_vinfo even after doubling the
2442 estimated old_loop_vinfo VF. For all but trivial loops, this
2443 ensures that we only pick new_loop_vinfo if it is significantly
2444 better than old_loop_vinfo at the estimated VF. */
2445 if (rel_new.is_constant ())
2446 return false;
2447
2448 HOST_WIDE_INT new_estimated_vf = estimated_poly_value (new_vf);
2449 HOST_WIDE_INT old_estimated_vf = estimated_poly_value (old_vf);
2450 widest_int estimated_rel_new = (new_loop_vinfo->vec_inside_cost
2451 * widest_int (old_estimated_vf));
2452 widest_int estimated_rel_old = (old_loop_vinfo->vec_inside_cost
2453 * widest_int (new_estimated_vf));
2454 return estimated_rel_new * 2 <= estimated_rel_old;
2455 }
2456 if (known_lt (rel_new, rel_old))
2457 return true;
2458
2459 /* If there's nothing to choose between the loop bodies, see whether
2460 there's a difference in the prologue and epilogue costs. */
2461 if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
2462 return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;
2463
2464 return false;
2465 }
2466
2467 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2468 true if we should. */
2469
2470 static bool
2471 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2472 loop_vec_info old_loop_vinfo)
2473 {
2474 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2475 return false;
2476
2477 if (dump_enabled_p ())
2478 dump_printf_loc (MSG_NOTE, vect_location,
2479 "***** Preferring vector mode %s to vector mode %s\n",
2480 GET_MODE_NAME (new_loop_vinfo->vector_mode),
2481 GET_MODE_NAME (old_loop_vinfo->vector_mode));
2482 return true;
2483 }
2484
2485 /* Function vect_analyze_loop.
2486
2487 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2488 for it. The different analyses will record information in the
2489 loop_vec_info struct. */
2490 opt_loop_vec_info
2491 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2492 {
2493 auto_vector_modes vector_modes;
2494
2495   /* Autodetect the first vector size we try.  */
2496 unsigned int autovec_flags
2497 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2498 loop->simdlen != 0);
2499 unsigned int mode_i = 0;
2500
2501 DUMP_VECT_SCOPE ("analyze_loop_nest");
2502
2503 if (loop_outer (loop)
2504 && loop_vec_info_for_loop (loop_outer (loop))
2505 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2506 return opt_loop_vec_info::failure_at (vect_location,
2507 "outer-loop already vectorized.\n");
2508
2509 if (!find_loop_nest (loop, &shared->loop_nest))
2510 return opt_loop_vec_info::failure_at
2511 (vect_location,
2512 "not vectorized: loop nest containing two or more consecutive inner"
2513 " loops cannot be vectorized\n");
2514
2515 unsigned n_stmts = 0;
2516 machine_mode autodetected_vector_mode = VOIDmode;
2517 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2518 machine_mode next_vector_mode = VOIDmode;
2519 poly_uint64 lowest_th = 0;
2520 unsigned vectorized_loops = 0;
2521 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2522 && !unlimited_cost_model (loop));
2523
2524 bool vect_epilogues = false;
2525 opt_result res = opt_result::success ();
2526 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2527 while (1)
2528 {
2529 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2530 opt_loop_vec_info loop_vinfo = vect_analyze_loop_form (loop, shared);
2531 if (!loop_vinfo)
2532 {
2533 if (dump_enabled_p ())
2534 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2535 "bad loop form.\n");
2536 gcc_checking_assert (first_loop_vinfo == NULL);
2537 return loop_vinfo;
2538 }
2539 loop_vinfo->vector_mode = next_vector_mode;
2540
2541 bool fatal = false;
2542
2543 /* When pick_lowest_cost_p is true, we should in principle iterate
2544 over all the loop_vec_infos that LOOP_VINFO could replace and
2545 try to vectorize LOOP_VINFO under the same conditions.
2546 E.g. when trying to replace an epilogue loop, we should vectorize
2547 LOOP_VINFO as an epilogue loop with the same VF limit. When trying
2548 to replace the main loop, we should vectorize LOOP_VINFO as a main
2549 loop too.
2550
2551 However, autovectorize_vector_modes is usually sorted as follows:
2552
2553 - Modes that naturally produce lower VFs usually follow modes that
2554 naturally produce higher VFs.
2555
2556 - When modes naturally produce the same VF, maskable modes
2557 usually follow unmaskable ones, so that the maskable mode
2558 can be used to vectorize the epilogue of the unmaskable mode.
2559
2560 This order is preferred because it leads to the maximum
2561 epilogue vectorization opportunities. Targets should only use
2562 a different order if they want to make wide modes available while
2563 disparaging them relative to earlier, smaller modes. The assumption
2564 in that case is that the wider modes are more expensive in some
2565 way that isn't reflected directly in the costs.
2566
2567 There should therefore be few interesting cases in which
2568 LOOP_VINFO fails when treated as an epilogue loop, succeeds when
2569 treated as a standalone loop, and ends up being genuinely cheaper
2570 than FIRST_LOOP_VINFO. */
2571 if (vect_epilogues)
2572 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
2573
2574 res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2575 if (mode_i == 0)
2576 autodetected_vector_mode = loop_vinfo->vector_mode;
2577 if (dump_enabled_p ())
2578 {
2579 if (res)
2580 dump_printf_loc (MSG_NOTE, vect_location,
2581 "***** Analysis succeeded with vector mode %s\n",
2582 GET_MODE_NAME (loop_vinfo->vector_mode));
2583 else
2584 dump_printf_loc (MSG_NOTE, vect_location,
2585 "***** Analysis failed with vector mode %s\n",
2586 GET_MODE_NAME (loop_vinfo->vector_mode));
2587 }
2588
2589 loop->aux = NULL;
2590
2591 if (!fatal)
2592 while (mode_i < vector_modes.length ()
2593 && vect_chooses_same_modes_p (loop_vinfo, vector_modes[mode_i]))
2594 {
2595 if (dump_enabled_p ())
2596 dump_printf_loc (MSG_NOTE, vect_location,
2597 "***** The result for vector mode %s would"
2598 " be the same\n",
2599 GET_MODE_NAME (vector_modes[mode_i]));
2600 mode_i += 1;
2601 }
2602
2603 if (res)
2604 {
2605 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2606 vectorized_loops++;
2607
2608 /* Once we hit the desired simdlen for the first time,
2609 discard any previous attempts. */
2610 if (simdlen
2611 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
2612 {
2613 delete first_loop_vinfo;
2614 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2615 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2616 simdlen = 0;
2617 }
2618 else if (pick_lowest_cost_p && first_loop_vinfo)
2619 {
2620 /* Keep trying to roll back vectorization attempts while the
2621 loop_vec_infos they produced were worse than this one. */
2622 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
2623 while (!vinfos.is_empty ()
2624 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
2625 {
2626 gcc_assert (vect_epilogues);
2627 delete vinfos.pop ();
2628 }
2629 if (vinfos.is_empty ()
2630 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
2631 {
2632 delete first_loop_vinfo;
2633 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2634 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2635 }
2636 }
2637
2638 if (first_loop_vinfo == NULL)
2639 {
2640 first_loop_vinfo = loop_vinfo;
2641 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
2642 }
2643 else if (vect_epilogues
2644 /* For now only allow one epilogue loop. */
2645 && first_loop_vinfo->epilogue_vinfos.is_empty ())
2646 {
2647 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
2648 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
2649 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2650 || maybe_ne (lowest_th, 0U));
2651 /* Keep track of the known smallest versioning
2652 threshold. */
2653 if (ordered_p (lowest_th, th))
2654 lowest_th = ordered_min (lowest_th, th);
2655 }
2656 else
2657 delete loop_vinfo;
2658
2659 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
2660 enabled, SIMDUID is not set, it is the innermost loop and we have
2661 either already found the loop's SIMDLEN or there was no SIMDLEN to
2662 begin with.
2663 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
2664 vect_epilogues = (!simdlen
2665 && loop->inner == NULL
2666 && param_vect_epilogues_nomask
2667 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
2668 && !loop->simduid
2669 /* For now only allow one epilogue loop, but allow
2670 pick_lowest_cost_p to replace it. */
2671 && (first_loop_vinfo->epilogue_vinfos.is_empty ()
2672 || pick_lowest_cost_p));
2673
2674 /* Commit to first_loop_vinfo if we have no reason to try
2675 alternatives. */
2676 if (!simdlen && !vect_epilogues && !pick_lowest_cost_p)
2677 break;
2678 }
2679 else
2680 {
2681 delete loop_vinfo;
2682 if (fatal)
2683 {
2684 gcc_checking_assert (first_loop_vinfo == NULL);
2685 break;
2686 }
2687 }
2688
2689 if (mode_i < vector_modes.length ()
2690 && VECTOR_MODE_P (autodetected_vector_mode)
2691 && (related_vector_mode (vector_modes[mode_i],
2692 GET_MODE_INNER (autodetected_vector_mode))
2693 == autodetected_vector_mode)
2694 && (related_vector_mode (autodetected_vector_mode,
2695 GET_MODE_INNER (vector_modes[mode_i]))
2696 == vector_modes[mode_i]))
2697 {
2698 if (dump_enabled_p ())
2699 dump_printf_loc (MSG_NOTE, vect_location,
2700 "***** Skipping vector mode %s, which would"
2701 " repeat the analysis for %s\n",
2702 GET_MODE_NAME (vector_modes[mode_i]),
2703 GET_MODE_NAME (autodetected_vector_mode));
2704 mode_i += 1;
2705 }
2706
2707 if (mode_i == vector_modes.length ()
2708 || autodetected_vector_mode == VOIDmode)
2709 break;
2710
2711 /* Try the next biggest vector size. */
2712 next_vector_mode = vector_modes[mode_i++];
2713 if (dump_enabled_p ())
2714 dump_printf_loc (MSG_NOTE, vect_location,
2715 "***** Re-trying analysis with vector mode %s\n",
2716 GET_MODE_NAME (next_vector_mode));
2717 }
2718
2719 if (first_loop_vinfo)
2720 {
2721 loop->aux = (loop_vec_info) first_loop_vinfo;
2722 if (dump_enabled_p ())
2723 dump_printf_loc (MSG_NOTE, vect_location,
2724 "***** Choosing vector mode %s\n",
2725 GET_MODE_NAME (first_loop_vinfo->vector_mode));
2726 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
2727 return first_loop_vinfo;
2728 }
2729
2730 return opt_loop_vec_info::propagate_failure (res);
2731 }
2732
2733 /* Return true if there is an in-order reduction function for CODE, storing
2734 it in *REDUC_FN if so. */
2735
2736 static bool
2737 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2738 {
2739 switch (code)
2740 {
2741 case PLUS_EXPR:
2742 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2743 return true;
2744
2745 default:
2746 return false;
2747 }
2748 }
2749
2750 /* Function reduction_fn_for_scalar_code
2751
2752 Input:
2753    CODE - tree_code of a reduction operation.
2754
2755 Output:
2756 REDUC_FN - the corresponding internal function to be used to reduce the
2757 vector of partial results into a single scalar result, or IFN_LAST
2758 if the operation is a supported reduction operation, but does not have
2759 such an internal function.
2760
2761 Return FALSE if CODE currently cannot be vectorized as reduction. */
2762
2763 static bool
2764 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2765 {
2766 switch (code)
2767 {
2768 case MAX_EXPR:
2769 *reduc_fn = IFN_REDUC_MAX;
2770 return true;
2771
2772 case MIN_EXPR:
2773 *reduc_fn = IFN_REDUC_MIN;
2774 return true;
2775
2776 case PLUS_EXPR:
2777 *reduc_fn = IFN_REDUC_PLUS;
2778 return true;
2779
2780 case BIT_AND_EXPR:
2781 *reduc_fn = IFN_REDUC_AND;
2782 return true;
2783
2784 case BIT_IOR_EXPR:
2785 *reduc_fn = IFN_REDUC_IOR;
2786 return true;
2787
2788 case BIT_XOR_EXPR:
2789 *reduc_fn = IFN_REDUC_XOR;
2790 return true;
2791
2792 case MULT_EXPR:
2793 case MINUS_EXPR:
2794 *reduc_fn = IFN_LAST;
2795 return true;
2796
2797 default:
2798 return false;
2799 }
2800 }
2801
2802 /* If there is a neutral value X such that SLP reduction NODE would not
2803 be affected by the introduction of additional X elements, return that X,
2804 otherwise return null. CODE is the code of the reduction and VECTOR_TYPE
2805 is the vector type that would hold element X. REDUC_CHAIN is true if
2806 the SLP statements perform a single reduction, false if each statement
2807 performs an independent reduction. */
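/* As an illustrative sketch: in an SLP reduction such as

     for (int i = 0; i < N; i++)
       {
         s0 += a[2 * i];
         s1 += a[2 * i + 1];
       }

   each lane performs an independent PLUS_EXPR reduction, so additional
   zero elements can be introduced into a vector without changing the
   result; for a MULT_EXPR reduction the neutral value would be 1 and
   for BIT_AND_EXPR an all-ones value.  */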
2808
2809 static tree
2810 neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type,
2811 tree_code code, bool reduc_chain)
2812 {
2813 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2814 stmt_vec_info stmt_vinfo = stmts[0];
2815 tree scalar_type = TREE_TYPE (vector_type);
2816 class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2817 gcc_assert (loop);
2818
2819 switch (code)
2820 {
2821 case WIDEN_SUM_EXPR:
2822 case DOT_PROD_EXPR:
2823 case SAD_EXPR:
2824 case PLUS_EXPR:
2825 case MINUS_EXPR:
2826 case BIT_IOR_EXPR:
2827 case BIT_XOR_EXPR:
2828 return build_zero_cst (scalar_type);
2829
2830 case MULT_EXPR:
2831 return build_one_cst (scalar_type);
2832
2833 case BIT_AND_EXPR:
2834 return build_all_ones_cst (scalar_type);
2835
2836 case MAX_EXPR:
2837 case MIN_EXPR:
2838 /* For MIN/MAX the initial values are neutral. A reduction chain
2839 has only a single initial value, so that value is neutral for
2840 all statements. */
2841 if (reduc_chain)
2842 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2843 loop_preheader_edge (loop));
2844 return NULL_TREE;
2845
2846 default:
2847 return NULL_TREE;
2848 }
2849 }
2850
2851 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2852 STMT is printed with a message MSG. */
2853
2854 static void
2855 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2856 {
2857 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
2858 }
2859
2860 /* Return true if we need an in-order reduction for operation CODE
2861    on type TYPE, i.e. if the reduction has to preserve the order of
2862    the original scalar computation.  */
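/* For example (illustrative only): a float accumulation such as

     float s = 0.0f;
     for (int i = 0; i < N; i++)
       s += a[i];

   needs an in-order reduction unless -fassociative-math (e.g. via
   -ffast-math) permits reassociation, whereas MIN/MAX reductions and
   integer additions whose overflow cannot trap do not.  */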
2863
2864 bool
2865 needs_fold_left_reduction_p (tree type, tree_code code)
2866 {
2867 /* CHECKME: check for !flag_finite_math_only too? */
2868 if (SCALAR_FLOAT_TYPE_P (type))
2869 switch (code)
2870 {
2871 case MIN_EXPR:
2872 case MAX_EXPR:
2873 return false;
2874
2875 default:
2876 return !flag_associative_math;
2877 }
2878
2879 if (INTEGRAL_TYPE_P (type))
2880 {
2881 if (!operation_no_trapping_overflow (type, code))
2882 return true;
2883 return false;
2884 }
2885
2886 if (SAT_FIXED_POINT_TYPE_P (type))
2887 return true;
2888
2889 return false;
2890 }
2891
2892 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG
2893    has a handled computation expression.  Store the main reduction
2894 operation in *CODE. */
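/* For instance (sketch only), for a simple summation reduction

     sum_1 = PHI <sum_0 (preheader), sum_2 (latch)>
     _1 = a[i];
     sum_2 = sum_1 + _1;

   the walk below from the latch definition sum_2 back to the PHI result
   sum_1 records the single PLUS_EXPR statement as the path and sets
   *CODE to PLUS_EXPR.  */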
2895
2896 static bool
2897 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2898 tree loop_arg, enum tree_code *code,
2899 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
2900 {
2901 auto_bitmap visited;
2902 tree lookfor = PHI_RESULT (phi);
2903 ssa_op_iter curri;
2904 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2905 while (USE_FROM_PTR (curr) != loop_arg)
2906 curr = op_iter_next_use (&curri);
2907 curri.i = curri.numops;
2908 do
2909 {
2910 path.safe_push (std::make_pair (curri, curr));
2911 tree use = USE_FROM_PTR (curr);
2912 if (use == lookfor)
2913 break;
2914 gimple *def = SSA_NAME_DEF_STMT (use);
2915 if (gimple_nop_p (def)
2916 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2917 {
2918 pop:
2919 do
2920 {
2921 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2922 curri = x.first;
2923 curr = x.second;
2924 do
2925 curr = op_iter_next_use (&curri);
2926 /* Skip already visited or non-SSA operands (from iterating
2927 over PHI args). */
2928 while (curr != NULL_USE_OPERAND_P
2929 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2930 || ! bitmap_set_bit (visited,
2931 SSA_NAME_VERSION
2932 (USE_FROM_PTR (curr)))));
2933 }
2934 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2935 if (curr == NULL_USE_OPERAND_P)
2936 break;
2937 }
2938 else
2939 {
2940 if (gimple_code (def) == GIMPLE_PHI)
2941 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2942 else
2943 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2944 while (curr != NULL_USE_OPERAND_P
2945 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2946 || ! bitmap_set_bit (visited,
2947 SSA_NAME_VERSION
2948 (USE_FROM_PTR (curr)))))
2949 curr = op_iter_next_use (&curri);
2950 if (curr == NULL_USE_OPERAND_P)
2951 goto pop;
2952 }
2953 }
2954 while (1);
2955 if (dump_file && (dump_flags & TDF_DETAILS))
2956 {
2957 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2958 unsigned i;
2959 std::pair<ssa_op_iter, use_operand_p> *x;
2960 FOR_EACH_VEC_ELT (path, i, x)
2961 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
2962 dump_printf (MSG_NOTE, "\n");
2963 }
2964
2965 /* Check whether the reduction path detected is valid. */
2966 bool fail = path.length () == 0;
2967 bool neg = false;
2968 int sign = -1;
2969 *code = ERROR_MARK;
2970 for (unsigned i = 1; i < path.length (); ++i)
2971 {
2972 gimple *use_stmt = USE_STMT (path[i].second);
2973 tree op = USE_FROM_PTR (path[i].second);
2974 if (! is_gimple_assign (use_stmt)
2975 	  /* The following makes sure we can compute the operand index
2976 	     easily; it also mostly disallows chaining via COND_EXPR condition
2977 	     operands.  */
2978 || (gimple_assign_rhs1_ptr (use_stmt) != path[i].second->use
2979 && (gimple_num_ops (use_stmt) <= 2
2980 || gimple_assign_rhs2_ptr (use_stmt) != path[i].second->use)
2981 && (gimple_num_ops (use_stmt) <= 3
2982 || gimple_assign_rhs3_ptr (use_stmt) != path[i].second->use)))
2983 {
2984 fail = true;
2985 break;
2986 }
2987       /* Check that the op is used in only a single stmt inside
2988 	 the loop.  */
2989 imm_use_iterator imm_iter;
2990 gimple *op_use_stmt;
2991 unsigned cnt = 0;
2992 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
2993 if (!is_gimple_debug (op_use_stmt)
2994 && flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt)))
2995 {
2996 /* We want to allow x + x but not x < 1 ? x : 2. */
2997 if (is_gimple_assign (op_use_stmt)
2998 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
2999 {
3000 use_operand_p use_p;
3001 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3002 cnt++;
3003 }
3004 else
3005 cnt++;
3006 }
3007 if (cnt != 1)
3008 {
3009 fail = true;
3010 break;
3011 }
3012 tree_code use_code = gimple_assign_rhs_code (use_stmt);
3013 if (use_code == MINUS_EXPR)
3014 {
3015 use_code = PLUS_EXPR;
3016 /* Track whether we negate the reduction value each iteration. */
3017 if (gimple_assign_rhs2 (use_stmt) == op)
3018 neg = ! neg;
3019 }
3020 if (CONVERT_EXPR_CODE_P (use_code)
3021 && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)),
3022 TREE_TYPE (gimple_assign_rhs1 (use_stmt))))
3023 ;
3024 else if (*code == ERROR_MARK)
3025 {
3026 *code = use_code;
3027 sign = TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt)));
3028 }
3029 else if (use_code != *code)
3030 {
3031 fail = true;
3032 break;
3033 }
3034 else if ((use_code == MIN_EXPR
3035 || use_code == MAX_EXPR)
3036 && sign != TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt))))
3037 {
3038 fail = true;
3039 break;
3040 }
3041 }
3042 return ! fail && ! neg && *code != ERROR_MARK;
3043 }
3044
3045 bool
3046 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3047 tree loop_arg, enum tree_code code)
3048 {
3049 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3050 enum tree_code code_;
3051 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3052 && code_ == code);
3053 }
3054
3055
3056
3057 /* Function vect_is_simple_reduction
3058
3059 (1) Detect a cross-iteration def-use cycle that represents a simple
3060 reduction computation. We look for the following pattern:
3061
3062 loop_header:
3063 a1 = phi < a0, a2 >
3064 a3 = ...
3065 a2 = operation (a3, a1)
3066
3067 or
3068
3069 a3 = ...
3070 loop_header:
3071 a1 = phi < a0, a2 >
3072 a2 = operation (a3, a1)
3073
3074 such that:
3075 1. operation is commutative and associative and it is safe to
3076 change the order of the computation
3077 2. no uses for a2 in the loop (a2 is used out of the loop)
3078 3. no uses of a1 in the loop besides the reduction operation
3079 4. no uses of a1 outside the loop.
3080
3081 Conditions 1,4 are tested here.
3082 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3083
3084 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3085 nested cycles.
3086
3087 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3088 reductions:
3089
3090 a1 = phi < a0, a2 >
3091 inner loop (def of a3)
3092 a2 = phi < a3 >
3093
3094    (4) Detect condition expressions, i.e.:
3095 for (int i = 0; i < N; i++)
3096 if (a[i] < val)
3097 ret_val = a[i];
3098
3099 */
3100
3101 static stmt_vec_info
3102 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3103 bool *double_reduc, bool *reduc_chain_p)
3104 {
3105 gphi *phi = as_a <gphi *> (phi_info->stmt);
3106 gimple *phi_use_stmt = NULL;
3107 imm_use_iterator imm_iter;
3108 use_operand_p use_p;
3109
3110 *double_reduc = false;
3111 *reduc_chain_p = false;
3112 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3113
3114 tree phi_name = PHI_RESULT (phi);
3115 /* ??? If there are no uses of the PHI result the inner loop reduction
3116 won't be detected as possibly double-reduction by vectorizable_reduction
3117 because that tries to walk the PHI arg from the preheader edge which
3118 can be constant. See PR60382. */
3119 if (has_zero_uses (phi_name))
3120 return NULL;
3121 class loop *loop = (gimple_bb (phi))->loop_father;
3122 unsigned nphi_def_loop_uses = 0;
3123 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3124 {
3125 gimple *use_stmt = USE_STMT (use_p);
3126 if (is_gimple_debug (use_stmt))
3127 continue;
3128
3129 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3130 {
3131 if (dump_enabled_p ())
3132 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3133 "intermediate value used outside loop.\n");
3134
3135 return NULL;
3136 }
3137
3138 nphi_def_loop_uses++;
3139 phi_use_stmt = use_stmt;
3140 }
3141
3142 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3143 if (TREE_CODE (latch_def) != SSA_NAME)
3144 {
3145 if (dump_enabled_p ())
3146 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3147 "reduction: not ssa_name: %T\n", latch_def);
3148 return NULL;
3149 }
3150
3151 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3152 if (!def_stmt_info
3153 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3154 return NULL;
3155
3156 bool nested_in_vect_loop
3157 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3158 unsigned nlatch_def_loop_uses = 0;
3159 auto_vec<gphi *, 3> lcphis;
3160 bool inner_loop_of_double_reduc = false;
3161 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3162 {
3163 gimple *use_stmt = USE_STMT (use_p);
3164 if (is_gimple_debug (use_stmt))
3165 continue;
3166 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3167 nlatch_def_loop_uses++;
3168 else
3169 {
3170 /* We can have more than one loop-closed PHI. */
3171 lcphis.safe_push (as_a <gphi *> (use_stmt));
3172 if (nested_in_vect_loop
3173 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3174 == vect_double_reduction_def))
3175 inner_loop_of_double_reduc = true;
3176 }
3177 }
3178
3179   /* If we are vectorizing an inner reduction, we execute it in the
3180      original order only when we are not dealing with a double
3181      reduction.  */
3182 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3183 {
3184 if (dump_enabled_p ())
3185 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3186 "detected nested cycle: ");
3187 return def_stmt_info;
3188 }
3189
3190   /* If this isn't a nested cycle, or if the nested cycle reduction value
3191      is used outside of the inner loop, we cannot handle uses of the
3192      reduction value.  */
3193 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3194 {
3195 if (dump_enabled_p ())
3196 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3197 "reduction used in loop.\n");
3198 return NULL;
3199 }
3200
3201 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3202 defined in the inner loop. */
3203 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3204 {
3205 tree op1 = PHI_ARG_DEF (def_stmt, 0);
3206 if (gimple_phi_num_args (def_stmt) != 1
3207 || TREE_CODE (op1) != SSA_NAME)
3208 {
3209 if (dump_enabled_p ())
3210 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3211 "unsupported phi node definition.\n");
3212
3213 return NULL;
3214 }
3215
3216 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3217 if (gimple_bb (def1)
3218 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3219 && loop->inner
3220 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3221 && is_gimple_assign (def1)
3222 && is_a <gphi *> (phi_use_stmt)
3223 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3224 {
3225 if (dump_enabled_p ())
3226 report_vect_op (MSG_NOTE, def_stmt,
3227 "detected double reduction: ");
3228
3229 *double_reduc = true;
3230 return def_stmt_info;
3231 }
3232
3233 return NULL;
3234 }
3235
3236   /* Look for the expression computing latch_def from the loop PHI result.  */
3237 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3238 enum tree_code code;
3239 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3240 path))
3241 {
3242 STMT_VINFO_REDUC_CODE (phi_info) = code;
3243 if (code == COND_EXPR && !nested_in_vect_loop)
3244 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3245
3246 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3247 reduction chain for which the additional restriction is that
3248 all operations in the chain are the same. */
3249 auto_vec<stmt_vec_info, 8> reduc_chain;
3250 unsigned i;
3251 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3252 for (i = path.length () - 1; i >= 1; --i)
3253 {
3254 gimple *stmt = USE_STMT (path[i].second);
3255 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3256 STMT_VINFO_REDUC_IDX (stmt_info)
3257 = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
3258 enum tree_code stmt_code = gimple_assign_rhs_code (stmt);
3259 bool leading_conversion = (CONVERT_EXPR_CODE_P (stmt_code)
3260 && (i == 1 || i == path.length () - 1));
3261 if ((stmt_code != code && !leading_conversion)
3262 /* We can only handle the final value in epilogue
3263 generation for reduction chains. */
3264 || (i != 1 && !has_single_use (gimple_assign_lhs (stmt))))
3265 is_slp_reduc = false;
3266 	  /* For reduction chains we support trailing/leading
3267 conversions. We do not store those in the actual chain. */
3268 if (leading_conversion)
3269 continue;
3270 reduc_chain.safe_push (stmt_info);
3271 }
3272 if (is_slp_reduc && reduc_chain.length () > 1)
3273 {
3274 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3275 {
3276 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3277 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3278 }
3279 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3280 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3281
3282 /* Save the chain for further analysis in SLP detection. */
3283 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3284 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3285
3286 *reduc_chain_p = true;
3287 if (dump_enabled_p ())
3288 dump_printf_loc (MSG_NOTE, vect_location,
3289 "reduction: detected reduction chain\n");
3290 }
3291 else if (dump_enabled_p ())
3292 dump_printf_loc (MSG_NOTE, vect_location,
3293 "reduction: detected reduction\n");
3294
3295 return def_stmt_info;
3296 }
3297
3298 if (dump_enabled_p ())
3299 dump_printf_loc (MSG_NOTE, vect_location,
3300 "reduction: unknown pattern\n");
3301
3302 return NULL;
3303 }
3304
3305 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
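/* For example (illustrative numbers): with niters == 17,
   peel_iters_prologue == 3 and assumed_vf == 4, the epilogue peels
   (17 - 3) % 4 == 2 iterations; if peeling for gaps were required and
   that remainder were 0, a full VF (here 4) iterations would be peeled
   for the epilogue instead.  */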
3306 int
3307 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3308 int *peel_iters_epilogue,
3309 stmt_vector_for_cost *scalar_cost_vec,
3310 stmt_vector_for_cost *prologue_cost_vec,
3311 stmt_vector_for_cost *epilogue_cost_vec)
3312 {
3313 int retval = 0;
3314 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3315
3316 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3317 {
3318 *peel_iters_epilogue = assumed_vf / 2;
3319 if (dump_enabled_p ())
3320 dump_printf_loc (MSG_NOTE, vect_location,
3321 "cost model: epilogue peel iters set to vf/2 "
3322 			 "because loop iterations are unknown.\n");
3323
3324       /* If peeled iterations are known but the number of scalar loop
3325 	 iterations is unknown, count a taken branch per peeled loop.  */
3326 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3327 NULL, NULL_TREE, 0, vect_prologue);
3328 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3329 NULL, NULL_TREE, 0, vect_epilogue);
3330 }
3331 else
3332 {
3333 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3334 peel_iters_prologue = niters < peel_iters_prologue ?
3335 niters : peel_iters_prologue;
3336 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3337 /* If we need to peel for gaps but no epilogue peeling is otherwise
3338 required, we have to peel VF iterations. */
3339 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3340 *peel_iters_epilogue = assumed_vf;
3341 }
3342
3343 stmt_info_for_cost *si;
3344 int j;
3345 if (peel_iters_prologue)
3346 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3347 retval += record_stmt_cost (prologue_cost_vec,
3348 si->count * peel_iters_prologue,
3349 si->kind, si->stmt_info, si->misalign,
3350 vect_prologue);
3351 if (*peel_iters_epilogue)
3352 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3353 retval += record_stmt_cost (epilogue_cost_vec,
3354 si->count * *peel_iters_epilogue,
3355 si->kind, si->stmt_info, si->misalign,
3356 vect_epilogue);
3357
3358 return retval;
3359 }
3360
3361 /* Function vect_estimate_min_profitable_iters
3362
3363 Return the number of iterations required for the vector version of the
3364 loop to be profitable relative to the cost of the scalar version of the
3365 loop.
3366
3367 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3368 of iterations for vectorization. -1 value means loop vectorization
3369 is not profitable. This returned value may be used for dynamic
3370 profitability check.
3371
3372 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3373 for static check against estimated number of iterations. */
3374
3375 static void
3376 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3377 int *ret_min_profitable_niters,
3378 int *ret_min_profitable_estimate)
3379 {
3380 int min_profitable_iters;
3381 int min_profitable_estimate;
3382 int peel_iters_prologue;
3383 int peel_iters_epilogue;
3384 unsigned vec_inside_cost = 0;
3385 int vec_outside_cost = 0;
3386 unsigned vec_prologue_cost = 0;
3387 unsigned vec_epilogue_cost = 0;
3388 int scalar_single_iter_cost = 0;
3389 int scalar_outside_cost = 0;
3390 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3391 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3392 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3393
3394 /* Cost model disabled. */
3395 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3396 {
3397 if (dump_enabled_p ())
3398 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3399 *ret_min_profitable_niters = 0;
3400 *ret_min_profitable_estimate = 0;
3401 return;
3402 }
3403
3404 /* Requires loop versioning tests to handle misalignment. */
3405 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3406 {
3407 /* FIXME: Make cost depend on complexity of individual check. */
3408 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3409 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3410 NULL, NULL_TREE, 0, vect_prologue);
3411 if (dump_enabled_p ())
3412 dump_printf (MSG_NOTE,
3413 "cost model: Adding cost of checks for loop "
3414 "versioning to treat misalignment.\n");
3415 }
3416
3417 /* Requires loop versioning with alias checks. */
3418 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3419 {
3420 /* FIXME: Make cost depend on complexity of individual check. */
3421 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3422 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3423 NULL, NULL_TREE, 0, vect_prologue);
3424 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3425 if (len)
3426 /* Count LEN - 1 ANDs and LEN comparisons. */
3427 (void) add_stmt_cost (loop_vinfo, target_cost_data, len * 2 - 1,
3428 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3429 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3430 if (len)
3431 {
3432 /* Count LEN - 1 ANDs and LEN comparisons. */
3433 unsigned int nstmts = len * 2 - 1;
3434 /* +1 for each bias that needs adding. */
3435 for (unsigned int i = 0; i < len; ++i)
3436 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3437 nstmts += 1;
3438 (void) add_stmt_cost (loop_vinfo, target_cost_data, nstmts,
3439 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3440 }
3441 if (dump_enabled_p ())
3442 dump_printf (MSG_NOTE,
3443 "cost model: Adding cost of checks for loop "
3444 "versioning aliasing.\n");
3445 }
3446
3447 /* Requires loop versioning with niter checks. */
3448 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3449 {
3450 /* FIXME: Make cost depend on complexity of individual check. */
3451 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, vector_stmt,
3452 NULL, NULL_TREE, 0, vect_prologue);
3453 if (dump_enabled_p ())
3454 dump_printf (MSG_NOTE,
3455 "cost model: Adding cost of checks for loop "
3456 "versioning niters.\n");
3457 }
3458
3459 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3460 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3461 NULL, NULL_TREE, 0, vect_prologue);
3462
3463 /* Count statements in scalar loop. Using this as scalar cost for a single
3464 iteration for now.
3465
3466 TODO: Add outer loop support.
3467
3468 TODO: Consider assigning different costs to different scalar
3469 statements. */
3470
3471 scalar_single_iter_cost
3472 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3473
3474 /* Add additional cost for the peeled instructions in prologue and epilogue
3475 loop. (For fully-masked loops there will be no peeling.)
3476
3477 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3478 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3479
3480 TODO: Build an expression that represents peel_iters for prologue and
3481 epilogue to be used in a run-time test. */
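/* Editorial illustration with hypothetical numbers (not taken from any
   target's cost tables): for an assumed VF of 8 and an unknown alignment
   peel amount (npeel < 0), the code below charges vf/2 = 4 scalar
   iterations for the prologue and another 4 for the epilogue, plus the
   guard branches around each of the two peel loops. */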
3482
3483 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3484 {
3485 peel_iters_prologue = 0;
3486 peel_iters_epilogue = 0;
3487
3488 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3489 {
3490 /* We need to peel exactly one iteration. */
3491 peel_iters_epilogue += 1;
3492 stmt_info_for_cost *si;
3493 int j;
3494 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3495 j, si)
3496 (void) add_stmt_cost (loop_vinfo, target_cost_data, si->count,
3497 si->kind, si->stmt_info, si->vectype,
3498 si->misalign, vect_epilogue);
3499 }
3500
3501 /* Calculate how many masks we need to generate. */
3502 unsigned int num_masks = 0;
3503 rgroup_masks *rgm;
3504 unsigned int num_vectors_m1;
3505 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
3506 if (rgm->mask_type)
3507 num_masks += num_vectors_m1 + 1;
3508 gcc_assert (num_masks > 0);
3509
3510 /* In the worst case, we need to generate each mask in the prologue
3511 and in the loop body. One of the loop body mask instructions
3512 replaces the comparison in the scalar loop, and since we don't
3513 count the scalar comparison against the scalar body, we shouldn't
3514 count that vector instruction against the vector body either.
3515
3516 Sometimes we can use unpacks instead of generating prologue
3517 masks and sometimes the prologue mask will fold to a constant,
3518 so the actual prologue cost might be smaller. However, it's
3519 simpler and safer to use the worst-case cost; if this ends up
3520 being the tie-breaker between vectorizing or not, then it's
3521 probably better not to vectorize. */
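/* Purely illustrative sketch: if LOOP_VINFO_MASKS held two rgroups
   requiring one and two mask vectors respectively, num_masks would be 3,
   so the calls below would charge three vector stmts to the prologue and
   num_masks - 1 = 2 vector stmts to the loop body. */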
3522 (void) add_stmt_cost (loop_vinfo,
3523 target_cost_data, num_masks, vector_stmt,
3524 NULL, NULL_TREE, 0, vect_prologue);
3525 (void) add_stmt_cost (loop_vinfo,
3526 target_cost_data, num_masks - 1, vector_stmt,
3527 NULL, NULL_TREE, 0, vect_body);
3528 }
3529 else if (npeel < 0)
3530 {
3531 peel_iters_prologue = assumed_vf / 2;
3532 if (dump_enabled_p ())
3533 dump_printf (MSG_NOTE, "cost model: "
3534 "prologue peel iters set to vf/2.\n");
3535
3536 /* If peeling for alignment is unknown, loop bound of main loop becomes
3537 unknown. */
3538 peel_iters_epilogue = assumed_vf / 2;
3539 if (dump_enabled_p ())
3540 dump_printf (MSG_NOTE, "cost model: "
3541 "epilogue peel iters set to vf/2 because "
3542 "peeling for alignment is unknown.\n");
3543
3544 /* If peeled iterations are unknown, count a taken branch and a not taken
3545 branch per peeled loop. Even if scalar loop iterations are known,
3546 vector iterations are not known since peeled prologue iterations are
3547 not known. Hence guards remain the same. */
3548 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3549 NULL, NULL_TREE, 0, vect_prologue);
3550 (void) add_stmt_cost (loop_vinfo,
3551 target_cost_data, 1, cond_branch_not_taken,
3552 NULL, NULL_TREE, 0, vect_prologue);
3553 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3554 NULL, NULL_TREE, 0, vect_epilogue);
3555 (void) add_stmt_cost (loop_vinfo,
3556 target_cost_data, 1, cond_branch_not_taken,
3557 NULL, NULL_TREE, 0, vect_epilogue);
3558 stmt_info_for_cost *si;
3559 int j;
3560 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3561 {
3562 (void) add_stmt_cost (loop_vinfo, target_cost_data,
3563 si->count * peel_iters_prologue,
3564 si->kind, si->stmt_info, si->vectype,
3565 si->misalign,
3566 vect_prologue);
3567 (void) add_stmt_cost (loop_vinfo, target_cost_data,
3568 si->count * peel_iters_epilogue,
3569 si->kind, si->stmt_info, si->vectype,
3570 si->misalign,
3571 vect_epilogue);
3572 }
3573 }
3574 else
3575 {
3576 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3577 stmt_info_for_cost *si;
3578 int j;
3579 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3580
3581 prologue_cost_vec.create (2);
3582 epilogue_cost_vec.create (2);
3583 peel_iters_prologue = npeel;
3584
3585 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3586 &peel_iters_epilogue,
3587 &LOOP_VINFO_SCALAR_ITERATION_COST
3588 (loop_vinfo),
3589 &prologue_cost_vec,
3590 &epilogue_cost_vec);
3591
3592 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3593 (void) add_stmt_cost (loop_vinfo,
3594 data, si->count, si->kind, si->stmt_info,
3595 si->vectype, si->misalign, vect_prologue);
3596
3597 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3598 (void) add_stmt_cost (loop_vinfo,
3599 data, si->count, si->kind, si->stmt_info,
3600 si->vectype, si->misalign, vect_epilogue);
3601
3602 prologue_cost_vec.release ();
3603 epilogue_cost_vec.release ();
3604 }
3605
3606 /* FORNOW: The scalar outside cost is incremented in one of the
3607 following ways:
3608
3609 1. The vectorizer checks for alignment and aliasing and generates
3610 a condition that allows dynamic vectorization. A cost model
3611 check is ANDED with the versioning condition. Hence scalar code
3612 path now has the added cost of the versioning check.
3613
3614 if (cost > th & versioning_check)
3615 jmp to vector code
3616
3617 Hence run-time scalar is incremented by not-taken branch cost.
3618
3619 2. The vectorizer then checks if a prologue is required. If the
3620 cost model check was not done before during versioning, it has to
3621 be done before the prologue check.
3622
3623 if (cost <= th)
3624 prologue = scalar_iters
3625 if (prologue == 0)
3626 jmp to vector code
3627 else
3628 execute prologue
3629 if (prologue == num_iters)
3630 go to exit
3631
3632 Hence the run-time scalar cost is incremented by a taken branch,
3633 plus a not-taken branch, plus a taken branch cost.
3634
3635 3. The vectorizer then checks if an epilogue is required. If the
3636 cost model check was not done before during prologue check, it
3637 has to be done with the epilogue check.
3638
3639 if (prologue == 0)
3640 jmp to vector code
3641 else
3642 execute prologue
3643 if (prologue == num_iters)
3644 go to exit
3645 vector code:
3646 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3647 jmp to epilogue
3648
3649 Hence the run-time scalar cost should be incremented by 2 taken
3650 branches.
3651
3652 TODO: The back end may reorder the BBs differently and reverse
3653 conditions/branch directions. Change the estimates below to
3654 something more reasonable. */
3655
3656 /* If the number of iterations is known and we do not do versioning, we can
3657 decide whether to vectorize at compile time. Hence the scalar version
3658 does not carry cost model guard costs. */
3659 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3660 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3661 {
3662 /* Cost model check occurs at versioning. */
3663 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3664 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3665 else
3666 {
3667 /* Cost model check occurs at prologue generation. */
3668 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3669 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3670 + vect_get_stmt_cost (cond_branch_not_taken);
3671 /* Cost model check occurs at epilogue generation. */
3672 else
3673 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3674 }
3675 }
3676
3677 /* Complete the target-specific cost calculations. */
3678 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3679 &vec_inside_cost, &vec_epilogue_cost);
3680
3681 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3682
3683 /* Stash the costs so that we can compare two loop_vec_infos. */
3684 loop_vinfo->vec_inside_cost = vec_inside_cost;
3685 loop_vinfo->vec_outside_cost = vec_outside_cost;
3686
3687 if (dump_enabled_p ())
3688 {
3689 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3690 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3691 vec_inside_cost);
3692 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3693 vec_prologue_cost);
3694 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3695 vec_epilogue_cost);
3696 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3697 scalar_single_iter_cost);
3698 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3699 scalar_outside_cost);
3700 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3701 vec_outside_cost);
3702 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3703 peel_iters_prologue);
3704 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3705 peel_iters_epilogue);
3706 }
3707
3708 /* Calculate number of iterations required to make the vector version
3709 profitable, relative to the loop bodies only. The following condition
3710 must hold true:
3711 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
3712 where
3713 SIC = scalar iteration cost, VIC = vector iteration cost,
3714 VOC = vector outside cost, VF = vectorization factor,
3715 NPEEL = prologue iterations + epilogue iterations,
3716 SOC = scalar outside cost for run time cost model check. */
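/* A worked instance of the condition above, with purely hypothetical
   numbers (an editorial illustration, not derived from any target's
   cost tables): assume SIC = 4, VIC = 8, VF = 4, NPEEL = 0, SOC = 0 and
   VOC = 24. Each vector iteration then saves SIC * VF - VIC = 8 over
   the 4 scalar iterations it replaces, so the vector loop has to run
   more than VOC / 8 = 3 times, i.e. at least 4 vector (16 scalar)
   iterations, before the condition holds. */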
3717
3718 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
3719 - vec_inside_cost);
3720 if (saving_per_viter <= 0)
3721 {
3722 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3723 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3724 "vectorization did not happen for a simd loop");
3725
3726 if (dump_enabled_p ())
3727 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3728 "cost model: the vector iteration cost = %d "
3729 "divided by the scalar iteration cost = %d "
3730 "is greater or equal to the vectorization factor = %d"
3731 ".\n",
3732 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3733 *ret_min_profitable_niters = -1;
3734 *ret_min_profitable_estimate = -1;
3735 return;
3736 }
3737
3738 /* ??? The "if" arm is written to handle all cases; see below for what
3739 we would do for !LOOP_VINFO_FULLY_MASKED_P. */
3740 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3741 {
3742 /* Rewriting the condition above in terms of the number of
3743 vector iterations (vniters) rather than the number of
3744 scalar iterations (niters) gives:
3745
3746 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
3747
3748 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
3749
3750 For integer N, X and Y when X > 0:
3751
3752 N * X > Y <==> N >= (Y /[floor] X) + 1. */
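/* E.g. (continuing the hypothetical numbers above) an outside_overhead
   of 24 with saving_per_viter = 8 gives min_vec_niters
   = 24 / 8 + 1 = 4 below. */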
3753 int outside_overhead = (vec_outside_cost
3754 - scalar_single_iter_cost * peel_iters_prologue
3755 - scalar_single_iter_cost * peel_iters_epilogue
3756 - scalar_outside_cost);
3757 /* We're only interested in cases that require at least one
3758 vector iteration. */
3759 int min_vec_niters = 1;
3760 if (outside_overhead > 0)
3761 min_vec_niters = outside_overhead / saving_per_viter + 1;
3762
3763 if (dump_enabled_p ())
3764 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
3765 min_vec_niters);
3766
3767 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3768 {
3769 /* Now that we know the minimum number of vector iterations,
3770 find the minimum niters for which the scalar cost is larger:
3771
3772 SIC * niters > VIC * vniters + VOC - SOC
3773
3774 We know that the minimum niters is no more than
3775 vniters * VF + NPEEL, but it might be (and often is) less
3776 than that if a partial vector iteration is cheaper than the
3777 equivalent scalar code. */
3778 int threshold = (vec_inside_cost * min_vec_niters
3779 + vec_outside_cost
3780 - scalar_outside_cost);
3781 if (threshold <= 0)
3782 min_profitable_iters = 1;
3783 else
3784 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
3785 }
3786 else
3787 /* Convert the number of vector iterations into a number of
3788 scalar iterations. */
3789 min_profitable_iters = (min_vec_niters * assumed_vf
3790 + peel_iters_prologue
3791 + peel_iters_epilogue);
3792 }
3793 else
3794 {
3795 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3796 * assumed_vf
3797 - vec_inside_cost * peel_iters_prologue
3798 - vec_inside_cost * peel_iters_epilogue);
3799 if (min_profitable_iters <= 0)
3800 min_profitable_iters = 0;
3801 else
3802 {
3803 min_profitable_iters /= saving_per_viter;
3804
3805 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3806 <= (((int) vec_inside_cost * min_profitable_iters)
3807 + (((int) vec_outside_cost - scalar_outside_cost)
3808 * assumed_vf)))
3809 min_profitable_iters++;
3810 }
3811 }
3812
3813 if (dump_enabled_p ())
3814 dump_printf (MSG_NOTE,
3815 " Calculated minimum iters for profitability: %d\n",
3816 min_profitable_iters);
3817
3818 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3819 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3820 /* We want the vectorized loop to execute at least once. */
3821 min_profitable_iters = assumed_vf + peel_iters_prologue;
3822
3823 if (dump_enabled_p ())
3824 dump_printf_loc (MSG_NOTE, vect_location,
3825 " Runtime profitability threshold = %d\n",
3826 min_profitable_iters);
3827
3828 *ret_min_profitable_niters = min_profitable_iters;
3829
3830 /* Calculate number of iterations required to make the vector version
3831 profitable, relative to the loop bodies only.
3832
3833 The non-vectorized variant costs SIC * niters and must win over the vector
3834 variant on the expected loop trip count, i.e. the following must hold:
3835 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
3836
3837 if (vec_outside_cost <= 0)
3838 min_profitable_estimate = 0;
3839 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3840 {
3841 /* This is a repeat of the code above, but with + SOC rather
3842 than - SOC. */
3843 int outside_overhead = (vec_outside_cost
3844 - scalar_single_iter_cost * peel_iters_prologue
3845 - scalar_single_iter_cost * peel_iters_epilogue
3846 + scalar_outside_cost);
3847 int min_vec_niters = 1;
3848 if (outside_overhead > 0)
3849 min_vec_niters = outside_overhead / saving_per_viter + 1;
3850
3851 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3852 {
3853 int threshold = (vec_inside_cost * min_vec_niters
3854 + vec_outside_cost
3855 + scalar_outside_cost);
3856 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
3857 }
3858 else
3859 min_profitable_estimate = (min_vec_niters * assumed_vf
3860 + peel_iters_prologue
3861 + peel_iters_epilogue);
3862 }
3863 else
3864 {
3865 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3866 * assumed_vf
3867 - vec_inside_cost * peel_iters_prologue
3868 - vec_inside_cost * peel_iters_epilogue)
3869 / ((scalar_single_iter_cost * assumed_vf)
3870 - vec_inside_cost);
3871 }
3872 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3873 if (dump_enabled_p ())
3874 dump_printf_loc (MSG_NOTE, vect_location,
3875 " Static estimate profitability threshold = %d\n",
3876 min_profitable_estimate);
3877
3878 *ret_min_profitable_estimate = min_profitable_estimate;
3879 }
3880
3881 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3882 vector elements (not bits) for a vector with NELT elements. */
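/* For illustration (hypothetical values): with OFFSET == 2 and NELT == 8
   the three encoded elements are {2, 3, 4}; vec_perm_indices extends this
   stepped pattern to the full selector {2, 3, 4, 5, 6, 7, 8, 9}, where
   indices 8 and 9 select elements from the second input of the
   permutation. */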
3883 static void
3884 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3885 vec_perm_builder *sel)
3886 {
3887 /* The encoding is a single stepped pattern. Any wrap-around is handled
3888 by vec_perm_indices. */
3889 sel->new_vector (nelt, 1, 3);
3890 for (unsigned int i = 0; i < 3; i++)
3891 sel->quick_push (i + offset);
3892 }
3893
3894 /* Checks whether the target supports whole-vector shifts for vectors of mode
3895 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3896 it supports vec_perm_const with masks for all necessary shift amounts. */
3897 static bool
3898 have_whole_vector_shift (machine_mode mode)
3899 {
3900 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3901 return true;
3902
3903 /* Variable-length vectors should be handled via the optab. */
3904 unsigned int nelt;
3905 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3906 return false;
3907
3908 vec_perm_builder sel;
3909 vec_perm_indices indices;
3910 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3911 {
3912 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3913 indices.new_vector (sel, 2, nelt);
3914 if (!can_vec_perm_const_p (mode, indices, false))
3915 return false;
3916 }
3917 return true;
3918 }
3919
3920 /* TODO: There is a close dependency between the vect_model_*_cost and the
3921 vectorizable_* functions. Design this better to avoid maintenance issues. */
3922
3923 /* Function vect_model_reduction_cost.
3924
3925 Models cost for a reduction operation, including the vector ops
3926 generated within the strip-mine loop, the initial definition before
3927 the loop, and the epilogue code that must be generated. */
3928
3929 static void
3930 vect_model_reduction_cost (loop_vec_info loop_vinfo,
3931 stmt_vec_info stmt_info, internal_fn reduc_fn,
3932 vect_reduction_type reduction_type,
3933 int ncopies, stmt_vector_for_cost *cost_vec)
3934 {
3935 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3936 enum tree_code code;
3937 optab optab;
3938 tree vectype;
3939 machine_mode mode;
3940 class loop *loop = NULL;
3941
3942 if (loop_vinfo)
3943 loop = LOOP_VINFO_LOOP (loop_vinfo);
3944
3945 /* Condition reductions generate two reductions in the loop. */
3946 if (reduction_type == COND_REDUCTION)
3947 ncopies *= 2;
3948
3949 vectype = STMT_VINFO_VECTYPE (stmt_info);
3950 mode = TYPE_MODE (vectype);
3951 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3952
3953 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3954
3955 if (reduction_type == EXTRACT_LAST_REDUCTION)
3956 /* No extra instructions are needed in the prologue. The loop body
3957 operations are costed in vectorizable_condition. */
3958 inside_cost = 0;
3959 else if (reduction_type == FOLD_LEFT_REDUCTION)
3960 {
3961 /* No extra instructions needed in the prologue. */
3962 prologue_cost = 0;
3963
3964 if (reduc_fn != IFN_LAST)
3965 /* Count one reduction-like operation per vector. */
3966 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3967 stmt_info, 0, vect_body);
3968 else
3969 {
3970 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3971 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3972 inside_cost = record_stmt_cost (cost_vec, nelements,
3973 vec_to_scalar, stmt_info, 0,
3974 vect_body);
3975 inside_cost += record_stmt_cost (cost_vec, nelements,
3976 scalar_stmt, stmt_info, 0,
3977 vect_body);
3978 }
3979 }
3980 else
3981 {
3982 /* Add in cost for initial definition.
3983 For cond reduction we have four vectors: initial index, step,
3984 initial result of the data reduction, initial value of the index
3985 reduction. */
3986 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3987 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3988 scalar_to_vec, stmt_info, 0,
3989 vect_prologue);
3990
3991 /* Cost of reduction op inside loop. */
3992 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3993 stmt_info, 0, vect_body);
3994 }
3995
3996 /* Determine cost of epilogue code.
3997
3998 We have a reduction operator that will reduce the vector in one statement.
3999 Also requires scalar extract. */
4000
4001 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4002 {
4003 if (reduc_fn != IFN_LAST)
4004 {
4005 if (reduction_type == COND_REDUCTION)
4006 {
4007 /* An EQ stmt and a COND_EXPR stmt. */
4008 epilogue_cost += record_stmt_cost (cost_vec, 2,
4009 vector_stmt, stmt_info, 0,
4010 vect_epilogue);
4011 /* Reduction of the max index and a reduction of the found
4012 values. */
4013 epilogue_cost += record_stmt_cost (cost_vec, 2,
4014 vec_to_scalar, stmt_info, 0,
4015 vect_epilogue);
4016 /* A broadcast of the max value. */
4017 epilogue_cost += record_stmt_cost (cost_vec, 1,
4018 scalar_to_vec, stmt_info, 0,
4019 vect_epilogue);
4020 }
4021 else
4022 {
4023 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4024 stmt_info, 0, vect_epilogue);
4025 epilogue_cost += record_stmt_cost (cost_vec, 1,
4026 vec_to_scalar, stmt_info, 0,
4027 vect_epilogue);
4028 }
4029 }
4030 else if (reduction_type == COND_REDUCTION)
4031 {
4032 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4033 /* Extraction of scalar elements. */
4034 epilogue_cost += record_stmt_cost (cost_vec,
4035 2 * estimated_nunits,
4036 vec_to_scalar, stmt_info, 0,
4037 vect_epilogue);
4038 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4039 epilogue_cost += record_stmt_cost (cost_vec,
4040 2 * estimated_nunits - 3,
4041 scalar_stmt, stmt_info, 0,
4042 vect_epilogue);
4043 }
4044 else if (reduction_type == EXTRACT_LAST_REDUCTION
4045 || reduction_type == FOLD_LEFT_REDUCTION)
4046 /* No extra instructions are needed in the epilogue. */
4047 ;
4048 else
4049 {
4050 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4051 tree bitsize =
4052 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4053 int element_bitsize = tree_to_uhwi (bitsize);
4054 int nelements = vec_size_in_bits / element_bitsize;
4055
4056 if (code == COND_EXPR)
4057 code = MAX_EXPR;
4058
4059 optab = optab_for_tree_code (code, vectype, optab_default);
4060
4061 /* We have a whole vector shift available. */
4062 if (optab != unknown_optab
4063 && VECTOR_MODE_P (mode)
4064 && optab_handler (optab, mode) != CODE_FOR_nothing
4065 && have_whole_vector_shift (mode))
4066 {
4067 /* Final reduction via vector shifts and the reduction operator.
4068 Also requires scalar extract. */
4069 epilogue_cost += record_stmt_cost (cost_vec,
4070 exact_log2 (nelements) * 2,
4071 vector_stmt, stmt_info, 0,
4072 vect_epilogue);
4073 epilogue_cost += record_stmt_cost (cost_vec, 1,
4074 vec_to_scalar, stmt_info, 0,
4075 vect_epilogue);
4076 }
4077 else
4078 /* Use extracts and reduction op for final reduction. For N
4079 elements, we have N extracts and N-1 reduction ops. */
4080 epilogue_cost += record_stmt_cost (cost_vec,
4081 nelements + nelements - 1,
4082 vector_stmt, stmt_info, 0,
4083 vect_epilogue);
4084 }
4085 }
4086
4087 if (dump_enabled_p ())
4088 dump_printf (MSG_NOTE,
4089 "vect_model_reduction_cost: inside_cost = %d, "
4090 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4091 prologue_cost, epilogue_cost);
4092 }
4093
4094
4095 /* Function vect_model_induction_cost.
4096
4097 Models cost for induction operations. */
4098
4099 static void
4100 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4101 stmt_vector_for_cost *cost_vec)
4102 {
4103 unsigned inside_cost, prologue_cost;
4104
4105 if (PURE_SLP_STMT (stmt_info))
4106 return;
4107
4108 /* loop cost for vec_loop. */
4109 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4110 stmt_info, 0, vect_body);
4111
4112 /* prologue cost for vec_init and vec_step. */
4113 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4114 stmt_info, 0, vect_prologue);
4115
4116 if (dump_enabled_p ())
4117 dump_printf_loc (MSG_NOTE, vect_location,
4118 "vect_model_induction_cost: inside_cost = %d, "
4119 "prologue_cost = %d .\n", inside_cost, prologue_cost);
4120 }
4121
4122
4123
4124 /* Function get_initial_def_for_reduction
4125
4126 Input:
4127 STMT_VINFO - a stmt that performs a reduction operation in the loop.
4128 INIT_VAL - the initial value of the reduction variable
4129
4130 Output:
4131 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4132 of the reduction (used for adjusting the epilog - see below).
4133 Return a vector variable, initialized according to the operation that
4134 STMT_VINFO performs. This vector will be used as the initial value
4135 of the vector of partial results.
4136
4137 Option1 (adjust in epilog): Initialize the vector as follows:
4138 add/bit or/xor: [0,0,...,0,0]
4139 mult/bit and: [1,1,...,1,1]
4140 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4141 and when necessary (e.g. add/mult case) let the caller know
4142 that it needs to adjust the result by init_val.
4143
4144 Option2: Initialize the vector as follows:
4145 add/bit or/xor: [init_val,0,0,...,0]
4146 mult/bit and: [init_val,1,1,...,1]
4147 min/max/cond_expr: [init_val,init_val,...,init_val]
4148 and no adjustments are needed.
4149
4150 For example, for the following code:
4151
4152 s = init_val;
4153 for (i=0;i<n;i++)
4154 s = s + a[i];
4155
4156 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4157 For a vector of 4 units, we want to return either [0,0,0,init_val],
4158 or [0,0,0,0] and let the caller know that it needs to adjust
4159 the result at the end by 'init_val'.
4160
4161 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
4162 is not NULL, because the initialization vector is then simpler (the same
4163 element in all entries); otherwise we use Option2.
4164
4165 A cost model should help decide between these two schemes. */
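/* A concrete (purely illustrative) instance of the two options above:
   for a 4-element integer multiply reduction with init_val = 5,
   Option1 returns {1,1,1,1} and sets ADJUSTMENT_DEF to 5 so the caller
   later folds 5 back into the final result, whereas Option2 returns
   {5,1,1,1} and needs no adjustment. */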
4166
4167 static tree
4168 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4169 stmt_vec_info stmt_vinfo,
4170 enum tree_code code, tree init_val,
4171 tree *adjustment_def)
4172 {
4173 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4174 tree scalar_type = TREE_TYPE (init_val);
4175 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4176 tree def_for_init;
4177 tree init_def;
4178 REAL_VALUE_TYPE real_init_val = dconst0;
4179 int int_init_val = 0;
4180 gimple_seq stmts = NULL;
4181
4182 gcc_assert (vectype);
4183
4184 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4185 || SCALAR_FLOAT_TYPE_P (scalar_type));
4186
4187 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4188 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4189
4190 /* ADJUSTMENT_DEF is NULL when called from
4191 vect_create_epilog_for_reduction to vectorize double reduction. */
4192 if (adjustment_def)
4193 *adjustment_def = NULL;
4194
4195 switch (code)
4196 {
4197 case WIDEN_SUM_EXPR:
4198 case DOT_PROD_EXPR:
4199 case SAD_EXPR:
4200 case PLUS_EXPR:
4201 case MINUS_EXPR:
4202 case BIT_IOR_EXPR:
4203 case BIT_XOR_EXPR:
4204 case MULT_EXPR:
4205 case BIT_AND_EXPR:
4206 {
4207 if (code == MULT_EXPR)
4208 {
4209 real_init_val = dconst1;
4210 int_init_val = 1;
4211 }
4212
4213 if (code == BIT_AND_EXPR)
4214 int_init_val = -1;
4215
4216 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4217 def_for_init = build_real (scalar_type, real_init_val);
4218 else
4219 def_for_init = build_int_cst (scalar_type, int_init_val);
4220
4221 if (adjustment_def || operand_equal_p (def_for_init, init_val, 0))
4222 {
4223 /* Option1: the first element is '0' or '1' as well. */
4224 if (!operand_equal_p (def_for_init, init_val, 0))
4225 *adjustment_def = init_val;
4226 init_def = gimple_build_vector_from_val (&stmts, vectype,
4227 def_for_init);
4228 }
4229 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4230 {
4231 /* Option2 (variable length): the first element is INIT_VAL. */
4232 init_def = gimple_build_vector_from_val (&stmts, vectype,
4233 def_for_init);
4234 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4235 vectype, init_def, init_val);
4236 }
4237 else
4238 {
4239 /* Option2: the first element is INIT_VAL. */
4240 tree_vector_builder elts (vectype, 1, 2);
4241 elts.quick_push (init_val);
4242 elts.quick_push (def_for_init);
4243 init_def = gimple_build_vector (&stmts, &elts);
4244 }
4245 }
4246 break;
4247
4248 case MIN_EXPR:
4249 case MAX_EXPR:
4250 case COND_EXPR:
4251 {
4252 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4253 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4254 }
4255 break;
4256
4257 default:
4258 gcc_unreachable ();
4259 }
4260
4261 if (stmts)
4262 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4263 return init_def;
4264 }
4265
4266 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4267 NUMBER_OF_VECTORS is the number of vector defs to create.
4268 If NEUTRAL_OP is nonnull, introducing extra elements of that
4269 value will not change the result. */
4270
4271 static void
4272 get_initial_defs_for_reduction (vec_info *vinfo,
4273 slp_tree slp_node,
4274 vec<tree> *vec_oprnds,
4275 unsigned int number_of_vectors,
4276 bool reduc_chain, tree neutral_op)
4277 {
4278 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4279 stmt_vec_info stmt_vinfo = stmts[0];
4280 unsigned HOST_WIDE_INT nunits;
4281 unsigned j, number_of_places_left_in_vector;
4282 tree vector_type;
4283 unsigned int group_size = stmts.length ();
4284 unsigned int i;
4285 class loop *loop;
4286
4287 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4288
4289 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4290
4291 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4292 gcc_assert (loop);
4293 edge pe = loop_preheader_edge (loop);
4294
4295 gcc_assert (!reduc_chain || neutral_op);
4296
4297 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4298 created vectors. It is greater than 1 if unrolling is performed.
4299
4300 For example, we have two scalar operands, s1 and s2 (e.g., group of
4301 strided accesses of size two), while NUNITS is four (i.e., four scalars
4302 of this type can be packed in a vector). The output vector will contain
4303 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4304 will be 2).
4305
4306 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4307 vectors containing the operands.
4308
4309 For example, NUNITS is four as before, and the group size is 8
4310 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4311 {s5, s6, s7, s8}. */
4312
4313 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4314 nunits = group_size;
4315
4316 number_of_places_left_in_vector = nunits;
4317 bool constant_p = true;
4318 tree_vector_builder elts (vector_type, nunits, 1);
4319 elts.quick_grow (nunits);
4320 gimple_seq ctor_seq = NULL;
4321 for (j = 0; j < nunits * number_of_vectors; ++j)
4322 {
4323 tree op;
4324 i = j % group_size;
4325 stmt_vinfo = stmts[i];
4326
4327 /* Get the def before the loop. In a reduction chain we have only one
4328 initial value; otherwise we have as many initial values as PHIs in the group. */
4329 if (reduc_chain)
4330 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4331 else if (((vec_oprnds->length () + 1) * nunits
4332 - number_of_places_left_in_vector >= group_size)
4333 && neutral_op)
4334 op = neutral_op;
4335 else
4336 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4337
4338 /* Create 'vect_ = {op0,op1,...,opn}'. */
4339 number_of_places_left_in_vector--;
4340 elts[nunits - number_of_places_left_in_vector - 1] = op;
4341 if (!CONSTANT_CLASS_P (op))
4342 constant_p = false;
4343
4344 if (number_of_places_left_in_vector == 0)
4345 {
4346 tree init;
4347 if (constant_p && !neutral_op
4348 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4349 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4350 /* Build the vector directly from ELTS. */
4351 init = gimple_build_vector (&ctor_seq, &elts);
4352 else if (neutral_op)
4353 {
4354 /* Build a vector of the neutral value and shift the
4355 other elements into place. */
4356 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4357 neutral_op);
4358 int k = nunits;
4359 while (k > 0 && elts[k - 1] == neutral_op)
4360 k -= 1;
4361 while (k > 0)
4362 {
4363 k -= 1;
4364 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4365 vector_type, init, elts[k]);
4366 }
4367 }
4368 else
4369 {
4370 /* First time round, duplicate ELTS to fill the
4371 required number of vectors. */
4372 duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts,
4373 number_of_vectors, *vec_oprnds);
4374 break;
4375 }
4376 vec_oprnds->quick_push (init);
4377
4378 number_of_places_left_in_vector = nunits;
4379 elts.new_vector (vector_type, nunits, 1);
4380 elts.quick_grow (nunits);
4381 constant_p = true;
4382 }
4383 }
4384 if (ctor_seq != NULL)
4385 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4386 }
4387
4388 /* For a statement STMT_INFO taking part in a reduction operation return
4389 the stmt_vec_info the meta information is stored on. */
4390
4391 stmt_vec_info
4392 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
4393 {
4394 stmt_info = vect_orig_stmt (stmt_info);
4395 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4396 if (!is_a <gphi *> (stmt_info->stmt))
4397 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4398 gphi *phi = as_a <gphi *> (stmt_info->stmt);
4399 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4400 {
4401 if (gimple_phi_num_args (phi) == 1)
4402 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4403 }
4404 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4405 {
4406 edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father);
4407 stmt_vec_info info
4408 = vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
4409 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4410 stmt_info = info;
4411 }
4412 return stmt_info;
4413 }
4414
4415 /* Function vect_create_epilog_for_reduction
4416
4417 Create code at the loop-epilog to finalize the result of a reduction
4418 computation.
4419
4420 STMT_INFO is the scalar reduction stmt that is being vectorized.
4421 SLP_NODE is an SLP node containing a group of reduction statements. The
4422 first one in this group is STMT_INFO.
4423 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
4424 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
4425 (counting from 0)
4426
4427 This function:
4428 1. Completes the reduction def-use cycles.
4429 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4430 by calling the function specified by REDUC_FN if available, or by
4431 other means (whole-vector shifts or a scalar loop).
4432 The function also creates a new phi node at the loop exit to preserve
4433 loop-closed form, as illustrated below.
4434
4435 The flow at the entry to this function:
4436
4437 loop:
4438 vec_def = phi <vec_init, null> # REDUCTION_PHI
4439 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4440 s_loop = scalar_stmt # (scalar) STMT_INFO
4441 loop_exit:
4442 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4443 use <s_out0>
4444 use <s_out0>
4445
4446 The above is transformed by this function into:
4447
4448 loop:
4449 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4450 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4451 s_loop = scalar_stmt # (scalar) STMT_INFO
4452 loop_exit:
4453 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4454 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4455 v_out2 = reduce <v_out1>
4456 s_out3 = extract_field <v_out2, 0>
4457 s_out4 = adjust_result <s_out3>
4458 use <s_out4>
4459 use <s_out4>
4460 */
4461
4462 static void
4463 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
4464 stmt_vec_info stmt_info,
4465 slp_tree slp_node,
4466 slp_instance slp_node_instance)
4467 {
4468 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
4469 gcc_assert (reduc_info->is_reduc_info);
4470 /* For double reductions we need to get at the inner loop reduction
4471 stmt which has the meta info attached. Our stmt_info is that of the
4472 loop-closed PHI of the inner loop which we remember as
4473 def for the reduction PHI generation. */
4474 bool double_reduc = false;
4475 stmt_vec_info rdef_info = stmt_info;
4476 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4477 {
4478 gcc_assert (!slp_node);
4479 double_reduc = true;
4480 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
4481 (stmt_info->stmt, 0));
4482 stmt_info = vect_stmt_to_vectorize (stmt_info);
4483 }
4484 gphi *reduc_def_stmt
4485 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
4486 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
4487 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
4488 stmt_vec_info prev_phi_info;
4489 tree vectype;
4490 machine_mode mode;
4491 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4492 basic_block exit_bb;
4493 tree scalar_dest;
4494 tree scalar_type;
4495 gimple *new_phi = NULL, *phi;
4496 stmt_vec_info phi_info;
4497 gimple_stmt_iterator exit_gsi;
4498 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
4499 gimple *epilog_stmt = NULL;
4500 gimple *exit_phi;
4501 tree bitsize;
4502 tree def;
4503 tree orig_name, scalar_result;
4504 imm_use_iterator imm_iter, phi_imm_iter;
4505 use_operand_p use_p, phi_use_p;
4506 gimple *use_stmt;
4507 bool nested_in_vect_loop = false;
4508 auto_vec<gimple *> new_phis;
4509 int j, i;
4510 auto_vec<tree> scalar_results;
4511 unsigned int group_size = 1, k;
4512 auto_vec<gimple *> phis;
4513 bool slp_reduc = false;
4514 bool direct_slp_reduc;
4515 tree new_phi_result;
4516 tree induction_index = NULL_TREE;
4517
4518 if (slp_node)
4519 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4520
4521 if (nested_in_vect_loop_p (loop, stmt_info))
4522 {
4523 outer_loop = loop;
4524 loop = loop->inner;
4525 nested_in_vect_loop = true;
4526 gcc_assert (!slp_node);
4527 }
4528 gcc_assert (!nested_in_vect_loop || double_reduc);
4529
4530 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
4531 gcc_assert (vectype);
4532 mode = TYPE_MODE (vectype);
4533
4534 tree initial_def = NULL;
4535 tree induc_val = NULL_TREE;
4536 tree adjustment_def = NULL;
4537 if (slp_node)
4538 ;
4539 else
4540 {
4541 /* Get at the scalar def before the loop that defines the initial value
4542 of the reduction variable. */
4543 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4544 loop_preheader_edge (loop));
4545 /* Optimize: for induction condition reduction, if we can't use zero
4546 for induc_val, use initial_def. */
4547 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
4548 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
4549 else if (double_reduc)
4550 ;
4551 else if (nested_in_vect_loop)
4552 ;
4553 else
4554 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
4555 }
4556
4557 unsigned vec_num;
4558 int ncopies;
4559 if (slp_node)
4560 {
4561 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
4562 ncopies = 1;
4563 }
4564 else
4565 {
4566 vec_num = 1;
4567 ncopies = 0;
4568 phi_info = STMT_VINFO_VEC_STMT (loop_vinfo->lookup_stmt (reduc_def_stmt));
4569 do
4570 {
4571 ncopies++;
4572 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4573 }
4574 while (phi_info);
4575 }
4576
4577 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4578 which is updated with the current index of the loop for every match of
4579 the original loop's cond_expr (VEC_STMT). This results in a vector
4580 containing the last time the condition passed for that vector lane.
4581 The first match will be a 1 to allow 0 to be used for non-matching
4582 indexes. If there are no matches at all then the vector will be all
4583 zeroes.
4584
4585 PR92772: This algorithm is broken for architectures that support
4586 masked vectors, but do not provide fold_extract_last. */
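/* Editorial illustration with hypothetical values: for VF == 4 the index
   IV yields {1,2,3,4} in the first vector iteration and {5,6,7,8} in the
   second. If lane 2 matched only in the first iteration and lane 0 only
   in the second, the accumulated index vector is {5,0,3,0}; the epilogue
   code below then looks for the largest index (5) to pick the last
   matching data value. */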
4587 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
4588 {
4589 auto_vec<std::pair<tree, bool>, 2> ccompares;
4590 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
4591 cond_info = vect_stmt_to_vectorize (cond_info);
4592 while (cond_info != reduc_info)
4593 {
4594 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
4595 {
4596 gimple *vec_stmt = STMT_VINFO_VEC_STMT (cond_info)->stmt;
4597 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4598 ccompares.safe_push
4599 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
4600 STMT_VINFO_REDUC_IDX (cond_info) == 2));
4601 }
4602 cond_info
4603 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
4604 1 + STMT_VINFO_REDUC_IDX
4605 (cond_info)));
4606 cond_info = vect_stmt_to_vectorize (cond_info);
4607 }
4608 gcc_assert (ccompares.length () != 0);
4609
4610 tree indx_before_incr, indx_after_incr;
4611 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4612 int scalar_precision
4613 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4614 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4615 tree cr_index_vector_type = get_related_vectype_for_scalar_type
4616 (TYPE_MODE (vectype), cr_index_scalar_type,
4617 TYPE_VECTOR_SUBPARTS (vectype));
4618
4619 /* First we create a simple vector induction variable which starts
4620 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4621 vector size (STEP). */
4622
4623 /* Create a {1,2,3,...} vector. */
4624 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4625
4626 /* Create a vector of the step value. */
4627 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4628 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4629
4630 /* Create an induction variable. */
4631 gimple_stmt_iterator incr_gsi;
4632 bool insert_after;
4633 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4634 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4635 insert_after, &indx_before_incr, &indx_after_incr);
4636
4637 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4638 filled with zeros (VEC_ZERO). */
4639
4640 /* Create a vector of 0s. */
4641 tree zero = build_zero_cst (cr_index_scalar_type);
4642 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4643
4644 /* Create a vector phi node. */
4645 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4646 new_phi = create_phi_node (new_phi_tree, loop->header);
4647 loop_vinfo->add_stmt (new_phi);
4648 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4649 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4650
4651 /* Now take the condition from the loop's original cond_exprs
4652 and produce a new cond_expr (INDEX_COND_EXPR) which for
4653 every match uses values from the induction variable
4654 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4655 (NEW_PHI_TREE).
4656 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4657 the new cond_expr (INDEX_COND_EXPR). */
4658 gimple_seq stmts = NULL;
4659 for (int i = ccompares.length () - 1; i != -1; --i)
4660 {
4661 tree ccompare = ccompares[i].first;
4662 if (ccompares[i].second)
4663 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
4664 cr_index_vector_type,
4665 ccompare,
4666 indx_before_incr, new_phi_tree);
4667 else
4668 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
4669 cr_index_vector_type,
4670 ccompare,
4671 new_phi_tree, indx_before_incr);
4672 }
4673 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
4674 stmt_vec_info index_vec_info
4675 = loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (new_phi_tree));
4676 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4677
4678 /* Update the phi with the vec cond. */
4679 induction_index = new_phi_tree;
4680 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4681 loop_latch_edge (loop), UNKNOWN_LOCATION);
4682 }
4683
4684 /* 2. Create epilog code.
4685 The reduction epilog code operates across the elements of the vector
4686 of partial results computed by the vectorized loop.
4687 The reduction epilog code consists of:
4688
4689 step 1: compute the scalar result in a vector (v_out2)
4690 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4691 step 3: adjust the scalar result (s_out3) if needed.
4692
4693 Step 1 can be accomplished using one of the following three schemes:
4694 (scheme 1) using reduc_fn, if available.
4695 (scheme 2) using whole-vector shifts, if available.
4696 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4697 combined.
4698
4699 The overall epilog code looks like this:
4700
4701 s_out0 = phi <s_loop> # original EXIT_PHI
4702 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4703 v_out2 = reduce <v_out1> # step 1
4704 s_out3 = extract_field <v_out2, 0> # step 2
4705 s_out4 = adjust_result <s_out3> # step 3
4706
4707 (step 3 is optional, and steps 1 and 2 may be combined).
4708 Lastly, the uses of s_out0 are replaced by s_out4. */
4709
4710
4711 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4712 v_out1 = phi <VECT_DEF>
4713 Store them in NEW_PHIS. */
4714 if (double_reduc)
4715 loop = outer_loop;
4716 exit_bb = single_exit (loop)->dest;
4717 prev_phi_info = NULL;
4718 new_phis.create (slp_node ? vec_num : ncopies);
4719 for (unsigned i = 0; i < vec_num; i++)
4720 {
4721 if (slp_node)
4722 def = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]->stmt);
4723 else
4724 def = gimple_get_lhs (STMT_VINFO_VEC_STMT (rdef_info)->stmt);
4725 for (j = 0; j < ncopies; j++)
4726 {
4727 tree new_def = copy_ssa_name (def);
4728 phi = create_phi_node (new_def, exit_bb);
4729 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4730 if (j == 0)
4731 new_phis.quick_push (phi);
4732 else
4733 {
4734 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4735 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4736 }
4737
4738 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4739 prev_phi_info = phi_info;
4740 }
4741 }
4742
4743 exit_gsi = gsi_after_labels (exit_bb);
4744
4745 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4746 (i.e. when reduc_fn is not available) and in the final adjustment
4747 code (if needed). Also get the original scalar reduction variable as
4748 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4749 represents a reduction pattern), the tree-code and scalar-def are
4750 taken from the original stmt that the pattern-stmt (STMT) replaces.
4751 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4752 are taken from STMT. */
4753
4754 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4755 if (orig_stmt_info != stmt_info)
4756 {
4757 /* Reduction pattern */
4758 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4759 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4760 }
4761
4762 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4763 scalar_type = TREE_TYPE (scalar_dest);
4764 scalar_results.create (group_size);
4765 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4766 bitsize = TYPE_SIZE (scalar_type);
4767
4768 /* SLP reduction without reduction chain, e.g.,
4769 # a1 = phi <a2, a0>
4770 # b1 = phi <b2, b0>
4771 a2 = operation (a1)
4772 b2 = operation (b1) */
4773 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4774
4775 /* True if we should implement SLP_REDUC using native reduction operations
4776 instead of scalar operations. */
4777 direct_slp_reduc = (reduc_fn != IFN_LAST
4778 && slp_reduc
4779 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4780
4781 /* In case of reduction chain, e.g.,
4782 # a1 = phi <a3, a0>
4783 a2 = operation (a1)
4784 a3 = operation (a2),
4785
4786 we may end up with more than one vector result. Here we reduce them to
4787 one vector. */
4788 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
4789 {
4790 gimple_seq stmts = NULL;
4791 tree first_vect = PHI_RESULT (new_phis[0]);
4792 first_vect = gimple_convert (&stmts, vectype, first_vect);
4793 for (k = 1; k < new_phis.length (); k++)
4794 {
4795 gimple *next_phi = new_phis[k];
4796 tree second_vect = PHI_RESULT (next_phi);
4797 second_vect = gimple_convert (&stmts, vectype, second_vect);
4798 first_vect = gimple_build (&stmts, code, vectype,
4799 first_vect, second_vect);
4800 }
4801 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4802
4803 new_phi_result = first_vect;
4804 new_phis.truncate (0);
4805 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
4806 }
4807 /* Likewise if we couldn't use a single def-use cycle. */
4808 else if (ncopies > 1)
4809 {
4810 gcc_assert (new_phis.length () == 1);
4811 gimple_seq stmts = NULL;
4812 tree first_vect = PHI_RESULT (new_phis[0]);
4813 first_vect = gimple_convert (&stmts, vectype, first_vect);
4814 stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
4815 for (int k = 1; k < ncopies; ++k)
4816 {
4817 next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
4818 tree second_vect = PHI_RESULT (next_phi_info->stmt);
4819 second_vect = gimple_convert (&stmts, vectype, second_vect);
4820 first_vect = gimple_build (&stmts, code, vectype,
4821 first_vect, second_vect);
4822 }
4823 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4824 new_phi_result = first_vect;
4825 new_phis.truncate (0);
4826 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
4827 }
4828 else
4829 new_phi_result = PHI_RESULT (new_phis[0]);
4830
4831 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
4832 && reduc_fn != IFN_LAST)
4833 {
4834 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4835 various data values where the condition matched and another vector
4836 (INDUCTION_INDEX) containing all the indexes of those matches. We
4837 need to extract the last matching index (which will be the index with
4838 highest value) and use this to index into the data vector.
4839 For the case where there were no matches, the data vector will contain
4840 all default values and the index vector will be all zeros. */
4841
4842 /* Get various versions of the type of the vector of indexes. */
4843 tree index_vec_type = TREE_TYPE (induction_index);
4844 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4845 tree index_scalar_type = TREE_TYPE (index_vec_type);
4846 tree index_vec_cmp_type = truth_type_for (index_vec_type);
4847
4848 /* Get an unsigned integer version of the type of the data vector. */
4849 int scalar_precision
4850 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4851 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4852 tree vectype_unsigned = build_vector_type
4853 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4854
4855 /* First we need to create a vector (ZERO_VEC) of zeros and another
4856 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4857 can create using a MAX reduction and then expanding.
4858 In the case where the loop never made any matches, the max index will
4859 be zero. */
4860
4861 /* Vector of {0, 0, 0,...}. */
4862 tree zero_vec = build_zero_cst (vectype);
4863
4864 gimple_seq stmts = NULL;
4865 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
4866 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4867
4868 /* Find maximum value from the vector of found indexes. */
4869 tree max_index = make_ssa_name (index_scalar_type);
4870 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4871 1, induction_index);
4872 gimple_call_set_lhs (max_index_stmt, max_index);
4873 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4874
4875 /* Vector of {max_index, max_index, max_index,...}. */
4876 tree max_index_vec = make_ssa_name (index_vec_type);
4877 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4878 max_index);
4879 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4880 max_index_vec_rhs);
4881 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4882
4883 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4884 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4885 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4886 otherwise. Only one value should match, resulting in a vector
4887 (VEC_COND) with one data value and the rest zeros.
4888 In the case where the loop never made any matches, every index will
4889 match, resulting in a vector with all data values (which will all be
4890 the default value). */
4891
4892 /* Compare the max index vector to the vector of found indexes to find
4893 the position of the max value. */
4894 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4895 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4896 induction_index,
4897 max_index_vec);
4898 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4899
4900 /* Use the compare to choose either values from the data vector or
4901 zero. */
4902 tree vec_cond = make_ssa_name (vectype);
4903 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4904 vec_compare, new_phi_result,
4905 zero_vec);
4906 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4907
4908 /* Finally we need to extract the data value from the vector (VEC_COND)
4909 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
4910 reduction, but because this doesn't exist, we can use a MAX reduction
4911 instead. The data value might be signed or a float, so we need to cast
4912 it first.
4913 In the case where the loop never made any matches, the data values are
4914 all identical, and so will reduce down correctly. */
4915
4916 /* Make the matched data values unsigned. */
4917 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4918 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4919 vec_cond);
4920 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4921 VIEW_CONVERT_EXPR,
4922 vec_cond_cast_rhs);
4923 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4924
4925 /* Reduce down to a scalar value. */
4926 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4927 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4928 1, vec_cond_cast);
4929 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4930 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4931
4932 /* Convert the reduced value back to the result type and set as the
4933 result. */
4934 stmts = NULL;
4935 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4936 data_reduc);
4937 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4938 scalar_results.safe_push (new_temp);
4939 }
4940 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
4941 && reduc_fn == IFN_LAST)
4942 {
4943 /* Condition reduction without supported IFN_REDUC_MAX. Generate
4944 idx = 0;
4945 idx_val = induction_index[0];
4946 val = data_reduc[0];
4947 for (idx = 0, val = init, i = 0; i < nelts; ++i)
4948 if (induction_index[i] > idx_val)
4949 val = data_reduc[i], idx_val = induction_index[i];
4950 return val; */
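/* For illustration, with a hypothetical 4-lane vector the loop below
   extracts the index and data lanes with BIT_FIELD_REFs at bit offsets
   0, el_size, 2*el_size and 3*el_size, keeps the running maximum of the
   index lanes in IDX_VAL, and uses a COND_EXPR per lane so that VAL
   tracks the data lane whose index is currently the largest.  */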
4951
4952 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4953 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4954 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4955 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4956 /* Enforced by vectorizable_reduction, which ensures we have target
4957 support before allowing a conditional reduction on variable-length
4958 vectors. */
4959 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
4960 tree idx_val = NULL_TREE, val = NULL_TREE;
4961 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4962 {
4963 tree old_idx_val = idx_val;
4964 tree old_val = val;
4965 idx_val = make_ssa_name (idx_eltype);
4966 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4967 build3 (BIT_FIELD_REF, idx_eltype,
4968 induction_index,
4969 bitsize_int (el_size),
4970 bitsize_int (off)));
4971 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4972 val = make_ssa_name (data_eltype);
4973 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4974 build3 (BIT_FIELD_REF,
4975 data_eltype,
4976 new_phi_result,
4977 bitsize_int (el_size),
4978 bitsize_int (off)));
4979 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4980 if (off != 0)
4981 {
4982 tree new_idx_val = idx_val;
4983 if (off != v_size - el_size)
4984 {
4985 new_idx_val = make_ssa_name (idx_eltype);
4986 epilog_stmt = gimple_build_assign (new_idx_val,
4987 MAX_EXPR, idx_val,
4988 old_idx_val);
4989 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4990 }
4991 tree new_val = make_ssa_name (data_eltype);
4992 epilog_stmt = gimple_build_assign (new_val,
4993 COND_EXPR,
4994 build2 (GT_EXPR,
4995 boolean_type_node,
4996 idx_val,
4997 old_idx_val),
4998 val, old_val);
4999 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5000 idx_val = new_idx_val;
5001 val = new_val;
5002 }
5003 }
5004 /* Convert the reduced value back to the result type and set as the
5005 result. */
5006 gimple_seq stmts = NULL;
5007 val = gimple_convert (&stmts, scalar_type, val);
5008 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5009 scalar_results.safe_push (val);
5010 }
5011
5012 /* 2.3 Create the reduction code, using one of the three schemes described
5013 above. In SLP we simply need to extract all the elements from the
5014 vector (without reducing them), so we use scalar shifts. */
5015 else if (reduc_fn != IFN_LAST && !slp_reduc)
5016 {
5017 tree tmp;
5018 tree vec_elem_type;
5019
5020 /* Case 1: Create:
5021 v_out2 = reduc_expr <v_out1> */
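/* For a PLUS reduction this typically ends up as a single internal
   function call, e.g. (illustration only, exact SSA names will differ):
     _tmp = .REDUC_PLUS (v_out1);
   possibly wrapped in conversions to and from the scalar type.  */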
5022
5023 if (dump_enabled_p ())
5024 dump_printf_loc (MSG_NOTE, vect_location,
5025 "Reduce using direct vector reduction.\n");
5026
5027 gimple_seq stmts = NULL;
5028 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5029 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5030 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5031 vec_elem_type, new_phi_result);
5032 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5033 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5034
5035 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5036 && induc_val)
5037 {
5038 /* Earlier we set the initial value to be a vector of induc_val
5039 values. Check the result and if it is induc_val then replace
5040 it with the original initial value, unless induc_val is
5041 the same as initial_def already. */
5042 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5043 induc_val);
5044
5045 tmp = make_ssa_name (new_scalar_dest);
5046 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5047 initial_def, new_temp);
5048 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5049 new_temp = tmp;
5050 }
5051
5052 scalar_results.safe_push (new_temp);
5053 }
5054 else if (direct_slp_reduc)
5055 {
5056 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5057 with the elements for other SLP statements replaced with the
5058 neutral value. We can then do a normal reduction on each vector. */
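/* Sketch for a hypothetical REDUC_GROUP_SIZE of 2 and a 4-lane vector
   { v0, v1, v2, v3 }: lanes 0 and 2 belong to the first SLP result and
   lanes 1 and 3 to the second, so we build
     { v0, neutral, v2, neutral } and { neutral, v1, neutral, v3 }
   and apply the direct reduction to each of them.  */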
5059
5060 /* Enforced by vectorizable_reduction. */
5061 gcc_assert (new_phis.length () == 1);
5062 gcc_assert (pow2p_hwi (group_size));
5063
5064 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5065 vec<stmt_vec_info> orig_phis
5066 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5067 gimple_seq seq = NULL;
5068
5069 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5070 and the same element size as VECTYPE. */
5071 tree index = build_index_vector (vectype, 0, 1);
5072 tree index_type = TREE_TYPE (index);
5073 tree index_elt_type = TREE_TYPE (index_type);
5074 tree mask_type = truth_type_for (index_type);
5075
5076 /* Create a vector that, for each element, identifies which of
5077 the REDUC_GROUP_SIZE results should use it. */
5078 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5079 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5080 build_vector_from_val (index_type, index_mask));
5081
5082 /* Get a neutral vector value. This is simply a splat of the neutral
5083 scalar value if we have one, otherwise the initial scalar value
5084 is itself a neutral value. */
5085 tree vector_identity = NULL_TREE;
5086 tree neutral_op = NULL_TREE;
5087 if (slp_node)
5088 {
5089 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
5090 neutral_op
5091 = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis,
5092 vectype, code, first != NULL);
5093 }
5094 if (neutral_op)
5095 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5096 neutral_op);
5097 for (unsigned int i = 0; i < group_size; ++i)
5098 {
5099 /* If there's no universal neutral value, we can use the
5100 initial scalar value from the original PHI. This is used
5101 for MIN and MAX reduction, for example. */
5102 if (!neutral_op)
5103 {
5104 tree scalar_value
5105 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5106 loop_preheader_edge (loop));
5107 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5108 scalar_value);
5109 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5110 scalar_value);
5111 }
5112
5113 /* Calculate the equivalent of:
5114
5115 sel[j] = (index[j] == i);
5116
5117 which selects the elements of NEW_PHI_RESULT that should
5118 be included in the result. */
5119 tree compare_val = build_int_cst (index_elt_type, i);
5120 compare_val = build_vector_from_val (index_type, compare_val);
5121 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5122 index, compare_val);
5123
5124 /* Calculate the equivalent of:
5125
5126 vec = sel ? new_phi_result : vector_identity;
5127
5128 VEC is now suitable for a full vector reduction. */
5129 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5130 sel, new_phi_result, vector_identity);
5131
5132 /* Do the reduction and convert it to the appropriate type. */
5133 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5134 TREE_TYPE (vectype), vec);
5135 scalar = gimple_convert (&seq, scalar_type, scalar);
5136 scalar_results.safe_push (scalar);
5137 }
5138 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5139 }
5140 else
5141 {
5142 bool reduce_with_shift;
5143 tree vec_temp;
5144
5145 gcc_assert (slp_reduc || new_phis.length () == 1);
5146
5147 /* See if the target wants to do the final (shift) reduction
5148 in a vector mode of smaller size and first reduce upper/lower
5149 halves against each other. */
5150 enum machine_mode mode1 = mode;
5151 tree stype = TREE_TYPE (vectype);
5152 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5153 unsigned nunits1 = nunits;
5154 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5155 && new_phis.length () == 1)
5156 {
5157 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5158 /* For SLP reductions we have to make sure lanes match up, but
5159 since we're doing an individual-element final reduction, reducing
5160 the vector width here is even more important.
5161 ??? We could also separate lanes with permutes; for the common
5162 case of a power-of-two group size, odd/even extracts would work. */
5163 if (slp_reduc && nunits != nunits1)
5164 {
5165 nunits1 = least_common_multiple (nunits1, group_size);
5166 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5167 }
5168 }
5169 if (!slp_reduc
5170 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5171 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5172
5173 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5174 stype, nunits1);
5175 reduce_with_shift = have_whole_vector_shift (mode1);
5176 if (!VECTOR_MODE_P (mode1))
5177 reduce_with_shift = false;
5178 else
5179 {
5180 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5181 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5182 reduce_with_shift = false;
5183 }
5184
5185 /* First reduce the vector to the desired vector size on which we
5186 should do the shift reduction, by combining upper and lower halves. */
5187 new_temp = new_phi_result;
5188 while (nunits > nunits1)
5189 {
5190 nunits /= 2;
5191 vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5192 stype, nunits);
5193 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5194
5195 /* The target has to make sure we support lowpart/highpart
5196 extraction, either via direct vector extract or through
5197 integer mode punning. */
5198 tree dst1, dst2;
5199 if (convert_optab_handler (vec_extract_optab,
5200 TYPE_MODE (TREE_TYPE (new_temp)),
5201 TYPE_MODE (vectype1))
5202 != CODE_FOR_nothing)
5203 {
5204 /* Extract sub-vectors directly once vec_extract becomes
5205 a conversion optab. */
5206 dst1 = make_ssa_name (vectype1);
5207 epilog_stmt
5208 = gimple_build_assign (dst1, BIT_FIELD_REF,
5209 build3 (BIT_FIELD_REF, vectype1,
5210 new_temp, TYPE_SIZE (vectype1),
5211 bitsize_int (0)));
5212 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5213 dst2 = make_ssa_name (vectype1);
5214 epilog_stmt
5215 = gimple_build_assign (dst2, BIT_FIELD_REF,
5216 build3 (BIT_FIELD_REF, vectype1,
5217 new_temp, TYPE_SIZE (vectype1),
5218 bitsize_int (bitsize)));
5219 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5220 }
5221 else
5222 {
5223 /* Extract via punning to appropriately sized integer mode
5224 vector. */
5225 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5226 tree etype = build_vector_type (eltype, 2);
5227 gcc_assert (convert_optab_handler (vec_extract_optab,
5228 TYPE_MODE (etype),
5229 TYPE_MODE (eltype))
5230 != CODE_FOR_nothing);
5231 tree tem = make_ssa_name (etype);
5232 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5233 build1 (VIEW_CONVERT_EXPR,
5234 etype, new_temp));
5235 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5236 new_temp = tem;
5237 tem = make_ssa_name (eltype);
5238 epilog_stmt
5239 = gimple_build_assign (tem, BIT_FIELD_REF,
5240 build3 (BIT_FIELD_REF, eltype,
5241 new_temp, TYPE_SIZE (eltype),
5242 bitsize_int (0)));
5243 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5244 dst1 = make_ssa_name (vectype1);
5245 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5246 build1 (VIEW_CONVERT_EXPR,
5247 vectype1, tem));
5248 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5249 tem = make_ssa_name (eltype);
5250 epilog_stmt
5251 = gimple_build_assign (tem, BIT_FIELD_REF,
5252 build3 (BIT_FIELD_REF, eltype,
5253 new_temp, TYPE_SIZE (eltype),
5254 bitsize_int (bitsize)));
5255 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5256 dst2 = make_ssa_name (vectype1);
5257 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5258 build1 (VIEW_CONVERT_EXPR,
5259 vectype1, tem));
5260 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5261 }
5262
5263 new_temp = make_ssa_name (vectype1);
5264 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5265 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5266 new_phis[0] = epilog_stmt;
5267 }
5268
5269 if (reduce_with_shift && !slp_reduc)
5270 {
5271 int element_bitsize = tree_to_uhwi (bitsize);
5272 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5273 for variable-length vectors and also requires direct target support
5274 for loop reductions. */
5275 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5276 int nelements = vec_size_in_bits / element_bitsize;
5277 vec_perm_builder sel;
5278 vec_perm_indices indices;
5279
5280 int elt_offset;
5281
5282 tree zero_vec = build_zero_cst (vectype1);
5283 /* Case 2: Create:
5284 for (offset = nelements/2; offset >= 1; offset/=2)
5285 {
5286 Create: va' = vec_shift <va, offset>
5287 Create: va = vop <va, va'>
5288 } */
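/* Worked example for a hypothetical 4-lane PLUS reduction of
   { a, b, c, d }: shifting by two lanes (zero-filled) and adding gives
   { a+c, b+d, c, d }; shifting that by one lane and adding gives
   { a+b+c+d, ... }; lane 0 then holds the scalar result extracted
   in step 2.4 below.  */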
5289
5290 tree rhs;
5291
5292 if (dump_enabled_p ())
5293 dump_printf_loc (MSG_NOTE, vect_location,
5294 "Reduce using vector shifts\n");
5295
5296 gimple_seq stmts = NULL;
5297 new_temp = gimple_convert (&stmts, vectype1, new_temp);
5298 for (elt_offset = nelements / 2;
5299 elt_offset >= 1;
5300 elt_offset /= 2)
5301 {
5302 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5303 indices.new_vector (sel, 2, nelements);
5304 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5305 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5306 new_temp, zero_vec, mask);
5307 new_temp = gimple_build (&stmts, code,
5308 vectype1, new_name, new_temp);
5309 }
5310 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5311
5312 /* 2.4 Extract the final scalar result. Create:
5313 s_out3 = extract_field <v_out2, bitpos> */
5314
5315 if (dump_enabled_p ())
5316 dump_printf_loc (MSG_NOTE, vect_location,
5317 "extract scalar result\n");
5318
5319 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5320 bitsize, bitsize_zero_node);
5321 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5322 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5323 gimple_assign_set_lhs (epilog_stmt, new_temp);
5324 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5325 scalar_results.safe_push (new_temp);
5326 }
5327 else
5328 {
5329 /* Case 3: Create:
5330 s = extract_field <v_out2, 0>
5331 for (offset = element_size;
5332 offset < vector_size;
5333 offset += element_size;)
5334 {
5335 Create: s' = extract_field <v_out2, offset>
5336 Create: s = op <s, s'> // For non SLP cases
5337 } */
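/* E.g. for a hypothetical 4-lane vector this emits four BIT_FIELD_REF
   extracts per vector and, in the non-SLP case, chains them with the
   scalar operation as s = op (s3, op (s2, op (s1, s0))); in the SLP
   case the extracted lanes are simply pushed into SCALAR_RESULTS.  */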
5338
5339 if (dump_enabled_p ())
5340 dump_printf_loc (MSG_NOTE, vect_location,
5341 "Reduce using scalar code.\n");
5342
5343 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5344 int element_bitsize = tree_to_uhwi (bitsize);
5345 tree compute_type = TREE_TYPE (vectype);
5346 gimple_seq stmts = NULL;
5347 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5348 {
5349 int bit_offset;
5350 if (gimple_code (new_phi) == GIMPLE_PHI)
5351 vec_temp = PHI_RESULT (new_phi);
5352 else
5353 vec_temp = gimple_assign_lhs (new_phi);
5354 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5355 vec_temp, bitsize, bitsize_zero_node);
5356
5357 /* In SLP we don't need to apply the reduction operation, so we
5358 just collect the s' values in SCALAR_RESULTS. */
5359 if (slp_reduc)
5360 scalar_results.safe_push (new_temp);
5361
5362 for (bit_offset = element_bitsize;
5363 bit_offset < vec_size_in_bits;
5364 bit_offset += element_bitsize)
5365 {
5366 tree bitpos = bitsize_int (bit_offset);
5367 new_name = gimple_build (&stmts, BIT_FIELD_REF,
5368 compute_type, vec_temp,
5369 bitsize, bitpos);
5370 if (slp_reduc)
5371 {
5372 /* In SLP we don't need to apply the reduction operation,
5373 so we just collect the s' values in SCALAR_RESULTS. */
5374 new_temp = new_name;
5375 scalar_results.safe_push (new_name);
5376 }
5377 else
5378 new_temp = gimple_build (&stmts, code, compute_type,
5379 new_name, new_temp);
5380 }
5381 }
5382
5383 /* The only case where we need to reduce scalar results in SLP is
5384 unrolling. If the size of SCALAR_RESULTS is greater than
5385 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5386 REDUC_GROUP_SIZE. */
5387 if (slp_reduc)
5388 {
5389 tree res, first_res, new_res;
5390
5391 /* Reduce multiple scalar results in case of SLP unrolling. */
5392 for (j = group_size; scalar_results.iterate (j, &res);
5393 j++)
5394 {
5395 first_res = scalar_results[j % group_size];
5396 new_res = gimple_build (&stmts, code, compute_type,
5397 first_res, res);
5398 scalar_results[j % group_size] = new_res;
5399 }
5400 for (k = 0; k < group_size; k++)
5401 scalar_results[k] = gimple_convert (&stmts, scalar_type,
5402 scalar_results[k]);
5403 }
5404 else
5405 {
5406 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5407 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5408 scalar_results.safe_push (new_temp);
5409 }
5410
5411 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5412 }
5413
5414 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5415 && induc_val)
5416 {
5417 /* Earlier we set the initial value to be a vector of induc_val
5418 values. Check the result and if it is induc_val then replace
5419 it with the original initial value, unless induc_val is
5420 the same as initial_def already. */
5421 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5422 induc_val);
5423
5424 tree tmp = make_ssa_name (new_scalar_dest);
5425 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5426 initial_def, new_temp);
5427 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5428 scalar_results[0] = tmp;
5429 }
5430 }
5431
5432 /* 2.5 Adjust the final result by the initial value of the reduction
5433 variable. (When such adjustment is not needed, then
5434 'adjustment_def' is zero). For example, if code is PLUS we create:
5435 new_temp = loop_exit_def + adjustment_def */
5436
5437 if (adjustment_def)
5438 {
5439 gcc_assert (!slp_reduc);
5440 gimple_seq stmts = NULL;
5441 if (nested_in_vect_loop)
5442 {
5443 new_phi = new_phis[0];
5444 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
5445 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
5446 new_temp = gimple_build (&stmts, code, vectype,
5447 PHI_RESULT (new_phi), adjustment_def);
5448 }
5449 else
5450 {
5451 new_temp = scalar_results[0];
5452 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5453 adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
5454 new_temp = gimple_build (&stmts, code, scalar_type,
5455 new_temp, adjustment_def);
5456 }
5457
5458 epilog_stmt = gimple_seq_last_stmt (stmts);
5459 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5460 if (nested_in_vect_loop)
5461 {
5462 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5463 STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5464 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5465
5466 if (!double_reduc)
5467 scalar_results.quick_push (new_temp);
5468 else
5469 scalar_results[0] = new_temp;
5470 }
5471 else
5472 scalar_results[0] = new_temp;
5473
5474 new_phis[0] = epilog_stmt;
5475 }
5476
5477 if (double_reduc)
5478 loop = loop->inner;
5479
5480 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5481 phis with new adjusted scalar results, i.e., replace use <s_out0>
5482 with use <s_out4>.
5483
5484 Transform:
5485 loop_exit:
5486 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5487 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5488 v_out2 = reduce <v_out1>
5489 s_out3 = extract_field <v_out2, 0>
5490 s_out4 = adjust_result <s_out3>
5491 use <s_out0>
5492 use <s_out0>
5493
5494 into:
5495
5496 loop_exit:
5497 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5498 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5499 v_out2 = reduce <v_out1>
5500 s_out3 = extract_field <v_out2, 0>
5501 s_out4 = adjust_result <s_out3>
5502 use <s_out4>
5503 use <s_out4> */
5504
5505
5506 /* In an SLP reduction chain we reduce vector results into one vector if
5507 necessary; hence we set REDUC_GROUP_SIZE to 1 here. SCALAR_DEST is the
5508 LHS of the last stmt in the reduction chain, since we are looking for
5509 the loop exit phi node. */
5510 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5511 {
5512 stmt_vec_info dest_stmt_info
5513 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5514 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5515 group_size = 1;
5516 }
5517
5518 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5519 case REDUC_GROUP_SIZE is greater than the vectorization factor).
5520 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5521 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5522 correspond to the first vector stmt, etc.
5523 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
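/* For instance, with a hypothetical REDUC_GROUP_SIZE of 4 and two new
   vector stmts the ratio is 2: scalar results 0 and 1 come from the
   first vector stmt and scalar results 2 and 3 from the second.  */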
5524 if (group_size > new_phis.length ())
5525 gcc_assert (!(group_size % new_phis.length ()));
5526
5527 for (k = 0; k < group_size; k++)
5528 {
5529 if (slp_reduc)
5530 {
5531 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5532
5533 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5534 /* SLP statements can't participate in patterns. */
5535 gcc_assert (!orig_stmt_info);
5536 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5537 }
5538
5539 if (nested_in_vect_loop)
5540 {
5541 if (double_reduc)
5542 loop = outer_loop;
5543 else
5544 gcc_unreachable ();
5545 }
5546
5547 phis.create (3);
5548 /* Find the loop-closed-use at the loop exit of the original scalar
5549 result. (The reduction result is expected to have two immediate uses,
5550 one at the latch block, and one at the loop exit). For double
5551 reductions we are looking for exit phis of the outer loop. */
5552 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5553 {
5554 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5555 {
5556 if (!is_gimple_debug (USE_STMT (use_p)))
5557 phis.safe_push (USE_STMT (use_p));
5558 }
5559 else
5560 {
5561 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5562 {
5563 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5564
5565 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5566 {
5567 if (!flow_bb_inside_loop_p (loop,
5568 gimple_bb (USE_STMT (phi_use_p)))
5569 && !is_gimple_debug (USE_STMT (phi_use_p)))
5570 phis.safe_push (USE_STMT (phi_use_p));
5571 }
5572 }
5573 }
5574 }
5575
5576 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5577 {
5578 /* Replace the uses: */
5579 orig_name = PHI_RESULT (exit_phi);
5580 scalar_result = scalar_results[k];
5581 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5582 {
5583 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5584 SET_USE (use_p, scalar_result);
5585 update_stmt (use_stmt);
5586 }
5587 }
5588
5589 phis.release ();
5590 }
5591 }
5592
5593 /* Return a vector of type VECTYPE that is equal to the vector select
5594 operation "MASK ? VEC : IDENTITY". Insert the select statements
5595 before GSI. */
5596
5597 static tree
5598 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5599 tree vec, tree identity)
5600 {
5601 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5602 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5603 mask, vec, identity);
5604 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5605 return cond;
5606 }
5607
5608 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5609 order, starting with LHS. Insert the extraction statements before GSI and
5610 associate the new scalar SSA names with variable SCALAR_DEST.
5611 Return the SSA name for the result. */
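/* For example, with a hypothetical 4-element VECTOR_RHS v the emitted
   sequence computes
     CODE (CODE (CODE (CODE (LHS, v[0]), v[1]), v[2]), v[3]),
   using one BIT_FIELD_REF extraction and one scalar statement per
   element.  */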
5612
5613 static tree
5614 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5615 tree_code code, tree lhs, tree vector_rhs)
5616 {
5617 tree vectype = TREE_TYPE (vector_rhs);
5618 tree scalar_type = TREE_TYPE (vectype);
5619 tree bitsize = TYPE_SIZE (scalar_type);
5620 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5621 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5622
5623 for (unsigned HOST_WIDE_INT bit_offset = 0;
5624 bit_offset < vec_size_in_bits;
5625 bit_offset += element_bitsize)
5626 {
5627 tree bitpos = bitsize_int (bit_offset);
5628 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5629 bitsize, bitpos);
5630
5631 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5632 rhs = make_ssa_name (scalar_dest, stmt);
5633 gimple_assign_set_lhs (stmt, rhs);
5634 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5635
5636 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5637 tree new_name = make_ssa_name (scalar_dest, stmt);
5638 gimple_assign_set_lhs (stmt, new_name);
5639 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5640 lhs = new_name;
5641 }
5642 return lhs;
5643 }
5644
5645 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
5646 type of the vector input. */
5647
5648 static internal_fn
5649 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
5650 {
5651 internal_fn mask_reduc_fn;
5652
5653 switch (reduc_fn)
5654 {
5655 case IFN_FOLD_LEFT_PLUS:
5656 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
5657 break;
5658
5659 default:
5660 return IFN_LAST;
5661 }
5662
5663 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
5664 OPTIMIZE_FOR_SPEED))
5665 return mask_reduc_fn;
5666 return IFN_LAST;
5667 }
5668
5669 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5670 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5671 statement. CODE is the operation performed by STMT_INFO and OPS are
5672 its scalar operands. REDUC_INDEX is the index of the operand in
5673 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5674 implements in-order reduction, or IFN_LAST if we should open-code it.
5675 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5676 that should be used to control the operation in a fully-masked loop. */
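/* Sketch of the emitted code (illustration only): with a supported
   REDUC_FN in a fully-masked loop each copy becomes something like
     reduc_var' = .MASK_FOLD_LEFT_PLUS (reduc_var, vec_def, loop_mask);
   without such an internal function the vector operand is instead
   expanded element by element via vect_expand_fold_left.  */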
5677
5678 static bool
5679 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
5680 stmt_vec_info stmt_info,
5681 gimple_stmt_iterator *gsi,
5682 stmt_vec_info *vec_stmt, slp_tree slp_node,
5683 gimple *reduc_def_stmt,
5684 tree_code code, internal_fn reduc_fn,
5685 tree ops[3], tree vectype_in,
5686 int reduc_index, vec_loop_masks *masks)
5687 {
5688 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5689 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5690 stmt_vec_info new_stmt_info = NULL;
5691 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
5692
5693 int ncopies;
5694 if (slp_node)
5695 ncopies = 1;
5696 else
5697 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5698
5699 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5700 gcc_assert (ncopies == 1);
5701 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5702
5703 if (slp_node)
5704 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5705 TYPE_VECTOR_SUBPARTS (vectype_in)));
5706
5707 tree op0 = ops[1 - reduc_index];
5708
5709 int group_size = 1;
5710 stmt_vec_info scalar_dest_def_info;
5711 auto_vec<tree> vec_oprnds0;
5712 if (slp_node)
5713 {
5714 auto_vec<vec<tree> > vec_defs (2);
5715 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
5716 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
5717 vec_defs[0].release ();
5718 vec_defs[1].release ();
5719 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5720 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5721 }
5722 else
5723 {
5724 tree loop_vec_def0 = vect_get_vec_def_for_operand (loop_vinfo,
5725 op0, stmt_info);
5726 vec_oprnds0.create (1);
5727 vec_oprnds0.quick_push (loop_vec_def0);
5728 scalar_dest_def_info = stmt_info;
5729 }
5730
5731 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5732 tree scalar_type = TREE_TYPE (scalar_dest);
5733 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5734
5735 int vec_num = vec_oprnds0.length ();
5736 gcc_assert (vec_num == 1 || slp_node);
5737 tree vec_elem_type = TREE_TYPE (vectype_out);
5738 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5739
5740 tree vector_identity = NULL_TREE;
5741 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5742 vector_identity = build_zero_cst (vectype_out);
5743
5744 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5745 int i;
5746 tree def0;
5747 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5748 {
5749 gimple *new_stmt;
5750 tree mask = NULL_TREE;
5751 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5752 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5753
5754 /* Handle MINUS by adding the negative. */
5755 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5756 {
5757 tree negated = make_ssa_name (vectype_out);
5758 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5759 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5760 def0 = negated;
5761 }
5762
5763 if (mask && mask_reduc_fn == IFN_LAST)
5764 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5765 vector_identity);
5766
5767 /* On the first iteration the input is simply the scalar phi
5768 result, and for subsequent iterations it is the output of
5769 the preceding operation. */
5770 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
5771 {
5772 if (mask && mask_reduc_fn != IFN_LAST)
5773 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
5774 def0, mask);
5775 else
5776 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
5777 def0);
5778 /* For chained SLP reductions the output of the previous reduction
5779 operation serves as the input of the next. For the final statement
5780 the output cannot be a temporary - we reuse the original
5781 scalar destination of the last statement. */
5782 if (i != vec_num - 1)
5783 {
5784 gimple_set_lhs (new_stmt, scalar_dest_var);
5785 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5786 gimple_set_lhs (new_stmt, reduc_var);
5787 }
5788 }
5789 else
5790 {
5791 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5792 reduc_var, def0);
5793 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5794 /* Remove the statement, so that we can use the same code paths
5795 as for statements that we've just created. */
5796 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5797 gsi_remove (&tmp_gsi, true);
5798 }
5799
5800 if (i == vec_num - 1)
5801 {
5802 gimple_set_lhs (new_stmt, scalar_dest);
5803 new_stmt_info = vect_finish_replace_stmt (loop_vinfo,
5804 scalar_dest_def_info,
5805 new_stmt);
5806 }
5807 else
5808 new_stmt_info = vect_finish_stmt_generation (loop_vinfo,
5809 scalar_dest_def_info,
5810 new_stmt, gsi);
5811
5812 if (slp_node)
5813 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
5814 }
5815
5816 if (!slp_node)
5817 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
5818
5819 return true;
5820 }
5821
5822 /* Function is_nonwrapping_integer_induction.
5823
5824 Check if STMT_VINFO (which is part of loop LOOP) is an induction that
5825 both increments and does not cause overflow. */
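/* For example, with hypothetical values: a 16-bit unsigned IV with
   base 10 and step 3 over at most 1000 iterations reaches at most
   10 + 3 * 1000 = 3010, which needs only 12 bits and thus cannot wrap;
   the same IV over 30000 iterations could exceed 65535 and would be
   rejected.  */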
5826
5827 static bool
5828 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
5829 {
5830 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
5831 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5832 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5833 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
5834 widest_int ni, max_loop_value, lhs_max;
5835 wi::overflow_type overflow = wi::OVF_NONE;
5836
5837 /* Make sure the loop is integer based. */
5838 if (TREE_CODE (base) != INTEGER_CST
5839 || TREE_CODE (step) != INTEGER_CST)
5840 return false;
5841
5842 /* Check that the max size of the loop will not wrap. */
5843
5844 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5845 return true;
5846
5847 if (! max_stmt_executions (loop, &ni))
5848 return false;
5849
5850 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5851 &overflow);
5852 if (overflow)
5853 return false;
5854
5855 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5856 TYPE_SIGN (lhs_type), &overflow);
5857 if (overflow)
5858 return false;
5859
5860 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5861 <= TYPE_PRECISION (lhs_type));
5862 }
5863
5864 /* Check if masking can be supported by inserting a conditional expression.
5865 CODE is the code for the operation. COND_FN is the conditional internal
5866 function, if it exists. VECTYPE_IN is the type of the vector input. */
5867 static bool
5868 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
5869 tree vectype_in)
5870 {
5871 if (cond_fn != IFN_LAST
5872 && direct_internal_fn_supported_p (cond_fn, vectype_in,
5873 OPTIMIZE_FOR_SPEED))
5874 return false;
5875
5876 switch (code)
5877 {
5878 case DOT_PROD_EXPR:
5879 case SAD_EXPR:
5880 return true;
5881
5882 default:
5883 return false;
5884 }
5885 }
5886
5887 /* Insert a conditional expression to enable masked vectorization. CODE is the
5888 code for the operation. VOP is the array of operands. MASK is the loop
5889 mask. GSI is a statement iterator used to place the new conditional
5890 expression. */
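/* For example, for DOT_PROD_EXPR the inactive lanes of operand 1 are
   replaced by zero, so their products contribute nothing to the
   accumulator; for SAD_EXPR they are replaced by a copy of operand 0,
   making the absolute difference for those lanes zero.  */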
5891 static void
5892 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
5893 gimple_stmt_iterator *gsi)
5894 {
5895 switch (code)
5896 {
5897 case DOT_PROD_EXPR:
5898 {
5899 tree vectype = TREE_TYPE (vop[1]);
5900 tree zero = build_zero_cst (vectype);
5901 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
5902 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
5903 mask, vop[1], zero);
5904 gsi_insert_before (gsi, select, GSI_SAME_STMT);
5905 vop[1] = masked_op1;
5906 break;
5907 }
5908
5909 case SAD_EXPR:
5910 {
5911 tree vectype = TREE_TYPE (vop[1]);
5912 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
5913 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
5914 mask, vop[1], vop[0]);
5915 gsi_insert_before (gsi, select, GSI_SAME_STMT);
5916 vop[1] = masked_op1;
5917 break;
5918 }
5919
5920 default:
5921 gcc_unreachable ();
5922 }
5923 }
5924
5925 /* Function vectorizable_reduction.
5926
5927 Check if STMT_INFO performs a reduction operation that can be vectorized.
5928 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
5929 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5930 Return true if STMT_INFO is vectorizable in this way.
5931
5932 This function also handles reduction idioms (patterns) that have been
5933 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
5934 may be of this form:
5935 X = pattern_expr (arg0, arg1, ..., X)
5936 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5937 sequence that had been detected and replaced by the pattern-stmt
5938 (STMT_INFO).
5939
5940 This function also handles reduction of condition expressions, for example:
5941 for (int i = 0; i < N; i++)
5942 if (a[i] < value)
5943 last = a[i];
5944 This is handled by vectorizing the loop and creating an additional vector
5945 containing the loop indexes for which "a[i] < value" was true. In the
5946 function epilogue this is reduced to a single max value and then used to
5947 index into the vector of results.
5948
5949 In some cases of reduction patterns, the type of the reduction variable X is
5950 different than the type of the other arguments of STMT_INFO.
5951 In such cases, the vectype that is used when transforming STMT_INFO into
5952 a vector stmt is different than the vectype that is used to determine the
5953 vectorization factor, because it consists of a different number of elements
5954 than the actual number of elements that are being operated upon in parallel.
5955
5956 For example, consider an accumulation of shorts into an int accumulator.
5957 On some targets it's possible to vectorize this pattern operating on 8
5958 shorts at a time (hence, the vectype for purposes of determining the
5959 vectorization factor should be V8HI); on the other hand, the vectype that
5960 is used to create the vector form is actually V4SI (the type of the result).
5961
5962 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5963 indicates what is the actual level of parallelism (V8HI in the example), so
5964 that the right vectorization factor would be derived. This vectype
5965 corresponds to the type of arguments to the reduction stmt, and should *NOT*
5966 be used to create the vectorized stmt. The right vectype for the vectorized
5967 stmt is obtained from the type of the result X:
5968 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
5969
5970 This means that, contrary to "regular" reductions (or "regular" stmts in
5971 general), the following equation:
5972 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
5973 does *NOT* necessarily hold for reduction patterns. */
5974
5975 bool
5976 vectorizable_reduction (loop_vec_info loop_vinfo,
5977 stmt_vec_info stmt_info, slp_tree slp_node,
5978 slp_instance slp_node_instance,
5979 stmt_vector_for_cost *cost_vec)
5980 {
5981 tree scalar_dest;
5982 tree vectype_in = NULL_TREE;
5983 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5984 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
5985 stmt_vec_info cond_stmt_vinfo = NULL;
5986 tree scalar_type;
5987 int i;
5988 int ncopies;
5989 bool single_defuse_cycle = false;
5990 bool nested_cycle = false;
5991 bool double_reduc = false;
5992 int vec_num;
5993 tree tem;
5994 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5995 tree cond_reduc_val = NULL_TREE;
5996
5997 /* Make sure it was already recognized as a reduction computation. */
5998 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
5999 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
6000 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6001 return false;
6002
6003 /* The stmt we store reduction analysis meta on. */
6004 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6005 reduc_info->is_reduc_info = true;
6006
6007 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
6008 {
6009 if (is_a <gphi *> (stmt_info->stmt))
6010 /* Analysis for double-reduction is done on the outer
6011 loop PHI; nested cycles have no further restrictions. */
6012 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
6013 else
6014 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6015 return true;
6016 }
6017
6018 stmt_vec_info orig_stmt_of_analysis = stmt_info;
6019 stmt_vec_info phi_info = stmt_info;
6020 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
6021 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6022 {
6023 if (!is_a <gphi *> (stmt_info->stmt))
6024 {
6025 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6026 return true;
6027 }
6028 if (slp_node)
6029 {
6030 slp_node_instance->reduc_phis = slp_node;
6031 /* ??? We're leaving slp_node to point to the PHIs; we only
6032 need it to get at the number of vector stmts, which wasn't
6033 yet initialized for the instance root. */
6034 }
6035 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6036 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
6037 else /* STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def */
6038 {
6039 use_operand_p use_p;
6040 gimple *use_stmt;
6041 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6042 &use_p, &use_stmt);
6043 gcc_assert (res);
6044 phi_info = loop_vinfo->lookup_stmt (use_stmt);
6045 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6046 }
6047 }
6048
6049 /* PHIs should not participate in patterns. */
6050 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6051 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6052
6053 /* Verify that following REDUC_IDX from the latch def leads us back to
6054 the PHI, and compute the reduction chain length. */
6055 tree reduc_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6056 loop_latch_edge (loop));
6057 unsigned reduc_chain_length = 0;
6058 bool only_slp_reduc_chain = true;
6059 stmt_info = NULL;
6060 while (reduc_def != PHI_RESULT (reduc_def_phi))
6061 {
6062 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6063 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6064 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6065 {
6066 if (dump_enabled_p ())
6067 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6068 "reduction chain broken by patterns.\n");
6069 return false;
6070 }
6071 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6072 only_slp_reduc_chain = false;
6073 /* ??? For epilogue generation live members of the chain need
6074 to point back to the PHI via their original stmt for
6075 info_for_reduction to work. */
6076 if (STMT_VINFO_LIVE_P (vdef))
6077 STMT_VINFO_REDUC_DEF (def) = phi_info;
6078 gassign *assign = dyn_cast <gassign *> (vdef->stmt);
6079 if (!assign)
6080 {
6081 if (dump_enabled_p ())
6082 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6083 "reduction chain includes calls.\n");
6084 return false;
6085 }
6086 if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
6087 {
6088 if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (assign)),
6089 TREE_TYPE (gimple_assign_rhs1 (assign))))
6090 {
6091 if (dump_enabled_p ())
6092 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6093 "conversion in the reduction chain.\n");
6094 return false;
6095 }
6096 }
6097 else if (!stmt_info)
6098 /* First non-conversion stmt. */
6099 stmt_info = vdef;
6100 reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
6101 reduc_chain_length++;
6102 }
6103 /* PHIs should not participate in patterns. */
6104 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6105
6106 if (nested_in_vect_loop_p (loop, stmt_info))
6107 {
6108 loop = loop->inner;
6109 nested_cycle = true;
6110 }
6111
6112 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6113 element. */
6114 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6115 {
6116 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6117 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6118 }
6119 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6120 gcc_assert (slp_node
6121 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6122
6123 /* 1. Is vectorizable reduction? */
6124 /* Not supportable if the reduction variable is used in the loop, unless
6125 it's a reduction chain. */
6126 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6127 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6128 return false;
6129
6130 /* Reductions that are not used even in an enclosing outer-loop
6131 are expected to be "live" (used out of the loop). */
6132 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6133 && !STMT_VINFO_LIVE_P (stmt_info))
6134 return false;
6135
6136 /* 2. Has this been recognized as a reduction pattern?
6137
6138 Check if STMT represents a pattern that has been recognized
6139 in earlier analysis stages. For stmts that represent a pattern,
6140 the STMT_VINFO_RELATED_STMT field records the last stmt in
6141 the original sequence that constitutes the pattern. */
6142
6143 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6144 if (orig_stmt_info)
6145 {
6146 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6147 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6148 }
6149
6150 /* 3. Check the operands of the operation. The first operands are defined
6151 inside the loop body. The last operand is the reduction variable,
6152 which is defined by the loop-header-phi. */
6153
6154 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6155 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6156 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6157 enum tree_code code = gimple_assign_rhs_code (stmt);
6158 bool lane_reduc_code_p
6159 = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR);
6160 int op_type = TREE_CODE_LENGTH (code);
6161
6162 scalar_dest = gimple_assign_lhs (stmt);
6163 scalar_type = TREE_TYPE (scalar_dest);
6164 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6165 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6166 return false;
6167
6168 /* Do not try to vectorize bit-precision reductions. */
6169 if (!type_has_mode_precision_p (scalar_type))
6170 return false;
6171
6172 /* For lane-reducing ops we're reducing the number of reduction PHIs, which
6173 means the only use of such a PHI may be in the lane-reducing operation. */
6174 if (lane_reduc_code_p
6175 && reduc_chain_length != 1
6176 && !only_slp_reduc_chain)
6177 {
6178 if (dump_enabled_p ())
6179 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6180 "lane-reducing reduction with extra stmts.\n");
6181 return false;
6182 }
6183
6184 /* All uses but the last are expected to be defined in the loop.
6185 The last use is the reduction variable. In case of nested cycle this
6186 assumption is not true: we use reduc_index to record the index of the
6187 reduction variable. */
6188 /* ??? To get at invariant/constant uses on the SLP node we have to
6189 get to it here, slp_node is still the reduction PHI. */
6190 slp_tree slp_for_stmt_info = NULL;
6191 if (slp_node)
6192 {
6193 slp_for_stmt_info = slp_node_instance->root;
6194 /* And then there's reduction chain with a conversion ... */
6195 if (SLP_TREE_REPRESENTATIVE (slp_for_stmt_info) != stmt_info)
6196 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6197 gcc_assert (SLP_TREE_REPRESENTATIVE (slp_for_stmt_info) == stmt_info);
6198 }
6199 slp_tree *slp_op = XALLOCAVEC (slp_tree, op_type);
6200 for (i = 0; i < op_type; i++)
6201 {
6202 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6203 if (i == 0 && code == COND_EXPR)
6204 continue;
6205
6206 stmt_vec_info def_stmt_info;
6207 enum vect_def_type dt;
6208 tree op;
6209 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
6210 i, &op, &slp_op[i], &dt, &tem,
6211 &def_stmt_info))
6212 {
6213 if (dump_enabled_p ())
6214 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6215 "use not simple.\n");
6216 return false;
6217 }
6218 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6219 continue;
6220
6221 /* There should be only one cycle def in the stmt, the one
6222 leading to reduc_def. */
6223 if (VECTORIZABLE_CYCLE_DEF (dt))
6224 return false;
6225
6226 /* To properly compute ncopies we are interested in the widest
6227 non-reduction input type in case we're looking at a widening
6228 accumulation that we later handle in vect_transform_reduction. */
6229 if (lane_reduc_code_p
6230 && tem
6231 && (!vectype_in
6232 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6233 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6234 vectype_in = tem;
6235
6236 if (code == COND_EXPR)
6237 {
6238 /* Record how the non-reduction-def value of COND_EXPR is defined. */
6239 if (dt == vect_constant_def)
6240 {
6241 cond_reduc_dt = dt;
6242 cond_reduc_val = op;
6243 }
6244 if (dt == vect_induction_def
6245 && def_stmt_info
6246 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6247 {
6248 cond_reduc_dt = dt;
6249 cond_stmt_vinfo = def_stmt_info;
6250 }
6251 }
6252 }
6253 if (!vectype_in)
6254 vectype_in = STMT_VINFO_VECTYPE (phi_info);
6255 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6256
6257 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6258 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
6259 /* If we have a condition reduction, see if we can simplify it further. */
6260 if (v_reduc_type == COND_REDUCTION)
6261 {
6262 if (slp_node)
6263 return false;
6264
6265 /* If the condition itself uses the reduction value, fail. */
6266 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6267 {
6268 if (dump_enabled_p ())
6269 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6270 "condition depends on previous iteration\n");
6271 return false;
6272 }
6273
6274 if (reduc_chain_length == 1
6275 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6276 vectype_in, OPTIMIZE_FOR_SPEED))
6277 {
6278 if (dump_enabled_p ())
6279 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6280 "optimizing condition reduction with"
6281 " FOLD_EXTRACT_LAST.\n");
6282 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6283 }
6284 else if (cond_reduc_dt == vect_induction_def)
6285 {
6286 tree base
6287 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6288 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6289
6290 gcc_assert (TREE_CODE (base) == INTEGER_CST
6291 && TREE_CODE (step) == INTEGER_CST);
6292 cond_reduc_val = NULL_TREE;
6293 enum tree_code cond_reduc_op_code = ERROR_MARK;
6294 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6295 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6296 ;
6297 /* Find a suitable value: below base for MAX_EXPR, above base for
6298 MIN_EXPR; for now punt if base is the minimum value of the type
6299 for MAX_EXPR or the maximum value of the type for MIN_EXPR. */
6300 else if (tree_int_cst_sgn (step) == -1)
6301 {
6302 cond_reduc_op_code = MIN_EXPR;
6303 if (tree_int_cst_sgn (base) == -1)
6304 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6305 else if (tree_int_cst_lt (base,
6306 TYPE_MAX_VALUE (TREE_TYPE (base))))
6307 cond_reduc_val
6308 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6309 }
6310 else
6311 {
6312 cond_reduc_op_code = MAX_EXPR;
6313 if (tree_int_cst_sgn (base) == 1)
6314 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6315 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6316 base))
6317 cond_reduc_val
6318 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6319 }
6320 if (cond_reduc_val)
6321 {
6322 if (dump_enabled_p ())
6323 dump_printf_loc (MSG_NOTE, vect_location,
6324 "condition expression based on "
6325 "integer induction.\n");
6326 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6327 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6328 = cond_reduc_val;
6329 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6330 }
6331 }
6332 else if (cond_reduc_dt == vect_constant_def)
6333 {
6334 enum vect_def_type cond_initial_dt;
6335 tree cond_initial_val
6336 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, loop_preheader_edge (loop));
6337
6338 gcc_assert (cond_reduc_val != NULL_TREE);
6339 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6340 if (cond_initial_dt == vect_constant_def
6341 && types_compatible_p (TREE_TYPE (cond_initial_val),
6342 TREE_TYPE (cond_reduc_val)))
6343 {
6344 tree e = fold_binary (LE_EXPR, boolean_type_node,
6345 cond_initial_val, cond_reduc_val);
6346 if (e && (integer_onep (e) || integer_zerop (e)))
6347 {
6348 if (dump_enabled_p ())
6349 dump_printf_loc (MSG_NOTE, vect_location,
6350 "condition expression based on "
6351 "compile time constant.\n");
6352 /* Record reduction code at analysis stage. */
6353 STMT_VINFO_REDUC_CODE (reduc_info)
6354 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6355 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6356 }
6357 }
6358 }
6359 }
6360
6361 if (STMT_VINFO_LIVE_P (phi_info))
6362 return false;
6363
6364 if (slp_node)
6365 ncopies = 1;
6366 else
6367 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6368
6369 gcc_assert (ncopies >= 1);
6370
6371 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6372
6373 if (nested_cycle)
6374 {
6375 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6376 == vect_double_reduction_def);
6377 double_reduc = true;
6378 }
6379
6380 /* 4.2. Check support for the epilog operation.
6381
6382 If STMT represents a reduction pattern, then the type of the
6383 reduction variable may be different than the type of the rest
6384 of the arguments. For example, consider the case of accumulation
6385 of shorts into an int accumulator; The original code:
6386 S1: int_a = (int) short_a;
6387 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6388
6389 was replaced with:
6390 STMT: int_acc = widen_sum <short_a, int_acc>
6391
6392 This means that:
6393 1. The tree-code that is used to create the vector operation in the
6394 epilog code (that reduces the partial results) is not the
6395 tree-code of STMT, but is rather the tree-code of the original
6396 stmt from the pattern that STMT is replacing. I.e, in the example
6397 above we want to use 'widen_sum' in the loop, but 'plus' in the
6398 epilog.
6399 2. The type (mode) we use to check available target support
6400 for the vector operation to be created in the *epilog*, is
6401 determined by the type of the reduction variable (in the example
6402 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6403 However the type (mode) we use to check available target support
6404 for the vector operation to be created *inside the loop*, is
6405 determined by the type of the other arguments to STMT (in the
6406 example we'd check this: optab_handler (widen_sum_optab,
6407 vect_short_mode)).
6408
6409 This is contrary to "regular" reductions, in which the types of all
6410 the arguments are the same as the type of the reduction variable.
6411 For "regular" reductions we can therefore use the same vector type
6412 (and also the same tree-code) when generating the epilog code and
6413 when generating the code inside the loop. */
6414
6415 enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info);
6416 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6417
6418 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6419 if (reduction_type == TREE_CODE_REDUCTION)
6420 {
6421 /* Check whether it's ok to change the order of the computation.
6422 Generally, when vectorizing a reduction we change the order of the
6423 computation. This may change the behavior of the program in some
6424 cases, so we need to check that this is ok. One exception is when
6425 vectorizing an outer-loop: the inner-loop is executed sequentially,
6426 and therefore vectorizing reductions in the inner-loop during
6427 outer-loop vectorization is safe. */
6428 if (needs_fold_left_reduction_p (scalar_type, orig_code))
6429 {
6430 /* When vectorizing a reduction chain w/o SLP the reduction PHI
6431 is not directly used in stmt. */
6432 if (!only_slp_reduc_chain
6433 && reduc_chain_length != 1)
6434 {
6435 if (dump_enabled_p ())
6436 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6437 "in-order reduction chain without SLP.\n");
6438 return false;
6439 }
6440 STMT_VINFO_REDUC_TYPE (reduc_info)
6441 = reduction_type = FOLD_LEFT_REDUCTION;
6442 }
6443 else if (!commutative_tree_code (orig_code)
6444 || !associative_tree_code (orig_code))
6445 {
6446 if (dump_enabled_p ())
6447 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6448 "reduction: not commutative/associative");
6449 return false;
6450 }
6451 }
6452
6453 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6454 && ncopies > 1)
6455 {
6456 if (dump_enabled_p ())
6457 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6458 "multiple types in double reduction or condition "
6459 "reduction or fold-left reduction.\n");
6460 return false;
6461 }
6462
6463 internal_fn reduc_fn = IFN_LAST;
6464 if (reduction_type == TREE_CODE_REDUCTION
6465 || reduction_type == FOLD_LEFT_REDUCTION
6466 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6467 || reduction_type == CONST_COND_REDUCTION)
6468 {
6469 if (reduction_type == FOLD_LEFT_REDUCTION
6470 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6471 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6472 {
6473 if (reduc_fn != IFN_LAST
6474 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6475 OPTIMIZE_FOR_SPEED))
6476 {
6477 if (dump_enabled_p ())
6478 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6479 "reduc op not supported by target.\n");
6480
6481 reduc_fn = IFN_LAST;
6482 }
6483 }
6484 else
6485 {
6486 if (!nested_cycle || double_reduc)
6487 {
6488 if (dump_enabled_p ())
6489 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6490 "no reduc code for scalar code.\n");
6491
6492 return false;
6493 }
6494 }
6495 }
6496 else if (reduction_type == COND_REDUCTION)
6497 {
6498 int scalar_precision
6499 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6500 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6501 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6502 nunits_out);
6503
6504 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6505 OPTIMIZE_FOR_SPEED))
6506 reduc_fn = IFN_REDUC_MAX;
6507 }
6508 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
6509
6510 if (reduction_type != EXTRACT_LAST_REDUCTION
6511 && (!nested_cycle || double_reduc)
6512 && reduc_fn == IFN_LAST
6513 && !nunits_out.is_constant ())
6514 {
6515 if (dump_enabled_p ())
6516 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6517 "missing target support for reduction on"
6518 " variable-length vectors.\n");
6519 return false;
6520 }
6521
6522 /* For SLP reductions, see if there is a neutral value we can use. */
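/* Illustratively (the exact choice is made by neutral_op_for_slp_reduction):
   the neutral value is 0 for PLUS_EXPR, 1 for MULT_EXPR and an all-ones
   constant for BIT_AND_EXPR; MIN_EXPR/MAX_EXPR only have one for
   reduction chains, where the single initial value is itself neutral.  */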
6523 tree neutral_op = NULL_TREE;
6524 if (slp_node)
6525 neutral_op = neutral_op_for_slp_reduction
6526 (slp_node_instance->reduc_phis, vectype_out, orig_code,
6527 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6528
6529 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6530 {
6531 /* We can't support in-order reductions of code such as this:
6532
6533 for (int i = 0; i < n1; ++i)
6534 for (int j = 0; j < n2; ++j)
6535 l += a[j];
6536
6537 since GCC effectively transforms the loop when vectorizing:
6538
6539 for (int i = 0; i < n1 / VF; ++i)
6540 for (int j = 0; j < n2; ++j)
6541 for (int k = 0; k < VF; ++k)
6542 l += a[j];
6543
6544 which is a reassociation of the original operation. */
6545 if (dump_enabled_p ())
6546 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6547 "in-order double reduction not supported.\n");
6548
6549 return false;
6550 }
6551
6552 if (reduction_type == FOLD_LEFT_REDUCTION
6553 && slp_node
6554 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6555 {
6556 /* We cannot use in-order reductions in this case because there is
6557 an implicit reassociation of the operations involved. */
6558 if (dump_enabled_p ())
6559 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6560 "in-order unchained SLP reductions not supported.\n");
6561 return false;
6562 }
6563
6564 /* For double reductions, and for SLP reductions with a neutral value,
6565 we construct a variable-length initial vector by loading a vector
6566 full of the neutral value and then shift-and-inserting the start
6567 values into the low-numbered elements. */
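/* A minimal sketch, assuming a sum reduction with start value s and a
   variable number of lanes:

       init = { 0, 0, ..., 0 }            splat of the neutral value
       init = VEC_SHL_INSERT (init, s)    -> { s, 0, ..., 0 }

   so lane 0 carries the start value and all other lanes start from the
   neutral element.  */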
6568 if ((double_reduc || neutral_op)
6569 && !nunits_out.is_constant ()
6570 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6571 vectype_out, OPTIMIZE_FOR_SPEED))
6572 {
6573 if (dump_enabled_p ())
6574 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6575 "reduction on variable-length vectors requires"
6576 " target support for a vector-shift-and-insert"
6577 " operation.\n");
6578 return false;
6579 }
6580
6581 /* Check extra constraints for variable-length unchained SLP reductions. */
6582 if (STMT_SLP_TYPE (stmt_info)
6583 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6584 && !nunits_out.is_constant ())
6585 {
6586 /* We checked above that we could build the initial vector when
6587 there's a neutral element value. Check here for the case in
6588 which each SLP statement has its own initial value and in which
6589 that value needs to be repeated for every instance of the
6590 statement within the initial vector. */
6591 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6592 if (!neutral_op
6593 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
6594 TREE_TYPE (vectype_out)))
6595 {
6596 if (dump_enabled_p ())
6597 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6598 "unsupported form of SLP reduction for"
6599 " variable-length vectors: cannot build"
6600 " initial vector.\n");
6601 return false;
6602 }
6603 /* The epilogue code relies on the number of elements being a multiple
6604 of the group size. The duplicate-and-interleave approach to setting
6605 up the initial vector does too. */
6606 if (!multiple_p (nunits_out, group_size))
6607 {
6608 if (dump_enabled_p ())
6609 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6610 "unsupported form of SLP reduction for"
6611 " variable-length vectors: the vector size"
6612 " is not a multiple of the number of results.\n");
6613 return false;
6614 }
6615 }
6616
6617 if (reduction_type == COND_REDUCTION)
6618 {
6619 widest_int ni;
6620
6621 if (! max_loop_iterations (loop, &ni))
6622 {
6623 if (dump_enabled_p ())
6624 dump_printf_loc (MSG_NOTE, vect_location,
6625 "loop count not known, cannot create cond "
6626 "reduction.\n");
6627 return false;
6628 }
6629 /* Convert backedges to iterations. */
6630 ni += 1;
6631
6632 /* The additional index will be the same type as the condition. Check
6633 that the loop can fit into this less one (because we'll use up the
6634 zero slot for when there are no matches). */
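/* Worked example (illustrative): for a 1-byte SCALAR_TYPE the index type
   is an unsigned 8-bit type, so max_index is 255; index 0 is reserved for
   "no match", leaving 1..254 for iterations, and a loop with 255 or more
   iterations is rejected below.  */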
6635 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6636 if (wi::geu_p (ni, wi::to_widest (max_index)))
6637 {
6638 if (dump_enabled_p ())
6639 dump_printf_loc (MSG_NOTE, vect_location,
6640 "loop size is greater than data size.\n");
6641 return false;
6642 }
6643 }
6644
6645 /* In case the vectorization factor (VF) is bigger than the number
6646 of elements that we can fit in a vectype (nunits), we have to generate
6647 more than one vector stmt, i.e. we need to "unroll" the
6648 vector stmt by a factor of VF/nunits. For more details see documentation
6649 in vectorizable_operation. */
6650
6651 /* If the reduction is used in an outer loop we need to generate
6652 VF intermediate results, like so (e.g. for ncopies=2):
6653 r0 = phi (init, r0)
6654 r1 = phi (init, r1)
6655 r0 = x0 + r0;
6656 r1 = x1 + r1;
6657 (i.e. we generate VF results in 2 registers).
6658 In this case we have a separate def-use cycle for each copy, and therefore
6659 for each copy we get the vector def for the reduction variable from the
6660 respective phi node created for this copy.
6661
6662 Otherwise (the reduction is unused in the loop nest), we can combine
6663 together intermediate results, like so (e.g. for ncopies=2):
6664 r = phi (init, r)
6665 r = x0 + r;
6666 r = x1 + r;
6667 (i.e. we generate VF/2 results in a single register).
6668 In this case for each copy we get the vector def for the reduction variable
6669 from the vectorized reduction operation generated in the previous iteration.
6670
6671 This only works when we see both the reduction PHI and its only consumer
6672 in vectorizable_reduction and there are no intermediate stmts
6673 participating. */
6674 if (ncopies > 1
6675 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6676 && reduc_chain_length == 1)
6677 single_defuse_cycle = true;
6678
6679 if (single_defuse_cycle || lane_reduc_code_p)
6680 {
6681 gcc_assert (code != COND_EXPR);
6682
6683 /* 4. Supportable by target? */
6684 bool ok = true;
6685
6686 /* 4.1. check support for the operation in the loop */
6687 optab optab = optab_for_tree_code (code, vectype_in, optab_vector);
6688 if (!optab)
6689 {
6690 if (dump_enabled_p ())
6691 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6692 "no optab.\n");
6693 ok = false;
6694 }
6695
6696 machine_mode vec_mode = TYPE_MODE (vectype_in);
6697 if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6698 {
6699 if (dump_enabled_p ())
6700 dump_printf (MSG_NOTE, "op not supported by target.\n");
6701 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6702 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6703 ok = false;
6704 else
6705 if (dump_enabled_p ())
6706 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6707 }
6708
6709 /* Worthwhile without SIMD support? */
6710 if (ok
6711 && !VECTOR_MODE_P (TYPE_MODE (vectype_in))
6712 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6713 {
6714 if (dump_enabled_p ())
6715 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6716 "not worthwhile without SIMD support.\n");
6717 ok = false;
6718 }
6719
6720 /* Lane-reducing operations have to go through vect_transform_reduction.
6721 For the other cases try without the single cycle optimization. */
6722 if (!ok)
6723 {
6724 if (lane_reduc_code_p)
6725 return false;
6726 else
6727 single_defuse_cycle = false;
6728 }
6729 }
6730 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
6731
6732 /* If the reduction stmt is one of the patterns that have lane
6733 reduction embedded, we cannot handle the case of ! single_defuse_cycle. */
6734 if ((ncopies > 1 && ! single_defuse_cycle)
6735 && lane_reduc_code_p)
6736 {
6737 if (dump_enabled_p ())
6738 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6739 "multi def-use cycle not possible for lane-reducing "
6740 "reduction operation\n");
6741 return false;
6742 }
6743
6744 if (slp_node
6745 && !(!single_defuse_cycle
6746 && code != DOT_PROD_EXPR
6747 && code != WIDEN_SUM_EXPR
6748 && code != SAD_EXPR
6749 && reduction_type != FOLD_LEFT_REDUCTION))
6750 for (i = 0; i < op_type; i++)
6751 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_in))
6752 {
6753 if (dump_enabled_p ())
6754 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6755 "incompatible vector types for invariants\n");
6756 return false;
6757 }
6758
6759 if (slp_node)
6760 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6761 else
6762 vec_num = 1;
6763
6764 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
6765 reduction_type, ncopies, cost_vec);
6766 if (dump_enabled_p ()
6767 && reduction_type == FOLD_LEFT_REDUCTION)
6768 dump_printf_loc (MSG_NOTE, vect_location,
6769 "using an in-order (fold-left) reduction.\n");
6770 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
6771 /* All reductions except single-defuse-cycle optimized, lane-reducing and
6772 fold-left ones go through their own vectorizable_* routines. */
6773 if (!single_defuse_cycle
6774 && code != DOT_PROD_EXPR
6775 && code != WIDEN_SUM_EXPR
6776 && code != SAD_EXPR
6777 && reduction_type != FOLD_LEFT_REDUCTION)
6778 {
6779 stmt_vec_info tem
6780 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6781 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
6782 {
6783 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
6784 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
6785 }
6786 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
6787 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
6788 }
6789 else if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
6790 {
6791 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6792 internal_fn cond_fn = get_conditional_internal_fn (code);
6793
6794 if (reduction_type != FOLD_LEFT_REDUCTION
6795 && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
6796 && (cond_fn == IFN_LAST
6797 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6798 OPTIMIZE_FOR_SPEED)))
6799 {
6800 if (dump_enabled_p ())
6801 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6802 "can't use a fully-masked loop because no"
6803 " conditional operation is available.\n");
6804 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6805 }
6806 else if (reduction_type == FOLD_LEFT_REDUCTION
6807 && reduc_fn == IFN_LAST
6808 && !expand_vec_cond_expr_p (vectype_in,
6809 truth_type_for (vectype_in),
6810 SSA_NAME))
6811 {
6812 if (dump_enabled_p ())
6813 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6814 "can't use a fully-masked loop because no"
6815 " conditional operation is available.\n");
6816 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6817 }
6818 else
6819 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
6820 vectype_in, NULL);
6821 }
6822 return true;
6823 }
6824
6825 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
6826 value. */
6827
6828 bool
6829 vect_transform_reduction (loop_vec_info loop_vinfo,
6830 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
6831 stmt_vec_info *vec_stmt, slp_tree slp_node)
6832 {
6833 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6834 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6835 int i;
6836 int ncopies;
6837 int j;
6838 int vec_num;
6839
6840 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6841 gcc_assert (reduc_info->is_reduc_info);
6842
6843 if (nested_in_vect_loop_p (loop, stmt_info))
6844 {
6845 loop = loop->inner;
6846 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
6847 }
6848
6849 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6850 enum tree_code code = gimple_assign_rhs_code (stmt);
6851 int op_type = TREE_CODE_LENGTH (code);
6852
6853 /* Flatten RHS. */
6854 tree ops[3];
6855 switch (get_gimple_rhs_class (code))
6856 {
6857 case GIMPLE_TERNARY_RHS:
6858 ops[2] = gimple_assign_rhs3 (stmt);
6859 /* Fall thru. */
6860 case GIMPLE_BINARY_RHS:
6861 ops[0] = gimple_assign_rhs1 (stmt);
6862 ops[1] = gimple_assign_rhs2 (stmt);
6863 break;
6864 default:
6865 gcc_unreachable ();
6866 }
6867
6868 /* All uses but the last are expected to be defined in the loop.
6869 The last use is the reduction variable. In case of a nested cycle this
6870 assumption is not true: we use reduc_index to record the index of the
6871 reduction variable. */
6872 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
6873 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6874 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
6875 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
6876
6877 if (slp_node)
6878 {
6879 ncopies = 1;
6880 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6881 }
6882 else
6883 {
6884 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6885 vec_num = 1;
6886 }
6887
6888 internal_fn cond_fn = get_conditional_internal_fn (code);
6889 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6890 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
6891
6892 /* Transform. */
6893 stmt_vec_info new_stmt_info = NULL;
6894 stmt_vec_info prev_stmt_info;
6895 tree new_temp = NULL_TREE;
6896 auto_vec<tree> vec_oprnds0;
6897 auto_vec<tree> vec_oprnds1;
6898 auto_vec<tree> vec_oprnds2;
6899 tree def0;
6900
6901 if (dump_enabled_p ())
6902 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6903
6904 /* FORNOW: Multiple types are not supported for condition. */
6905 if (code == COND_EXPR)
6906 gcc_assert (ncopies == 1);
6907
6908 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
6909
6910 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6911 if (reduction_type == FOLD_LEFT_REDUCTION)
6912 {
6913 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
6914 return vectorize_fold_left_reduction
6915 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
6916 reduc_fn, ops, vectype_in, reduc_index, masks);
6917 }
6918
6919 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
6920 gcc_assert (single_defuse_cycle
6921 || code == DOT_PROD_EXPR
6922 || code == WIDEN_SUM_EXPR
6923 || code == SAD_EXPR);
6924
6925 /* Create the destination vector */
6926 tree scalar_dest = gimple_assign_lhs (stmt);
6927 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6928
6929 prev_stmt_info = NULL;
6930 if (!slp_node)
6931 {
6932 vec_oprnds0.create (1);
6933 vec_oprnds1.create (1);
6934 if (op_type == ternary_op)
6935 vec_oprnds2.create (1);
6936 }
6937
6938 for (j = 0; j < ncopies; j++)
6939 {
6940 /* Handle uses. */
6941 if (j == 0)
6942 {
6943 if (slp_node)
6944 {
6945 /* Get vec defs for all the operands except the reduction index,
6946 ensuring the ordering of the ops in the vector is kept. */
6947 auto_vec<vec<tree>, 3> vec_defs;
6948 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
6949 vec_oprnds0.safe_splice (vec_defs[0]);
6950 vec_defs[0].release ();
6951 vec_oprnds1.safe_splice (vec_defs[1]);
6952 vec_defs[1].release ();
6953 if (op_type == ternary_op)
6954 {
6955 vec_oprnds2.safe_splice (vec_defs[2]);
6956 vec_defs[2].release ();
6957 }
6958 }
6959 else
6960 {
6961 vec_oprnds0.quick_push
6962 (vect_get_vec_def_for_operand (loop_vinfo, ops[0], stmt_info));
6963 vec_oprnds1.quick_push
6964 (vect_get_vec_def_for_operand (loop_vinfo, ops[1], stmt_info));
6965 if (op_type == ternary_op)
6966 vec_oprnds2.quick_push
6967 (vect_get_vec_def_for_operand (loop_vinfo, ops[2], stmt_info));
6968 }
6969 }
6970 else
6971 {
6972 if (!slp_node)
6973 {
6974 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
6975
6976 if (single_defuse_cycle && reduc_index == 0)
6977 vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
6978 else
6979 vec_oprnds0[0]
6980 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
6981 vec_oprnds0[0]);
6982 if (single_defuse_cycle && reduc_index == 1)
6983 vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
6984 else
6985 vec_oprnds1[0]
6986 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
6987 vec_oprnds1[0]);
6988 if (op_type == ternary_op)
6989 {
6990 if (single_defuse_cycle && reduc_index == 2)
6991 vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
6992 else
6993 vec_oprnds2[0]
6994 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
6995 vec_oprnds2[0]);
6996 }
6997 }
6998 }
6999
7000 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7001 {
7002 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7003 if (masked_loop_p && !mask_by_cond_expr)
7004 {
7005 /* Make sure that the reduction accumulator is vop[0]. */
7006 if (reduc_index == 1)
7007 {
7008 gcc_assert (commutative_tree_code (code));
7009 std::swap (vop[0], vop[1]);
7010 }
7011 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7012 vectype_in, i * ncopies + j);
7013 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7014 vop[0], vop[1],
7015 vop[0]);
7016 new_temp = make_ssa_name (vec_dest, call);
7017 gimple_call_set_lhs (call, new_temp);
7018 gimple_call_set_nothrow (call, true);
7019 new_stmt_info
7020 = vect_finish_stmt_generation (loop_vinfo,
7021 stmt_info, call, gsi);
7022 }
7023 else
7024 {
7025 if (op_type == ternary_op)
7026 vop[2] = vec_oprnds2[i];
7027
7028 if (masked_loop_p && mask_by_cond_expr)
7029 {
7030 tree mask = vect_get_loop_mask (gsi, masks,
7031 vec_num * ncopies,
7032 vectype_in, i * ncopies + j);
7033 build_vect_cond_expr (code, vop, mask, gsi);
7034 }
7035
7036 gassign *new_stmt = gimple_build_assign (vec_dest, code,
7037 vop[0], vop[1], vop[2]);
7038 new_temp = make_ssa_name (vec_dest, new_stmt);
7039 gimple_assign_set_lhs (new_stmt, new_temp);
7040 new_stmt_info
7041 = vect_finish_stmt_generation (loop_vinfo,
7042 stmt_info, new_stmt, gsi);
7043 }
7044
7045 if (slp_node)
7046 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
7047 }
7048
7049 if (slp_node || single_defuse_cycle)
7050 continue;
7051
7052 if (j == 0)
7053 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7054 else
7055 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
7056
7057 prev_stmt_info = new_stmt_info;
7058 }
7059
7060 if (single_defuse_cycle && !slp_node)
7061 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7062
7063 return true;
7064 }
7065
7066 /* Transform phase of a cycle PHI. */
7067
7068 bool
7069 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7070 stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
7071 slp_tree slp_node, slp_instance slp_node_instance)
7072 {
7073 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7074 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7075 int i;
7076 int ncopies;
7077 stmt_vec_info prev_phi_info;
7078 int j;
7079 bool nested_cycle = false;
7080 int vec_num;
7081
7082 if (nested_in_vect_loop_p (loop, stmt_info))
7083 {
7084 loop = loop->inner;
7085 nested_cycle = true;
7086 }
7087
7088 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7089 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7090 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7091 gcc_assert (reduc_info->is_reduc_info);
7092
7093 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7094 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7095 /* Leave the scalar phi in place. */
7096 return true;
7097
7098 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7099 /* For a nested cycle we do not fill the above. */
7100 if (!vectype_in)
7101 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7102 gcc_assert (vectype_in);
7103
7104 if (slp_node)
7105 {
7106 /* The size vect_schedule_slp_instance computes is off for us. */
7107 vec_num = vect_get_num_vectors
7108 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7109 * SLP_TREE_SCALAR_STMTS (slp_node).length (), vectype_in);
7110 ncopies = 1;
7111 }
7112 else
7113 {
7114 vec_num = 1;
7115 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7116 }
7117
7118 /* Check whether we should use a single PHI node and accumulate
7119 vectors to one before the backedge. */
7120 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7121 ncopies = 1;
7122
7123 /* Create the destination vector */
7124 gphi *phi = as_a <gphi *> (stmt_info->stmt);
7125 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7126 vectype_out);
7127
7128 /* Get the loop-entry arguments. */
7129 tree vec_initial_def;
7130 auto_vec<tree> vec_initial_defs;
7131 if (slp_node)
7132 {
7133 vec_initial_defs.reserve (vec_num);
7134 gcc_assert (slp_node == slp_node_instance->reduc_phis);
7135 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
7136 tree neutral_op
7137 = neutral_op_for_slp_reduction (slp_node, vectype_out,
7138 STMT_VINFO_REDUC_CODE (reduc_info),
7139 first != NULL);
7140 get_initial_defs_for_reduction (loop_vinfo, slp_node_instance->reduc_phis,
7141 &vec_initial_defs, vec_num,
7142 first != NULL, neutral_op);
7143 }
7144 else
7145 {
7146 /* Get the scalar def before the loop that defines the initial
7147 value of the reduction variable. */
7148 tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi,
7149 loop_preheader_edge (loop));
7150 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7151 and we can't use zero for induc_val, use initial_def. Similarly
7152 for REDUC_MIN and initial_def larger than the base. */
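/* Worked example (illustrative): with REDUC_CODE MAX_EXPR, induc_val 10
   and initial_def 3 (3 < 10), the initial vector is built from 3 instead
   of 10, and STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL is cleared so that
   epilogue generation knows initial_def was already taken into account.  */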
7153 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7154 {
7155 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7156 if (TREE_CODE (initial_def) == INTEGER_CST
7157 && !integer_zerop (induc_val)
7158 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7159 && tree_int_cst_lt (initial_def, induc_val))
7160 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7161 && tree_int_cst_lt (induc_val, initial_def))))
7162 {
7163 induc_val = initial_def;
7164 /* Communicate we used the initial_def to epilogue
7165 generation. */
7166 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7167 }
7168 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7169 }
7170 else if (nested_cycle)
7171 {
7172 /* Do not use an adjustment def as that case is not supported
7173 correctly if ncopies is not one. */
7174 vec_initial_def = vect_get_vec_def_for_operand (loop_vinfo,
7175 initial_def,
7176 reduc_stmt_info);
7177 }
7178 else
7179 {
7180 tree adjustment_def = NULL_TREE;
7181 tree *adjustment_defp = &adjustment_def;
7182 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7183 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7184 adjustment_defp = NULL;
7185 vec_initial_def
7186 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info, code,
7187 initial_def, adjustment_defp);
7188 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def;
7189 }
7190 vec_initial_defs.create (1);
7191 vec_initial_defs.quick_push (vec_initial_def);
7192 }
7193
7194 /* Generate the reduction PHIs upfront. */
7195 prev_phi_info = NULL;
7196 for (i = 0; i < vec_num; i++)
7197 {
7198 tree vec_init_def = vec_initial_defs[i];
7199 for (j = 0; j < ncopies; j++)
7200 {
7201 /* Create the reduction-phi that defines the reduction
7202 operand. */
7203 gphi *new_phi = create_phi_node (vec_dest, loop->header);
7204 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
7205
7206 /* Set the loop-entry arg of the reduction-phi. */
7207 if (j != 0 && nested_cycle)
7208 vec_init_def = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7209 vec_init_def);
7210 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7211 UNKNOWN_LOCATION);
7212
7213 /* The loop-latch arg is set in epilogue processing. */
7214
7215 if (slp_node)
7216 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
7217 else
7218 {
7219 if (j == 0)
7220 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi_info;
7221 else
7222 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
7223 prev_phi_info = new_phi_info;
7224 }
7225 }
7226 }
7227
7228 return true;
7229 }
7230
7231 /* Vectorizes LC PHIs. */
7232
7233 bool
7234 vectorizable_lc_phi (loop_vec_info loop_vinfo,
7235 stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
7236 slp_tree slp_node)
7237 {
7238 if (!loop_vinfo
7239 || !is_a <gphi *> (stmt_info->stmt)
7240 || gimple_phi_num_args (stmt_info->stmt) != 1)
7241 return false;
7242
7243 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7244 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7245 return false;
7246
7247 if (!vec_stmt) /* transformation not required. */
7248 {
7249 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7250 return true;
7251 }
7252
7253 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7254 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7255 basic_block bb = gimple_bb (stmt_info->stmt);
7256 edge e = single_pred_edge (bb);
7257 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7258 vec<tree> vec_oprnds = vNULL;
7259 vect_get_vec_defs (loop_vinfo,
7260 gimple_phi_arg_def (stmt_info->stmt, 0), NULL_TREE,
7261 stmt_info, &vec_oprnds, NULL, slp_node);
7262 if (slp_node)
7263 {
7264 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7265 gcc_assert (vec_oprnds.length () == vec_num);
7266 for (unsigned i = 0; i < vec_num; i++)
7267 {
7268 /* Create the vectorized LC PHI node. */
7269 gphi *new_phi = create_phi_node (vec_dest, bb);
7270 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7271 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
7272 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
7273 }
7274 }
7275 else
7276 {
7277 unsigned ncopies = vect_get_num_copies (loop_vinfo, vectype);
7278 stmt_vec_info prev_phi_info = NULL;
7279 for (unsigned i = 0; i < ncopies; i++)
7280 {
7281 if (i != 0)
7282 vect_get_vec_defs_for_stmt_copy (loop_vinfo, &vec_oprnds, NULL);
7283 /* Create the vectorized LC PHI node. */
7284 gphi *new_phi = create_phi_node (vec_dest, bb);
7285 add_phi_arg (new_phi, vec_oprnds[0], e, UNKNOWN_LOCATION);
7286 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
7287 if (i == 0)
7288 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi_info;
7289 else
7290 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
7291 prev_phi_info = new_phi_info;
7292 }
7293 }
7294 vec_oprnds.release ();
7295
7296 return true;
7297 }
7298
7299
7300 /* Function vect_min_worthwhile_factor.
7301
7302 For a loop where we could vectorize the operation indicated by CODE,
7303 return the minimum vectorization factor that makes it worthwhile
7304 to use generic vectors. */
7305 static unsigned int
7306 vect_min_worthwhile_factor (enum tree_code code)
7307 {
7308 switch (code)
7309 {
7310 case PLUS_EXPR:
7311 case MINUS_EXPR:
7312 case NEGATE_EXPR:
7313 return 4;
7314
7315 case BIT_AND_EXPR:
7316 case BIT_IOR_EXPR:
7317 case BIT_XOR_EXPR:
7318 case BIT_NOT_EXPR:
7319 return 2;
7320
7321 default:
7322 return INT_MAX;
7323 }
7324 }
7325
7326 /* Return true if VINFO indicates we are doing loop vectorization and if
7327 it is worth decomposing CODE operations into scalar operations for
7328 that loop's vectorization factor. */
7329
7330 bool
7331 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7332 {
7333 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7334 unsigned HOST_WIDE_INT value;
7335 return (loop_vinfo
7336 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7337 && value >= vect_min_worthwhile_factor (code));
7338 }
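/* Illustrative note: with a constant vectorization factor of 4, PLUS_EXPR
   qualifies above (4 >= 4), whereas at a factor of 2 only the bitwise
   codes (minimum factor 2) would.  */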
7339
7340 /* Function vectorizable_induction
7341
7342 Check if STMT_INFO performs an induction computation that can be vectorized.
7343 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7344 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7345 Return true if STMT_INFO is vectorizable in this way. */
7346
7347 bool
7348 vectorizable_induction (loop_vec_info loop_vinfo,
7349 stmt_vec_info stmt_info,
7350 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7351 stmt_vec_info *vec_stmt, slp_tree slp_node,
7352 stmt_vector_for_cost *cost_vec)
7353 {
7354 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7355 unsigned ncopies;
7356 bool nested_in_vect_loop = false;
7357 class loop *iv_loop;
7358 tree vec_def;
7359 edge pe = loop_preheader_edge (loop);
7360 basic_block new_bb;
7361 tree new_vec, vec_init, vec_step, t;
7362 tree new_name;
7363 gimple *new_stmt;
7364 gphi *induction_phi;
7365 tree induc_def, vec_dest;
7366 tree init_expr, step_expr;
7367 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7368 unsigned i;
7369 tree expr;
7370 gimple_seq stmts;
7371 imm_use_iterator imm_iter;
7372 use_operand_p use_p;
7373 gimple *exit_phi;
7374 edge latch_e;
7375 tree loop_arg;
7376 gimple_stmt_iterator si;
7377
7378 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7379 if (!phi)
7380 return false;
7381
7382 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7383 return false;
7384
7385 /* Make sure it was recognized as induction computation. */
7386 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7387 return false;
7388
7389 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7390 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7391
7392 if (slp_node)
7393 ncopies = 1;
7394 else
7395 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7396 gcc_assert (ncopies >= 1);
7397
7398 /* FORNOW. These restrictions should be relaxed. */
7399 if (nested_in_vect_loop_p (loop, stmt_info))
7400 {
7401 imm_use_iterator imm_iter;
7402 use_operand_p use_p;
7403 gimple *exit_phi;
7404 edge latch_e;
7405 tree loop_arg;
7406
7407 if (ncopies > 1)
7408 {
7409 if (dump_enabled_p ())
7410 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7411 "multiple types in nested loop.\n");
7412 return false;
7413 }
7414
7415 /* FORNOW: outer loop induction with SLP not supported. */
7416 if (STMT_SLP_TYPE (stmt_info))
7417 return false;
7418
7419 exit_phi = NULL;
7420 latch_e = loop_latch_edge (loop->inner);
7421 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7422 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7423 {
7424 gimple *use_stmt = USE_STMT (use_p);
7425 if (is_gimple_debug (use_stmt))
7426 continue;
7427
7428 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7429 {
7430 exit_phi = use_stmt;
7431 break;
7432 }
7433 }
7434 if (exit_phi)
7435 {
7436 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7437 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7438 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7439 {
7440 if (dump_enabled_p ())
7441 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7442 "inner-loop induction only used outside "
7443 "of the outer vectorized loop.\n");
7444 return false;
7445 }
7446 }
7447
7448 nested_in_vect_loop = true;
7449 iv_loop = loop->inner;
7450 }
7451 else
7452 iv_loop = loop;
7453 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7454
7455 if (slp_node && !nunits.is_constant ())
7456 {
7457 /* The current SLP code creates the initial value element-by-element. */
7458 if (dump_enabled_p ())
7459 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7460 "SLP induction not supported for variable-length"
7461 " vectors.\n");
7462 return false;
7463 }
7464
7465 if (!vec_stmt) /* transformation not required. */
7466 {
7467 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7468 DUMP_VECT_SCOPE ("vectorizable_induction");
7469 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7470 return true;
7471 }
7472
7473 /* Transform. */
7474
7475 /* Compute a vector variable, initialized with the first VF values of
7476 the induction variable. E.g., for an iv with IV_PHI='X' and
7477 evolution S, for a vector of 4 units, we want to compute:
7478 [X, X + S, X + 2*S, X + 3*S]. */
7479
7480 if (dump_enabled_p ())
7481 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7482
7483 latch_e = loop_latch_edge (iv_loop);
7484 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7485
7486 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7487 gcc_assert (step_expr != NULL_TREE);
7488 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
7489
7490 pe = loop_preheader_edge (iv_loop);
7491 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7492 loop_preheader_edge (iv_loop));
7493
7494 stmts = NULL;
7495 if (!nested_in_vect_loop)
7496 {
7497 /* Convert the initial value to the IV update type. */
7498 tree new_type = TREE_TYPE (step_expr);
7499 init_expr = gimple_convert (&stmts, new_type, init_expr);
7500
7501 /* If we are using the loop mask to "peel" for alignment then we need
7502 to adjust the start value here. */
7503 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7504 if (skip_niters != NULL_TREE)
7505 {
7506 if (FLOAT_TYPE_P (vectype))
7507 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7508 skip_niters);
7509 else
7510 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7511 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7512 skip_niters, step_expr);
7513 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7514 init_expr, skip_step);
7515 }
7516 }
7517
7518 if (stmts)
7519 {
7520 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7521 gcc_assert (!new_bb);
7522 }
7523
7524 /* Find the first insertion point in the BB. */
7525 basic_block bb = gimple_bb (phi);
7526 si = gsi_after_labels (bb);
7527
7528 /* For SLP induction we have to generate several IVs as for example
7529 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7530 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7531 [VF*S, VF*S, VF*S, VF*S] for all. */
7532 if (slp_node)
7533 {
7534 /* Enforced above. */
7535 unsigned int const_nunits = nunits.to_constant ();
7536
7537 /* Generate [VF*S, VF*S, ... ]. */
7538 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7539 {
7540 expr = build_int_cst (integer_type_node, vf);
7541 expr = fold_convert (TREE_TYPE (step_expr), expr);
7542 }
7543 else
7544 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7545 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7546 expr, step_expr);
7547 if (! CONSTANT_CLASS_P (new_name))
7548 new_name = vect_init_vector (loop_vinfo, stmt_info, new_name,
7549 TREE_TYPE (step_expr), NULL);
7550 new_vec = build_vector_from_val (step_vectype, new_name);
7551 vec_step = vect_init_vector (loop_vinfo, stmt_info,
7552 new_vec, step_vectype, NULL);
7553
7554 /* Now generate the IVs. */
7555 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7556 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7557 unsigned elts = const_nunits * nvects;
7558 /* Compute the number of distinct IVs we need. First reduce
7559 group_size if it is a multiple of const_nunits so we get
7560 one IV for a group_size of 4 but const_nunits 2. */
7561 unsigned group_sizep = group_size;
7562 if (group_sizep % const_nunits == 0)
7563 group_sizep = group_sizep / const_nunits;
7564 unsigned nivs = least_common_multiple (group_sizep,
7565 const_nunits) / const_nunits;
7566 gcc_assert (elts % group_size == 0);
7567 tree elt = init_expr;
7568 unsigned ivn;
7569 for (ivn = 0; ivn < nivs; ++ivn)
7570 {
7571 tree_vector_builder elts (step_vectype, const_nunits, 1);
7572 stmts = NULL;
7573 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7574 {
7575 if (ivn*const_nunits + eltn >= group_size
7576 && (ivn * const_nunits + eltn) % group_size == 0)
7577 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7578 elt, step_expr);
7579 elts.quick_push (elt);
7580 }
7581 vec_init = gimple_build_vector (&stmts, &elts);
7582 vec_init = gimple_convert (&stmts, vectype, vec_init);
7583 if (stmts)
7584 {
7585 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7586 gcc_assert (!new_bb);
7587 }
7588
7589 /* Create the induction-phi that defines the induction-operand. */
7590 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7591 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7592 stmt_vec_info induction_phi_info
7593 = loop_vinfo->add_stmt (induction_phi);
7594 induc_def = PHI_RESULT (induction_phi);
7595
7596 /* Create the iv update inside the loop */
7597 gimple_seq stmts = NULL;
7598 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7599 vec_def = gimple_build (&stmts,
7600 PLUS_EXPR, step_vectype, vec_def, vec_step);
7601 vec_def = gimple_convert (&stmts, vectype, vec_def);
7602 loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (vec_def));
7603 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7604
7605 /* Set the arguments of the phi node: */
7606 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7607 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7608 UNKNOWN_LOCATION);
7609
7610 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7611 }
7612 /* Fill up to the number of vectors we need for the whole group. */
7613 nivs = least_common_multiple (group_size,
7614 const_nunits) / const_nunits;
7615 for (; ivn < nivs; ++ivn)
7616 SLP_TREE_VEC_STMTS (slp_node)
7617 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
7618
7619 /* Re-use IVs when we can. */
7620 if (ivn < nvects)
7621 {
7622 unsigned vfp
7623 = least_common_multiple (group_size, const_nunits) / group_size;
7624 /* Generate [VF'*S, VF'*S, ... ]. */
7625 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7626 {
7627 expr = build_int_cst (integer_type_node, vfp);
7628 expr = fold_convert (TREE_TYPE (step_expr), expr);
7629 }
7630 else
7631 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7632 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7633 expr, step_expr);
7634 if (! CONSTANT_CLASS_P (new_name))
7635 new_name = vect_init_vector (loop_vinfo, stmt_info, new_name,
7636 TREE_TYPE (step_expr), NULL);
7637 new_vec = build_vector_from_val (step_vectype, new_name);
7638 vec_step = vect_init_vector (loop_vinfo, stmt_info, new_vec,
7639 step_vectype, NULL);
7640 for (; ivn < nvects; ++ivn)
7641 {
7642 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7643 tree def;
7644 if (gimple_code (iv) == GIMPLE_PHI)
7645 def = gimple_phi_result (iv);
7646 else
7647 def = gimple_assign_lhs (iv);
7648 gimple_seq stmts = NULL;
7649 def = gimple_convert (&stmts, step_vectype, def);
7650 def = gimple_build (&stmts,
7651 PLUS_EXPR, step_vectype, def, vec_step);
7652 def = gimple_convert (&stmts, vectype, def);
7653 if (gimple_code (iv) == GIMPLE_PHI)
7654 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7655 else
7656 {
7657 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7658 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
7659 }
7660 SLP_TREE_VEC_STMTS (slp_node).quick_push
7661 (loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (def)));
7662 }
7663 }
7664
7665 return true;
7666 }
7667
7668 /* Create the vector that holds the initial_value of the induction. */
7669 if (nested_in_vect_loop)
7670 {
7671 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7672 been created during vectorization of previous stmts. We obtain it
7673 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7674 vec_init = vect_get_vec_def_for_operand (loop_vinfo,
7675 init_expr, stmt_info);
7676 /* If the initial value is not of proper type, convert it. */
7677 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7678 {
7679 new_stmt
7680 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7681 vect_simple_var,
7682 "vec_iv_"),
7683 VIEW_CONVERT_EXPR,
7684 build1 (VIEW_CONVERT_EXPR, vectype,
7685 vec_init));
7686 vec_init = gimple_assign_lhs (new_stmt);
7687 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7688 new_stmt);
7689 gcc_assert (!new_bb);
7690 loop_vinfo->add_stmt (new_stmt);
7691 }
7692 }
7693 else
7694 {
7695 /* iv_loop is the loop to be vectorized. Create:
7696 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7697 stmts = NULL;
7698 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
7699
7700 unsigned HOST_WIDE_INT const_nunits;
7701 if (nunits.is_constant (&const_nunits))
7702 {
7703 tree_vector_builder elts (step_vectype, const_nunits, 1);
7704 elts.quick_push (new_name);
7705 for (i = 1; i < const_nunits; i++)
7706 {
7707 /* Create: new_name_i = new_name + step_expr */
7708 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7709 new_name, step_expr);
7710 elts.quick_push (new_name);
7711 }
7712 /* Create a vector from [new_name_0, new_name_1, ...,
7713 new_name_nunits-1] */
7714 vec_init = gimple_build_vector (&stmts, &elts);
7715 }
7716 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7717 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7718 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
7719 new_name, step_expr);
7720 else
7721 {
7722 /* Build:
7723 [base, base, base, ...]
7724 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7725 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7726 gcc_assert (flag_associative_math);
7727 tree index = build_index_vector (step_vectype, 0, 1);
7728 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7729 new_name);
7730 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7731 step_expr);
7732 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
7733 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
7734 vec_init, step_vec);
7735 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
7736 vec_init, base_vec);
7737 }
7738 vec_init = gimple_convert (&stmts, vectype, vec_init);
7739
7740 if (stmts)
7741 {
7742 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7743 gcc_assert (!new_bb);
7744 }
7745 }
7746
7747
7748 /* Create the vector that holds the step of the induction. */
7749 if (nested_in_vect_loop)
7750 /* iv_loop is nested in the loop to be vectorized. Generate:
7751 vec_step = [S, S, S, S] */
7752 new_name = step_expr;
7753 else
7754 {
7755 /* iv_loop is the loop to be vectorized. Generate:
7756 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7757 gimple_seq seq = NULL;
7758 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7759 {
7760 expr = build_int_cst (integer_type_node, vf);
7761 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7762 }
7763 else
7764 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7765 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7766 expr, step_expr);
7767 if (seq)
7768 {
7769 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7770 gcc_assert (!new_bb);
7771 }
7772 }
7773
7774 t = unshare_expr (new_name);
7775 gcc_assert (CONSTANT_CLASS_P (new_name)
7776 || TREE_CODE (new_name) == SSA_NAME);
7777 new_vec = build_vector_from_val (step_vectype, t);
7778 vec_step = vect_init_vector (loop_vinfo, stmt_info,
7779 new_vec, step_vectype, NULL);
7780
7781
7782 /* Create the following def-use cycle:
7783 loop prolog:
7784 vec_init = ...
7785 vec_step = ...
7786 loop:
7787 vec_iv = PHI <vec_init, vec_loop>
7788 ...
7789 STMT
7790 ...
7791 vec_loop = vec_iv + vec_step; */
7792
7793 /* Create the induction-phi that defines the induction-operand. */
7794 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7795 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7796 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7797 induc_def = PHI_RESULT (induction_phi);
7798
7799 /* Create the iv update inside the loop */
7800 stmts = NULL;
7801 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7802 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
7803 vec_def = gimple_convert (&stmts, vectype, vec_def);
7804 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7805 new_stmt = SSA_NAME_DEF_STMT (vec_def);
7806 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7807
7808 /* Set the arguments of the phi node: */
7809 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7810 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7811 UNKNOWN_LOCATION);
7812
7813 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7814
7815 /* In case the vectorization factor (VF) is bigger than the number
7816 of elements that we can fit in a vectype (nunits), we have to generate
7817 more than one vector stmt, i.e. we need to "unroll" the
7818 vector stmt by a factor of VF/nunits. For more details see documentation
7819 in vectorizable_operation. */
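/* Illustrative example: with nunits 4 and VF 8, ncopies is 2; the loop
   below computes the second copy as vec_1 = vec_iv + [4*S, 4*S, 4*S, 4*S]
   while the PHI itself is stepped by [8*S, 8*S, 8*S, 8*S] per iteration.  */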
7820
7821 if (ncopies > 1)
7822 {
7823 gimple_seq seq = NULL;
7824 stmt_vec_info prev_stmt_vinfo;
7825 /* FORNOW. This restriction should be relaxed. */
7826 gcc_assert (!nested_in_vect_loop);
7827
7828 /* Create the vector that holds the step of the induction. */
7829 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7830 {
7831 expr = build_int_cst (integer_type_node, nunits);
7832 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7833 }
7834 else
7835 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7836 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7837 expr, step_expr);
7838 if (seq)
7839 {
7840 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7841 gcc_assert (!new_bb);
7842 }
7843
7844 t = unshare_expr (new_name);
7845 gcc_assert (CONSTANT_CLASS_P (new_name)
7846 || TREE_CODE (new_name) == SSA_NAME);
7847 new_vec = build_vector_from_val (step_vectype, t);
7848 vec_step = vect_init_vector (loop_vinfo, stmt_info,
7849 new_vec, step_vectype, NULL);
7850
7851 vec_def = induc_def;
7852 prev_stmt_vinfo = induction_phi_info;
7853 for (i = 1; i < ncopies; i++)
7854 {
7855 /* vec_i = vec_prev + vec_step */
7856 gimple_seq stmts = NULL;
7857 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
7858 vec_def = gimple_build (&stmts,
7859 PLUS_EXPR, step_vectype, vec_def, vec_step);
7860 vec_def = gimple_convert (&stmts, vectype, vec_def);
7861
7862 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7863 new_stmt = SSA_NAME_DEF_STMT (vec_def);
7864 new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7865 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7866 prev_stmt_vinfo = new_stmt_info;
7867 }
7868 }
7869
7870 if (nested_in_vect_loop)
7871 {
7872 /* Find the loop-closed exit-phi of the induction, and record
7873 the final vector of induction results: */
7874 exit_phi = NULL;
7875 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7876 {
7877 gimple *use_stmt = USE_STMT (use_p);
7878 if (is_gimple_debug (use_stmt))
7879 continue;
7880
7881 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7882 {
7883 exit_phi = use_stmt;
7884 break;
7885 }
7886 }
7887 if (exit_phi)
7888 {
7889 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7890 /* FORNOW. Currently not supporting the case that an inner-loop induction
7891 is not used in the outer-loop (i.e. only outside the outer-loop). */
7892 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7893 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7894
7895 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7896 if (dump_enabled_p ())
7897 dump_printf_loc (MSG_NOTE, vect_location,
7898 "vector of inductions after inner-loop:%G",
7899 new_stmt);
7900 }
7901 }
7902
7903
7904 if (dump_enabled_p ())
7905 dump_printf_loc (MSG_NOTE, vect_location,
7906 "transform induction: created def-use cycle: %G%G",
7907 induction_phi, SSA_NAME_DEF_STMT (vec_def));
7908
7909 return true;
7910 }
7911
7912 /* Function vectorizable_live_operation.
7913
7914 STMT_INFO computes a value that is used outside the loop. Check if
7915 it can be supported. */
7916
7917 bool
7918 vectorizable_live_operation (loop_vec_info loop_vinfo,
7919 stmt_vec_info stmt_info,
7920 gimple_stmt_iterator *gsi,
7921 slp_tree slp_node, slp_instance slp_node_instance,
7922 int slp_index, bool vec_stmt_p,
7923 stmt_vector_for_cost *)
7924 {
7925 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7926 imm_use_iterator imm_iter;
7927 tree lhs, lhs_type, bitsize, vec_bitsize;
7928 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7929 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7930 int ncopies;
7931 gimple *use_stmt;
7932 auto_vec<tree> vec_oprnds;
7933 int vec_entry = 0;
7934 poly_uint64 vec_index = 0;
7935
7936 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7937
7938 /* If a stmt of a reduction is live, vectorize it via
7939 vect_create_epilog_for_reduction. vectorizable_reduction assessed
7940 validity so just trigger the transform here. */
7941 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
7942 {
7943 if (!vec_stmt_p)
7944 return true;
7945 if (slp_node)
7946 {
7947 /* For reduction chains the meta-info is attached to
7948 the group leader. */
7949 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7950 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7951 /* For SLP reductions we vectorize the epilogue for
7952 all involved stmts together. */
7953 else if (slp_index != 0)
7954 return true;
7955 else
7956 /* For SLP reductions the meta-info is attached to
7957 the representative. */
7958 stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
7959 }
7960 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7961 gcc_assert (reduc_info->is_reduc_info);
7962 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
7963 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
7964 return true;
7965 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
7966 slp_node_instance);
7967 return true;
7968 }
7969
7970 /* FORNOW. CHECKME. */
7971 if (nested_in_vect_loop_p (loop, stmt_info))
7972 return false;
7973
7974 /* If STMT is not relevant and it is a simple assignment and its inputs are
7975 invariant then it can remain in place, unvectorized. The original last
7976 scalar value that it computes will be used. */
7977 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7978 {
7979 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
7980 if (dump_enabled_p ())
7981 dump_printf_loc (MSG_NOTE, vect_location,
7982 "statement is simple and uses invariant. Leaving in "
7983 "place.\n");
7984 return true;
7985 }
7986
7987 if (slp_node)
7988 ncopies = 1;
7989 else
7990 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7991
7992 if (slp_node)
7993 {
7994 gcc_assert (slp_index >= 0);
7995
7996 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7997 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7998
7999 /* Get the last occurrence of the scalar index from the concatenation of
8000 all the slp vectors. Calculate which slp vector it is and the index
8001 within. */
8002 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
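/* Worked example (illustrative): with num_vec 2, nunits 4, num_scalar 3
   and slp_index 1, pos = 2*4 - 3 + 1 = 6, which the division below
   resolves to vec_entry 1 and vec_index 2, i.e. lane 2 of the second
   vector.  */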
8003
8004 /* Calculate which vector contains the result, and which lane of
8005 that vector we need. */
8006 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8007 {
8008 if (dump_enabled_p ())
8009 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8010 "Cannot determine which vector holds the"
8011 " final result.\n");
8012 return false;
8013 }
8014 }
8015
8016 if (!vec_stmt_p)
8017 {
8018 /* No transformation required. */
8019 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
8020 {
8021 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8022 OPTIMIZE_FOR_SPEED))
8023 {
8024 if (dump_enabled_p ())
8025 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8026 "can't use a fully-masked loop because "
8027 "the target doesn't support extract last "
8028 "reduction.\n");
8029 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8030 }
8031 else if (slp_node)
8032 {
8033 if (dump_enabled_p ())
8034 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8035 "can't use a fully-masked loop because an "
8036 "SLP statement is live after the loop.\n");
8037 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8038 }
8039 else if (ncopies > 1)
8040 {
8041 if (dump_enabled_p ())
8042 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8043 "can't use a fully-masked loop because"
8044 " ncopies is greater than 1.\n");
8045 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8046 }
8047 else
8048 {
8049 gcc_assert (ncopies == 1 && !slp_node);
8050 vect_record_loop_mask (loop_vinfo,
8051 &LOOP_VINFO_MASKS (loop_vinfo),
8052 1, vectype, NULL);
8053 }
8054 }
8055 return true;
8056 }
8057
8058 /* Use the lhs of the original scalar statement. */
8059 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8060
8061 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
8062 : gimple_get_lhs (stmt);
8063 lhs_type = TREE_TYPE (lhs);
8064
8065 bitsize = vector_element_bits_tree (vectype);
8066 vec_bitsize = TYPE_SIZE (vectype);
8067
8068 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8069 tree vec_lhs, bitstart;
8070 if (slp_node)
8071 {
8072 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8073
8074 /* Get the correct slp vectorized stmt. */
8075 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
8076 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
8077 vec_lhs = gimple_phi_result (phi);
8078 else
8079 vec_lhs = gimple_get_lhs (vec_stmt);
8080
8081 /* Get entry to use. */
8082 bitstart = bitsize_int (vec_index);
8083 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8084 }
8085 else
8086 {
8087 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
8088 vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
8089 gcc_checking_assert (ncopies == 1
8090 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8091
8092 /* For multiple copies, get the last copy. */
8093 for (int i = 1; i < ncopies; ++i)
8094 vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
8095
8096 /* Get the last lane in the vector. */
8097 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
8098 }
8099
8100 /* To ensure that VEC_LHS for the lane extraction stmts satisfies the
8101 loop-closed PHI requirement, insert one phi node for it. It looks like:
8102 loop;
8103 BB:
8104 # lhs' = PHI <lhs>
8105 ==>
8106 loop;
8107 BB:
8108 # vec_lhs' = PHI <vec_lhs>
8109 new_tree = lane_extract <vec_lhs', ...>;
8110 lhs' = new_tree; */
8111
8112 basic_block exit_bb = single_exit (loop)->dest;
8113 gcc_assert (single_pred_p (exit_bb));
8114
8115 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
8116 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
8117 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
8118
8119 gimple_seq stmts = NULL;
8120 tree new_tree;
8121 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8122 {
8123 /* Emit:
8124
8125 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8126
8127 where VEC_LHS is the vectorized live-out result and MASK is
8128 the loop mask for the final iteration. */
8129 gcc_assert (ncopies == 1 && !slp_node);
8130 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8131 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo), 1,
8132 vectype, 0);
8133 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
8134 mask, vec_lhs_phi);
8135
8136 /* Convert the extracted vector element to the required scalar type. */
8137 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8138 }
8139 else
8140 {
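/* Extract the selected lane as a BITSIZE-bit BIT_FIELD_REF at offset
   BITSTART and convert the result to the type of the original scalar LHS.  */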
8141 tree bftype = TREE_TYPE (vectype);
8142 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8143 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8144 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
8145 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8146 &stmts, true, NULL_TREE);
8147 }
8148
8149 if (stmts)
8150 {
8151 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
8152 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8153
8154 /* Remove the existing PHI that uses LHS and assign NEW_TREE to its result. */
8155 tree lhs_phi = NULL_TREE;
8156 gimple_stmt_iterator gsi;
8157 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
8158 {
8159 gimple *phi = gsi_stmt (gsi);
8160 if (gimple_phi_arg_def (phi, 0) == lhs)
8161 {
8162 remove_phi_node (&gsi, false);
8163 lhs_phi = gimple_phi_result (phi);
8164 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8165 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8166 break;
8167 }
8168 }
8169 }
8170
8171 /* Replace each use of LHS with the newly computed result. If the use stmt
8172 is a single-arg PHI, replace all uses of the PHI result instead, since the
8173 LCSSA PHI defining LHS may come before the newly inserted stmt. */
8174 use_operand_p use_p;
8175 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8176 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8177 && !is_gimple_debug (use_stmt))
8178 {
8179 if (gimple_code (use_stmt) == GIMPLE_PHI
8180 && gimple_phi_num_args (use_stmt) == 1)
8181 {
8182 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8183 }
8184 else
8185 {
8186 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8187 SET_USE (use_p, new_tree);
8188 }
8189 update_stmt (use_stmt);
8190 }
8191
8192 return true;
8193 }
8194
8195 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8196
8197 static void
8198 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
8199 {
8200 ssa_op_iter op_iter;
8201 imm_use_iterator imm_iter;
8202 def_operand_p def_p;
8203 gimple *ustmt;
8204
8205 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8206 {
8207 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8208 {
8209 basic_block bb;
8210
8211 if (!is_gimple_debug (ustmt))
8212 continue;
8213
8214 bb = gimple_bb (ustmt);
8215
8216 if (!flow_bb_inside_loop_p (loop, bb))
8217 {
8218 if (gimple_debug_bind_p (ustmt))
8219 {
8220 if (dump_enabled_p ())
8221 dump_printf_loc (MSG_NOTE, vect_location,
8222 "killing debug use\n");
8223
8224 gimple_debug_bind_reset_value (ustmt);
8225 update_stmt (ustmt);
8226 }
8227 else
8228 gcc_unreachable ();
8229 }
8230 }
8231 }
8232 }
8233
8234 /* Given loop represented by LOOP_VINFO, return true if computation of
8235 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8236 otherwise. */
8237
8238 static bool
8239 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8240 {
8241 /* Constant case. */
8242 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8243 {
8244 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8245 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8246
8247 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8248 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8249 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8250 return true;
8251 }
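/* For instance, if NITERSM1 is the maximum value of its type then NITERS
   wraps around to zero, the comparison above fails and we fall through to
   the upper-bound check below.  */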
8252
8253 widest_int max;
8254 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8255 /* Check the upper bound of loop niters. */
8256 if (get_max_loop_iterations (loop, &max))
8257 {
8258 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8259 signop sgn = TYPE_SIGN (type);
8260 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8261 if (max < type_max)
8262 return true;
8263 }
8264 return false;
8265 }
8266
8267 /* Return a mask type with half as many elements as OLD_TYPE,
8268 given that it should have mode NEW_MODE. */
8269
8270 tree
8271 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
8272 {
8273 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
8274 return build_truth_vector_type_for_mode (nunits, new_mode);
8275 }
8276
8277 /* Return a mask type with twice as many elements as OLD_TYPE,
8278 given that it should have mode NEW_MODE. */
8279
8280 tree
8281 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
8282 {
8283 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
8284 return build_truth_vector_type_for_mode (nunits, new_mode);
8285 }
8286
8287 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8288 contain a sequence of NVECTORS masks that each control a vector of type
8289 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
8290 these vector masks with the vector version of SCALAR_MASK. */
8291
8292 void
8293 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8294 unsigned int nvectors, tree vectype, tree scalar_mask)
8295 {
8296 gcc_assert (nvectors != 0);
8297 if (masks->length () < nvectors)
8298 masks->safe_grow_cleared (nvectors);
8299 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8300 /* The number of scalars per iteration and the number of vectors are
8301 both compile-time constants. */
8302 unsigned int nscalars_per_iter
8303 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8304 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
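/* For example (purely illustrative): with a vectorization factor of 16 and
   vectors of 8 elements, NVECTORS == 2 gives
   nscalars_per_iter == (2 * 8) / 16 == 1.  */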
8305
8306 if (scalar_mask)
8307 {
8308 scalar_cond_masked_key cond (scalar_mask, nvectors);
8309 loop_vinfo->scalar_cond_masked_set.add (cond);
8310 }
8311
8312 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8313 {
8314 rgm->max_nscalars_per_iter = nscalars_per_iter;
8315 rgm->mask_type = truth_type_for (vectype);
8316 }
8317 }
8318
8319 /* Given a complete set of masks MASKS, extract mask number INDEX
8320 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8321 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8322
8323 See the comment above vec_loop_masks for more details about the mask
8324 arrangement. */
8325
8326 tree
8327 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8328 unsigned int nvectors, tree vectype, unsigned int index)
8329 {
8330 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8331 tree mask_type = rgm->mask_type;
8332
8333 /* Populate the rgroup's mask array, if this is the first time we've
8334 used it. */
8335 if (rgm->masks.is_empty ())
8336 {
8337 rgm->masks.safe_grow_cleared (nvectors);
8338 for (unsigned int i = 0; i < nvectors; ++i)
8339 {
8340 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8341 /* Provide a dummy definition until the real one is available. */
8342 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8343 rgm->masks[i] = mask;
8344 }
8345 }
8346
8347 tree mask = rgm->masks[index];
8348 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8349 TYPE_VECTOR_SUBPARTS (vectype)))
8350 {
8351 /* A loop mask for data type X can be reused for data type Y
8352 if X has N times more elements than Y and if Y's elements
8353 are N times bigger than X's. In this case each sequence
8354 of N elements in the loop mask will be all-zero or all-one.
8355 We can then view-convert the mask so that each sequence of
8356 N elements is replaced by a single element. */
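/* For instance (illustrative only): a loop mask created for a 16-element
   access can be reused for an 8-element access whose elements are twice
   as wide; each pair of adjacent mask elements is all-zero or all-one, so
   the VIEW_CONVERT_EXPR collapses every pair into one wider element.  */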
8357 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8358 TYPE_VECTOR_SUBPARTS (vectype)));
8359 gimple_seq seq = NULL;
8360 mask_type = truth_type_for (vectype);
8361 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8362 if (seq)
8363 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8364 }
8365 return mask;
8366 }
8367
8368 /* Scale profiling counters by estimation for LOOP which is vectorized
8369 by factor VF. */
8370
8371 static void
8372 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
8373 {
8374 edge preheader = loop_preheader_edge (loop);
8375 /* Reduce loop iterations by the vectorization factor. */
8376 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8377 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8378
8379 if (freq_h.nonzero_p ())
8380 {
8381 profile_probability p;
8382
8383 /* Avoid dropping the loop body's profile count to 0 because of a zero
8384 count in the loop's preheader. */
8385 if (!(freq_e == profile_count::zero ()))
8386 freq_e = freq_e.force_nonzero ();
8387 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8388 scale_loop_frequencies (loop, p);
8389 }
8390
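/* After vectorization the exit edge is expected to be taken roughly once
   every NEW_EST_NITER + 1 executions of the loop header.  */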
8391 edge exit_e = single_exit (loop);
8392 exit_e->probability = profile_probability::always ()
8393 .apply_scale (1, new_est_niter + 1);
8394
8395 edge exit_l = single_pred_edge (loop->latch);
8396 profile_probability prob = exit_l->probability;
8397 exit_l->probability = exit_e->probability.invert ();
8398 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8399 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8400 }
8401
8402 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8403 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8404 stmt_vec_info. */
8405
8406 static void
8407 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8408 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8409 {
8410 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8411 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8412
8413 if (dump_enabled_p ())
8414 dump_printf_loc (MSG_NOTE, vect_location,
8415 "------>vectorizing statement: %G", stmt_info->stmt);
8416
8417 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8418 vect_loop_kill_debug_uses (loop, stmt_info);
8419
8420 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8421 && !STMT_VINFO_LIVE_P (stmt_info))
8422 return;
8423
8424 if (STMT_VINFO_VECTYPE (stmt_info))
8425 {
8426 poly_uint64 nunits
8427 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8428 if (!STMT_SLP_TYPE (stmt_info)
8429 && maybe_ne (nunits, vf)
8430 && dump_enabled_p ())
8431 /* For SLP, VF is set according to the unrolling factor and not to the
8432 vector size, hence this message is not valid for SLP. */
8433 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8434 }
8435
8436 /* Pure SLP statements have already been vectorized. We still need
8437 to apply loop vectorization to hybrid SLP statements. */
8438 if (PURE_SLP_STMT (stmt_info))
8439 return;
8440
8441 if (dump_enabled_p ())
8442 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8443
8444 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
8445 *seen_store = stmt_info;
8446 }
8447
8448 /* Helper function to pass to simplify_replace_tree to enable replacing trees
8449 in the hash_map with their corresponding values. */
8450
8451 static tree
8452 find_in_mapping (tree t, void *context)
8453 {
8454 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
8455
8456 tree *value = mapping->get (t);
8457 return value ? *value : t;
8458 }
8459
8460 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
8461 original loop that has now been vectorized.
8462
8463 The inits of the data_references need to be advanced with the number of
8464 iterations of the main loop. This has been computed in vect_do_peeling and
8465 is stored in parameter ADVANCE. We first restore the data_references'
8466 initial offsets with the values recorded in ORIG_DRS_INIT.
8467
8468 Since the loop_vec_info of this EPILOGUE was constructed for the original
8469 loop, its stmt_vec_infos all point to the original statements. These need
8470 to be updated to point to their corresponding copies as well as the SSA_NAMES
8471 in their PATTERN_DEF_SEQs and RELATED_STMTs.
8472
8473 The data_references' connections also need to be updated: their
8474 corresponding dr_vec_infos need to be reconnected to the EPILOGUE's
8475 stmt_vec_infos, their statements need to point to their corresponding
8476 copies, and if they are gather loads or scatter stores then their
8477 references need to be updated to point to the corresponding copies.
8478 Finally we set 'base_misaligned' to false, as we have already peeled
8479 for alignment in the prologue of the main loop. */
8480
8481 static void
8482 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
8483 {
8484 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
8485 auto_vec<gimple *> stmt_worklist;
8486 hash_map<tree,tree> mapping;
8487 gimple *orig_stmt, *new_stmt;
8488 gimple_stmt_iterator epilogue_gsi;
8489 gphi_iterator epilogue_phi_gsi;
8490 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
8491 basic_block *epilogue_bbs = get_loop_body (epilogue);
8492 unsigned i;
8493
8494 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
8495
8496 /* Advance the data_references by the number of iterations of the previous
8497 loop and its prologue. */
8498 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
8499
8500
8501 /* The EPILOGUE loop is a copy of the original loop, so they share the same
8502 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
8503 point to the copied statements. We also create a mapping from each LHS in
8504 the original loop to the corresponding LHS in the EPILOGUE, and worklists
8505 to update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
8506 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
8507 {
8508 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
8509 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
8510 {
8511 new_stmt = epilogue_phi_gsi.phi ();
8512
8513 gcc_assert (gimple_uid (new_stmt) > 0);
8514 stmt_vinfo
8515 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
8516
8517 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
8518 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
8519
8520 mapping.put (gimple_phi_result (orig_stmt),
8521 gimple_phi_result (new_stmt));
8522 /* PHI nodes cannot have patterns or related statements. */
8523 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
8524 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
8525 }
8526
8527 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
8528 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
8529 {
8530 new_stmt = gsi_stmt (epilogue_gsi);
8531
8532 gcc_assert (gimple_uid (new_stmt) > 0);
8533 stmt_vinfo
8534 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
8535
8536 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
8537 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
8538
8539 if (tree old_lhs = gimple_get_lhs (orig_stmt))
8540 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
8541
8542 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
8543 {
8544 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
8545 for (gimple_stmt_iterator gsi = gsi_start (seq);
8546 !gsi_end_p (gsi); gsi_next (&gsi))
8547 stmt_worklist.safe_push (gsi_stmt (gsi));
8548 }
8549
8550 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
8551 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
8552 {
8553 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
8554 stmt_worklist.safe_push (stmt);
8555 /* Set BB such that the assert in
8556 'get_initial_def_for_reduction' is able to determine that
8557 the BB of the related stmt is inside this loop. */
8558 gimple_set_bb (stmt,
8559 gimple_bb (new_stmt));
8560 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
8561 gcc_assert (related_vinfo == NULL
8562 || related_vinfo == stmt_vinfo);
8563 }
8564 }
8565 }
8566
8567 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
8568 using the original main loop and thus need to be updated to refer to the
8569 cloned variables used in the epilogue. */
8570 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
8571 {
8572 gimple *stmt = stmt_worklist[i];
8573 tree *new_op;
8574
8575 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
8576 {
8577 tree op = gimple_op (stmt, j);
8578 if ((new_op = mapping.get(op)))
8579 gimple_set_op (stmt, j, *new_op);
8580 else
8581 {
8582 /* PR92429: The last argument of simplify_replace_tree disables
8583 folding when replacing arguments. This is required as
8584 otherwise you might end up with different statements than the
8585 ones analyzed in vect_loop_analyze, leading to different
8586 vectorization. */
8587 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
8588 &find_in_mapping, &mapping, false);
8589 gimple_set_op (stmt, j, op);
8590 }
8591 }
8592 }
8593
8594 struct data_reference *dr;
8595 vec<data_reference_p> datarefs = epilogue_vinfo->shared->datarefs;
8596 FOR_EACH_VEC_ELT (datarefs, i, dr)
8597 {
8598 orig_stmt = DR_STMT (dr);
8599 gcc_assert (gimple_uid (orig_stmt) > 0);
8600 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
8601 /* Data references for gather loads and scatter stores do not use the
8602 updated offset we set using ADVANCE. Instead we have to make sure the
8603 references in the data references point to the corresponding copies of
8604 the originals in the epilogue. */
8605 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
8606 == VMAT_GATHER_SCATTER)
8607 {
8608 DR_REF (dr)
8609 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
8610 &find_in_mapping, &mapping);
8611 DR_BASE_ADDRESS (dr)
8612 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
8613 &find_in_mapping, &mapping);
8614 }
8615 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
8616 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
8617 /* The vector size of the epilogue is smaller than that of the main loop,
8618 so the alignment is either the same or lower. This means the DR will
8619 by definition be aligned. */
8620 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
8621 }
8622
8623 epilogue_vinfo->shared->datarefs_copy.release ();
8624 epilogue_vinfo->shared->save_datarefs ();
8625 }
8626
8627 /* Function vect_transform_loop.
8628
8629 The analysis phase has determined that the loop is vectorizable.
8630 Vectorize the loop - create vectorized stmts to replace the scalar
8631 stmts in the loop, and update the loop exit condition.
8632 Returns the scalar epilogue loop, if any. */
8633
8634 class loop *
8635 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
8636 {
8637 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8638 class loop *epilogue = NULL;
8639 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8640 int nbbs = loop->num_nodes;
8641 int i;
8642 tree niters_vector = NULL_TREE;
8643 tree step_vector = NULL_TREE;
8644 tree niters_vector_mult_vf = NULL_TREE;
8645 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8646 unsigned int lowest_vf = constant_lower_bound (vf);
8647 gimple *stmt;
8648 bool check_profitability = false;
8649 unsigned int th;
8650
8651 DUMP_VECT_SCOPE ("vec_transform_loop");
8652
8653 loop_vinfo->shared->check_datarefs ();
8654
8655 /* Use the more conservative vectorization threshold. If the number
8656 of iterations is constant, assume the cost check has been performed
8657 by our caller. If the threshold makes all loops profitable that
8658 run at least the (estimated) vectorization factor number of times,
8659 checking is pointless, too. */
8660 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8661 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
8662 {
8663 if (dump_enabled_p ())
8664 dump_printf_loc (MSG_NOTE, vect_location,
8665 "Profitability threshold is %d loop iterations.\n",
8666 th);
8667 check_profitability = true;
8668 }
8669
8670 /* Make sure there exists a single-predecessor exit bb. Do this before
8671 versioning. */
8672 edge e = single_exit (loop);
8673 if (! single_pred_p (e->dest))
8674 {
8675 split_loop_exit_edge (e, true);
8676 if (dump_enabled_p ())
8677 dump_printf (MSG_NOTE, "split exit edge\n");
8678 }
8679
8680 /* Version the loop first, if required, so the profitability check
8681 comes first. */
8682
8683 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8684 {
8685 class loop *sloop
8686 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
8687 sloop->force_vectorize = false;
8688 check_profitability = false;
8689 }
8690
8691 /* Make sure there exists a single-predecessor exit bb also on the
8692 scalar loop copy. Do this after versioning but before peeling
8693 so CFG structure is fine for both scalar and if-converted loop
8694 to make slpeel_duplicate_current_defs_from_edges face matched
8695 loop closed PHI nodes on the exit. */
8696 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8697 {
8698 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8699 if (! single_pred_p (e->dest))
8700 {
8701 split_loop_exit_edge (e, true);
8702 if (dump_enabled_p ())
8703 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8704 }
8705 }
8706
8707 tree niters = vect_build_loop_niters (loop_vinfo);
8708 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8709 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8710 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8711 tree advance;
8712 drs_init_vec orig_drs_init;
8713
8714 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8715 &step_vector, &niters_vector_mult_vf, th,
8716 check_profitability, niters_no_overflow,
8717 &advance);
8718
8719 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
8720 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
8721 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
8722 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
8723
8724 if (niters_vector == NULL_TREE)
8725 {
8726 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8727 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8728 && known_eq (lowest_vf, vf))
8729 {
8730 niters_vector
8731 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8732 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8733 step_vector = build_one_cst (TREE_TYPE (niters));
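/* E.g. (illustrative): with NITERS == 100 and VF == 4 this gives
   NITERS_VECTOR == 25 and a step of 1.  */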
8734 }
8735 else
8736 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8737 &step_vector, niters_no_overflow);
8738 }
8739
8740 /* 1) Make sure the loop header has exactly two entries
8741 2) Make sure we have a preheader basic block. */
8742
8743 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8744
8745 split_edge (loop_preheader_edge (loop));
8746
8747 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8748 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8749 /* This will deal with any possible peeling. */
8750 vect_prepare_for_masked_peels (loop_vinfo);
8751
8752 /* Schedule the SLP instances first, then handle loop vectorization
8753 below. */
8754 if (!loop_vinfo->slp_instances.is_empty ())
8755 {
8756 DUMP_VECT_SCOPE ("scheduling SLP instances");
8757 vect_schedule_slp (loop_vinfo);
8758 }
8759
8760 /* FORNOW: the vectorizer supports only loops whose body consists
8761 of one basic block (header + empty latch). When the vectorizer
8762 supports more involved loop forms, the order in which the BBs are
8763 traversed will need to be reconsidered. */
8764
8765 for (i = 0; i < nbbs; i++)
8766 {
8767 basic_block bb = bbs[i];
8768 stmt_vec_info stmt_info;
8769
8770 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8771 gsi_next (&si))
8772 {
8773 gphi *phi = si.phi ();
8774 if (dump_enabled_p ())
8775 dump_printf_loc (MSG_NOTE, vect_location,
8776 "------>vectorizing phi: %G", phi);
8777 stmt_info = loop_vinfo->lookup_stmt (phi);
8778 if (!stmt_info)
8779 continue;
8780
8781 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8782 vect_loop_kill_debug_uses (loop, stmt_info);
8783
8784 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8785 && !STMT_VINFO_LIVE_P (stmt_info))
8786 continue;
8787
8788 if (STMT_VINFO_VECTYPE (stmt_info)
8789 && (maybe_ne
8790 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8791 && dump_enabled_p ())
8792 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8793
8794 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8795 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8796 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
8797 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
8798 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
8799 && ! PURE_SLP_STMT (stmt_info))
8800 {
8801 if (dump_enabled_p ())
8802 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8803 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
8804 }
8805 }
8806
8807 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8808 !gsi_end_p (si);)
8809 {
8810 stmt = gsi_stmt (si);
8811 /* During vectorization remove existing clobber stmts. */
8812 if (gimple_clobber_p (stmt))
8813 {
8814 unlink_stmt_vdef (stmt);
8815 gsi_remove (&si, true);
8816 release_defs (stmt);
8817 }
8818 else
8819 {
8820 stmt_info = loop_vinfo->lookup_stmt (stmt);
8821
8822 /* vector stmts created in the outer-loop during vectorization of
8823 stmts in an inner-loop may not have a stmt_info, and do not
8824 need to be vectorized. */
8825 stmt_vec_info seen_store = NULL;
8826 if (stmt_info)
8827 {
8828 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8829 {
8830 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8831 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8832 !gsi_end_p (subsi); gsi_next (&subsi))
8833 {
8834 stmt_vec_info pat_stmt_info
8835 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
8836 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
8837 &si, &seen_store);
8838 }
8839 stmt_vec_info pat_stmt_info
8840 = STMT_VINFO_RELATED_STMT (stmt_info);
8841 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
8842 &seen_store);
8843 }
8844 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
8845 &seen_store);
8846 }
8847 gsi_next (&si);
8848 if (seen_store)
8849 {
8850 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8851 /* Interleaving: the vectorization of the
8852 interleaving chain was completed - free all
8853 the stores in the chain. */
8854 vect_remove_stores (loop_vinfo,
8855 DR_GROUP_FIRST_ELEMENT (seen_store));
8856 else
8857 /* Free the attached stmt_vec_info and remove the stmt. */
8858 loop_vinfo->remove_stmt (stmt_info);
8859 }
8860 }
8861 }
8862
8863 /* Stub out scalar statements that must not survive vectorization.
8864 Doing this here helps with grouped statements, or statements that
8865 are involved in patterns. */
8866 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8867 !gsi_end_p (gsi); gsi_next (&gsi))
8868 {
8869 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8870 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8871 {
8872 tree lhs = gimple_get_lhs (call);
8873 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8874 {
8875 tree zero = build_zero_cst (TREE_TYPE (lhs));
8876 gimple *new_stmt = gimple_build_assign (lhs, zero);
8877 gsi_replace (&gsi, new_stmt, true);
8878 }
8879 }
8880 }
8881 } /* BBs in loop */
8882
8883 /* The vectorization factor is always > 1, so if we use an IV increment
8884 of 1, a zero NITERS becomes a nonzero NITERS_VECTOR. */
8885 if (integer_onep (step_vector))
8886 niters_no_overflow = true;
8887 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8888 niters_vector_mult_vf, !niters_no_overflow);
8889
8890 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8891 scale_profile_for_vect_loop (loop, assumed_vf);
8892
8893 /* True if the final iteration might not handle a full vector's
8894 worth of scalar iterations. */
8895 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8896 /* The minimum number of iterations performed by the epilogue. This
8897 is 1 when peeling for gaps because we always need a final scalar
8898 iteration. */
8899 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8900 /* +1 to convert latch counts to loop iteration counts,
8901 -min_epilogue_iters to remove iterations that cannot be performed
8902 by the vector code. */
8903 int bias_for_lowest = 1 - min_epilogue_iters;
8904 int bias_for_assumed = bias_for_lowest;
8905 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8906 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8907 {
8908 /* When the amount of peeling is known at compile time, the first
8909 iteration will have exactly alignment_npeels active elements.
8910 In the worst case it will have at least one. */
8911 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8912 bias_for_lowest += lowest_vf - min_first_active;
8913 bias_for_assumed += assumed_vf - min_first_active;
8914 }
8915 /* In these calculations the "- 1" converts loop iteration counts
8916 back to latch counts. */
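/* For example (illustrative, assuming no epilogue iterations are required
   and no partial final iteration): with LOWEST_VF == 4 and an upper bound
   of 103 latch iterations (104 loop iterations), the bound becomes
   (103 + 1) / 4 - 1 == 25 latch iterations of the vector loop.  */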
8917 if (loop->any_upper_bound)
8918 loop->nb_iterations_upper_bound
8919 = (final_iter_may_be_partial
8920 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8921 lowest_vf) - 1
8922 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8923 lowest_vf) - 1);
8924 if (loop->any_likely_upper_bound)
8925 loop->nb_iterations_likely_upper_bound
8926 = (final_iter_may_be_partial
8927 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8928 + bias_for_lowest, lowest_vf) - 1
8929 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8930 + bias_for_lowest, lowest_vf) - 1);
8931 if (loop->any_estimate)
8932 loop->nb_iterations_estimate
8933 = (final_iter_may_be_partial
8934 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8935 assumed_vf) - 1
8936 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8937 assumed_vf) - 1);
8938
8939 if (dump_enabled_p ())
8940 {
8941 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8942 {
8943 dump_printf_loc (MSG_NOTE, vect_location,
8944 "LOOP VECTORIZED\n");
8945 if (loop->inner)
8946 dump_printf_loc (MSG_NOTE, vect_location,
8947 "OUTER LOOP VECTORIZED\n");
8948 dump_printf (MSG_NOTE, "\n");
8949 }
8950 else
8951 dump_printf_loc (MSG_NOTE, vect_location,
8952 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
8953 GET_MODE_NAME (loop_vinfo->vector_mode));
8954 }
8955
8956 /* Loops vectorized with a variable factor won't benefit from
8957 unrolling/peeling. */
8958 if (!vf.is_constant ())
8959 {
8960 loop->unroll = 1;
8961 if (dump_enabled_p ())
8962 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
8963 " variable-length vectorization factor\n");
8964 }
8965 /* Free SLP instances here because otherwise stmt reference counting
8966 won't work. */
8967 slp_instance instance;
8968 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8969 vect_free_slp_instance (instance, true);
8970 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8971 /* Clear the safelen field since its value is invalid after vectorization:
8972 the vectorized loop can have loop-carried dependencies. */
8973 loop->safelen = 0;
8974
8975 if (epilogue)
8976 {
8977 update_epilogue_loop_vinfo (epilogue, advance);
8978
8979 epilogue->simduid = loop->simduid;
8980 epilogue->force_vectorize = loop->force_vectorize;
8981 epilogue->dont_vectorize = false;
8982 }
8983
8984 return epilogue;
8985 }
8986
8987 /* The code below tries to perform a simple optimization - revert
8988 if-conversion for masked stores: if the mask of a store is zero, do not
8989 perform the store, nor, if possible, the computation of the stored values.
8990 For example,
8991 for (i=0; i<n; i++)
8992 if (c[i])
8993 {
8994 p1[i] += 1;
8995 p2[i] = p3[i] +2;
8996 }
8997 this transformation will produce the following semi-hammock:
8998
8999 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
9000 {
9001 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9002 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9003 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9004 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9005 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9006 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9007 }
9008 */
9009
9010 void
9011 optimize_mask_stores (class loop *loop)
9012 {
9013 basic_block *bbs = get_loop_body (loop);
9014 unsigned nbbs = loop->num_nodes;
9015 unsigned i;
9016 basic_block bb;
9017 class loop *bb_loop;
9018 gimple_stmt_iterator gsi;
9019 gimple *stmt;
9020 auto_vec<gimple *> worklist;
9021 auto_purge_vect_location sentinel;
9022
9023 vect_location = find_loop_location (loop);
9024 /* Pick up all masked stores in loop if any. */
9025 for (i = 0; i < nbbs; i++)
9026 {
9027 bb = bbs[i];
9028 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
9029 gsi_next (&gsi))
9030 {
9031 stmt = gsi_stmt (gsi);
9032 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
9033 worklist.safe_push (stmt);
9034 }
9035 }
9036
9037 free (bbs);
9038 if (worklist.is_empty ())
9039 return;
9040
9041 /* Loop has masked stores. */
9042 while (!worklist.is_empty ())
9043 {
9044 gimple *last, *last_store;
9045 edge e, efalse;
9046 tree mask;
9047 basic_block store_bb, join_bb;
9048 gimple_stmt_iterator gsi_to;
9049 tree vdef, new_vdef;
9050 gphi *phi;
9051 tree vectype;
9052 tree zero;
9053
9054 last = worklist.pop ();
9055 mask = gimple_call_arg (last, 2);
9056 bb = gimple_bb (last);
9057 /* Create then_bb and the if-then structure in the CFG; then_bb belongs to
9058 the same loop as if_bb. It can be different from LOOP when a two-level
9059 loop nest is vectorized and the mask_store belongs to the inner
9060 one. */
9061 e = split_block (bb, last);
9062 bb_loop = bb->loop_father;
9063 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
9064 join_bb = e->dest;
9065 store_bb = create_empty_bb (bb);
9066 add_bb_to_loop (store_bb, bb_loop);
9067 e->flags = EDGE_TRUE_VALUE;
9068 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
9069 /* Give the edge into STORE_BB an unlikely static probability. */
9070 efalse->probability = profile_probability::unlikely ();
9071 store_bb->count = efalse->count ();
9072 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
9073 if (dom_info_available_p (CDI_DOMINATORS))
9074 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
9075 if (dump_enabled_p ())
9076 dump_printf_loc (MSG_NOTE, vect_location,
9077 "Create new block %d to sink mask stores.",
9078 store_bb->index);
9079 /* Create vector comparison with boolean result. */
9080 vectype = TREE_TYPE (mask);
9081 zero = build_zero_cst (vectype);
9082 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
9083 gsi = gsi_last_bb (bb);
9084 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
9085 /* Create new PHI node for vdef of the last masked store:
9086 .MEM_2 = VDEF <.MEM_1>
9087 will be converted to
9088 .MEM.3 = VDEF <.MEM_1>
9089 and new PHI node will be created in join bb
9090 .MEM_2 = PHI <.MEM_1, .MEM_3>
9091 */
9092 vdef = gimple_vdef (last);
9093 new_vdef = make_ssa_name (gimple_vop (cfun), last);
9094 gimple_set_vdef (last, new_vdef);
9095 phi = create_phi_node (vdef, join_bb);
9096 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
9097
9098 /* Put all masked stores with the same mask to STORE_BB if possible. */
9099 while (true)
9100 {
9101 gimple_stmt_iterator gsi_from;
9102 gimple *stmt1 = NULL;
9103
9104 /* Move masked store to STORE_BB. */
9105 last_store = last;
9106 gsi = gsi_for_stmt (last);
9107 gsi_from = gsi;
9108 /* Shift GSI to the previous stmt for further traversal. */
9109 gsi_prev (&gsi);
9110 gsi_to = gsi_start_bb (store_bb);
9111 gsi_move_before (&gsi_from, &gsi_to);
9112 /* Setup GSI_TO to the non-empty block start. */
9113 gsi_to = gsi_start_bb (store_bb);
9114 if (dump_enabled_p ())
9115 dump_printf_loc (MSG_NOTE, vect_location,
9116 "Move stmt to created bb\n%G", last);
9117 /* Move all stored value producers if possible. */
9118 while (!gsi_end_p (gsi))
9119 {
9120 tree lhs;
9121 imm_use_iterator imm_iter;
9122 use_operand_p use_p;
9123 bool res;
9124
9125 /* Skip debug statements. */
9126 if (is_gimple_debug (gsi_stmt (gsi)))
9127 {
9128 gsi_prev (&gsi);
9129 continue;
9130 }
9131 stmt1 = gsi_stmt (gsi);
9132 /* Do not consider statements writing to memory or having a
9133 volatile operand. */
9134 if (gimple_vdef (stmt1)
9135 || gimple_has_volatile_ops (stmt1))
9136 break;
9137 gsi_from = gsi;
9138 gsi_prev (&gsi);
9139 lhs = gimple_get_lhs (stmt1);
9140 if (!lhs)
9141 break;
9142
9143 /* LHS of vectorized stmt must be SSA_NAME. */
9144 if (TREE_CODE (lhs) != SSA_NAME)
9145 break;
9146
9147 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9148 {
9149 /* Remove dead scalar statement. */
9150 if (has_zero_uses (lhs))
9151 {
9152 gsi_remove (&gsi_from, true);
9153 continue;
9154 }
9155 }
9156
9157 /* Check that LHS does not have uses outside of STORE_BB. */
9158 res = true;
9159 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9160 {
9161 gimple *use_stmt;
9162 use_stmt = USE_STMT (use_p);
9163 if (is_gimple_debug (use_stmt))
9164 continue;
9165 if (gimple_bb (use_stmt) != store_bb)
9166 {
9167 res = false;
9168 break;
9169 }
9170 }
9171 if (!res)
9172 break;
9173
9174 if (gimple_vuse (stmt1)
9175 && gimple_vuse (stmt1) != gimple_vuse (last_store))
9176 break;
9177
9178 /* Can move STMT1 to STORE_BB. */
9179 if (dump_enabled_p ())
9180 dump_printf_loc (MSG_NOTE, vect_location,
9181 "Move stmt to created bb\n%G", stmt1);
9182 gsi_move_before (&gsi_from, &gsi_to);
9183 /* Shift GSI_TO for further insertion. */
9184 gsi_prev (&gsi_to);
9185 }
9186 /* Put other masked stores with the same mask to STORE_BB. */
9187 if (worklist.is_empty ()
9188 || gimple_call_arg (worklist.last (), 2) != mask
9189 || worklist.last () != stmt1)
9190 break;
9191 last = worklist.pop ();
9192 }
9193 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
9194 }
9195 }
9196
9197 /* Decide whether it is possible to use a zero-based induction variable
9198 when vectorizing LOOP_VINFO with a fully-masked loop. If it is,
9199 return the value that the induction variable must be able to hold
9200 in order to ensure that the loop ends with an all-false mask.
9201 Return -1 otherwise. */
9202 widest_int
9203 vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo)
9204 {
9205 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9206 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9207 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
9208
9209 /* Calculate the value that the induction variable must be able
9210 to hit in order to ensure that we end the loop with an all-false mask.
9211 This involves adding the maximum number of inactive trailing scalar
9212 iterations. */
9213 widest_int iv_limit = -1;
9214 if (max_loop_iterations (loop, &iv_limit))
9215 {
9216 if (niters_skip)
9217 {
9218 /* Add the maximum number of skipped iterations to the
9219 maximum iteration count. */
9220 if (TREE_CODE (niters_skip) == INTEGER_CST)
9221 iv_limit += wi::to_widest (niters_skip);
9222 else
9223 iv_limit += max_vf - 1;
9224 }
9225 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
9226 /* Make a conservatively-correct assumption. */
9227 iv_limit += max_vf - 1;
9228
9229 /* IV_LIMIT is the maximum number of latch iterations, which is also
9230 the maximum in-range IV value. Round this value down to the previous
9231 vector alignment boundary and then add an extra full iteration. */
9232 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9233 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
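/* E.g. (illustrative): with a constant VF of 4 (so MAX_VF == 4 and
   known_alignment (vf) == 4), at most 10 latch iterations and no skipped
   or peeled iterations, this gives (10 & -4) + 4 == 12.  */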
9234 }
9235 return iv_limit;
9236 }
9237