gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2020 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "tree-scalar-evolution.h"
47 #include "tree-vectorizer.h"
48 #include "gimple-fold.h"
49 #include "cgraph.h"
50 #include "tree-cfg.h"
51 #include "tree-if-conv.h"
52 #include "internal-fn.h"
53 #include "tree-vector-builder.h"
54 #include "vec-perm-indices.h"
55 #include "tree-eh.h"
56
57 /* Loop Vectorization Pass.
58
59 This pass tries to vectorize loops.
60
61 For example, the vectorizer transforms the following simple loop:
62
63 short a[N]; short b[N]; short c[N]; int i;
64
65 for (i=0; i<N; i++){
66 a[i] = b[i] + c[i];
67 }
68
69 as if it were manually vectorized by rewriting the source code into:
70
71 typedef int __attribute__((mode(V8HI))) v8hi;
72 short a[N]; short b[N]; short c[N]; int i;
73 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
74 v8hi va, vb, vc;
75
76 for (i=0; i<N/8; i++){
77 vb = pb[i];
78 vc = pc[i];
79 va = vb + vc;
80 pa[i] = va;
81 }
82
83 The main entry to this pass is vectorize_loops(), in which
84 the vectorizer applies a set of analyses on a given set of loops,
85 followed by the actual vectorization transformation for the loops that
86 had successfully passed the analysis phase.
87 Throughout this pass we make a distinction between two types of
88 data: scalars (which are represented by SSA_NAMES), and memory references
89 ("data-refs"). These two types of data require different handling both
90 during analysis and transformation. The types of data-refs that the
91 vectorizer currently supports are ARRAY_REFs whose base is an array DECL
92 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
93 accesses are required to have a simple (consecutive) access pattern.
94
95 Analysis phase:
96 ===============
97 The driver for the analysis phase is vect_analyze_loop().
98 It applies a set of analyses, some of which rely on the scalar evolution
99 analyzer (scev) developed by Sebastian Pop.
100
101 During the analysis phase the vectorizer records some information
102 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
103 loop, as well as general information about the loop as a whole, which is
104 recorded in a "loop_vec_info" struct attached to each loop.
105
106 Transformation phase:
107 =====================
108 The loop transformation phase scans all the stmts in the loop, and
109 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
110 the loop that needs to be vectorized. It inserts the vector code sequence
111 just before the scalar stmt S, and records a pointer to the vector code
112 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
113 attached to S). This pointer will be used for the vectorization of following
114 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
115 otherwise, we rely on dead code elimination for removing it.
116
117 For example, say stmt S1 was vectorized into stmt VS1:
118
119 VS1: vb = px[i];
120 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
121 S2: a = b;
122
123 To vectorize stmt S2, the vectorizer first finds the stmt that defines
124 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
125 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
126 resulting sequence would be:
127
128 VS1: vb = px[i];
129 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
130 VS2: va = vb;
131 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
132
133 Operands that are not SSA_NAMEs are data-refs that appear in
134 load/store operations (like 'x[i]' in S1), and are handled differently.
135
136 Target modeling:
137 =================
138 Currently the only target-specific information that is used is the
139 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
140 Targets that can support different vector sizes will, for now, need to
141 specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
142 flexibility will be added in the future.
143
144 Since we only vectorize operations whose vector form can be
145 expressed using existing tree codes, to verify that an operation is
146 supported, the vectorizer checks the relevant optab at the relevant
147 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
148 the value found is CODE_FOR_nothing, then there's no target support, and
149 we can't vectorize the stmt.
150
151 For additional information on this project see:
152 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
153 */
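/* For example (an illustrative sketch, not code used by this pass):
   checking whether the target can add two V8HI vectors amounts to

     if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
       ... no target support, the stmt cannot be vectorized ...

   where the optab and the machine_mode are derived from the scalar
   operation and from the vector type chosen for the stmt.  */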
154
155 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
156 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
157 bool *, bool *);
158
159 /* Subroutine of vect_determine_vf_for_stmt that handles only one
160 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
161 may already be set for general statements (not just data refs). */
162
163 static opt_result
164 vect_determine_vf_for_stmt_1 (vec_info *vinfo, stmt_vec_info stmt_info,
165 bool vectype_maybe_set_p,
166 poly_uint64 *vf)
167 {
168 gimple *stmt = stmt_info->stmt;
169
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
173 {
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return opt_result::success ();
177 }
178
179 tree stmt_vectype, nunits_vectype;
180 opt_result res = vect_get_vector_types_for_stmt (vinfo, stmt_info,
181 &stmt_vectype,
182 &nunits_vectype);
183 if (!res)
184 return res;
185
186 if (stmt_vectype)
187 {
188 if (STMT_VINFO_VECTYPE (stmt_info))
189 /* The only case in which a vectype has already been set is for stmts
190 that contain a data ref, or for "pattern-stmts" (stmts generated
191 by the vectorizer to represent/replace a certain idiom). */
192 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
193 || vectype_maybe_set_p)
194 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
195 else
196 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
197 }
198
199 if (nunits_vectype)
200 vect_update_max_nunits (vf, nunits_vectype);
201
202 return opt_result::success ();
203 }
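/* Illustrative note (based on the code above, not a separate API): each
   call feeds the number of lanes of NUNITS_VECTYPE into
   vect_update_max_nunits, so after seeing, say, a V4SI stmt (4 lanes)
   and a V8HI stmt (8 lanes) with the same 16-byte vectors, *VF ends up
   large enough for both, i.e. 8.  */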
204
205 /* Subroutine of vect_determine_vectorization_factor. Set the vector
206 types of STMT_INFO and all attached pattern statements and update
207 the vectorization factor VF accordingly. Return true on success
208 or false if something prevented vectorization. */
209
210 static opt_result
211 vect_determine_vf_for_stmt (vec_info *vinfo,
212 stmt_vec_info stmt_info, poly_uint64 *vf)
213 {
214 if (dump_enabled_p ())
215 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
216 stmt_info->stmt);
217 opt_result res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, false, vf);
218 if (!res)
219 return res;
220
221 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
222 && STMT_VINFO_RELATED_STMT (stmt_info))
223 {
224 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
225 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
226
227 /* If a pattern statement has def stmts, analyze them too. */
228 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
229 !gsi_end_p (si); gsi_next (&si))
230 {
231 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
232 if (dump_enabled_p ())
233 dump_printf_loc (MSG_NOTE, vect_location,
234 "==> examining pattern def stmt: %G",
235 def_stmt_info->stmt);
236 res = vect_determine_vf_for_stmt_1 (vinfo, def_stmt_info, true, vf);
237 if (!res)
238 return res;
239 }
240
241 if (dump_enabled_p ())
242 dump_printf_loc (MSG_NOTE, vect_location,
243 "==> examining pattern statement: %G",
244 stmt_info->stmt);
245 res = vect_determine_vf_for_stmt_1 (vinfo, stmt_info, true, vf);
246 if (!res)
247 return res;
248 }
249
250 return opt_result::success ();
251 }
252
253 /* Function vect_determine_vectorization_factor
254
255 Determine the vectorization factor (VF). VF is the number of data elements
256 that are operated upon in parallel in a single iteration of the vectorized
257 loop. For example, when vectorizing a loop that operates on 4-byte elements,
258 on a target with a 16-byte vector size (VS), the VF is set to 4, since 4
259 elements can fit in a single vector register.
260
261 We currently support vectorization of loops in which all types operated upon
262 are of the same size. Therefore this function currently sets VF according to
263 the size of the types operated upon, and fails if there are multiple sizes
264 in the loop.
265
266 VF is also the factor by which the loop iterations are strip-mined, e.g.:
267 original loop:
268 for (i=0; i<N; i++){
269 a[i] = b[i] + c[i];
270 }
271
272 vectorized loop:
273 for (i=0; i<N; i+=VF){
274 a[i:VF] = b[i:VF] + c[i:VF];
275 }
276 */
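/* Illustrative worked example: for the loop in the file header comment,
   which operates on 2-byte shorts with a 16-byte vector size, the VF
   computed here would be 8 (the exact value of course depends on the
   vector modes the target provides).  */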
277
278 static opt_result
279 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
280 {
281 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
282 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
283 unsigned nbbs = loop->num_nodes;
284 poly_uint64 vectorization_factor = 1;
285 tree scalar_type = NULL_TREE;
286 gphi *phi;
287 tree vectype;
288 stmt_vec_info stmt_info;
289 unsigned i;
290
291 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
292
293 for (i = 0; i < nbbs; i++)
294 {
295 basic_block bb = bbs[i];
296
297 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
298 gsi_next (&si))
299 {
300 phi = si.phi ();
301 stmt_info = loop_vinfo->lookup_stmt (phi);
302 if (dump_enabled_p ())
303 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
304 phi);
305
306 gcc_assert (stmt_info);
307
308 if (STMT_VINFO_RELEVANT_P (stmt_info)
309 || STMT_VINFO_LIVE_P (stmt_info))
310 {
311 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
312 scalar_type = TREE_TYPE (PHI_RESULT (phi));
313
314 if (dump_enabled_p ())
315 dump_printf_loc (MSG_NOTE, vect_location,
316 "get vectype for scalar type: %T\n",
317 scalar_type);
318
319 vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
320 if (!vectype)
321 return opt_result::failure_at (phi,
322 "not vectorized: unsupported "
323 "data-type %T\n",
324 scalar_type);
325 STMT_VINFO_VECTYPE (stmt_info) = vectype;
326
327 if (dump_enabled_p ())
328 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
329 vectype);
330
331 if (dump_enabled_p ())
332 {
333 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
334 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
335 dump_printf (MSG_NOTE, "\n");
336 }
337
338 vect_update_max_nunits (&vectorization_factor, vectype);
339 }
340 }
341
342 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
343 gsi_next (&si))
344 {
345 if (is_gimple_debug (gsi_stmt (si)))
346 continue;
347 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
348 opt_result res
349 = vect_determine_vf_for_stmt (loop_vinfo,
350 stmt_info, &vectorization_factor);
351 if (!res)
352 return res;
353 }
354 }
355
356 /* TODO: Analyze cost. Decide if worth while to vectorize. */
357 if (dump_enabled_p ())
358 {
359 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
360 dump_dec (MSG_NOTE, vectorization_factor);
361 dump_printf (MSG_NOTE, "\n");
362 }
363
364 if (known_le (vectorization_factor, 1U))
365 return opt_result::failure_at (vect_location,
366 "not vectorized: unsupported data-type\n");
367 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
368 return opt_result::success ();
369 }
370
371
372 /* Function vect_is_simple_iv_evolution.
373
374 FORNOW: A simple evolution of an induction variable in the loop is
375 considered a polynomial evolution. */
376
377 static bool
378 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
379 tree * step)
380 {
381 tree init_expr;
382 tree step_expr;
383 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
384 basic_block bb;
385
386 /* When there is no evolution in this loop, the evolution function
387 is not "simple". */
388 if (evolution_part == NULL_TREE)
389 return false;
390
391 /* When the evolution is a polynomial of degree >= 2
392 the evolution function is not "simple". */
393 if (tree_is_chrec (evolution_part))
394 return false;
395
396 step_expr = evolution_part;
397 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
398
399 if (dump_enabled_p ())
400 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
401 step_expr, init_expr);
402
403 *init = init_expr;
404 *step = step_expr;
405
406 if (TREE_CODE (step_expr) != INTEGER_CST
407 && (TREE_CODE (step_expr) != SSA_NAME
408 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
409 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
410 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
411 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
412 || !flag_associative_math)))
413 && (TREE_CODE (step_expr) != REAL_CST
414 || !flag_associative_math))
415 {
416 if (dump_enabled_p ())
417 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
418 "step unknown.\n");
419 return false;
420 }
421
422 return true;
423 }
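/* Illustrative example, using the usual scalar-evolution notation
   (this file only ever sees such chrecs through the scev API): for

     for (i = 0; i < n; i++)
       p = p + 4;

   the access function of p's loop phi is the chrec {p_0, +, 4}_loop,
   so the routine above returns INIT = p_0 and STEP = 4.  */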
424
425 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
426 what we are assuming is a double reduction. For example, given
427 a structure like this:
428
429 outer1:
430 x_1 = PHI <x_4(outer2), ...>;
431 ...
432
433 inner:
434 x_2 = PHI <x_1(outer1), ...>;
435 ...
436 x_3 = ...;
437 ...
438
439 outer2:
440 x_4 = PHI <x_3(inner)>;
441 ...
442
443 outer loop analysis would treat x_1 as a double reduction phi and
444 this function would then return true for x_2. */
445
446 static bool
447 vect_inner_phi_in_double_reduction_p (loop_vec_info loop_vinfo, gphi *phi)
448 {
449 use_operand_p use_p;
450 ssa_op_iter op_iter;
451 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
452 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
453 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
454 return true;
455 return false;
456 }
457
458 /* Function vect_analyze_scalar_cycles_1.
459
460 Examine the cross iteration def-use cycles of scalar variables
461 in LOOP. LOOP_VINFO represents the loop that is now being
462 considered for vectorization (can be LOOP, or an outer-loop
463 enclosing LOOP). */
464
465 static void
466 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
467 {
468 basic_block bb = loop->header;
469 tree init, step;
470 auto_vec<stmt_vec_info, 64> worklist;
471 gphi_iterator gsi;
472 bool double_reduc, reduc_chain;
473
474 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
475
476 /* First - identify all inductions. Reduction detection assumes that all the
477 inductions have been identified; therefore, this order must not be
478 changed. */
479 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
480 {
481 gphi *phi = gsi.phi ();
482 tree access_fn = NULL;
483 tree def = PHI_RESULT (phi);
484 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
485
486 if (dump_enabled_p ())
487 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
488
489 /* Skip virtual phi's. The data dependences that are associated with
490 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
491 if (virtual_operand_p (def))
492 continue;
493
494 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
495
496 /* Analyze the evolution function. */
497 access_fn = analyze_scalar_evolution (loop, def);
498 if (access_fn)
499 {
500 STRIP_NOPS (access_fn);
501 if (dump_enabled_p ())
502 dump_printf_loc (MSG_NOTE, vect_location,
503 "Access function of PHI: %T\n", access_fn);
504 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
505 = initial_condition_in_loop_num (access_fn, loop->num);
506 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
507 = evolution_part_in_loop_num (access_fn, loop->num);
508 }
509
510 if (!access_fn
511 || vect_inner_phi_in_double_reduction_p (loop_vinfo, phi)
512 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
513 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
514 && TREE_CODE (step) != INTEGER_CST))
515 {
516 worklist.safe_push (stmt_vinfo);
517 continue;
518 }
519
520 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
521 != NULL_TREE);
522 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
523
524 if (dump_enabled_p ())
525 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
526 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
527 }
528
529
530 /* Second - identify all reductions and nested cycles. */
531 while (worklist.length () > 0)
532 {
533 stmt_vec_info stmt_vinfo = worklist.pop ();
534 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
535 tree def = PHI_RESULT (phi);
536
537 if (dump_enabled_p ())
538 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
539
540 gcc_assert (!virtual_operand_p (def)
541 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
542
543 stmt_vec_info reduc_stmt_info
544 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc,
545 &reduc_chain);
546 if (reduc_stmt_info)
547 {
548 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
549 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
550 if (double_reduc)
551 {
552 if (dump_enabled_p ())
553 dump_printf_loc (MSG_NOTE, vect_location,
554 "Detected double reduction.\n");
555
556 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
557 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
558 }
559 else
560 {
561 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
562 {
563 if (dump_enabled_p ())
564 dump_printf_loc (MSG_NOTE, vect_location,
565 "Detected vectorizable nested cycle.\n");
566
567 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
568 }
569 else
570 {
571 if (dump_enabled_p ())
572 dump_printf_loc (MSG_NOTE, vect_location,
573 "Detected reduction.\n");
574
575 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
576 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
577 /* Store the reduction cycles for possible vectorization in
578 loop-aware SLP if it was not detected as a reduction
579 chain. */
580 if (! reduc_chain)
581 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
582 (reduc_stmt_info);
583 }
584 }
585 }
586 else
587 if (dump_enabled_p ())
588 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
589 "Unknown def-use cycle pattern.\n");
590 }
591 }
592
593
594 /* Function vect_analyze_scalar_cycles.
595
596 Examine the cross iteration def-use cycles of scalar variables, by
597 analyzing the loop-header PHIs of scalar variables. Classify each
598 cycle as one of the following: invariant, induction, reduction, unknown.
599 We do that for the loop represented by LOOP_VINFO, and also for its
600 inner loop, if it exists.
601 Examples for scalar cycles:
602
603 Example1: reduction:
604
605 loop1:
606 for (i=0; i<N; i++)
607 sum += a[i];
608
609 Example2: induction:
610
611 loop2:
612 for (i=0; i<N; i++)
613 a[i] = i; */
614
615 static void
616 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
617 {
618 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
619
620 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
621
622 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
623 Reductions in such an inner loop therefore have different properties than
624 the reductions in the nest that gets vectorized:
625 1. When vectorized, they are executed in the same order as in the original
626 scalar loop, so we can't change the order of computation when
627 vectorizing them.
628 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
629 current checks are too strict. */
630
631 if (loop->inner)
632 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
633 }
634
635 /* Transfer group and reduction information from STMT_INFO to its
636 pattern stmt. */
637
638 static void
639 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
640 {
641 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
642 stmt_vec_info stmtp;
643 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
644 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
645 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
646 do
647 {
648 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
649 gcc_checking_assert (STMT_VINFO_DEF_TYPE (stmtp)
650 == STMT_VINFO_DEF_TYPE (stmt_info));
651 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
652 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
653 if (stmt_info)
654 REDUC_GROUP_NEXT_ELEMENT (stmtp)
655 = STMT_VINFO_RELATED_STMT (stmt_info);
656 }
657 while (stmt_info);
658 }
659
660 /* Fixup scalar cycles that now have their stmts detected as patterns. */
661
662 static void
663 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
664 {
665 stmt_vec_info first;
666 unsigned i;
667
668 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
669 if (STMT_VINFO_IN_PATTERN_P (first))
670 {
671 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
672 while (next)
673 {
674 if (! STMT_VINFO_IN_PATTERN_P (next)
675 || STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (next)) == -1)
676 break;
677 next = REDUC_GROUP_NEXT_ELEMENT (next);
678 }
679 /* If not all stmts in the chain are patterns, or if we failed
680 to update STMT_VINFO_REDUC_IDX, try to handle the chain
681 without patterns. */
682 if (! next
683 && STMT_VINFO_REDUC_IDX (STMT_VINFO_RELATED_STMT (first)) != -1)
684 {
685 vect_fixup_reduc_chain (first);
686 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
687 = STMT_VINFO_RELATED_STMT (first);
688 }
689 }
690 }
691
692 /* Function vect_get_loop_niters.
693
694 Determine the number of iterations the loop executes and place it
695 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
696 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
697 niter information holds in ASSUMPTIONS.
698
699 Return the loop exit condition. */
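/* Worked example (illustrative): for a loop whose body runs exactly 100
   times, the latch is executed 99 times, so NUMBER_OF_ITERATIONSM1 is 99
   and NUMBER_OF_ITERATIONS is 100 (the number of header executions, see
   the comment near the end of the function).  */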
700
701
702 static gcond *
703 vect_get_loop_niters (class loop *loop, tree *assumptions,
704 tree *number_of_iterations, tree *number_of_iterationsm1)
705 {
706 edge exit = single_exit (loop);
707 class tree_niter_desc niter_desc;
708 tree niter_assumptions, niter, may_be_zero;
709 gcond *cond = get_loop_exit_condition (loop);
710
711 *assumptions = boolean_true_node;
712 *number_of_iterationsm1 = chrec_dont_know;
713 *number_of_iterations = chrec_dont_know;
714 DUMP_VECT_SCOPE ("get_loop_niters");
715
716 if (!exit)
717 return cond;
718
719 may_be_zero = NULL_TREE;
720 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
721 || chrec_contains_undetermined (niter_desc.niter))
722 return cond;
723
724 niter_assumptions = niter_desc.assumptions;
725 may_be_zero = niter_desc.may_be_zero;
726 niter = niter_desc.niter;
727
728 if (may_be_zero && integer_zerop (may_be_zero))
729 may_be_zero = NULL_TREE;
730
731 if (may_be_zero)
732 {
733 if (COMPARISON_CLASS_P (may_be_zero))
734 {
735 /* Try to combine may_be_zero with assumptions, this can simplify
736 computation of niter expression. */
737 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
738 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
739 niter_assumptions,
740 fold_build1 (TRUTH_NOT_EXPR,
741 boolean_type_node,
742 may_be_zero));
743 else
744 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
745 build_int_cst (TREE_TYPE (niter), 0),
746 rewrite_to_non_trapping_overflow (niter));
747
748 may_be_zero = NULL_TREE;
749 }
750 else if (integer_nonzerop (may_be_zero))
751 {
752 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
753 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
754 return cond;
755 }
756 else
757 return cond;
758 }
759
760 *assumptions = niter_assumptions;
761 *number_of_iterationsm1 = niter;
762
763 /* We want the number of loop header executions which is the number
764 of latch executions plus one.
765 ??? For UINT_MAX latch executions this number overflows to zero
766 for loops like do { n++; } while (n != 0); */
767 if (niter && !chrec_contains_undetermined (niter))
768 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
769 build_int_cst (TREE_TYPE (niter), 1));
770 *number_of_iterations = niter;
771
772 return cond;
773 }
774
775 /* Function bb_in_loop_p
776
777 Used as predicate for dfs order traversal of the loop bbs. */
778
779 static bool
780 bb_in_loop_p (const_basic_block bb, const void *data)
781 {
782 const class loop *const loop = (const class loop *)data;
783 if (flow_bb_inside_loop_p (loop, bb))
784 return true;
785 return false;
786 }
787
788
789 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
790 stmt_vec_info structs for all the stmts in LOOP_IN. */
791
792 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
793 : vec_info (vec_info::loop, init_cost (loop_in), shared),
794 loop (loop_in),
795 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
796 num_itersm1 (NULL_TREE),
797 num_iters (NULL_TREE),
798 num_iters_unchanged (NULL_TREE),
799 num_iters_assumptions (NULL_TREE),
800 th (0),
801 versioning_threshold (0),
802 vectorization_factor (0),
803 max_vectorization_factor (0),
804 mask_skip_niters (NULL_TREE),
805 mask_compare_type (NULL_TREE),
806 simd_if_cond (NULL_TREE),
807 unaligned_dr (NULL),
808 peeling_for_alignment (0),
809 ptr_mask (0),
810 ivexpr_map (NULL),
811 scan_map (NULL),
812 slp_unrolling_factor (1),
813 single_scalar_iteration_cost (0),
814 vec_outside_cost (0),
815 vec_inside_cost (0),
816 vectorizable (false),
817 can_fully_mask_p (true),
818 fully_masked_p (false),
819 peeling_for_gaps (false),
820 peeling_for_niter (false),
821 no_data_dependencies (false),
822 has_mask_store (false),
823 scalar_loop_scaling (profile_probability::uninitialized ()),
824 scalar_loop (NULL),
825 orig_loop_info (NULL)
826 {
827 /* CHECKME: We want to visit all BBs before their successors (except for
828 latch blocks, for which this assertion wouldn't hold). In the simple
829 case of the loop forms we allow, a dfs order of the BBs would be the same
830 as a reversed postorder traversal, so we are safe. */
831
832 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
833 bbs, loop->num_nodes, loop);
834 gcc_assert (nbbs == loop->num_nodes);
835
836 for (unsigned int i = 0; i < nbbs; i++)
837 {
838 basic_block bb = bbs[i];
839 gimple_stmt_iterator si;
840
841 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
842 {
843 gimple *phi = gsi_stmt (si);
844 gimple_set_uid (phi, 0);
845 add_stmt (phi);
846 }
847
848 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
849 {
850 gimple *stmt = gsi_stmt (si);
851 gimple_set_uid (stmt, 0);
852 if (is_gimple_debug (stmt))
853 continue;
854 add_stmt (stmt);
855 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
856 third argument is the #pragma omp simd if (x) condition: when it is 0,
857 the loop shouldn't be vectorized; when it is a non-zero constant, it
858 should be vectorized normally; otherwise the loop is versioned, with
859 the vectorized copy used if the condition is non-zero at runtime. */
860 if (loop_in->simduid
861 && is_gimple_call (stmt)
862 && gimple_call_internal_p (stmt)
863 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
864 && gimple_call_num_args (stmt) >= 3
865 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
866 && (loop_in->simduid
867 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
868 {
869 tree arg = gimple_call_arg (stmt, 2);
870 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
871 simd_if_cond = arg;
872 else
873 gcc_assert (integer_nonzerop (arg));
874 }
875 }
876 }
877
878 epilogue_vinfos.create (6);
879 }
880
881 /* Free all levels of MASKS. */
882
883 void
884 release_vec_loop_masks (vec_loop_masks *masks)
885 {
886 rgroup_masks *rgm;
887 unsigned int i;
888 FOR_EACH_VEC_ELT (*masks, i, rgm)
889 rgm->masks.release ();
890 masks->release ();
891 }
892
893 /* Free all memory used by the _loop_vec_info, as well as all the
894 stmt_vec_info structs of all the stmts in the loop. */
895
896 _loop_vec_info::~_loop_vec_info ()
897 {
898 free (bbs);
899
900 release_vec_loop_masks (&masks);
901 delete ivexpr_map;
902 delete scan_map;
903 epilogue_vinfos.release ();
904
905 loop->aux = NULL;
906 }
907
908 /* Return an invariant or register for EXPR and emit necessary
909 computations in the LOOP_VINFO loop preheader. */
910
911 tree
912 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
913 {
914 if (is_gimple_reg (expr)
915 || is_gimple_min_invariant (expr))
916 return expr;
917
918 if (! loop_vinfo->ivexpr_map)
919 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
920 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
921 if (! cached)
922 {
923 gimple_seq stmts = NULL;
924 cached = force_gimple_operand (unshare_expr (expr),
925 &stmts, true, NULL_TREE);
926 if (stmts)
927 {
928 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
929 gsi_insert_seq_on_edge_immediate (e, stmts);
930 }
931 }
932 return cached;
933 }
934
935 /* Return true if we can use CMP_TYPE as the comparison type to produce
936 all masks required to mask LOOP_VINFO. */
937
938 static bool
939 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
940 {
941 rgroup_masks *rgm;
942 unsigned int i;
943 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
944 if (rgm->mask_type != NULL_TREE
945 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
946 cmp_type, rgm->mask_type,
947 OPTIMIZE_FOR_SPEED))
948 return false;
949 return true;
950 }
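/* Illustrative sketch of what the IFN_WHILE_ULT check above asks for:
   with 4-lane masks, WHILE_ULT (8, 10) would produce the mask
   { 1, 1, 0, 0 }, i.e. lane I is active iff 8 + I < 10.  The target must
   be able to compute such masks with CMP_TYPE as the scalar comparison
   type for every rgroup mask type.  */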
951
952 /* Calculate the maximum number of scalars per iteration over all the
953 rgroups in LOOP_VINFO. */
954
955 static unsigned int
956 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
957 {
958 unsigned int res = 1;
959 unsigned int i;
960 rgroup_masks *rgm;
961 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
962 res = MAX (res, rgm->max_nscalars_per_iter);
963 return res;
964 }
965
966 /* Each statement in LOOP_VINFO can be masked where necessary. Check
967 whether we can actually generate the masks required. Return true if so,
968 storing the comparison type in LOOP_VINFO_MASK_COMPARE_TYPE and the IV type in LOOP_VINFO_MASK_IV_TYPE. */
969
970 static bool
971 vect_verify_full_masking (loop_vec_info loop_vinfo)
972 {
973 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
974 unsigned int min_ni_width;
975 unsigned int max_nscalars_per_iter
976 = vect_get_max_nscalars_per_iter (loop_vinfo);
977
978 /* Use a normal loop if there are no statements that need masking.
979 This only happens in rare degenerate cases: it means that the loop
980 has no loads, no stores, and no live-out values. */
981 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
982 return false;
983
984 /* Get the maximum number of iterations that is representable
985 in the counter type. */
986 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
987 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
988
989 /* Get a more refined estimate for the number of iterations. */
990 widest_int max_back_edges;
991 if (max_loop_iterations (loop, &max_back_edges))
992 max_ni = wi::smin (max_ni, max_back_edges + 1);
993
994 /* Account for rgroup masks, in which each bit is replicated N times. */
995 max_ni *= max_nscalars_per_iter;
996
997 /* Work out how many bits we need to represent the limit. */
998 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
999
1000 /* Find a scalar mode for which WHILE_ULT is supported. */
1001 opt_scalar_int_mode cmp_mode_iter;
1002 tree cmp_type = NULL_TREE;
1003 tree iv_type = NULL_TREE;
1004 widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
1005 unsigned int iv_precision = UINT_MAX;
1006
1007 if (iv_limit != -1)
1008 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1009 UNSIGNED);
1010
1011 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1012 {
1013 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1014 if (cmp_bits >= min_ni_width
1015 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1016 {
1017 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1018 if (this_type
1019 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1020 {
1021 /* Although we could stop as soon as we find a valid mode,
1022 there are at least two reasons why that's not always the
1023 best choice:
1024
1025 - An IV that's Pmode or wider is more likely to be reusable
1026 in address calculations than an IV that's narrower than
1027 Pmode.
1028
1029 - Doing the comparison in IV_PRECISION or wider allows
1030 a natural 0-based IV, whereas using a narrower comparison
1031 type requires mitigations against wrap-around.
1032
1033 Conversely, if the IV limit is variable, doing the comparison
1034 in a wider type than the original type can introduce
1035 unnecessary extensions, so picking the widest valid mode
1036 is not always a good choice either.
1037
1038 Here we prefer the first IV type that's Pmode or wider,
1039 and the first comparison type that's IV_PRECISION or wider.
1040 (The comparison type must be no wider than the IV type,
1041 to avoid extensions in the vector loop.)
1042
1043 ??? We might want to try continuing beyond Pmode for ILP32
1044 targets if CMP_BITS < IV_PRECISION. */
1045 iv_type = this_type;
1046 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1047 cmp_type = this_type;
1048 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1049 break;
1050 }
1051 }
1052 }
1053
1054 if (!cmp_type)
1055 return false;
1056
1057 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1058 LOOP_VINFO_MASK_IV_TYPE (loop_vinfo) = iv_type;
1059 return true;
1060 }
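/* Worked example for the precision computation above (illustrative):
   if the loop is known to iterate at most 1000 times and the largest
   rgroup has max_nscalars_per_iter == 2, then max_ni is 2000 and
   min_ni_width is 11, since 1024 <= 2000 < 2048.  Any supported integer
   mode of at least 11 bits that can feed WHILE_ULT is then a candidate
   comparison type.  */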
1061
1062 /* Calculate the cost of one scalar iteration of the loop. */
1063 static void
1064 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1065 {
1066 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1067 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1068 int nbbs = loop->num_nodes, factor;
1069 int innerloop_iters, i;
1070
1071 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1072
1073 /* Gather costs for statements in the scalar loop. */
1074
1075 /* FORNOW. */
1076 innerloop_iters = 1;
1077 if (loop->inner)
1078 innerloop_iters = 50; /* FIXME */
1079
1080 for (i = 0; i < nbbs; i++)
1081 {
1082 gimple_stmt_iterator si;
1083 basic_block bb = bbs[i];
1084
1085 if (bb->loop_father == loop->inner)
1086 factor = innerloop_iters;
1087 else
1088 factor = 1;
1089
1090 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1091 {
1092 gimple *stmt = gsi_stmt (si);
1093 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1094
1095 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1096 continue;
1097
1098 /* Skip stmts that are not vectorized inside the loop. */
1099 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1100 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1101 && (!STMT_VINFO_LIVE_P (vstmt_info)
1102 || !VECTORIZABLE_CYCLE_DEF
1103 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1104 continue;
1105
1106 vect_cost_for_stmt kind;
1107 if (STMT_VINFO_DATA_REF (stmt_info))
1108 {
1109 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1110 kind = scalar_load;
1111 else
1112 kind = scalar_store;
1113 }
1114 else if (vect_nop_conversion_p (stmt_info))
1115 continue;
1116 else
1117 kind = scalar_stmt;
1118
1119 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1120 factor, kind, stmt_info, 0, vect_prologue);
1121 }
1122 }
1123
1124 /* Now accumulate cost. */
1125 void *target_cost_data = init_cost (loop);
1126 stmt_info_for_cost *si;
1127 int j;
1128 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1129 j, si)
1130 (void) add_stmt_cost (loop_vinfo, target_cost_data, si->count,
1131 si->kind, si->stmt_info, si->vectype,
1132 si->misalign, vect_body);
1133 unsigned dummy, body_cost = 0;
1134 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1135 destroy_cost_data (target_cost_data);
1136 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1137 }
1138
1139
1140 /* Function vect_analyze_loop_form_1.
1141
1142 Verify that certain CFG restrictions hold, including:
1143 - the loop has a pre-header
1144 - the loop has a single entry and exit
1145 - the loop exit condition is simple enough
1146 - the number of iterations can be analyzed, i.e., a countable loop. The
1147 niter could be analyzed under some assumptions. */
1148
1149 opt_result
1150 vect_analyze_loop_form_1 (class loop *loop, gcond **loop_cond,
1151 tree *assumptions, tree *number_of_iterationsm1,
1152 tree *number_of_iterations, gcond **inner_loop_cond)
1153 {
1154 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1155
1156 /* Different restrictions apply when we are considering an inner-most loop,
1157 vs. an outer (nested) loop.
1158 (FORNOW. May want to relax some of these restrictions in the future). */
1159
1160 if (!loop->inner)
1161 {
1162 /* Inner-most loop. We currently require that the number of BBs is
1163 exactly 2 (the header and latch). Vectorizable inner-most loops
1164 look like this:
1165
1166 (pre-header)
1167 |
1168 header <--------+
1169 | | |
1170 | +--> latch --+
1171 |
1172 (exit-bb) */
1173
1174 if (loop->num_nodes != 2)
1175 return opt_result::failure_at (vect_location,
1176 "not vectorized:"
1177 " control flow in loop.\n");
1178
1179 if (empty_block_p (loop->header))
1180 return opt_result::failure_at (vect_location,
1181 "not vectorized: empty loop.\n");
1182 }
1183 else
1184 {
1185 class loop *innerloop = loop->inner;
1186 edge entryedge;
1187
1188 /* Nested loop. We currently require that the loop is doubly-nested,
1189 contains a single inner loop, and the number of BBs is exactly 5.
1190 Vectorizable outer-loops look like this:
1191
1192 (pre-header)
1193 |
1194 header <---+
1195 | |
1196 inner-loop |
1197 | |
1198 tail ------+
1199 |
1200 (exit-bb)
1201
1202 The inner-loop has the properties expected of inner-most loops
1203 as described above. */
1204
1205 if ((loop->inner)->inner || (loop->inner)->next)
1206 return opt_result::failure_at (vect_location,
1207 "not vectorized:"
1208 " multiple nested loops.\n");
1209
1210 if (loop->num_nodes != 5)
1211 return opt_result::failure_at (vect_location,
1212 "not vectorized:"
1213 " control flow in loop.\n");
1214
1215 entryedge = loop_preheader_edge (innerloop);
1216 if (entryedge->src != loop->header
1217 || !single_exit (innerloop)
1218 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1219 return opt_result::failure_at (vect_location,
1220 "not vectorized:"
1221 " unsupported outerloop form.\n");
1222
1223 /* Analyze the inner-loop. */
1224 tree inner_niterm1, inner_niter, inner_assumptions;
1225 opt_result res
1226 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1227 &inner_assumptions, &inner_niterm1,
1228 &inner_niter, NULL);
1229 if (!res)
1230 {
1231 if (dump_enabled_p ())
1232 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1233 "not vectorized: Bad inner loop.\n");
1234 return res;
1235 }
1236
1237 /* Don't support analyzing niter under assumptions for inner
1238 loop. */
1239 if (!integer_onep (inner_assumptions))
1240 return opt_result::failure_at (vect_location,
1241 "not vectorized: Bad inner loop.\n");
1242
1243 if (!expr_invariant_in_loop_p (loop, inner_niter))
1244 return opt_result::failure_at (vect_location,
1245 "not vectorized: inner-loop count not"
1246 " invariant.\n");
1247
1248 if (dump_enabled_p ())
1249 dump_printf_loc (MSG_NOTE, vect_location,
1250 "Considering outer-loop vectorization.\n");
1251 }
1252
1253 if (!single_exit (loop))
1254 return opt_result::failure_at (vect_location,
1255 "not vectorized: multiple exits.\n");
1256 if (EDGE_COUNT (loop->header->preds) != 2)
1257 return opt_result::failure_at (vect_location,
1258 "not vectorized:"
1259 " too many incoming edges.\n");
1260
1261 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1262 that the loop is represented as a do-while (with a proper if-guard
1263 before the loop if needed), where the loop header contains all the
1264 executable statements, and the latch is empty. */
1265 if (!empty_block_p (loop->latch)
1266 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1267 return opt_result::failure_at (vect_location,
1268 "not vectorized: latch block not empty.\n");
1269
1270 /* Make sure the exit is not abnormal. */
1271 edge e = single_exit (loop);
1272 if (e->flags & EDGE_ABNORMAL)
1273 return opt_result::failure_at (vect_location,
1274 "not vectorized:"
1275 " abnormal loop exit edge.\n");
1276
1277 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1278 number_of_iterationsm1);
1279 if (!*loop_cond)
1280 return opt_result::failure_at
1281 (vect_location,
1282 "not vectorized: complicated exit condition.\n");
1283
1284 if (integer_zerop (*assumptions)
1285 || !*number_of_iterations
1286 || chrec_contains_undetermined (*number_of_iterations))
1287 return opt_result::failure_at
1288 (*loop_cond,
1289 "not vectorized: number of iterations cannot be computed.\n");
1290
1291 if (integer_zerop (*number_of_iterations))
1292 return opt_result::failure_at
1293 (*loop_cond,
1294 "not vectorized: number of iterations = 0.\n");
1295
1296 return opt_result::success ();
1297 }
1298
1299 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1300
1301 opt_loop_vec_info
1302 vect_analyze_loop_form (class loop *loop, vec_info_shared *shared)
1303 {
1304 tree assumptions, number_of_iterations, number_of_iterationsm1;
1305 gcond *loop_cond, *inner_loop_cond = NULL;
1306
1307 opt_result res
1308 = vect_analyze_loop_form_1 (loop, &loop_cond,
1309 &assumptions, &number_of_iterationsm1,
1310 &number_of_iterations, &inner_loop_cond);
1311 if (!res)
1312 return opt_loop_vec_info::propagate_failure (res);
1313
1314 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1315 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1316 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1317 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1318 if (!integer_onep (assumptions))
1319 {
1320 /* We consider to vectorize this loop by versioning it under
1321 some assumptions. In order to do this, we need to clear
1322 existing information computed by scev and niter analyzer. */
1323 scev_reset_htab ();
1324 free_numbers_of_iterations_estimates (loop);
1325 /* Also set flag for this loop so that following scev and niter
1326 analysis are done under the assumptions. */
1327 loop_constraint_set (loop, LOOP_C_FINITE);
1328 /* Also record the assumptions for versioning. */
1329 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1330 }
1331
1332 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1333 {
1334 if (dump_enabled_p ())
1335 {
1336 dump_printf_loc (MSG_NOTE, vect_location,
1337 "Symbolic number of iterations is ");
1338 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1339 dump_printf (MSG_NOTE, "\n");
1340 }
1341 }
1342
1343 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1344 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1345 if (inner_loop_cond)
1346 {
1347 stmt_vec_info inner_loop_cond_info
1348 = loop_vinfo->lookup_stmt (inner_loop_cond);
1349 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1350 }
1351
1352 gcc_assert (!loop->aux);
1353 loop->aux = loop_vinfo;
1354 return opt_loop_vec_info::success (loop_vinfo);
1355 }
1356
1357
1358
1359 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1360 statements, update the vectorization factor. */
1361
1362 static void
1363 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1364 {
1365 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1366 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1367 int nbbs = loop->num_nodes;
1368 poly_uint64 vectorization_factor;
1369 int i;
1370
1371 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1372
1373 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1374 gcc_assert (known_ne (vectorization_factor, 0U));
1375
1376 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1377 the vectorization factor of the loop is the unrolling factor required by
1378 the SLP instances. If that unrolling factor is 1, we say that we
1379 perform pure SLP on the loop; cross-iteration parallelism is not
1380 exploited. */
1381 bool only_slp_in_loop = true;
1382 for (i = 0; i < nbbs; i++)
1383 {
1384 basic_block bb = bbs[i];
1385 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1386 gsi_next (&si))
1387 {
1388 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (si.phi ());
1389 if (!stmt_info)
1390 continue;
1391 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1392 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1393 && !PURE_SLP_STMT (stmt_info))
1394 /* STMT needs both SLP and loop-based vectorization. */
1395 only_slp_in_loop = false;
1396 }
1397 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1398 gsi_next (&si))
1399 {
1400 if (is_gimple_debug (gsi_stmt (si)))
1401 continue;
1402 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1403 stmt_info = vect_stmt_to_vectorize (stmt_info);
1404 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1405 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1406 && !PURE_SLP_STMT (stmt_info))
1407 /* STMT needs both SLP and loop-based vectorization. */
1408 only_slp_in_loop = false;
1409 }
1410 }
1411
1412 if (only_slp_in_loop)
1413 {
1414 if (dump_enabled_p ())
1415 dump_printf_loc (MSG_NOTE, vect_location,
1416 "Loop contains only SLP stmts\n");
1417 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1418 }
1419 else
1420 {
1421 if (dump_enabled_p ())
1422 dump_printf_loc (MSG_NOTE, vect_location,
1423 "Loop contains SLP and non-SLP stmts\n");
1424 /* Both the vectorization factor and unroll factor have the form
1425 GET_MODE_SIZE (loop_vinfo->vector_mode) * X for some rational X,
1426 so they must have a common multiple. */
1427 vectorization_factor
1428 = force_common_multiple (vectorization_factor,
1429 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1430 }
1431
1432 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1433 if (dump_enabled_p ())
1434 {
1435 dump_printf_loc (MSG_NOTE, vect_location,
1436 "Updating vectorization factor to ");
1437 dump_dec (MSG_NOTE, vectorization_factor);
1438 dump_printf (MSG_NOTE, ".\n");
1439 }
1440 }
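/* Illustrative example for the mixed SLP / non-SLP case above: with a
   loop vectorization factor of 4 and an SLP unrolling factor of 6, the
   common multiple chosen is 12, so the loop is effectively unrolled far
   enough for both forms of vectorization.  */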
1441
1442 /* Return true if STMT_INFO describes a double reduction phi and if
1443 the other phi in the reduction is also relevant for vectorization.
1444 This rejects cases such as:
1445
1446 outer1:
1447 x_1 = PHI <x_3(outer2), ...>;
1448 ...
1449
1450 inner:
1451 x_2 = ...;
1452 ...
1453
1454 outer2:
1455 x_3 = PHI <x_2(inner)>;
1456
1457 if nothing in x_2 or elsewhere makes x_1 relevant. */
1458
1459 static bool
1460 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1461 {
1462 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1463 return false;
1464
1465 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1466 }
1467
1468 /* Function vect_analyze_loop_operations.
1469
1470 Scan the loop stmts and make sure they are all vectorizable. */
1471
1472 static opt_result
1473 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1474 {
1475 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1476 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1477 int nbbs = loop->num_nodes;
1478 int i;
1479 stmt_vec_info stmt_info;
1480 bool need_to_vectorize = false;
1481 bool ok;
1482
1483 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1484
1485 auto_vec<stmt_info_for_cost> cost_vec;
1486
1487 for (i = 0; i < nbbs; i++)
1488 {
1489 basic_block bb = bbs[i];
1490
1491 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1492 gsi_next (&si))
1493 {
1494 gphi *phi = si.phi ();
1495 ok = true;
1496
1497 stmt_info = loop_vinfo->lookup_stmt (phi);
1498 if (dump_enabled_p ())
1499 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1500 if (virtual_operand_p (gimple_phi_result (phi)))
1501 continue;
1502
1503 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1504 (i.e., a phi in the tail of the outer-loop). */
1505 if (! is_loop_header_bb_p (bb))
1506 {
1507 /* FORNOW: we currently don't support the case that these phis
1508 are not used in the outer loop (unless it is a double reduction,
1509 i.e., this phi is vect_reduction_def), because this case would
1510 require us to actually do something here. */
1511 if (STMT_VINFO_LIVE_P (stmt_info)
1512 && !vect_active_double_reduction_p (stmt_info))
1513 return opt_result::failure_at (phi,
1514 "Unsupported loop-closed phi"
1515 " in outer-loop.\n");
1516
1517 /* If PHI is used in the outer loop, we check that its operand
1518 is defined in the inner loop. */
1519 if (STMT_VINFO_RELEVANT_P (stmt_info))
1520 {
1521 tree phi_op;
1522
1523 if (gimple_phi_num_args (phi) != 1)
1524 return opt_result::failure_at (phi, "unsupported phi");
1525
1526 phi_op = PHI_ARG_DEF (phi, 0);
1527 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1528 if (!op_def_info)
1529 return opt_result::failure_at (phi, "unsupported phi\n");
1530
1531 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1532 && (STMT_VINFO_RELEVANT (op_def_info)
1533 != vect_used_in_outer_by_reduction))
1534 return opt_result::failure_at (phi, "unsupported phi\n");
1535
1536 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1537 || (STMT_VINFO_DEF_TYPE (stmt_info)
1538 == vect_double_reduction_def))
1539 && !vectorizable_lc_phi (loop_vinfo,
1540 stmt_info, NULL, NULL))
1541 return opt_result::failure_at (phi, "unsupported phi\n");
1542 }
1543
1544 continue;
1545 }
1546
1547 gcc_assert (stmt_info);
1548
1549 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1550 || STMT_VINFO_LIVE_P (stmt_info))
1551 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1552 /* A scalar-dependence cycle that we don't support. */
1553 return opt_result::failure_at (phi,
1554 "not vectorized:"
1555 " scalar dependence cycle.\n");
1556
1557 if (STMT_VINFO_RELEVANT_P (stmt_info))
1558 {
1559 need_to_vectorize = true;
1560 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1561 && ! PURE_SLP_STMT (stmt_info))
1562 ok = vectorizable_induction (loop_vinfo,
1563 stmt_info, NULL, NULL, NULL,
1564 &cost_vec);
1565 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1566 || (STMT_VINFO_DEF_TYPE (stmt_info)
1567 == vect_double_reduction_def)
1568 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1569 && ! PURE_SLP_STMT (stmt_info))
1570 ok = vectorizable_reduction (loop_vinfo,
1571 stmt_info, NULL, NULL, &cost_vec);
1572 }
1573
1574 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1575 if (ok
1576 && STMT_VINFO_LIVE_P (stmt_info)
1577 && !PURE_SLP_STMT (stmt_info))
1578 ok = vectorizable_live_operation (loop_vinfo,
1579 stmt_info, NULL, NULL, NULL,
1580 -1, false, &cost_vec);
1581
1582 if (!ok)
1583 return opt_result::failure_at (phi,
1584 "not vectorized: relevant phi not "
1585 "supported: %G",
1586 static_cast <gimple *> (phi));
1587 }
1588
1589 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1590 gsi_next (&si))
1591 {
1592 gimple *stmt = gsi_stmt (si);
1593 if (!gimple_clobber_p (stmt)
1594 && !is_gimple_debug (stmt))
1595 {
1596 opt_result res
1597 = vect_analyze_stmt (loop_vinfo,
1598 loop_vinfo->lookup_stmt (stmt),
1599 &need_to_vectorize,
1600 NULL, NULL, &cost_vec);
1601 if (!res)
1602 return res;
1603 }
1604 }
1605 } /* bbs */
1606
1607 add_stmt_costs (loop_vinfo, loop_vinfo->target_cost_data, &cost_vec);
1608
1609 /* All operations in the loop are either irrelevant (they deal with loop
1610 control, or are dead), or are only used outside the loop and can be moved
1611 out of the loop (e.g. invariants, inductions). The loop can be
1612 optimized away by scalar optimizations. We're better off not
1613 touching this loop. */
1614 if (!need_to_vectorize)
1615 {
1616 if (dump_enabled_p ())
1617 dump_printf_loc (MSG_NOTE, vect_location,
1618 "All the computation can be taken out of the loop.\n");
1619 return opt_result::failure_at
1620 (vect_location,
1621 "not vectorized: redundant loop. no profit to vectorize.\n");
1622 }
1623
1624 return opt_result::success ();
1625 }
1626
1627 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1628 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1629 definitely no, or -1 if it's worth retrying. */
1630
1631 static int
1632 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1633 {
1634 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1635 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1636
1637 /* Only fully-masked loops can have iteration counts less than the
1638 vectorization factor. */
1639 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1640 {
1641 HOST_WIDE_INT max_niter;
1642
1643 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1644 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1645 else
1646 max_niter = max_stmt_executions_int (loop);
1647
1648 if (max_niter != -1
1649 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1650 {
1651 if (dump_enabled_p ())
1652 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1653 "not vectorized: iteration count smaller than "
1654 "vectorization factor.\n");
1655 return 0;
1656 }
1657 }
1658
1659 int min_profitable_iters, min_profitable_estimate;
1660 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1661 &min_profitable_estimate);
1662
1663 if (min_profitable_iters < 0)
1664 {
1665 if (dump_enabled_p ())
1666 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1667 "not vectorized: vectorization not profitable.\n");
1668 if (dump_enabled_p ())
1669 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1670 "not vectorized: vector version will never be "
1671 "profitable.\n");
1672 return -1;
1673 }
1674
1675 int min_scalar_loop_bound = (param_min_vect_loop_bound
1676 * assumed_vf);
1677
1678 /* Use the cost model only if it is more conservative than the
1679 user-specified threshold. */
1680 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1681 min_profitable_iters);
1682
1683 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1684
1685 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1686 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1687 {
1688 if (dump_enabled_p ())
1689 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1690 "not vectorized: vectorization not profitable.\n");
1691 if (dump_enabled_p ())
1692 dump_printf_loc (MSG_NOTE, vect_location,
1693 "not vectorized: iteration count smaller than user "
1694 "specified loop bound parameter or minimum profitable "
1695 "iterations (whichever is more conservative).\n");
1696 return 0;
1697 }
1698
1699 /* The static profitability threshold min_profitable_estimate includes
1700 the cost of having to check at runtime whether the scalar loop
1701 should be used instead. If it turns out that we don't need or want
1702 such a check, the threshold we should use for the static estimate
1703 is simply the point at which the vector loop becomes more profitable
1704 than the scalar loop. */
1705 if (min_profitable_estimate > min_profitable_iters
1706 && !LOOP_REQUIRES_VERSIONING (loop_vinfo)
1707 && !LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
1708 && !LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1709 && !vect_apply_runtime_profitability_check_p (loop_vinfo))
1710 {
1711 if (dump_enabled_p ())
1712 dump_printf_loc (MSG_NOTE, vect_location, "no need for a runtime"
1713 " choice between the scalar and vector loops\n");
1714 min_profitable_estimate = min_profitable_iters;
1715 }
1716
1717 HOST_WIDE_INT estimated_niter;
1718
1719 /* If we are vectorizing an epilogue then we know the maximum number of
1720 scalar iterations it will cover is at least one lower than the
1721 vectorization factor of the main loop. */
1722 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1723 estimated_niter
1724 = vect_vf_for_cost (LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo)) - 1;
1725 else
1726 {
1727 estimated_niter = estimated_stmt_executions_int (loop);
1728 if (estimated_niter == -1)
1729 estimated_niter = likely_max_stmt_executions_int (loop);
1730 }
1731 if (estimated_niter != -1
1732 && ((unsigned HOST_WIDE_INT) estimated_niter
1733 < MAX (th, (unsigned) min_profitable_estimate)))
1734 {
1735 if (dump_enabled_p ())
1736 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1737 "not vectorized: estimated iteration count too "
1738 "small.\n");
1739 if (dump_enabled_p ())
1740 dump_printf_loc (MSG_NOTE, vect_location,
1741 "not vectorized: estimated iteration count smaller "
1742 "than specified loop bound parameter or minimum "
1743 "profitable iterations (whichever is more "
1744 "conservative).\n");
1745 return -1;
1746 }
1747
1748 return 1;
1749 }
1750
1751 static opt_result
1752 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1753 vec<data_reference_p> *datarefs,
1754 unsigned int *n_stmts)
1755 {
1756 *n_stmts = 0;
1757 for (unsigned i = 0; i < loop->num_nodes; i++)
1758 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1759 !gsi_end_p (gsi); gsi_next (&gsi))
1760 {
1761 gimple *stmt = gsi_stmt (gsi);
1762 if (is_gimple_debug (stmt))
1763 continue;
1764 ++(*n_stmts);
1765 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs);
1766 if (!res)
1767 {
1768 if (is_gimple_call (stmt) && loop->safelen)
1769 {
1770 tree fndecl = gimple_call_fndecl (stmt), op;
1771 if (fndecl != NULL_TREE)
1772 {
1773 cgraph_node *node = cgraph_node::get (fndecl);
1774 if (node != NULL && node->simd_clones != NULL)
1775 {
1776 unsigned int j, n = gimple_call_num_args (stmt);
1777 for (j = 0; j < n; j++)
1778 {
1779 op = gimple_call_arg (stmt, j);
1780 if (DECL_P (op)
1781 || (REFERENCE_CLASS_P (op)
1782 && get_base_address (op)))
1783 break;
1784 }
1785 op = gimple_call_lhs (stmt);
1786 /* Ignore #pragma omp declare simd functions
1787 if they don't have data references in the
1788 call stmt itself. */
1789 if (j == n
1790 && !(op
1791 && (DECL_P (op)
1792 || (REFERENCE_CLASS_P (op)
1793 && get_base_address (op)))))
1794 continue;
1795 }
1796 }
1797 }
1798 return res;
1799 }
1800 /* If dependence analysis will give up due to the limit on the
1801 number of datarefs, stop here and fail fatally. */
1802 if (datarefs->length ()
1803 > (unsigned)param_loop_max_datarefs_for_datadeps)
1804 return opt_result::failure_at (stmt, "exceeded param "
1805 "loop-max-datarefs-for-datadeps\n");
1806 }
1807 return opt_result::success ();
1808 }
1809
1810 /* Look for SLP-only access groups and turn each individual access into its own
1811 group. */
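/* As an illustrative example (identifiers are hypothetical): a group of two
   interleaved loads { a[2*i], a[2*i+1] } that was only usable via SLP is
   split into two single-element groups, each becoming its own
   DR_GROUP_FIRST_ELEMENT with DR_GROUP_SIZE 1 and, for non-strided
   accesses, DR_GROUP_GAP set to the old group size minus 1. */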
1812 static void
1813 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
1814 {
1815 unsigned int i;
1816 struct data_reference *dr;
1817
1818 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
1819
1820 vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs;
1821 FOR_EACH_VEC_ELT (datarefs, i, dr)
1822 {
1823 gcc_assert (DR_REF (dr));
1824 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
1825
1826 /* Check if the access is part of an interleaving chain. */
1827 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1828 {
1829 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
1830 unsigned int group_size = DR_GROUP_SIZE (first_element);
1831
1832 /* Check for SLP-only groups. */
1833 if (!STMT_SLP_TYPE (stmt_info)
1834 && STMT_VINFO_SLP_VECT_ONLY (first_element))
1835 {
1836 /* Dissolve the group. */
1837 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
1838
1839 stmt_vec_info vinfo = first_element;
1840 while (vinfo)
1841 {
1842 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
1843 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
1844 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
1845 DR_GROUP_SIZE (vinfo) = 1;
1846 if (STMT_VINFO_STRIDED_P (first_element))
1847 DR_GROUP_GAP (vinfo) = 0;
1848 else
1849 DR_GROUP_GAP (vinfo) = group_size - 1;
1850 vinfo = next;
1851 }
1852 }
1853 }
1854 }
1855 }
1856
1857
1858 /* Decides whether we need to create an epilogue loop to handle
1859 remaining scalar iterations and sets PEELING_FOR_NITERS accordingly. */
1860
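/* An example with made-up numbers: with a constant niters of 10, a VF of 4
   and no peeling for alignment or gaps, 10 is not a multiple of 4, so
   PEELING_FOR_NITER is set and an epilogue loop will handle the remaining
   2 scalar iterations. A fully-masked loop instead handles all iterations
   itself and needs no epilogue. */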
1861 void
1862 determine_peel_for_niter (loop_vec_info loop_vinfo)
1863 {
1864 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
1865
1866 unsigned HOST_WIDE_INT const_vf;
1867 HOST_WIDE_INT max_niter
1868 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1869
1870 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1871 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1872 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1873 (loop_vinfo));
1874
1875 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1876 /* The main loop handles all iterations. */
1877 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
1878 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1879 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1880 {
1881 /* Work out the (constant) number of iterations that need to be
1882 peeled for reasons other than niters. */
1883 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1884 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1885 peel_niter += 1;
1886 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1887 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1888 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1889 }
1890 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1891 /* ??? When peeling for gaps but not alignment, we could
1892 try to check whether the (variable) niters is known to be
1893 VF * N + 1. That's something of a niche case though. */
1894 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1895 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1896 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1897 < (unsigned) exact_log2 (const_vf))
1898 /* In case of versioning, check if the maximum number of
1899 iterations is greater than th. If they are identical,
1900 the epilogue is unnecessary. */
1901 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1902 || ((unsigned HOST_WIDE_INT) max_niter
1903 > (th / const_vf) * const_vf))))
1904 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1905 }
1906
1907
1908 /* Function vect_analyze_loop_2.
1909
1910 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1911 for it. The different analyses will record information in the
1912 loop_vec_info struct. */
1913 static opt_result
1914 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1915 {
1916 opt_result ok = opt_result::success ();
1917 int res;
1918 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1919 poly_uint64 min_vf = 2;
1920 loop_vec_info orig_loop_vinfo = NULL;
1921
1922 /* If we are dealing with an epilogue then orig_loop_vinfo points to the
1923 loop_vec_info of the first vectorized loop. */
1924 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1925 orig_loop_vinfo = LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo);
1926 else
1927 orig_loop_vinfo = loop_vinfo;
1928 gcc_assert (orig_loop_vinfo);
1929
1930 /* The first group of checks is independent of the vector size. */
1931 fatal = true;
1932
1933 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
1934 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
1935 return opt_result::failure_at (vect_location,
1936 "not vectorized: simd if(0)\n");
1937
1938 /* Find all data references in the loop (which correspond to vdefs/vuses)
1939 and analyze their evolution in the loop. */
1940
1941 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1942
1943 /* Gather the data references and count stmts in the loop. */
1944 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1945 {
1946 opt_result res
1947 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1948 &LOOP_VINFO_DATAREFS (loop_vinfo),
1949 n_stmts);
1950 if (!res)
1951 {
1952 if (dump_enabled_p ())
1953 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1954 "not vectorized: loop contains function "
1955 "calls or data references that cannot "
1956 "be analyzed\n");
1957 return res;
1958 }
1959 loop_vinfo->shared->save_datarefs ();
1960 }
1961 else
1962 loop_vinfo->shared->check_datarefs ();
1963
1964 /* Analyze the data references and also adjust the minimal
1965 vectorization factor according to the loads and stores. */
1966
1967 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
1968 if (!ok)
1969 {
1970 if (dump_enabled_p ())
1971 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1972 "bad data references.\n");
1973 return ok;
1974 }
1975
1976 /* Classify all cross-iteration scalar data-flow cycles.
1977 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1978 vect_analyze_scalar_cycles (loop_vinfo);
1979
1980 vect_pattern_recog (loop_vinfo);
1981
1982 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1983
1984 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1985 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1986
1987 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1988 if (!ok)
1989 {
1990 if (dump_enabled_p ())
1991 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1992 "bad data access.\n");
1993 return ok;
1994 }
1995
1996 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1997
1998 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
1999 if (!ok)
2000 {
2001 if (dump_enabled_p ())
2002 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2003 "unexpected pattern.\n");
2004 return ok;
2005 }
2006
2007 /* From here on failures are not necessarily fatal, even though the rest of the analysis below depends on the checks above in some way. */
2008 fatal = false;
2009
2010 /* Analyze data dependences between the data-refs in the loop
2011 and adjust the maximum vectorization factor according to
2012 the dependences.
2013 FORNOW: fail at the first data dependence that we encounter. */
2014
2015 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2016 if (!ok)
2017 {
2018 if (dump_enabled_p ())
2019 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2020 "bad data dependence.\n");
2021 return ok;
2022 }
2023 if (max_vf != MAX_VECTORIZATION_FACTOR
2024 && maybe_lt (max_vf, min_vf))
2025 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2026 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2027
2028 ok = vect_determine_vectorization_factor (loop_vinfo);
2029 if (!ok)
2030 {
2031 if (dump_enabled_p ())
2032 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2033 "can't determine vectorization factor.\n");
2034 return ok;
2035 }
2036 if (max_vf != MAX_VECTORIZATION_FACTOR
2037 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2038 return opt_result::failure_at (vect_location, "bad data dependence.\n");
2039
2040 /* Compute the scalar iteration cost. */
2041 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2042
2043 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2044
2045 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2046 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
2047 if (!ok)
2048 return ok;
2049
2050 /* If there are any SLP instances mark them as pure_slp. */
2051 bool slp = vect_make_slp_decision (loop_vinfo);
2052 if (slp)
2053 {
2054 /* Find stmts that need to be both vectorized and SLPed. */
2055 vect_detect_hybrid_slp (loop_vinfo);
2056
2057 /* Update the vectorization factor based on the SLP decision. */
2058 vect_update_vf_for_slp (loop_vinfo);
2059
2060 /* Optimize the SLP graph with the vectorization factor fixed. */
2061 vect_optimize_slp (loop_vinfo);
2062 }
2063
2064 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
2065
2066 /* We don't expect to have to roll back to anything other than an empty
2067 set of rgroups. */
2068 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2069
2070 /* This is the point where we can re-start analysis with SLP forced off. */
2071 start_over:
2072
2073 /* Now the vectorization factor is final. */
2074 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2075 gcc_assert (known_ne (vectorization_factor, 0U));
2076
2077 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2078 {
2079 dump_printf_loc (MSG_NOTE, vect_location,
2080 "vectorization_factor = ");
2081 dump_dec (MSG_NOTE, vectorization_factor);
2082 dump_printf (MSG_NOTE, ", niters = %wd\n",
2083 LOOP_VINFO_INT_NITERS (loop_vinfo));
2084 }
2085
2086 /* Analyze the alignment of the data-refs in the loop.
2087 Fail if a data reference is found that cannot be vectorized. */
2088
2089 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2090 if (!ok)
2091 {
2092 if (dump_enabled_p ())
2093 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2094 "bad data alignment.\n");
2095 return ok;
2096 }
2097
2098 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2099 It is important to call pruning after vect_analyze_data_ref_accesses,
2100 since we use grouping information gathered by interleaving analysis. */
2101 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2102 if (!ok)
2103 return ok;
2104
2105 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2106 vectorization, since we do not want to add extra peeling or
2107 add versioning for alignment. */
2108 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2109 /* This pass will decide on using loop versioning and/or loop peeling in
2110 order to enhance the alignment of data references in the loop. */
2111 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2112 else
2113 ok = vect_verify_datarefs_alignment (loop_vinfo);
2114 if (!ok)
2115 return ok;
2116
2117 if (slp)
2118 {
2119 /* Analyze operations in the SLP instances. Note this may
2120 remove unsupported SLP instances which makes the above
2121 SLP kind detection invalid. */
2122 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2123 vect_slp_analyze_operations (loop_vinfo);
2124 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2125 {
2126 ok = opt_result::failure_at (vect_location,
2127 "unsupported SLP instances\n");
2128 goto again;
2129 }
2130 }
2131
2132 /* Dissolve SLP-only groups. */
2133 vect_dissolve_slp_only_groups (loop_vinfo);
2134
2135 /* Scan all the remaining operations in the loop that are not subject
2136 to SLP and make sure they are vectorizable. */
2137 ok = vect_analyze_loop_operations (loop_vinfo);
2138 if (!ok)
2139 {
2140 if (dump_enabled_p ())
2141 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2142 "bad operation or unsupported loop bound.\n");
2143 return ok;
2144 }
2145
2146 /* Decide whether to use a fully-masked loop for this vectorization
2147 factor. */
2148 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2149 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2150 && vect_verify_full_masking (loop_vinfo));
2151 if (dump_enabled_p ())
2152 {
2153 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2154 dump_printf_loc (MSG_NOTE, vect_location,
2155 "using a fully-masked loop.\n");
2156 else
2157 dump_printf_loc (MSG_NOTE, vect_location,
2158 "not using a fully-masked loop.\n");
2159 }
2160
2161 /* If an epilog loop is required because of data accesses with gaps,
2162 one additional iteration needs to be peeled. Check if there are
2163 enough iterations for vectorization. */
2164 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2165 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2166 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2167 {
2168 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2169 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2170
2171 if (known_lt (wi::to_widest (scalar_niters), vf))
2172 return opt_result::failure_at (vect_location,
2173 "loop has no enough iterations to"
2174 " support peeling for gaps.\n");
2175 }
2176
2177 /* If we're vectorizing an epilogue loop, we either need a fully-masked
2178 loop or a loop that has a lower VF than the main loop. */
2179 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo)
2180 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2181 && maybe_ge (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2182 LOOP_VINFO_VECT_FACTOR (orig_loop_vinfo)))
2183 return opt_result::failure_at (vect_location,
2184 "Vectorization factor too high for"
2185 " epilogue loop.\n");
2186
2187 /* Check the costings of the loop make vectorizing worthwhile. */
2188 res = vect_analyze_loop_costing (loop_vinfo);
2189 if (res < 0)
2190 {
2191 ok = opt_result::failure_at (vect_location,
2192 "Loop costings may not be worthwhile.\n");
2193 goto again;
2194 }
2195 if (!res)
2196 return opt_result::failure_at (vect_location,
2197 "Loop costings not worthwhile.\n");
2198
2199 determine_peel_for_niter (loop_vinfo);
2200 /* If an epilogue loop is required make sure we can create one. */
2201 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2202 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2203 {
2204 if (dump_enabled_p ())
2205 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2206 if (!vect_can_advance_ivs_p (loop_vinfo)
2207 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2208 single_exit (LOOP_VINFO_LOOP
2209 (loop_vinfo))))
2210 {
2211 ok = opt_result::failure_at (vect_location,
2212 "not vectorized: can't create required "
2213 "epilog loop\n");
2214 goto again;
2215 }
2216 }
2217
2218 /* During peeling, we need to check whether the number of loop iterations
2219 is enough for both the peeled prolog loop and the vector loop. This check
2220 can be merged along with threshold check of loop versioning, so
2221 increase threshold for this case if necessary.
2222
2223 If we are analyzing an epilogue we still want to check what its
2224 versioning threshold would be. If we decide to vectorize the epilogues we
2225 will want to use the lowest versioning threshold of all epilogues and main
2226 loop. This will enable us to enter a vectorized epilogue even when
2227 versioning the loop. We can't simply check whether the epilogue requires
2228 versioning though since we may have skipped some versioning checks when
2229 analyzing the epilogue. For instance, checks for alias versioning will be
2230 skipped when dealing with epilogues as we assume we already checked them
2231 for the main loop. So instead we always check the 'orig_loop_vinfo'. */
2232 if (LOOP_REQUIRES_VERSIONING (orig_loop_vinfo))
2233 {
2234 poly_uint64 niters_th = 0;
2235 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2236
2237 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2238 {
2239 /* Niters for peeled prolog loop. */
2240 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2241 {
2242 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2243 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2244 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2245 }
2246 else
2247 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2248 }
2249
2250 /* Niters for at least one iteration of vectorized loop. */
2251 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2252 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2253 /* One additional iteration because of peeling for gap. */
2254 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2255 niters_th += 1;
2256
2257 /* Use the same condition as vect_transform_loop to decide when to use
2258 the cost to determine a versioning threshold. */
2259 if (vect_apply_runtime_profitability_check_p (loop_vinfo)
2260 && ordered_p (th, niters_th))
2261 niters_th = ordered_max (poly_uint64 (th), niters_th);
2262
2263 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2264 }
2265
2266 gcc_assert (known_eq (vectorization_factor,
2267 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2268
2269 /* Ok to vectorize! */
2270 return opt_result::success ();
2271
2272 again:
2273 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2274 gcc_assert (!ok);
2275
2276 /* Try again with SLP forced off but if we didn't do any SLP there is
2277 no point in re-trying. */
2278 if (!slp)
2279 return ok;
2280
2281 /* If there are reduction chains re-trying will fail anyway. */
2282 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2283 return ok;
2284
2285 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2286 via interleaving or lane instructions. */
2287 slp_instance instance;
2288 slp_tree node;
2289 unsigned i, j;
2290 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2291 {
2292 stmt_vec_info vinfo;
2293 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2294 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2295 continue;
2296 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2297 unsigned int size = DR_GROUP_SIZE (vinfo);
2298 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2299 if (! vect_store_lanes_supported (vectype, size, false)
2300 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2301 && ! vect_grouped_store_supported (vectype, size))
2302 return opt_result::failure_at (vinfo->stmt,
2303 "unsupported grouped store\n");
2304 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2305 {
2306 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2307 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2308 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2309 size = DR_GROUP_SIZE (vinfo);
2310 vectype = STMT_VINFO_VECTYPE (vinfo);
2311 if (! vect_load_lanes_supported (vectype, size, false)
2312 && ! vect_grouped_load_supported (vectype, single_element_p,
2313 size))
2314 return opt_result::failure_at (vinfo->stmt,
2315 "unsupported grouped load\n");
2316 }
2317 }
2318
2319 if (dump_enabled_p ())
2320 dump_printf_loc (MSG_NOTE, vect_location,
2321 "re-trying with SLP disabled\n");
2322
2323 /* Roll back state appropriately. No SLP this time. */
2324 slp = false;
2325 /* Restore the vectorization factor as it was without SLP. */
2326 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2327 /* Free the SLP instances. */
2328 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2329 vect_free_slp_instance (instance, false);
2330 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2331 /* Reset SLP type to loop_vect on all stmts. */
2332 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2333 {
2334 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2335 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2336 !gsi_end_p (si); gsi_next (&si))
2337 {
2338 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2339 STMT_SLP_TYPE (stmt_info) = loop_vect;
2340 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
2341 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
2342 {
2343 /* vectorizable_reduction adjusts reduction stmt def-types,
2344 restore them to that of the PHI. */
2345 STMT_VINFO_DEF_TYPE (STMT_VINFO_REDUC_DEF (stmt_info))
2346 = STMT_VINFO_DEF_TYPE (stmt_info);
2347 STMT_VINFO_DEF_TYPE (vect_stmt_to_vectorize
2348 (STMT_VINFO_REDUC_DEF (stmt_info)))
2349 = STMT_VINFO_DEF_TYPE (stmt_info);
2350 }
2351 }
2352 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2353 !gsi_end_p (si); gsi_next (&si))
2354 {
2355 if (is_gimple_debug (gsi_stmt (si)))
2356 continue;
2357 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2358 STMT_SLP_TYPE (stmt_info) = loop_vect;
2359 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2360 {
2361 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2362 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2363 STMT_SLP_TYPE (stmt_info) = loop_vect;
2364 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2365 !gsi_end_p (pi); gsi_next (&pi))
2366 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2367 = loop_vect;
2368 }
2369 }
2370 }
2371 /* Free optimized alias test DDRS. */
2372 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2373 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2374 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2375 /* Reset target cost data. */
2376 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2377 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2378 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2379 /* Reset accumulated rgroup information. */
2380 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2381 /* Reset assorted flags. */
2382 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2383 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2384 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2385 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2386 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2387
2388 goto start_over;
2389 }
2390
2391 /* Return true if vectorizing a loop using NEW_LOOP_VINFO appears
2392 to be better than vectorizing it using OLD_LOOP_VINFO. Assume that
2393 OLD_LOOP_VINFO is better unless something specifically indicates
2394 otherwise.
2395
2396 Note that this deliberately isn't a partial order. */
2397
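/* A numeric sketch of the comparison done below (values are hypothetical):
   with new_inside_cost = 20 at new_vf = 8 and old_inside_cost = 12 at
   old_vf = 4, the cross products are 20 * 4 = 80 versus 12 * 8 = 96, so
   the new loop_vinfo has the lower cost per scalar iteration and is
   preferred, ignoring the simdlen and variable-VF special cases handled
   explicitly in the function. */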
2398 static bool
2399 vect_better_loop_vinfo_p (loop_vec_info new_loop_vinfo,
2400 loop_vec_info old_loop_vinfo)
2401 {
2402 struct loop *loop = LOOP_VINFO_LOOP (new_loop_vinfo);
2403 gcc_assert (LOOP_VINFO_LOOP (old_loop_vinfo) == loop);
2404
2405 poly_int64 new_vf = LOOP_VINFO_VECT_FACTOR (new_loop_vinfo);
2406 poly_int64 old_vf = LOOP_VINFO_VECT_FACTOR (old_loop_vinfo);
2407
2408 /* Always prefer a VF of loop->simdlen over any other VF. */
2409 if (loop->simdlen)
2410 {
2411 bool new_simdlen_p = known_eq (new_vf, loop->simdlen);
2412 bool old_simdlen_p = known_eq (old_vf, loop->simdlen);
2413 if (new_simdlen_p != old_simdlen_p)
2414 return new_simdlen_p;
2415 }
2416
2417 /* Limit the VFs to what is likely to be the maximum number of iterations,
2418 to handle cases in which at least one loop_vinfo is fully-masked. */
2419 HOST_WIDE_INT estimated_max_niter = likely_max_stmt_executions_int (loop);
2420 if (estimated_max_niter != -1)
2421 {
2422 if (known_le (estimated_max_niter, new_vf))
2423 new_vf = estimated_max_niter;
2424 if (known_le (estimated_max_niter, old_vf))
2425 old_vf = estimated_max_niter;
2426 }
2427
2428 /* Check whether the (fractional) cost per scalar iteration is lower
2429 or higher: new_inside_cost / new_vf vs. old_inside_cost / old_vf. */
2430 poly_widest_int rel_new = (new_loop_vinfo->vec_inside_cost
2431 * poly_widest_int (old_vf));
2432 poly_widest_int rel_old = (old_loop_vinfo->vec_inside_cost
2433 * poly_widest_int (new_vf));
2434 if (maybe_lt (rel_old, rel_new))
2435 {
2436 /* When old_loop_vinfo uses a variable vectorization factor,
2437 we know that it has a lower cost for at least one runtime VF.
2438 However, we don't know how likely that VF is.
2439
2440 One option would be to compare the costs for the estimated VFs.
2441 The problem is that that can put too much pressure on the cost
2442 model. E.g. if the estimated VF is also the lowest possible VF,
2443 and if old_loop_vinfo is 1 unit worse than new_loop_vinfo
2444 for the estimated VF, we'd then choose new_loop_vinfo even
2445 though (a) new_loop_vinfo might not actually be better than
2446 old_loop_vinfo for that VF and (b) it would be significantly
2447 worse at larger VFs.
2448
2449 Here we go for a hacky compromise: pick new_loop_vinfo if it is
2450 no more expensive than old_loop_vinfo even after doubling the
2451 estimated old_loop_vinfo VF. For all but trivial loops, this
2452 ensures that we only pick new_loop_vinfo if it is significantly
2453 better than old_loop_vinfo at the estimated VF. */
2454 if (rel_new.is_constant ())
2455 return false;
2456
2457 HOST_WIDE_INT new_estimated_vf = estimated_poly_value (new_vf);
2458 HOST_WIDE_INT old_estimated_vf = estimated_poly_value (old_vf);
2459 widest_int estimated_rel_new = (new_loop_vinfo->vec_inside_cost
2460 * widest_int (old_estimated_vf));
2461 widest_int estimated_rel_old = (old_loop_vinfo->vec_inside_cost
2462 * widest_int (new_estimated_vf));
2463 return estimated_rel_new * 2 <= estimated_rel_old;
2464 }
2465 if (known_lt (rel_new, rel_old))
2466 return true;
2467
2468 /* If there's nothing to choose between the loop bodies, see whether
2469 there's a difference in the prologue and epilogue costs. */
2470 if (new_loop_vinfo->vec_outside_cost != old_loop_vinfo->vec_outside_cost)
2471 return new_loop_vinfo->vec_outside_cost < old_loop_vinfo->vec_outside_cost;
2472
2473 return false;
2474 }
2475
2476 /* Decide whether to replace OLD_LOOP_VINFO with NEW_LOOP_VINFO. Return
2477 true if we should. */
2478
2479 static bool
2480 vect_joust_loop_vinfos (loop_vec_info new_loop_vinfo,
2481 loop_vec_info old_loop_vinfo)
2482 {
2483 if (!vect_better_loop_vinfo_p (new_loop_vinfo, old_loop_vinfo))
2484 return false;
2485
2486 if (dump_enabled_p ())
2487 dump_printf_loc (MSG_NOTE, vect_location,
2488 "***** Preferring vector mode %s to vector mode %s\n",
2489 GET_MODE_NAME (new_loop_vinfo->vector_mode),
2490 GET_MODE_NAME (old_loop_vinfo->vector_mode));
2491 return true;
2492 }
2493
2494 /* Function vect_analyze_loop.
2495
2496 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2497 for it. The different analyses will record information in the
2498 loop_vec_info struct. */
2499 opt_loop_vec_info
2500 vect_analyze_loop (class loop *loop, vec_info_shared *shared)
2501 {
2502 auto_vector_modes vector_modes;
2503
2504 /* Autodetect first vector size we try. */
2505 unsigned int autovec_flags
2506 = targetm.vectorize.autovectorize_vector_modes (&vector_modes,
2507 loop->simdlen != 0);
2508 unsigned int mode_i = 0;
2509
2510 DUMP_VECT_SCOPE ("analyze_loop_nest");
2511
2512 if (loop_outer (loop)
2513 && loop_vec_info_for_loop (loop_outer (loop))
2514 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2515 return opt_loop_vec_info::failure_at (vect_location,
2516 "outer-loop already vectorized.\n");
2517
2518 if (!find_loop_nest (loop, &shared->loop_nest))
2519 return opt_loop_vec_info::failure_at
2520 (vect_location,
2521 "not vectorized: loop nest containing two or more consecutive inner"
2522 " loops cannot be vectorized\n");
2523
2524 unsigned n_stmts = 0;
2525 machine_mode autodetected_vector_mode = VOIDmode;
2526 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2527 machine_mode next_vector_mode = VOIDmode;
2528 poly_uint64 lowest_th = 0;
2529 unsigned vectorized_loops = 0;
2530 bool pick_lowest_cost_p = ((autovec_flags & VECT_COMPARE_COSTS)
2531 && !unlimited_cost_model (loop));
2532
2533 bool vect_epilogues = false;
2534 opt_result res = opt_result::success ();
2535 unsigned HOST_WIDE_INT simdlen = loop->simdlen;
2536 while (1)
2537 {
2538 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2539 opt_loop_vec_info loop_vinfo = vect_analyze_loop_form (loop, shared);
2540 if (!loop_vinfo)
2541 {
2542 if (dump_enabled_p ())
2543 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2544 "bad loop form.\n");
2545 gcc_checking_assert (first_loop_vinfo == NULL);
2546 return loop_vinfo;
2547 }
2548 loop_vinfo->vector_mode = next_vector_mode;
2549
2550 bool fatal = false;
2551
2552 /* When pick_lowest_cost_p is true, we should in principle iterate
2553 over all the loop_vec_infos that LOOP_VINFO could replace and
2554 try to vectorize LOOP_VINFO under the same conditions.
2555 E.g. when trying to replace an epilogue loop, we should vectorize
2556 LOOP_VINFO as an epilogue loop with the same VF limit. When trying
2557 to replace the main loop, we should vectorize LOOP_VINFO as a main
2558 loop too.
2559
2560 However, autovectorize_vector_modes is usually sorted as follows:
2561
2562 - Modes that naturally produce lower VFs usually follow modes that
2563 naturally produce higher VFs.
2564
2565 - When modes naturally produce the same VF, maskable modes
2566 usually follow unmaskable ones, so that the maskable mode
2567 can be used to vectorize the epilogue of the unmaskable mode.
2568
2569 This order is preferred because it leads to the maximum
2570 epilogue vectorization opportunities. Targets should only use
2571 a different order if they want to make wide modes available while
2572 disparaging them relative to earlier, smaller modes. The assumption
2573 in that case is that the wider modes are more expensive in some
2574 way that isn't reflected directly in the costs.
2575
2576 There should therefore be few interesting cases in which
2577 LOOP_VINFO fails when treated as an epilogue loop, succeeds when
2578 treated as a standalone loop, and ends up being genuinely cheaper
2579 than FIRST_LOOP_VINFO. */
2580 if (vect_epilogues)
2581 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = first_loop_vinfo;
2582
2583 res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2584 if (mode_i == 0)
2585 autodetected_vector_mode = loop_vinfo->vector_mode;
2586 if (dump_enabled_p ())
2587 {
2588 if (res)
2589 dump_printf_loc (MSG_NOTE, vect_location,
2590 "***** Analysis succeeded with vector mode %s\n",
2591 GET_MODE_NAME (loop_vinfo->vector_mode));
2592 else
2593 dump_printf_loc (MSG_NOTE, vect_location,
2594 "***** Analysis failed with vector mode %s\n",
2595 GET_MODE_NAME (loop_vinfo->vector_mode));
2596 }
2597
2598 loop->aux = NULL;
2599
2600 if (!fatal)
2601 while (mode_i < vector_modes.length ()
2602 && vect_chooses_same_modes_p (loop_vinfo, vector_modes[mode_i]))
2603 {
2604 if (dump_enabled_p ())
2605 dump_printf_loc (MSG_NOTE, vect_location,
2606 "***** The result for vector mode %s would"
2607 " be the same\n",
2608 GET_MODE_NAME (vector_modes[mode_i]));
2609 mode_i += 1;
2610 }
2611
2612 if (res)
2613 {
2614 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2615 vectorized_loops++;
2616
2617 /* Once we hit the desired simdlen for the first time,
2618 discard any previous attempts. */
2619 if (simdlen
2620 && known_eq (LOOP_VINFO_VECT_FACTOR (loop_vinfo), simdlen))
2621 {
2622 delete first_loop_vinfo;
2623 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2624 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2625 simdlen = 0;
2626 }
2627 else if (pick_lowest_cost_p && first_loop_vinfo)
2628 {
2629 /* Keep trying to roll back vectorization attempts while the
2630 loop_vec_infos they produced were worse than this one. */
2631 vec<loop_vec_info> &vinfos = first_loop_vinfo->epilogue_vinfos;
2632 while (!vinfos.is_empty ()
2633 && vect_joust_loop_vinfos (loop_vinfo, vinfos.last ()))
2634 {
2635 gcc_assert (vect_epilogues);
2636 delete vinfos.pop ();
2637 }
2638 if (vinfos.is_empty ()
2639 && vect_joust_loop_vinfos (loop_vinfo, first_loop_vinfo))
2640 {
2641 delete first_loop_vinfo;
2642 first_loop_vinfo = opt_loop_vec_info::success (NULL);
2643 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = NULL;
2644 }
2645 }
2646
2647 if (first_loop_vinfo == NULL)
2648 {
2649 first_loop_vinfo = loop_vinfo;
2650 lowest_th = LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo);
2651 }
2652 else if (vect_epilogues
2653 /* For now only allow one epilogue loop. */
2654 && first_loop_vinfo->epilogue_vinfos.is_empty ())
2655 {
2656 first_loop_vinfo->epilogue_vinfos.safe_push (loop_vinfo);
2657 poly_uint64 th = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
2658 gcc_assert (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2659 || maybe_ne (lowest_th, 0U));
2660 /* Keep track of the known smallest versioning
2661 threshold. */
2662 if (ordered_p (lowest_th, th))
2663 lowest_th = ordered_min (lowest_th, th);
2664 }
2665 else
2666 delete loop_vinfo;
2667
2668 /* Only vectorize epilogues if PARAM_VECT_EPILOGUES_NOMASK is
2669 enabled, SIMDUID is not set, it is the innermost loop and we have
2670 either already found the loop's SIMDLEN or there was no SIMDLEN to
2671 begin with.
2672 TODO: Enable epilogue vectorization for loops with SIMDUID set. */
2673 vect_epilogues = (!simdlen
2674 && loop->inner == NULL
2675 && param_vect_epilogues_nomask
2676 && LOOP_VINFO_PEELING_FOR_NITER (first_loop_vinfo)
2677 && !loop->simduid
2678 /* For now only allow one epilogue loop, but allow
2679 pick_lowest_cost_p to replace it. */
2680 && (first_loop_vinfo->epilogue_vinfos.is_empty ()
2681 || pick_lowest_cost_p));
2682
2683 /* Commit to first_loop_vinfo if we have no reason to try
2684 alternatives. */
2685 if (!simdlen && !vect_epilogues && !pick_lowest_cost_p)
2686 break;
2687 }
2688 else
2689 {
2690 delete loop_vinfo;
2691 if (fatal)
2692 {
2693 gcc_checking_assert (first_loop_vinfo == NULL);
2694 break;
2695 }
2696 }
2697
2698 if (mode_i < vector_modes.length ()
2699 && VECTOR_MODE_P (autodetected_vector_mode)
2700 && (related_vector_mode (vector_modes[mode_i],
2701 GET_MODE_INNER (autodetected_vector_mode))
2702 == autodetected_vector_mode)
2703 && (related_vector_mode (autodetected_vector_mode,
2704 GET_MODE_INNER (vector_modes[mode_i]))
2705 == vector_modes[mode_i]))
2706 {
2707 if (dump_enabled_p ())
2708 dump_printf_loc (MSG_NOTE, vect_location,
2709 "***** Skipping vector mode %s, which would"
2710 " repeat the analysis for %s\n",
2711 GET_MODE_NAME (vector_modes[mode_i]),
2712 GET_MODE_NAME (autodetected_vector_mode));
2713 mode_i += 1;
2714 }
2715
2716 if (mode_i == vector_modes.length ()
2717 || autodetected_vector_mode == VOIDmode)
2718 break;
2719
2720 /* Try the next biggest vector size. */
2721 next_vector_mode = vector_modes[mode_i++];
2722 if (dump_enabled_p ())
2723 dump_printf_loc (MSG_NOTE, vect_location,
2724 "***** Re-trying analysis with vector mode %s\n",
2725 GET_MODE_NAME (next_vector_mode));
2726 }
2727
2728 if (first_loop_vinfo)
2729 {
2730 loop->aux = (loop_vec_info) first_loop_vinfo;
2731 if (dump_enabled_p ())
2732 dump_printf_loc (MSG_NOTE, vect_location,
2733 "***** Choosing vector mode %s\n",
2734 GET_MODE_NAME (first_loop_vinfo->vector_mode));
2735 LOOP_VINFO_VERSIONING_THRESHOLD (first_loop_vinfo) = lowest_th;
2736 return first_loop_vinfo;
2737 }
2738
2739 return opt_loop_vec_info::propagate_failure (res);
2740 }
2741
2742 /* Return true if there is an in-order reduction function for CODE, storing
2743 it in *REDUC_FN if so. */
2744
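/* For example, a source loop such as

     double s = 0.0;
     for (int i = 0; i < n; i++)
       s += a[i];

   compiled without -fassociative-math must preserve the left-to-right
   order of the additions, so its PLUS_EXPR reduction maps to
   IFN_FOLD_LEFT_PLUS here rather than to a tree-wise IFN_REDUC_PLUS. */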
2745 static bool
2746 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2747 {
2748 switch (code)
2749 {
2750 case PLUS_EXPR:
2751 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2752 return true;
2753
2754 default:
2755 return false;
2756 }
2757 }
2758
2759 /* Function reduction_fn_for_scalar_code
2760
2761 Input:
2762 CODE - tree_code of a reduction operation.
2763
2764 Output:
2765 REDUC_FN - the corresponding internal function to be used to reduce the
2766 vector of partial results into a single scalar result, or IFN_LAST
2767 if the operation is a supported reduction operation, but does not have
2768 such an internal function.
2769
2770 Return FALSE if CODE currently cannot be vectorized as reduction. */
2771
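/* For example, an integer maximum reduction such as

     int m = INT_MIN;
     for (int i = 0; i < n; i++)
       m = m > a[i] ? m : a[i];

   is a MAX_EXPR reduction, and the vector of partial maxima is reduced
   to a scalar with IFN_REDUC_MAX. MULT_EXPR and MINUS_EXPR are accepted
   as reductions but report IFN_LAST, meaning the final reduction must be
   open-coded instead of using a single internal function. */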
2772 static bool
2773 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2774 {
2775 switch (code)
2776 {
2777 case MAX_EXPR:
2778 *reduc_fn = IFN_REDUC_MAX;
2779 return true;
2780
2781 case MIN_EXPR:
2782 *reduc_fn = IFN_REDUC_MIN;
2783 return true;
2784
2785 case PLUS_EXPR:
2786 *reduc_fn = IFN_REDUC_PLUS;
2787 return true;
2788
2789 case BIT_AND_EXPR:
2790 *reduc_fn = IFN_REDUC_AND;
2791 return true;
2792
2793 case BIT_IOR_EXPR:
2794 *reduc_fn = IFN_REDUC_IOR;
2795 return true;
2796
2797 case BIT_XOR_EXPR:
2798 *reduc_fn = IFN_REDUC_XOR;
2799 return true;
2800
2801 case MULT_EXPR:
2802 case MINUS_EXPR:
2803 *reduc_fn = IFN_LAST;
2804 return true;
2805
2806 default:
2807 return false;
2808 }
2809 }
2810
2811 /* If there is a neutral value X such that SLP reduction NODE would not
2812 be affected by the introduction of additional X elements, return that X,
2813 otherwise return null. CODE is the code of the reduction and VECTOR_TYPE
2814 is the vector type that would hold element X. REDUC_CHAIN is true if
2815 the SLP statements perform a single reduction, false if each statement
2816 performs an independent reduction. */
2817
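/* For illustration: padding a PLUS_EXPR or MINUS_EXPR reduction with extra
   zeros, a MULT_EXPR reduction with extra ones, or a BIT_AND_EXPR
   reduction with extra all-ones elements leaves the result unchanged, so
   those constants are the neutral values returned below. MIN_EXPR and
   MAX_EXPR have no universal neutral value; for a reduction chain the
   single initial value itself is used. */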
2818 static tree
2819 neutral_op_for_slp_reduction (slp_tree slp_node, tree vector_type,
2820 tree_code code, bool reduc_chain)
2821 {
2822 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2823 stmt_vec_info stmt_vinfo = stmts[0];
2824 tree scalar_type = TREE_TYPE (vector_type);
2825 class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2826 gcc_assert (loop);
2827
2828 switch (code)
2829 {
2830 case WIDEN_SUM_EXPR:
2831 case DOT_PROD_EXPR:
2832 case SAD_EXPR:
2833 case PLUS_EXPR:
2834 case MINUS_EXPR:
2835 case BIT_IOR_EXPR:
2836 case BIT_XOR_EXPR:
2837 return build_zero_cst (scalar_type);
2838
2839 case MULT_EXPR:
2840 return build_one_cst (scalar_type);
2841
2842 case BIT_AND_EXPR:
2843 return build_all_ones_cst (scalar_type);
2844
2845 case MAX_EXPR:
2846 case MIN_EXPR:
2847 /* For MIN/MAX the initial values are neutral. A reduction chain
2848 has only a single initial value, so that value is neutral for
2849 all statements. */
2850 if (reduc_chain)
2851 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2852 loop_preheader_edge (loop));
2853 return NULL_TREE;
2854
2855 default:
2856 return NULL_TREE;
2857 }
2858 }
2859
2860 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2861 STMT is printed with a message MSG. */
2862
2863 static void
2864 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2865 {
2866 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
2867 }
2868
2869 /* Return true if we need an in-order (fold-left) reduction for
2870 operation CODE on type TYPE, i.e. if reassociating the operation
2871 could change the result. */
2872
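/* For example, in IEEE float arithmetic (1e30f + -1e30f) + 1.0f is 1.0f
   while 1e30f + (-1e30f + 1.0f) is 0.0f, so a float summation must be
   kept in order unless -fassociative-math allows reassociation; MIN and
   MAX are safe regardless. Similarly, integral types whose overflow can
   trap and saturating fixed-point types force an in-order reduction
   below. */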
2873 bool
2874 needs_fold_left_reduction_p (tree type, tree_code code)
2875 {
2876 /* CHECKME: check for !flag_finite_math_only too? */
2877 if (SCALAR_FLOAT_TYPE_P (type))
2878 switch (code)
2879 {
2880 case MIN_EXPR:
2881 case MAX_EXPR:
2882 return false;
2883
2884 default:
2885 return !flag_associative_math;
2886 }
2887
2888 if (INTEGRAL_TYPE_P (type))
2889 {
2890 if (!operation_no_trapping_overflow (type, code))
2891 return true;
2892 return false;
2893 }
2894
2895 if (SAT_FIXED_POINT_TYPE_P (type))
2896 return true;
2897
2898 return false;
2899 }
2900
2901 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2902 has a handled computation expression. Store the main reduction
2903 operation in *CODE. */
2904
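/* A rough sketch of what the walk below collects: for

     s_1 = PHI <s_0 (preheader), s_3 (latch)>
     s_2 = s_1 + a[i];
     s_3 = s_2 + b[i];

   starting from the latch argument s_3 it records the use chain
   s_3 -> s_2 -> s_1 back to the PHI result, checks that each
   intermediate value is used in exactly one statement inside the loop
   and that all statements use essentially the same code (allowing for
   conversions and for MINUS_EXPR treated as PLUS_EXPR), and reports
   PLUS_EXPR in *CODE. */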
2905 static bool
2906 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2907 tree loop_arg, enum tree_code *code,
2908 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
2909 {
2910 auto_bitmap visited;
2911 tree lookfor = PHI_RESULT (phi);
2912 ssa_op_iter curri;
2913 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2914 while (USE_FROM_PTR (curr) != loop_arg)
2915 curr = op_iter_next_use (&curri);
2916 curri.i = curri.numops;
2917 do
2918 {
2919 path.safe_push (std::make_pair (curri, curr));
2920 tree use = USE_FROM_PTR (curr);
2921 if (use == lookfor)
2922 break;
2923 gimple *def = SSA_NAME_DEF_STMT (use);
2924 if (gimple_nop_p (def)
2925 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2926 {
2927 pop:
2928 do
2929 {
2930 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2931 curri = x.first;
2932 curr = x.second;
2933 do
2934 curr = op_iter_next_use (&curri);
2935 /* Skip already visited or non-SSA operands (from iterating
2936 over PHI args). */
2937 while (curr != NULL_USE_OPERAND_P
2938 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2939 || ! bitmap_set_bit (visited,
2940 SSA_NAME_VERSION
2941 (USE_FROM_PTR (curr)))));
2942 }
2943 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2944 if (curr == NULL_USE_OPERAND_P)
2945 break;
2946 }
2947 else
2948 {
2949 if (gimple_code (def) == GIMPLE_PHI)
2950 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2951 else
2952 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2953 while (curr != NULL_USE_OPERAND_P
2954 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2955 || ! bitmap_set_bit (visited,
2956 SSA_NAME_VERSION
2957 (USE_FROM_PTR (curr)))))
2958 curr = op_iter_next_use (&curri);
2959 if (curr == NULL_USE_OPERAND_P)
2960 goto pop;
2961 }
2962 }
2963 while (1);
2964 if (dump_file && (dump_flags & TDF_DETAILS))
2965 {
2966 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2967 unsigned i;
2968 std::pair<ssa_op_iter, use_operand_p> *x;
2969 FOR_EACH_VEC_ELT (path, i, x)
2970 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
2971 dump_printf (MSG_NOTE, "\n");
2972 }
2973
2974 /* Check whether the reduction path detected is valid. */
2975 bool fail = path.length () == 0;
2976 bool neg = false;
2977 int sign = -1;
2978 *code = ERROR_MARK;
2979 for (unsigned i = 1; i < path.length (); ++i)
2980 {
2981 gimple *use_stmt = USE_STMT (path[i].second);
2982 tree op = USE_FROM_PTR (path[i].second);
2983 if (! is_gimple_assign (use_stmt)
2984 /* The following makes sure we can compute the operand index
2985 easily, and it mostly disallows chaining via COND_EXPR condition
2986 operands. */
2987 || (gimple_assign_rhs1_ptr (use_stmt) != path[i].second->use
2988 && (gimple_num_ops (use_stmt) <= 2
2989 || gimple_assign_rhs2_ptr (use_stmt) != path[i].second->use)
2990 && (gimple_num_ops (use_stmt) <= 3
2991 || gimple_assign_rhs3_ptr (use_stmt) != path[i].second->use)))
2992 {
2993 fail = true;
2994 break;
2995 }
2996 /* Check that there is only a single stmt in which the op is used
2997 inside the loop. */
2998 imm_use_iterator imm_iter;
2999 gimple *op_use_stmt;
3000 unsigned cnt = 0;
3001 FOR_EACH_IMM_USE_STMT (op_use_stmt, imm_iter, op)
3002 if (!is_gimple_debug (op_use_stmt)
3003 && flow_bb_inside_loop_p (loop, gimple_bb (op_use_stmt)))
3004 {
3005 /* We want to allow x + x but not x < 1 ? x : 2. */
3006 if (is_gimple_assign (op_use_stmt)
3007 && gimple_assign_rhs_code (op_use_stmt) == COND_EXPR)
3008 {
3009 use_operand_p use_p;
3010 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
3011 cnt++;
3012 }
3013 else
3014 cnt++;
3015 }
3016 if (cnt != 1)
3017 {
3018 fail = true;
3019 break;
3020 }
3021 tree_code use_code = gimple_assign_rhs_code (use_stmt);
3022 if (use_code == MINUS_EXPR)
3023 {
3024 use_code = PLUS_EXPR;
3025 /* Track whether we negate the reduction value each iteration. */
3026 if (gimple_assign_rhs2 (use_stmt) == op)
3027 neg = ! neg;
3028 }
3029 if (CONVERT_EXPR_CODE_P (use_code)
3030 && tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (use_stmt)),
3031 TREE_TYPE (gimple_assign_rhs1 (use_stmt))))
3032 ;
3033 else if (*code == ERROR_MARK)
3034 {
3035 *code = use_code;
3036 sign = TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt)));
3037 }
3038 else if (use_code != *code)
3039 {
3040 fail = true;
3041 break;
3042 }
3043 else if ((use_code == MIN_EXPR
3044 || use_code == MAX_EXPR)
3045 && sign != TYPE_SIGN (TREE_TYPE (gimple_assign_lhs (use_stmt))))
3046 {
3047 fail = true;
3048 break;
3049 }
3050 }
3051 return ! fail && ! neg && *code != ERROR_MARK;
3052 }
3053
3054 bool
3055 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
3056 tree loop_arg, enum tree_code code)
3057 {
3058 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3059 enum tree_code code_;
3060 return (check_reduction_path (loc, loop, phi, loop_arg, &code_, path)
3061 && code_ == code);
3062 }
3063
3064
3065
3066 /* Function vect_is_simple_reduction
3067
3068 (1) Detect a cross-iteration def-use cycle that represents a simple
3069 reduction computation. We look for the following pattern:
3070
3071 loop_header:
3072 a1 = phi < a0, a2 >
3073 a3 = ...
3074 a2 = operation (a3, a1)
3075
3076 or
3077
3078 a3 = ...
3079 loop_header:
3080 a1 = phi < a0, a2 >
3081 a2 = operation (a3, a1)
3082
3083 such that:
3084 1. operation is commutative and associative and it is safe to
3085 change the order of the computation
3086 2. no uses for a2 in the loop (a2 is used out of the loop)
3087 3. no uses of a1 in the loop besides the reduction operation
3088 4. no uses of a1 outside the loop.
3089
3090 Conditions 1,4 are tested here.
3091 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3092
3093 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3094 nested cycles.
3095
3096 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3097 reductions:
3098
3099 a1 = phi < a0, a2 >
3100 inner loop (def of a3)
3101 a2 = phi < a3 >
3102
3103 (4) Detect condition expressions, ie:
3104 for (int i = 0; i < N; i++)
3105 if (a[i] < val)
3106 ret_val = a[i];
3107
3108 */
3109
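/* For instance, pattern (1) above corresponds to source code such as

     int sum = init;
     for (int i = 0; i < n; i++)
       sum += a[i];

   where a1 is the loop-header PHI for sum, a3 is the loaded a[i] and
   a2 is the updated value of sum. */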
3110 static stmt_vec_info
3111 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3112 bool *double_reduc, bool *reduc_chain_p)
3113 {
3114 gphi *phi = as_a <gphi *> (phi_info->stmt);
3115 gimple *phi_use_stmt = NULL;
3116 imm_use_iterator imm_iter;
3117 use_operand_p use_p;
3118
3119 *double_reduc = false;
3120 *reduc_chain_p = false;
3121 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
3122
3123 tree phi_name = PHI_RESULT (phi);
3124 /* ??? If there are no uses of the PHI result the inner loop reduction
3125 won't be detected as possibly double-reduction by vectorizable_reduction
3126 because that tries to walk the PHI arg from the preheader edge which
3127 can be constant. See PR60382. */
3128 if (has_zero_uses (phi_name))
3129 return NULL;
3130 class loop *loop = (gimple_bb (phi))->loop_father;
3131 unsigned nphi_def_loop_uses = 0;
3132 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3133 {
3134 gimple *use_stmt = USE_STMT (use_p);
3135 if (is_gimple_debug (use_stmt))
3136 continue;
3137
3138 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3139 {
3140 if (dump_enabled_p ())
3141 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3142 "intermediate value used outside loop.\n");
3143
3144 return NULL;
3145 }
3146
3147 nphi_def_loop_uses++;
3148 phi_use_stmt = use_stmt;
3149 }
3150
3151 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
3152 if (TREE_CODE (latch_def) != SSA_NAME)
3153 {
3154 if (dump_enabled_p ())
3155 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3156 "reduction: not ssa_name: %T\n", latch_def);
3157 return NULL;
3158 }
3159
3160 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
3161 if (!def_stmt_info
3162 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3163 return NULL;
3164
3165 bool nested_in_vect_loop
3166 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
3167 unsigned nlatch_def_loop_uses = 0;
3168 auto_vec<gphi *, 3> lcphis;
3169 bool inner_loop_of_double_reduc = false;
3170 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
3171 {
3172 gimple *use_stmt = USE_STMT (use_p);
3173 if (is_gimple_debug (use_stmt))
3174 continue;
3175 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3176 nlatch_def_loop_uses++;
3177 else
3178 {
3179 /* We can have more than one loop-closed PHI. */
3180 lcphis.safe_push (as_a <gphi *> (use_stmt));
3181 if (nested_in_vect_loop
3182 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3183 == vect_double_reduction_def))
3184 inner_loop_of_double_reduc = true;
3185 }
3186 }
3187
3188 /* If we are vectorizing an inner reduction, we execute it in the
3189 original order only when we are not dealing with a double
3190 reduction. */
3191 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
3192 {
3193 if (dump_enabled_p ())
3194 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
3195 "detected nested cycle: ");
3196 return def_stmt_info;
3197 }
3198
3199 /* If this isn't a nested cycle, or if the nested cycle reduction value
3200 is used outside of the inner loop, we cannot handle uses of the reduction
3201 value. */
3202 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
3203 {
3204 if (dump_enabled_p ())
3205 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3206 "reduction used in loop.\n");
3207 return NULL;
3208 }
3209
3210 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3211 defined in the inner loop. */
3212 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3213 {
3214 tree op1 = PHI_ARG_DEF (def_stmt, 0);
3215 if (gimple_phi_num_args (def_stmt) != 1
3216 || TREE_CODE (op1) != SSA_NAME)
3217 {
3218 if (dump_enabled_p ())
3219 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3220 "unsupported phi node definition.\n");
3221
3222 return NULL;
3223 }
3224
3225 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3226 if (gimple_bb (def1)
3227 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3228 && loop->inner
3229 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3230 && is_gimple_assign (def1)
3231 && is_a <gphi *> (phi_use_stmt)
3232 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3233 {
3234 if (dump_enabled_p ())
3235 report_vect_op (MSG_NOTE, def_stmt,
3236 "detected double reduction: ");
3237
3238 *double_reduc = true;
3239 return def_stmt_info;
3240 }
3241
3242 return NULL;
3243 }
3244
3245 /* Look for the expression computing latch_def from the loop PHI result. */
3246 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3247 enum tree_code code;
3248 if (check_reduction_path (vect_location, loop, phi, latch_def, &code,
3249 path))
3250 {
3251 STMT_VINFO_REDUC_CODE (phi_info) = code;
3252 if (code == COND_EXPR && !nested_in_vect_loop)
3253 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
3254
3255 /* Fill in STMT_VINFO_REDUC_IDX and gather stmts for an SLP
3256 reduction chain for which the additional restriction is that
3257 all operations in the chain are the same. */
3258 auto_vec<stmt_vec_info, 8> reduc_chain;
3259 unsigned i;
3260 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
3261 for (i = path.length () - 1; i >= 1; --i)
3262 {
3263 gimple *stmt = USE_STMT (path[i].second);
3264 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
3265 STMT_VINFO_REDUC_IDX (stmt_info)
3266 = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
3267 enum tree_code stmt_code = gimple_assign_rhs_code (stmt);
3268 bool leading_conversion = (CONVERT_EXPR_CODE_P (stmt_code)
3269 && (i == 1 || i == path.length () - 1));
3270 if ((stmt_code != code && !leading_conversion)
3271 /* We can only handle the final value in epilogue
3272 generation for reduction chains. */
3273 || (i != 1 && !has_single_use (gimple_assign_lhs (stmt))))
3274 is_slp_reduc = false;
3275 /* For reduction chains we support trailing/leading
3276 conversions. We do not store those in the actual chain. */
3277 if (leading_conversion)
3278 continue;
3279 reduc_chain.safe_push (stmt_info);
3280 }
3281 if (is_slp_reduc && reduc_chain.length () > 1)
3282 {
3283 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
3284 {
3285 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
3286 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
3287 }
3288 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
3289 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
3290
3291 /* Save the chain for further analysis in SLP detection. */
3292 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
3293 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
3294
3295 *reduc_chain_p = true;
3296 if (dump_enabled_p ())
3297 dump_printf_loc (MSG_NOTE, vect_location,
3298 "reduction: detected reduction chain\n");
3299 }
3300 else if (dump_enabled_p ())
3301 dump_printf_loc (MSG_NOTE, vect_location,
3302 "reduction: detected reduction\n");
3303
3304 return def_stmt_info;
3305 }
3306
3307 if (dump_enabled_p ())
3308 dump_printf_loc (MSG_NOTE, vect_location,
3309 "reduction: unknown pattern\n");
3310
3311 return NULL;
3312 }
3313
3314 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
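/* An example with made-up numbers: for a loop with known niters = 100,
   assumed_vf = 8 and peel_iters_prologue = 3, the epilogue peel count
   computed below is (100 - 3) % 8 = 1, and the scalar iteration costs are
   charged 3 times into the prologue cost vector and once into the
   epilogue cost vector. When niters is unknown, the epilogue peel count
   is instead assumed to be vf/2. */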
3315 int
3316 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3317 int *peel_iters_epilogue,
3318 stmt_vector_for_cost *scalar_cost_vec,
3319 stmt_vector_for_cost *prologue_cost_vec,
3320 stmt_vector_for_cost *epilogue_cost_vec)
3321 {
3322 int retval = 0;
3323 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3324
3325 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3326 {
3327 *peel_iters_epilogue = assumed_vf / 2;
3328 if (dump_enabled_p ())
3329 dump_printf_loc (MSG_NOTE, vect_location,
3330 "cost model: epilogue peel iters set to vf/2 "
3331 "because loop iterations are unknown .\n");
3332
3333 /* If peeled iterations are known but the number of scalar loop
3334 iterations is unknown, count a taken branch per peeled loop. */
3335 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3336 NULL, NULL_TREE, 0, vect_prologue);
3337 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3338 NULL, NULL_TREE, 0, vect_epilogue);
3339 }
3340 else
3341 {
3342 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3343 peel_iters_prologue = niters < peel_iters_prologue ?
3344 niters : peel_iters_prologue;
3345 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3346 /* If we need to peel for gaps but no epilogue peeling is otherwise
3347 required, we have to peel VF iterations. */
3348 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3349 *peel_iters_epilogue = assumed_vf;
3350 }
3351
3352 stmt_info_for_cost *si;
3353 int j;
3354 if (peel_iters_prologue)
3355 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3356 retval += record_stmt_cost (prologue_cost_vec,
3357 si->count * peel_iters_prologue,
3358 si->kind, si->stmt_info, si->misalign,
3359 vect_prologue);
3360 if (*peel_iters_epilogue)
3361 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3362 retval += record_stmt_cost (epilogue_cost_vec,
3363 si->count * *peel_iters_epilogue,
3364 si->kind, si->stmt_info, si->misalign,
3365 vect_epilogue);
3366
3367 return retval;
3368 }
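
/* A worked example with purely illustrative numbers: for ASSUMED_VF == 8,
   NITERS == 23 and PEEL_ITERS_PROLOGUE == 3 the code above computes
   *PEEL_ITERS_EPILOGUE = (23 - 3) % 8 == 4, so every statement in
   SCALAR_COST_VEC is accounted 3 times in PROLOGUE_COST_VEC and 4 times
   in EPILOGUE_COST_VEC.  */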
3369
3370 /* Function vect_estimate_min_profitable_iters
3371
3372 Return the number of iterations required for the vector version of the
3373 loop to be profitable relative to the cost of the scalar version of the
3374 loop.
3375
3376 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3377 of iterations for vectorization. A value of -1 means loop vectorization
3378 is not profitable. This returned value may be used for dynamic
3379 profitability check.
3380
3381 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3382 for static check against estimated number of iterations. */
3383
3384 static void
3385 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3386 int *ret_min_profitable_niters,
3387 int *ret_min_profitable_estimate)
3388 {
3389 int min_profitable_iters;
3390 int min_profitable_estimate;
3391 int peel_iters_prologue;
3392 int peel_iters_epilogue;
3393 unsigned vec_inside_cost = 0;
3394 int vec_outside_cost = 0;
3395 unsigned vec_prologue_cost = 0;
3396 unsigned vec_epilogue_cost = 0;
3397 int scalar_single_iter_cost = 0;
3398 int scalar_outside_cost = 0;
3399 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3400 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3401 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3402
3403 /* Cost model disabled. */
3404 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3405 {
3406 if (dump_enabled_p ())
3407 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3408 *ret_min_profitable_niters = 0;
3409 *ret_min_profitable_estimate = 0;
3410 return;
3411 }
3412
3413 /* Requires loop versioning tests to handle misalignment. */
3414 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3415 {
3416 /* FIXME: Make cost depend on complexity of individual check. */
3417 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3418 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3419 NULL, NULL_TREE, 0, vect_prologue);
3420 if (dump_enabled_p ())
3421 dump_printf (MSG_NOTE,
3422 "cost model: Adding cost of checks for loop "
3423 "versioning to treat misalignment.\n");
3424 }
3425
3426 /* Requires loop versioning with alias checks. */
3427 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3428 {
3429 /* FIXME: Make cost depend on complexity of individual check. */
3430 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3431 (void) add_stmt_cost (loop_vinfo, target_cost_data, len, vector_stmt,
3432 NULL, NULL_TREE, 0, vect_prologue);
3433 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3434 if (len)
3435 /* Count LEN - 1 ANDs and LEN comparisons. */
3436 (void) add_stmt_cost (loop_vinfo, target_cost_data, len * 2 - 1,
3437 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3438 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3439 if (len)
3440 {
3441 /* Count LEN - 1 ANDs and LEN comparisons. */
3442 unsigned int nstmts = len * 2 - 1;
3443 /* +1 for each bias that needs adding. */
3444 for (unsigned int i = 0; i < len; ++i)
3445 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3446 nstmts += 1;
3447 (void) add_stmt_cost (loop_vinfo, target_cost_data, nstmts,
3448 scalar_stmt, NULL, NULL_TREE, 0, vect_prologue);
3449 }
3450 if (dump_enabled_p ())
3451 dump_printf (MSG_NOTE,
3452 "cost model: Adding cost of checks for loop "
3453 "versioning aliasing.\n");
3454 }
3455
3456 /* Requires loop versioning with niter checks. */
3457 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3458 {
3459 /* FIXME: Make cost depend on complexity of individual check. */
3460 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, vector_stmt,
3461 NULL, NULL_TREE, 0, vect_prologue);
3462 if (dump_enabled_p ())
3463 dump_printf (MSG_NOTE,
3464 "cost model: Adding cost of checks for loop "
3465 "versioning niters.\n");
3466 }
3467
3468 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3469 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3470 NULL, NULL_TREE, 0, vect_prologue);
3471
3472 /* Count statements in scalar loop. Using this as scalar cost for a single
3473 iteration for now.
3474
3475 TODO: Add outer loop support.
3476
3477 TODO: Consider assigning different costs to different scalar
3478 statements. */
3479
3480 scalar_single_iter_cost
3481 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3482
3483 /* Add additional cost for the peeled instructions in prologue and epilogue
3484 loop. (For fully-masked loops there will be no peeling.)
3485
3486 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3487 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3488
3489 TODO: Build an expression that represents peel_iters for prologue and
3490 epilogue to be used in a run-time test. */
3491
3492 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3493 {
3494 peel_iters_prologue = 0;
3495 peel_iters_epilogue = 0;
3496
3497 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3498 {
3499 /* We need to peel exactly one iteration. */
3500 peel_iters_epilogue += 1;
3501 stmt_info_for_cost *si;
3502 int j;
3503 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3504 j, si)
3505 (void) add_stmt_cost (loop_vinfo, target_cost_data, si->count,
3506 si->kind, si->stmt_info, si->vectype,
3507 si->misalign, vect_epilogue);
3508 }
3509
3510 /* Calculate how many masks we need to generate. */
3511 unsigned int num_masks = 0;
3512 rgroup_masks *rgm;
3513 unsigned int num_vectors_m1;
3514 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
3515 if (rgm->mask_type)
3516 num_masks += num_vectors_m1 + 1;
3517 gcc_assert (num_masks > 0);
3518
3519 /* In the worst case, we need to generate each mask in the prologue
3520 and in the loop body. One of the loop body mask instructions
3521 replaces the comparison in the scalar loop, and since we don't
3522 count the scalar comparison against the scalar body, we shouldn't
3523 count that vector instruction against the vector body either.
3524
3525 Sometimes we can use unpacks instead of generating prologue
3526 masks and sometimes the prologue mask will fold to a constant,
3527 so the actual prologue cost might be smaller. However, it's
3528 simpler and safer to use the worst-case cost; if this ends up
3529 being the tie-breaker between vectorizing or not, then it's
3530 probably better not to vectorize. */
3531 (void) add_stmt_cost (loop_vinfo,
3532 target_cost_data, num_masks, vector_stmt,
3533 NULL, NULL_TREE, 0, vect_prologue);
3534 (void) add_stmt_cost (loop_vinfo,
3535 target_cost_data, num_masks - 1, vector_stmt,
3536 NULL, NULL_TREE, 0, vect_body);
3537 }
3538 else if (npeel < 0)
3539 {
3540 peel_iters_prologue = assumed_vf / 2;
3541 if (dump_enabled_p ())
3542 dump_printf (MSG_NOTE, "cost model: "
3543 "prologue peel iters set to vf/2.\n");
3544
3545 /* If peeling for alignment is unknown, loop bound of main loop becomes
3546 unknown. */
3547 peel_iters_epilogue = assumed_vf / 2;
3548 if (dump_enabled_p ())
3549 dump_printf (MSG_NOTE, "cost model: "
3550 "epilogue peel iters set to vf/2 because "
3551 "peeling for alignment is unknown.\n");
3552
3553 /* If peeled iterations are unknown, count a taken branch and a not taken
3554 branch per peeled loop. Even if scalar loop iterations are known,
3555 vector iterations are not known since peeled prologue iterations are
3556 not known. Hence guards remain the same. */
3557 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3558 NULL, NULL_TREE, 0, vect_prologue);
3559 (void) add_stmt_cost (loop_vinfo,
3560 target_cost_data, 1, cond_branch_not_taken,
3561 NULL, NULL_TREE, 0, vect_prologue);
3562 (void) add_stmt_cost (loop_vinfo, target_cost_data, 1, cond_branch_taken,
3563 NULL, NULL_TREE, 0, vect_epilogue);
3564 (void) add_stmt_cost (loop_vinfo,
3565 target_cost_data, 1, cond_branch_not_taken,
3566 NULL, NULL_TREE, 0, vect_epilogue);
3567 stmt_info_for_cost *si;
3568 int j;
3569 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3570 {
3571 (void) add_stmt_cost (loop_vinfo, target_cost_data,
3572 si->count * peel_iters_prologue,
3573 si->kind, si->stmt_info, si->vectype,
3574 si->misalign,
3575 vect_prologue);
3576 (void) add_stmt_cost (loop_vinfo, target_cost_data,
3577 si->count * peel_iters_epilogue,
3578 si->kind, si->stmt_info, si->vectype,
3579 si->misalign,
3580 vect_epilogue);
3581 }
3582 }
3583 else
3584 {
3585 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3586 stmt_info_for_cost *si;
3587 int j;
3588 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3589
3590 prologue_cost_vec.create (2);
3591 epilogue_cost_vec.create (2);
3592 peel_iters_prologue = npeel;
3593
3594 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3595 &peel_iters_epilogue,
3596 &LOOP_VINFO_SCALAR_ITERATION_COST
3597 (loop_vinfo),
3598 &prologue_cost_vec,
3599 &epilogue_cost_vec);
3600
3601 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3602 (void) add_stmt_cost (loop_vinfo,
3603 data, si->count, si->kind, si->stmt_info,
3604 si->vectype, si->misalign, vect_prologue);
3605
3606 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3607 (void) add_stmt_cost (loop_vinfo,
3608 data, si->count, si->kind, si->stmt_info,
3609 si->vectype, si->misalign, vect_epilogue);
3610
3611 prologue_cost_vec.release ();
3612 epilogue_cost_vec.release ();
3613 }
3614
3615 /* FORNOW: The scalar outside cost is incremented in one of the
3616 following ways:
3617
3618 1. The vectorizer checks for alignment and aliasing and generates
3619 a condition that allows dynamic vectorization. A cost model
3620 check is ANDED with the versioning condition. Hence scalar code
3621 path now has the added cost of the versioning check.
3622
3623 if (cost > th & versioning_check)
3624 jmp to vector code
3625
3626 Hence the run-time scalar cost is incremented by the not-taken branch cost.
3627
3628 2. The vectorizer then checks if a prologue is required. If the
3629 cost model check was not done before during versioning, it has to
3630 be done before the prologue check.
3631
3632 if (cost <= th)
3633 prologue = scalar_iters
3634 if (prologue == 0)
3635 jmp to vector code
3636 else
3637 execute prologue
3638 if (prologue == num_iters)
3639 go to exit
3640
3641 Hence the run-time scalar cost is incremented by a taken branch,
3642 plus a not-taken branch, plus a taken branch cost.
3643
3644 3. The vectorizer then checks if an epilogue is required. If the
3645 cost model check was not done before during prologue check, it
3646 has to be done with the epilogue check.
3647
3648 if (prologue == 0)
3649 jmp to vector code
3650 else
3651 execute prologue
3652 if (prologue == num_iters)
3653 go to exit
3654 vector code:
3655 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3656 jmp to epilogue
3657
3658 Hence the run-time scalar cost should be incremented by 2 taken
3659 branches.
3660
3661 TODO: The back end may reorder the BBs differently and reverse
3662 conditions/branch directions. Change the estimates below to
3663 something more reasonable. */
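
     As an illustration only (branch costs are target-dependent): with a
     taken branch costing 3 and a not-taken branch costing 1, case 1 adds
     1, case 2 adds 3 + 1 + 3 == 7 and case 3 adds 3 + 3 == 6 to the
     scalar outside cost.  */
  /* (See the case analysis above for how these amounts arise.)  */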
3664
3665 /* If the number of iterations is known and we do not do versioning, we can
3666 decide whether to vectorize at compile time. Hence the scalar version
3667 does not carry cost model guard costs. */
3668 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3669 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3670 {
3671 /* Cost model check occurs at versioning. */
3672 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3673 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3674 else
3675 {
3676 /* Cost model check occurs at prologue generation. */
3677 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3678 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3679 + vect_get_stmt_cost (cond_branch_not_taken);
3680 /* Cost model check occurs at epilogue generation. */
3681 else
3682 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3683 }
3684 }
3685
3686 /* Complete the target-specific cost calculations. */
3687 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3688 &vec_inside_cost, &vec_epilogue_cost);
3689
3690 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3691
3692 /* Stash the costs so that we can compare two loop_vec_infos. */
3693 loop_vinfo->vec_inside_cost = vec_inside_cost;
3694 loop_vinfo->vec_outside_cost = vec_outside_cost;
3695
3696 if (dump_enabled_p ())
3697 {
3698 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3699 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3700 vec_inside_cost);
3701 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3702 vec_prologue_cost);
3703 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3704 vec_epilogue_cost);
3705 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3706 scalar_single_iter_cost);
3707 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3708 scalar_outside_cost);
3709 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3710 vec_outside_cost);
3711 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3712 peel_iters_prologue);
3713 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3714 peel_iters_epilogue);
3715 }
3716
3717 /* Calculate number of iterations required to make the vector version
3718 profitable, relative to the loop bodies only. The following condition
3719 must hold true:
3720 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
3721 where
3722 SIC = scalar iteration cost, VIC = vector iteration cost,
3723 VOC = vector outside cost, VF = vectorization factor,
3724 NPEEL = prologue iterations + epilogue iterations,
3725 SOC = scalar outside cost for run time cost model check. */
3726
3727 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
3728 - vec_inside_cost);
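  /* As an illustration only: with a scalar iteration cost of 4, an assumed
     VF of 8 and a vector body cost of 20, each vector iteration saves
     4 * 8 - 20 == 12 cost units.  A non-positive saving means no number of
     iterations can amortize the outside costs, which is handled below.  */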
3729 if (saving_per_viter <= 0)
3730 {
3731 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3732 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3733 "vectorization did not happen for a simd loop");
3734
3735 if (dump_enabled_p ())
3736 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3737 "cost model: the vector iteration cost = %d "
3738 "divided by the scalar iteration cost = %d "
3739 "is greater or equal to the vectorization factor = %d"
3740 ".\n",
3741 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3742 *ret_min_profitable_niters = -1;
3743 *ret_min_profitable_estimate = -1;
3744 return;
3745 }
3746
3747 /* ??? The "if" arm is written to handle all cases; see below for what
3748 we would do for !LOOP_VINFO_FULLY_MASKED_P. */
3749 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3750 {
3751 /* Rewriting the condition above in terms of the number of
3752 vector iterations (vniters) rather than the number of
3753 scalar iterations (niters) gives:
3754
3755 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
3756
3757 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
3758
3759 For integer N, X and Y when X > 0:
3760
3761 N * X > Y <==> N >= (Y /[floor] X) + 1. */
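      /* As an illustration only: with VOC == 50, SOC == 0, SIC == 4, no
	 peeling and a saving of 4 * 8 - 20 == 12 per vector iteration, the
	 outside overhead is 50 and MIN_VEC_NITERS becomes 50 / 12 + 1 == 5.  */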
3762 int outside_overhead = (vec_outside_cost
3763 - scalar_single_iter_cost * peel_iters_prologue
3764 - scalar_single_iter_cost * peel_iters_epilogue
3765 - scalar_outside_cost);
3766 /* We're only interested in cases that require at least one
3767 vector iteration. */
3768 int min_vec_niters = 1;
3769 if (outside_overhead > 0)
3770 min_vec_niters = outside_overhead / saving_per_viter + 1;
3771
3772 if (dump_enabled_p ())
3773 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
3774 min_vec_niters);
3775
3776 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3777 {
3778 /* Now that we know the minimum number of vector iterations,
3779 find the minimum niters for which the scalar cost is larger:
3780
3781 SIC * niters > VIC * vniters + VOC - SOC
3782
3783 We know that the minimum niters is no more than
3784 vniters * VF + NPEEL, but it might be (and often is) less
3785 than that if a partial vector iteration is cheaper than the
3786 equivalent scalar code. */
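	  /* As an illustration only: VIC == 20, five vector iterations,
	     VOC == 50 and SOC == 0 give a threshold of 20 * 5 + 50 == 150
	     and, with SIC == 4, a minimum of 150 / 4 + 1 == 38 profitable
	     scalar iterations.  */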
3787 int threshold = (vec_inside_cost * min_vec_niters
3788 + vec_outside_cost
3789 - scalar_outside_cost);
3790 if (threshold <= 0)
3791 min_profitable_iters = 1;
3792 else
3793 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
3794 }
3795 else
3796 /* Convert the number of vector iterations into a number of
3797 scalar iterations. */
3798 min_profitable_iters = (min_vec_niters * assumed_vf
3799 + peel_iters_prologue
3800 + peel_iters_epilogue);
3801 }
3802 else
3803 {
3804 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3805 * assumed_vf
3806 - vec_inside_cost * peel_iters_prologue
3807 - vec_inside_cost * peel_iters_epilogue);
3808 if (min_profitable_iters <= 0)
3809 min_profitable_iters = 0;
3810 else
3811 {
3812 min_profitable_iters /= saving_per_viter;
3813
3814 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3815 <= (((int) vec_inside_cost * min_profitable_iters)
3816 + (((int) vec_outside_cost - scalar_outside_cost)
3817 * assumed_vf)))
3818 min_profitable_iters++;
3819 }
3820 }
3821
3822 if (dump_enabled_p ())
3823 dump_printf (MSG_NOTE,
3824 " Calculated minimum iters for profitability: %d\n",
3825 min_profitable_iters);
3826
3827 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3828 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3829 /* We want the vectorized loop to execute at least once. */
3830 min_profitable_iters = assumed_vf + peel_iters_prologue;
3831
3832 if (dump_enabled_p ())
3833 dump_printf_loc (MSG_NOTE, vect_location,
3834 " Runtime profitability threshold = %d\n",
3835 min_profitable_iters);
3836
3837 *ret_min_profitable_niters = min_profitable_iters;
3838
3839 /* Calculate number of iterations required to make the vector version
3840 profitable, relative to the loop bodies only.
3841
3842 Non-vectorized variant is SIC * niters and it must win over vector
3843 variant on the expected loop trip count. The following condition must hold true:
3844 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
3845
3846 if (vec_outside_cost <= 0)
3847 min_profitable_estimate = 0;
3848 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3849 {
3850 /* This is a repeat of the code above, but with + SOC rather
3851 than - SOC. */
3852 int outside_overhead = (vec_outside_cost
3853 - scalar_single_iter_cost * peel_iters_prologue
3854 - scalar_single_iter_cost * peel_iters_epilogue
3855 + scalar_outside_cost);
3856 int min_vec_niters = 1;
3857 if (outside_overhead > 0)
3858 min_vec_niters = outside_overhead / saving_per_viter + 1;
3859
3860 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3861 {
3862 int threshold = (vec_inside_cost * min_vec_niters
3863 + vec_outside_cost
3864 + scalar_outside_cost);
3865 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
3866 }
3867 else
3868 min_profitable_estimate = (min_vec_niters * assumed_vf
3869 + peel_iters_prologue
3870 + peel_iters_epilogue);
3871 }
3872 else
3873 {
3874 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3875 * assumed_vf
3876 - vec_inside_cost * peel_iters_prologue
3877 - vec_inside_cost * peel_iters_epilogue)
3878 / ((scalar_single_iter_cost * assumed_vf)
3879 - vec_inside_cost);
3880 }
3881 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3882 if (dump_enabled_p ())
3883 dump_printf_loc (MSG_NOTE, vect_location,
3884 " Static estimate profitability threshold = %d\n",
3885 min_profitable_estimate);
3886
3887 *ret_min_profitable_estimate = min_profitable_estimate;
3888 }
3889
3890 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3891 vector elements (not bits) for a vector with NELT elements. */
3892 static void
3893 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3894 vec_perm_builder *sel)
3895 {
3896 /* The encoding is a single stepped pattern. Any wrap-around is handled
3897 by vec_perm_indices. */
3898 sel->new_vector (nelt, 1, 3);
3899 for (unsigned int i = 0; i < 3; i++)
3900 sel->quick_push (i + offset);
3901 }
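
/* For example, OFFSET == 2 and NELT == 8 yield the stepped selector
   { 2, 3, 4, ... }, i.e. element I of the result is element I + 2 of the
   (concatenated) input.  */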
3902
3903 /* Checks whether the target supports whole-vector shifts for vectors of mode
3904 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3905 it supports vec_perm_const with masks for all necessary shift amounts. */
3906 static bool
3907 have_whole_vector_shift (machine_mode mode)
3908 {
3909 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3910 return true;
3911
3912 /* Variable-length vectors should be handled via the optab. */
3913 unsigned int nelt;
3914 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3915 return false;
3916
3917 vec_perm_builder sel;
3918 vec_perm_indices indices;
3919 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3920 {
3921 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3922 indices.new_vector (sel, 2, nelt);
3923 if (!can_vec_perm_const_p (mode, indices, false))
3924 return false;
3925 }
3926 return true;
3927 }
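
/* For example, for an 8-element vector the loop above checks shifts by
   4, 2 and 1 elements, the amounts used when reducing a vector by
   repeated halving.  */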
3928
3929 /* TODO: There is a close dependency between the vect_model_*_cost and the
3930 vectorizable_* functions. Improve the design to avoid maintenance issues. */
3931
3932 /* Function vect_model_reduction_cost.
3933
3934 Models cost for a reduction operation, including the vector ops
3935 generated within the strip-mine loop, the initial definition before
3936 the loop, and the epilogue code that must be generated. */
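
   As a sketch of the accounting below, for a plain ncopies == 1 sum
   reduction with a direct internal function available: one scalar_to_vec
   in the prologue (the initial vector), one vector_stmt in the body
   (the vector add) and one vector_stmt plus one vec_to_scalar in the
   epilogue (the reduction and the extract of the scalar result).  */
/* (The exact costs recorded depend on the reduction type handled below.)  */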
3937
3938 static void
3939 vect_model_reduction_cost (loop_vec_info loop_vinfo,
3940 stmt_vec_info stmt_info, internal_fn reduc_fn,
3941 vect_reduction_type reduction_type,
3942 int ncopies, stmt_vector_for_cost *cost_vec)
3943 {
3944 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3945 enum tree_code code;
3946 optab optab;
3947 tree vectype;
3948 machine_mode mode;
3949 class loop *loop = NULL;
3950
3951 if (loop_vinfo)
3952 loop = LOOP_VINFO_LOOP (loop_vinfo);
3953
3954 /* Condition reductions generate two reductions in the loop. */
3955 if (reduction_type == COND_REDUCTION)
3956 ncopies *= 2;
3957
3958 vectype = STMT_VINFO_VECTYPE (stmt_info);
3959 mode = TYPE_MODE (vectype);
3960 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3961
3962 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3963
3964 if (reduction_type == EXTRACT_LAST_REDUCTION)
3965 /* No extra instructions are needed in the prologue. The loop body
3966 operations are costed in vectorizable_condition. */
3967 inside_cost = 0;
3968 else if (reduction_type == FOLD_LEFT_REDUCTION)
3969 {
3970 /* No extra instructions needed in the prologue. */
3971 prologue_cost = 0;
3972
3973 if (reduc_fn != IFN_LAST)
3974 /* Count one reduction-like operation per vector. */
3975 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3976 stmt_info, 0, vect_body);
3977 else
3978 {
3979 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3980 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3981 inside_cost = record_stmt_cost (cost_vec, nelements,
3982 vec_to_scalar, stmt_info, 0,
3983 vect_body);
3984 inside_cost += record_stmt_cost (cost_vec, nelements,
3985 scalar_stmt, stmt_info, 0,
3986 vect_body);
3987 }
3988 }
3989 else
3990 {
3991 /* Add in cost for initial definition.
3992 For cond reduction we have four vectors: initial index, step,
3993 initial result of the data reduction, initial value of the index
3994 reduction. */
3995 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3996 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3997 scalar_to_vec, stmt_info, 0,
3998 vect_prologue);
3999
4000 /* Cost of reduction op inside loop. */
4001 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4002 stmt_info, 0, vect_body);
4003 }
4004
4005 /* Determine cost of epilogue code.
4006
4007 We have a reduction operator that will reduce the vector in one statement.
4008 Also requires scalar extract. */
4009
4010 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4011 {
4012 if (reduc_fn != IFN_LAST)
4013 {
4014 if (reduction_type == COND_REDUCTION)
4015 {
4016 /* An EQ stmt and a COND_EXPR stmt. */
4017 epilogue_cost += record_stmt_cost (cost_vec, 2,
4018 vector_stmt, stmt_info, 0,
4019 vect_epilogue);
4020 /* Reduction of the max index and a reduction of the found
4021 values. */
4022 epilogue_cost += record_stmt_cost (cost_vec, 2,
4023 vec_to_scalar, stmt_info, 0,
4024 vect_epilogue);
4025 /* A broadcast of the max value. */
4026 epilogue_cost += record_stmt_cost (cost_vec, 1,
4027 scalar_to_vec, stmt_info, 0,
4028 vect_epilogue);
4029 }
4030 else
4031 {
4032 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4033 stmt_info, 0, vect_epilogue);
4034 epilogue_cost += record_stmt_cost (cost_vec, 1,
4035 vec_to_scalar, stmt_info, 0,
4036 vect_epilogue);
4037 }
4038 }
4039 else if (reduction_type == COND_REDUCTION)
4040 {
4041 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4042 /* Extraction of scalar elements. */
4043 epilogue_cost += record_stmt_cost (cost_vec,
4044 2 * estimated_nunits,
4045 vec_to_scalar, stmt_info, 0,
4046 vect_epilogue);
4047 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4048 epilogue_cost += record_stmt_cost (cost_vec,
4049 2 * estimated_nunits - 3,
4050 scalar_stmt, stmt_info, 0,
4051 vect_epilogue);
4052 }
4053 else if (reduction_type == EXTRACT_LAST_REDUCTION
4054 || reduction_type == FOLD_LEFT_REDUCTION)
4055 /* No extra instructions are needed in the epilogue. */
4056 ;
4057 else
4058 {
4059 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4060 tree bitsize =
4061 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4062 int element_bitsize = tree_to_uhwi (bitsize);
4063 int nelements = vec_size_in_bits / element_bitsize;
4064
4065 if (code == COND_EXPR)
4066 code = MAX_EXPR;
4067
4068 optab = optab_for_tree_code (code, vectype, optab_default);
4069
4070 /* We have a whole vector shift available. */
4071 if (optab != unknown_optab
4072 && VECTOR_MODE_P (mode)
4073 && optab_handler (optab, mode) != CODE_FOR_nothing
4074 && have_whole_vector_shift (mode))
4075 {
4076 /* Final reduction via vector shifts and the reduction operator.
4077 Also requires scalar extract. */
4078 epilogue_cost += record_stmt_cost (cost_vec,
4079 exact_log2 (nelements) * 2,
4080 vector_stmt, stmt_info, 0,
4081 vect_epilogue);
4082 epilogue_cost += record_stmt_cost (cost_vec, 1,
4083 vec_to_scalar, stmt_info, 0,
4084 vect_epilogue);
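	      /* For example (illustrative): 8 elements need
		 exact_log2 (8) * 2 == 6 vector statements (three
		 shift/operate pairs) plus the final extract.  */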
4085 }
4086 else
4087 /* Use extracts and reduction op for final reduction. For N
4088 elements, we have N extracts and N-1 reduction ops. */
4089 epilogue_cost += record_stmt_cost (cost_vec,
4090 nelements + nelements - 1,
4091 vector_stmt, stmt_info, 0,
4092 vect_epilogue);
4093 }
4094 }
4095
4096 if (dump_enabled_p ())
4097 dump_printf (MSG_NOTE,
4098 "vect_model_reduction_cost: inside_cost = %d, "
4099 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4100 prologue_cost, epilogue_cost);
4101 }
4102
4103
4104 /* Function vect_model_induction_cost.
4105
4106 Models cost for induction operations. */
4107
4108 static void
4109 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4110 stmt_vector_for_cost *cost_vec)
4111 {
4112 unsigned inside_cost, prologue_cost;
4113
4114 if (PURE_SLP_STMT (stmt_info))
4115 return;
4116
4117 /* loop cost for vec_loop. */
4118 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4119 stmt_info, 0, vect_body);
4120
4121 /* prologue cost for vec_init and vec_step. */
4122 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4123 stmt_info, 0, vect_prologue);
4124
4125 if (dump_enabled_p ())
4126 dump_printf_loc (MSG_NOTE, vect_location,
4127 "vect_model_induction_cost: inside_cost = %d, "
4128 "prologue_cost = %d .\n", inside_cost, prologue_cost);
4129 }
4130
4131
4132
4133 /* Function get_initial_def_for_reduction
4134
4135 Input:
4136 STMT_VINFO - a stmt that performs a reduction operation in the loop.
4137 INIT_VAL - the initial value of the reduction variable
4138
4139 Output:
4140 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4141 of the reduction (used for adjusting the epilog - see below).
4142 Return a vector variable, initialized according to the operation that
4143 STMT_VINFO performs. This vector will be used as the initial value
4144 of the vector of partial results.
4145
4146 Option1 (adjust in epilog): Initialize the vector as follows:
4147 add/bit or/xor: [0,0,...,0,0]
4148 mult/bit and: [1,1,...,1,1]
4149 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4150 and when necessary (e.g. add/mult case) let the caller know
4151 that it needs to adjust the result by init_val.
4152
4153 Option2: Initialize the vector as follows:
4154 add/bit or/xor: [init_val,0,0,...,0]
4155 mult/bit and: [init_val,1,1,...,1]
4156 min/max/cond_expr: [init_val,init_val,...,init_val]
4157 and no adjustments are needed.
4158
4159 For example, for the following code:
4160
4161 s = init_val;
4162 for (i=0;i<n;i++)
4163 s = s + a[i];
4164
4165 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4166 For a vector of 4 units, we want to return either [0,0,0,init_val],
4167 or [0,0,0,0] and let the caller know that it needs to adjust
4168 the result at the end by 'init_val'.
4169
4170 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
4171 is not NULL, because this way the initialization vector is simpler (the
4172 same element in all entries), and Option2 otherwise.
4173
4174 A cost model should help decide between these two schemes. */
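
/* A sketch for a V4SI PLUS_EXPR reduction with init_val 5:
   Option1 builds { 0, 0, 0, 0 } and reports 5 through ADJUSTMENT_DEF;
   Option2 builds { 5, 0, 0, 0 }.  For variable-length vectors Option2
   is built as a splat of the neutral value with init_val shifted in
   via .VEC_SHL_INSERT.  */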
4175
4176 static tree
4177 get_initial_def_for_reduction (loop_vec_info loop_vinfo,
4178 stmt_vec_info stmt_vinfo,
4179 enum tree_code code, tree init_val,
4180 tree *adjustment_def)
4181 {
4182 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4183 tree scalar_type = TREE_TYPE (init_val);
4184 tree vectype = get_vectype_for_scalar_type (loop_vinfo, scalar_type);
4185 tree def_for_init;
4186 tree init_def;
4187 REAL_VALUE_TYPE real_init_val = dconst0;
4188 int int_init_val = 0;
4189 gimple_seq stmts = NULL;
4190
4191 gcc_assert (vectype);
4192
4193 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4194 || SCALAR_FLOAT_TYPE_P (scalar_type));
4195
4196 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4197 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4198
4199 /* ADJUSTMENT_DEF is NULL when called from
4200 vect_create_epilog_for_reduction to vectorize double reduction. */
4201 if (adjustment_def)
4202 *adjustment_def = NULL;
4203
4204 switch (code)
4205 {
4206 case WIDEN_SUM_EXPR:
4207 case DOT_PROD_EXPR:
4208 case SAD_EXPR:
4209 case PLUS_EXPR:
4210 case MINUS_EXPR:
4211 case BIT_IOR_EXPR:
4212 case BIT_XOR_EXPR:
4213 case MULT_EXPR:
4214 case BIT_AND_EXPR:
4215 {
4216 if (code == MULT_EXPR)
4217 {
4218 real_init_val = dconst1;
4219 int_init_val = 1;
4220 }
4221
4222 if (code == BIT_AND_EXPR)
4223 int_init_val = -1;
4224
4225 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4226 def_for_init = build_real (scalar_type, real_init_val);
4227 else
4228 def_for_init = build_int_cst (scalar_type, int_init_val);
4229
4230 if (adjustment_def || operand_equal_p (def_for_init, init_val, 0))
4231 {
4232 /* Option1: the first element is '0' or '1' as well. */
4233 if (!operand_equal_p (def_for_init, init_val, 0))
4234 *adjustment_def = init_val;
4235 init_def = gimple_build_vector_from_val (&stmts, vectype,
4236 def_for_init);
4237 }
4238 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4239 {
4240 /* Option2 (variable length): the first element is INIT_VAL. */
4241 init_def = gimple_build_vector_from_val (&stmts, vectype,
4242 def_for_init);
4243 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4244 vectype, init_def, init_val);
4245 }
4246 else
4247 {
4248 /* Option2: the first element is INIT_VAL. */
4249 tree_vector_builder elts (vectype, 1, 2);
4250 elts.quick_push (init_val);
4251 elts.quick_push (def_for_init);
4252 init_def = gimple_build_vector (&stmts, &elts);
4253 }
4254 }
4255 break;
4256
4257 case MIN_EXPR:
4258 case MAX_EXPR:
4259 case COND_EXPR:
4260 {
4261 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4262 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4263 }
4264 break;
4265
4266 default:
4267 gcc_unreachable ();
4268 }
4269
4270 if (stmts)
4271 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4272 return init_def;
4273 }
4274
4275 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4276 NUMBER_OF_VECTORS is the number of vector defs to create.
4277 If NEUTRAL_OP is nonnull, introducing extra elements of that
4278 value will not change the result. */
4279
4280 static void
4281 get_initial_defs_for_reduction (vec_info *vinfo,
4282 slp_tree slp_node,
4283 vec<tree> *vec_oprnds,
4284 unsigned int number_of_vectors,
4285 bool reduc_chain, tree neutral_op)
4286 {
4287 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4288 stmt_vec_info stmt_vinfo = stmts[0];
4289 unsigned HOST_WIDE_INT nunits;
4290 unsigned j, number_of_places_left_in_vector;
4291 tree vector_type;
4292 unsigned int group_size = stmts.length ();
4293 unsigned int i;
4294 class loop *loop;
4295
4296 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4297
4298 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4299
4300 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4301 gcc_assert (loop);
4302 edge pe = loop_preheader_edge (loop);
4303
4304 gcc_assert (!reduc_chain || neutral_op);
4305
4306 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4307 created vectors. It is greater than 1 if unrolling is performed.
4308
4309 For example, we have two scalar operands, s1 and s2 (e.g., group of
4310 strided accesses of size two), while NUNITS is four (i.e., four scalars
4311 of this type can be packed in a vector). The output vector will contain
4312 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4313 will be 2).
4314
4315 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4316 vectors containing the operands.
4317
4318 For example, NUNITS is four as before, and the group size is 8
4319 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4320 {s5, s6, s7, s8}. */
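
/* A sketch for the reduction-chain case with NUNITS == 4 and a PLUS
   reduction: the single initial value a0 yields { a0, 0, 0, 0 }, with
   the neutral value 0 filling all remaining lanes (and any further
   vectors).  */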
4321
4322 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4323 nunits = group_size;
4324
4325 number_of_places_left_in_vector = nunits;
4326 bool constant_p = true;
4327 tree_vector_builder elts (vector_type, nunits, 1);
4328 elts.quick_grow (nunits);
4329 gimple_seq ctor_seq = NULL;
4330 for (j = 0; j < nunits * number_of_vectors; ++j)
4331 {
4332 tree op;
4333 i = j % group_size;
4334 stmt_vinfo = stmts[i];
4335
4336 /* Get the def before the loop. In a reduction chain we have only
4337 one initial value. Otherwise there are as many as PHIs in the group. */
4338 if (reduc_chain)
4339 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4340 else if (((vec_oprnds->length () + 1) * nunits
4341 - number_of_places_left_in_vector >= group_size)
4342 && neutral_op)
4343 op = neutral_op;
4344 else
4345 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4346
4347 /* Create 'vect_ = {op0,op1,...,opn}'. */
4348 number_of_places_left_in_vector--;
4349 elts[nunits - number_of_places_left_in_vector - 1] = op;
4350 if (!CONSTANT_CLASS_P (op))
4351 constant_p = false;
4352
4353 if (number_of_places_left_in_vector == 0)
4354 {
4355 tree init;
4356 if (constant_p && !neutral_op
4357 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4358 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4359 /* Build the vector directly from ELTS. */
4360 init = gimple_build_vector (&ctor_seq, &elts);
4361 else if (neutral_op)
4362 {
4363 /* Build a vector of the neutral value and shift the
4364 other elements into place. */
4365 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4366 neutral_op);
4367 int k = nunits;
4368 while (k > 0 && elts[k - 1] == neutral_op)
4369 k -= 1;
4370 while (k > 0)
4371 {
4372 k -= 1;
4373 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4374 vector_type, init, elts[k]);
4375 }
4376 }
4377 else
4378 {
4379 /* First time round, duplicate ELTS to fill the
4380 required number of vectors. */
4381 duplicate_and_interleave (vinfo, &ctor_seq, vector_type, elts,
4382 number_of_vectors, *vec_oprnds);
4383 break;
4384 }
4385 vec_oprnds->quick_push (init);
4386
4387 number_of_places_left_in_vector = nunits;
4388 elts.new_vector (vector_type, nunits, 1);
4389 elts.quick_grow (nunits);
4390 constant_p = true;
4391 }
4392 }
4393 if (ctor_seq != NULL)
4394 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4395 }
4396
4397 /* For a statement STMT_INFO taking part in a reduction operation return
4398 the stmt_vec_info the meta information is stored on. */
4399
4400 stmt_vec_info
4401 info_for_reduction (vec_info *vinfo, stmt_vec_info stmt_info)
4402 {
4403 stmt_info = vect_orig_stmt (stmt_info);
4404 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4405 if (!is_a <gphi *> (stmt_info->stmt))
4406 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4407 gphi *phi = as_a <gphi *> (stmt_info->stmt);
4408 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4409 {
4410 if (gimple_phi_num_args (phi) == 1)
4411 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4412 }
4413 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4414 {
4415 edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father);
4416 stmt_vec_info info
4417 = vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
4418 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4419 stmt_info = info;
4420 }
4421 return stmt_info;
4422 }
4423
4424 /* Function vect_create_epilog_for_reduction
4425
4426 Create code at the loop-epilog to finalize the result of a reduction
4427 computation.
4428
4429 STMT_INFO is the scalar reduction stmt that is being vectorized.
4430 SLP_NODE is an SLP node containing a group of reduction statements. The
4431 first one in this group is STMT_INFO.
4432 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
4433 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
4434 (counting from 0)
4435
4436 This function:
4437 1. Completes the reduction def-use cycles.
4438 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4439 by calling the function specified by REDUC_FN if available, or by
4440 other means (whole-vector shifts or a scalar loop).
4441 The function also creates a new phi node at the loop exit to preserve
4442 loop-closed form, as illustrated below.
4443
4444 The flow at the entry to this function:
4445
4446 loop:
4447 vec_def = phi <vec_init, null> # REDUCTION_PHI
4448 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4449 s_loop = scalar_stmt # (scalar) STMT_INFO
4450 loop_exit:
4451 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4452 use <s_out0>
4453 use <s_out0>
4454
4455 The above is transformed by this function into:
4456
4457 loop:
4458 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4459 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4460 s_loop = scalar_stmt # (scalar) STMT_INFO
4461 loop_exit:
4462 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4463 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4464 v_out2 = reduce <v_out1>
4465 s_out3 = extract_field <v_out2, 0>
4466 s_out4 = adjust_result <s_out3>
4467 use <s_out4>
4468 use <s_out4>
4469 */
4470
4471 static void
4472 vect_create_epilog_for_reduction (loop_vec_info loop_vinfo,
4473 stmt_vec_info stmt_info,
4474 slp_tree slp_node,
4475 slp_instance slp_node_instance)
4476 {
4477 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
4478 gcc_assert (reduc_info->is_reduc_info);
4479 /* For double reductions we need to get at the inner loop reduction
4480 stmt which has the meta info attached. Our stmt_info is that of the
4481 loop-closed PHI of the inner loop which we remember as
4482 def for the reduction PHI generation. */
4483 bool double_reduc = false;
4484 stmt_vec_info rdef_info = stmt_info;
4485 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4486 {
4487 gcc_assert (!slp_node);
4488 double_reduc = true;
4489 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
4490 (stmt_info->stmt, 0));
4491 stmt_info = vect_stmt_to_vectorize (stmt_info);
4492 }
4493 gphi *reduc_def_stmt
4494 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
4495 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
4496 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
4497 tree vectype;
4498 machine_mode mode;
4499 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4500 basic_block exit_bb;
4501 tree scalar_dest;
4502 tree scalar_type;
4503 gimple *new_phi = NULL, *phi;
4504 gimple_stmt_iterator exit_gsi;
4505 tree new_temp = NULL_TREE, new_name, new_scalar_dest;
4506 gimple *epilog_stmt = NULL;
4507 gimple *exit_phi;
4508 tree bitsize;
4509 tree def;
4510 tree orig_name, scalar_result;
4511 imm_use_iterator imm_iter, phi_imm_iter;
4512 use_operand_p use_p, phi_use_p;
4513 gimple *use_stmt;
4514 bool nested_in_vect_loop = false;
4515 auto_vec<gimple *> new_phis;
4516 int j, i;
4517 auto_vec<tree> scalar_results;
4518 unsigned int group_size = 1, k;
4519 auto_vec<gimple *> phis;
4520 bool slp_reduc = false;
4521 bool direct_slp_reduc;
4522 tree new_phi_result;
4523 tree induction_index = NULL_TREE;
4524
4525 if (slp_node)
4526 group_size = SLP_TREE_LANES (slp_node);
4527
4528 if (nested_in_vect_loop_p (loop, stmt_info))
4529 {
4530 outer_loop = loop;
4531 loop = loop->inner;
4532 nested_in_vect_loop = true;
4533 gcc_assert (!slp_node);
4534 }
4535 gcc_assert (!nested_in_vect_loop || double_reduc);
4536
4537 vectype = STMT_VINFO_REDUC_VECTYPE (reduc_info);
4538 gcc_assert (vectype);
4539 mode = TYPE_MODE (vectype);
4540
4541 tree initial_def = NULL;
4542 tree induc_val = NULL_TREE;
4543 tree adjustment_def = NULL;
4544 if (slp_node)
4545 ;
4546 else
4547 {
4548 /* Get at the scalar def before the loop, that defines the initial value
4549 of the reduction variable. */
4550 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4551 loop_preheader_edge (loop));
4552 /* Optimize: for induction condition reduction, if we can't use zero
4553 for induc_val, use initial_def. */
4554 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
4555 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
4556 else if (double_reduc)
4557 ;
4558 else if (nested_in_vect_loop)
4559 ;
4560 else
4561 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
4562 }
4563
4564 unsigned vec_num;
4565 int ncopies;
4566 if (slp_node)
4567 {
4568 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
4569 ncopies = 1;
4570 }
4571 else
4572 {
4573 stmt_vec_info reduc_info = loop_vinfo->lookup_stmt (reduc_def_stmt);
4574 vec_num = 1;
4575 ncopies = STMT_VINFO_VEC_STMTS (reduc_info).length ();
4576 }
4577
4578 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4579 which is updated with the current index of the loop for every match of
4580 the original loop's cond_expr (VEC_STMT). This results in a vector
4581 containing the last time the condition passed for that vector lane.
4582 The first match will be a 1 to allow 0 to be used for non-matching
4583 indexes. If there are no matches at all then the vector will be all
4584 zeroes.
4585
4586 PR92772: This algorithm is broken for architectures that support
4587 masked vectors, but do not provide fold_extract_last. */
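      /* As an illustration with NUNITS == 4 and two vector iterations:
	 the index series starts at { 1, 2, 3, 4 } and steps by 4, so lane
	 L in vector iteration K carries index K * 4 + L + 1.  If lane 0
	 matched only in the first iteration and lane 2 last matched in the
	 second, INDUCTION_INDEX ends up as { 1, 0, 7, 0 }, lanes that
	 never matched staying 0.  */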
4588 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
4589 {
4590 auto_vec<std::pair<tree, bool>, 2> ccompares;
4591 stmt_vec_info cond_info = STMT_VINFO_REDUC_DEF (reduc_info);
4592 cond_info = vect_stmt_to_vectorize (cond_info);
4593 while (cond_info != reduc_info)
4594 {
4595 if (gimple_assign_rhs_code (cond_info->stmt) == COND_EXPR)
4596 {
4597 gimple *vec_stmt = STMT_VINFO_VEC_STMTS (cond_info)[0];
4598 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4599 ccompares.safe_push
4600 (std::make_pair (unshare_expr (gimple_assign_rhs1 (vec_stmt)),
4601 STMT_VINFO_REDUC_IDX (cond_info) == 2));
4602 }
4603 cond_info
4604 = loop_vinfo->lookup_def (gimple_op (cond_info->stmt,
4605 1 + STMT_VINFO_REDUC_IDX
4606 (cond_info)));
4607 cond_info = vect_stmt_to_vectorize (cond_info);
4608 }
4609 gcc_assert (ccompares.length () != 0);
4610
4611 tree indx_before_incr, indx_after_incr;
4612 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4613 int scalar_precision
4614 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4615 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4616 tree cr_index_vector_type = get_related_vectype_for_scalar_type
4617 (TYPE_MODE (vectype), cr_index_scalar_type,
4618 TYPE_VECTOR_SUBPARTS (vectype));
4619
4620 /* First we create a simple vector induction variable which starts
4621 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4622 vector size (STEP). */
4623
4624 /* Create a {1,2,3,...} vector. */
4625 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4626
4627 /* Create a vector of the step value. */
4628 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4629 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4630
4631 /* Create an induction variable. */
4632 gimple_stmt_iterator incr_gsi;
4633 bool insert_after;
4634 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4635 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4636 insert_after, &indx_before_incr, &indx_after_incr);
4637
4638 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4639 filled with zeros (VEC_ZERO). */
4640
4641 /* Create a vector of 0s. */
4642 tree zero = build_zero_cst (cr_index_scalar_type);
4643 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4644
4645 /* Create a vector phi node. */
4646 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4647 new_phi = create_phi_node (new_phi_tree, loop->header);
4648 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4649 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4650
4651 /* Now take the condition from the loop's original cond_exprs
4652 and produce a new cond_expr (INDEX_COND_EXPR) which for
4653 every match uses values from the induction variable
4654 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4655 (NEW_PHI_TREE).
4656 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4657 the new cond_expr (INDEX_COND_EXPR). */
4658 gimple_seq stmts = NULL;
4659 for (int i = ccompares.length () - 1; i != -1; --i)
4660 {
4661 tree ccompare = ccompares[i].first;
4662 if (ccompares[i].second)
4663 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
4664 cr_index_vector_type,
4665 ccompare,
4666 indx_before_incr, new_phi_tree);
4667 else
4668 new_phi_tree = gimple_build (&stmts, VEC_COND_EXPR,
4669 cr_index_vector_type,
4670 ccompare,
4671 new_phi_tree, indx_before_incr);
4672 }
4673 gsi_insert_seq_before (&incr_gsi, stmts, GSI_SAME_STMT);
4674
4675 /* Update the phi with the vec cond. */
4676 induction_index = new_phi_tree;
4677 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4678 loop_latch_edge (loop), UNKNOWN_LOCATION);
4679 }
4680
4681 /* 2. Create epilog code.
4682 The reduction epilog code operates across the elements of the vector
4683 of partial results computed by the vectorized loop.
4684 The reduction epilog code consists of:
4685
4686 step 1: compute the scalar result in a vector (v_out2)
4687 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4688 step 3: adjust the scalar result (s_out3) if needed.
4689
4690 Step 1 can be accomplished using one of the following three schemes:
4691 (scheme 1) using reduc_fn, if available.
4692 (scheme 2) using whole-vector shifts, if available.
4693 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4694 combined.
4695
4696 The overall epilog code looks like this:
4697
4698 s_out0 = phi <s_loop> # original EXIT_PHI
4699 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4700 v_out2 = reduce <v_out1> # step 1
4701 s_out3 = extract_field <v_out2, 0> # step 2
4702 s_out4 = adjust_result <s_out3> # step 3
4703
4704 (step 3 is optional, and steps 1 and 2 may be combined).
4705 Lastly, the uses of s_out0 are replaced by s_out4. */
4706
4707
4708 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4709 v_out1 = phi <VECT_DEF>
4710 Store them in NEW_PHIS. */
4711 if (double_reduc)
4712 loop = outer_loop;
4713 exit_bb = single_exit (loop)->dest;
4714 new_phis.create (slp_node ? vec_num : ncopies);
4715 for (unsigned i = 0; i < vec_num; i++)
4716 {
4717 if (slp_node)
4718 def = vect_get_slp_vect_def (slp_node, i);
4719 else
4720 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[0]);
4721 for (j = 0; j < ncopies; j++)
4722 {
4723 tree new_def = copy_ssa_name (def);
4724 phi = create_phi_node (new_def, exit_bb);
4725 if (j == 0)
4726 new_phis.quick_push (phi);
4727 else
4728 {
4729 def = gimple_get_lhs (STMT_VINFO_VEC_STMTS (rdef_info)[j]);
4730 new_phis.quick_push (phi);
4731 }
4732
4733 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4734 }
4735 }
4736
4737 exit_gsi = gsi_after_labels (exit_bb);
4738
4739 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4740 (i.e. when reduc_fn is not available) and in the final adjustment
4741 code (if needed). Also get the original scalar reduction variable as
4742 defined in the loop. In case STMT is a "pattern-stmt" (i.e. it
4743 represents a reduction pattern), the tree-code and scalar-def are
4744 taken from the original stmt that the pattern-stmt (STMT) replaces.
4745 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4746 are taken from STMT. */
4747
4748 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4749 if (orig_stmt_info != stmt_info)
4750 {
4751 /* Reduction pattern */
4752 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4753 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4754 }
4755
4756 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4757 scalar_type = TREE_TYPE (scalar_dest);
4758 scalar_results.create (group_size);
4759 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4760 bitsize = TYPE_SIZE (scalar_type);
4761
4762 /* SLP reduction without reduction chain, e.g.,
4763 # a1 = phi <a2, a0>
4764 # b1 = phi <b2, b0>
4765 a2 = operation (a1)
4766 b2 = operation (b1) */
4767 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4768
4769 /* True if we should implement SLP_REDUC using native reduction operations
4770 instead of scalar operations. */
4771 direct_slp_reduc = (reduc_fn != IFN_LAST
4772 && slp_reduc
4773 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4774
4775 /* In case of reduction chain, e.g.,
4776 # a1 = phi <a3, a0>
4777 a2 = operation (a1)
4778 a3 = operation (a2),
4779
4780 we may end up with more than one vector result. Here we reduce them to
4781 one vector. */
4782 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
4783 {
4784 gimple_seq stmts = NULL;
4785 tree first_vect = PHI_RESULT (new_phis[0]);
4786 first_vect = gimple_convert (&stmts, vectype, first_vect);
4787 for (k = 1; k < new_phis.length (); k++)
4788 {
4789 gimple *next_phi = new_phis[k];
4790 tree second_vect = PHI_RESULT (next_phi);
4791 second_vect = gimple_convert (&stmts, vectype, second_vect);
4792 first_vect = gimple_build (&stmts, code, vectype,
4793 first_vect, second_vect);
4794 }
4795 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4796
4797 new_phi_result = first_vect;
4798 new_phis.truncate (0);
4799 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
4800 }
4801 /* Likewise if we couldn't use a single def-use cycle. */
4802 else if (ncopies > 1)
4803 {
4804 gimple_seq stmts = NULL;
4805 tree first_vect = PHI_RESULT (new_phis[0]);
4806 first_vect = gimple_convert (&stmts, vectype, first_vect);
4807 for (int k = 1; k < ncopies; ++k)
4808 {
4809 tree second_vect = PHI_RESULT (new_phis[k]);
4810 second_vect = gimple_convert (&stmts, vectype, second_vect);
4811 first_vect = gimple_build (&stmts, code, vectype,
4812 first_vect, second_vect);
4813 }
4814 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4815 new_phi_result = first_vect;
4816 new_phis.truncate (0);
4817 new_phis.safe_push (SSA_NAME_DEF_STMT (first_vect));
4818 }
4819 else
4820 new_phi_result = PHI_RESULT (new_phis[0]);
4821
4822 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
4823 && reduc_fn != IFN_LAST)
4824 {
4825 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4826 various data values where the condition matched and another vector
4827 (INDUCTION_INDEX) containing all the indexes of those matches. We
4828 need to extract the last matching index (which will be the index with
4829 highest value) and use this to index into the data vector.
4830 For the case where there were no matches, the data vector will contain
4831 all default values and the index vector will be all zeros. */
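
      /* As an illustration with hypothetical values: NEW_PHI_RESULT
	 == { 9, 7, 4, 2 } and INDUCTION_INDEX == { 1, 0, 7, 0 } give a
	 max index of 7; the VEC_COND below then selects { 0, 0, 4, 0 }
	 and the final unsigned max reduction extracts 4 as the scalar
	 result.  */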
4832
4833 /* Get various versions of the type of the vector of indexes. */
4834 tree index_vec_type = TREE_TYPE (induction_index);
4835 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4836 tree index_scalar_type = TREE_TYPE (index_vec_type);
4837 tree index_vec_cmp_type = truth_type_for (index_vec_type);
4838
4839 /* Get an unsigned integer version of the type of the data vector. */
4840 int scalar_precision
4841 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4842 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4843 tree vectype_unsigned = build_vector_type
4844 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4845
4846 /* First we need to create a vector (ZERO_VEC) of zeros and another
4847 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4848 can create using a MAX reduction and then expanding.
4849 In the case where the loop never made any matches, the max index will
4850 be zero. */
4851
4852 /* Vector of {0, 0, 0,...}. */
4853 tree zero_vec = build_zero_cst (vectype);
4854
4855 gimple_seq stmts = NULL;
4856 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
4857 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4858
4859 /* Find maximum value from the vector of found indexes. */
4860 tree max_index = make_ssa_name (index_scalar_type);
4861 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4862 1, induction_index);
4863 gimple_call_set_lhs (max_index_stmt, max_index);
4864 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4865
4866 /* Vector of {max_index, max_index, max_index,...}. */
4867 tree max_index_vec = make_ssa_name (index_vec_type);
4868 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4869 max_index);
4870 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4871 max_index_vec_rhs);
4872 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4873
4874 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4875 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4876 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4877 otherwise. Only one value should match, resulting in a vector
4878 (VEC_COND) with one data value and the rest zeros.
4879 In the case where the loop never made any matches, every index will
4880 match, resulting in a vector with all data values (which will all be
4881 the default value). */
4882
4883 /* Compare the max index vector to the vector of found indexes to find
4884 the position of the max value. */
4885 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4886 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4887 induction_index,
4888 max_index_vec);
4889 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4890
4891 /* Use the compare to choose either values from the data vector or
4892 zero. */
4893 tree vec_cond = make_ssa_name (vectype);
4894 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4895 vec_compare, new_phi_result,
4896 zero_vec);
4897 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4898
4899 /* Finally we need to extract the data value from the vector (VEC_COND)
4900 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
4901 reduction, but because this doesn't exist, we can use a MAX reduction
4902 instead. The data value might be signed or a float so we need to cast
4903 it first.
4904 In the case where the loop never made any matches, the data values are
4905 all identical, and so will reduce down correctly. */
4906
4907 /* Make the matched data values unsigned. */
4908 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4909 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4910 vec_cond);
4911 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4912 VIEW_CONVERT_EXPR,
4913 vec_cond_cast_rhs);
4914 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4915
4916 /* Reduce down to a scalar value. */
4917 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4918 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4919 1, vec_cond_cast);
4920 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4921 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4922
4923 /* Convert the reduced value back to the result type and set as the
4924 result. */
4925 stmts = NULL;
4926 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4927 data_reduc);
4928 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4929 scalar_results.safe_push (new_temp);
4930 }
4931 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
4932 && reduc_fn == IFN_LAST)
4933 {
4934 /* Condition reduction without supported IFN_REDUC_MAX. Generate:
4935
4936 idx_val = induction_index[0];
4937 val = data_reduc[0];
4938 for (i = 1; i < nelts; ++i)
4939 if (induction_index[i] > idx_val)
4940 val = data_reduc[i], idx_val = induction_index[i];
4941 return val; */
4942
4943 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4944 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4945 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4946 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4947 /* Enforced by vectorizable_reduction, which ensures we have target
4948 support before allowing a conditional reduction on variable-length
4949 vectors. */
4950 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
4951 tree idx_val = NULL_TREE, val = NULL_TREE;
4952 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4953 {
4954 tree old_idx_val = idx_val;
4955 tree old_val = val;
4956 idx_val = make_ssa_name (idx_eltype);
4957 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4958 build3 (BIT_FIELD_REF, idx_eltype,
4959 induction_index,
4960 bitsize_int (el_size),
4961 bitsize_int (off)));
4962 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4963 val = make_ssa_name (data_eltype);
4964 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4965 build3 (BIT_FIELD_REF,
4966 data_eltype,
4967 new_phi_result,
4968 bitsize_int (el_size),
4969 bitsize_int (off)));
4970 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4971 if (off != 0)
4972 {
4973 tree new_idx_val = idx_val;
4974 if (off != v_size - el_size)
4975 {
4976 new_idx_val = make_ssa_name (idx_eltype);
4977 epilog_stmt = gimple_build_assign (new_idx_val,
4978 MAX_EXPR, idx_val,
4979 old_idx_val);
4980 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4981 }
4982 tree new_val = make_ssa_name (data_eltype);
4983 epilog_stmt = gimple_build_assign (new_val,
4984 COND_EXPR,
4985 build2 (GT_EXPR,
4986 boolean_type_node,
4987 idx_val,
4988 old_idx_val),
4989 val, old_val);
4990 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4991 idx_val = new_idx_val;
4992 val = new_val;
4993 }
4994 }
4995 /* Convert the reduced value back to the result type and set as the
4996 result. */
4997 gimple_seq stmts = NULL;
4998 val = gimple_convert (&stmts, scalar_type, val);
4999 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5000 scalar_results.safe_push (val);
5001 }
5002
5003 /* 2.3 Create the reduction code, using one of the three schemes described
5004 above. In SLP we simply need to extract all the elements from the
5005 vector (without reducing them), so we use scalar shifts. */
5006 else if (reduc_fn != IFN_LAST && !slp_reduc)
5007 {
5008 tree tmp;
5009 tree vec_elem_type;
5010
5011 /* Case 1: Create:
5012 v_out2 = reduc_expr <v_out1> */
5013
5014 if (dump_enabled_p ())
5015 dump_printf_loc (MSG_NOTE, vect_location,
5016 "Reduce using direct vector reduction.\n");
5017
5018 gimple_seq stmts = NULL;
5019 new_phi_result = gimple_convert (&stmts, vectype, new_phi_result);
5020 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5021 new_temp = gimple_build (&stmts, as_combined_fn (reduc_fn),
5022 vec_elem_type, new_phi_result);
5023 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5024 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5025
5026 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5027 && induc_val)
5028 {
5029 /* Earlier we set the initial value to be a vector of induc_val
5030 values. Check the result and if it is induc_val then replace
5031 with the original initial value, unless induc_val is
5032 the same as initial_def already. */
5033 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5034 induc_val);
5035
5036 tmp = make_ssa_name (new_scalar_dest);
5037 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5038 initial_def, new_temp);
5039 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5040 new_temp = tmp;
5041 }
5042
5043 scalar_results.safe_push (new_temp);
5044 }
5045 else if (direct_slp_reduc)
5046 {
5047 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5048 with the elements for other SLP statements replaced with the
5049 neutral value. We can then do a normal reduction on each vector. */
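 /* For instance (a sketch with hypothetical lanes): with REDUC_GROUP_SIZE 2
 and a vector {a0, b0, a1, b1}, lane j belongs to SLP result j & 1, so we
 build {a0, N, a1, N} for result 0 and {N, b0, N, b1} for result 1, where
 N is the neutral (or initial) value, and reduce each with REDUC_FN. */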
5050
5051 /* Enforced by vectorizable_reduction. */
5052 gcc_assert (new_phis.length () == 1);
5053 gcc_assert (pow2p_hwi (group_size));
5054
5055 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5056 vec<stmt_vec_info> orig_phis
5057 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5058 gimple_seq seq = NULL;
5059
5060 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5061 and the same element size as VECTYPE. */
5062 tree index = build_index_vector (vectype, 0, 1);
5063 tree index_type = TREE_TYPE (index);
5064 tree index_elt_type = TREE_TYPE (index_type);
5065 tree mask_type = truth_type_for (index_type);
5066
5067 /* Create a vector that, for each element, identifies which of
5068 the REDUC_GROUP_SIZE results should use it. */
5069 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5070 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5071 build_vector_from_val (index_type, index_mask));
5072
5073 /* Get a neutral vector value. This is simply a splat of the neutral
5074 scalar value if we have one, otherwise the initial scalar value
5075 is itself a neutral value. */
5076 tree vector_identity = NULL_TREE;
5077 tree neutral_op = NULL_TREE;
5078 if (slp_node)
5079 {
5080 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
5081 neutral_op
5082 = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis,
5083 vectype, code, first != NULL);
5084 }
5085 if (neutral_op)
5086 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5087 neutral_op);
5088 for (unsigned int i = 0; i < group_size; ++i)
5089 {
5090 /* If there's no universal neutral value, we can use the
5091 initial scalar value from the original PHI. This is used
5092 for MIN and MAX reduction, for example. */
5093 if (!neutral_op)
5094 {
5095 tree scalar_value
5096 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5097 loop_preheader_edge (loop));
5098 scalar_value = gimple_convert (&seq, TREE_TYPE (vectype),
5099 scalar_value);
5100 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5101 scalar_value);
5102 }
5103
5104 /* Calculate the equivalent of:
5105
5106 sel[j] = (index[j] == i);
5107
5108 which selects the elements of NEW_PHI_RESULT that should
5109 be included in the result. */
5110 tree compare_val = build_int_cst (index_elt_type, i);
5111 compare_val = build_vector_from_val (index_type, compare_val);
5112 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5113 index, compare_val);
5114
5115 /* Calculate the equivalent of:
5116
5117 vec = sel ? new_phi_result : vector_identity;
5118
5119 VEC is now suitable for a full vector reduction. */
5120 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5121 sel, new_phi_result, vector_identity);
5122
5123 /* Do the reduction and convert it to the appropriate type. */
5124 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5125 TREE_TYPE (vectype), vec);
5126 scalar = gimple_convert (&seq, scalar_type, scalar);
5127 scalar_results.safe_push (scalar);
5128 }
5129 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5130 }
5131 else
5132 {
5133 bool reduce_with_shift;
5134 tree vec_temp;
5135
5136 gcc_assert (slp_reduc || new_phis.length () == 1);
5137
5138 /* See if the target wants to do the final (shift) reduction
5139 in a vector mode of smaller size and first reduce upper/lower
5140 halves against each other. */
5141 enum machine_mode mode1 = mode;
5142 tree stype = TREE_TYPE (vectype);
5143 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype).to_constant ();
5144 unsigned nunits1 = nunits;
5145 if ((mode1 = targetm.vectorize.split_reduction (mode)) != mode
5146 && new_phis.length () == 1)
5147 {
5148 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5149 /* For SLP reductions we have to make sure lanes match up, but
5150 since we're doing an individual-element final reduction, reducing
5151 the vector width here is even more important.
5152 ??? We can also separate lanes with permutes; for the common
5153 case of a power-of-two group size, odd/even extracts would work. */
5154 if (slp_reduc && nunits != nunits1)
5155 {
5156 nunits1 = least_common_multiple (nunits1, group_size);
5157 gcc_assert (exact_log2 (nunits1) != -1 && nunits1 <= nunits);
5158 }
5159 }
5160 if (!slp_reduc
5161 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5162 nunits1 = GET_MODE_NUNITS (mode1).to_constant ();
5163
5164 tree vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5165 stype, nunits1);
5166 reduce_with_shift = have_whole_vector_shift (mode1);
5167 if (!VECTOR_MODE_P (mode1))
5168 reduce_with_shift = false;
5169 else
5170 {
5171 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5172 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5173 reduce_with_shift = false;
5174 }
5175
5176 /* First reduce the vector to the desired vector size on which we
5177 should do the shift reduction, by combining upper and lower halves. */
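 /* E.g. (hypothetical types): reducing a V8SI accumulator to V4SI extracts
 the low and high V4SI halves, combines them with CODE, and repeats until
 NUNITS1 lanes remain; the final reduction then operates on the narrower
 vector. */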
5178 new_temp = new_phi_result;
5179 while (nunits > nunits1)
5180 {
5181 nunits /= 2;
5182 vectype1 = get_related_vectype_for_scalar_type (TYPE_MODE (vectype),
5183 stype, nunits);
5184 unsigned int bitsize = tree_to_uhwi (TYPE_SIZE (vectype1));
5185
5186 /* The target has to make sure we support lowpart/highpart
5187 extraction, either via direct vector extract or through
5188 punning to an integer mode vector. */
5189 tree dst1, dst2;
5190 if (convert_optab_handler (vec_extract_optab,
5191 TYPE_MODE (TREE_TYPE (new_temp)),
5192 TYPE_MODE (vectype1))
5193 != CODE_FOR_nothing)
5194 {
5195 /* Extract sub-vectors directly once vec_extract becomes
5196 a conversion optab. */
5197 dst1 = make_ssa_name (vectype1);
5198 epilog_stmt
5199 = gimple_build_assign (dst1, BIT_FIELD_REF,
5200 build3 (BIT_FIELD_REF, vectype1,
5201 new_temp, TYPE_SIZE (vectype1),
5202 bitsize_int (0)));
5203 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5204 dst2 = make_ssa_name (vectype1);
5205 epilog_stmt
5206 = gimple_build_assign (dst2, BIT_FIELD_REF,
5207 build3 (BIT_FIELD_REF, vectype1,
5208 new_temp, TYPE_SIZE (vectype1),
5209 bitsize_int (bitsize)));
5210 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5211 }
5212 else
5213 {
5214 /* Extract via punning to an appropriately sized integer mode
5215 vector. */
5216 tree eltype = build_nonstandard_integer_type (bitsize, 1);
5217 tree etype = build_vector_type (eltype, 2);
5218 gcc_assert (convert_optab_handler (vec_extract_optab,
5219 TYPE_MODE (etype),
5220 TYPE_MODE (eltype))
5221 != CODE_FOR_nothing);
5222 tree tem = make_ssa_name (etype);
5223 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5224 build1 (VIEW_CONVERT_EXPR,
5225 etype, new_temp));
5226 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5227 new_temp = tem;
5228 tem = make_ssa_name (eltype);
5229 epilog_stmt
5230 = gimple_build_assign (tem, BIT_FIELD_REF,
5231 build3 (BIT_FIELD_REF, eltype,
5232 new_temp, TYPE_SIZE (eltype),
5233 bitsize_int (0)));
5234 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5235 dst1 = make_ssa_name (vectype1);
5236 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5237 build1 (VIEW_CONVERT_EXPR,
5238 vectype1, tem));
5239 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5240 tem = make_ssa_name (eltype);
5241 epilog_stmt
5242 = gimple_build_assign (tem, BIT_FIELD_REF,
5243 build3 (BIT_FIELD_REF, eltype,
5244 new_temp, TYPE_SIZE (eltype),
5245 bitsize_int (bitsize)));
5246 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5247 dst2 = make_ssa_name (vectype1);
5248 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5249 build1 (VIEW_CONVERT_EXPR,
5250 vectype1, tem));
5251 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5252 }
5253
5254 new_temp = make_ssa_name (vectype1);
5255 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5256 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5257 new_phis[0] = epilog_stmt;
5258 }
5259
5260 if (reduce_with_shift && !slp_reduc)
5261 {
5262 int element_bitsize = tree_to_uhwi (bitsize);
5263 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5264 for variable-length vectors and also requires direct target support
5265 for loop reductions. */
5266 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5267 int nelements = vec_size_in_bits / element_bitsize;
5268 vec_perm_builder sel;
5269 vec_perm_indices indices;
5270
5271 int elt_offset;
5272
5273 tree zero_vec = build_zero_cst (vectype1);
5274 /* Case 2: Create:
5275 for (offset = nelements/2; offset >= 1; offset/=2)
5276 {
5277 Create: va' = vec_shift <va, offset>
5278 Create: va = vop <va, va'>
5279 } */
5280
5281 tree rhs;
5282
5283 if (dump_enabled_p ())
5284 dump_printf_loc (MSG_NOTE, vect_location,
5285 "Reduce using vector shifts\n");
5286
5287 gimple_seq stmts = NULL;
5288 new_temp = gimple_convert (&stmts, vectype1, new_temp);
5289 for (elt_offset = nelements / 2;
5290 elt_offset >= 1;
5291 elt_offset /= 2)
5292 {
5293 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5294 indices.new_vector (sel, 2, nelements);
5295 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5296 new_name = gimple_build (&stmts, VEC_PERM_EXPR, vectype1,
5297 new_temp, zero_vec, mask);
5298 new_temp = gimple_build (&stmts, code,
5299 vectype1, new_name, new_temp);
5300 }
5301 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5302
5303 /* 2.4 Extract the final scalar result. Create:
5304 s_out3 = extract_field <v_out2, bitpos> */
5305
5306 if (dump_enabled_p ())
5307 dump_printf_loc (MSG_NOTE, vect_location,
5308 "extract scalar result\n");
5309
5310 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5311 bitsize, bitsize_zero_node);
5312 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5313 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5314 gimple_assign_set_lhs (epilog_stmt, new_temp);
5315 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5316 scalar_results.safe_push (new_temp);
5317 }
5318 else
5319 {
5320 /* Case 3: Create:
5321 s = extract_field <v_out2, 0>
5322 for (offset = element_size;
5323 offset < vector_size;
5324 offset += element_size;)
5325 {
5326 Create: s' = extract_field <v_out2, offset>
5327 Create: s = op <s, s'> // For non SLP cases
5328 } */
5329
5330 if (dump_enabled_p ())
5331 dump_printf_loc (MSG_NOTE, vect_location,
5332 "Reduce using scalar code.\n");
5333
5334 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5335 int element_bitsize = tree_to_uhwi (bitsize);
5336 tree compute_type = TREE_TYPE (vectype);
5337 gimple_seq stmts = NULL;
5338 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5339 {
5340 int bit_offset;
5341 if (gimple_code (new_phi) == GIMPLE_PHI)
5342 vec_temp = PHI_RESULT (new_phi);
5343 else
5344 vec_temp = gimple_assign_lhs (new_phi);
5345 new_temp = gimple_build (&stmts, BIT_FIELD_REF, compute_type,
5346 vec_temp, bitsize, bitsize_zero_node);
5347
5348 /* In SLP we don't need to apply the reduction operation, so we just
5349 collect the s' values in SCALAR_RESULTS. */
5350 if (slp_reduc)
5351 scalar_results.safe_push (new_temp);
5352
5353 for (bit_offset = element_bitsize;
5354 bit_offset < vec_size_in_bits;
5355 bit_offset += element_bitsize)
5356 {
5357 tree bitpos = bitsize_int (bit_offset);
5358 new_name = gimple_build (&stmts, BIT_FIELD_REF,
5359 compute_type, vec_temp,
5360 bitsize, bitpos);
5361 if (slp_reduc)
5362 {
5363 /* In SLP we don't need to apply the reduction operation, so
5364 we just collect the s' values in SCALAR_RESULTS. */
5365 new_temp = new_name;
5366 scalar_results.safe_push (new_name);
5367 }
5368 else
5369 new_temp = gimple_build (&stmts, code, compute_type,
5370 new_name, new_temp);
5371 }
5372 }
5373
5374 /* The only case where we need to reduce scalar results in SLP is
5375 unrolling. If the size of SCALAR_RESULTS is greater than
5376 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5377 REDUC_GROUP_SIZE. */
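 /* For example (a hypothetical unrolled SLP reduction): with
 REDUC_GROUP_SIZE 2 and SCALAR_RESULTS {s0, s1, s2, s3}, the loop below
 computes s0 = s0 CODE s2 and s1 = s1 CODE s3 and keeps the first
 REDUC_GROUP_SIZE entries. */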
5378 if (slp_reduc)
5379 {
5380 tree res, first_res, new_res;
5381
5382 /* Reduce multiple scalar results in case of SLP unrolling. */
5383 for (j = group_size; scalar_results.iterate (j, &res);
5384 j++)
5385 {
5386 first_res = scalar_results[j % group_size];
5387 new_res = gimple_build (&stmts, code, compute_type,
5388 first_res, res);
5389 scalar_results[j % group_size] = new_res;
5390 }
5391 for (k = 0; k < group_size; k++)
5392 scalar_results[k] = gimple_convert (&stmts, scalar_type,
5393 scalar_results[k]);
5394 }
5395 else
5396 {
5397 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5398 new_temp = gimple_convert (&stmts, scalar_type, new_temp);
5399 scalar_results.safe_push (new_temp);
5400 }
5401
5402 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5403 }
5404
5405 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5406 && induc_val)
5407 {
5408 /* Earlier we set the initial value to be a vector of induc_val
5409 values. Check the result and if it is induc_val then replace
5410 with the original initial value, unless induc_val is
5411 the same as initial_def already. */
5412 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5413 induc_val);
5414
5415 tree tmp = make_ssa_name (new_scalar_dest);
5416 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5417 initial_def, new_temp);
5418 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5419 scalar_results[0] = tmp;
5420 }
5421 }
5422
5423 /* 2.5 Adjust the final result by the initial value of the reduction
5424 variable. (When such adjustment is not needed, then
5425 'adjustment_def' is zero). For example, if code is PLUS we create:
5426 new_temp = loop_exit_def + adjustment_def */
5427
5428 if (adjustment_def)
5429 {
5430 gcc_assert (!slp_reduc);
5431 gimple_seq stmts = NULL;
5432 if (nested_in_vect_loop)
5433 {
5434 new_phi = new_phis[0];
5435 gcc_assert (VECTOR_TYPE_P (TREE_TYPE (adjustment_def)));
5436 adjustment_def = gimple_convert (&stmts, vectype, adjustment_def);
5437 new_temp = gimple_build (&stmts, code, vectype,
5438 PHI_RESULT (new_phi), adjustment_def);
5439 }
5440 else
5441 {
5442 new_temp = scalar_results[0];
5443 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5444 adjustment_def = gimple_convert (&stmts, scalar_type, adjustment_def);
5445 new_temp = gimple_build (&stmts, code, scalar_type,
5446 new_temp, adjustment_def);
5447 }
5448
5449 epilog_stmt = gimple_seq_last_stmt (stmts);
5450 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5451 if (nested_in_vect_loop)
5452 {
5453 if (!double_reduc)
5454 scalar_results.quick_push (new_temp);
5455 else
5456 scalar_results[0] = new_temp;
5457 }
5458 else
5459 scalar_results[0] = new_temp;
5460
5461 new_phis[0] = epilog_stmt;
5462 }
5463
5464 if (double_reduc)
5465 loop = loop->inner;
5466
5467 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5468 phis with new adjusted scalar results, i.e., replace use <s_out0>
5469 with use <s_out4>.
5470
5471 Transform:
5472 loop_exit:
5473 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5474 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5475 v_out2 = reduce <v_out1>
5476 s_out3 = extract_field <v_out2, 0>
5477 s_out4 = adjust_result <s_out3>
5478 use <s_out0>
5479 use <s_out0>
5480
5481 into:
5482
5483 loop_exit:
5484 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5485 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5486 v_out2 = reduce <v_out1>
5487 s_out3 = extract_field <v_out2, 0>
5488 s_out4 = adjust_result <s_out3>
5489 use <s_out4>
5490 use <s_out4> */
5491
5492
5493 /* In an SLP reduction chain we reduce the vector results into one vector
5494 if necessary; hence we set REDUC_GROUP_SIZE to 1 here. SCALAR_DEST is
5495 the LHS of the last stmt in the reduction chain, since we are looking
5496 for the loop exit phi node. */
5497 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5498 {
5499 stmt_vec_info dest_stmt_info
5500 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5501 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5502 group_size = 1;
5503 }
5504
5505 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5506 case REDUC_GROUP_SIZE is greater than the vectorization factor).
5507 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5508 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5509 correspond to the first vector stmt, etc.
5510 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
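 /* E.g. with a (hypothetical) REDUC_GROUP_SIZE of 4 and two new vector
 stmts, RATIO is 2: scalar results 0-1 belong to the first vector stmt
 and results 2-3 to the second. */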
5511 if (group_size > new_phis.length ())
5512 gcc_assert (!(group_size % new_phis.length ()));
5513
5514 for (k = 0; k < group_size; k++)
5515 {
5516 if (slp_reduc)
5517 {
5518 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5519
5520 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5521 /* SLP statements can't participate in patterns. */
5522 gcc_assert (!orig_stmt_info);
5523 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5524 }
5525
5526 if (nested_in_vect_loop)
5527 {
5528 if (double_reduc)
5529 loop = outer_loop;
5530 else
5531 gcc_unreachable ();
5532 }
5533
5534 phis.create (3);
5535 /* Find the loop-closed-use at the loop exit of the original scalar
5536 result. (The reduction result is expected to have two immediate uses,
5537 one at the latch block, and one at the loop exit). For double
5538 reductions we are looking for exit phis of the outer loop. */
5539 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5540 {
5541 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5542 {
5543 if (!is_gimple_debug (USE_STMT (use_p)))
5544 phis.safe_push (USE_STMT (use_p));
5545 }
5546 else
5547 {
5548 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5549 {
5550 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5551
5552 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5553 {
5554 if (!flow_bb_inside_loop_p (loop,
5555 gimple_bb (USE_STMT (phi_use_p)))
5556 && !is_gimple_debug (USE_STMT (phi_use_p)))
5557 phis.safe_push (USE_STMT (phi_use_p));
5558 }
5559 }
5560 }
5561 }
5562
5563 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5564 {
5565 /* Replace the uses: */
5566 orig_name = PHI_RESULT (exit_phi);
5567 scalar_result = scalar_results[k];
5568 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5569 {
5570 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5571 SET_USE (use_p, scalar_result);
5572 update_stmt (use_stmt);
5573 }
5574 }
5575
5576 phis.release ();
5577 }
5578 }
5579
5580 /* Return a vector of type VECTYPE that is equal to the vector select
5581 operation "MASK ? VEC : IDENTITY". Insert the select statements
5582 before GSI. */
5583
5584 static tree
5585 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5586 tree vec, tree identity)
5587 {
5588 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5589 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5590 mask, vec, identity);
5591 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5592 return cond;
5593 }
5594
5595 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5596 order, starting with LHS. Insert the extraction statements before GSI and
5597 associate the new scalar SSA names with variable SCALAR_DEST.
5598 Return the SSA name for the result. */
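 /* E.g. for a four-element VECTOR_RHS the emitted gimple computes, in
 effect, (((LHS CODE v[0]) CODE v[1]) CODE v[2]) CODE v[3], using one
 BIT_FIELD_REF extraction and one scalar CODE statement per element
 (a sketch; v[i] denotes element i for illustration only). */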
5599
5600 static tree
5601 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5602 tree_code code, tree lhs, tree vector_rhs)
5603 {
5604 tree vectype = TREE_TYPE (vector_rhs);
5605 tree scalar_type = TREE_TYPE (vectype);
5606 tree bitsize = TYPE_SIZE (scalar_type);
5607 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5608 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5609
5610 for (unsigned HOST_WIDE_INT bit_offset = 0;
5611 bit_offset < vec_size_in_bits;
5612 bit_offset += element_bitsize)
5613 {
5614 tree bitpos = bitsize_int (bit_offset);
5615 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5616 bitsize, bitpos);
5617
5618 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5619 rhs = make_ssa_name (scalar_dest, stmt);
5620 gimple_assign_set_lhs (stmt, rhs);
5621 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5622
5623 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5624 tree new_name = make_ssa_name (scalar_dest, stmt);
5625 gimple_assign_set_lhs (stmt, new_name);
5626 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5627 lhs = new_name;
5628 }
5629 return lhs;
5630 }
5631
5632 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
5633 type of the vector input. */
5634
5635 static internal_fn
5636 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
5637 {
5638 internal_fn mask_reduc_fn;
5639
5640 switch (reduc_fn)
5641 {
5642 case IFN_FOLD_LEFT_PLUS:
5643 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
5644 break;
5645
5646 default:
5647 return IFN_LAST;
5648 }
5649
5650 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
5651 OPTIMIZE_FOR_SPEED))
5652 return mask_reduc_fn;
5653 return IFN_LAST;
5654 }
5655
5656 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5657 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5658 statement. CODE is the operation performed by STMT_INFO and OPS are
5659 its scalar operands. REDUC_INDEX is the index of the operand in
5660 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5661 implements in-order reduction, or IFN_LAST if we should open-code it.
5662 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5663 that should be used to control the operation in a fully-masked loop. */
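 /* A sketch of the emitted code for the common unmasked case with
 REDUC_FN = IFN_FOLD_LEFT_PLUS (names are illustrative only):

 reduc_1 = .FOLD_LEFT_PLUS (reduc_0, vdef_0);
 ...
 scalar_dest = .FOLD_LEFT_PLUS (reduc_n, vdef_n);

 which preserves the scalar, in-order evaluation of the reduction. */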
5664
5665 static bool
5666 vectorize_fold_left_reduction (loop_vec_info loop_vinfo,
5667 stmt_vec_info stmt_info,
5668 gimple_stmt_iterator *gsi,
5669 gimple **vec_stmt, slp_tree slp_node,
5670 gimple *reduc_def_stmt,
5671 tree_code code, internal_fn reduc_fn,
5672 tree ops[3], tree vectype_in,
5673 int reduc_index, vec_loop_masks *masks)
5674 {
5675 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5676 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5677 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
5678
5679 int ncopies;
5680 if (slp_node)
5681 ncopies = 1;
5682 else
5683 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5684
5685 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5686 gcc_assert (ncopies == 1);
5687 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5688
5689 if (slp_node)
5690 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5691 TYPE_VECTOR_SUBPARTS (vectype_in)));
5692
5693 tree op0 = ops[1 - reduc_index];
5694
5695 int group_size = 1;
5696 stmt_vec_info scalar_dest_def_info;
5697 auto_vec<tree> vec_oprnds0;
5698 if (slp_node)
5699 {
5700 auto_vec<vec<tree> > vec_defs (2);
5701 vect_get_slp_defs (loop_vinfo, slp_node, &vec_defs);
5702 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
5703 vec_defs[0].release ();
5704 vec_defs[1].release ();
5705 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5706 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5707 }
5708 else
5709 {
5710 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
5711 op0, &vec_oprnds0);
5712 scalar_dest_def_info = stmt_info;
5713 }
5714
5715 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5716 tree scalar_type = TREE_TYPE (scalar_dest);
5717 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5718
5719 int vec_num = vec_oprnds0.length ();
5720 gcc_assert (vec_num == 1 || slp_node);
5721 tree vec_elem_type = TREE_TYPE (vectype_out);
5722 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5723
5724 tree vector_identity = NULL_TREE;
5725 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5726 vector_identity = build_zero_cst (vectype_out);
5727
5728 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5729 int i;
5730 tree def0;
5731 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5732 {
5733 gimple *new_stmt;
5734 tree mask = NULL_TREE;
5735 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5736 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5737
5738 /* Handle MINUS by adding the negative. */
5739 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5740 {
5741 tree negated = make_ssa_name (vectype_out);
5742 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5743 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5744 def0 = negated;
5745 }
5746
5747 if (mask && mask_reduc_fn == IFN_LAST)
5748 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5749 vector_identity);
5750
5751 /* On the first iteration the input is simply the scalar phi
5752 result, and for subsequent iterations it is the output of
5753 the preceding operation. */
5754 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
5755 {
5756 if (mask && mask_reduc_fn != IFN_LAST)
5757 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
5758 def0, mask);
5759 else
5760 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
5761 def0);
5762 /* For chained SLP reductions the output of the previous reduction
5763 operation serves as the input of the next. For the final statement
5764 the output cannot be a temporary - we reuse the original
5765 scalar destination of the last statement. */
5766 if (i != vec_num - 1)
5767 {
5768 gimple_set_lhs (new_stmt, scalar_dest_var);
5769 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5770 gimple_set_lhs (new_stmt, reduc_var);
5771 }
5772 }
5773 else
5774 {
5775 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5776 reduc_var, def0);
5777 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5778 /* Remove the statement, so that we can use the same code paths
5779 as for statements that we've just created. */
5780 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5781 gsi_remove (&tmp_gsi, true);
5782 }
5783
5784 if (i == vec_num - 1)
5785 {
5786 gimple_set_lhs (new_stmt, scalar_dest);
5787 vect_finish_replace_stmt (loop_vinfo,
5788 scalar_dest_def_info,
5789 new_stmt);
5790 }
5791 else
5792 vect_finish_stmt_generation (loop_vinfo,
5793 scalar_dest_def_info,
5794 new_stmt, gsi);
5795
5796 if (slp_node)
5797 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5798 else
5799 {
5800 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
5801 *vec_stmt = new_stmt;
5802 }
5803 }
5804
5805 return true;
5806 }
5807
5808 /* Function is_nonwrapping_integer_induction.
5809
5810 Check that STMT_VINFO (which is part of loop LOOP) describes an integer
5811 induction that increments and does not cause overflow. */
5812
5813 static bool
5814 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
5815 {
5816 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
5817 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5818 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5819 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
5820 widest_int ni, max_loop_value, lhs_max;
5821 wi::overflow_type overflow = wi::OVF_NONE;
5822
5823 /* Make sure the loop is integer based. */
5824 if (TREE_CODE (base) != INTEGER_CST
5825 || TREE_CODE (step) != INTEGER_CST)
5826 return false;
5827
5828 /* Check that the max size of the loop will not wrap. */
5829
5830 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5831 return true;
5832
5833 if (! max_stmt_executions (loop, &ni))
5834 return false;
5835
5836 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5837 &overflow);
5838 if (overflow)
5839 return false;
5840
5841 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5842 TYPE_SIGN (lhs_type), &overflow);
5843 if (overflow)
5844 return false;
5845
5846 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5847 <= TYPE_PRECISION (lhs_type));
5848 }
5849
5850 /* Check if masking can be supported by inserting a conditional expression.
5851 CODE is the code for the operation. COND_FN is the conditional internal
5852 function, if it exists. VECTYPE_IN is the type of the vector input. */
5853 static bool
5854 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
5855 tree vectype_in)
5856 {
5857 if (cond_fn != IFN_LAST
5858 && direct_internal_fn_supported_p (cond_fn, vectype_in,
5859 OPTIMIZE_FOR_SPEED))
5860 return false;
5861
5862 switch (code)
5863 {
5864 case DOT_PROD_EXPR:
5865 case SAD_EXPR:
5866 return true;
5867
5868 default:
5869 return false;
5870 }
5871 }
5872
5873 /* Insert a conditional expression to enable masked vectorization. CODE is the
5874 code for the operation. VOP is the array of operands. MASK is the loop
5875 mask. GSI is a statement iterator used to place the new conditional
5876 expression. */
5877 static void
5878 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
5879 gimple_stmt_iterator *gsi)
5880 {
5881 switch (code)
5882 {
5883 case DOT_PROD_EXPR:
5884 {
5885 tree vectype = TREE_TYPE (vop[1]);
5886 tree zero = build_zero_cst (vectype);
5887 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
5888 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
5889 mask, vop[1], zero);
5890 gsi_insert_before (gsi, select, GSI_SAME_STMT);
5891 vop[1] = masked_op1;
5892 break;
5893 }
5894
5895 case SAD_EXPR:
5896 {
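 /* Selecting VOP[0] (rather than zero) for the inactive lanes makes
 the absolute difference in those lanes zero, so they contribute
 nothing to the accumulated sum. */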
5897 tree vectype = TREE_TYPE (vop[1]);
5898 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
5899 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
5900 mask, vop[1], vop[0]);
5901 gsi_insert_before (gsi, select, GSI_SAME_STMT);
5902 vop[1] = masked_op1;
5903 break;
5904 }
5905
5906 default:
5907 gcc_unreachable ();
5908 }
5909 }
5910
5911 /* Function vectorizable_reduction.
5912
5913 Check if STMT_INFO performs a reduction operation that can be vectorized.
5914 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
5915 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5916 Return true if STMT_INFO is vectorizable in this way.
5917
5918 This function also handles reduction idioms (patterns) that have been
5919 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
5920 may be of this form:
5921 X = pattern_expr (arg0, arg1, ..., X)
5922 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5923 sequence that had been detected and replaced by the pattern-stmt
5924 (STMT_INFO).
5925
5926 This function also handles reduction of condition expressions, for example:
5927 for (int i = 0; i < N; i++)
5928 if (a[i] < value)
5929 last = a[i];
5930 This is handled by vectorizing the loop and creating an additional vector
5931 containing the loop indexes for which "a[i] < value" was true. In the
5932 function epilogue this is reduced to a single max value and then used to
5933 index into the vector of results.
5934
5935 In some cases of reduction patterns, the type of the reduction variable X is
5936 different than the type of the other arguments of STMT_INFO.
5937 In such cases, the vectype that is used when transforming STMT_INFO into
5938 a vector stmt is different than the vectype that is used to determine the
5939 vectorization factor, because it consists of a different number of elements
5940 than the actual number of elements that are being operated upon in parallel.
5941
5942 For example, consider an accumulation of shorts into an int accumulator.
5943 On some targets it's possible to vectorize this pattern operating on 8
5944 shorts at a time (hence, the vectype for purposes of determining the
5945 vectorization factor should be V8HI); on the other hand, the vectype that
5946 is used to create the vector form is actually V4SI (the type of the result).
5947
5948 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5949 indicates the actual level of parallelism (V8HI in the example), so
5950 that the right vectorization factor is derived. This vectype
5951 corresponds to the type of arguments to the reduction stmt, and should *NOT*
5952 be used to create the vectorized stmt. The right vectype for the vectorized
5953 stmt is obtained from the type of the result X:
5954 get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
5955
5956 This means that, contrary to "regular" reductions (or "regular" stmts in
5957 general), the following equation:
5958 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (vinfo, TREE_TYPE (X))
5959 does *NOT* necessarily hold for reduction patterns. */
5960
5961 bool
5962 vectorizable_reduction (loop_vec_info loop_vinfo,
5963 stmt_vec_info stmt_info, slp_tree slp_node,
5964 slp_instance slp_node_instance,
5965 stmt_vector_for_cost *cost_vec)
5966 {
5967 tree scalar_dest;
5968 tree vectype_in = NULL_TREE;
5969 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5970 enum vect_def_type cond_reduc_dt = vect_unknown_def_type;
5971 stmt_vec_info cond_stmt_vinfo = NULL;
5972 tree scalar_type;
5973 int i;
5974 int ncopies;
5975 bool single_defuse_cycle = false;
5976 bool nested_cycle = false;
5977 bool double_reduc = false;
5978 int vec_num;
5979 tree tem;
5980 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5981 tree cond_reduc_val = NULL_TREE;
5982
5983 /* Make sure it was already recognized as a reduction computation. */
5984 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
5985 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
5986 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
5987 return false;
5988
5989 /* The stmt we store reduction analysis meta on. */
5990 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
5991 reduc_info->is_reduc_info = true;
5992
5993 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5994 {
5995 if (is_a <gphi *> (stmt_info->stmt))
5996 /* Analysis for double-reduction is done on the outer
5997 loop PHI; nested cycles have no further restrictions. */
5998 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
5999 else
6000 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6001 return true;
6002 }
6003
6004 stmt_vec_info orig_stmt_of_analysis = stmt_info;
6005 stmt_vec_info phi_info = stmt_info;
6006 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
6007 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6008 {
6009 if (!is_a <gphi *> (stmt_info->stmt))
6010 {
6011 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6012 return true;
6013 }
6014 if (slp_node)
6015 {
6016 slp_node_instance->reduc_phis = slp_node;
6017 /* ??? We're leaving slp_node to point to the PHIs; we only
6018 need it to get at the number of vector stmts, which wasn't
6019 yet initialized for the instance root. */
6020 }
6021 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6022 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
6023 else /* STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def */
6024 {
6025 use_operand_p use_p;
6026 gimple *use_stmt;
6027 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
6028 &use_p, &use_stmt);
6029 gcc_assert (res);
6030 phi_info = loop_vinfo->lookup_stmt (use_stmt);
6031 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6032 }
6033 }
6034
6035 /* PHIs should not participate in patterns. */
6036 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6037 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6038
6039 /* Verify following REDUC_IDX from the latch def leads us back to the PHI
6040 and compute the reduction chain length. */
6041 tree reduc_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6042 loop_latch_edge (loop));
6043 unsigned reduc_chain_length = 0;
6044 bool only_slp_reduc_chain = true;
6045 stmt_info = NULL;
6046 while (reduc_def != PHI_RESULT (reduc_def_phi))
6047 {
6048 stmt_vec_info def = loop_vinfo->lookup_def (reduc_def);
6049 stmt_vec_info vdef = vect_stmt_to_vectorize (def);
6050 if (STMT_VINFO_REDUC_IDX (vdef) == -1)
6051 {
6052 if (dump_enabled_p ())
6053 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6054 "reduction chain broken by patterns.\n");
6055 return false;
6056 }
6057 if (!REDUC_GROUP_FIRST_ELEMENT (vdef))
6058 only_slp_reduc_chain = false;
6059 /* ??? For epilogue generation live members of the chain need
6060 to point back to the PHI via their original stmt for
6061 info_for_reduction to work. */
6062 if (STMT_VINFO_LIVE_P (vdef))
6063 STMT_VINFO_REDUC_DEF (def) = phi_info;
6064 gassign *assign = dyn_cast <gassign *> (vdef->stmt);
6065 if (!assign)
6066 {
6067 if (dump_enabled_p ())
6068 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6069 "reduction chain includes calls.\n");
6070 return false;
6071 }
6072 if (CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
6073 {
6074 if (!tree_nop_conversion_p (TREE_TYPE (gimple_assign_lhs (assign)),
6075 TREE_TYPE (gimple_assign_rhs1 (assign))))
6076 {
6077 if (dump_enabled_p ())
6078 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6079 "conversion in the reduction chain.\n");
6080 return false;
6081 }
6082 }
6083 else if (!stmt_info)
6084 /* First non-conversion stmt. */
6085 stmt_info = vdef;
6086 reduc_def = gimple_op (vdef->stmt, 1 + STMT_VINFO_REDUC_IDX (vdef));
6087 reduc_chain_length++;
6088 }
6089 /* PHIs should not participate in patterns. */
6090 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
6091
6092 if (nested_in_vect_loop_p (loop, stmt_info))
6093 {
6094 loop = loop->inner;
6095 nested_cycle = true;
6096 }
6097
6098 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
6099 element. */
6100 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6101 {
6102 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
6103 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
6104 }
6105 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6106 gcc_assert (slp_node
6107 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6108
6109 /* 1. Is vectorizable reduction? */
6110 /* Not supportable if the reduction variable is used in the loop, unless
6111 it's a reduction chain. */
6112 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6113 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6114 return false;
6115
6116 /* Reductions that are not used even in an enclosing outer-loop
6117 are expected to be "live" (used out of the loop). */
6118 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6119 && !STMT_VINFO_LIVE_P (stmt_info))
6120 return false;
6121
6122 /* 2. Has this been recognized as a reduction pattern?
6123
6124 Check if STMT represents a pattern that has been recognized
6125 in earlier analysis stages. For stmts that represent a pattern,
6126 the STMT_VINFO_RELATED_STMT field records the last stmt in
6127 the original sequence that constitutes the pattern. */
6128
6129 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6130 if (orig_stmt_info)
6131 {
6132 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6133 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6134 }
6135
6136 /* 3. Check the operands of the operation. The first operands are defined
6137 inside the loop body. The last operand is the reduction variable,
6138 which is defined by the loop-header-phi. */
6139
6140 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6141 STMT_VINFO_REDUC_VECTYPE (reduc_info) = vectype_out;
6142 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6143 enum tree_code code = gimple_assign_rhs_code (stmt);
6144 bool lane_reduc_code_p
6145 = (code == DOT_PROD_EXPR || code == WIDEN_SUM_EXPR || code == SAD_EXPR);
6146 int op_type = TREE_CODE_LENGTH (code);
6147
6148 scalar_dest = gimple_assign_lhs (stmt);
6149 scalar_type = TREE_TYPE (scalar_dest);
6150 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6151 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6152 return false;
6153
6154 /* Do not try to vectorize bit-precision reductions. */
6155 if (!type_has_mode_precision_p (scalar_type))
6156 return false;
6157
6158 /* For lane-reducing ops we're reducing the number of reduction PHIs,
6159 which means their only use may be in the lane-reducing operation. */
6160 if (lane_reduc_code_p
6161 && reduc_chain_length != 1
6162 && !only_slp_reduc_chain)
6163 {
6164 if (dump_enabled_p ())
6165 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6166 "lane-reducing reduction with extra stmts.\n");
6167 return false;
6168 }
6169
6170 /* All uses but the last are expected to be defined in the loop.
6171 The last use is the reduction variable. In case of nested cycle this
6172 assumption is not true: we use reduc_index to record the index of the
6173 reduction variable. */
6174 /* ??? To get at invariant/constant uses on the SLP node we have to
6175 get to it here, slp_node is still the reduction PHI. */
6176 slp_tree slp_for_stmt_info = NULL;
6177 if (slp_node)
6178 {
6179 slp_for_stmt_info = slp_node_instance->root;
6180 /* And then there's reduction chain with a conversion ... */
6181 if (SLP_TREE_REPRESENTATIVE (slp_for_stmt_info) != stmt_info)
6182 slp_for_stmt_info = SLP_TREE_CHILDREN (slp_for_stmt_info)[0];
6183 gcc_assert (SLP_TREE_REPRESENTATIVE (slp_for_stmt_info) == stmt_info);
6184 }
6185 slp_tree *slp_op = XALLOCAVEC (slp_tree, op_type);
6186 /* We need to skip an extra operand for COND_EXPRs with embedded
6187 comparison. */
6188 unsigned opno_adjust = 0;
6189 if (code == COND_EXPR
6190 && COMPARISON_CLASS_P (gimple_assign_rhs1 (stmt)))
6191 opno_adjust = 1;
6192 for (i = 0; i < op_type; i++)
6193 {
6194 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6195 if (i == 0 && code == COND_EXPR)
6196 continue;
6197
6198 stmt_vec_info def_stmt_info;
6199 enum vect_def_type dt;
6200 tree op;
6201 if (!vect_is_simple_use (loop_vinfo, stmt_info, slp_for_stmt_info,
6202 i + opno_adjust, &op, &slp_op[i], &dt, &tem,
6203 &def_stmt_info))
6204 {
6205 if (dump_enabled_p ())
6206 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6207 "use not simple.\n");
6208 return false;
6209 }
6210 if (i == STMT_VINFO_REDUC_IDX (stmt_info))
6211 continue;
6212
6213 /* There should be only one cycle def in the stmt, the one
6214 leading to reduc_def. */
6215 if (VECTORIZABLE_CYCLE_DEF (dt))
6216 return false;
6217
6218 /* To properly compute ncopies we are interested in the widest
6219 non-reduction input type in case we're looking at a widening
6220 accumulation that we later handle in vect_transform_reduction. */
6221 if (lane_reduc_code_p
6222 && tem
6223 && (!vectype_in
6224 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6225 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem))))))
6226 vectype_in = tem;
6227
6228 if (code == COND_EXPR)
6229 {
6230 /* Record how the non-reduction-def value of COND_EXPR is defined. */
6231 if (dt == vect_constant_def)
6232 {
6233 cond_reduc_dt = dt;
6234 cond_reduc_val = op;
6235 }
6236 if (dt == vect_induction_def
6237 && def_stmt_info
6238 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6239 {
6240 cond_reduc_dt = dt;
6241 cond_stmt_vinfo = def_stmt_info;
6242 }
6243 }
6244 }
6245 if (!vectype_in)
6246 vectype_in = STMT_VINFO_VECTYPE (phi_info);
6247 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
6248
6249 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
6250 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
6251 /* If we have a condition reduction, see if we can simplify it further. */
6252 if (v_reduc_type == COND_REDUCTION)
6253 {
6254 if (slp_node)
6255 return false;
6256
6257 /* If the reduction value is used in the condition itself, fail. */
6258 if (STMT_VINFO_REDUC_IDX (stmt_info) == 0)
6259 {
6260 if (dump_enabled_p ())
6261 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6262 "condition depends on previous iteration\n");
6263 return false;
6264 }
6265
6266 if (reduc_chain_length == 1
6267 && direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6268 vectype_in, OPTIMIZE_FOR_SPEED))
6269 {
6270 if (dump_enabled_p ())
6271 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6272 "optimizing condition reduction with"
6273 " FOLD_EXTRACT_LAST.\n");
6274 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
6275 }
6276 else if (cond_reduc_dt == vect_induction_def)
6277 {
6278 tree base
6279 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6280 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6281
6282 gcc_assert (TREE_CODE (base) == INTEGER_CST
6283 && TREE_CODE (step) == INTEGER_CST);
6284 cond_reduc_val = NULL_TREE;
6285 enum tree_code cond_reduc_op_code = ERROR_MARK;
6286 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
6287 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
6288 ;
6289 /* Find a suitable value: below BASE for MAX_EXPR, above BASE for
6290 MIN_EXPR; punt for now if BASE is the minimum value of the type
6291 for MAX_EXPR or the maximum value of the type for MIN_EXPR. */
6292 else if (tree_int_cst_sgn (step) == -1)
6293 {
6294 cond_reduc_op_code = MIN_EXPR;
6295 if (tree_int_cst_sgn (base) == -1)
6296 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6297 else if (tree_int_cst_lt (base,
6298 TYPE_MAX_VALUE (TREE_TYPE (base))))
6299 cond_reduc_val
6300 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6301 }
6302 else
6303 {
6304 cond_reduc_op_code = MAX_EXPR;
6305 if (tree_int_cst_sgn (base) == 1)
6306 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6307 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6308 base))
6309 cond_reduc_val
6310 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6311 }
6312 if (cond_reduc_val)
6313 {
6314 if (dump_enabled_p ())
6315 dump_printf_loc (MSG_NOTE, vect_location,
6316 "condition expression based on "
6317 "integer induction.\n");
6318 STMT_VINFO_REDUC_CODE (reduc_info) = cond_reduc_op_code;
6319 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
6320 = cond_reduc_val;
6321 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
6322 }
6323 }
6324 else if (cond_reduc_dt == vect_constant_def)
6325 {
6326 enum vect_def_type cond_initial_dt;
6327 tree cond_initial_val
6328 = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi, loop_preheader_edge (loop));
6329
6330 gcc_assert (cond_reduc_val != NULL_TREE);
6331 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6332 if (cond_initial_dt == vect_constant_def
6333 && types_compatible_p (TREE_TYPE (cond_initial_val),
6334 TREE_TYPE (cond_reduc_val)))
6335 {
6336 tree e = fold_binary (LE_EXPR, boolean_type_node,
6337 cond_initial_val, cond_reduc_val);
6338 if (e && (integer_onep (e) || integer_zerop (e)))
6339 {
6340 if (dump_enabled_p ())
6341 dump_printf_loc (MSG_NOTE, vect_location,
6342 "condition expression based on "
6343 "compile time constant.\n");
6344 /* Record reduction code at analysis stage. */
6345 STMT_VINFO_REDUC_CODE (reduc_info)
6346 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6347 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6348 }
6349 }
6350 }
6351 }
6352
6353 if (STMT_VINFO_LIVE_P (phi_info))
6354 return false;
6355
6356 if (slp_node)
6357 ncopies = 1;
6358 else
6359 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6360
6361 gcc_assert (ncopies >= 1);
6362
6363 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6364
6365 if (nested_cycle)
6366 {
6367 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6368 == vect_double_reduction_def);
6369 double_reduc = true;
6370 }
6371
6372 /* 4.2. Check support for the epilog operation.
6373
6374 If STMT represents a reduction pattern, then the type of the
6375 reduction variable may be different than the type of the rest
6376 of the arguments. For example, consider the case of accumulation
6377 of shorts into an int accumulator; the original code:
6378 S1: int_a = (int) short_a;
6379 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6380
6381 was replaced with:
6382 STMT: int_acc = widen_sum <short_a, int_acc>
6383
6384 This means that:
6385 1. The tree-code that is used to create the vector operation in the
6386 epilog code (that reduces the partial results) is not the
6387 tree-code of STMT, but is rather the tree-code of the original
6388 stmt from the pattern that STMT is replacing. I.e, in the example
6389 above we want to use 'widen_sum' in the loop, but 'plus' in the
6390 epilog.
6391 2. The type (mode) we use to check available target support
6392 for the vector operation to be created in the *epilog*, is
6393 determined by the type of the reduction variable (in the example
6394 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6395 However the type (mode) we use to check available target support
6396 for the vector operation to be created *inside the loop*, is
6397 determined by the type of the other arguments to STMT (in the
6398 example we'd check this: optab_handler (widen_sum_optab,
6399 vect_short_mode)).
6400
6401 This is contrary to "regular" reductions, in which the types of all
6402 the arguments are the same as the type of the reduction variable.
6403 For "regular" reductions we can therefore use the same vector type
6404 (and also the same tree-code) when generating the epilog code and
6405 when generating the code inside the loop. */
6406
6407 enum tree_code orig_code = STMT_VINFO_REDUC_CODE (phi_info);
6408 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6409
6410 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6411 if (reduction_type == TREE_CODE_REDUCTION)
6412 {
6413 /* Check whether it's ok to change the order of the computation.
6414 Generally, when vectorizing a reduction we change the order of the
6415 computation. This may change the behavior of the program in some
6416 cases, so we need to check that this is ok. One exception is when
6417 vectorizing an outer-loop: the inner-loop is executed sequentially,
6418 and therefore vectorizing reductions in the inner-loop during
6419 outer-loop vectorization is safe. */
6420 if (needs_fold_left_reduction_p (scalar_type, orig_code))
6421 {
6422 /* When vectorizing a reduction chain w/o SLP the reduction PHI
6423 is not directly used in the stmt. */
6424 if (!only_slp_reduc_chain
6425 && reduc_chain_length != 1)
6426 {
6427 if (dump_enabled_p ())
6428 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6429 "in-order reduction chain without SLP.\n");
6430 return false;
6431 }
6432 STMT_VINFO_REDUC_TYPE (reduc_info)
6433 = reduction_type = FOLD_LEFT_REDUCTION;
6434 }
6435 else if (!commutative_tree_code (orig_code)
6436 || !associative_tree_code (orig_code))
6437 {
6438 if (dump_enabled_p ())
6439 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6440 "reduction: not commutative/associative");
6441 return false;
6442 }
6443 }
6444
6445 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6446 && ncopies > 1)
6447 {
6448 if (dump_enabled_p ())
6449 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6450 "multiple types in double reduction or condition "
6451 "reduction or fold-left reduction.\n");
6452 return false;
6453 }
6454
6455 internal_fn reduc_fn = IFN_LAST;
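 /* Pick the internal function that performs the epilogue reduction,
 e.g. IFN_REDUC_PLUS for a PLUS_EXPR reduction, or IFN_FOLD_LEFT_PLUS
 for an in-order (fold-left) one. */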
6456 if (reduction_type == TREE_CODE_REDUCTION
6457 || reduction_type == FOLD_LEFT_REDUCTION
6458 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6459 || reduction_type == CONST_COND_REDUCTION)
6460 {
6461 if (reduction_type == FOLD_LEFT_REDUCTION
6462 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6463 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6464 {
6465 if (reduc_fn != IFN_LAST
6466 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6467 OPTIMIZE_FOR_SPEED))
6468 {
6469 if (dump_enabled_p ())
6470 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6471 "reduc op not supported by target.\n");
6472
6473 reduc_fn = IFN_LAST;
6474 }
6475 }
6476 else
6477 {
6478 if (!nested_cycle || double_reduc)
6479 {
6480 if (dump_enabled_p ())
6481 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6482 "no reduc code for scalar code.\n");
6483
6484 return false;
6485 }
6486 }
6487 }
6488 else if (reduction_type == COND_REDUCTION)
6489 {
6490 int scalar_precision
6491 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6492 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6493 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6494 nunits_out);
6495
6496 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6497 OPTIMIZE_FOR_SPEED))
6498 reduc_fn = IFN_REDUC_MAX;
6499 }
6500 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
6501
6502 if (reduction_type != EXTRACT_LAST_REDUCTION
6503 && (!nested_cycle || double_reduc)
6504 && reduc_fn == IFN_LAST
6505 && !nunits_out.is_constant ())
6506 {
6507 if (dump_enabled_p ())
6508 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6509 "missing target support for reduction on"
6510 " variable-length vectors.\n");
6511 return false;
6512 }
6513
6514 /* For SLP reductions, see if there is a neutral value we can use. */
6515 tree neutral_op = NULL_TREE;
6516 if (slp_node)
6517 neutral_op = neutral_op_for_slp_reduction
6518 (slp_node_instance->reduc_phis, vectype_out, orig_code,
6519 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6520
6521 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6522 {
6523 /* We can't support in-order reductions of code such as this:
6524
6525 for (int i = 0; i < n1; ++i)
6526 for (int j = 0; j < n2; ++j)
6527 l += a[j];
6528
6529 since GCC effectively transforms the loop when vectorizing:
6530
6531 for (int i = 0; i < n1 / VF; ++i)
6532 for (int j = 0; j < n2; ++j)
6533 for (int k = 0; k < VF; ++k)
6534 l += a[j];
6535
6536 which is a reassociation of the original operation. */
6537 if (dump_enabled_p ())
6538 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6539 "in-order double reduction not supported.\n");
6540
6541 return false;
6542 }
6543
6544 if (reduction_type == FOLD_LEFT_REDUCTION
6545 && slp_node
6546 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6547 {
6548 /* We cannot use in-order reductions in this case because there is
6549 an implicit reassociation of the operations involved. */
6550 if (dump_enabled_p ())
6551 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6552 "in-order unchained SLP reductions not supported.\n");
6553 return false;
6554 }
6555
6556 /* For double reductions, and for SLP reductions with a neutral value,
6557 we construct a variable-length initial vector by loading a vector
6558 full of the neutral value and then shift-and-inserting the start
6559 values into the low-numbered elements. */
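 /* E.g. for a sum with start value INIT this builds the vector
 { INIT, 0, ..., 0 }: a vector of the neutral value 0 with INIT
 shifted into element 0. */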
6560 if ((double_reduc || neutral_op)
6561 && !nunits_out.is_constant ()
6562 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6563 vectype_out, OPTIMIZE_FOR_SPEED))
6564 {
6565 if (dump_enabled_p ())
6566 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6567 "reduction on variable-length vectors requires"
6568 " target support for a vector-shift-and-insert"
6569 " operation.\n");
6570 return false;
6571 }
6572
6573 /* Check extra constraints for variable-length unchained SLP reductions. */
6574 if (STMT_SLP_TYPE (stmt_info)
6575 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6576 && !nunits_out.is_constant ())
6577 {
6578 /* We checked above that we could build the initial vector when
6579 there's a neutral element value. Check here for the case in
6580 which each SLP statement has its own initial value and in which
6581 that value needs to be repeated for every instance of the
6582 statement within the initial vector. */
6583 unsigned int group_size = SLP_TREE_LANES (slp_node);
6584 if (!neutral_op
6585 && !can_duplicate_and_interleave_p (loop_vinfo, group_size,
6586 TREE_TYPE (vectype_out)))
6587 {
6588 if (dump_enabled_p ())
6589 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6590 "unsupported form of SLP reduction for"
6591 " variable-length vectors: cannot build"
6592 " initial vector.\n");
6593 return false;
6594 }
6595 /* The epilogue code relies on the number of elements being a multiple
6596 of the group size. The duplicate-and-interleave approach to setting
6597 up the initial vector does too. */
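 /* E.g. for a group of two reduction statements the lanes are laid
 out as r0, r1, r0, r1, ... so the number of vector elements must
 be even for each vector to hold complete pairs. */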
6598 if (!multiple_p (nunits_out, group_size))
6599 {
6600 if (dump_enabled_p ())
6601 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6602 "unsupported form of SLP reduction for"
6603 " variable-length vectors: the vector size"
6604 " is not a multiple of the number of results.\n");
6605 return false;
6606 }
6607 }
6608
6609 if (reduction_type == COND_REDUCTION)
6610 {
6611 widest_int ni;
6612
6613 if (! max_loop_iterations (loop, &ni))
6614 {
6615 if (dump_enabled_p ())
6616 dump_printf_loc (MSG_NOTE, vect_location,
6617 "loop count not known, cannot create cond "
6618 "reduction.\n");
6619 return false;
6620 }
6621 /* Convert backedges to iterations. */
6622 ni += 1;
6623
6624 /* The additional index will be the same type as the condition. Check
6625 that the loop can fit into this less one (because we'll use up the
6626 zero slot for when there are no matches). */
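 /* E.g. with an 8-bit index type at most 254 iterations can be
 handled: value 0 is reserved for "no match", so NI must be
 strictly below the maximum index value of 255. */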
6627 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6628 if (wi::geu_p (ni, wi::to_widest (max_index)))
6629 {
6630 if (dump_enabled_p ())
6631 dump_printf_loc (MSG_NOTE, vect_location,
6632 "loop size is greater than data size.\n");
6633 return false;
6634 }
6635 }
6636
6637 /* In case the vectorization factor (VF) is bigger than the number
6638 of elements that we can fit in a vectype (nunits), we have to generate
6639 more than one vector stmt - i.e - we need to "unroll" the
6640 vector stmt by a factor VF/nunits. For more details see documentation
6641 in vectorizable_operation. */
6642
6643 /* If the reduction is used in an outer loop we need to generate
6644 VF intermediate results, like so (e.g. for ncopies=2):
6645 r0 = phi (init, r0)
6646 r1 = phi (init, r1)
6647 r0 = x0 + r0;
6648 r1 = x1 + r1;
6649 (i.e. we generate VF results in 2 registers).
6650 In this case we have a separate def-use cycle for each copy, and therefore
6651 for each copy we get the vector def for the reduction variable from the
6652 respective phi node created for this copy.
6653
6654 Otherwise (the reduction is unused in the loop nest), we can combine
6655 together intermediate results, like so (e.g. for ncopies=2):
6656 r = phi (init, r)
6657 r = x0 + r;
6658 r = x1 + r;
6659 (i.e. we generate VF/2 results in a single register).
6660 In this case for each copy we get the vector def for the reduction variable
6661 from the vectorized reduction operation generated in the previous iteration.
6662
6663 This only works when we see both the reduction PHI and its only consumer
6664 in vectorizable_reduction and there are no intermediate stmts
6665 participating. */
6666 if (ncopies > 1
6667 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6668 && reduc_chain_length == 1)
6669 single_defuse_cycle = true;
6670
6671 if (single_defuse_cycle || lane_reduc_code_p)
6672 {
6673 gcc_assert (code != COND_EXPR);
6674
6675 /* 4. Supportable by target? */
6676 bool ok = true;
6677
6678 /* 4.1. check support for the operation in the loop */
6679 optab optab = optab_for_tree_code (code, vectype_in, optab_vector);
6680 if (!optab)
6681 {
6682 if (dump_enabled_p ())
6683 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6684 "no optab.\n");
6685 ok = false;
6686 }
6687
6688 machine_mode vec_mode = TYPE_MODE (vectype_in);
6689 if (ok && optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6690 {
6691 if (dump_enabled_p ())
6692 dump_printf (MSG_NOTE, "op not supported by target.\n");
6693 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6694 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6695 ok = false;
6696 else
6697 if (dump_enabled_p ())
6698 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6699 }
6700
6701 /* Worthwhile without SIMD support? */
6702 if (ok
6703 && !VECTOR_MODE_P (TYPE_MODE (vectype_in))
6704 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6705 {
6706 if (dump_enabled_p ())
6707 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6708 "not worthwhile without SIMD support.\n");
6709 ok = false;
6710 }
6711
6712 /* Lane-reducing operations have to go through vect_transform_reduction.
6713 For the other cases try without the single cycle optimization. */
6714 if (!ok)
6715 {
6716 if (lane_reduc_code_p)
6717 return false;
6718 else
6719 single_defuse_cycle = false;
6720 }
6721 }
6722 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle;
6723
6724 /* If the reduction stmt is one of the patterns that have lane
6725 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
6726 if ((ncopies > 1 && ! single_defuse_cycle)
6727 && lane_reduc_code_p)
6728 {
6729 if (dump_enabled_p ())
6730 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6731 "multi def-use cycle not possible for lane-reducing "
6732 "reduction operation\n");
6733 return false;
6734 }
6735
6736 if (slp_node
6737 && !(!single_defuse_cycle
6738 && code != DOT_PROD_EXPR
6739 && code != WIDEN_SUM_EXPR
6740 && code != SAD_EXPR
6741 && reduction_type != FOLD_LEFT_REDUCTION))
6742 for (i = 0; i < op_type; i++)
6743 if (!vect_maybe_update_slp_op_vectype (slp_op[i], vectype_in))
6744 {
6745 if (dump_enabled_p ())
6746 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6747 "incompatible vector types for invariants\n");
6748 return false;
6749 }
6750
6751 if (slp_node)
6752 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6753 else
6754 vec_num = 1;
6755
6756 vect_model_reduction_cost (loop_vinfo, stmt_info, reduc_fn,
6757 reduction_type, ncopies, cost_vec);
6758 if (dump_enabled_p ()
6759 && reduction_type == FOLD_LEFT_REDUCTION)
6760 dump_printf_loc (MSG_NOTE, vect_location,
6761 "using an in-order (fold-left) reduction.\n");
6762 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
6763 /* All but single def-use cycle optimized, lane-reducing and fold-left
6764 reductions go through their own vectorizable_* routines. */
6765 if (!single_defuse_cycle
6766 && code != DOT_PROD_EXPR
6767 && code != WIDEN_SUM_EXPR
6768 && code != SAD_EXPR
6769 && reduction_type != FOLD_LEFT_REDUCTION)
6770 {
6771 stmt_vec_info tem
6772 = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (phi_info));
6773 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (tem))
6774 {
6775 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (tem));
6776 tem = REDUC_GROUP_FIRST_ELEMENT (tem);
6777 }
6778 STMT_VINFO_DEF_TYPE (vect_orig_stmt (tem)) = vect_internal_def;
6779 STMT_VINFO_DEF_TYPE (tem) = vect_internal_def;
6780 }
6781 else if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
6782 {
6783 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6784 internal_fn cond_fn = get_conditional_internal_fn (code);
6785
6786 if (reduction_type != FOLD_LEFT_REDUCTION
6787 && !use_mask_by_cond_expr_p (code, cond_fn, vectype_in)
6788 && (cond_fn == IFN_LAST
6789 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6790 OPTIMIZE_FOR_SPEED)))
6791 {
6792 if (dump_enabled_p ())
6793 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6794 "can't use a fully-masked loop because no"
6795 " conditional operation is available.\n");
6796 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6797 }
6798 else if (reduction_type == FOLD_LEFT_REDUCTION
6799 && reduc_fn == IFN_LAST
6800 && !expand_vec_cond_expr_p (vectype_in,
6801 truth_type_for (vectype_in),
6802 SSA_NAME))
6803 {
6804 if (dump_enabled_p ())
6805 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6806 "can't use a fully-masked loop because no"
6807 " conditional operation is available.\n");
6808 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6809 }
6810 else
6811 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
6812 vectype_in, NULL);
6813 }
6814 return true;
6815 }
6816
6817 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
6818 value. */
6819
6820 bool
6821 vect_transform_reduction (loop_vec_info loop_vinfo,
6822 stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
6823 gimple **vec_stmt, slp_tree slp_node)
6824 {
6825 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6826 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6827 int i;
6828 int ncopies;
6829 int vec_num;
6830
6831 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
6832 gcc_assert (reduc_info->is_reduc_info);
6833
6834 if (nested_in_vect_loop_p (loop, stmt_info))
6835 {
6836 loop = loop->inner;
6837 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
6838 }
6839
6840 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6841 enum tree_code code = gimple_assign_rhs_code (stmt);
6842 int op_type = TREE_CODE_LENGTH (code);
6843
6844 /* Flatten RHS. */
6845 tree ops[3];
6846 switch (get_gimple_rhs_class (code))
6847 {
6848 case GIMPLE_TERNARY_RHS:
6849 ops[2] = gimple_assign_rhs3 (stmt);
6850 /* Fall thru. */
6851 case GIMPLE_BINARY_RHS:
6852 ops[0] = gimple_assign_rhs1 (stmt);
6853 ops[1] = gimple_assign_rhs2 (stmt);
6854 break;
6855 default:
6856 gcc_unreachable ();
6857 }
6858
6859 /* All uses but the last are expected to be defined in the loop.
6860 The last use is the reduction variable. In case of a nested cycle this
6861 assumption is not true: we use reduc_index to record the index of the
6862 reduction variable. */
6863 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
6864 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6865 int reduc_index = STMT_VINFO_REDUC_IDX (stmt_info);
6866 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
6867
6868 if (slp_node)
6869 {
6870 ncopies = 1;
6871 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6872 }
6873 else
6874 {
6875 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6876 vec_num = 1;
6877 }
6878
6879 internal_fn cond_fn = get_conditional_internal_fn (code);
6880 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6881 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
6882
6883 /* Transform. */
6884 tree new_temp = NULL_TREE;
6885 auto_vec<tree> vec_oprnds0;
6886 auto_vec<tree> vec_oprnds1;
6887 auto_vec<tree> vec_oprnds2;
6888 tree def0;
6889
6890 if (dump_enabled_p ())
6891 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6892
6893 /* FORNOW: Multiple types are not supported for condition. */
6894 if (code == COND_EXPR)
6895 gcc_assert (ncopies == 1);
6896
6897 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
6898
6899 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6900 if (reduction_type == FOLD_LEFT_REDUCTION)
6901 {
6902 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
6903 return vectorize_fold_left_reduction
6904 (loop_vinfo, stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
6905 reduc_fn, ops, vectype_in, reduc_index, masks);
6906 }
6907
6908 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
6909 gcc_assert (single_defuse_cycle
6910 || code == DOT_PROD_EXPR
6911 || code == WIDEN_SUM_EXPR
6912 || code == SAD_EXPR);
6913
6914 /* Create the destination vector */
6915 tree scalar_dest = gimple_assign_lhs (stmt);
6916 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6917
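 /* Gather the vector defs for all operands; with the single def-use
 cycle optimization the reduction operand is skipped here and
 seeded from its PHI below. */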
6918 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node, ncopies,
6919 single_defuse_cycle && reduc_index == 0
6920 ? NULL_TREE : ops[0], &vec_oprnds0,
6921 single_defuse_cycle && reduc_index == 1
6922 ? NULL_TREE : ops[1], &vec_oprnds1,
6923 op_type == ternary_op
6924 && !(single_defuse_cycle && reduc_index == 2)
6925 ? ops[2] : NULL_TREE, &vec_oprnds2);
6926 if (single_defuse_cycle)
6927 {
6928 gcc_assert (!slp_node);
6929 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
6930 ops[reduc_index],
6931 reduc_index == 0 ? &vec_oprnds0
6932 : (reduc_index == 1 ? &vec_oprnds1
6933 : &vec_oprnds2));
6934 }
6935
6936 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6937 {
6938 gimple *new_stmt;
6939 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
6940 if (masked_loop_p && !mask_by_cond_expr)
6941 {
6942 /* Make sure that the reduction accumulator is vop[0]. */
6943 if (reduc_index == 1)
6944 {
6945 gcc_assert (commutative_tree_code (code));
6946 std::swap (vop[0], vop[1]);
6947 }
6948 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
6949 vectype_in, i);
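 /* Emit COND_<OP> <MASK, ACC, VOP1, ACC> so that inactive lanes
 simply keep the accumulator value. */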
6950 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
6951 vop[0], vop[1], vop[0]);
6952 new_temp = make_ssa_name (vec_dest, call);
6953 gimple_call_set_lhs (call, new_temp);
6954 gimple_call_set_nothrow (call, true);
6955 vect_finish_stmt_generation (loop_vinfo, stmt_info, call, gsi);
6956 new_stmt = call;
6957 }
6958 else
6959 {
6960 if (op_type == ternary_op)
6961 vop[2] = vec_oprnds2[i];
6962
6963 if (masked_loop_p && mask_by_cond_expr)
6964 {
6965 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
6966 vectype_in, i);
6967 build_vect_cond_expr (code, vop, mask, gsi);
6968 }
6969
6970 new_stmt = gimple_build_assign (vec_dest, code,
6971 vop[0], vop[1], vop[2]);
6972 new_temp = make_ssa_name (vec_dest, new_stmt);
6973 gimple_assign_set_lhs (new_stmt, new_temp);
6974 vect_finish_stmt_generation (loop_vinfo, stmt_info, new_stmt, gsi);
6975 }
6976
6977 if (slp_node)
6978 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6979 else if (single_defuse_cycle
6980 && i < ncopies - 1)
6981 {
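 /* With the single def-use cycle optimization feed the result of
 this copy back in as the reduction operand of the next copy. */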
6982 if (reduc_index == 0)
6983 vec_oprnds0.safe_push (gimple_get_lhs (new_stmt));
6984 else if (reduc_index == 1)
6985 vec_oprnds1.safe_push (gimple_get_lhs (new_stmt));
6986 else if (reduc_index == 2)
6987 vec_oprnds2.safe_push (gimple_get_lhs (new_stmt));
6988 }
6989 else
6990 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
6991 }
6992
6993 if (!slp_node)
6994 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
6995
6996 return true;
6997 }
6998
6999 /* Transform phase of a cycle PHI. */
7000
7001 bool
7002 vect_transform_cycle_phi (loop_vec_info loop_vinfo,
7003 stmt_vec_info stmt_info, gimple **vec_stmt,
7004 slp_tree slp_node, slp_instance slp_node_instance)
7005 {
7006 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
7007 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7008 int i;
7009 int ncopies;
7010 int j;
7011 bool nested_cycle = false;
7012 int vec_num;
7013
7014 if (nested_in_vect_loop_p (loop, stmt_info))
7015 {
7016 loop = loop->inner;
7017 nested_cycle = true;
7018 }
7019
7020 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
7021 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
7022 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7023 gcc_assert (reduc_info->is_reduc_info);
7024
7025 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
7026 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
7027 /* Leave the scalar phi in place. */
7028 return true;
7029
7030 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
7031 /* For a nested cycle we do not fill the above. */
7032 if (!vectype_in)
7033 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
7034 gcc_assert (vectype_in);
7035
7036 if (slp_node)
7037 {
7038 /* The size vect_schedule_slp_instance computes is off for us. */
7039 vec_num = vect_get_num_vectors (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7040 * SLP_TREE_LANES (slp_node), vectype_in);
7041 ncopies = 1;
7042 }
7043 else
7044 {
7045 vec_num = 1;
7046 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
7047 }
7048
7049 /* Check whether we should use a single PHI node and accumulate
7050 vectors to one before the backedge. */
7051 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
7052 ncopies = 1;
7053
7054 /* Create the destination vector */
7055 gphi *phi = as_a <gphi *> (stmt_info->stmt);
7056 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
7057 vectype_out);
7058
7059 /* Get the loop-entry arguments. */
7060 tree vec_initial_def;
7061 auto_vec<tree> vec_initial_defs;
7062 if (slp_node)
7063 {
7064 vec_initial_defs.reserve (vec_num);
7065 gcc_assert (slp_node == slp_node_instance->reduc_phis);
7066 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
7067 tree neutral_op
7068 = neutral_op_for_slp_reduction (slp_node, vectype_out,
7069 STMT_VINFO_REDUC_CODE (reduc_info),
7070 first != NULL);
7071 get_initial_defs_for_reduction (loop_vinfo, slp_node_instance->reduc_phis,
7072 &vec_initial_defs, vec_num,
7073 first != NULL, neutral_op);
7074 }
7075 else
7076 {
7077 /* Get at the scalar def before the loop, that defines the initial
7078 value of the reduction variable. */
7079 tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi,
7080 loop_preheader_edge (loop));
7081 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
7082 and we can't use zero for induc_val, use initial_def. Similarly
7083 for REDUC_MIN and initial_def larger than the base. */
7084 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
7085 {
7086 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
7087 if (TREE_CODE (initial_def) == INTEGER_CST
7088 && !integer_zerop (induc_val)
7089 && ((STMT_VINFO_REDUC_CODE (reduc_info) == MAX_EXPR
7090 && tree_int_cst_lt (initial_def, induc_val))
7091 || (STMT_VINFO_REDUC_CODE (reduc_info) == MIN_EXPR
7092 && tree_int_cst_lt (induc_val, initial_def))))
7093 {
7094 induc_val = initial_def;
7095 /* Communicate we used the initial_def to epilogue
7096 generation. */
7097 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
7098 }
7099 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
7100 vec_initial_defs.create (ncopies);
7101 for (i = 0; i < ncopies; ++i)
7102 vec_initial_defs.quick_push (vec_initial_def);
7103 }
7104 else if (nested_cycle)
7105 {
7106 /* Do not use an adjustment def as that case is not supported
7107 correctly if ncopies is not one. */
7108 vect_get_vec_defs_for_operand (loop_vinfo, reduc_stmt_info,
7109 ncopies, initial_def,
7110 &vec_initial_defs);
7111 }
7112 else
7113 {
7114 tree adjustment_def = NULL_TREE;
7115 tree *adjustment_defp = &adjustment_def;
7116 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
7117 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
7118 adjustment_defp = NULL;
7119 vec_initial_def
7120 = get_initial_def_for_reduction (loop_vinfo, reduc_stmt_info, code,
7121 initial_def, adjustment_defp);
7122 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def;
7123 vec_initial_defs.create (ncopies);
7124 for (i = 0; i < ncopies; ++i)
7125 vec_initial_defs.quick_push (vec_initial_def);
7126 }
7127 }
7128
7129 /* Generate the reduction PHIs upfront. */
7130 for (i = 0; i < vec_num; i++)
7131 {
7132 tree vec_init_def = vec_initial_defs[i];
7133 for (j = 0; j < ncopies; j++)
7134 {
7135 /* Create the reduction-phi that defines the reduction
7136 operand. */
7137 gphi *new_phi = create_phi_node (vec_dest, loop->header);
7138
7139 /* Set the loop-entry arg of the reduction-phi. */
7140 if (j != 0 && nested_cycle)
7141 vec_init_def = vec_initial_defs[j];
7142 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
7143 UNKNOWN_LOCATION);
7144
7145 /* The loop-latch arg is set in epilogue processing. */
7146
7147 if (slp_node)
7148 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7149 else
7150 {
7151 if (j == 0)
7152 *vec_stmt = new_phi;
7153 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7154 }
7155 }
7156 }
7157
7158 return true;
7159 }
7160
7161 /* Vectorizes LC PHIs (single-argument loop-closed PHIs). */
7162
7163 bool
7164 vectorizable_lc_phi (loop_vec_info loop_vinfo,
7165 stmt_vec_info stmt_info, gimple **vec_stmt,
7166 slp_tree slp_node)
7167 {
7168 if (!loop_vinfo
7169 || !is_a <gphi *> (stmt_info->stmt)
7170 || gimple_phi_num_args (stmt_info->stmt) != 1)
7171 return false;
7172
7173 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
7174 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
7175 return false;
7176
7177 if (!vec_stmt) /* transformation not required. */
7178 {
7179 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
7180 return true;
7181 }
7182
7183 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7184 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
7185 basic_block bb = gimple_bb (stmt_info->stmt);
7186 edge e = single_pred_edge (bb);
7187 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
7188 auto_vec<tree> vec_oprnds;
7189 vect_get_vec_defs (loop_vinfo, stmt_info, slp_node,
7190 !slp_node ? vect_get_num_copies (loop_vinfo, vectype) : 1,
7191 gimple_phi_arg_def (stmt_info->stmt, 0), &vec_oprnds);
7192 for (unsigned i = 0; i < vec_oprnds.length (); i++)
7193 {
7194 /* Create the vectorized LC PHI node. */
7195 gphi *new_phi = create_phi_node (vec_dest, bb);
7196 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
7197 if (slp_node)
7198 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
7199 else
7200 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_phi);
7201 }
7202 if (!slp_node)
7203 *vec_stmt = STMT_VINFO_VEC_STMTS (stmt_info)[0];
7204
7205 return true;
7206 }
7207
7208
7209 /* Function vect_min_worthwhile_factor.
7210
7211 For a loop where we could vectorize the operation indicated by CODE,
7212 return the minimum vectorization factor that makes it worthwhile
7213 to use generic vectors. */
7214 static unsigned int
7215 vect_min_worthwhile_factor (enum tree_code code)
7216 {
7217 switch (code)
7218 {
7219 case PLUS_EXPR:
7220 case MINUS_EXPR:
7221 case NEGATE_EXPR:
7222 return 4;
7223
7224 case BIT_AND_EXPR:
7225 case BIT_IOR_EXPR:
7226 case BIT_XOR_EXPR:
7227 case BIT_NOT_EXPR:
7228 return 2;
7229
7230 default:
7231 return INT_MAX;
7232 }
7233 }
7234
7235 /* Return true if VINFO indicates we are doing loop vectorization and if
7236 it is worth decomposing CODE operations into scalar operations for
7237 that loop's vectorization factor. */
7238
7239 bool
7240 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7241 {
7242 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7243 unsigned HOST_WIDE_INT value;
7244 return (loop_vinfo
7245 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7246 && value >= vect_min_worthwhile_factor (code));
7247 }
7248
7249 /* Function vectorizable_induction
7250
7251 Check if STMT_INFO performs an induction computation that can be vectorized.
7252 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7253 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7254 Return true if STMT_INFO is vectorizable in this way. */
7255
7256 bool
7257 vectorizable_induction (loop_vec_info loop_vinfo,
7258 stmt_vec_info stmt_info,
7259 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7260 gimple **vec_stmt, slp_tree slp_node,
7261 stmt_vector_for_cost *cost_vec)
7262 {
7263 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7264 unsigned ncopies;
7265 bool nested_in_vect_loop = false;
7266 class loop *iv_loop;
7267 tree vec_def;
7268 edge pe = loop_preheader_edge (loop);
7269 basic_block new_bb;
7270 tree new_vec, vec_init, vec_step, t;
7271 tree new_name;
7272 gimple *new_stmt;
7273 gphi *induction_phi;
7274 tree induc_def, vec_dest;
7275 tree init_expr, step_expr;
7276 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7277 unsigned i;
7278 tree expr;
7279 gimple_seq stmts;
7280 gimple_stmt_iterator si;
7281
7282 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7283 if (!phi)
7284 return false;
7285
7286 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7287 return false;
7288
7289 /* Make sure it was recognized as induction computation. */
7290 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7291 return false;
7292
7293 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7294 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7295
7296 if (slp_node)
7297 ncopies = 1;
7298 else
7299 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7300 gcc_assert (ncopies >= 1);
7301
7302 /* FORNOW. These restrictions should be relaxed. */
7303 if (nested_in_vect_loop_p (loop, stmt_info))
7304 {
7305 imm_use_iterator imm_iter;
7306 use_operand_p use_p;
7307 gimple *exit_phi;
7308 edge latch_e;
7309 tree loop_arg;
7310
7311 if (ncopies > 1)
7312 {
7313 if (dump_enabled_p ())
7314 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7315 "multiple types in nested loop.\n");
7316 return false;
7317 }
7318
7319 /* FORNOW: outer loop induction with SLP not supported. */
7320 if (STMT_SLP_TYPE (stmt_info))
7321 return false;
7322
7323 exit_phi = NULL;
7324 latch_e = loop_latch_edge (loop->inner);
7325 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7326 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7327 {
7328 gimple *use_stmt = USE_STMT (use_p);
7329 if (is_gimple_debug (use_stmt))
7330 continue;
7331
7332 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7333 {
7334 exit_phi = use_stmt;
7335 break;
7336 }
7337 }
7338 if (exit_phi)
7339 {
7340 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7341 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7342 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7343 {
7344 if (dump_enabled_p ())
7345 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7346 "inner-loop induction only used outside "
7347 "of the outer vectorized loop.\n");
7348 return false;
7349 }
7350 }
7351
7352 nested_in_vect_loop = true;
7353 iv_loop = loop->inner;
7354 }
7355 else
7356 iv_loop = loop;
7357 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7358
7359 if (slp_node && !nunits.is_constant ())
7360 {
7361 /* The current SLP code creates the initial value element-by-element. */
7362 if (dump_enabled_p ())
7363 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7364 "SLP induction not supported for variable-length"
7365 " vectors.\n");
7366 return false;
7367 }
7368
7369 if (!vec_stmt) /* transformation not required. */
7370 {
7371 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7372 DUMP_VECT_SCOPE ("vectorizable_induction");
7373 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7374 return true;
7375 }
7376
7377 /* Transform. */
7378
7379 /* Compute a vector variable, initialized with the first VF values of
7380 the induction variable. E.g., for an iv with IV_PHI='X' and
7381 evolution S, for a vector of 4 units, we want to compute:
7382 [X, X + S, X + 2*S, X + 3*S]. */
7383
7384 if (dump_enabled_p ())
7385 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7386
7387 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7388 gcc_assert (step_expr != NULL_TREE);
7389 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
7390
7391 pe = loop_preheader_edge (iv_loop);
7392 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7393 loop_preheader_edge (iv_loop));
7394
7395 stmts = NULL;
7396 if (!nested_in_vect_loop)
7397 {
7398 /* Convert the initial value to the IV update type. */
7399 tree new_type = TREE_TYPE (step_expr);
7400 init_expr = gimple_convert (&stmts, new_type, init_expr);
7401
7402 /* If we are using the loop mask to "peel" for alignment then we need
7403 to adjust the start value here. */
7404 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7405 if (skip_niters != NULL_TREE)
7406 {
7407 if (FLOAT_TYPE_P (vectype))
7408 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7409 skip_niters);
7410 else
7411 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7412 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7413 skip_niters, step_expr);
7414 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7415 init_expr, skip_step);
7416 }
7417 }
7418
7419 if (stmts)
7420 {
7421 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7422 gcc_assert (!new_bb);
7423 }
7424
7425 /* Find the first insertion point in the BB. */
7426 basic_block bb = gimple_bb (phi);
7427 si = gsi_after_labels (bb);
7428
7429 /* For SLP induction we have to generate several IVs as for example
7430 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7431 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7432 [VF*S, VF*S, VF*S, VF*S] for all. */
7433 if (slp_node)
7434 {
7435 /* Enforced above. */
7436 unsigned int const_nunits = nunits.to_constant ();
7437
7438 /* Generate [VF*S, VF*S, ... ]. */
7439 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7440 {
7441 expr = build_int_cst (integer_type_node, vf);
7442 expr = fold_convert (TREE_TYPE (step_expr), expr);
7443 }
7444 else
7445 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7446 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7447 expr, step_expr);
7448 if (! CONSTANT_CLASS_P (new_name))
7449 new_name = vect_init_vector (loop_vinfo, stmt_info, new_name,
7450 TREE_TYPE (step_expr), NULL);
7451 new_vec = build_vector_from_val (step_vectype, new_name);
7452 vec_step = vect_init_vector (loop_vinfo, stmt_info,
7453 new_vec, step_vectype, NULL);
7454
7455 /* Now generate the IVs. */
7456 unsigned group_size = SLP_TREE_LANES (slp_node);
7457 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7458 unsigned elts = const_nunits * nvects;
7459 /* Compute the number of distinct IVs we need. First reduce
7460 group_size if it is a multiple of const_nunits so we get
7461 one IV for a group_size of 4 but const_nunits 2. */
7462 unsigned group_sizep = group_size;
7463 if (group_sizep % const_nunits == 0)
7464 group_sizep = group_sizep / const_nunits;
7465 unsigned nivs = least_common_multiple (group_sizep,
7466 const_nunits) / const_nunits;
7467 gcc_assert (elts % group_size == 0);
7468 tree elt = init_expr;
7469 unsigned ivn;
7470 for (ivn = 0; ivn < nivs; ++ivn)
7471 {
7472 tree_vector_builder elts (step_vectype, const_nunits, 1);
7473 stmts = NULL;
7474 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7475 {
7476 if (ivn*const_nunits + eltn >= group_size
7477 && (ivn * const_nunits + eltn) % group_size == 0)
7478 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7479 elt, step_expr);
7480 elts.quick_push (elt);
7481 }
7482 vec_init = gimple_build_vector (&stmts, &elts);
7483 vec_init = gimple_convert (&stmts, vectype, vec_init);
7484 if (stmts)
7485 {
7486 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7487 gcc_assert (!new_bb);
7488 }
7489
7490 /* Create the induction-phi that defines the induction-operand. */
7491 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7492 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7493 induc_def = PHI_RESULT (induction_phi);
7494
7495 /* Create the iv update inside the loop */
7496 gimple_seq stmts = NULL;
7497 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7498 vec_def = gimple_build (&stmts,
7499 PLUS_EXPR, step_vectype, vec_def, vec_step);
7500 vec_def = gimple_convert (&stmts, vectype, vec_def);
7501 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7502
7503 /* Set the arguments of the phi node: */
7504 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7505 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7506 UNKNOWN_LOCATION);
7507
7508 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
7509 }
7510 /* Fill up to the number of vectors we need for the whole group. */
7511 nivs = least_common_multiple (group_size,
7512 const_nunits) / const_nunits;
7513 for (; ivn < nivs; ++ivn)
7514 SLP_TREE_VEC_STMTS (slp_node)
7515 .quick_push (SLP_TREE_VEC_STMTS (slp_node)[0]);
7516
7517 /* Re-use IVs when we can. */
7518 if (ivn < nvects)
7519 {
7520 unsigned vfp
7521 = least_common_multiple (group_size, const_nunits) / group_size;
7522 /* Generate [VF'*S, VF'*S, ... ]. */
7523 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7524 {
7525 expr = build_int_cst (integer_type_node, vfp);
7526 expr = fold_convert (TREE_TYPE (step_expr), expr);
7527 }
7528 else
7529 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7530 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7531 expr, step_expr);
7532 if (! CONSTANT_CLASS_P (new_name))
7533 new_name = vect_init_vector (loop_vinfo, stmt_info, new_name,
7534 TREE_TYPE (step_expr), NULL);
7535 new_vec = build_vector_from_val (step_vectype, new_name);
7536 vec_step = vect_init_vector (loop_vinfo, stmt_info, new_vec,
7537 step_vectype, NULL);
7538 for (; ivn < nvects; ++ivn)
7539 {
7540 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
7541 tree def;
7542 if (gimple_code (iv) == GIMPLE_PHI)
7543 def = gimple_phi_result (iv);
7544 else
7545 def = gimple_assign_lhs (iv);
7546 gimple_seq stmts = NULL;
7547 def = gimple_convert (&stmts, step_vectype, def);
7548 def = gimple_build (&stmts,
7549 PLUS_EXPR, step_vectype, def, vec_step);
7550 def = gimple_convert (&stmts, vectype, def);
7551 if (gimple_code (iv) == GIMPLE_PHI)
7552 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7553 else
7554 {
7555 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7556 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
7557 }
7558 SLP_TREE_VEC_STMTS (slp_node)
7559 .quick_push (SSA_NAME_DEF_STMT (def));
7560 }
7561 }
7562
7563 return true;
7564 }
7565
7566 /* Create the vector that holds the initial_value of the induction. */
7567 if (nested_in_vect_loop)
7568 {
7569 /* iv_loop is nested in the loop to be vectorized. init_expr has already
7570 been created during vectorization of previous stmts. We obtain it
7571 from the STMT_VINFO_VEC_STMTS of the defining stmt. */
7572 auto_vec<tree> vec_inits;
7573 vect_get_vec_defs_for_operand (loop_vinfo, stmt_info, 1,
7574 init_expr, &vec_inits);
7575 vec_init = vec_inits[0];
7576 /* If the initial value is not of proper type, convert it. */
7577 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7578 {
7579 new_stmt
7580 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7581 vect_simple_var,
7582 "vec_iv_"),
7583 VIEW_CONVERT_EXPR,
7584 build1 (VIEW_CONVERT_EXPR, vectype,
7585 vec_init));
7586 vec_init = gimple_assign_lhs (new_stmt);
7587 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7588 new_stmt);
7589 gcc_assert (!new_bb);
7590 }
7591 }
7592 else
7593 {
7594 /* iv_loop is the loop to be vectorized. Create:
7595 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7596 stmts = NULL;
7597 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
7598
7599 unsigned HOST_WIDE_INT const_nunits;
7600 if (nunits.is_constant (&const_nunits))
7601 {
7602 tree_vector_builder elts (step_vectype, const_nunits, 1);
7603 elts.quick_push (new_name);
7604 for (i = 1; i < const_nunits; i++)
7605 {
7606 /* Create: new_name_i = new_name + step_expr */
7607 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7608 new_name, step_expr);
7609 elts.quick_push (new_name);
7610 }
7611 /* Create a vector from [new_name_0, new_name_1, ...,
7612 new_name_nunits-1] */
7613 vec_init = gimple_build_vector (&stmts, &elts);
7614 }
7615 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7616 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7617 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
7618 new_name, step_expr);
7619 else
7620 {
7621 /* Build:
7622 [base, base, base, ...]
7623 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7624 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7625 gcc_assert (flag_associative_math);
7626 tree index = build_index_vector (step_vectype, 0, 1);
7627 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7628 new_name);
7629 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7630 step_expr);
7631 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
7632 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
7633 vec_init, step_vec);
7634 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
7635 vec_init, base_vec);
7636 }
7637 vec_init = gimple_convert (&stmts, vectype, vec_init);
7638
7639 if (stmts)
7640 {
7641 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7642 gcc_assert (!new_bb);
7643 }
7644 }
7645
7646
7647 /* Create the vector that holds the step of the induction. */
7648 if (nested_in_vect_loop)
7649 /* iv_loop is nested in the loop to be vectorized. Generate:
7650 vec_step = [S, S, S, S] */
7651 new_name = step_expr;
7652 else
7653 {
7654 /* iv_loop is the loop to be vectorized. Generate:
7655 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7656 gimple_seq seq = NULL;
7657 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7658 {
7659 expr = build_int_cst (integer_type_node, vf);
7660 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7661 }
7662 else
7663 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7664 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7665 expr, step_expr);
7666 if (seq)
7667 {
7668 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7669 gcc_assert (!new_bb);
7670 }
7671 }
7672
7673 t = unshare_expr (new_name);
7674 gcc_assert (CONSTANT_CLASS_P (new_name)
7675 || TREE_CODE (new_name) == SSA_NAME);
7676 new_vec = build_vector_from_val (step_vectype, t);
7677 vec_step = vect_init_vector (loop_vinfo, stmt_info,
7678 new_vec, step_vectype, NULL);
7679
7680
7681 /* Create the following def-use cycle:
7682 loop prolog:
7683 vec_init = ...
7684 vec_step = ...
7685 loop:
7686 vec_iv = PHI <vec_init, vec_loop>
7687 ...
7688 STMT
7689 ...
7690 vec_loop = vec_iv + vec_step; */
7691
7692 /* Create the induction-phi that defines the induction-operand. */
7693 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7694 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7695 induc_def = PHI_RESULT (induction_phi);
7696
7697 /* Create the iv update inside the loop */
7698 stmts = NULL;
7699 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7700 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
7701 vec_def = gimple_convert (&stmts, vectype, vec_def);
7702 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7703 new_stmt = SSA_NAME_DEF_STMT (vec_def);
7704
7705 /* Set the arguments of the phi node: */
7706 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7707 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7708 UNKNOWN_LOCATION);
7709
7710 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (induction_phi);
7711 *vec_stmt = induction_phi;
7712
7713 /* In case the vectorization factor (VF) is bigger than the number
7714 of elements that we can fit in a vectype (nunits), we have to generate
7715 more than one vector stmt - i.e - we need to "unroll" the
7716 vector stmt by a factor VF/nunits. For more details see documentation
7717 in vectorizable_operation. */
7718
7719 if (ncopies > 1)
7720 {
7721 gimple_seq seq = NULL;
7722 /* FORNOW. This restriction should be relaxed. */
7723 gcc_assert (!nested_in_vect_loop);
7724
7725 /* Create the vector that holds the step of the induction. */
7726 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7727 {
7728 expr = build_int_cst (integer_type_node, nunits);
7729 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7730 }
7731 else
7732 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7733 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7734 expr, step_expr);
7735 if (seq)
7736 {
7737 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7738 gcc_assert (!new_bb);
7739 }
7740
7741 t = unshare_expr (new_name);
7742 gcc_assert (CONSTANT_CLASS_P (new_name)
7743 || TREE_CODE (new_name) == SSA_NAME);
7744 new_vec = build_vector_from_val (step_vectype, t);
7745 vec_step = vect_init_vector (loop_vinfo, stmt_info,
7746 new_vec, step_vectype, NULL);
7747
7748 vec_def = induc_def;
7749 for (i = 1; i < ncopies; i++)
7750 {
7751 /* vec_i = vec_prev + vec_step */
7752 gimple_seq stmts = NULL;
7753 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
7754 vec_def = gimple_build (&stmts,
7755 PLUS_EXPR, step_vectype, vec_def, vec_step);
7756 vec_def = gimple_convert (&stmts, vectype, vec_def);
7757
7758 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7759 new_stmt = SSA_NAME_DEF_STMT (vec_def);
7760 STMT_VINFO_VEC_STMTS (stmt_info).safe_push (new_stmt);
7761 }
7762 }
7763
7764 if (dump_enabled_p ())
7765 dump_printf_loc (MSG_NOTE, vect_location,
7766 "transform induction: created def-use cycle: %G%G",
7767 induction_phi, SSA_NAME_DEF_STMT (vec_def));
7768
7769 return true;
7770 }
7771
7772 /* Function vectorizable_live_operation.
7773
7774 STMT_INFO computes a value that is used outside the loop. Check if
7775 it can be supported. */
7776
7777 bool
7778 vectorizable_live_operation (loop_vec_info loop_vinfo,
7779 stmt_vec_info stmt_info,
7780 gimple_stmt_iterator *gsi,
7781 slp_tree slp_node, slp_instance slp_node_instance,
7782 int slp_index, bool vec_stmt_p,
7783 stmt_vector_for_cost *)
7784 {
7785 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7786 imm_use_iterator imm_iter;
7787 tree lhs, lhs_type, bitsize, vec_bitsize;
7788 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7789 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7790 int ncopies;
7791 gimple *use_stmt;
7792 auto_vec<tree> vec_oprnds;
7793 int vec_entry = 0;
7794 poly_uint64 vec_index = 0;
7795
7796 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7797
7798 /* If a stmt of a reduction is live, vectorize it via
7799 vect_create_epilog_for_reduction. vectorizable_reduction assessed
7800 validity so just trigger the transform here. */
7801 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
7802 {
7803 if (!vec_stmt_p)
7804 return true;
7805 if (slp_node)
7806 {
7807 /* For reduction chains the meta-info is attached to
7808 the group leader. */
7809 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7810 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7811 /* For SLP reductions we vectorize the epilogue for
7812 all involved stmts together. */
7813 else if (slp_index != 0)
7814 return true;
7815 else
7816 /* For SLP reductions the meta-info is attached to
7817 the representative. */
7818 stmt_info = SLP_TREE_REPRESENTATIVE (slp_node);
7819 }
7820 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
7821 gcc_assert (reduc_info->is_reduc_info);
7822 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
7823 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
7824 return true;
7825 vect_create_epilog_for_reduction (loop_vinfo, stmt_info, slp_node,
7826 slp_node_instance);
7827 return true;
7828 }
7829
7830 /* FORNOW. CHECKME. */
7831 if (nested_in_vect_loop_p (loop, stmt_info))
7832 return false;
7833
7834 /* If STMT is not relevant and it is a simple assignment and its inputs are
7835 invariant then it can remain in place, unvectorized. The original last
7836 scalar value that it computes will be used. */
7837 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7838 {
7839 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
7840 if (dump_enabled_p ())
7841 dump_printf_loc (MSG_NOTE, vect_location,
7842 "statement is simple and uses invariant. Leaving in "
7843 "place.\n");
7844 return true;
7845 }
7846
7847 if (slp_node)
7848 ncopies = 1;
7849 else
7850 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7851
7852 if (slp_node)
7853 {
7854 gcc_assert (slp_index >= 0);
7855
7856 int num_scalar = SLP_TREE_LANES (slp_node);
7857 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7858
7859 /* Get the last occurrence of the scalar index from the concatenation of
7860 all the slp vectors. Calculate which slp vector it is and the index
7861 within. */
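 /* For example, with two 4-element vectors and two scalar lanes the
 last occurrence of lane 1 sits at position 2*4 - 2 + 1 = 7, i.e.
 in the last element of the second vector. */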
7862 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
7863
7864 /* Calculate which vector contains the result, and which lane of
7865 that vector we need. */
7866 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7867 {
7868 if (dump_enabled_p ())
7869 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7870 "Cannot determine which vector holds the"
7871 " final result.\n");
7872 return false;
7873 }
7874 }
7875
7876 if (!vec_stmt_p)
7877 {
7878 /* No transformation required. */
7879 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7880 {
7881 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7882 OPTIMIZE_FOR_SPEED))
7883 {
7884 if (dump_enabled_p ())
7885 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7886 "can't use a fully-masked loop because "
7887 "the target doesn't support extract last "
7888 "reduction.\n");
7889 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7890 }
7891 else if (slp_node)
7892 {
7893 if (dump_enabled_p ())
7894 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7895 "can't use a fully-masked loop because an "
7896 "SLP statement is live after the loop.\n");
7897 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7898 }
7899 else if (ncopies > 1)
7900 {
7901 if (dump_enabled_p ())
7902 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7903 "can't use a fully-masked loop because"
7904 " ncopies is greater than 1.\n");
7905 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7906 }
7907 else
7908 {
7909 gcc_assert (ncopies == 1 && !slp_node);
7910 vect_record_loop_mask (loop_vinfo,
7911 &LOOP_VINFO_MASKS (loop_vinfo),
7912 1, vectype, NULL);
7913 }
7914 }
7915 return true;
7916 }
7917
7918 /* Use the lhs of the original scalar statement. */
7919 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
7920
7921 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7922 : gimple_get_lhs (stmt);
7923 lhs_type = TREE_TYPE (lhs);
7924
7925 bitsize = vector_element_bits_tree (vectype);
7926 vec_bitsize = TYPE_SIZE (vectype);
7927
7928 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
7929 tree vec_lhs, bitstart;
7930 if (slp_node)
7931 {
7932 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7933
7934 /* Get the correct slp vectorized stmt. */
7935 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
7936 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
7937 vec_lhs = gimple_phi_result (phi);
7938 else
7939 vec_lhs = gimple_get_lhs (vec_stmt);
7940
7941 /* Get entry to use. */
7942 bitstart = bitsize_int (vec_index);
7943 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7944 }
7945 else
7946 {
7947 /* For multiple copies, get the last copy. */
7948 vec_lhs = gimple_get_lhs (STMT_VINFO_VEC_STMTS (stmt_info).last ());
7949
7950 /* Get the last lane in the vector. */
7951 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7952 }
7953
7954 /* To ensure that VEC_LHS satisfies the loop-closed PHI requirement for
7955 the lane extraction stmts, insert one PHI node for it. It looks like:
7956 loop;
7957 BB:
7958 # lhs' = PHI <lhs>
7959 ==>
7960 loop;
7961 BB:
7962 # vec_lhs' = PHI <vec_lhs>
7963 new_tree = lane_extract <vec_lhs', ...>;
7964 lhs' = new_tree; */
7965
7966 basic_block exit_bb = single_exit (loop)->dest;
7967 gcc_assert (single_pred_p (exit_bb));
7968
7969 tree vec_lhs_phi = copy_ssa_name (vec_lhs);
7970 gimple *phi = create_phi_node (vec_lhs_phi, exit_bb);
7971 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, vec_lhs);
7972
7973 gimple_seq stmts = NULL;
7974 tree new_tree;
7975 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7976 {
7977 /* Emit:
7978
7979 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
7980
7981 where VEC_LHS is the vectorized live-out result and MASK is
7982 the loop mask for the final iteration. */
7983 gcc_assert (ncopies == 1 && !slp_node);
7984 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
7985 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo), 1,
7986 vectype, 0);
7987 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST, scalar_type,
7988 mask, vec_lhs_phi);
7989
7990 /* Convert the extracted vector element to the required scalar type. */
7991 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
7992 }
7993 else
7994 {
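 /* Extract the required lane with a BIT_FIELD_REF at bit position
 BITSTART and convert the result to the type of the original
 scalar lhs. */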
7995 tree bftype = TREE_TYPE (vectype);
7996 if (VECTOR_BOOLEAN_TYPE_P (vectype))
7997 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7998 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs_phi, bitsize, bitstart);
7999 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8000 &stmts, true, NULL_TREE);
8001 }
8002
8003 if (stmts)
8004 {
8005 gimple_stmt_iterator exit_gsi = gsi_after_labels (exit_bb);
8006 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
8007
8008 /* Remove the existing PHI fed by LHS and assign NEW_TREE to its result instead. */
8009 tree lhs_phi = NULL_TREE;
8010 gimple_stmt_iterator gsi;
8011 for (gsi = gsi_start_phis (exit_bb); !gsi_end_p (gsi); gsi_next (&gsi))
8012 {
8013 gimple *phi = gsi_stmt (gsi);
8014 if ((gimple_phi_arg_def (phi, 0) == lhs))
8015 {
8016 remove_phi_node (&gsi, false);
8017 lhs_phi = gimple_phi_result (phi);
8018 gimple *copy = gimple_build_assign (lhs_phi, new_tree);
8019 gsi_insert_before (&exit_gsi, copy, GSI_SAME_STMT);
8020 break;
8021 }
8022 }
8023 }
8024
8025 /* Replace uses of LHS with the newly computed result. If the use stmt is
8026 a single-argument PHI, just replace all uses of the PHI result. This is
8027 necessary because the LCSSA PHI defining LHS may appear before the newly inserted stmt. */
8028 use_operand_p use_p;
8029 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8030 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8031 && !is_gimple_debug (use_stmt))
8032 {
8033 if (gimple_code (use_stmt) == GIMPLE_PHI
8034 && gimple_phi_num_args (use_stmt) == 1)
8035 {
8036 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8037 }
8038 else
8039 {
8040 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8041 SET_USE (use_p, new_tree);
8042 }
8043 update_stmt (use_stmt);
8044 }
8045
8046 return true;
8047 }
8048
8049 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8050
8051 static void
8052 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
8053 {
8054 ssa_op_iter op_iter;
8055 imm_use_iterator imm_iter;
8056 def_operand_p def_p;
8057 gimple *ustmt;
8058
8059 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8060 {
8061 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8062 {
8063 basic_block bb;
8064
8065 if (!is_gimple_debug (ustmt))
8066 continue;
8067
8068 bb = gimple_bb (ustmt);
8069
8070 if (!flow_bb_inside_loop_p (loop, bb))
8071 {
8072 if (gimple_debug_bind_p (ustmt))
8073 {
8074 if (dump_enabled_p ())
8075 dump_printf_loc (MSG_NOTE, vect_location,
8076 "killing debug use\n");
8077
8078 gimple_debug_bind_reset_value (ustmt);
8079 update_stmt (ustmt);
8080 }
8081 else
8082 gcc_unreachable ();
8083 }
8084 }
8085 }
8086 }
8087
8088 /* Given loop represented by LOOP_VINFO, return true if computation of
8089 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8090 otherwise. */
8091
8092 static bool
8093 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8094 {
8095 /* Constant case. */
8096 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8097 {
8098 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8099 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8100
8101 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8102 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
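 /* NITERS was computed as NITERSM1 + 1; if that addition wrapped,
 NITERS is no longer greater than NITERSM1 and this check fails. */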
8103 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8104 return true;
8105 }
8106
8107 widest_int max;
8108 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8109 /* Check the upper bound of loop niters. */
8110 if (get_max_loop_iterations (loop, &max))
8111 {
8112 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8113 signop sgn = TYPE_SIGN (type);
8114 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8115 if (max < type_max)
8116 return true;
8117 }
8118 return false;
8119 }
8120
8121 /* Return a mask type with half the number of elements as OLD_TYPE,
8122 given that it should have mode NEW_MODE. */
8123
8124 tree
8125 vect_halve_mask_nunits (tree old_type, machine_mode new_mode)
8126 {
8127 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (old_type), 2);
8128 return build_truth_vector_type_for_mode (nunits, new_mode);
8129 }
8130
8131 /* Return a mask type with twice as many elements as OLD_TYPE,
8132 given that it should have mode NEW_MODE. */
8133
8134 tree
8135 vect_double_mask_nunits (tree old_type, machine_mode new_mode)
8136 {
8137 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (old_type) * 2;
8138 return build_truth_vector_type_for_mode (nunits, new_mode);
8139 }
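/* For example (illustrative element counts): applied to a 16-element mask
   type, vect_halve_mask_nunits returns an 8-element mask type with mode
   NEW_MODE, and vect_double_mask_nunits applied to an 8-element mask type
   returns a 16-element one.  Only the element count comes from OLD_TYPE;
   the representation is determined by NEW_MODE.  */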
8140
8141 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8142 contain a sequence of NVECTORS masks that each control a vector of type
8143 VECTYPE. If SCALAR_MASK is nonnull, the fully-masked loop would AND
8144 these vector masks with the vector version of SCALAR_MASK. */
8145
8146 void
8147 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8148 unsigned int nvectors, tree vectype, tree scalar_mask)
8149 {
8150 gcc_assert (nvectors != 0);
8151 if (masks->length () < nvectors)
8152 masks->safe_grow_cleared (nvectors);
8153 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8154 /* The number of scalars per iteration and the number of vectors are
8155 both compile-time constants. */
8156 unsigned int nscalars_per_iter
8157 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8158 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8159
8160 if (scalar_mask)
8161 {
8162 scalar_cond_masked_key cond (scalar_mask, nvectors);
8163 loop_vinfo->scalar_cond_masked_set.add (cond);
8164 }
8165
8166 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8167 {
8168 rgm->max_nscalars_per_iter = nscalars_per_iter;
8169 rgm->mask_type = truth_type_for (vectype);
8170 }
8171 }
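/* A small worked example with made-up numbers: with a vectorization factor
   of 16 and VECTYPE having 8 elements, a statement needing NVECTORS == 2
   masks is recorded in rgroup (*MASKS)[1] and has
   nscalars_per_iter == 2 * 8 / 16 == 1.  A later request for the same
   rgroup only bumps max_nscalars_per_iter (and hence mask_type) if it needs
   more scalars per iteration, so each rgroup ends up sized for its most
   demanding user.  */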
8172
8173 /* Given a complete set of masks MASKS, extract mask number INDEX
8174 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8175 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8176
8177 See the comment above vec_loop_masks for more details about the mask
8178 arrangement. */
8179
8180 tree
8181 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8182 unsigned int nvectors, tree vectype, unsigned int index)
8183 {
8184 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8185 tree mask_type = rgm->mask_type;
8186
8187 /* Populate the rgroup's mask array, if this is the first time we've
8188 used it. */
8189 if (rgm->masks.is_empty ())
8190 {
8191 rgm->masks.safe_grow_cleared (nvectors);
8192 for (unsigned int i = 0; i < nvectors; ++i)
8193 {
8194 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8195 /* Provide a dummy definition until the real one is available. */
8196 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8197 rgm->masks[i] = mask;
8198 }
8199 }
8200
8201 tree mask = rgm->masks[index];
8202 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8203 TYPE_VECTOR_SUBPARTS (vectype)))
8204 {
8205 /* A loop mask for data type X can be reused for data type Y
8206 if X has N times more elements than Y and if Y's elements
8207 are N times bigger than X's. In this case each sequence
8208 of N elements in the loop mask will be all-zero or all-one.
8209 We can then view-convert the mask so that each sequence of
8210 N elements is replaced by a single element. */
8211 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8212 TYPE_VECTOR_SUBPARTS (vectype)));
8213 gimple_seq seq = NULL;
8214 mask_type = truth_type_for (vectype);
8215 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8216 if (seq)
8217 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8218 }
8219 return mask;
8220 }
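/* Sketch of the reuse case handled above (illustrative element counts):
   suppose the rgroup's masks were created for 16-element vectors while
   VECTYPE has 8 elements that are each twice as wide.  Every aligned pair
   of lanes in the stored mask is then all-false or all-true, so the
   16-element mask can simply be VIEW_CONVERTed to the 8-element mask type
   returned by truth_type_for (VECTYPE), with any conversion statements
   inserted before GSI.  */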
8221
8222 /* Scale the profile counters of LOOP, which is vectorized by factor VF,
8223 according to the estimated number of iterations. */
8224
8225 static void
8226 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
8227 {
8228 edge preheader = loop_preheader_edge (loop);
8229 /* Reduce loop iterations by the vectorization factor. */
8230 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8231 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8232
8233 if (freq_h.nonzero_p ())
8234 {
8235 profile_probability p;
8236
8237 /* Avoid dropping loop body profile counter to 0 because of zero count
8238 in loop's preheader. */
8239 if (!(freq_e == profile_count::zero ()))
8240 freq_e = freq_e.force_nonzero ();
8241 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8242 scale_loop_frequencies (loop, p);
8243 }
8244
8245 edge exit_e = single_exit (loop);
8246 exit_e->probability = profile_probability::always ()
8247 .apply_scale (1, new_est_niter + 1);
8248
8249 edge exit_l = single_pred_edge (loop->latch);
8250 profile_probability prob = exit_l->probability;
8251 exit_l->probability = exit_e->probability.invert ();
8252 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8253 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8254 }
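/* A rough numeric sketch with made-up profile data: if the scalar loop was
   estimated to run ~40 iterations per entry and VF == 4,
   niter_for_unrolled_loop yields an estimate of about 10, so the loop body
   is rescaled so that the header count is roughly
   (new_est_niter + 1) * preheader count, and the exit edge gets probability
   1 / (new_est_niter + 1), i.e. about 1/11 - consistent with executing
   about 10 vector iterations per entry into the loop.  */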
8255
8256 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8257 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8258 stmt_vec_info. */
8259
8260 static void
8261 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8262 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8263 {
8264 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8265 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8266
8267 if (dump_enabled_p ())
8268 dump_printf_loc (MSG_NOTE, vect_location,
8269 "------>vectorizing statement: %G", stmt_info->stmt);
8270
8271 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8272 vect_loop_kill_debug_uses (loop, stmt_info);
8273
8274 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8275 && !STMT_VINFO_LIVE_P (stmt_info))
8276 return;
8277
8278 if (STMT_VINFO_VECTYPE (stmt_info))
8279 {
8280 poly_uint64 nunits
8281 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8282 if (!STMT_SLP_TYPE (stmt_info)
8283 && maybe_ne (nunits, vf)
8284 && dump_enabled_p ())
8285 /* For SLP, VF is set according to the unrolling factor, not the
8286 vector size, hence this message is not valid for SLP. */
8287 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8288 }
8289
8290 /* Pure SLP statements have already been vectorized. We still need
8291 to apply loop vectorization to hybrid SLP statements. */
8292 if (PURE_SLP_STMT (stmt_info))
8293 return;
8294
8295 if (dump_enabled_p ())
8296 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8297
8298 if (vect_transform_stmt (loop_vinfo, stmt_info, gsi, NULL, NULL))
8299 *seen_store = stmt_info;
8300 }
8301
8302 /* Helper function to pass to simplify_replace_tree so that trees recorded
8303 in the hash_map are replaced with their corresponding values. */
8304
8305 static tree
8306 find_in_mapping (tree t, void *context)
8307 {
8308 hash_map<tree,tree>* mapping = (hash_map<tree, tree>*) context;
8309
8310 tree *value = mapping->get (t);
8311 return value ? *value : t;
8312 }
8313
8314 /* Update EPILOGUE's loop_vec_info. EPILOGUE was constructed as a copy of the
8315 original loop that has now been vectorized.
8316
8317 The inits of the data_references need to be advanced with the number of
8318 iterations of the main loop. This has been computed in vect_do_peeling and
8319 is stored in parameter ADVANCE. We first restore the data_references
8320 initial offset with the values recorded in ORIG_DRS_INIT.
8321
8322 Since the loop_vec_info of this EPILOGUE was constructed for the original
8323 loop, its stmt_vec_infos all point to the original statements. These need
8324 to be updated to point to their corresponding copies as well as the SSA_NAMES
8325 in their PATTERN_DEF_SEQs and RELATED_STMTs.
8326
8327 The data_references' connections also need to be updated: their
8328 corresponding dr_vec_infos need to be reconnected to the EPILOGUE's
8329 stmt_vec_infos and their statements need to point to their corresponding
8330 copies. If they are gather loads or scatter stores, their references
8331 need to be updated to point to the corresponding copies as well.
8332 Finally, we set 'base_misaligned' to false, as we have already peeled
8333 for alignment in the prologue of the main loop. */
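/* As a concrete (hypothetical) illustration of the mapping built below: if
   the main loop contains _23 = a[i_5] and its epilogue copy contains the
   corresponding _57 = a[i_41], the mapping records _23 -> _57 (and
   i_5 -> i_41 via the PHI results).  Any pattern statement or
   gather/scatter DR_REF that still mentions _23 is then rewritten through
   find_in_mapping / simplify_replace_tree to use _57 instead.  */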
8334
8335 static void
8336 update_epilogue_loop_vinfo (class loop *epilogue, tree advance)
8337 {
8338 loop_vec_info epilogue_vinfo = loop_vec_info_for_loop (epilogue);
8339 auto_vec<gimple *> stmt_worklist;
8340 hash_map<tree,tree> mapping;
8341 gimple *orig_stmt, *new_stmt;
8342 gimple_stmt_iterator epilogue_gsi;
8343 gphi_iterator epilogue_phi_gsi;
8344 stmt_vec_info stmt_vinfo = NULL, related_vinfo;
8345 basic_block *epilogue_bbs = get_loop_body (epilogue);
8346 unsigned i;
8347
8348 LOOP_VINFO_BBS (epilogue_vinfo) = epilogue_bbs;
8349
8350 /* Advance the data_references' inits by the number of iterations of the
8351 previous loop and its prologue. */
8352 vect_update_inits_of_drs (epilogue_vinfo, advance, PLUS_EXPR);
8353
8354
8355 /* The EPILOGUE loop is a copy of the original loop so they share the same
8356 gimple UIDs. In this loop we update the loop_vec_info of the EPILOGUE to
8357 point to the copied statements. We also create a mapping from the LHSs in
8358 the original loop to the LHSs in the EPILOGUE and build worklists to
8359 update the STMT_VINFO_PATTERN_DEF_SEQs and STMT_VINFO_RELATED_STMTs. */
8360 for (unsigned i = 0; i < epilogue->num_nodes; ++i)
8361 {
8362 for (epilogue_phi_gsi = gsi_start_phis (epilogue_bbs[i]);
8363 !gsi_end_p (epilogue_phi_gsi); gsi_next (&epilogue_phi_gsi))
8364 {
8365 new_stmt = epilogue_phi_gsi.phi ();
8366
8367 gcc_assert (gimple_uid (new_stmt) > 0);
8368 stmt_vinfo
8369 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
8370
8371 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
8372 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
8373
8374 mapping.put (gimple_phi_result (orig_stmt),
8375 gimple_phi_result (new_stmt));
8376 /* PHI nodes cannot have patterns or related statements. */
8377 gcc_assert (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo) == NULL
8378 && STMT_VINFO_RELATED_STMT (stmt_vinfo) == NULL);
8379 }
8380
8381 for (epilogue_gsi = gsi_start_bb (epilogue_bbs[i]);
8382 !gsi_end_p (epilogue_gsi); gsi_next (&epilogue_gsi))
8383 {
8384 new_stmt = gsi_stmt (epilogue_gsi);
8385 if (is_gimple_debug (new_stmt))
8386 continue;
8387
8388 gcc_assert (gimple_uid (new_stmt) > 0);
8389 stmt_vinfo
8390 = epilogue_vinfo->stmt_vec_infos[gimple_uid (new_stmt) - 1];
8391
8392 orig_stmt = STMT_VINFO_STMT (stmt_vinfo);
8393 STMT_VINFO_STMT (stmt_vinfo) = new_stmt;
8394
8395 if (tree old_lhs = gimple_get_lhs (orig_stmt))
8396 mapping.put (old_lhs, gimple_get_lhs (new_stmt));
8397
8398 if (STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo))
8399 {
8400 gimple_seq seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_vinfo);
8401 for (gimple_stmt_iterator gsi = gsi_start (seq);
8402 !gsi_end_p (gsi); gsi_next (&gsi))
8403 stmt_worklist.safe_push (gsi_stmt (gsi));
8404 }
8405
8406 related_vinfo = STMT_VINFO_RELATED_STMT (stmt_vinfo);
8407 if (related_vinfo != NULL && related_vinfo != stmt_vinfo)
8408 {
8409 gimple *stmt = STMT_VINFO_STMT (related_vinfo);
8410 stmt_worklist.safe_push (stmt);
8411 /* Set BB such that the assert in
8412 'get_initial_def_for_reduction' is able to determine that
8413 the BB of the related stmt is inside this loop. */
8414 gimple_set_bb (stmt,
8415 gimple_bb (new_stmt));
8416 related_vinfo = STMT_VINFO_RELATED_STMT (related_vinfo);
8417 gcc_assert (related_vinfo == NULL
8418 || related_vinfo == stmt_vinfo);
8419 }
8420 }
8421 }
8422
8423 /* The PATTERN_DEF_SEQs and RELATED_STMTs in the epilogue were constructed
8424 using the original main loop and thus need to be updated to refer to the
8425 cloned variables used in the epilogue. */
8426 for (unsigned i = 0; i < stmt_worklist.length (); ++i)
8427 {
8428 gimple *stmt = stmt_worklist[i];
8429 tree *new_op;
8430
8431 for (unsigned j = 1; j < gimple_num_ops (stmt); ++j)
8432 {
8433 tree op = gimple_op (stmt, j);
8434 if ((new_op = mapping.get(op)))
8435 gimple_set_op (stmt, j, *new_op);
8436 else
8437 {
8438 /* PR92429: The last argument of simplify_replace_tree disables
8439 folding when replacing arguments. This is required as
8440 otherwise you might end up with different statements than the
8441 ones analyzed in vect_loop_analyze, leading to different
8442 vectorization. */
8443 op = simplify_replace_tree (op, NULL_TREE, NULL_TREE,
8444 &find_in_mapping, &mapping, false);
8445 gimple_set_op (stmt, j, op);
8446 }
8447 }
8448 }
8449
8450 struct data_reference *dr;
8451 vec<data_reference_p> datarefs = epilogue_vinfo->shared->datarefs;
8452 FOR_EACH_VEC_ELT (datarefs, i, dr)
8453 {
8454 orig_stmt = DR_STMT (dr);
8455 gcc_assert (gimple_uid (orig_stmt) > 0);
8456 stmt_vinfo = epilogue_vinfo->stmt_vec_infos[gimple_uid (orig_stmt) - 1];
8457 /* Data references for gather loads and scatter stores do not use the
8458 updated offset we set using ADVANCE. Instead we have to make sure the
8459 references in the data-references point to the corresponding copies of
8460 the originals in the epilogue. */
8461 if (STMT_VINFO_MEMORY_ACCESS_TYPE (vect_stmt_to_vectorize (stmt_vinfo))
8462 == VMAT_GATHER_SCATTER)
8463 {
8464 DR_REF (dr)
8465 = simplify_replace_tree (DR_REF (dr), NULL_TREE, NULL_TREE,
8466 &find_in_mapping, &mapping);
8467 DR_BASE_ADDRESS (dr)
8468 = simplify_replace_tree (DR_BASE_ADDRESS (dr), NULL_TREE, NULL_TREE,
8469 &find_in_mapping, &mapping);
8470 }
8471 DR_STMT (dr) = STMT_VINFO_STMT (stmt_vinfo);
8472 stmt_vinfo->dr_aux.stmt = stmt_vinfo;
8473 /* The vector size of the epilogue is smaller than that of the main loop,
8474 so the alignment is either the same or lower. This means the DR will
8475 by definition be aligned. */
8476 STMT_VINFO_DR_INFO (stmt_vinfo)->base_misaligned = false;
8477 }
8478
8479 epilogue_vinfo->shared->datarefs_copy.release ();
8480 epilogue_vinfo->shared->save_datarefs ();
8481 }
8482
8483 /* Function vect_transform_loop.
8484
8485 The analysis phase has determined that the loop is vectorizable.
8486 Vectorize the loop - create vectorized stmts to replace the scalar
8487 stmts in the loop, and update the loop exit condition.
8488 Returns the scalar epilogue loop if any. */
8489
8490 class loop *
8491 vect_transform_loop (loop_vec_info loop_vinfo, gimple *loop_vectorized_call)
8492 {
8493 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8494 class loop *epilogue = NULL;
8495 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8496 int nbbs = loop->num_nodes;
8497 int i;
8498 tree niters_vector = NULL_TREE;
8499 tree step_vector = NULL_TREE;
8500 tree niters_vector_mult_vf = NULL_TREE;
8501 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8502 unsigned int lowest_vf = constant_lower_bound (vf);
8503 gimple *stmt;
8504 bool check_profitability = false;
8505 unsigned int th;
8506
8507 DUMP_VECT_SCOPE ("vec_transform_loop");
8508
8509 loop_vinfo->shared->check_datarefs ();
8510
8511 /* Use the more conservative vectorization threshold. If the number
8512 of iterations is constant, assume the cost check has been performed
8513 by our caller. If the threshold makes all loops profitable that
8514 run at least the (estimated) vectorization factor number of times,
8515 checking is pointless too. */
8516 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8517 if (vect_apply_runtime_profitability_check_p (loop_vinfo))
8518 {
8519 if (dump_enabled_p ())
8520 dump_printf_loc (MSG_NOTE, vect_location,
8521 "Profitability threshold is %d loop iterations.\n",
8522 th);
8523 check_profitability = true;
8524 }
8525
8526 /* Make sure there exists a single-predecessor exit bb. Do this before
8527 versioning. */
8528 edge e = single_exit (loop);
8529 if (! single_pred_p (e->dest))
8530 {
8531 split_loop_exit_edge (e, true);
8532 if (dump_enabled_p ())
8533 dump_printf (MSG_NOTE, "split exit edge\n");
8534 }
8535
8536 /* Version the loop first, if required, so the profitability check
8537 comes first. */
8538
8539 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8540 {
8541 class loop *sloop
8542 = vect_loop_versioning (loop_vinfo, loop_vectorized_call);
8543 sloop->force_vectorize = false;
8544 check_profitability = false;
8545 }
8546
8547 /* Make sure there exists a single-predecessor exit bb also on the
8548 scalar loop copy. Do this after versioning but before peeling,
8549 so the CFG structure is fine for both the scalar and the if-converted
8550 loop, and slpeel_duplicate_current_defs_from_edges sees matched
8551 loop-closed PHI nodes on the exit. */
8552 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8553 {
8554 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8555 if (! single_pred_p (e->dest))
8556 {
8557 split_loop_exit_edge (e, true);
8558 if (dump_enabled_p ())
8559 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8560 }
8561 }
8562
8563 tree niters = vect_build_loop_niters (loop_vinfo);
8564 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8565 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8566 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8567 tree advance;
8568 drs_init_vec orig_drs_init;
8569
8570 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8571 &step_vector, &niters_vector_mult_vf, th,
8572 check_profitability, niters_no_overflow,
8573 &advance);
8574
8575 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
8576 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
8577 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
8578 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
8579
8580 if (niters_vector == NULL_TREE)
8581 {
8582 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8583 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8584 && known_eq (lowest_vf, vf))
8585 {
8586 niters_vector
8587 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8588 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8589 step_vector = build_one_cst (TREE_TYPE (niters));
8590 }
8591 else
8592 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8593 &step_vector, niters_no_overflow);
8594 }
8595
8596 /* 1) Make sure the loop header has exactly two entries
8597 2) Make sure we have a preheader basic block. */
8598
8599 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8600
8601 split_edge (loop_preheader_edge (loop));
8602
8603 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8604 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8605 /* This will deal with any possible peeling. */
8606 vect_prepare_for_masked_peels (loop_vinfo);
8607
8608 /* Schedule the SLP instances first, then handle loop vectorization
8609 below. */
8610 if (!loop_vinfo->slp_instances.is_empty ())
8611 {
8612 DUMP_VECT_SCOPE ("scheduling SLP instances");
8613 vect_schedule_slp (loop_vinfo);
8614 }
8615
8616 /* FORNOW: the vectorizer supports only loops whose body consists
8617 of one basic block (header + empty latch). When the vectorizer
8618 supports more involved loop forms, the order in which the BBs are
8619 traversed will need to be reconsidered. */
8620
8621 for (i = 0; i < nbbs; i++)
8622 {
8623 basic_block bb = bbs[i];
8624 stmt_vec_info stmt_info;
8625
8626 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8627 gsi_next (&si))
8628 {
8629 gphi *phi = si.phi ();
8630 if (dump_enabled_p ())
8631 dump_printf_loc (MSG_NOTE, vect_location,
8632 "------>vectorizing phi: %G", phi);
8633 stmt_info = loop_vinfo->lookup_stmt (phi);
8634 if (!stmt_info)
8635 continue;
8636
8637 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8638 vect_loop_kill_debug_uses (loop, stmt_info);
8639
8640 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8641 && !STMT_VINFO_LIVE_P (stmt_info))
8642 continue;
8643
8644 if (STMT_VINFO_VECTYPE (stmt_info)
8645 && (maybe_ne
8646 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8647 && dump_enabled_p ())
8648 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8649
8650 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8651 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8652 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
8653 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
8654 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
8655 && ! PURE_SLP_STMT (stmt_info))
8656 {
8657 if (dump_enabled_p ())
8658 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8659 vect_transform_stmt (loop_vinfo, stmt_info, NULL, NULL, NULL);
8660 }
8661 }
8662
8663 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8664 !gsi_end_p (si);)
8665 {
8666 stmt = gsi_stmt (si);
8667 /* During vectorization remove existing clobber stmts. */
8668 if (gimple_clobber_p (stmt))
8669 {
8670 unlink_stmt_vdef (stmt);
8671 gsi_remove (&si, true);
8672 release_defs (stmt);
8673 }
8674 else
8675 {
8676 /* Ignore vector stmts created in the outer loop. */
8677 stmt_info = loop_vinfo->lookup_stmt (stmt);
8678
8679 /* vector stmts created in the outer-loop during vectorization of
8680 stmts in an inner-loop may not have a stmt_info, and do not
8681 need to be vectorized. */
8682 stmt_vec_info seen_store = NULL;
8683 if (stmt_info)
8684 {
8685 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8686 {
8687 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8688 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8689 !gsi_end_p (subsi); gsi_next (&subsi))
8690 {
8691 stmt_vec_info pat_stmt_info
8692 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
8693 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
8694 &si, &seen_store);
8695 }
8696 stmt_vec_info pat_stmt_info
8697 = STMT_VINFO_RELATED_STMT (stmt_info);
8698 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
8699 &seen_store);
8700 }
8701 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
8702 &seen_store);
8703 }
8704 gsi_next (&si);
8705 if (seen_store)
8706 {
8707 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8708 /* Interleaving. The vectorization of the
8709 interleaving chain was completed - free
8710 all the stores in the chain. */
8711 vect_remove_stores (loop_vinfo,
8712 DR_GROUP_FIRST_ELEMENT (seen_store));
8713 else
8714 /* Free the attached stmt_vec_info and remove the stmt. */
8715 loop_vinfo->remove_stmt (stmt_info);
8716 }
8717 }
8718 }
8719
8720 /* Stub out scalar statements that must not survive vectorization.
8721 Doing this here helps with grouped statements, or statements that
8722 are involved in patterns. */
8723 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8724 !gsi_end_p (gsi); gsi_next (&gsi))
8725 {
8726 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8727 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8728 {
8729 tree lhs = gimple_get_lhs (call);
8730 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8731 {
8732 tree zero = build_zero_cst (TREE_TYPE (lhs));
8733 gimple *new_stmt = gimple_build_assign (lhs, zero);
8734 gsi_replace (&gsi, new_stmt, true);
8735 }
8736 }
8737 }
8738 } /* BBs in loop */
8739
8740 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8741 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8742 if (integer_onep (step_vector))
8743 niters_no_overflow = true;
8744 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8745 niters_vector_mult_vf, !niters_no_overflow);
8746
8747 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8748 scale_profile_for_vect_loop (loop, assumed_vf);
8749
8750 /* True if the final iteration might not handle a full vector's
8751 worth of scalar iterations. */
8752 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8753 /* The minimum number of iterations performed by the epilogue. This
8754 is 1 when peeling for gaps because we always need a final scalar
8755 iteration. */
8756 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8757 /* +1 to convert latch counts to loop iteration counts,
8758 -min_epilogue_iters to remove iterations that cannot be performed
8759 by the vector code. */
8760 int bias_for_lowest = 1 - min_epilogue_iters;
8761 int bias_for_assumed = bias_for_lowest;
8762 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8763 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8764 {
8765 /* When the amount of peeling is known at compile time, the first
8766 iteration will have exactly alignment_npeels active elements.
8767 In the worst case it will have at least one. */
8768 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8769 bias_for_lowest += lowest_vf - min_first_active;
8770 bias_for_assumed += assumed_vf - min_first_active;
8771 }
8772 /* In these calculations the "- 1" converts loop iteration counts
8773 back to latch counts. */
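  /* Worked example with made-up numbers: min_epilogue_iters == 0 gives
     bias_for_lowest == 1.  If the scalar latch bound is 102 (at most 103
     iterations) and lowest_vf == 4, the non-partial case below computes
     udiv_floor (102 + 1, 4) - 1 == 24, i.e. at most 25 vector iterations,
     expressed again as a latch count.  A fully-masked loop uses udiv_ceil
     instead, since its final vector iteration may be partial.  */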
8774 if (loop->any_upper_bound)
8775 loop->nb_iterations_upper_bound
8776 = (final_iter_may_be_partial
8777 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8778 lowest_vf) - 1
8779 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8780 lowest_vf) - 1);
8781 if (loop->any_likely_upper_bound)
8782 loop->nb_iterations_likely_upper_bound
8783 = (final_iter_may_be_partial
8784 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8785 + bias_for_lowest, lowest_vf) - 1
8786 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8787 + bias_for_lowest, lowest_vf) - 1);
8788 if (loop->any_estimate)
8789 loop->nb_iterations_estimate
8790 = (final_iter_may_be_partial
8791 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8792 assumed_vf) - 1
8793 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8794 assumed_vf) - 1);
8795
8796 if (dump_enabled_p ())
8797 {
8798 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8799 {
8800 dump_printf_loc (MSG_NOTE, vect_location,
8801 "LOOP VECTORIZED\n");
8802 if (loop->inner)
8803 dump_printf_loc (MSG_NOTE, vect_location,
8804 "OUTER LOOP VECTORIZED\n");
8805 dump_printf (MSG_NOTE, "\n");
8806 }
8807 else
8808 dump_printf_loc (MSG_NOTE, vect_location,
8809 "LOOP EPILOGUE VECTORIZED (MODE=%s)\n",
8810 GET_MODE_NAME (loop_vinfo->vector_mode));
8811 }
8812
8813 /* Loops vectorized with a variable factor won't benefit from
8814 unrolling/peeling. */
8815 if (!vf.is_constant ())
8816 {
8817 loop->unroll = 1;
8818 if (dump_enabled_p ())
8819 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
8820 " variable-length vectorization factor\n");
8821 }
8822 /* Free SLP instances here because otherwise stmt reference counting
8823 won't work. */
8824 slp_instance instance;
8825 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8826 vect_free_slp_instance (instance, true);
8827 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8828 /* Clear the safelen field since its value is invalid after vectorization:
8829 the vectorized loop can have loop-carried dependencies. */
8830 loop->safelen = 0;
8831
8832 if (epilogue)
8833 {
8834 update_epilogue_loop_vinfo (epilogue, advance);
8835
8836 epilogue->simduid = loop->simduid;
8837 epilogue->force_vectorize = loop->force_vectorize;
8838 epilogue->dont_vectorize = false;
8839 }
8840
8841 return epilogue;
8842 }
8843
8844 /* The code below performs a simple optimization - it reverts
8845 if-conversion for masked stores, i.e. if the mask of a store is zero,
8846 the store is skipped and, if possible, so are the producers of the
8847 stored values. For example,
8848 for (i=0; i<n; i++)
8849 if (c[i])
8850 {
8851 p1[i] += 1;
8852 p2[i] = p3[i] +2;
8853 }
8854 this transformation will produce the following semi-hammock:
8855
8856 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
8857 {
8858 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8859 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8860 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8861 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8862 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8863 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8864 }
8865 */
8866
8867 void
8868 optimize_mask_stores (class loop *loop)
8869 {
8870 basic_block *bbs = get_loop_body (loop);
8871 unsigned nbbs = loop->num_nodes;
8872 unsigned i;
8873 basic_block bb;
8874 class loop *bb_loop;
8875 gimple_stmt_iterator gsi;
8876 gimple *stmt;
8877 auto_vec<gimple *> worklist;
8878 auto_purge_vect_location sentinel;
8879
8880 vect_location = find_loop_location (loop);
8881 /* Pick up all masked stores in loop if any. */
8882 for (i = 0; i < nbbs; i++)
8883 {
8884 bb = bbs[i];
8885 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8886 gsi_next (&gsi))
8887 {
8888 stmt = gsi_stmt (gsi);
8889 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8890 worklist.safe_push (stmt);
8891 }
8892 }
8893
8894 free (bbs);
8895 if (worklist.is_empty ())
8896 return;
8897
8898 /* Loop has masked stores. */
8899 while (!worklist.is_empty ())
8900 {
8901 gimple *last, *last_store;
8902 edge e, efalse;
8903 tree mask;
8904 basic_block store_bb, join_bb;
8905 gimple_stmt_iterator gsi_to;
8906 tree vdef, new_vdef;
8907 gphi *phi;
8908 tree vectype;
8909 tree zero;
8910
8911 last = worklist.pop ();
8912 mask = gimple_call_arg (last, 2);
8913 bb = gimple_bb (last);
8914 /* Create then_bb and an if-then structure in the CFG; then_bb belongs
8915 to the same loop as if_bb. That loop can differ from LOOP when a
8916 two-level loop nest is vectorized and the mask_store belongs to the
8917 inner one. */
8918 e = split_block (bb, last);
8919 bb_loop = bb->loop_father;
8920 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8921 join_bb = e->dest;
8922 store_bb = create_empty_bb (bb);
8923 add_bb_to_loop (store_bb, bb_loop);
8924 e->flags = EDGE_TRUE_VALUE;
8925 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8926 /* Mark the edge into STORE_BB as unlikely. */
8927 efalse->probability = profile_probability::unlikely ();
8928 store_bb->count = efalse->count ();
8929 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8930 if (dom_info_available_p (CDI_DOMINATORS))
8931 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8932 if (dump_enabled_p ())
8933 dump_printf_loc (MSG_NOTE, vect_location,
8934 "Create new block %d to sink mask stores.",
8935 store_bb->index);
8936 /* Create vector comparison with boolean result. */
8937 vectype = TREE_TYPE (mask);
8938 zero = build_zero_cst (vectype);
8939 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8940 gsi = gsi_last_bb (bb);
8941 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
8942 /* Create new PHI node for vdef of the last masked store:
8943 .MEM_2 = VDEF <.MEM_1>
8944 will be converted to
8945 .MEM.3 = VDEF <.MEM_1>
8946 and new PHI node will be created in join bb
8947 .MEM_2 = PHI <.MEM_1, .MEM_3>
8948 */
8949 vdef = gimple_vdef (last);
8950 new_vdef = make_ssa_name (gimple_vop (cfun), last);
8951 gimple_set_vdef (last, new_vdef);
8952 phi = create_phi_node (vdef, join_bb);
8953 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8954
8955 /* Put all masked stores with the same mask to STORE_BB if possible. */
8956 while (true)
8957 {
8958 gimple_stmt_iterator gsi_from;
8959 gimple *stmt1 = NULL;
8960
8961 /* Move masked store to STORE_BB. */
8962 last_store = last;
8963 gsi = gsi_for_stmt (last);
8964 gsi_from = gsi;
8965 /* Shift GSI to the previous stmt for further traversal. */
8966 gsi_prev (&gsi);
8967 gsi_to = gsi_start_bb (store_bb);
8968 gsi_move_before (&gsi_from, &gsi_to);
8969 /* Set GSI_TO to the start of the now non-empty STORE_BB. */
8970 gsi_to = gsi_start_bb (store_bb);
8971 if (dump_enabled_p ())
8972 dump_printf_loc (MSG_NOTE, vect_location,
8973 "Move stmt to created bb\n%G", last);
8974 /* Move all stored value producers if possible. */
8975 while (!gsi_end_p (gsi))
8976 {
8977 tree lhs;
8978 imm_use_iterator imm_iter;
8979 use_operand_p use_p;
8980 bool res;
8981
8982 /* Skip debug statements. */
8983 if (is_gimple_debug (gsi_stmt (gsi)))
8984 {
8985 gsi_prev (&gsi);
8986 continue;
8987 }
8988 stmt1 = gsi_stmt (gsi);
8989 /* Do not consider statements writing to memory or having
8990 volatile operand. */
8991 if (gimple_vdef (stmt1)
8992 || gimple_has_volatile_ops (stmt1))
8993 break;
8994 gsi_from = gsi;
8995 gsi_prev (&gsi);
8996 lhs = gimple_get_lhs (stmt1);
8997 if (!lhs)
8998 break;
8999
9000 /* LHS of vectorized stmt must be SSA_NAME. */
9001 if (TREE_CODE (lhs) != SSA_NAME)
9002 break;
9003
9004 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9005 {
9006 /* Remove dead scalar statement. */
9007 if (has_zero_uses (lhs))
9008 {
9009 gsi_remove (&gsi_from, true);
9010 continue;
9011 }
9012 }
9013
9014 /* Check that LHS does not have uses outside of STORE_BB. */
9015 res = true;
9016 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9017 {
9018 gimple *use_stmt;
9019 use_stmt = USE_STMT (use_p);
9020 if (is_gimple_debug (use_stmt))
9021 continue;
9022 if (gimple_bb (use_stmt) != store_bb)
9023 {
9024 res = false;
9025 break;
9026 }
9027 }
9028 if (!res)
9029 break;
9030
9031 if (gimple_vuse (stmt1)
9032 && gimple_vuse (stmt1) != gimple_vuse (last_store))
9033 break;
9034
9035 /* Can move STMT1 to STORE_BB. */
9036 if (dump_enabled_p ())
9037 dump_printf_loc (MSG_NOTE, vect_location,
9038 "Move stmt to created bb\n%G", stmt1);
9039 gsi_move_before (&gsi_from, &gsi_to);
9040 /* Shift GSI_TO for further insertion. */
9041 gsi_prev (&gsi_to);
9042 }
9043 /* Put other masked stores with the same mask to STORE_BB. */
9044 if (worklist.is_empty ()
9045 || gimple_call_arg (worklist.last (), 2) != mask
9046 || worklist.last () != stmt1)
9047 break;
9048 last = worklist.pop ();
9049 }
9050 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
9051 }
9052 }
9053
9054 /* Decide whether it is possible to use a zero-based induction variable
9055 when vectorizing LOOP_VINFO with a fully-masked loop. If it is,
9056 return the value that the induction variable must be able to hold
9057 in order to ensure that the loop ends with an all-false mask.
9058 Return -1 otherwise. */
9059 widest_int
9060 vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo)
9061 {
9062 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9063 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9064 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
9065
9066 /* Calculate the value that the induction variable must be able
9067 to hit in order to ensure that we end the loop with an all-false mask.
9068 This involves adding the maximum number of inactive trailing scalar
9069 iterations. */
9070 widest_int iv_limit = -1;
9071 if (max_loop_iterations (loop, &iv_limit))
9072 {
9073 if (niters_skip)
9074 {
9075 /* Add the maximum number of skipped iterations to the
9076 maximum iteration count. */
9077 if (TREE_CODE (niters_skip) == INTEGER_CST)
9078 iv_limit += wi::to_widest (niters_skip);
9079 else
9080 iv_limit += max_vf - 1;
9081 }
9082 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
9083 /* Make a conservatively-correct assumption. */
9084 iv_limit += max_vf - 1;
9085
9086 /* IV_LIMIT is the maximum number of latch iterations, which is also
9087 the maximum in-range IV value. Round this value down to the previous
9088 vector alignment boundary and then add an extra full iteration. */
9089 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9090 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
9091 }
9092 return iv_limit;
9093 }
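/* Illustrative example with made-up values: if the loop's maximum latch
   count is 1000, there is no mask skip and no peeling for alignment, and
   VF == MAX_VF == 16, the code above rounds 1000 down to the previous
   multiple of 16 (992) and adds one full vector iteration, giving an
   iv_limit of 1008.  An IV that can hold 1008 is thus guaranteed to reach
   a value at which the loop mask becomes all-false.  */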
9094