1 /* Loop Vectorization
2 Copyright (C) 2003-2019 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57
58 /* Loop Vectorization Pass.
59
60 This pass tries to vectorize loops.
61
62 For example, the vectorizer transforms the following simple loop:
63
64 short a[N]; short b[N]; short c[N]; int i;
65
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
68 }
69
70 as if it were manually vectorized by rewriting the source code into:
71
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
76
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
82 }
83
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
95
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
101
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
106
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
117
118 For example, say stmt S1 was vectorized into stmt VS1:
119
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
123
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
128
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
133
134 Operands that are not SSA_NAMEs, are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
136
137 Target modeling:
138 =================
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that can support different sizes of vectors will, for now, need
142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
144
145 Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
151
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
154 */
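/* Illustrative sketch only (not part of the pass): the target-support
   check described above amounts to a query such as

     optab_handler (add_optab, V8HImode) != CODE_FOR_nothing

   where add_optab and V8HImode merely stand in for whatever operation
   and vector mode are actually under consideration.  */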
155
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
157 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
158 bool *);
159
160 /* Subroutine of vect_determine_vf_for_stmt that handles only one
161 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
162 may already be set for general statements (not just data refs). */
163
164 static opt_result
165 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
166 bool vectype_maybe_set_p,
167 poly_uint64 *vf,
168 vec<stmt_vec_info > *mask_producers)
169 {
170 gimple *stmt = stmt_info->stmt;
171
172 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
173 && !STMT_VINFO_LIVE_P (stmt_info))
174 || gimple_clobber_p (stmt))
175 {
176 if (dump_enabled_p ())
177 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
178 return opt_result::success ();
179 }
180
181 tree stmt_vectype, nunits_vectype;
182 opt_result res = vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
183 &nunits_vectype);
184 if (!res)
185 return res;
186
187 if (stmt_vectype)
188 {
189 if (STMT_VINFO_VECTYPE (stmt_info))
190 /* The only case when a vectype had been already set is for stmts
191 that contain a data ref, or for "pattern-stmts" (stmts generated
192 by the vectorizer to represent/replace a certain idiom). */
193 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
194 || vectype_maybe_set_p)
195 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
196 else if (stmt_vectype == boolean_type_node)
197 mask_producers->safe_push (stmt_info);
198 else
199 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
200 }
201
202 if (nunits_vectype)
203 vect_update_max_nunits (vf, nunits_vectype);
204
205 return opt_result::success ();
206 }
207
208 /* Subroutine of vect_determine_vectorization_factor. Set the vector
209 types of STMT_INFO and all attached pattern statements and update
210 the vectorization factor VF accordingly. If some of the statements
211 produce a mask result whose vector type can only be calculated later,
212 add them to MASK_PRODUCERS. Return true on success or false if
213 something prevented vectorization. */
214
215 static opt_result
216 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
217 vec<stmt_vec_info > *mask_producers)
218 {
219 vec_info *vinfo = stmt_info->vinfo;
220 if (dump_enabled_p ())
221 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
222 stmt_info->stmt);
223 opt_result res
224 = vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers);
225 if (!res)
226 return res;
227
228 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
229 && STMT_VINFO_RELATED_STMT (stmt_info))
230 {
231 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
232 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
233
234 /* If a pattern statement has def stmts, analyze them too. */
235 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
236 !gsi_end_p (si); gsi_next (&si))
237 {
238 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
239 if (dump_enabled_p ())
240 dump_printf_loc (MSG_NOTE, vect_location,
241 "==> examining pattern def stmt: %G",
242 def_stmt_info->stmt);
243 res = vect_determine_vf_for_stmt_1 (def_stmt_info, true,
244 vf, mask_producers);
247 if (!res)
248 return res;
249 }
250
251 if (dump_enabled_p ())
252 dump_printf_loc (MSG_NOTE, vect_location,
253 "==> examining pattern statement: %G",
254 stmt_info->stmt);
255 res = vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers);
256 if (!res)
257 return res;
258 }
259
260 return opt_result::success ();
261 }
262
263 /* Function vect_determine_vectorization_factor
264
265 Determine the vectorization factor (VF). VF is the number of data elements
266 that are operated upon in parallel in a single iteration of the vectorized
267 loop. For example, when vectorizing a loop that operates on 4byte elements,
268 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
269 elements can fit in a single vector register.
270
271 We currently support vectorization of loops in which all types operated upon
272 are of the same size. Therefore this function currently sets VF according to
273 the size of the types operated upon, and fails if there are multiple sizes
274 in the loop.
275
276 VF is also the factor by which the loop iterations are strip-mined, e.g.:
277 original loop:
278 for (i=0; i<N; i++){
279 a[i] = b[i] + c[i];
280 }
281
282 vectorized loop:
283 for (i=0; i<N; i+=VF){
284 a[i:VF] = b[i:VF] + c[i:VF];
285 }
286 */
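/* Worked example (illustrative): for 4-byte ints on a target with
   16-byte vectors, get_vectype_for_scalar_type returns a vectype with
   TYPE_VECTOR_SUBPARTS == 4, so the vect_update_max_nunits calls below
   settle on VF = 4 and the loop is strip-mined by 4 as shown above.  */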
287
288 static opt_result
289 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
290 {
291 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
292 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
293 unsigned nbbs = loop->num_nodes;
294 poly_uint64 vectorization_factor = 1;
295 tree scalar_type = NULL_TREE;
296 gphi *phi;
297 tree vectype;
298 stmt_vec_info stmt_info;
299 unsigned i;
300 auto_vec<stmt_vec_info> mask_producers;
301
302 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
303
304 for (i = 0; i < nbbs; i++)
305 {
306 basic_block bb = bbs[i];
307
308 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
309 gsi_next (&si))
310 {
311 phi = si.phi ();
312 stmt_info = loop_vinfo->lookup_stmt (phi);
313 if (dump_enabled_p ())
314 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
315 phi);
316
317 gcc_assert (stmt_info);
318
319 if (STMT_VINFO_RELEVANT_P (stmt_info)
320 || STMT_VINFO_LIVE_P (stmt_info))
321 {
322 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
323 scalar_type = TREE_TYPE (PHI_RESULT (phi));
324
325 if (dump_enabled_p ())
326 dump_printf_loc (MSG_NOTE, vect_location,
327 "get vectype for scalar type: %T\n",
328 scalar_type);
329
330 vectype = get_vectype_for_scalar_type (scalar_type);
331 if (!vectype)
332 return opt_result::failure_at (phi,
333 "not vectorized: unsupported "
334 "data-type %T\n",
335 scalar_type);
336 STMT_VINFO_VECTYPE (stmt_info) = vectype;
337
338 if (dump_enabled_p ())
339 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
340 vectype);
341
342 if (dump_enabled_p ())
343 {
344 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
345 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
346 dump_printf (MSG_NOTE, "\n");
347 }
348
349 vect_update_max_nunits (&vectorization_factor, vectype);
350 }
351 }
352
353 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
354 gsi_next (&si))
355 {
356 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
357 opt_result res
358 = vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
359 &mask_producers);
360 if (!res)
361 return res;
362 }
363 }
364
365 /* TODO: Analyze cost. Decide if worth while to vectorize. */
366 if (dump_enabled_p ())
367 {
368 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
369 dump_dec (MSG_NOTE, vectorization_factor);
370 dump_printf (MSG_NOTE, "\n");
371 }
372
373 if (known_le (vectorization_factor, 1U))
374 return opt_result::failure_at (vect_location,
375 "not vectorized: unsupported data-type\n");
376 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
377
378 for (i = 0; i < mask_producers.length (); i++)
379 {
380 stmt_info = mask_producers[i];
381 opt_tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
382 if (!mask_type)
383 return opt_result::propagate_failure (mask_type);
384 STMT_VINFO_VECTYPE (stmt_info) = mask_type;
385 }
386
387 return opt_result::success ();
388 }
389
390
391 /* Function vect_is_simple_iv_evolution.
392
393 FORNOW: A simple evolution of an induction variable in the loop is
394 considered a polynomial evolution. */
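/* Example (illustrative): for the IV "i" in

     for (i = 0; i < n; i++)
       a[i] = 0;

   scev describes the evolution of "i" in loop 1 as the chrec
   {0, +, 1}_1, so *INIT becomes 0 and *STEP becomes 1 below.  An
   evolution whose step is itself a chrec (polynomial of degree >= 2)
   is rejected as not "simple".  */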
395
396 static bool
397 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
398 tree * step)
399 {
400 tree init_expr;
401 tree step_expr;
402 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
403 basic_block bb;
404
405 /* When there is no evolution in this loop, the evolution function
406 is not "simple". */
407 if (evolution_part == NULL_TREE)
408 return false;
409
410 /* When the evolution is a polynomial of degree >= 2
411 the evolution function is not "simple". */
412 if (tree_is_chrec (evolution_part))
413 return false;
414
415 step_expr = evolution_part;
416 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
417
418 if (dump_enabled_p ())
419 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
420 step_expr, init_expr);
421
422 *init = init_expr;
423 *step = step_expr;
424
425 if (TREE_CODE (step_expr) != INTEGER_CST
426 && (TREE_CODE (step_expr) != SSA_NAME
427 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
428 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
429 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
430 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
431 || !flag_associative_math)))
432 && (TREE_CODE (step_expr) != REAL_CST
433 || !flag_associative_math))
434 {
435 if (dump_enabled_p ())
436 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
437 "step unknown.\n");
438 return false;
439 }
440
441 return true;
442 }
443
444 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
445 what we are assuming is a double reduction. For example, given
446 a structure like this:
447
448 outer1:
449 x_1 = PHI <x_4(outer2), ...>;
450 ...
451
452 inner:
453 x_2 = PHI <x_1(outer1), ...>;
454 ...
455 x_3 = ...;
456 ...
457
458 outer2:
459 x_4 = PHI <x_3(inner)>;
460 ...
461
462 outer loop analysis would treat x_1 as a double reduction phi and
463 this function would then return true for x_2. */
464
465 static bool
466 vect_inner_phi_in_double_reduction_p (stmt_vec_info stmt_info, gphi *phi)
467 {
468 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
469 use_operand_p use_p;
470 ssa_op_iter op_iter;
471 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
472 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
473 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
474 return true;
475 return false;
476 }
477
478 /* Function vect_analyze_scalar_cycles_1.
479
480 Examine the cross iteration def-use cycles of scalar variables
481 in LOOP. LOOP_VINFO represents the loop that is now being
482 considered for vectorization (can be LOOP, or an outer-loop
483 enclosing LOOP). */
484
485 static void
486 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
487 {
488 basic_block bb = loop->header;
489 tree init, step;
490 auto_vec<stmt_vec_info, 64> worklist;
491 gphi_iterator gsi;
492 bool double_reduc;
493
494 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
495
496 /* First - identify all inductions. Reduction detection assumes that all the
497 inductions have been identified, therefore, this order must not be
498 changed. */
499 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
500 {
501 gphi *phi = gsi.phi ();
502 tree access_fn = NULL;
503 tree def = PHI_RESULT (phi);
504 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
505
506 if (dump_enabled_p ())
507 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
508
509 /* Skip virtual phis. The data dependences that are associated with
510 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
511 if (virtual_operand_p (def))
512 continue;
513
514 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
515
516 /* Analyze the evolution function. */
517 access_fn = analyze_scalar_evolution (loop, def);
518 if (access_fn)
519 {
520 STRIP_NOPS (access_fn);
521 if (dump_enabled_p ())
522 dump_printf_loc (MSG_NOTE, vect_location,
523 "Access function of PHI: %T\n", access_fn);
524 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
525 = initial_condition_in_loop_num (access_fn, loop->num);
526 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
527 = evolution_part_in_loop_num (access_fn, loop->num);
528 }
529
530 if (!access_fn
531 || vect_inner_phi_in_double_reduction_p (stmt_vinfo, phi)
532 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
533 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
534 && TREE_CODE (step) != INTEGER_CST))
535 {
536 worklist.safe_push (stmt_vinfo);
537 continue;
538 }
539
540 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
541 != NULL_TREE);
542 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
543
544 if (dump_enabled_p ())
545 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
546 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
547 }
548
549
550 /* Second - identify all reductions and nested cycles. */
551 while (worklist.length () > 0)
552 {
553 stmt_vec_info stmt_vinfo = worklist.pop ();
554 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
555 tree def = PHI_RESULT (phi);
556
557 if (dump_enabled_p ())
558 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
559
560 gcc_assert (!virtual_operand_p (def)
561 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
562
563 stmt_vec_info reduc_stmt_info
564 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc);
565 if (reduc_stmt_info)
566 {
567 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
568 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
569 if (double_reduc)
570 {
571 if (dump_enabled_p ())
572 dump_printf_loc (MSG_NOTE, vect_location,
573 "Detected double reduction.\n");
574
575 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
576 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
577 }
578 else
579 {
580 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
581 {
582 if (dump_enabled_p ())
583 dump_printf_loc (MSG_NOTE, vect_location,
584 "Detected vectorizable nested cycle.\n");
585
586 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
587 }
588 else
589 {
590 if (dump_enabled_p ())
591 dump_printf_loc (MSG_NOTE, vect_location,
592 "Detected reduction.\n");
593
594 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
595 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
596 /* Store the reduction cycles for possible vectorization in
597 loop-aware SLP if it was not detected as reduction
598 chain. */
599 if (! REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
600 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
601 (reduc_stmt_info);
602 }
603 }
604 }
605 else
606 if (dump_enabled_p ())
607 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
608 "Unknown def-use cycle pattern.\n");
609 }
610 }
611
612
613 /* Function vect_analyze_scalar_cycles.
614
615 Examine the cross iteration def-use cycles of scalar variables, by
616 analyzing the loop-header PHIs of scalar variables. Classify each
617 cycle as one of the following: invariant, induction, reduction, unknown.
618 We do that for the loop represented by LOOP_VINFO, and also for its
619 inner-loop, if one exists.
620 Examples for scalar cycles:
621
622 Example1: reduction:
623
624 loop1:
625 for (i=0; i<N; i++)
626 sum += a[i];
627
628 Example2: induction:
629
630 loop2:
631 for (i=0; i<N; i++)
632 a[i] = i; */
633
634 static void
635 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
636 {
637 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
638
639 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
640
641 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
642 Reductions in such inner-loop therefore have different properties than
643 the reductions in the nest that gets vectorized:
644 1. When vectorized, they are executed in the same order as in the original
645 scalar loop, so we can't change the order of computation when
646 vectorizing them.
647 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
648 current checks are too strict. */
649
650 if (loop->inner)
651 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
652 }
653
654 /* Transfer group and reduction information from STMT_INFO to its
655 pattern stmt. */
656
657 static void
658 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
659 {
660 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
661 stmt_vec_info stmtp;
662 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
663 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
664 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
665 do
666 {
667 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
668 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
669 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
670 if (stmt_info)
671 REDUC_GROUP_NEXT_ELEMENT (stmtp)
672 = STMT_VINFO_RELATED_STMT (stmt_info);
673 }
674 while (stmt_info);
675 STMT_VINFO_DEF_TYPE (stmtp) = vect_reduction_def;
676 }
677
678 /* Fixup scalar cycles that now have their stmts detected as patterns. */
679
680 static void
681 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
682 {
683 stmt_vec_info first;
684 unsigned i;
685
686 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
687 if (STMT_VINFO_IN_PATTERN_P (first))
688 {
689 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
690 while (next)
691 {
692 if (! STMT_VINFO_IN_PATTERN_P (next))
693 break;
694 next = REDUC_GROUP_NEXT_ELEMENT (next);
695 }
696 /* If not all stmts in the chain are patterns, try to handle
697 the chain without patterns. */
698 if (! next)
699 {
700 vect_fixup_reduc_chain (first);
701 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
702 = STMT_VINFO_RELATED_STMT (first);
703 }
704 }
705 }
706
707 /* Function vect_get_loop_niters.
708
709 Determine how many iterations the loop is executed and place it
710 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
711 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
712 niter information holds in ASSUMPTIONS.
713
714 Return the loop exit condition. */
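/* Example (illustrative): for a loop whose body executes 1000 times,
   the latch is executed 999 times, so NUMBER_OF_ITERATIONSM1 is set to
   999 and NUMBER_OF_ITERATIONS (the number of header executions) to
   1000.  ASSUMPTIONS stays boolean_true_node unless the niter analysis
   only holds conditionally.  */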
715
716
717 static gcond *
718 vect_get_loop_niters (class loop *loop, tree *assumptions,
719 tree *number_of_iterations, tree *number_of_iterationsm1)
720 {
721 edge exit = single_exit (loop);
722 class tree_niter_desc niter_desc;
723 tree niter_assumptions, niter, may_be_zero;
724 gcond *cond = get_loop_exit_condition (loop);
725
726 *assumptions = boolean_true_node;
727 *number_of_iterationsm1 = chrec_dont_know;
728 *number_of_iterations = chrec_dont_know;
729 DUMP_VECT_SCOPE ("get_loop_niters");
730
731 if (!exit)
732 return cond;
733
734 may_be_zero = NULL_TREE;
735 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
736 || chrec_contains_undetermined (niter_desc.niter))
737 return cond;
738
739 niter_assumptions = niter_desc.assumptions;
740 may_be_zero = niter_desc.may_be_zero;
741 niter = niter_desc.niter;
742
743 if (may_be_zero && integer_zerop (may_be_zero))
744 may_be_zero = NULL_TREE;
745
746 if (may_be_zero)
747 {
748 if (COMPARISON_CLASS_P (may_be_zero))
749 {
750 /* Try to combine may_be_zero with assumptions, this can simplify
751 computation of niter expression. */
752 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
753 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
754 niter_assumptions,
755 fold_build1 (TRUTH_NOT_EXPR,
756 boolean_type_node,
757 may_be_zero));
758 else
759 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
760 build_int_cst (TREE_TYPE (niter), 0),
761 rewrite_to_non_trapping_overflow (niter));
762
763 may_be_zero = NULL_TREE;
764 }
765 else if (integer_nonzerop (may_be_zero))
766 {
767 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
768 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
769 return cond;
770 }
771 else
772 return cond;
773 }
774
775 *assumptions = niter_assumptions;
776 *number_of_iterationsm1 = niter;
777
778 /* We want the number of loop header executions which is the number
779 of latch executions plus one.
780 ??? For UINT_MAX latch executions this number overflows to zero
781 for loops like do { n++; } while (n != 0); */
782 if (niter && !chrec_contains_undetermined (niter))
783 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
784 build_int_cst (TREE_TYPE (niter), 1));
785 *number_of_iterations = niter;
786
787 return cond;
788 }
789
790 /* Function bb_in_loop_p
791
792 Used as predicate for dfs order traversal of the loop bbs. */
793
794 static bool
795 bb_in_loop_p (const_basic_block bb, const void *data)
796 {
797 const class loop *const loop = (const class loop *)data;
798 if (flow_bb_inside_loop_p (loop, bb))
799 return true;
800 return false;
801 }
802
803
804 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
805 stmt_vec_info structs for all the stmts in LOOP_IN. */
806
807 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
808 : vec_info (vec_info::loop, init_cost (loop_in), shared),
809 loop (loop_in),
810 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
811 num_itersm1 (NULL_TREE),
812 num_iters (NULL_TREE),
813 num_iters_unchanged (NULL_TREE),
814 num_iters_assumptions (NULL_TREE),
815 th (0),
816 versioning_threshold (0),
817 vectorization_factor (0),
818 max_vectorization_factor (0),
819 mask_skip_niters (NULL_TREE),
820 mask_compare_type (NULL_TREE),
821 simd_if_cond (NULL_TREE),
822 unaligned_dr (NULL),
823 peeling_for_alignment (0),
824 ptr_mask (0),
825 ivexpr_map (NULL),
826 scan_map (NULL),
827 slp_unrolling_factor (1),
828 single_scalar_iteration_cost (0),
829 vectorizable (false),
830 can_fully_mask_p (true),
831 fully_masked_p (false),
832 peeling_for_gaps (false),
833 peeling_for_niter (false),
834 no_data_dependencies (false),
835 has_mask_store (false),
836 scalar_loop_scaling (profile_probability::uninitialized ()),
837 scalar_loop (NULL),
838 orig_loop_info (NULL)
839 {
840 /* CHECKME: We want to visit all BBs before their successors (except for
841 latch blocks, for which this assertion wouldn't hold). In the simple
842 case of the loop forms we allow, a dfs order of the BBs would be the same
843 as reversed postorder traversal, so we are safe. */
844
845 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
846 bbs, loop->num_nodes, loop);
847 gcc_assert (nbbs == loop->num_nodes);
848
849 for (unsigned int i = 0; i < nbbs; i++)
850 {
851 basic_block bb = bbs[i];
852 gimple_stmt_iterator si;
853
854 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
855 {
856 gimple *phi = gsi_stmt (si);
857 gimple_set_uid (phi, 0);
858 add_stmt (phi);
859 }
860
861 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
862 {
863 gimple *stmt = gsi_stmt (si);
864 gimple_set_uid (stmt, 0);
865 add_stmt (stmt);
866 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
867 third argument is the #pragma omp simd if (x) condition: when it is 0,
868 the loop shouldn't be vectorized; when it is a non-zero constant, it
869 should be vectorized normally; otherwise the loop is versioned, with the
870 vectorized copy taken if the condition is non-zero at runtime. */
871 if (loop_in->simduid
872 && is_gimple_call (stmt)
873 && gimple_call_internal_p (stmt)
874 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
875 && gimple_call_num_args (stmt) >= 3
876 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
877 && (loop_in->simduid
878 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
879 {
880 tree arg = gimple_call_arg (stmt, 2);
881 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
882 simd_if_cond = arg;
883 else
884 gcc_assert (integer_nonzerop (arg));
885 }
886 }
887 }
888 }
889
890 /* Free all levels of MASKS. */
891
892 void
893 release_vec_loop_masks (vec_loop_masks *masks)
894 {
895 rgroup_masks *rgm;
896 unsigned int i;
897 FOR_EACH_VEC_ELT (*masks, i, rgm)
898 rgm->masks.release ();
899 masks->release ();
900 }
901
902 /* Free all memory used by the _loop_vec_info, as well as all the
903 stmt_vec_info structs of all the stmts in the loop. */
904
905 _loop_vec_info::~_loop_vec_info ()
906 {
907 free (bbs);
908
909 release_vec_loop_masks (&masks);
910 delete ivexpr_map;
911 delete scan_map;
912
913 loop->aux = NULL;
914 }
915
916 /* Return an invariant or register for EXPR and emit necessary
917 computations in the LOOP_VINFO loop preheader. */
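/* Usage sketch (illustrative, hypothetical caller): code that needs
   "niters - 1" as a gimple value ahead of the loop might call

     tree nitersm1
       = cse_and_gimplify_to_preheader (loop_vinfo,
					fold_build2 (MINUS_EXPR, type,
						     niters,
						     build_one_cst (type)));

   Asking again for the same expression hits the ivexpr_map cache
   instead of emitting the computation a second time.  */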
918
919 tree
920 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
921 {
922 if (is_gimple_reg (expr)
923 || is_gimple_min_invariant (expr))
924 return expr;
925
926 if (! loop_vinfo->ivexpr_map)
927 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
928 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
929 if (! cached)
930 {
931 gimple_seq stmts = NULL;
932 cached = force_gimple_operand (unshare_expr (expr),
933 &stmts, true, NULL_TREE);
934 if (stmts)
935 {
936 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
937 gsi_insert_seq_on_edge_immediate (e, stmts);
938 }
939 }
940 return cached;
941 }
942
943 /* Return true if we can use CMP_TYPE as the comparison type to produce
944 all masks required to mask LOOP_VINFO. */
945
946 static bool
947 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
948 {
949 rgroup_masks *rgm;
950 unsigned int i;
951 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
952 if (rgm->mask_type != NULL_TREE
953 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
954 cmp_type, rgm->mask_type,
955 OPTIMIZE_FOR_SPEED))
956 return false;
957 return true;
958 }
959
960 /* Calculate the maximum number of scalars per iteration for every
961 rgroup in LOOP_VINFO. */
962
963 static unsigned int
964 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
965 {
966 unsigned int res = 1;
967 unsigned int i;
968 rgroup_masks *rgm;
969 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
970 res = MAX (res, rgm->max_nscalars_per_iter);
971 return res;
972 }
973
974 /* Each statement in LOOP_VINFO can be masked where necessary. Check
975 whether we can actually generate the masks required. Return true if so,
976 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
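/* Worked example (illustrative): if the loop runs at most 1000
   iterations and the widest rgroup handles 2 scalars per iteration,
   max_ni below becomes 2000 and min_ni_width is
   wi::min_precision (2000, UNSIGNED) == 11, so any supported integer
   mode of at least 11 bits for which WHILE_ULT masks can be produced
   is a candidate comparison type.  */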
977
978 static bool
979 vect_verify_full_masking (loop_vec_info loop_vinfo)
980 {
981 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
982 unsigned int min_ni_width;
983 unsigned int max_nscalars_per_iter
984 = vect_get_max_nscalars_per_iter (loop_vinfo);
985
986 /* Use a normal loop if there are no statements that need masking.
987 This only happens in rare degenerate cases: it means that the loop
988 has no loads, no stores, and no live-out values. */
989 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
990 return false;
991
992 /* Get the maximum number of iterations that is representable
993 in the counter type. */
994 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
995 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
996
997 /* Get a more refined estimate for the number of iterations. */
998 widest_int max_back_edges;
999 if (max_loop_iterations (loop, &max_back_edges))
1000 max_ni = wi::smin (max_ni, max_back_edges + 1);
1001
1002 /* Account for rgroup masks, in which each bit is replicated N times. */
1003 max_ni *= max_nscalars_per_iter;
1004
1005 /* Work out how many bits we need to represent the limit. */
1006 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1007
1008 /* Find a scalar mode for which WHILE_ULT is supported. */
1009 opt_scalar_int_mode cmp_mode_iter;
1010 tree cmp_type = NULL_TREE;
1011 tree iv_type = NULL_TREE;
1012 widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
1013 unsigned int iv_precision = UINT_MAX;
1014
1015 if (iv_limit != -1)
1016 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1017 UNSIGNED);
1018
1019 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1020 {
1021 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1022 if (cmp_bits >= min_ni_width
1023 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1024 {
1025 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1026 if (this_type
1027 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1028 {
1029 /* Although we could stop as soon as we find a valid mode,
1030 there are at least two reasons why that's not always the
1031 best choice:
1032
1033 - An IV that's Pmode or wider is more likely to be reusable
1034 in address calculations than an IV that's narrower than
1035 Pmode.
1036
1037 - Doing the comparison in IV_PRECISION or wider allows
1038 a natural 0-based IV, whereas using a narrower comparison
1039 type requires mitigations against wrap-around.
1040
1041 Conversely, if the IV limit is variable, doing the comparison
1042 in a wider type than the original type can introduce
1043 unnecessary extensions, so picking the widest valid mode
1044 is not always a good choice either.
1045
1046 Here we prefer the first IV type that's Pmode or wider,
1047 and the first comparison type that's IV_PRECISION or wider.
1048 (The comparison type must be no wider than the IV type,
1049 to avoid extensions in the vector loop.)
1050
1051 ??? We might want to try continuing beyond Pmode for ILP32
1052 targets if CMP_BITS < IV_PRECISION. */
1053 iv_type = this_type;
1054 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1055 cmp_type = this_type;
1056 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1057 break;
1058 }
1059 }
1060 }
1061
1062 if (!cmp_type)
1063 return false;
1064
1065 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1066 LOOP_VINFO_MASK_IV_TYPE (loop_vinfo) = iv_type;
1067 return true;
1068 }
1069
1070 /* Calculate the cost of one scalar iteration of the loop. */
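/* Example (illustrative): a scalar iteration containing one load, one
   add and one store contributes one scalar_load, one scalar_stmt and
   one scalar_store cost; statements in an inner loop are weighted by
   the FORNOW factor of 50 used below.  */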
1071 static void
1072 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1073 {
1074 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1075 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1076 int nbbs = loop->num_nodes, factor;
1077 int innerloop_iters, i;
1078
1079 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1080
1081 /* Gather costs for statements in the scalar loop. */
1082
1083 /* FORNOW. */
1084 innerloop_iters = 1;
1085 if (loop->inner)
1086 innerloop_iters = 50; /* FIXME */
1087
1088 for (i = 0; i < nbbs; i++)
1089 {
1090 gimple_stmt_iterator si;
1091 basic_block bb = bbs[i];
1092
1093 if (bb->loop_father == loop->inner)
1094 factor = innerloop_iters;
1095 else
1096 factor = 1;
1097
1098 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1099 {
1100 gimple *stmt = gsi_stmt (si);
1101 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1102
1103 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1104 continue;
1105
1106 /* Skip stmts that are not vectorized inside the loop. */
1107 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1108 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1109 && (!STMT_VINFO_LIVE_P (vstmt_info)
1110 || !VECTORIZABLE_CYCLE_DEF
1111 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1112 continue;
1113
1114 vect_cost_for_stmt kind;
1115 if (STMT_VINFO_DATA_REF (stmt_info))
1116 {
1117 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1118 kind = scalar_load;
1119 else
1120 kind = scalar_store;
1121 }
1122 else
1123 kind = scalar_stmt;
1124
1125 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1126 factor, kind, stmt_info, 0, vect_prologue);
1127 }
1128 }
1129
1130 /* Now accumulate cost. */
1131 void *target_cost_data = init_cost (loop);
1132 stmt_info_for_cost *si;
1133 int j;
1134 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1135 j, si)
1136 (void) add_stmt_cost (target_cost_data, si->count,
1137 si->kind, si->stmt_info, si->misalign,
1138 vect_body);
1139 unsigned dummy, body_cost = 0;
1140 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1141 destroy_cost_data (target_cost_data);
1142 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1143 }
1144
1145
1146 /* Function vect_analyze_loop_form_1.
1147
1148 Verify that certain CFG restrictions hold, including:
1149 - the loop has a pre-header
1150 - the loop has a single entry and exit
1151 - the loop exit condition is simple enough
1152 - the number of iterations can be analyzed, i.e., it is a countable loop.
1153 The niter could be analyzed under some assumptions. */
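/* Example (illustrative): a simple counted loop such as

     for (i = 0; i < n; i++)
       a[i] = b[i];

   satisfies these restrictions (two basic blocks, a single exit and a
   computable niter), whereas a loop whose body contains an early
   "break" or other control flow is rejected below.  */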
1154
1155 opt_result
1156 vect_analyze_loop_form_1 (class loop *loop, gcond **loop_cond,
1157 tree *assumptions, tree *number_of_iterationsm1,
1158 tree *number_of_iterations, gcond **inner_loop_cond)
1159 {
1160 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1161
1162 /* Different restrictions apply when we are considering an inner-most loop,
1163 vs. an outer (nested) loop.
1164 (FORNOW. May want to relax some of these restrictions in the future). */
1165
1166 if (!loop->inner)
1167 {
1168 /* Inner-most loop. We currently require that the number of BBs is
1169 exactly 2 (the header and latch). Vectorizable inner-most loops
1170 look like this:
1171
1172 (pre-header)
1173 |
1174 header <--------+
1175 | | |
1176 | +--> latch --+
1177 |
1178 (exit-bb) */
1179
1180 if (loop->num_nodes != 2)
1181 return opt_result::failure_at (vect_location,
1182 "not vectorized:"
1183 " control flow in loop.\n");
1184
1185 if (empty_block_p (loop->header))
1186 return opt_result::failure_at (vect_location,
1187 "not vectorized: empty loop.\n");
1188 }
1189 else
1190 {
1191 class loop *innerloop = loop->inner;
1192 edge entryedge;
1193
1194 /* Nested loop. We currently require that the loop is doubly-nested,
1195 contains a single inner loop, and the number of BBs is exactly 5.
1196 Vectorizable outer-loops look like this:
1197
1198 (pre-header)
1199 |
1200 header <---+
1201 | |
1202 inner-loop |
1203 | |
1204 tail ------+
1205 |
1206 (exit-bb)
1207
1208 The inner-loop has the properties expected of inner-most loops
1209 as described above. */
1210
1211 if ((loop->inner)->inner || (loop->inner)->next)
1212 return opt_result::failure_at (vect_location,
1213 "not vectorized:"
1214 " multiple nested loops.\n");
1215
1216 if (loop->num_nodes != 5)
1217 return opt_result::failure_at (vect_location,
1218 "not vectorized:"
1219 " control flow in loop.\n");
1220
1221 entryedge = loop_preheader_edge (innerloop);
1222 if (entryedge->src != loop->header
1223 || !single_exit (innerloop)
1224 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1225 return opt_result::failure_at (vect_location,
1226 "not vectorized:"
1227 " unsupported outerloop form.\n");
1228
1229 /* Analyze the inner-loop. */
1230 tree inner_niterm1, inner_niter, inner_assumptions;
1231 opt_result res
1232 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1233 &inner_assumptions, &inner_niterm1,
1234 &inner_niter, NULL);
1235 if (!res)
1236 {
1237 if (dump_enabled_p ())
1238 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1239 "not vectorized: Bad inner loop.\n");
1240 return res;
1241 }
1242
1243 /* Don't support analyzing niter under assumptions for inner
1244 loop. */
1245 if (!integer_onep (inner_assumptions))
1246 return opt_result::failure_at (vect_location,
1247 "not vectorized: Bad inner loop.\n");
1248
1249 if (!expr_invariant_in_loop_p (loop, inner_niter))
1250 return opt_result::failure_at (vect_location,
1251 "not vectorized: inner-loop count not"
1252 " invariant.\n");
1253
1254 if (dump_enabled_p ())
1255 dump_printf_loc (MSG_NOTE, vect_location,
1256 "Considering outer-loop vectorization.\n");
1257 }
1258
1259 if (!single_exit (loop))
1260 return opt_result::failure_at (vect_location,
1261 "not vectorized: multiple exits.\n");
1262 if (EDGE_COUNT (loop->header->preds) != 2)
1263 return opt_result::failure_at (vect_location,
1264 "not vectorized:"
1265 " too many incoming edges.\n");
1266
1267 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1268 that the loop is represented as a do-while (with a proper if-guard
1269 before the loop if needed), where the loop header contains all the
1270 executable statements, and the latch is empty. */
1271 if (!empty_block_p (loop->latch)
1272 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1273 return opt_result::failure_at (vect_location,
1274 "not vectorized: latch block not empty.\n");
1275
1276 /* Make sure the exit is not abnormal. */
1277 edge e = single_exit (loop);
1278 if (e->flags & EDGE_ABNORMAL)
1279 return opt_result::failure_at (vect_location,
1280 "not vectorized:"
1281 " abnormal loop exit edge.\n");
1282
1283 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1284 number_of_iterationsm1);
1285 if (!*loop_cond)
1286 return opt_result::failure_at
1287 (vect_location,
1288 "not vectorized: complicated exit condition.\n");
1289
1290 if (integer_zerop (*assumptions)
1291 || !*number_of_iterations
1292 || chrec_contains_undetermined (*number_of_iterations))
1293 return opt_result::failure_at
1294 (*loop_cond,
1295 "not vectorized: number of iterations cannot be computed.\n");
1296
1297 if (integer_zerop (*number_of_iterations))
1298 return opt_result::failure_at
1299 (*loop_cond,
1300 "not vectorized: number of iterations = 0.\n");
1301
1302 return opt_result::success ();
1303 }
1304
1305 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1306
1307 opt_loop_vec_info
1308 vect_analyze_loop_form (class loop *loop, vec_info_shared *shared)
1309 {
1310 tree assumptions, number_of_iterations, number_of_iterationsm1;
1311 gcond *loop_cond, *inner_loop_cond = NULL;
1312
1313 opt_result res
1314 = vect_analyze_loop_form_1 (loop, &loop_cond,
1315 &assumptions, &number_of_iterationsm1,
1316 &number_of_iterations, &inner_loop_cond);
1317 if (!res)
1318 return opt_loop_vec_info::propagate_failure (res);
1319
1320 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1321 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1322 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1323 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1324 if (!integer_onep (assumptions))
1325 {
1326 /* We consider to vectorize this loop by versioning it under
1327 some assumptions. In order to do this, we need to clear
1328 existing information computed by scev and niter analyzer. */
1329 scev_reset_htab ();
1330 free_numbers_of_iterations_estimates (loop);
1331 /* Also set flag for this loop so that following scev and niter
1332 analysis are done under the assumptions. */
1333 loop_constraint_set (loop, LOOP_C_FINITE);
1334 /* Also record the assumptions for versioning. */
1335 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1336 }
1337
1338 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1339 {
1340 if (dump_enabled_p ())
1341 {
1342 dump_printf_loc (MSG_NOTE, vect_location,
1343 "Symbolic number of iterations is ");
1344 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1345 dump_printf (MSG_NOTE, "\n");
1346 }
1347 }
1348
1349 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1350 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1351 if (inner_loop_cond)
1352 {
1353 stmt_vec_info inner_loop_cond_info
1354 = loop_vinfo->lookup_stmt (inner_loop_cond);
1355 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1356 }
1357
1358 gcc_assert (!loop->aux);
1359 loop->aux = loop_vinfo;
1360 return opt_loop_vec_info::success (loop_vinfo);
1361 }
1362
1363
1364
1365 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1366 statements update the vectorization factor. */
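/* Example (illustrative): if loop-based analysis chose VF = 4 but the
   SLP instances need an unrolling factor of 8, force_common_multiple
   below raises the loop's VF to 8; if every statement is covered by
   SLP ("pure SLP"), the SLP unrolling factor is used directly.  */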
1367
1368 static void
1369 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1370 {
1371 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1372 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1373 int nbbs = loop->num_nodes;
1374 poly_uint64 vectorization_factor;
1375 int i;
1376
1377 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1378
1379 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1380 gcc_assert (known_ne (vectorization_factor, 0U));
1381
1382 /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1383 vectorization factor of the loop is the unrolling factor required by
1384 the SLP instances. If that unrolling factor is 1, we say that we
1385 perform pure SLP on the loop - cross-iteration parallelism is not
1386 exploited. */
1387 bool only_slp_in_loop = true;
1388 for (i = 0; i < nbbs; i++)
1389 {
1390 basic_block bb = bbs[i];
1391 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1392 gsi_next (&si))
1393 {
1394 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1395 stmt_info = vect_stmt_to_vectorize (stmt_info);
1396 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1397 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1398 && !PURE_SLP_STMT (stmt_info))
1399 /* STMT needs both SLP and loop-based vectorization. */
1400 only_slp_in_loop = false;
1401 }
1402 }
1403
1404 if (only_slp_in_loop)
1405 {
1406 if (dump_enabled_p ())
1407 dump_printf_loc (MSG_NOTE, vect_location,
1408 "Loop contains only SLP stmts\n");
1409 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1410 }
1411 else
1412 {
1413 if (dump_enabled_p ())
1414 dump_printf_loc (MSG_NOTE, vect_location,
1415 "Loop contains SLP and non-SLP stmts\n");
1416 /* Both the vectorization factor and unroll factor have the form
1417 current_vector_size * X for some rational X, so they must have
1418 a common multiple. */
1419 vectorization_factor
1420 = force_common_multiple (vectorization_factor,
1421 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1422 }
1423
1424 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1425 if (dump_enabled_p ())
1426 {
1427 dump_printf_loc (MSG_NOTE, vect_location,
1428 "Updating vectorization factor to ");
1429 dump_dec (MSG_NOTE, vectorization_factor);
1430 dump_printf (MSG_NOTE, ".\n");
1431 }
1432 }
1433
1434 /* Return true if STMT_INFO describes a double reduction phi and if
1435 the other phi in the reduction is also relevant for vectorization.
1436 This rejects cases such as:
1437
1438 outer1:
1439 x_1 = PHI <x_3(outer2), ...>;
1440 ...
1441
1442 inner:
1443 x_2 = ...;
1444 ...
1445
1446 outer2:
1447 x_3 = PHI <x_2(inner)>;
1448
1449 if nothing in x_2 or elsewhere makes x_1 relevant. */
1450
1451 static bool
1452 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1453 {
1454 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1455 return false;
1456
1457 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1458 }
1459
1460 /* Function vect_analyze_loop_operations.
1461
1462 Scan the loop stmts and make sure they are all vectorizable. */
1463
1464 static opt_result
1465 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1466 {
1467 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1468 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1469 int nbbs = loop->num_nodes;
1470 int i;
1471 stmt_vec_info stmt_info;
1472 bool need_to_vectorize = false;
1473 bool ok;
1474
1475 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1476
1477 auto_vec<stmt_info_for_cost> cost_vec;
1478
1479 for (i = 0; i < nbbs; i++)
1480 {
1481 basic_block bb = bbs[i];
1482
1483 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1484 gsi_next (&si))
1485 {
1486 gphi *phi = si.phi ();
1487 ok = true;
1488
1489 stmt_info = loop_vinfo->lookup_stmt (phi);
1490 if (dump_enabled_p ())
1491 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1492 if (virtual_operand_p (gimple_phi_result (phi)))
1493 continue;
1494
1495 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1496 (i.e., a phi in the tail of the outer-loop). */
1497 if (! is_loop_header_bb_p (bb))
1498 {
1499 /* FORNOW: we currently don't support the case that these phis
1500 are not used in the outerloop (unless it is a double reduction,
1501 i.e., this phi is vect_reduction_def), because this case
1502 would require us to actually do something here. */
1503 if (STMT_VINFO_LIVE_P (stmt_info)
1504 && !vect_active_double_reduction_p (stmt_info))
1505 return opt_result::failure_at (phi,
1506 "Unsupported loop-closed phi"
1507 " in outer-loop.\n");
1508
1509 /* If PHI is used in the outer loop, we check that its operand
1510 is defined in the inner loop. */
1511 if (STMT_VINFO_RELEVANT_P (stmt_info))
1512 {
1513 tree phi_op;
1514
1515 if (gimple_phi_num_args (phi) != 1)
1516 return opt_result::failure_at (phi, "unsupported phi");
1517
1518 phi_op = PHI_ARG_DEF (phi, 0);
1519 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1520 if (!op_def_info)
1521 return opt_result::failure_at (phi, "unsupported phi\n");
1522
1523 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1524 && (STMT_VINFO_RELEVANT (op_def_info)
1525 != vect_used_in_outer_by_reduction))
1526 return opt_result::failure_at (phi, "unsupported phi\n");
1527
1528 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1529 || (STMT_VINFO_DEF_TYPE (stmt_info)
1530 == vect_double_reduction_def))
1531 && !vectorizable_lc_phi (stmt_info, NULL, NULL))
1532 return opt_result::failure_at (phi, "unsupported phi\n");
1533 }
1534
1535 continue;
1536 }
1537
1538 gcc_assert (stmt_info);
1539
1540 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1541 || STMT_VINFO_LIVE_P (stmt_info))
1542 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1543 /* A scalar-dependence cycle that we don't support. */
1544 return opt_result::failure_at (phi,
1545 "not vectorized:"
1546 " scalar dependence cycle.\n");
1547
1548 if (STMT_VINFO_RELEVANT_P (stmt_info))
1549 {
1550 need_to_vectorize = true;
1551 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1552 && ! PURE_SLP_STMT (stmt_info))
1553 ok = vectorizable_induction (stmt_info, NULL, NULL, NULL,
1554 &cost_vec);
1555 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1556 || (STMT_VINFO_DEF_TYPE (stmt_info)
1557 == vect_double_reduction_def)
1558 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1559 && ! PURE_SLP_STMT (stmt_info))
1560 ok = vectorizable_reduction (stmt_info, NULL, NULL, &cost_vec);
1561 }
1562
1563 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1564 if (ok
1565 && STMT_VINFO_LIVE_P (stmt_info)
1566 && !PURE_SLP_STMT (stmt_info))
1567 ok = vectorizable_live_operation (stmt_info, NULL, NULL, NULL,
1568 -1, false, &cost_vec);
1569
1570 if (!ok)
1571 return opt_result::failure_at (phi,
1572 "not vectorized: relevant phi not "
1573 "supported: %G",
1574 static_cast <gimple *> (phi));
1575 }
1576
1577 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1578 gsi_next (&si))
1579 {
1580 gimple *stmt = gsi_stmt (si);
1581 if (!gimple_clobber_p (stmt))
1582 {
1583 opt_result res
1584 = vect_analyze_stmt (loop_vinfo->lookup_stmt (stmt),
1585 &need_to_vectorize,
1586 NULL, NULL, &cost_vec);
1587 if (!res)
1588 return res;
1589 }
1590 }
1591 } /* bbs */
1592
1593 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1594
1595 /* All operations in the loop are either irrelevant (they deal with loop
1596 control, or are dead), or only used outside the loop and can be moved
1597 out of the loop (e.g. invariants, inductions). The loop can be
1598 optimized away by scalar optimizations. We're better off not
1599 touching this loop. */
1600 if (!need_to_vectorize)
1601 {
1602 if (dump_enabled_p ())
1603 dump_printf_loc (MSG_NOTE, vect_location,
1604 "All the computation can be taken out of the loop.\n");
1605 return opt_result::failure_at
1606 (vect_location,
1607 "not vectorized: redundant loop. no profit to vectorize.\n");
1608 }
1609
1610 return opt_result::success ();
1611 }
1612
1613 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1614 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1615 definitely no, or -1 if it's worth retrying. */
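/* Worked example (illustrative): with an assumed VF of 4,
   min_profitable_iters == 12 and --param min-vect-loop-bound=2, the
   threshold below is th = MAX (2 * 4, 12) = 12, so a loop known to
   iterate only 10 times is rejected as not profitable.  */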
1616
1617 static int
1618 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1619 {
1620 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1621 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1622
1623 /* Only fully-masked loops can have iteration counts less than the
1624 vectorization factor. */
1625 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1626 {
1627 HOST_WIDE_INT max_niter;
1628
1629 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1630 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1631 else
1632 max_niter = max_stmt_executions_int (loop);
1633
1634 if (max_niter != -1
1635 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1636 {
1637 if (dump_enabled_p ())
1638 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1639 "not vectorized: iteration count smaller than "
1640 "vectorization factor.\n");
1641 return 0;
1642 }
1643 }
1644
1645 int min_profitable_iters, min_profitable_estimate;
1646 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1647 &min_profitable_estimate);
1648
1649 if (min_profitable_iters < 0)
1650 {
1651 if (dump_enabled_p ())
1652 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1653 "not vectorized: vectorization not profitable.\n");
1654 if (dump_enabled_p ())
1655 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1656 "not vectorized: vector version will never be "
1657 "profitable.\n");
1658 return -1;
1659 }
1660
1661 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1662 * assumed_vf);
1663
1664 /* Use the cost model only if it is more conservative than user specified
1665 threshold. */
1666 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1667 min_profitable_iters);
1668
1669 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1670
1671 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1672 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1673 {
1674 if (dump_enabled_p ())
1675 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1676 "not vectorized: vectorization not profitable.\n");
1677 if (dump_enabled_p ())
1678 dump_printf_loc (MSG_NOTE, vect_location,
1679 "not vectorized: iteration count smaller than user "
1680 "specified loop bound parameter or minimum profitable "
1681 "iterations (whichever is more conservative).\n");
1682 return 0;
1683 }
1684
1685 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1686 if (estimated_niter == -1)
1687 estimated_niter = likely_max_stmt_executions_int (loop);
1688 if (estimated_niter != -1
1689 && ((unsigned HOST_WIDE_INT) estimated_niter
1690 < MAX (th, (unsigned) min_profitable_estimate)))
1691 {
1692 if (dump_enabled_p ())
1693 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1694 "not vectorized: estimated iteration count too "
1695 "small.\n");
1696 if (dump_enabled_p ())
1697 dump_printf_loc (MSG_NOTE, vect_location,
1698 "not vectorized: estimated iteration count smaller "
1699 "than specified loop bound parameter or minimum "
1700 "profitable iterations (whichever is more "
1701 "conservative).\n");
1702 return -1;
1703 }
1704
1705 return 1;
1706 }
1707
1708 static opt_result
1709 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1710 vec<data_reference_p> *datarefs,
1711 unsigned int *n_stmts)
1712 {
1713 *n_stmts = 0;
1714 for (unsigned i = 0; i < loop->num_nodes; i++)
1715 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1716 !gsi_end_p (gsi); gsi_next (&gsi))
1717 {
1718 gimple *stmt = gsi_stmt (gsi);
1719 if (is_gimple_debug (stmt))
1720 continue;
1721 ++(*n_stmts);
1722 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs);
1723 if (!res)
1724 {
1725 if (is_gimple_call (stmt) && loop->safelen)
1726 {
1727 tree fndecl = gimple_call_fndecl (stmt), op;
1728 if (fndecl != NULL_TREE)
1729 {
1730 cgraph_node *node = cgraph_node::get (fndecl);
1731 if (node != NULL && node->simd_clones != NULL)
1732 {
1733 unsigned int j, n = gimple_call_num_args (stmt);
1734 for (j = 0; j < n; j++)
1735 {
1736 op = gimple_call_arg (stmt, j);
1737 if (DECL_P (op)
1738 || (REFERENCE_CLASS_P (op)
1739 && get_base_address (op)))
1740 break;
1741 }
1742 op = gimple_call_lhs (stmt);
1743 /* Ignore #pragma omp declare simd functions
1744 if they don't have data references in the
1745 call stmt itself. */
1746 if (j == n
1747 && !(op
1748 && (DECL_P (op)
1749 || (REFERENCE_CLASS_P (op)
1750 && get_base_address (op)))))
1751 continue;
1752 }
1753 }
1754 }
1755 return res;
1756 }
1757 /* If dependence analysis will give up due to the limit on the
1758 number of datarefs, stop here and fail fatally. */
1759 if (datarefs->length ()
1760 > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
1761 return opt_result::failure_at (stmt, "exceeded param "
1762 "loop-max-datarefs-for-datadeps\n");
1763 }
1764 return opt_result::success ();
1765 }
1766
1767 /* Look for SLP-only access groups and turn each individual access into its own
1768 group. */
1769 static void
1770 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
1771 {
1772 unsigned int i;
1773 struct data_reference *dr;
1774
1775 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
1776
1777 vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs;
1778 FOR_EACH_VEC_ELT (datarefs, i, dr)
1779 {
1780 gcc_assert (DR_REF (dr));
1781 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
1782
1783 /* Check if the load is a part of an interleaving chain. */
1784 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1785 {
1786 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
1787 unsigned int group_size = DR_GROUP_SIZE (first_element);
1788
1789 /* Check if SLP-only groups. */
1790 if (!STMT_SLP_TYPE (stmt_info)
1791 && STMT_VINFO_SLP_VECT_ONLY (first_element))
1792 {
1793 /* Dissolve the group. */
1794 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
1795
1796 stmt_vec_info vinfo = first_element;
1797 while (vinfo)
1798 {
1799 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
1800 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
1801 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
1802 DR_GROUP_SIZE (vinfo) = 1;
1803 DR_GROUP_GAP (vinfo) = group_size - 1;
1804 vinfo = next;
1805 }
1806 }
1807 }
1808 }
1809 }
1810
1811 /* Function vect_analyze_loop_2.
1812
1813 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1814 for it. The different analyses will record information in the
1815 loop_vec_info struct. */
1816 static opt_result
1817 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1818 {
1819 opt_result ok = opt_result::success ();
1820 int res;
1821 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1822 poly_uint64 min_vf = 2;
1823
1824 /* The first group of checks is independent of the vector size. */
1825 fatal = true;
1826
1827 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
1828 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
1829 return opt_result::failure_at (vect_location,
1830 "not vectorized: simd if(0)\n");
1831
1832 /* Find all data references in the loop (which correspond to vdefs/vuses)
1833 and analyze their evolution in the loop. */
1834
1835 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1836
1837 /* Gather the data references and count stmts in the loop. */
1838 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1839 {
1840 opt_result res
1841 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1842 &LOOP_VINFO_DATAREFS (loop_vinfo),
1843 n_stmts);
1844 if (!res)
1845 {
1846 if (dump_enabled_p ())
1847 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1848 "not vectorized: loop contains function "
1849 "calls or data references that cannot "
1850 "be analyzed\n");
1851 return res;
1852 }
1853 loop_vinfo->shared->save_datarefs ();
1854 }
1855 else
1856 loop_vinfo->shared->check_datarefs ();
1857
1858 /* Analyze the data references and also adjust the minimal
1859 vectorization factor according to the loads and stores. */
1860
1861 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
1862 if (!ok)
1863 {
1864 if (dump_enabled_p ())
1865 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1866 "bad data references.\n");
1867 return ok;
1868 }
1869
1870 /* Classify all cross-iteration scalar data-flow cycles.
1871 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1872 vect_analyze_scalar_cycles (loop_vinfo);
1873
1874 vect_pattern_recog (loop_vinfo);
1875
1876 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1877
1878 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1879 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1880
1881 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1882 if (!ok)
1883 {
1884 if (dump_enabled_p ())
1885 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1886 "bad data access.\n");
1887 return ok;
1888 }
1889
1890 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1891
1892 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
1893 if (!ok)
1894 {
1895 if (dump_enabled_p ())
1896 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1897 "unexpected pattern.\n");
1898 return ok;
1899 }
1900
1901 /* The rest of the analysis below depends on the vector size in some way, so a failure from here on is not necessarily fatal. */
1902 fatal = false;
1903
1904 /* Analyze data dependences between the data-refs in the loop
1905 and adjust the maximum vectorization factor according to
1906 the dependences.
1907 FORNOW: fail at the first data dependence that we encounter. */
1908
1909 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1910 if (!ok)
1911 {
1912 if (dump_enabled_p ())
1913 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1914 "bad data dependence.\n");
1915 return ok;
1916 }
1917 if (max_vf != MAX_VECTORIZATION_FACTOR
1918 && maybe_lt (max_vf, min_vf))
1919 return opt_result::failure_at (vect_location, "bad data dependence.\n");
1920 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1921
1922 ok = vect_determine_vectorization_factor (loop_vinfo);
1923 if (!ok)
1924 {
1925 if (dump_enabled_p ())
1926 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1927 "can't determine vectorization factor.\n");
1928 return ok;
1929 }
1930 if (max_vf != MAX_VECTORIZATION_FACTOR
1931 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1932 return opt_result::failure_at (vect_location, "bad data dependence.\n");
1933
1934 /* Compute the scalar iteration cost. */
1935 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1936
1937 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1938 unsigned th;
1939
1940 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1941 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
1942 if (!ok)
1943 return ok;
1944
1945 /* If there are any SLP instances mark them as pure_slp. */
1946 bool slp = vect_make_slp_decision (loop_vinfo);
1947 if (slp)
1948 {
1949 /* Find stmts that need to be both vectorized and SLPed. */
1950 vect_detect_hybrid_slp (loop_vinfo);
1951
1952 /* Update the vectorization factor based on the SLP decision. */
1953 vect_update_vf_for_slp (loop_vinfo);
1954 }
1955
1956 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
1957
1958 /* We don't expect to have to roll back to anything other than an empty
1959 set of rgroups. */
1960 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
1961
1962 /* This is the point where we can re-start analysis with SLP forced off. */
1963 start_over:
1964
1965 /* Now the vectorization factor is final. */
1966 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1967 gcc_assert (known_ne (vectorization_factor, 0U));
1968
1969 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1970 {
1971 dump_printf_loc (MSG_NOTE, vect_location,
1972 "vectorization_factor = ");
1973 dump_dec (MSG_NOTE, vectorization_factor);
1974 dump_printf (MSG_NOTE, ", niters = %wd\n",
1975 LOOP_VINFO_INT_NITERS (loop_vinfo));
1976 }
1977
1978 HOST_WIDE_INT max_niter
1979 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1980
1981 /* Analyze the alignment of the data-refs in the loop.
1982 Fail if a data reference is found that cannot be vectorized. */
1983
1984 ok = vect_analyze_data_refs_alignment (loop_vinfo);
1985 if (!ok)
1986 {
1987 if (dump_enabled_p ())
1988 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1989 "bad data alignment.\n");
1990 return ok;
1991 }
1992
1993 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1994 It is important to call pruning after vect_analyze_data_ref_accesses,
1995 since we use grouping information gathered by interleaving analysis. */
1996 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1997 if (!ok)
1998 return ok;
1999
2000 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2001 vectorization, since we do not want to add extra peeling or
2002 add versioning for alignment. */
2003 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2004 /* This pass will decide on using loop versioning and/or loop peeling in
2005 order to enhance the alignment of data references in the loop. */
2006 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2007 else
2008 ok = vect_verify_datarefs_alignment (loop_vinfo);
2009 if (!ok)
2010 return ok;
2011
2012 if (slp)
2013 {
2014 /* Analyze operations in the SLP instances. Note this may
2015 remove unsupported SLP instances which makes the above
2016 SLP kind detection invalid. */
2017 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2018 vect_slp_analyze_operations (loop_vinfo);
2019 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2020 {
2021 ok = opt_result::failure_at (vect_location,
2022 "unsupported SLP instances\n");
2023 goto again;
2024 }
2025 }
2026
2027 /* Dissolve SLP-only groups. */
2028 vect_dissolve_slp_only_groups (loop_vinfo);
2029
2030 /* Scan all the remaining operations in the loop that are not subject
2031 to SLP and make sure they are vectorizable. */
2032 ok = vect_analyze_loop_operations (loop_vinfo);
2033 if (!ok)
2034 {
2035 if (dump_enabled_p ())
2036 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2037 "bad operation or unsupported loop bound.\n");
2038 return ok;
2039 }
2040
2041 /* Decide whether to use a fully-masked loop for this vectorization
2042 factor. */
2043 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2044 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2045 && vect_verify_full_masking (loop_vinfo));
2046 if (dump_enabled_p ())
2047 {
2048 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2049 dump_printf_loc (MSG_NOTE, vect_location,
2050 "using a fully-masked loop.\n");
2051 else
2052 dump_printf_loc (MSG_NOTE, vect_location,
2053 "not using a fully-masked loop.\n");
2054 }
2055
2056 /* If an epilogue loop is required because of data accesses with gaps,
2057 one additional iteration needs to be peeled. Check if there are
2058 enough iterations for vectorization. */
2059 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2060 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2061 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2062 {
2063 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2064 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2065
2066 if (known_lt (wi::to_widest (scalar_niters), vf))
2067 return opt_result::failure_at (vect_location,
2068 "loop has no enough iterations to"
2069 " support peeling for gaps.\n");
2070 }
2071
2072 /* Check the costings of the loop make vectorizing worthwhile. */
2073 res = vect_analyze_loop_costing (loop_vinfo);
2074 if (res < 0)
2075 {
2076 ok = opt_result::failure_at (vect_location,
2077 "Loop costings may not be worthwhile.\n");
2078 goto again;
2079 }
2080 if (!res)
2081 return opt_result::failure_at (vect_location,
2082 "Loop costings not worthwhile.\n");
2083
2084 /* Decide whether we need to create an epilogue loop to handle
2085 remaining scalar iterations. */
2086 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2087
2088 unsigned HOST_WIDE_INT const_vf;
2089 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2090 /* The main loop handles all iterations. */
2091 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2092 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2093 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
2094 {
2095 /* Work out the (constant) number of iterations that need to be
2096 peeled for reasons other than niters. */
2097 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2098 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2099 peel_niter += 1;
2100 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
2101 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2102 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2103 }
2104 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2105 /* ??? When peeling for gaps but not alignment, we could
2106 try to check whether the (variable) niters is known to be
2107 VF * N + 1. That's something of a niche case though. */
2108 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2109 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2110 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2111 < (unsigned) exact_log2 (const_vf))
2112 /* In case of versioning, check if the maximum number of
2113 iterations is greater than th. If they are identical,
2114 the epilogue is unnecessary. */
2115 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2116 || ((unsigned HOST_WIDE_INT) max_niter
2117 > (th / const_vf) * const_vf))))
2118 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
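/* For instance (illustrative numbers): with a known iteration count of 100,
a constant VF of 8 and 3 iterations peeled for alignment, 100 - 3 = 97 is
not a multiple of 8, so the remaining scalar iterations require an
epilogue loop. */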
2119
2120 /* If an epilogue loop is required make sure we can create one. */
2121 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2122 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2123 {
2124 if (dump_enabled_p ())
2125 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2126 if (!vect_can_advance_ivs_p (loop_vinfo)
2127 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2128 single_exit (LOOP_VINFO_LOOP
2129 (loop_vinfo))))
2130 {
2131 ok = opt_result::failure_at (vect_location,
2132 "not vectorized: can't create required "
2133 "epilog loop\n");
2134 goto again;
2135 }
2136 }
2137
2138 /* During peeling, we need to check if the number of loop iterations is
2139 enough for both the peeled prologue loop and the vector loop. This
2140 check can be merged with the threshold check of loop versioning, so
2141 increase the threshold for this case if necessary. */
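/* As an illustration (numbers are hypothetical): with 2 iterations peeled
for alignment, a VF of 4, peeling for gaps and no full masking, the
versioning threshold computed below is 2 + 4 + 1 = 7 iterations. */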
2142 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2143 {
2144 poly_uint64 niters_th = 0;
2145
2146 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2147 {
2148 /* Niters for peeled prolog loop. */
2149 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2150 {
2151 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2152 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2153 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2154 }
2155 else
2156 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2157 }
2158
2159 /* Niters for at least one iteration of vectorized loop. */
2160 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2161 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2162 /* One additional iteration because of peeling for gap. */
2163 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2164 niters_th += 1;
2165 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2166 }
2167
2168 gcc_assert (known_eq (vectorization_factor,
2169 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2170
2171 /* Ok to vectorize! */
2172 return opt_result::success ();
2173
2174 again:
2175 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2176 gcc_assert (!ok);
2177
2178 /* Try again with SLP forced off but if we didn't do any SLP there is
2179 no point in re-trying. */
2180 if (!slp)
2181 return ok;
2182
2183 /* If there are reduction chains re-trying will fail anyway. */
2184 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2185 return ok;
2186
2187 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2188 via interleaving or lane instructions. */
2189 slp_instance instance;
2190 slp_tree node;
2191 unsigned i, j;
2192 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2193 {
2194 stmt_vec_info vinfo;
2195 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2196 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2197 continue;
2198 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2199 unsigned int size = DR_GROUP_SIZE (vinfo);
2200 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2201 if (! vect_store_lanes_supported (vectype, size, false)
2202 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2203 && ! vect_grouped_store_supported (vectype, size))
2204 return opt_result::failure_at (vinfo->stmt,
2205 "unsupported grouped store\n");
2206 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2207 {
2208 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2209 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2210 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2211 size = DR_GROUP_SIZE (vinfo);
2212 vectype = STMT_VINFO_VECTYPE (vinfo);
2213 if (! vect_load_lanes_supported (vectype, size, false)
2214 && ! vect_grouped_load_supported (vectype, single_element_p,
2215 size))
2216 return opt_result::failure_at (vinfo->stmt,
2217 "unsupported grouped load\n");
2218 }
2219 }
2220
2221 if (dump_enabled_p ())
2222 dump_printf_loc (MSG_NOTE, vect_location,
2223 "re-trying with SLP disabled\n");
2224
2225 /* Roll back state appropriately. No SLP this time. */
2226 slp = false;
2227 /* Restore vectorization factor as it was without SLP. */
2228 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2229 /* Free the SLP instances. */
2230 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2231 vect_free_slp_instance (instance, false);
2232 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2233 /* Reset SLP type to loop_vect on all stmts. */
2234 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2235 {
2236 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2237 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2238 !gsi_end_p (si); gsi_next (&si))
2239 {
2240 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2241 STMT_SLP_TYPE (stmt_info) = loop_vect;
2242 }
2243 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2244 !gsi_end_p (si); gsi_next (&si))
2245 {
2246 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2247 STMT_SLP_TYPE (stmt_info) = loop_vect;
2248 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2249 {
2250 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2251 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2252 STMT_SLP_TYPE (stmt_info) = loop_vect;
2253 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2254 !gsi_end_p (pi); gsi_next (&pi))
2255 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2256 = loop_vect;
2257 }
2258 }
2259 }
2260 /* Free optimized alias test DDRS. */
2261 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2262 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2263 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2264 /* Reset target cost data. */
2265 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2266 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2267 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2268 /* Reset accumulated rgroup information. */
2269 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2270 /* Reset assorted flags. */
2271 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2272 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2273 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2274 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2275 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2276
2277 goto start_over;
2278 }
2279
2280 /* Function vect_analyze_loop.
2281
2282 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2283 for it. The different analyses will record information in the
2284 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL, LOOP is the
2285 epilogue of the loop it describes and must itself be vectorized. */
2286 opt_loop_vec_info
2287 vect_analyze_loop (class loop *loop, loop_vec_info orig_loop_vinfo,
2288 vec_info_shared *shared)
2289 {
2290 auto_vector_sizes vector_sizes;
2291
2292 /* Autodetect first vector size we try. */
2293 current_vector_size = 0;
2294 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes,
2295 loop->simdlen != 0);
2296 unsigned int next_size = 0;
2297
2298 DUMP_VECT_SCOPE ("analyze_loop_nest");
2299
2300 if (loop_outer (loop)
2301 && loop_vec_info_for_loop (loop_outer (loop))
2302 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2303 return opt_loop_vec_info::failure_at (vect_location,
2304 "outer-loop already vectorized.\n");
2305
2306 if (!find_loop_nest (loop, &shared->loop_nest))
2307 return opt_loop_vec_info::failure_at
2308 (vect_location,
2309 "not vectorized: loop nest containing two or more consecutive inner"
2310 " loops cannot be vectorized\n");
2311
2312 unsigned n_stmts = 0;
2313 poly_uint64 autodetected_vector_size = 0;
2314 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2315 poly_uint64 first_vector_size = 0;
2316 while (1)
2317 {
2318 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2319 opt_loop_vec_info loop_vinfo
2320 = vect_analyze_loop_form (loop, shared);
2321 if (!loop_vinfo)
2322 {
2323 if (dump_enabled_p ())
2324 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2325 "bad loop form.\n");
2326 gcc_checking_assert (first_loop_vinfo == NULL);
2327 return loop_vinfo;
2328 }
2329
2330 bool fatal = false;
2331
2332 if (orig_loop_vinfo)
2333 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2334
2335 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2336 if (res)
2337 {
2338 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2339
2340 if (loop->simdlen
2341 && maybe_ne (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2342 (unsigned HOST_WIDE_INT) loop->simdlen))
2343 {
2344 if (first_loop_vinfo == NULL)
2345 {
2346 first_loop_vinfo = loop_vinfo;
2347 first_vector_size = current_vector_size;
2348 loop->aux = NULL;
2349 }
2350 else
2351 delete loop_vinfo;
2352 }
2353 else
2354 {
2355 delete first_loop_vinfo;
2356 return loop_vinfo;
2357 }
2358 }
2359 else
2360 delete loop_vinfo;
2361
2362 if (next_size == 0)
2363 autodetected_vector_size = current_vector_size;
2364
2365 if (next_size < vector_sizes.length ()
2366 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2367 next_size += 1;
2368
2369 if (fatal)
2370 {
2371 gcc_checking_assert (first_loop_vinfo == NULL);
2372 return opt_loop_vec_info::propagate_failure (res);
2373 }
2374
2375 if (next_size == vector_sizes.length ()
2376 || known_eq (current_vector_size, 0U))
2377 {
2378 if (first_loop_vinfo)
2379 {
2380 current_vector_size = first_vector_size;
2381 loop->aux = (loop_vec_info) first_loop_vinfo;
2382 if (dump_enabled_p ())
2383 {
2384 dump_printf_loc (MSG_NOTE, vect_location,
2385 "***** Choosing vector size ");
2386 dump_dec (MSG_NOTE, current_vector_size);
2387 dump_printf (MSG_NOTE, "\n");
2388 }
2389 return first_loop_vinfo;
2390 }
2391 else
2392 return opt_loop_vec_info::propagate_failure (res);
2393 }
2394
2395 /* Try the next biggest vector size. */
2396 current_vector_size = vector_sizes[next_size++];
2397 if (dump_enabled_p ())
2398 {
2399 dump_printf_loc (MSG_NOTE, vect_location,
2400 "***** Re-trying analysis with "
2401 "vector size ");
2402 dump_dec (MSG_NOTE, current_vector_size);
2403 dump_printf (MSG_NOTE, "\n");
2404 }
2405 }
2406 }
2407
2408 /* Return true if there is an in-order reduction function for CODE, storing
2409 it in *REDUC_FN if so. */
2410
2411 static bool
2412 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2413 {
2414 switch (code)
2415 {
2416 case PLUS_EXPR:
2417 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2418 return true;
2419
2420 default:
2421 return false;
2422 }
2423 }
2424
2425 /* Function reduction_fn_for_scalar_code
2426
2427 Input:
2428 CODE - tree_code of a reduction operations.
2429
2430 Output:
2431 REDUC_FN - the corresponding internal function to be used to reduce the
2432 vector of partial results into a single scalar result, or IFN_LAST
2433 if the operation is a supported reduction operation, but does not have
2434 such an internal function.
2435
2436 Return FALSE if CODE currently cannot be vectorized as reduction. */
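/* For example, for CODE == PLUS_EXPR the returned IFN_REDUC_PLUS sums all
elements of the vector of partial results, e.g. {1, 2, 3, 4} -> 10, while
for MULT_EXPR and MINUS_EXPR the function returns true with IFN_LAST,
meaning the final reduction step has to be open-coded rather than emitted
as a single internal function call. */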
2437
2438 static bool
2439 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2440 {
2441 switch (code)
2442 {
2443 case MAX_EXPR:
2444 *reduc_fn = IFN_REDUC_MAX;
2445 return true;
2446
2447 case MIN_EXPR:
2448 *reduc_fn = IFN_REDUC_MIN;
2449 return true;
2450
2451 case PLUS_EXPR:
2452 *reduc_fn = IFN_REDUC_PLUS;
2453 return true;
2454
2455 case BIT_AND_EXPR:
2456 *reduc_fn = IFN_REDUC_AND;
2457 return true;
2458
2459 case BIT_IOR_EXPR:
2460 *reduc_fn = IFN_REDUC_IOR;
2461 return true;
2462
2463 case BIT_XOR_EXPR:
2464 *reduc_fn = IFN_REDUC_XOR;
2465 return true;
2466
2467 case MULT_EXPR:
2468 case MINUS_EXPR:
2469 *reduc_fn = IFN_LAST;
2470 return true;
2471
2472 default:
2473 return false;
2474 }
2475 }
2476
2477 /* If there is a neutral value X such that SLP reduction SLP_NODE would not
2478 be affected by the introduction of additional X elements, return that X,
2479 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2480 is true if the SLP statements perform a single reduction, false if each
2481 statement performs an independent reduction. */
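/* For instance, 0 is neutral for a PLUS_EXPR reduction: padding an SLP
group of three accumulators up to a four-lane vector with an extra 0 does
not change any of the sums. Likewise 1 is neutral for MULT_EXPR and an
all-ones value for BIT_AND_EXPR. */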
2482
2483 static tree
2484 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2485 bool reduc_chain)
2486 {
2487 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2488 stmt_vec_info stmt_vinfo = stmts[0];
2489 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2490 tree scalar_type = TREE_TYPE (vector_type);
2491 class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2492 gcc_assert (loop);
2493
2494 switch (code)
2495 {
2496 case WIDEN_SUM_EXPR:
2497 case DOT_PROD_EXPR:
2498 case SAD_EXPR:
2499 case PLUS_EXPR:
2500 case MINUS_EXPR:
2501 case BIT_IOR_EXPR:
2502 case BIT_XOR_EXPR:
2503 return build_zero_cst (scalar_type);
2504
2505 case MULT_EXPR:
2506 return build_one_cst (scalar_type);
2507
2508 case BIT_AND_EXPR:
2509 return build_all_ones_cst (scalar_type);
2510
2511 case MAX_EXPR:
2512 case MIN_EXPR:
2513 /* For MIN/MAX the initial values are neutral. A reduction chain
2514 has only a single initial value, so that value is neutral for
2515 all statements. */
2516 if (reduc_chain)
2517 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2518 loop_preheader_edge (loop));
2519 return NULL_TREE;
2520
2521 default:
2522 return NULL_TREE;
2523 }
2524 }
2525
2526 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2527 STMT is printed with a message MSG. */
2528
2529 static void
2530 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2531 {
2532 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
2533 }
2534
2535 /* Return true if we need an in-order (fold-left) reduction for
2536 operation CODE on type TYPE, i.e. if the reduction must preserve
2537 the original evaluation order. */
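/* For example, a float accumulation such as

float s = 0.0f;
for (i = 0; i < n; i++)
s += x[i];

needs an in-order (fold-left) reduction unless -fassociative-math permits
reassociating the additions, because floating-point addition is not
associative. */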
2538
2539 bool
2540 needs_fold_left_reduction_p (tree type, tree_code code)
2541 {
2542 /* CHECKME: check for !flag_finite_math_only too? */
2543 if (SCALAR_FLOAT_TYPE_P (type))
2544 switch (code)
2545 {
2546 case MIN_EXPR:
2547 case MAX_EXPR:
2548 return false;
2549
2550 default:
2551 return !flag_associative_math;
2552 }
2553
2554 if (INTEGRAL_TYPE_P (type))
2555 {
2556 if (!operation_no_trapping_overflow (type, code))
2557 return true;
2558 return false;
2559 }
2560
2561 if (SAT_FIXED_POINT_TYPE_P (type))
2562 return true;
2563
2564 return false;
2565 }
2566
2567 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2568 reduction operation CODE has a handled computation expression. */
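/* As a sketch of the accepted shape, for a PLUS_EXPR reduction split over
two statements

sum_1 = PHI <sum_0, sum_3>
sum_2 = sum_1 + a;
sum_3 = sum_2 + b;

the walk from the latch value sum_3 back to the PHI result sum_1 records
the chain in PATH and accepts it, provided each intermediate value has a
single use and every statement on the path uses CODE. */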
2569
2570 static bool
2571 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2572 tree loop_arg, enum tree_code code,
2573 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
2574 {
2575 auto_bitmap visited;
2576 tree lookfor = PHI_RESULT (phi);
2577 ssa_op_iter curri;
2578 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2579 while (USE_FROM_PTR (curr) != loop_arg)
2580 curr = op_iter_next_use (&curri);
2581 curri.i = curri.numops;
2582 do
2583 {
2584 path.safe_push (std::make_pair (curri, curr));
2585 tree use = USE_FROM_PTR (curr);
2586 if (use == lookfor)
2587 break;
2588 gimple *def = SSA_NAME_DEF_STMT (use);
2589 if (gimple_nop_p (def)
2590 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2591 {
2592 pop:
2593 do
2594 {
2595 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2596 curri = x.first;
2597 curr = x.second;
2598 do
2599 curr = op_iter_next_use (&curri);
2600 /* Skip already visited or non-SSA operands (from iterating
2601 over PHI args). */
2602 while (curr != NULL_USE_OPERAND_P
2603 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2604 || ! bitmap_set_bit (visited,
2605 SSA_NAME_VERSION
2606 (USE_FROM_PTR (curr)))));
2607 }
2608 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2609 if (curr == NULL_USE_OPERAND_P)
2610 break;
2611 }
2612 else
2613 {
2614 if (gimple_code (def) == GIMPLE_PHI)
2615 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2616 else
2617 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2618 while (curr != NULL_USE_OPERAND_P
2619 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2620 || ! bitmap_set_bit (visited,
2621 SSA_NAME_VERSION
2622 (USE_FROM_PTR (curr)))))
2623 curr = op_iter_next_use (&curri);
2624 if (curr == NULL_USE_OPERAND_P)
2625 goto pop;
2626 }
2627 }
2628 while (1);
2629 if (dump_file && (dump_flags & TDF_DETAILS))
2630 {
2631 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2632 unsigned i;
2633 std::pair<ssa_op_iter, use_operand_p> *x;
2634 FOR_EACH_VEC_ELT (path, i, x)
2635 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
2636 dump_printf (MSG_NOTE, "\n");
2637 }
2638
2639 /* Check whether the reduction path detected is valid. */
2640 bool fail = path.length () == 0;
2641 bool neg = false;
2642 for (unsigned i = 1; i < path.length (); ++i)
2643 {
2644 gimple *use_stmt = USE_STMT (path[i].second);
2645 tree op = USE_FROM_PTR (path[i].second);
2646 if (! has_single_use (op)
2647 || ! is_gimple_assign (use_stmt)
2648 /* The following make sure we can compute the operand index
2649 easily plus it mostly disallows chaining via COND_EXPR condition
2650 operands. */
2651 || (gimple_assign_rhs1 (use_stmt) != op
2652 && gimple_assign_rhs2 (use_stmt) != op
2653 && gimple_assign_rhs3 (use_stmt) != op))
2654 {
2655 fail = true;
2656 break;
2657 }
2658 if (gimple_assign_rhs_code (use_stmt) != code)
2659 {
2660 if (code == PLUS_EXPR
2661 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2662 {
2663 /* Track whether we negate the reduction value each iteration. */
2664 if (gimple_assign_rhs2 (use_stmt) == op)
2665 neg = ! neg;
2666 }
2667 else
2668 {
2669 fail = true;
2670 break;
2671 }
2672 }
2673 }
2674 return ! fail && ! neg;
2675 }
2676
2677 bool
2678 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2679 tree loop_arg, enum tree_code code)
2680 {
2681 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2682 return check_reduction_path (loc, loop, phi, loop_arg, code, path);
2683 }
2684
2685
2686
2687 /* Function vect_is_simple_reduction
2688
2689 (1) Detect a cross-iteration def-use cycle that represents a simple
2690 reduction computation. We look for the following pattern:
2691
2692 loop_header:
2693 a1 = phi < a0, a2 >
2694 a3 = ...
2695 a2 = operation (a3, a1)
2696
2697 or
2698
2699 a3 = ...
2700 loop_header:
2701 a1 = phi < a0, a2 >
2702 a2 = operation (a3, a1)
2703
2704 such that:
2705 1. operation is commutative and associative and it is safe to
2706 change the order of the computation
2707 2. no uses for a2 in the loop (a2 is used out of the loop)
2708 3. no uses of a1 in the loop besides the reduction operation
2709 4. no uses of a1 outside the loop.
2710
2711 Conditions 1,4 are tested here.
2712 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2713
2714 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2715 nested cycles.
2716
2717 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2718 reductions:
2719
2720 a1 = phi < a0, a2 >
2721 inner loop (def of a3)
2722 a2 = phi < a3 >
2723
2724 (4) Detect condition expressions, i.e.:
2725 for (int i = 0; i < N; i++)
2726 if (a[i] < val)
2727 ret_val = a[i];
2728
2729 */
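/* For instance, pattern (1) above corresponds to a source loop like

for (i = 0; i < N; i++)
sum += a[i];

where a1 is the loop-header PHI of the accumulator, a3 is the loaded value
a[i] and a2 is the updated accumulator a3 + a1. */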
2730
2731 static stmt_vec_info
2732 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
2733 bool *double_reduc)
2734 {
2735 gphi *phi = as_a <gphi *> (phi_info->stmt);
2736 gimple *phi_use_stmt = NULL;
2737 imm_use_iterator imm_iter;
2738 use_operand_p use_p;
2739
2740 *double_reduc = false;
2741 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
2742
2743 tree phi_name = PHI_RESULT (phi);
2744 /* ??? If there are no uses of the PHI result the inner loop reduction
2745 won't be detected as possibly double-reduction by vectorizable_reduction
2746 because that tries to walk the PHI arg from the preheader edge which
2747 can be constant. See PR60382. */
2748 if (has_zero_uses (phi_name))
2749 return NULL;
2750 class loop *loop = (gimple_bb (phi))->loop_father;
2751 unsigned nphi_def_loop_uses = 0;
2752 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2753 {
2754 gimple *use_stmt = USE_STMT (use_p);
2755 if (is_gimple_debug (use_stmt))
2756 continue;
2757
2758 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2759 {
2760 if (dump_enabled_p ())
2761 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2762 "intermediate value used outside loop.\n");
2763
2764 return NULL;
2765 }
2766
2767 nphi_def_loop_uses++;
2768 phi_use_stmt = use_stmt;
2769 }
2770
2771 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
2772 if (TREE_CODE (latch_def) != SSA_NAME)
2773 {
2774 if (dump_enabled_p ())
2775 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2776 "reduction: not ssa_name: %T\n", latch_def);
2777 return NULL;
2778 }
2779
2780 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
2781 if (!def_stmt_info
2782 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
2783 return NULL;
2784
2785 bool nested_in_vect_loop
2786 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
2787 unsigned nlatch_def_loop_uses = 0;
2788 auto_vec<gphi *, 3> lcphis;
2789 bool inner_loop_of_double_reduc = false;
2790 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
2791 {
2792 gimple *use_stmt = USE_STMT (use_p);
2793 if (is_gimple_debug (use_stmt))
2794 continue;
2795 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2796 nlatch_def_loop_uses++;
2797 else
2798 {
2799 /* We can have more than one loop-closed PHI. */
2800 lcphis.safe_push (as_a <gphi *> (use_stmt));
2801 if (nested_in_vect_loop
2802 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
2803 == vect_double_reduction_def))
2804 inner_loop_of_double_reduc = true;
2805 }
2806 }
2807
2808 /* If we are vectorizing an inner reduction, it is executed in the
2809 original order only when we are not dealing with a double
2810 reduction. */
2811 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
2812 {
2813 if (dump_enabled_p ())
2814 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
2815 "detected nested cycle: ");
2816 return def_stmt_info;
2817 }
2818
2819 /* If this isn't a nested cycle or if the nested cycle reduction value
2820 is used outside of the inner loop, we cannot handle uses of the reduction
2821 value. */
2822 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
2823 {
2824 if (dump_enabled_p ())
2825 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2826 "reduction used in loop.\n");
2827 return NULL;
2828 }
2829
2830 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2831 defined in the inner loop. */
2832 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
2833 {
2834 tree op1 = PHI_ARG_DEF (def_stmt, 0);
2835 if (gimple_phi_num_args (def_stmt) != 1
2836 || TREE_CODE (op1) != SSA_NAME)
2837 {
2838 if (dump_enabled_p ())
2839 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2840 "unsupported phi node definition.\n");
2841
2842 return NULL;
2843 }
2844
2845 gimple *def1 = SSA_NAME_DEF_STMT (op1);
2846 if (gimple_bb (def1)
2847 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2848 && loop->inner
2849 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2850 && is_gimple_assign (def1)
2851 && is_a <gphi *> (phi_use_stmt)
2852 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2853 {
2854 if (dump_enabled_p ())
2855 report_vect_op (MSG_NOTE, def_stmt,
2856 "detected double reduction: ");
2857
2858 *double_reduc = true;
2859 return def_stmt_info;
2860 }
2861
2862 return NULL;
2863 }
2864
2865 gassign *def_stmt = dyn_cast <gassign *> (def_stmt_info->stmt);
2866 if (!def_stmt)
2867 {
2868 if (dump_enabled_p ())
2869 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2870 "reduction: unhandled reduction operation: %G",
2871 def_stmt_info->stmt);
2872 return NULL;
2873 }
2874 enum tree_code code = gimple_assign_rhs_code (def_stmt);
2875
2876 /* We can handle "res -= x[i]", which is non-associative, by
2877 simply rewriting it into "res += -x[i]". Avoid changing the
2878 gimple instruction for the first simple tests and only do this
2879 if we're allowed to change code at all. */
2880 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
2881 code = PLUS_EXPR;
2882
2883 tree op1, op2;
2884 if (code == COND_EXPR)
2885 {
2886 if (! nested_in_vect_loop)
2887 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
2888 op1 = gimple_assign_rhs2 (def_stmt);
2889 op2 = gimple_assign_rhs3 (def_stmt);
2890 }
2891 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
2892 {
2893 op1 = gimple_assign_rhs1 (def_stmt);
2894 op2 = gimple_assign_rhs2 (def_stmt);
2895 }
2896 else
2897 {
2898 if (dump_enabled_p ())
2899 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2900 "reduction: not handled operation: ");
2901 return NULL;
2902 }
2903
2904 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2905 {
2906 if (dump_enabled_p ())
2907 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2908 "reduction: both uses not ssa_names: ");
2909
2910 return NULL;
2911 }
2912
2913 /* Reduction is safe. We're dealing with one of the following:
2914 1) integer arithmetic and no trapv
2915 2) floating point arithmetic, and special flags permit this optimization
2916 3) nested cycle (i.e., outer loop vectorization). */
2917
2918 /* Check for the simple case that one def is the reduction def,
2919 defined by the PHI node. */
2920 stmt_vec_info def1_info = loop_info->lookup_def (op1);
2921 stmt_vec_info def2_info = loop_info->lookup_def (op2);
2922 if (def2_info && def2_info->stmt == phi)
2923 {
2924 STMT_VINFO_REDUC_IDX (def_stmt_info) = 1 + (code == COND_EXPR ? 1 : 0);
2925 if (dump_enabled_p ())
2926 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2927 return def_stmt_info;
2928 }
2929 else if (def1_info && def1_info->stmt == phi)
2930 {
2931 STMT_VINFO_REDUC_IDX (def_stmt_info) = 0 + (code == COND_EXPR ? 1 : 0);
2932 if (dump_enabled_p ())
2933 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2934 return def_stmt_info;
2935 }
2936
2937 /* Look for the expression computing latch_def from the loop PHI result
2938 in a way involving more than one stmt. */
2939 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2940 if (check_reduction_path (vect_location, loop, phi, latch_def, code,
2941 path))
2942 {
2943 /* Try building an SLP reduction chain for which the additional
2944 restriction is that all operations in the chain are the same. */
2945 auto_vec<stmt_vec_info, 8> reduc_chain;
2946 unsigned i;
2947 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
2948 for (i = path.length () - 1; i >= 1; --i)
2949 {
2950 gimple *stmt = USE_STMT (path[i].second);
2951 if (gimple_assign_rhs_code (stmt) != code)
2952 is_slp_reduc = false;
2953 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
2954 STMT_VINFO_REDUC_IDX (stmt_info)
2955 = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
2956 reduc_chain.safe_push (stmt_info);
2957 }
2958 if (is_slp_reduc)
2959 {
2960 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
2961 {
2962 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
2963 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
2964 }
2965 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
2966 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
2967
2968 /* Save the chain for further analysis in SLP detection. */
2969 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
2970 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
2971
2972 if (dump_enabled_p ())
2973 report_vect_op (MSG_NOTE, def_stmt,
2974 "reduction: detected reduction chain: ");
2975 }
2976
2977 return def_stmt_info;
2978 }
2979
2980 if (dump_enabled_p ())
2981 {
2982 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2983 "reduction: unknown pattern: ");
2984 }
2985
2986 return NULL;
2987 }
2988
2989 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
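/* For instance (illustrative numbers): with a known iteration count of 100,
an assumed VF of 8 and PEEL_ITERS_PROLOGUE == 3, the epilogue is assigned
(100 - 3) % 8 == 1 iteration, and the per-iteration scalar costs in
SCALAR_COST_VEC are accumulated three times into PROLOGUE_COST_VEC and
once into EPILOGUE_COST_VEC. */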
2990 int
2991 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2992 int *peel_iters_epilogue,
2993 stmt_vector_for_cost *scalar_cost_vec,
2994 stmt_vector_for_cost *prologue_cost_vec,
2995 stmt_vector_for_cost *epilogue_cost_vec)
2996 {
2997 int retval = 0;
2998 int assumed_vf = vect_vf_for_cost (loop_vinfo);
2999
3000 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3001 {
3002 *peel_iters_epilogue = assumed_vf / 2;
3003 if (dump_enabled_p ())
3004 dump_printf_loc (MSG_NOTE, vect_location,
3005 "cost model: epilogue peel iters set to vf/2 "
3006 "because loop iterations are unknown .\n");
3007
3008 /* If peeled iterations are known but the number of scalar loop
3009 iterations is unknown, count a taken branch per peeled loop. */
3010 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3011 NULL, 0, vect_prologue);
3012 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3013 NULL, 0, vect_epilogue);
3014 }
3015 else
3016 {
3017 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3018 peel_iters_prologue = niters < peel_iters_prologue ?
3019 niters : peel_iters_prologue;
3020 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3021 /* If we need to peel for gaps but no epilogue peeling would otherwise
3022 be required, we have to peel VF iterations. */
3023 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3024 *peel_iters_epilogue = assumed_vf;
3025 }
3026
3027 stmt_info_for_cost *si;
3028 int j;
3029 if (peel_iters_prologue)
3030 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3031 retval += record_stmt_cost (prologue_cost_vec,
3032 si->count * peel_iters_prologue,
3033 si->kind, si->stmt_info, si->misalign,
3034 vect_prologue);
3035 if (*peel_iters_epilogue)
3036 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3037 retval += record_stmt_cost (epilogue_cost_vec,
3038 si->count * *peel_iters_epilogue,
3039 si->kind, si->stmt_info, si->misalign,
3040 vect_epilogue);
3041
3042 return retval;
3043 }
3044
3045 /* Function vect_estimate_min_profitable_iters
3046
3047 Return the number of iterations required for the vector version of the
3048 loop to be profitable relative to the cost of the scalar version of the
3049 loop.
3050
3051 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3052 of iterations for vectorization. A value of -1 means loop
3053 vectorization is not profitable. This returned value may be used
3054 for a dynamic profitability check.
3055
3056 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3057 for static check against estimated number of iterations. */
3058
3059 static void
3060 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3061 int *ret_min_profitable_niters,
3062 int *ret_min_profitable_estimate)
3063 {
3064 int min_profitable_iters;
3065 int min_profitable_estimate;
3066 int peel_iters_prologue;
3067 int peel_iters_epilogue;
3068 unsigned vec_inside_cost = 0;
3069 int vec_outside_cost = 0;
3070 unsigned vec_prologue_cost = 0;
3071 unsigned vec_epilogue_cost = 0;
3072 int scalar_single_iter_cost = 0;
3073 int scalar_outside_cost = 0;
3074 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3075 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3076 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3077
3078 /* Cost model disabled. */
3079 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3080 {
3081 if (dump_enabled_p ())
3082 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3083 *ret_min_profitable_niters = 0;
3084 *ret_min_profitable_estimate = 0;
3085 return;
3086 }
3087
3088 /* Requires loop versioning tests to handle misalignment. */
3089 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3090 {
3091 /* FIXME: Make cost depend on complexity of individual check. */
3092 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3093 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3094 vect_prologue);
3095 if (dump_enabled_p ())
3096 dump_printf (MSG_NOTE,
3097 "cost model: Adding cost of checks for loop "
3098 "versioning to treat misalignment.\n");
3099 }
3100
3101 /* Requires loop versioning with alias checks. */
3102 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3103 {
3104 /* FIXME: Make cost depend on complexity of individual check. */
3105 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3106 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3107 vect_prologue);
3108 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3109 if (len)
3110 /* Count LEN - 1 ANDs and LEN comparisons. */
3111 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3112 NULL, 0, vect_prologue);
3113 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3114 if (len)
3115 {
3116 /* Count LEN - 1 ANDs and LEN comparisons. */
3117 unsigned int nstmts = len * 2 - 1;
3118 /* +1 for each bias that needs adding. */
3119 for (unsigned int i = 0; i < len; ++i)
3120 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3121 nstmts += 1;
3122 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3123 NULL, 0, vect_prologue);
3124 }
3125 if (dump_enabled_p ())
3126 dump_printf (MSG_NOTE,
3127 "cost model: Adding cost of checks for loop "
3128 "versioning aliasing.\n");
3129 }
3130
3131 /* Requires loop versioning with niter checks. */
3132 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3133 {
3134 /* FIXME: Make cost depend on complexity of individual check. */
3135 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3136 vect_prologue);
3137 if (dump_enabled_p ())
3138 dump_printf (MSG_NOTE,
3139 "cost model: Adding cost of checks for loop "
3140 "versioning niters.\n");
3141 }
3142
3143 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3144 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3145 vect_prologue);
3146
3147 /* Count statements in scalar loop. Using this as scalar cost for a single
3148 iteration for now.
3149
3150 TODO: Add outer loop support.
3151
3152 TODO: Consider assigning different costs to different scalar
3153 statements. */
3154
3155 scalar_single_iter_cost
3156 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3157
3158 /* Add additional cost for the peeled instructions in prologue and epilogue
3159 loop. (For fully-masked loops there will be no peeling.)
3160
3161 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3162 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3163
3164 TODO: Build an expression that represents peel_iters for prologue and
3165 epilogue to be used in a run-time test. */
3166
3167 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3168 {
3169 peel_iters_prologue = 0;
3170 peel_iters_epilogue = 0;
3171
3172 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3173 {
3174 /* We need to peel exactly one iteration. */
3175 peel_iters_epilogue += 1;
3176 stmt_info_for_cost *si;
3177 int j;
3178 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3179 j, si)
3180 (void) add_stmt_cost (target_cost_data, si->count,
3181 si->kind, si->stmt_info, si->misalign,
3182 vect_epilogue);
3183 }
3184 }
3185 else if (npeel < 0)
3186 {
3187 peel_iters_prologue = assumed_vf / 2;
3188 if (dump_enabled_p ())
3189 dump_printf (MSG_NOTE, "cost model: "
3190 "prologue peel iters set to vf/2.\n");
3191
3192 /* If peeling for alignment is unknown, loop bound of main loop becomes
3193 unknown. */
3194 peel_iters_epilogue = assumed_vf / 2;
3195 if (dump_enabled_p ())
3196 dump_printf (MSG_NOTE, "cost model: "
3197 "epilogue peel iters set to vf/2 because "
3198 "peeling for alignment is unknown.\n");
3199
3200 /* If peeled iterations are unknown, count a taken branch and a not taken
3201 branch per peeled loop. Even if scalar loop iterations are known,
3202 vector iterations are not known since peeled prologue iterations are
3203 not known. Hence guards remain the same. */
3204 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3205 NULL, 0, vect_prologue);
3206 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3207 NULL, 0, vect_prologue);
3208 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3209 NULL, 0, vect_epilogue);
3210 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3211 NULL, 0, vect_epilogue);
3212 stmt_info_for_cost *si;
3213 int j;
3214 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3215 {
3216 (void) add_stmt_cost (target_cost_data,
3217 si->count * peel_iters_prologue,
3218 si->kind, si->stmt_info, si->misalign,
3219 vect_prologue);
3220 (void) add_stmt_cost (target_cost_data,
3221 si->count * peel_iters_epilogue,
3222 si->kind, si->stmt_info, si->misalign,
3223 vect_epilogue);
3224 }
3225 }
3226 else
3227 {
3228 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3229 stmt_info_for_cost *si;
3230 int j;
3231 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3232
3233 prologue_cost_vec.create (2);
3234 epilogue_cost_vec.create (2);
3235 peel_iters_prologue = npeel;
3236
3237 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3238 &peel_iters_epilogue,
3239 &LOOP_VINFO_SCALAR_ITERATION_COST
3240 (loop_vinfo),
3241 &prologue_cost_vec,
3242 &epilogue_cost_vec);
3243
3244 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3245 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3246 si->misalign, vect_prologue);
3247
3248 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3249 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3250 si->misalign, vect_epilogue);
3251
3252 prologue_cost_vec.release ();
3253 epilogue_cost_vec.release ();
3254 }
3255
3256 /* FORNOW: The scalar outside cost is incremented in one of the
3257 following ways:
3258
3259 1. The vectorizer checks for alignment and aliasing and generates
3260 a condition that allows dynamic vectorization. A cost model
3261 check is ANDED with the versioning condition. Hence scalar code
3262 path now has the added cost of the versioning check.
3263
3264 if (cost > th & versioning_check)
3265 jmp to vector code
3266
3267 Hence run-time scalar is incremented by not-taken branch cost.
3268
3269 2. The vectorizer then checks if a prologue is required. If the
3270 cost model check was not done before during versioning, it has to
3271 be done before the prologue check.
3272
3273 if (cost <= th)
3274 prologue = scalar_iters
3275 if (prologue == 0)
3276 jmp to vector code
3277 else
3278 execute prologue
3279 if (prologue == num_iters)
3280 go to exit
3281
3282 Hence the run-time scalar cost is incremented by a taken branch,
3283 plus a not-taken branch, plus a taken branch cost.
3284
3285 3. The vectorizer then checks if an epilogue is required. If the
3286 cost model check was not done before during prologue check, it
3287 has to be done with the epilogue check.
3288
3289 if (prologue == 0)
3290 jmp to vector code
3291 else
3292 execute prologue
3293 if (prologue == num_iters)
3294 go to exit
3295 vector code:
3296 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3297 jmp to epilogue
3298
3299 Hence the run-time scalar cost should be incremented by 2 taken
3300 branches.
3301
3302 TODO: The back end may reorder the BBS's differently and reverse
3303 conditions/branch directions. Change the estimates below to
3304 something more reasonable. */
3305
3306 /* If the number of iterations is known and we do not do versioning, we can
3307 decide whether to vectorize at compile time. Hence the scalar version
3308 does not carry cost model guard costs. */
3309 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3310 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3311 {
3312 /* Cost model check occurs at versioning. */
3313 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3314 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3315 else
3316 {
3317 /* Cost model check occurs at prologue generation. */
3318 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3319 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3320 + vect_get_stmt_cost (cond_branch_not_taken);
3321 /* Cost model check occurs at epilogue generation. */
3322 else
3323 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3324 }
3325 }
3326
3327 /* Complete the target-specific cost calculations. */
3328 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3329 &vec_inside_cost, &vec_epilogue_cost);
3330
3331 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3332
3333 if (dump_enabled_p ())
3334 {
3335 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3336 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3337 vec_inside_cost);
3338 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3339 vec_prologue_cost);
3340 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3341 vec_epilogue_cost);
3342 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3343 scalar_single_iter_cost);
3344 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3345 scalar_outside_cost);
3346 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3347 vec_outside_cost);
3348 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3349 peel_iters_prologue);
3350 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3351 peel_iters_epilogue);
3352 }
3353
3354 /* Calculate number of iterations required to make the vector version
3355 profitable, relative to the loop bodies only. The following condition
3356 must hold true:
3357 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
3358 where
3359 SIC = scalar iteration cost, VIC = vector iteration cost,
3360 VOC = vector outside cost, VF = vectorization factor,
3361 NPEEL = prologue iterations + epilogue iterations,
3362 SOC = scalar outside cost for run time cost model check. */
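/* As a worked example (figures are illustrative only): with SIC = 4,
VIC = 10, VF = 4, NPEEL = 0, VOC = 20 and SOC = 0 the condition becomes
4 * niters > 10 * (niters / 4) + 20, i.e. 1.5 * niters > 20, so the
vector version starts to pay off from about 14 iterations. */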
3363
3364 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
3365 - vec_inside_cost);
3366 if (saving_per_viter <= 0)
3367 {
3368 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3369 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3370 "vectorization did not happen for a simd loop");
3371
3372 if (dump_enabled_p ())
3373 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3374 "cost model: the vector iteration cost = %d "
3375 "divided by the scalar iteration cost = %d "
3376 "is greater or equal to the vectorization factor = %d"
3377 ".\n",
3378 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3379 *ret_min_profitable_niters = -1;
3380 *ret_min_profitable_estimate = -1;
3381 return;
3382 }
3383
3384 /* ??? The "if" arm is written to handle all cases; see below for what
3385 we would do for !LOOP_VINFO_FULLY_MASKED_P. */
3386 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3387 {
3388 /* Rewriting the condition above in terms of the number of
3389 vector iterations (vniters) rather than the number of
3390 scalar iterations (niters) gives:
3391
3392 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
3393
3394 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
3395
3396 For integer N, X and Y when X > 0:
3397
3398 N * X > Y <==> N >= (Y /[floor] X) + 1. */
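 /* For instance, with X = 10 and Y = 25, N * X > Y first holds at N = 3,
 which matches (25 /[floor] 10) + 1. */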
3399 int outside_overhead = (vec_outside_cost
3400 - scalar_single_iter_cost * peel_iters_prologue
3401 - scalar_single_iter_cost * peel_iters_epilogue
3402 - scalar_outside_cost);
3403 /* We're only interested in cases that require at least one
3404 vector iteration. */
3405 int min_vec_niters = 1;
3406 if (outside_overhead > 0)
3407 min_vec_niters = outside_overhead / saving_per_viter + 1;
3408
3409 if (dump_enabled_p ())
3410 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
3411 min_vec_niters);
3412
3413 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3414 {
3415 /* Now that we know the minimum number of vector iterations,
3416 find the minimum niters for which the scalar cost is larger:
3417
3418 SIC * niters > VIC * vniters + VOC - SOC
3419
3420 We know that the minimum niters is no more than
3421 vniters * VF + NPEEL, but it might be (and often is) less
3422 than that if a partial vector iteration is cheaper than the
3423 equivalent scalar code. */
3424 int threshold = (vec_inside_cost * min_vec_niters
3425 + vec_outside_cost
3426 - scalar_outside_cost);
3427 if (threshold <= 0)
3428 min_profitable_iters = 1;
3429 else
3430 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
3431 }
3432 else
3433 /* Convert the number of vector iterations into a number of
3434 scalar iterations. */
3435 min_profitable_iters = (min_vec_niters * assumed_vf
3436 + peel_iters_prologue
3437 + peel_iters_epilogue);
3438 }
3439 else
3440 {
3441 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3442 * assumed_vf
3443 - vec_inside_cost * peel_iters_prologue
3444 - vec_inside_cost * peel_iters_epilogue);
3445 if (min_profitable_iters <= 0)
3446 min_profitable_iters = 0;
3447 else
3448 {
3449 min_profitable_iters /= saving_per_viter;
3450
3451 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3452 <= (((int) vec_inside_cost * min_profitable_iters)
3453 + (((int) vec_outside_cost - scalar_outside_cost)
3454 * assumed_vf)))
3455 min_profitable_iters++;
3456 }
3457 }
3458
3459 if (dump_enabled_p ())
3460 dump_printf (MSG_NOTE,
3461 " Calculated minimum iters for profitability: %d\n",
3462 min_profitable_iters);
3463
3464 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3465 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3466 /* We want the vectorized loop to execute at least once. */
3467 min_profitable_iters = assumed_vf + peel_iters_prologue;
3468
3469 if (dump_enabled_p ())
3470 dump_printf_loc (MSG_NOTE, vect_location,
3471 " Runtime profitability threshold = %d\n",
3472 min_profitable_iters);
3473
3474 *ret_min_profitable_niters = min_profitable_iters;
3475
3476 /* Calculate number of iterations required to make the vector version
3477 profitable, relative to the loop bodies only.
3478
3479 The non-vectorized variant costs SIC * niters and it must win over the vector
3480 variant on the expected loop trip count. The following condition must hold true:
3481 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
3482
3483 if (vec_outside_cost <= 0)
3484 min_profitable_estimate = 0;
3485 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3486 {
3487 /* This is a repeat of the code above, but with + SOC rather
3488 than - SOC. */
3489 int outside_overhead = (vec_outside_cost
3490 - scalar_single_iter_cost * peel_iters_prologue
3491 - scalar_single_iter_cost * peel_iters_epilogue
3492 + scalar_outside_cost);
3493 int min_vec_niters = 1;
3494 if (outside_overhead > 0)
3495 min_vec_niters = outside_overhead / saving_per_viter + 1;
3496
3497 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3498 {
3499 int threshold = (vec_inside_cost * min_vec_niters
3500 + vec_outside_cost
3501 + scalar_outside_cost);
3502 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
3503 }
3504 else
3505 min_profitable_estimate = (min_vec_niters * assumed_vf
3506 + peel_iters_prologue
3507 + peel_iters_epilogue);
3508 }
3509 else
3510 {
3511 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3512 * assumed_vf
3513 - vec_inside_cost * peel_iters_prologue
3514 - vec_inside_cost * peel_iters_epilogue)
3515 / ((scalar_single_iter_cost * assumed_vf)
3516 - vec_inside_cost);
3517 }
3518 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3519 if (dump_enabled_p ())
3520 dump_printf_loc (MSG_NOTE, vect_location,
3521 " Static estimate profitability threshold = %d\n",
3522 min_profitable_estimate);
3523
3524 *ret_min_profitable_estimate = min_profitable_estimate;
3525 }
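 /* Purely illustrative, standalone sketch (not used anywhere in this file):
 the non-fully-masked runtime threshold computation above, restated over
 plain integers. The parameter names mirror the comment in
 vect_estimate_min_profitable_iters (SIC, VIC, VOC, SOC, VF, NPEEL); the
 function name and signature are hypothetical. */
 #if 0
 static int
 example_min_profitable_niters (int sic, int vic, int voc, int soc,
                                int vf, int npeel_prologue, int npeel_epilogue)
 {
   /* Cost saved by one vector iteration relative to VF scalar iterations;
      assumed positive, otherwise vectorization is never profitable.  */
   int saving_per_viter = sic * vf - vic;
   int iters = ((voc - soc) * vf
                - vic * npeel_prologue
                - vic * npeel_epilogue);
   if (iters <= 0)
     iters = 0;
   else
     {
       iters /= saving_per_viter;
       /* Round up while the scalar cost does not yet exceed the vector
          cost at this iteration count.  */
       if (sic * vf * iters <= vic * iters + (voc - soc) * vf)
         iters++;
     }
   /* The vectorized loop should execute at least once.  */
   if (iters < vf + npeel_prologue)
     iters = vf + npeel_prologue;
   return iters;
 }
 #endif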
3526
3527 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3528 vector elements (not bits) for a vector with NELT elements. */
3529 static void
3530 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3531 vec_perm_builder *sel)
3532 {
3533 /* The encoding is a single stepped pattern. Any wrap-around is handled
3534 by vec_perm_indices. */
3535 sel->new_vector (nelt, 1, 3);
3536 for (unsigned int i = 0; i < 3; i++)
3537 sel->quick_push (i + offset);
3538 }
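 /* For example, with OFFSET == 2 and NELT == 8 the stepped encoding expands
 to the selector {2, 3, 4, 5, 6, 7, 8, 9}; indices 8 and 9 select from the
 second VEC_PERM_EXPR operand, which the reduction epilogue below passes
 as a zero vector, so the net effect is a shift right by two elements
 filling with zeros. */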
3539
3540 /* Checks whether the target supports whole-vector shifts for vectors of mode
3541 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3542 it supports vec_perm_const with masks for all necessary shift amounts. */
3543 static bool
3544 have_whole_vector_shift (machine_mode mode)
3545 {
3546 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3547 return true;
3548
3549 /* Variable-length vectors should be handled via the optab. */
3550 unsigned int nelt;
3551 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3552 return false;
3553
3554 vec_perm_builder sel;
3555 vec_perm_indices indices;
3556 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3557 {
3558 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3559 indices.new_vector (sel, 2, nelt);
3560 if (!can_vec_perm_const_p (mode, indices, false))
3561 return false;
3562 }
3563 return true;
3564 }
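 /* For example, with NELT == 8 the loop above checks shift amounts of 4, 2
 and 1 elements, exactly the offsets later used by the shift-based
 reduction epilogue. */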
3565
3566 /* TODO: There is a close dependency between the vect_model_*_cost and
3567 vectorizable_* functions. Improve the design to avoid maintenance issues. */
3568
3569 /* Function vect_model_reduction_cost.
3570
3571 Models cost for a reduction operation, including the vector ops
3572 generated within the strip-mine loop, the initial definition before
3573 the loop, and the epilogue code that must be generated. */
3574
3575 static void
3576 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3577 vect_reduction_type reduction_type,
3578 int ncopies, stmt_vector_for_cost *cost_vec)
3579 {
3580 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3581 enum tree_code code;
3582 optab optab;
3583 tree vectype;
3584 machine_mode mode;
3585 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3586 class loop *loop = NULL;
3587
3588 if (loop_vinfo)
3589 loop = LOOP_VINFO_LOOP (loop_vinfo);
3590
3591 /* Condition reductions generate two reductions in the loop. */
3592 if (reduction_type == COND_REDUCTION)
3593 ncopies *= 2;
3594
3595 vectype = STMT_VINFO_VECTYPE (stmt_info);
3596 mode = TYPE_MODE (vectype);
3597 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3598
3599 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3600
3601 if (reduction_type == EXTRACT_LAST_REDUCTION
3602 || reduction_type == FOLD_LEFT_REDUCTION)
3603 {
3604 /* No extra instructions needed in the prologue. */
3605 prologue_cost = 0;
3606
3607 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3608 /* Count one reduction-like operation per vector. */
3609 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3610 stmt_info, 0, vect_body);
3611 else
3612 {
3613 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3614 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3615 inside_cost = record_stmt_cost (cost_vec, nelements,
3616 vec_to_scalar, stmt_info, 0,
3617 vect_body);
3618 inside_cost += record_stmt_cost (cost_vec, nelements,
3619 scalar_stmt, stmt_info, 0,
3620 vect_body);
3621 }
3622 }
3623 else
3624 {
3625 /* Add in cost for initial definition.
3626 For cond reduction we have four vectors: initial index, step,
3627 initial result of the data reduction, initial value of the index
3628 reduction. */
3629 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3630 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3631 scalar_to_vec, stmt_info, 0,
3632 vect_prologue);
3633
3634 /* Cost of reduction op inside loop. */
3635 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3636 stmt_info, 0, vect_body);
3637 }
3638
3639 /* Determine cost of epilogue code.
3640
3641 We have a reduction operator that will reduce the vector in one statement.
3642 Also requires scalar extract. */
3643
3644 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
3645 {
3646 if (reduc_fn != IFN_LAST)
3647 {
3648 if (reduction_type == COND_REDUCTION)
3649 {
3650 /* An EQ stmt and a COND_EXPR stmt. */
3651 epilogue_cost += record_stmt_cost (cost_vec, 2,
3652 vector_stmt, stmt_info, 0,
3653 vect_epilogue);
3654 /* Reduction of the max index and a reduction of the found
3655 values. */
3656 epilogue_cost += record_stmt_cost (cost_vec, 2,
3657 vec_to_scalar, stmt_info, 0,
3658 vect_epilogue);
3659 /* A broadcast of the max value. */
3660 epilogue_cost += record_stmt_cost (cost_vec, 1,
3661 scalar_to_vec, stmt_info, 0,
3662 vect_epilogue);
3663 }
3664 else
3665 {
3666 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3667 stmt_info, 0, vect_epilogue);
3668 epilogue_cost += record_stmt_cost (cost_vec, 1,
3669 vec_to_scalar, stmt_info, 0,
3670 vect_epilogue);
3671 }
3672 }
3673 else if (reduction_type == COND_REDUCTION)
3674 {
3675 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3676 /* Extraction of scalar elements. */
3677 epilogue_cost += record_stmt_cost (cost_vec,
3678 2 * estimated_nunits,
3679 vec_to_scalar, stmt_info, 0,
3680 vect_epilogue);
3681 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
3682 epilogue_cost += record_stmt_cost (cost_vec,
3683 2 * estimated_nunits - 3,
3684 scalar_stmt, stmt_info, 0,
3685 vect_epilogue);
3686 }
3687 else if (reduction_type == EXTRACT_LAST_REDUCTION
3688 || reduction_type == FOLD_LEFT_REDUCTION)
3689 /* No extra instructions are needed in the epilogue. */
3690 ;
3691 else
3692 {
3693 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3694 tree bitsize =
3695 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
3696 int element_bitsize = tree_to_uhwi (bitsize);
3697 int nelements = vec_size_in_bits / element_bitsize;
3698
3699 if (code == COND_EXPR)
3700 code = MAX_EXPR;
3701
3702 optab = optab_for_tree_code (code, vectype, optab_default);
3703
3704 /* We have a whole vector shift available. */
3705 if (optab != unknown_optab
3706 && VECTOR_MODE_P (mode)
3707 && optab_handler (optab, mode) != CODE_FOR_nothing
3708 && have_whole_vector_shift (mode))
3709 {
3710 /* Final reduction via vector shifts and the reduction operator.
3711 Also requires scalar extract. */
3712 epilogue_cost += record_stmt_cost (cost_vec,
3713 exact_log2 (nelements) * 2,
3714 vector_stmt, stmt_info, 0,
3715 vect_epilogue);
3716 epilogue_cost += record_stmt_cost (cost_vec, 1,
3717 vec_to_scalar, stmt_info, 0,
3718 vect_epilogue);
3719 }
3720 else
3721 /* Use extracts and reduction op for final reduction. For N
3722 elements, we have N extracts and N-1 reduction ops. */
3723 epilogue_cost += record_stmt_cost (cost_vec,
3724 nelements + nelements - 1,
3725 vector_stmt, stmt_info, 0,
3726 vect_epilogue);
3727 }
3728 }
3729
3730 if (dump_enabled_p ())
3731 dump_printf (MSG_NOTE,
3732 "vect_model_reduction_cost: inside_cost = %d, "
3733 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3734 prologue_cost, epilogue_cost);
3735 }
3736
3737
3738 /* Function vect_model_induction_cost.
3739
3740 Models cost for induction operations. */
3741
3742 static void
3743 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
3744 stmt_vector_for_cost *cost_vec)
3745 {
3746 unsigned inside_cost, prologue_cost;
3747
3748 if (PURE_SLP_STMT (stmt_info))
3749 return;
3750
3751 /* loop cost for vec_loop. */
3752 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3753 stmt_info, 0, vect_body);
3754
3755 /* prologue cost for vec_init and vec_step. */
3756 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
3757 stmt_info, 0, vect_prologue);
3758
3759 if (dump_enabled_p ())
3760 dump_printf_loc (MSG_NOTE, vect_location,
3761 "vect_model_induction_cost: inside_cost = %d, "
3762 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3763 }
3764
3765
3766
3767 /* Function get_initial_def_for_reduction
3768
3769 Input:
3770 STMT_VINFO - a stmt that performs a reduction operation in the loop.
3771 INIT_VAL - the initial value of the reduction variable
3772
3773 Output:
3774 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3775 of the reduction (used for adjusting the epilog - see below).
3776 Return a vector variable, initialized according to the operation that
3777 STMT_VINFO performs. This vector will be used as the initial value
3778 of the vector of partial results.
3779
3780 Option1 (adjust in epilog): Initialize the vector as follows:
3781 add/bit or/xor: [0,0,...,0,0]
3782 mult/bit and: [1,1,...,1,1]
3783 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3784 and when necessary (e.g. add/mult case) let the caller know
3785 that it needs to adjust the result by init_val.
3786
3787 Option2: Initialize the vector as follows:
3788 add/bit or/xor: [init_val,0,0,...,0]
3789 mult/bit and: [init_val,1,1,...,1]
3790 min/max/cond_expr: [init_val,init_val,...,init_val]
3791 and no adjustments are needed.
3792
3793 For example, for the following code:
3794
3795 s = init_val;
3796 for (i=0;i<n;i++)
3797 s = s + a[i];
3798
3799 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
3800 For a vector of 4 units, we want to return either [0,0,0,init_val],
3801 or [0,0,0,0] and let the caller know that it needs to adjust
3802 the result at the end by 'init_val'.
3803
3804 FORNOW, we use Option1 (adjust in epilog) when ADJUSTMENT_DEF is not NULL,
3805 because its initialization vector is simpler (same element in all
3806 entries), and Option2 otherwise.
3807
3808 A cost model should help decide between these two schemes. */
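 /* As an illustration with concrete numbers: summing a[] = {1,2,3,4} with
 init_val = 5 and four units per vector, Option1 starts from {0,0,0,0},
 the loop leaves partial sums {1,2,3,4}, the epilogue reduces them to 10
 and the caller adds the adjustment: 10 + 5 = 15. Option2 starts from
 {5,0,0,0}, the loop leaves {6,2,3,4}, which reduces to 15 directly. */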
3809
3810 static tree
3811 get_initial_def_for_reduction (stmt_vec_info stmt_vinfo,
3812 enum tree_code code, tree init_val,
3813 tree *adjustment_def)
3814 {
3815 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3816 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3817 tree scalar_type = TREE_TYPE (init_val);
3818 tree vectype = get_vectype_for_scalar_type (scalar_type);
3819 tree def_for_init;
3820 tree init_def;
3821 REAL_VALUE_TYPE real_init_val = dconst0;
3822 int int_init_val = 0;
3823 gimple_seq stmts = NULL;
3824
3825 gcc_assert (vectype);
3826
3827 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3828 || SCALAR_FLOAT_TYPE_P (scalar_type));
3829
3830 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
3831 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
3832
3833 /* ADJUSTMENT_DEF is NULL when called from
3834 vect_create_epilog_for_reduction to vectorize double reduction. */
3835 if (adjustment_def)
3836 *adjustment_def = NULL;
3837
3838 switch (code)
3839 {
3840 case WIDEN_SUM_EXPR:
3841 case DOT_PROD_EXPR:
3842 case SAD_EXPR:
3843 case PLUS_EXPR:
3844 case MINUS_EXPR:
3845 case BIT_IOR_EXPR:
3846 case BIT_XOR_EXPR:
3847 case MULT_EXPR:
3848 case BIT_AND_EXPR:
3849 {
3850 if (code == MULT_EXPR)
3851 {
3852 real_init_val = dconst1;
3853 int_init_val = 1;
3854 }
3855
3856 if (code == BIT_AND_EXPR)
3857 int_init_val = -1;
3858
3859 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3860 def_for_init = build_real (scalar_type, real_init_val);
3861 else
3862 def_for_init = build_int_cst (scalar_type, int_init_val);
3863
3864 if (adjustment_def || operand_equal_p (def_for_init, init_val, 0))
3865 {
3866 /* Option1: the first element is '0' or '1' as well. */
3867 if (!operand_equal_p (def_for_init, init_val, 0))
3868 *adjustment_def = init_val;
3869 init_def = gimple_build_vector_from_val (&stmts, vectype,
3870 def_for_init);
3871 }
3872 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
3873 {
3874 /* Option2 (variable length): the first element is INIT_VAL. */
3875 init_def = gimple_build_vector_from_val (&stmts, vectype,
3876 def_for_init);
3877 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
3878 vectype, init_def, init_val);
3879 }
3880 else
3881 {
3882 /* Option2: the first element is INIT_VAL. */
3883 tree_vector_builder elts (vectype, 1, 2);
3884 elts.quick_push (init_val);
3885 elts.quick_push (def_for_init);
3886 init_def = gimple_build_vector (&stmts, &elts);
3887 }
3888 }
3889 break;
3890
3891 case MIN_EXPR:
3892 case MAX_EXPR:
3893 case COND_EXPR:
3894 {
3895 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
3896 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
3897 }
3898 break;
3899
3900 default:
3901 gcc_unreachable ();
3902 }
3903
3904 if (stmts)
3905 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
3906 return init_def;
3907 }
3908
3909 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
3910 NUMBER_OF_VECTORS is the number of vector defs to create.
3911 If NEUTRAL_OP is nonnull, introducing extra elements of that
3912 value will not change the result. */
3913
3914 static void
3915 get_initial_defs_for_reduction (slp_tree slp_node,
3916 vec<tree> *vec_oprnds,
3917 unsigned int number_of_vectors,
3918 bool reduc_chain, tree neutral_op)
3919 {
3920 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
3921 stmt_vec_info stmt_vinfo = stmts[0];
3922 unsigned HOST_WIDE_INT nunits;
3923 unsigned j, number_of_places_left_in_vector;
3924 tree vector_type;
3925 unsigned int group_size = stmts.length ();
3926 unsigned int i;
3927 class loop *loop;
3928
3929 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
3930
3931 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
3932
3933 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
3934 gcc_assert (loop);
3935 edge pe = loop_preheader_edge (loop);
3936
3937 gcc_assert (!reduc_chain || neutral_op);
3938
3939 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
3940 created vectors. It is greater than 1 if unrolling is performed.
3941
3942 For example, we have two scalar operands, s1 and s2 (e.g., group of
3943 strided accesses of size two), while NUNITS is four (i.e., four scalars
3944 of this type can be packed in a vector). The output vector will contain
3945 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
3946 will be 2).
3947
3948 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
3949 vectors containing the operands.
3950
3951 For example, NUNITS is four as before, and the group size is 8
3952 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
3953 {s5, s6, s7, s8}. */
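 /* For reductions there is a further twist (see the code below): each
 initial value is needed only once, so once all of them have been placed
 the remaining lanes are filled with NEUTRAL_OP. E.g. for two reduction
 PHIs with initial values i1 and i2, NUNITS == 4 and a PLUS reduction,
 the first vector becomes {i1, i2, 0, 0}. */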
3954
3955 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
3956 nunits = group_size;
3957
3958 number_of_places_left_in_vector = nunits;
3959 bool constant_p = true;
3960 tree_vector_builder elts (vector_type, nunits, 1);
3961 elts.quick_grow (nunits);
3962 gimple_seq ctor_seq = NULL;
3963 for (j = 0; j < nunits * number_of_vectors; ++j)
3964 {
3965 tree op;
3966 i = j % group_size;
3967 stmt_vinfo = stmts[i];
3968
3969 /* Get the def before the loop. In reduction chain we have only
3970 one initial value. Else we have as many as PHIs in the group. */
3971 if (reduc_chain)
3972 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
3973 else if (((vec_oprnds->length () + 1) * nunits
3974 - number_of_places_left_in_vector >= group_size)
3975 && neutral_op)
3976 op = neutral_op;
3977 else
3978 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
3979
3980 /* Create 'vect_ = {op0,op1,...,opn}'. */
3981 number_of_places_left_in_vector--;
3982 elts[nunits - number_of_places_left_in_vector - 1] = op;
3983 if (!CONSTANT_CLASS_P (op))
3984 constant_p = false;
3985
3986 if (number_of_places_left_in_vector == 0)
3987 {
3988 tree init;
3989 if (constant_p && !neutral_op
3990 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
3991 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
3992 /* Build the vector directly from ELTS. */
3993 init = gimple_build_vector (&ctor_seq, &elts);
3994 else if (neutral_op)
3995 {
3996 /* Build a vector of the neutral value and shift the
3997 other elements into place. */
3998 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
3999 neutral_op);
4000 int k = nunits;
4001 while (k > 0 && elts[k - 1] == neutral_op)
4002 k -= 1;
4003 while (k > 0)
4004 {
4005 k -= 1;
4006 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4007 vector_type, init, elts[k]);
4008 }
4009 }
4010 else
4011 {
4012 /* First time round, duplicate ELTS to fill the
4013 required number of vectors. */
4014 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4015 number_of_vectors, *vec_oprnds);
4016 break;
4017 }
4018 vec_oprnds->quick_push (init);
4019
4020 number_of_places_left_in_vector = nunits;
4021 elts.new_vector (vector_type, nunits, 1);
4022 elts.quick_grow (nunits);
4023 constant_p = true;
4024 }
4025 }
4026 if (ctor_seq != NULL)
4027 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4028 }
4029
4030 /* For a statement STMT_INFO taking part in a reduction operation return
4031 the stmt_vec_info the meta information is stored on. */
4032
4033 stmt_vec_info
4034 info_for_reduction (stmt_vec_info stmt_info)
4035 {
4036 stmt_info = vect_orig_stmt (stmt_info);
4037 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4038 if (!is_a <gphi *> (stmt_info->stmt))
4039 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4040 gphi *phi = as_a <gphi *> (stmt_info->stmt);
4041 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4042 {
4043 if (gimple_phi_num_args (phi) == 1)
4044 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4045 }
4046 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4047 {
4048 edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father);
4049 stmt_vec_info info
4050 = stmt_info->vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
4051 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4052 stmt_info = info;
4053 }
4054 return stmt_info;
4055 }
4056
4057 /* Function vect_create_epilog_for_reduction
4058
4059 Create code at the loop-epilog to finalize the result of a reduction
4060 computation.
4061
4062 STMT_INFO is the scalar reduction stmt that is being vectorized.
4063 SLP_NODE is an SLP node containing a group of reduction statements. The
4064 first one in this group is STMT_INFO.
4065 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
4066 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
4067 (counting from 0)
4068
4069 This function:
4070 1. Completes the reduction def-use cycles.
4071 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4072 by calling the function specified by REDUC_FN if available, or by
4073 other means (whole-vector shifts or a scalar loop).
4074 The function also creates a new phi node at the loop exit to preserve
4075 loop-closed form, as illustrated below.
4076
4077 The flow at the entry to this function:
4078
4079 loop:
4080 vec_def = phi <vec_init, null> # REDUCTION_PHI
4081 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4082 s_loop = scalar_stmt # (scalar) STMT_INFO
4083 loop_exit:
4084 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4085 use <s_out0>
4086 use <s_out0>
4087
4088 The above is transformed by this function into:
4089
4090 loop:
4091 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4092 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4093 s_loop = scalar_stmt # (scalar) STMT_INFO
4094 loop_exit:
4095 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4096 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4097 v_out2 = reduce <v_out1>
4098 s_out3 = extract_field <v_out2, 0>
4099 s_out4 = adjust_result <s_out3>
4100 use <s_out4>
4101 use <s_out4>
4102 */
4103
4104 static void
4105 vect_create_epilog_for_reduction (stmt_vec_info stmt_info,
4106 slp_tree slp_node,
4107 slp_instance slp_node_instance)
4108 {
4109 stmt_vec_info reduc_info = info_for_reduction (stmt_info);
4110 gcc_assert (reduc_info->is_reduc_info);
4111 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4112 /* For double reductions we need to get at the inner loop reduction
4113 stmt which has the meta info attached. Our stmt_info is that of the
4114 loop-closed PHI of the inner loop which we remember as
4115 def for the reduction PHI generation. */
4116 bool double_reduc = false;
4117 stmt_vec_info rdef_info = stmt_info;
4118 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4119 {
4120 gcc_assert (!slp_node);
4121 double_reduc = true;
4122 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
4123 (stmt_info->stmt, 0));
4124 stmt_info = vect_stmt_to_vectorize (stmt_info);
4125 }
4126 gphi *reduc_def_stmt
4127 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
4128 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
4129 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
4130 tree neutral_op = NULL_TREE;
4131 if (slp_node)
4132 neutral_op
4133 = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis, code,
4134 REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4135 stmt_vec_info prev_phi_info;
4136 tree vectype;
4137 machine_mode mode;
4138 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4139 basic_block exit_bb;
4140 tree scalar_dest;
4141 tree scalar_type;
4142 gimple *new_phi = NULL, *phi;
4143 stmt_vec_info phi_info;
4144 gimple_stmt_iterator exit_gsi;
4145 tree vec_dest;
4146 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4147 gimple *epilog_stmt = NULL;
4148 gimple *exit_phi;
4149 tree bitsize;
4150 tree expr, def;
4151 tree orig_name, scalar_result;
4152 imm_use_iterator imm_iter, phi_imm_iter;
4153 use_operand_p use_p, phi_use_p;
4154 gimple *use_stmt;
4155 bool nested_in_vect_loop = false;
4156 auto_vec<gimple *> new_phis;
4157 int j, i;
4158 auto_vec<tree> scalar_results;
4159 unsigned int group_size = 1, k;
4160 auto_vec<gimple *> phis;
4161 bool slp_reduc = false;
4162 bool direct_slp_reduc;
4163 tree new_phi_result;
4164 tree induction_index = NULL_TREE;
4165
4166 if (slp_node)
4167 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4168
4169 if (nested_in_vect_loop_p (loop, stmt_info))
4170 {
4171 outer_loop = loop;
4172 loop = loop->inner;
4173 nested_in_vect_loop = true;
4174 gcc_assert (!slp_node);
4175 }
4176 gcc_assert (!nested_in_vect_loop || double_reduc);
4177
4178 vectype = STMT_VINFO_VECTYPE (stmt_info);
4179 gcc_assert (vectype);
4180 mode = TYPE_MODE (vectype);
4181
4182 tree initial_def = NULL;
4183 tree induc_val = NULL_TREE;
4184 tree adjustment_def = NULL;
4185 if (slp_node)
4186 ;
4187 else
4188 {
4189 /* Get at the scalar def before the loop, that defines the initial value
4190 of the reduction variable. */
4191 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4192 loop_preheader_edge (loop));
4193 /* Optimize: for induction condition reduction, if we can't use zero
4194 for induc_val, use initial_def. */
4195 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
4196 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
4197 else if (double_reduc)
4198 ;
4199 else if (nested_in_vect_loop)
4200 ;
4201 else
4202 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
4203 }
4204
4205 unsigned vec_num;
4206 int ncopies;
4207 if (slp_node)
4208 {
4209 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
4210 ncopies = 1;
4211 }
4212 else
4213 {
4214 vec_num = 1;
4215 ncopies = 0;
4216 phi_info = STMT_VINFO_VEC_STMT (loop_vinfo->lookup_stmt (reduc_def_stmt));
4217 do
4218 {
4219 ncopies++;
4220 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4221 }
4222 while (phi_info);
4223 }
4224
4225 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4226 which is updated with the current index of the loop for every match of
4227 the original loop's cond_expr (VEC_STMT). This results in a vector
4228 containing, for each vector lane, the last index at which the condition passed.
4229 The first match will be a 1 to allow 0 to be used for non-matching
4230 indexes. If there are no matches at all then the vector will be all
4231 zeroes. */
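 /* For example, with VF == 4 and eight scalar iterations where the
 condition only matches at iterations 1 and 6 (counting from 0), the
 index vector takes the values {1,2,3,4} and then {5,6,7,8}, and the
 phi ends up as {0, 2, 7, 0}: lane 1 last matched at iteration 1
 (recorded as 2), lane 2 at iteration 6 (recorded as 7), and the
 remaining lanes never matched. */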
4232 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
4233 {
4234 tree indx_before_incr, indx_after_incr;
4235 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4236
4237 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info)->stmt;
4238 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4239
4240 int scalar_precision
4241 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4242 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4243 tree cr_index_vector_type = build_vector_type
4244 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4245
4246 /* First we create a simple vector induction variable which starts
4247 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4248 vector size (STEP). */
4249
4250 /* Create a {1,2,3,...} vector. */
4251 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4252
4253 /* Create a vector of the step value. */
4254 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4255 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4256
4257 /* Create an induction variable. */
4258 gimple_stmt_iterator incr_gsi;
4259 bool insert_after;
4260 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4261 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4262 insert_after, &indx_before_incr, &indx_after_incr);
4263
4264 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4265 filled with zeros (VEC_ZERO). */
4266
4267 /* Create a vector of 0s. */
4268 tree zero = build_zero_cst (cr_index_scalar_type);
4269 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4270
4271 /* Create a vector phi node. */
4272 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4273 new_phi = create_phi_node (new_phi_tree, loop->header);
4274 loop_vinfo->add_stmt (new_phi);
4275 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4276 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4277
4278 /* Now take the condition from the loop's original cond_expr
4279 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4280 every match uses values from the induction variable
4281 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4282 (NEW_PHI_TREE).
4283 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4284 the new cond_expr (INDEX_COND_EXPR). */
4285
4286 /* Duplicate the condition from vec_stmt. */
4287 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4288
4289 /* Create a conditional, where the condition is taken from vec_stmt
4290 (CCOMPARE). The then and else values mirror the main VEC_COND_EXPR:
4291 the reduction phi corresponds to NEW_PHI_TREE and the new values
4292 correspond to INDEX_BEFORE_INCR. */
4293 gcc_assert (STMT_VINFO_REDUC_IDX (stmt_info) >= 1);
4294 tree index_cond_expr;
4295 if (STMT_VINFO_REDUC_IDX (stmt_info) == 2)
4296 index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4297 ccompare, indx_before_incr, new_phi_tree);
4298 else
4299 index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4300 ccompare, new_phi_tree, indx_before_incr);
4301 induction_index = make_ssa_name (cr_index_vector_type);
4302 gimple *index_condition = gimple_build_assign (induction_index,
4303 index_cond_expr);
4304 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4305 stmt_vec_info index_vec_info = loop_vinfo->add_stmt (index_condition);
4306 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4307
4308 /* Update the phi with the vec cond. */
4309 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4310 loop_latch_edge (loop), UNKNOWN_LOCATION);
4311 }
4312
4313 /* 2. Create epilog code.
4314 The reduction epilog code operates across the elements of the vector
4315 of partial results computed by the vectorized loop.
4316 The reduction epilog code consists of:
4317
4318 step 1: compute the scalar result in a vector (v_out2)
4319 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4320 step 3: adjust the scalar result (s_out3) if needed.
4321
4322 Step 1 can be accomplished using one of the following three schemes:
4323 (scheme 1) using reduc_fn, if available.
4324 (scheme 2) using whole-vector shifts, if available.
4325 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4326 combined.
4327
4328 The overall epilog code looks like this:
4329
4330 s_out0 = phi <s_loop> # original EXIT_PHI
4331 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4332 v_out2 = reduce <v_out1> # step 1
4333 s_out3 = extract_field <v_out2, 0> # step 2
4334 s_out4 = adjust_result <s_out3> # step 3
4335
4336 (step 3 is optional, and steps 1 and 2 may be combined).
4337 Lastly, the uses of s_out0 are replaced by s_out4. */
4338
4339
4340 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4341 v_out1 = phi <VECT_DEF>
4342 Store them in NEW_PHIS. */
4343 if (double_reduc)
4344 loop = outer_loop;
4345 exit_bb = single_exit (loop)->dest;
4346 prev_phi_info = NULL;
4347 new_phis.create (slp_node ? vec_num : ncopies);
4348 for (unsigned i = 0; i < vec_num; i++)
4349 {
4350 if (slp_node)
4351 def = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]->stmt);
4352 else
4353 def = gimple_get_lhs (STMT_VINFO_VEC_STMT (rdef_info)->stmt);
4354 for (j = 0; j < ncopies; j++)
4355 {
4356 tree new_def = copy_ssa_name (def);
4357 phi = create_phi_node (new_def, exit_bb);
4358 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4359 if (j == 0)
4360 new_phis.quick_push (phi);
4361 else
4362 {
4363 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4364 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4365 }
4366
4367 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4368 prev_phi_info = phi_info;
4369 }
4370 }
4371
4372 exit_gsi = gsi_after_labels (exit_bb);
4373
4374 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4375 (i.e. when reduc_fn is not available) and in the final adjustment
4376 code (if needed). Also get the original scalar reduction variable as
4377 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4378 represents a reduction pattern), the tree-code and scalar-def are
4379 taken from the original stmt that the pattern-stmt (STMT) replaces.
4380 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4381 are taken from STMT. */
4382
4383 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4384 if (orig_stmt_info != stmt_info)
4385 {
4386 /* Reduction pattern */
4387 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4388 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4389 }
4390
4391 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4392 scalar_type = TREE_TYPE (scalar_dest);
4393 scalar_results.create (group_size);
4394 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4395 bitsize = TYPE_SIZE (scalar_type);
4396
4397 /* SLP reduction without reduction chain, e.g.,
4398 # a1 = phi <a2, a0>
4399 # b1 = phi <b2, b0>
4400 a2 = operation (a1)
4401 b2 = operation (b1) */
4402 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4403
4404 /* True if we should implement SLP_REDUC using native reduction operations
4405 instead of scalar operations. */
4406 direct_slp_reduc = (reduc_fn != IFN_LAST
4407 && slp_reduc
4408 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4409
4410 /* In case of reduction chain, e.g.,
4411 # a1 = phi <a3, a0>
4412 a2 = operation (a1)
4413 a3 = operation (a2),
4414
4415 we may end up with more than one vector result. Here we reduce them to
4416 one vector. */
4417 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
4418 {
4419 tree first_vect = PHI_RESULT (new_phis[0]);
4420 gassign *new_vec_stmt = NULL;
4421 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4422 for (k = 1; k < new_phis.length (); k++)
4423 {
4424 gimple *next_phi = new_phis[k];
4425 tree second_vect = PHI_RESULT (next_phi);
4426 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4427 new_vec_stmt = gimple_build_assign (tem, code,
4428 first_vect, second_vect);
4429 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4430 first_vect = tem;
4431 }
4432
4433 new_phi_result = first_vect;
4434 if (new_vec_stmt)
4435 {
4436 new_phis.truncate (0);
4437 new_phis.safe_push (new_vec_stmt);
4438 }
4439 }
4440 /* Likewise if we couldn't use a single def-use cycle. */
4441 else if (ncopies > 1)
4442 {
4443 gcc_assert (new_phis.length () == 1);
4444 tree first_vect = PHI_RESULT (new_phis[0]);
4445 gassign *new_vec_stmt = NULL;
4446 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4447 stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
4448 for (int k = 1; k < ncopies; ++k)
4449 {
4450 next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
4451 tree second_vect = PHI_RESULT (next_phi_info->stmt);
4452 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4453 new_vec_stmt = gimple_build_assign (tem, code,
4454 first_vect, second_vect);
4455 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4456 first_vect = tem;
4457 }
4458 new_phi_result = first_vect;
4459 new_phis.truncate (0);
4460 new_phis.safe_push (new_vec_stmt);
4461 }
4462 else
4463 new_phi_result = PHI_RESULT (new_phis[0]);
4464
4465 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
4466 && reduc_fn != IFN_LAST)
4467 {
4468 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4469 various data values where the condition matched and another vector
4470 (INDUCTION_INDEX) containing all the indexes of those matches. We
4471 need to extract the last matching index (which will be the index with
4472 highest value) and use this to index into the data vector.
4473 For the case where there were no matches, the data vector will contain
4474 all default values and the index vector will be all zeros. */
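 /* Continuing the example above, with INDUCTION_INDEX == {0, 2, 7, 0}
 the IFN_REDUC_MAX below yields 7, the comparison keeps only lane 2
 of NEW_PHI_RESULT (zeroing the others), and a second (unsigned) max
 reduction then extracts that single data value. */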
4475
4476 /* Get various versions of the type of the vector of indexes. */
4477 tree index_vec_type = TREE_TYPE (induction_index);
4478 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4479 tree index_scalar_type = TREE_TYPE (index_vec_type);
4480 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4481 (index_vec_type);
4482
4483 /* Get an unsigned integer version of the type of the data vector. */
4484 int scalar_precision
4485 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4486 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4487 tree vectype_unsigned = build_vector_type
4488 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4489
4490 /* First we need to create a vector (ZERO_VEC) of zeros and another
4491 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4492 can create using a MAX reduction and then expanding.
4493 In the case where the loop never made any matches, the max index will
4494 be zero. */
4495
4496 /* Vector of {0, 0, 0,...}. */
4497 tree zero_vec = make_ssa_name (vectype);
4498 tree zero_vec_rhs = build_zero_cst (vectype);
4499 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4500 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4501
4502 /* Find maximum value from the vector of found indexes. */
4503 tree max_index = make_ssa_name (index_scalar_type);
4504 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4505 1, induction_index);
4506 gimple_call_set_lhs (max_index_stmt, max_index);
4507 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4508
4509 /* Vector of {max_index, max_index, max_index,...}. */
4510 tree max_index_vec = make_ssa_name (index_vec_type);
4511 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4512 max_index);
4513 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4514 max_index_vec_rhs);
4515 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4516
4517 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4518 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4519 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4520 otherwise. Only one value should match, resulting in a vector
4521 (VEC_COND) with one data value and the rest zeros.
4522 In the case where the loop never made any matches, every index will
4523 match, resulting in a vector with all data values (which will all be
4524 the default value). */
4525
4526 /* Compare the max index vector to the vector of found indexes to find
4527 the position of the max value. */
4528 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4529 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4530 induction_index,
4531 max_index_vec);
4532 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4533
4534 /* Use the compare to choose either values from the data vector or
4535 zero. */
4536 tree vec_cond = make_ssa_name (vectype);
4537 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4538 vec_compare, new_phi_result,
4539 zero_vec);
4540 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4541
4542 /* Finally we need to extract the data value from the vector (VEC_COND)
4543 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
4544 reduction, but because this doesn't exist, we can use a MAX reduction
4545 instead. The data value might be signed or a float, so we need to cast
4546 it to an unsigned type first.
4547 In the case where the loop never made any matches, the data values are
4548 all identical, and so will reduce down correctly. */
4549
4550 /* Make the matched data values unsigned. */
4551 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4552 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4553 vec_cond);
4554 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4555 VIEW_CONVERT_EXPR,
4556 vec_cond_cast_rhs);
4557 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4558
4559 /* Reduce down to a scalar value. */
4560 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4561 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4562 1, vec_cond_cast);
4563 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4564 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4565
4566 /* Convert the reduced value back to the result type and set as the
4567 result. */
4568 gimple_seq stmts = NULL;
4569 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4570 data_reduc);
4571 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4572 scalar_results.safe_push (new_temp);
4573 }
4574 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
4575 && reduc_fn == IFN_LAST)
4576 {
4577 /* Condition reduction without supported IFN_REDUC_MAX. Generate
4578 the equivalent of:
4579 idx_val = induction_index[0];
4580 val = data_reduc[0];
4581 for (i = 1; i < nelts; ++i)
4582 if (induction_index[i] > idx_val)
4583 val = data_reduc[i], idx_val = induction_index[i];
4584 return val; */
4585
4586 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4587 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4588 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4589 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4590 /* Enforced by vectorizable_reduction, which ensures we have target
4591 support before allowing a conditional reduction on variable-length
4592 vectors. */
4593 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
4594 tree idx_val = NULL_TREE, val = NULL_TREE;
4595 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4596 {
4597 tree old_idx_val = idx_val;
4598 tree old_val = val;
4599 idx_val = make_ssa_name (idx_eltype);
4600 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4601 build3 (BIT_FIELD_REF, idx_eltype,
4602 induction_index,
4603 bitsize_int (el_size),
4604 bitsize_int (off)));
4605 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4606 val = make_ssa_name (data_eltype);
4607 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4608 build3 (BIT_FIELD_REF,
4609 data_eltype,
4610 new_phi_result,
4611 bitsize_int (el_size),
4612 bitsize_int (off)));
4613 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4614 if (off != 0)
4615 {
4616 tree new_idx_val = idx_val;
4617 if (off != v_size - el_size)
4618 {
4619 new_idx_val = make_ssa_name (idx_eltype);
4620 epilog_stmt = gimple_build_assign (new_idx_val,
4621 MAX_EXPR, idx_val,
4622 old_idx_val);
4623 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4624 }
4625 tree new_val = make_ssa_name (data_eltype);
4626 epilog_stmt = gimple_build_assign (new_val,
4627 COND_EXPR,
4628 build2 (GT_EXPR,
4629 boolean_type_node,
4630 idx_val,
4631 old_idx_val),
4632 val, old_val);
4633 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4634 idx_val = new_idx_val;
4635 val = new_val;
4636 }
4637 }
4638 /* Convert the reduced value back to the result type and set as the
4639 result. */
4640 gimple_seq stmts = NULL;
4641 val = gimple_convert (&stmts, scalar_type, val);
4642 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4643 scalar_results.safe_push (val);
4644 }
4645
4646 /* 2.3 Create the reduction code, using one of the three schemes described
4647 above. In SLP we simply need to extract all the elements from the
4648 vector (without reducing them), so we use scalar shifts. */
4649 else if (reduc_fn != IFN_LAST && !slp_reduc)
4650 {
4651 tree tmp;
4652 tree vec_elem_type;
4653
4654 /* Case 1: Create:
4655 v_out2 = reduc_expr <v_out1> */
4656
4657 if (dump_enabled_p ())
4658 dump_printf_loc (MSG_NOTE, vect_location,
4659 "Reduce using direct vector reduction.\n");
4660
4661 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4662 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4663 {
4664 tree tmp_dest
4665 = vect_create_destination_var (scalar_dest, vec_elem_type);
4666 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
4667 new_phi_result);
4668 gimple_set_lhs (epilog_stmt, tmp_dest);
4669 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4670 gimple_set_lhs (epilog_stmt, new_temp);
4671 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4672
4673 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
4674 new_temp);
4675 }
4676 else
4677 {
4678 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
4679 new_phi_result);
4680 gimple_set_lhs (epilog_stmt, new_scalar_dest);
4681 }
4682
4683 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4684 gimple_set_lhs (epilog_stmt, new_temp);
4685 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4686
4687 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
4688 && induc_val)
4689 {
4690 /* Earlier we set the initial value to be a vector of induc_val
4691 values. Check the result and if it is induc_val then replace it
4692 with the original initial value, unless induc_val is
4693 the same as initial_def already. */
4694 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
4695 induc_val);
4696
4697 tmp = make_ssa_name (new_scalar_dest);
4698 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
4699 initial_def, new_temp);
4700 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4701 new_temp = tmp;
4702 }
4703
4704 scalar_results.safe_push (new_temp);
4705 }
4706 else if (direct_slp_reduc)
4707 {
4708 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
4709 with the elements for other SLP statements replaced with the
4710 neutral value. We can then do a normal reduction on each vector. */
4711
4712 /* Enforced by vectorizable_reduction. */
4713 gcc_assert (new_phis.length () == 1);
4714 gcc_assert (pow2p_hwi (group_size));
4715
4716 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
4717 vec<stmt_vec_info> orig_phis
4718 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
4719 gimple_seq seq = NULL;
4720
4721 /* Build a vector {0, 1, 2, ...}, with the same number of elements
4722 and the same element size as VECTYPE. */
4723 tree index = build_index_vector (vectype, 0, 1);
4724 tree index_type = TREE_TYPE (index);
4725 tree index_elt_type = TREE_TYPE (index_type);
4726 tree mask_type = build_same_sized_truth_vector_type (index_type);
4727
4728 /* Create a vector that, for each element, identifies which of
4729 the REDUC_GROUP_SIZE results should use it. */
4730 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
4731 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
4732 build_vector_from_val (index_type, index_mask));
4733
4734 /* Get a neutral vector value. This is simply a splat of the neutral
4735 scalar value if we have one, otherwise the initial scalar value
4736 is itself a neutral value. */
4737 tree vector_identity = NULL_TREE;
4738 if (neutral_op)
4739 vector_identity = gimple_build_vector_from_val (&seq, vectype,
4740 neutral_op);
4741 for (unsigned int i = 0; i < group_size; ++i)
4742 {
4743 /* If there's no universal neutral value, we can use the
4744 initial scalar value from the original PHI. This is used
4745 for MIN and MAX reduction, for example. */
4746 if (!neutral_op)
4747 {
4748 tree scalar_value
4749 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
4750 loop_preheader_edge (loop));
4751 vector_identity = gimple_build_vector_from_val (&seq, vectype,
4752 scalar_value);
4753 }
4754
4755 /* Calculate the equivalent of:
4756
4757 sel[j] = (index[j] == i);
4758
4759 which selects the elements of NEW_PHI_RESULT that should
4760 be included in the result. */
4761 tree compare_val = build_int_cst (index_elt_type, i);
4762 compare_val = build_vector_from_val (index_type, compare_val);
4763 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
4764 index, compare_val);
4765
4766 /* Calculate the equivalent of:
4767
4768 vec = seq ? new_phi_result : vector_identity;
4769
4770 VEC is now suitable for a full vector reduction. */
4771 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
4772 sel, new_phi_result, vector_identity);
4773
4774 /* Do the reduction and convert it to the appropriate type. */
4775 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
4776 TREE_TYPE (vectype), vec);
4777 scalar = gimple_convert (&seq, scalar_type, scalar);
4778 scalar_results.safe_push (scalar);
4779 }
4780 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
4781 }
4782 else
4783 {
4784 bool reduce_with_shift;
4785 tree vec_temp;
4786
4787 /* See if the target wants to do the final (shift) reduction
4788 in a vector mode of smaller size and first reduce upper/lower
4789 halves against each other. */
4790 enum machine_mode mode1 = mode;
4791 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
4792 unsigned sz1 = sz;
4793 if (!slp_reduc
4794 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
4795 sz1 = GET_MODE_SIZE (mode1).to_constant ();
4796
4797 tree vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
4798 reduce_with_shift = have_whole_vector_shift (mode1);
4799 if (!VECTOR_MODE_P (mode1))
4800 reduce_with_shift = false;
4801 else
4802 {
4803 optab optab = optab_for_tree_code (code, vectype1, optab_default);
4804 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
4805 reduce_with_shift = false;
4806 }
4807
4808 /* First reduce the vector to the vector size we should do the shift
4809 reduction on, by repeatedly combining upper and lower halves. */
4810 new_temp = new_phi_result;
4811 while (sz > sz1)
4812 {
4813 gcc_assert (!slp_reduc);
4814 sz /= 2;
4815 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
4816
4817 /* The target has to make sure we support lowpart/highpart
4818 extraction, either via direct vector extract or through
4819 integer mode punning. */
4820 tree dst1, dst2;
4821 if (convert_optab_handler (vec_extract_optab,
4822 TYPE_MODE (TREE_TYPE (new_temp)),
4823 TYPE_MODE (vectype1))
4824 != CODE_FOR_nothing)
4825 {
4826 /* Extract sub-vectors directly once vec_extract becomes
4827 a conversion optab. */
4828 dst1 = make_ssa_name (vectype1);
4829 epilog_stmt
4830 = gimple_build_assign (dst1, BIT_FIELD_REF,
4831 build3 (BIT_FIELD_REF, vectype1,
4832 new_temp, TYPE_SIZE (vectype1),
4833 bitsize_int (0)));
4834 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4835 dst2 = make_ssa_name (vectype1);
4836 epilog_stmt
4837 = gimple_build_assign (dst2, BIT_FIELD_REF,
4838 build3 (BIT_FIELD_REF, vectype1,
4839 new_temp, TYPE_SIZE (vectype1),
4840 bitsize_int (sz * BITS_PER_UNIT)));
4841 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4842 }
4843 else
4844 {
4845 /* Extract via punning to appropriately sized integer mode
4846 vector. */
4847 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
4848 1);
4849 tree etype = build_vector_type (eltype, 2);
4850 gcc_assert (convert_optab_handler (vec_extract_optab,
4851 TYPE_MODE (etype),
4852 TYPE_MODE (eltype))
4853 != CODE_FOR_nothing);
4854 tree tem = make_ssa_name (etype);
4855 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
4856 build1 (VIEW_CONVERT_EXPR,
4857 etype, new_temp));
4858 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4859 new_temp = tem;
4860 tem = make_ssa_name (eltype);
4861 epilog_stmt
4862 = gimple_build_assign (tem, BIT_FIELD_REF,
4863 build3 (BIT_FIELD_REF, eltype,
4864 new_temp, TYPE_SIZE (eltype),
4865 bitsize_int (0)));
4866 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4867 dst1 = make_ssa_name (vectype1);
4868 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
4869 build1 (VIEW_CONVERT_EXPR,
4870 vectype1, tem));
4871 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4872 tem = make_ssa_name (eltype);
4873 epilog_stmt
4874 = gimple_build_assign (tem, BIT_FIELD_REF,
4875 build3 (BIT_FIELD_REF, eltype,
4876 new_temp, TYPE_SIZE (eltype),
4877 bitsize_int (sz * BITS_PER_UNIT)));
4878 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4879 dst2 = make_ssa_name (vectype1);
4880 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
4881 build1 (VIEW_CONVERT_EXPR,
4882 vectype1, tem));
4883 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4884 }
4885
4886 new_temp = make_ssa_name (vectype1);
4887 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
4888 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4889 }
4890
4891 if (reduce_with_shift && !slp_reduc)
4892 {
4893 int element_bitsize = tree_to_uhwi (bitsize);
4894 /* Enforced by vectorizable_reduction, which disallows SLP reductions
4895 for variable-length vectors and also requires direct target support
4896 for loop reductions. */
4897 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
4898 int nelements = vec_size_in_bits / element_bitsize;
4899 vec_perm_builder sel;
4900 vec_perm_indices indices;
4901
4902 int elt_offset;
4903
4904 tree zero_vec = build_zero_cst (vectype1);
4905 /* Case 2: Create:
4906 for (offset = nelements/2; offset >= 1; offset/=2)
4907 {
4908 Create: va' = vec_shift <va, offset>
4909 Create: va = vop <va, va'>
4910 } */
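 /* For instance, for a four-element PLUS reduction of {a,b,c,d}:
 shift by 2 gives {c,d,0,0}, adding yields {a+c, b+d, c, d};
 shift by 1 gives {b+d, c, d, 0}, adding yields a+b+c+d in
 lane 0, which step 2.4 below then extracts. */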
4911
4912 tree rhs;
4913
4914 if (dump_enabled_p ())
4915 dump_printf_loc (MSG_NOTE, vect_location,
4916 "Reduce using vector shifts\n");
4917
4918 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
4919 for (elt_offset = nelements / 2;
4920 elt_offset >= 1;
4921 elt_offset /= 2)
4922 {
4923 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
4924 indices.new_vector (sel, 2, nelements);
4925 tree mask = vect_gen_perm_mask_any (vectype1, indices);
4926 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
4927 new_temp, zero_vec, mask);
4928 new_name = make_ssa_name (vec_dest, epilog_stmt);
4929 gimple_assign_set_lhs (epilog_stmt, new_name);
4930 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4931
4932 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
4933 new_temp);
4934 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4935 gimple_assign_set_lhs (epilog_stmt, new_temp);
4936 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4937 }
4938
4939 /* 2.4 Extract the final scalar result. Create:
4940 s_out3 = extract_field <v_out2, bitpos> */
4941
4942 if (dump_enabled_p ())
4943 dump_printf_loc (MSG_NOTE, vect_location,
4944 "extract scalar result\n");
4945
4946 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
4947 bitsize, bitsize_zero_node);
4948 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4949 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4950 gimple_assign_set_lhs (epilog_stmt, new_temp);
4951 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4952 scalar_results.safe_push (new_temp);
4953 }
4954 else
4955 {
4956 /* Case 3: Create:
4957 s = extract_field <v_out2, 0>
4958 for (offset = element_size;
4959 offset < vector_size;
4960 offset += element_size)
4961 {
4962 Create: s' = extract_field <v_out2, offset>
4963 Create: s = op <s, s'> // For non SLP cases
4964 } */
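/* For illustration (four 32-bit elements and a non-SLP PLUS reduction are
   assumed only for the example): the code below effectively emits
     s  = BIT_FIELD_REF <v, 32, 0>
     s1 = BIT_FIELD_REF <v, 32, 32>   s = s + s1
     s2 = BIT_FIELD_REF <v, 32, 64>   s = s + s2
     s3 = BIT_FIELD_REF <v, 32, 96>   s = s + s3
   i.e. one extract plus one scalar operation per vector element.  */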
4965
4966 if (dump_enabled_p ())
4967 dump_printf_loc (MSG_NOTE, vect_location,
4968 "Reduce using scalar code.\n");
4969
4970 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
4971 int element_bitsize = tree_to_uhwi (bitsize);
4972 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4973 {
4974 int bit_offset;
4975 if (gimple_code (new_phi) == GIMPLE_PHI)
4976 vec_temp = PHI_RESULT (new_phi);
4977 else
4978 vec_temp = gimple_assign_lhs (new_phi);
4979 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4980 bitsize_zero_node);
4981 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4982 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4983 gimple_assign_set_lhs (epilog_stmt, new_temp);
4984 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4985
4986 /* In SLP we don't need to apply the reduction operation, so we just
4987 collect the s' values in SCALAR_RESULTS. */
4988 if (slp_reduc)
4989 scalar_results.safe_push (new_temp);
4990
4991 for (bit_offset = element_bitsize;
4992 bit_offset < vec_size_in_bits;
4993 bit_offset += element_bitsize)
4994 {
4995 tree bitpos = bitsize_int (bit_offset);
4996 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4997 bitsize, bitpos);
4998
4999 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5000 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5001 gimple_assign_set_lhs (epilog_stmt, new_name);
5002 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5003
5004 if (slp_reduc)
5005 {
5006 /* In SLP we don't need to apply the reduction operation, so
5007 we just collect the s' values in SCALAR_RESULTS. */
5008 new_temp = new_name;
5009 scalar_results.safe_push (new_name);
5010 }
5011 else
5012 {
5013 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5014 new_name, new_temp);
5015 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5016 gimple_assign_set_lhs (epilog_stmt, new_temp);
5017 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5018 }
5019 }
5020 }
5021
5022 /* The only case where we need to reduce scalar results in SLP is
5023 unrolling. If the size of SCALAR_RESULTS is greater than
5024 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5025 REDUC_GROUP_SIZE. */
5026 if (slp_reduc)
5027 {
5028 tree res, first_res, new_res;
5029 gimple *new_stmt;
5030
5031 /* Reduce multiple scalar results in case of SLP unrolling. */
5032 for (j = group_size; scalar_results.iterate (j, &res);
5033 j++)
5034 {
5035 first_res = scalar_results[j % group_size];
5036 new_stmt = gimple_build_assign (new_scalar_dest, code,
5037 first_res, res);
5038 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5039 gimple_assign_set_lhs (new_stmt, new_res);
5040 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5041 scalar_results[j % group_size] = new_res;
5042 }
5043 }
5044 else
5045 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5046 scalar_results.safe_push (new_temp);
5047 }
5048
5049 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5050 && induc_val)
5051 {
5052 /* Earlier we set the initial value to be a vector of induc_val
5053 values. Check the result, and if it is induc_val then replace
5054 it with the original initial value, unless induc_val is
5055 the same as initial_def already. */
5056 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5057 induc_val);
5058
5059 tree tmp = make_ssa_name (new_scalar_dest);
5060 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5061 initial_def, new_temp);
5062 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5063 scalar_results[0] = tmp;
5064 }
5065 }
5066
5067 /* 2.5 Adjust the final result by the initial value of the reduction
5068 variable. (When such adjustment is not needed, then
5069 'adjustment_def' is zero). For example, if code is PLUS we create:
5070 new_temp = loop_exit_def + adjustment_def */
5071
5072 if (adjustment_def)
5073 {
5074 gcc_assert (!slp_reduc);
5075 if (nested_in_vect_loop)
5076 {
5077 new_phi = new_phis[0];
5078 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5079 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5080 new_dest = vect_create_destination_var (scalar_dest, vectype);
5081 }
5082 else
5083 {
5084 new_temp = scalar_results[0];
5085 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5086 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5087 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5088 }
5089
5090 epilog_stmt = gimple_build_assign (new_dest, expr);
5091 new_temp = make_ssa_name (new_dest, epilog_stmt);
5092 gimple_assign_set_lhs (epilog_stmt, new_temp);
5093 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5094 if (nested_in_vect_loop)
5095 {
5096 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5097 STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5098 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5099
5100 if (!double_reduc)
5101 scalar_results.quick_push (new_temp);
5102 else
5103 scalar_results[0] = new_temp;
5104 }
5105 else
5106 scalar_results[0] = new_temp;
5107
5108 new_phis[0] = epilog_stmt;
5109 }
5110
5111 if (double_reduc)
5112 loop = loop->inner;
5113
5114 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5115 phis with new adjusted scalar results, i.e., replace use <s_out0>
5116 with use <s_out4>.
5117
5118 Transform:
5119 loop_exit:
5120 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5121 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5122 v_out2 = reduce <v_out1>
5123 s_out3 = extract_field <v_out2, 0>
5124 s_out4 = adjust_result <s_out3>
5125 use <s_out0>
5126 use <s_out0>
5127
5128 into:
5129
5130 loop_exit:
5131 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5132 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5133 v_out2 = reduce <v_out1>
5134 s_out3 = extract_field <v_out2, 0>
5135 s_out4 = adjust_result <s_out3>
5136 use <s_out4>
5137 use <s_out4> */
5138
5139
5140 /* In an SLP reduction chain we reduce vector results into one vector if
5141 necessary, hence we set REDUC_GROUP_SIZE to 1 here. SCALAR_DEST is the
5142 LHS of the last stmt in the reduction chain, since we are looking for
5143 the loop exit phi node. */
5144 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5145 {
5146 stmt_vec_info dest_stmt_info
5147 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5148 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5149 group_size = 1;
5150 }
5151
5152 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5153 case REDUC_GROUP_SIZE is greater than the vectorization factor).
5154 Therefore, we need to match SCALAR_RESULTS with the corresponding statements.
5155 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5156 correspond to the first vector stmt, etc.
5157 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
5158 if (group_size > new_phis.length ())
5159 gcc_assert (!(group_size % new_phis.length ()));
5160
5161 for (k = 0; k < group_size; k++)
5162 {
5163 if (slp_reduc)
5164 {
5165 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5166
5167 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5168 /* SLP statements can't participate in patterns. */
5169 gcc_assert (!orig_stmt_info);
5170 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5171 }
5172
5173 if (nested_in_vect_loop)
5174 {
5175 if (double_reduc)
5176 loop = outer_loop;
5177 else
5178 gcc_unreachable ();
5179 }
5180
5181 phis.create (3);
5182 /* Find the loop-closed-use at the loop exit of the original scalar
5183 result. (The reduction result is expected to have two immediate uses,
5184 one at the latch block, and one at the loop exit). For double
5185 reductions we are looking for exit phis of the outer loop. */
5186 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5187 {
5188 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5189 {
5190 if (!is_gimple_debug (USE_STMT (use_p)))
5191 phis.safe_push (USE_STMT (use_p));
5192 }
5193 else
5194 {
5195 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5196 {
5197 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5198
5199 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5200 {
5201 if (!flow_bb_inside_loop_p (loop,
5202 gimple_bb (USE_STMT (phi_use_p)))
5203 && !is_gimple_debug (USE_STMT (phi_use_p)))
5204 phis.safe_push (USE_STMT (phi_use_p));
5205 }
5206 }
5207 }
5208 }
5209
5210 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5211 {
5212 /* Replace the uses: */
5213 orig_name = PHI_RESULT (exit_phi);
5214 scalar_result = scalar_results[k];
5215 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5216 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5217 SET_USE (use_p, scalar_result);
5218 }
5219
5220 phis.release ();
5221 }
5222 }
5223
5224 /* Return a vector of type VECTYPE that is equal to the vector select
5225 operation "MASK ? VEC : IDENTITY". Insert the select statements
5226 before GSI. */
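/* For illustration: the statement emitted here has the form
     cond_N = VEC_COND_EXPR <mask, vec, identity>
   where cond_N sketches the generated SSA name, so inactive lanes take the
   caller-supplied identity value (e.g. zero for a PLUS reduction) instead
   of whatever VEC contains.  */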
5227
5228 static tree
5229 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5230 tree vec, tree identity)
5231 {
5232 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5233 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5234 mask, vec, identity);
5235 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5236 return cond;
5237 }
5238
5239 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5240 order, starting with LHS. Insert the extraction statements before GSI and
5241 associate the new scalar SSA names with variable SCALAR_DEST.
5242 Return the SSA name for the result. */
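/* For illustration (a 4-element vector is assumed only for the example):
   with VECTOR_RHS = {v0, v1, v2, v3} the loop below computes
     result = (((LHS code v0) code v1) code v2) code v3
   one element at a time, preserving the strict left-to-right evaluation
   order that in-order (fold-left) reductions require.  */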
5243
5244 static tree
5245 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5246 tree_code code, tree lhs, tree vector_rhs)
5247 {
5248 tree vectype = TREE_TYPE (vector_rhs);
5249 tree scalar_type = TREE_TYPE (vectype);
5250 tree bitsize = TYPE_SIZE (scalar_type);
5251 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5252 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5253
5254 for (unsigned HOST_WIDE_INT bit_offset = 0;
5255 bit_offset < vec_size_in_bits;
5256 bit_offset += element_bitsize)
5257 {
5258 tree bitpos = bitsize_int (bit_offset);
5259 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5260 bitsize, bitpos);
5261
5262 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5263 rhs = make_ssa_name (scalar_dest, stmt);
5264 gimple_assign_set_lhs (stmt, rhs);
5265 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5266
5267 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5268 tree new_name = make_ssa_name (scalar_dest, stmt);
5269 gimple_assign_set_lhs (stmt, new_name);
5270 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5271 lhs = new_name;
5272 }
5273 return lhs;
5274 }
5275
5276 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
5277 type of the vector input. */
5278
5279 static internal_fn
5280 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
5281 {
5282 internal_fn mask_reduc_fn;
5283
5284 switch (reduc_fn)
5285 {
5286 case IFN_FOLD_LEFT_PLUS:
5287 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
5288 break;
5289
5290 default:
5291 return IFN_LAST;
5292 }
5293
5294 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
5295 OPTIMIZE_FOR_SPEED))
5296 return mask_reduc_fn;
5297 return IFN_LAST;
5298 }
5299
5300 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5301 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5302 statement. CODE is the operation performed by STMT_INFO and OPS are
5303 its scalar operands. REDUC_INDEX is the index of the operand in
5304 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5305 implements in-order reduction, or IFN_LAST if we should open-code it.
5306 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5307 that should be used to control the operation in a fully-masked loop. */
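/* For illustration (the scalar loop and PLUS are assumed only for the
   example): for an in-order float summation such as
     for (i = 0; i < n; i++)
       res += a[i];
   each vectorized copy becomes roughly
     res_new = IFN_FOLD_LEFT_PLUS (res_old, vec_a);
   in a fully-masked loop IFN_MASK_FOLD_LEFT_PLUS is used when available,
   otherwise inactive lanes are first replaced with zero via
   merge_with_identity before the unmasked reduction.  */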
5308
5309 static bool
5310 vectorize_fold_left_reduction (stmt_vec_info stmt_info,
5311 gimple_stmt_iterator *gsi,
5312 stmt_vec_info *vec_stmt, slp_tree slp_node,
5313 gimple *reduc_def_stmt,
5314 tree_code code, internal_fn reduc_fn,
5315 tree ops[3], tree vectype_in,
5316 int reduc_index, vec_loop_masks *masks)
5317 {
5318 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5319 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5320 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5321 stmt_vec_info new_stmt_info = NULL;
5322 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
5323
5324 int ncopies;
5325 if (slp_node)
5326 ncopies = 1;
5327 else
5328 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5329
5330 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5331 gcc_assert (ncopies == 1);
5332 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5333
5334 if (slp_node)
5335 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5336 TYPE_VECTOR_SUBPARTS (vectype_in)));
5337
5338 tree op0 = ops[1 - reduc_index];
5339
5340 int group_size = 1;
5341 stmt_vec_info scalar_dest_def_info;
5342 auto_vec<tree> vec_oprnds0;
5343 if (slp_node)
5344 {
5345 auto_vec<vec<tree> > vec_defs (2);
5346 auto_vec<tree> sops(2);
5347 sops.quick_push (ops[0]);
5348 sops.quick_push (ops[1]);
5349 vect_get_slp_defs (sops, slp_node, &vec_defs);
5350 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
5351 vec_defs[0].release ();
5352 vec_defs[1].release ();
5353 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5354 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5355 }
5356 else
5357 {
5358 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt_info);
5359 vec_oprnds0.create (1);
5360 vec_oprnds0.quick_push (loop_vec_def0);
5361 scalar_dest_def_info = stmt_info;
5362 }
5363
5364 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5365 tree scalar_type = TREE_TYPE (scalar_dest);
5366 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5367
5368 int vec_num = vec_oprnds0.length ();
5369 gcc_assert (vec_num == 1 || slp_node);
5370 tree vec_elem_type = TREE_TYPE (vectype_out);
5371 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5372
5373 tree vector_identity = NULL_TREE;
5374 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5375 vector_identity = build_zero_cst (vectype_out);
5376
5377 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5378 int i;
5379 tree def0;
5380 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5381 {
5382 gimple *new_stmt;
5383 tree mask = NULL_TREE;
5384 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5385 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5386
5387 /* Handle MINUS by adding the negative. */
5388 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5389 {
5390 tree negated = make_ssa_name (vectype_out);
5391 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5392 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5393 def0 = negated;
5394 }
5395
5396 if (mask && mask_reduc_fn == IFN_LAST)
5397 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5398 vector_identity);
5399
5400 /* On the first iteration the input is simply the scalar phi
5401 result, and for subsequent iterations it is the output of
5402 the preceding operation. */
5403 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
5404 {
5405 if (mask && mask_reduc_fn != IFN_LAST)
5406 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
5407 def0, mask);
5408 else
5409 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
5410 def0);
5411 /* For chained SLP reductions the output of the previous reduction
5412 operation serves as the input of the next. For the final statement
5413 the output cannot be a temporary - we reuse the original
5414 scalar destination of the last statement. */
5415 if (i != vec_num - 1)
5416 {
5417 gimple_set_lhs (new_stmt, scalar_dest_var);
5418 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5419 gimple_set_lhs (new_stmt, reduc_var);
5420 }
5421 }
5422 else
5423 {
5424 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5425 reduc_var, def0);
5426 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5427 /* Remove the statement, so that we can use the same code paths
5428 as for statements that we've just created. */
5429 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5430 gsi_remove (&tmp_gsi, true);
5431 }
5432
5433 if (i == vec_num - 1)
5434 {
5435 gimple_set_lhs (new_stmt, scalar_dest);
5436 new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
5437 new_stmt);
5438 }
5439 else
5440 new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
5441 new_stmt, gsi);
5442
5443 if (slp_node)
5444 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
5445 }
5446
5447 if (!slp_node)
5448 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
5449
5450 return true;
5451 }
5452
5453 /* Function is_nonwrapping_integer_induction.
5454
5455 Check if STMT_VINFO (which is part of loop LOOP) is an integer induction
5456 that increments and does not cause overflow. */
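/* For illustration (the concrete numbers are assumed only for the example):
   for a wrapping (e.g. unsigned 32-bit) induction with base 0 and step 4
   in a loop executing at most 1000 times, the largest value reached is
   0 + 4 * 1000 = 4000, which fits the 32-bit precision, so the function
   returns true; with a step of 1 << 30 the product would exceed 32 bits
   and the function would return false.  */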
5457
5458 static bool
5459 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
5460 {
5461 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
5462 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5463 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5464 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
5465 widest_int ni, max_loop_value, lhs_max;
5466 wi::overflow_type overflow = wi::OVF_NONE;
5467
5468 /* Make sure the loop is integer based. */
5469 if (TREE_CODE (base) != INTEGER_CST
5470 || TREE_CODE (step) != INTEGER_CST)
5471 return false;
5472
5473 /* Check that the max size of the loop will not wrap. */
5474
5475 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5476 return true;
5477
5478 if (! max_stmt_executions (loop, &ni))
5479 return false;
5480
5481 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5482 &overflow);
5483 if (overflow)
5484 return false;
5485
5486 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5487 TYPE_SIGN (lhs_type), &overflow);
5488 if (overflow)
5489 return false;
5490
5491 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5492 <= TYPE_PRECISION (lhs_type));
5493 }
5494
5495 /* Check if masking can be supported by inserting a conditional expression.
5496 CODE is the code for the operation. COND_FN is the conditional internal
5497 function, if it exists. VECTYPE_IN is the type of the vector input. */
5498 static bool
5499 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
5500 tree vectype_in)
5501 {
5502 if (cond_fn != IFN_LAST
5503 && direct_internal_fn_supported_p (cond_fn, vectype_in,
5504 OPTIMIZE_FOR_SPEED))
5505 return false;
5506
5507 switch (code)
5508 {
5509 case DOT_PROD_EXPR:
5510 case SAD_EXPR:
5511 return true;
5512
5513 default:
5514 return false;
5515 }
5516 }
5517
5518 /* Insert a conditional expression to enable masked vectorization. CODE is the
5519 code for the operation. VOP is the array of operands. MASK is the loop
5520 mask. GSI is a statement iterator used to place the new conditional
5521 expression. */
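/* For illustration: the selects built below make inactive lanes contribute
   the neutral element of the lane-reducing operation, e.g. for DOT_PROD_EXPR
     masked_op1 = VEC_COND_EXPR <mask, op1, {0, ...}>
   so masked-out lanes add 0 to the accumulator, while for SAD_EXPR op1 is
   replaced by op0 so that the absolute difference in those lanes is 0.  */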
5522 static void
5523 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
5524 gimple_stmt_iterator *gsi)
5525 {
5526 switch (code)
5527 {
5528 case DOT_PROD_EXPR:
5529 {
5530 tree vectype = TREE_TYPE (vop[1]);
5531 tree zero = build_zero_cst (vectype);
5532 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
5533 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
5534 mask, vop[1], zero);
5535 gsi_insert_before (gsi, select, GSI_SAME_STMT);
5536 vop[1] = masked_op1;
5537 break;
5538 }
5539
5540 case SAD_EXPR:
5541 {
5542 tree vectype = TREE_TYPE (vop[1]);
5543 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
5544 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
5545 mask, vop[1], vop[0]);
5546 gsi_insert_before (gsi, select, GSI_SAME_STMT);
5547 vop[1] = masked_op1;
5548 break;
5549 }
5550
5551 default:
5552 gcc_unreachable ();
5553 }
5554 }
5555
5556 /* Function vectorizable_reduction.
5557
5558 Check if STMT_INFO performs a reduction operation that can be vectorized.
5559 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
5560 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5561 Return true if STMT_INFO is vectorizable in this way.
5562
5563 This function also handles reduction idioms (patterns) that have been
5564 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
5565 may be of this form:
5566 X = pattern_expr (arg0, arg1, ..., X)
5567 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5568 sequence that had been detected and replaced by the pattern-stmt
5569 (STMT_INFO).
5570
5571 This function also handles reduction of condition expressions, for example:
5572 for (int i = 0; i < N; i++)
5573 if (a[i] < value)
5574 last = a[i];
5575 This is handled by vectorising the loop and creating an additional vector
5576 containing the loop indexes for which "a[i] < value" was true. In the
5577 function epilogue this is reduced to a single max value and then used to
5578 index into the vector of results.
5579
5580 In some cases of reduction patterns, the type of the reduction variable X is
5581 different than the type of the other arguments of STMT_INFO.
5582 In such cases, the vectype that is used when transforming STMT_INFO into
5583 a vector stmt is different than the vectype that is used to determine the
5584 vectorization factor, because it consists of a different number of elements
5585 than the actual number of elements that are being operated upon in parallel.
5586
5587 For example, consider an accumulation of shorts into an int accumulator.
5588 On some targets it's possible to vectorize this pattern operating on 8
5589 shorts at a time (hence, the vectype for purposes of determining the
5590 vectorization factor should be V8HI); on the other hand, the vectype that
5591 is used to create the vector form is actually V4SI (the type of the result).
5592
5593 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5594 indicates the actual level of parallelism (V8HI in the example), so
5595 that the right vectorization factor can be derived. This vectype
5596 corresponds to the type of arguments to the reduction stmt, and should *NOT*
5597 be used to create the vectorized stmt. The right vectype for the vectorized
5598 stmt is obtained from the type of the result X:
5599 get_vectype_for_scalar_type (TREE_TYPE (X))
5600
5601 This means that, contrary to "regular" reductions (or "regular" stmts in
5602 general), the following equation:
5603 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5604 does *NOT* necessarily hold for reduction patterns. */
5605
5606 bool
5607 vectorizable_reduction (stmt_vec_info stmt_info, slp_tree slp_node,
5608 slp_instance slp_node_instance,
5609 stmt_vector_for_cost *cost_vec)
5610 {
5611 tree scalar_dest;
5612 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5613 tree vectype_in = NULL_TREE;
5614 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5615 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5616 enum tree_code code;
5617 int op_type;
5618 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
5619 stmt_vec_info cond_stmt_vinfo = NULL;
5620 tree scalar_type;
5621 int i;
5622 int ncopies;
5623 bool single_defuse_cycle = false;
5624 tree ops[3];
5625 enum vect_def_type dts[3];
5626 bool nested_cycle = false, found_nested_cycle_def = false;
5627 bool double_reduc = false;
5628 int vec_num;
5629 tree tem;
5630 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5631 tree cond_reduc_val = NULL_TREE;
5632
5633 /* Make sure it was already recognized as a reduction computation. */
5634 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
5635 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
5636 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
5637 return false;
5638
5639 /* The stmt we store reduction analysis meta on. */
5640 stmt_vec_info reduc_info = info_for_reduction (stmt_info);
5641 reduc_info->is_reduc_info = true;
5642
5643 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5644 {
5645 if (is_a <gphi *> (stmt_info->stmt))
5646 /* Analysis for double-reduction is done on the outer
5647 loop PHI; nested cycles have no further restrictions. */
5648 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
5649 else
5650 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5651 return true;
5652 }
5653
5654 stmt_vec_info orig_stmt_of_analysis = stmt_info;
5655 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
5656 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5657 {
5658 if (!is_a <gphi *> (stmt_info->stmt))
5659 {
5660 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5661 return true;
5662 }
5663 if (slp_node)
5664 {
5665 slp_node_instance->reduc_phis = slp_node;
5666 /* ??? We're leaving slp_node to point to the PHIs; we only
5667 need it to get at the number of vector stmts, which wasn't
5668 yet initialized for the instance root. */
5669 }
5670 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5671 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
5672 else /* STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def */
5673 {
5674 use_operand_p use_p;
5675 gimple *use_stmt;
5676 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
5677 &use_p, &use_stmt);
5678 gcc_assert (res);
5679 stmt_info = loop_vinfo->lookup_stmt (use_stmt);
5680 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
5681 }
5682 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
5683 element. */
5684 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5685 {
5686 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
5687 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
5688 }
5689 }
5690
5691 if (nested_in_vect_loop_p (loop, stmt_info))
5692 {
5693 loop = loop->inner;
5694 nested_cycle = true;
5695 }
5696
5697 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5698 gcc_assert (slp_node
5699 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
5700
5701 /* 1. Is vectorizable reduction? */
5702 /* Not supportable if the reduction variable is used in the loop, unless
5703 it's a reduction chain. */
5704 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
5705 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5706 return false;
5707
5708 /* Reductions that are not used even in an enclosing outer-loop
5709 are expected to be "live" (used out of the loop). */
5710 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
5711 && !STMT_VINFO_LIVE_P (stmt_info))
5712 return false;
5713
5714 /* 2. Has this been recognized as a reduction pattern?
5715
5716 Check if STMT represents a pattern that has been recognized
5717 in earlier analysis stages. For stmts that represent a pattern,
5718 the STMT_VINFO_RELATED_STMT field records the last stmt in
5719 the original sequence that constitutes the pattern. */
5720
5721 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
5722 if (orig_stmt_info)
5723 {
5724 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5725 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
5726 }
5727
5728 /* 3. Check the operands of the operation. The first operands are defined
5729 inside the loop body. The last operand is the reduction variable,
5730 which is defined by the loop-header-phi. */
5731
5732 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
5733
5734 /* Flatten RHS. */
5735 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
5736 {
5737 case GIMPLE_BINARY_RHS:
5738 code = gimple_assign_rhs_code (stmt);
5739 op_type = TREE_CODE_LENGTH (code);
5740 gcc_assert (op_type == binary_op);
5741 ops[0] = gimple_assign_rhs1 (stmt);
5742 ops[1] = gimple_assign_rhs2 (stmt);
5743 break;
5744
5745 case GIMPLE_TERNARY_RHS:
5746 code = gimple_assign_rhs_code (stmt);
5747 op_type = TREE_CODE_LENGTH (code);
5748 gcc_assert (op_type == ternary_op);
5749 ops[0] = gimple_assign_rhs1 (stmt);
5750 ops[1] = gimple_assign_rhs2 (stmt);
5751 ops[2] = gimple_assign_rhs3 (stmt);
5752 break;
5753
5754 case GIMPLE_UNARY_RHS:
5755 case GIMPLE_SINGLE_RHS:
5756 return false;
5757
5758 default:
5759 gcc_unreachable ();
5760 }
5761
5762 if (code == COND_EXPR && slp_node)
5763 return false;
5764
5765 scalar_dest = gimple_assign_lhs (stmt);
5766 scalar_type = TREE_TYPE (scalar_dest);
5767 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
5768 && !SCALAR_FLOAT_TYPE_P (scalar_type))
5769 return false;
5770
5771 /* Do not try to vectorize bit-precision reductions. */
5772 if (!type_has_mode_precision_p (scalar_type))
5773 return false;
5774
5775 /* All uses but the last are expected to be defined in the loop.
5776 The last use is the reduction variable. In case of nested cycle this
5777 assumption is not true: we use reduc_index to record the index of the
5778 reduction variable. */
5779 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
5780 /* PHIs should not participate in patterns. */
5781 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
5782 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
5783 tree reduc_def = PHI_RESULT (reduc_def_phi);
5784 int reduc_index = -1;
5785 for (i = 0; i < op_type; i++)
5786 {
5787 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
5788 if (i == 0 && code == COND_EXPR)
5789 continue;
5790
5791 stmt_vec_info def_stmt_info;
5792 if (!vect_is_simple_use (ops[i], loop_vinfo, &dts[i], &tem,
5793 &def_stmt_info))
5794 {
5795 if (dump_enabled_p ())
5796 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5797 "use not simple.\n");
5798 return false;
5799 }
5800 dt = dts[i];
5801 if (dt == vect_reduction_def
5802 && ops[i] == reduc_def)
5803 {
5804 reduc_index = i;
5805 continue;
5806 }
5807 else if (tem)
5808 {
5809 /* To properly compute ncopies we are interested in the widest
5810 input type in case we're looking at a widening accumulation. */
5811 if (!vectype_in
5812 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
5813 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
5814 vectype_in = tem;
5815 }
5816
5817 if (dt != vect_internal_def
5818 && dt != vect_external_def
5819 && dt != vect_constant_def
5820 && dt != vect_induction_def
5821 && !(dt == vect_nested_cycle && nested_cycle))
5822 return false;
5823
5824 if (dt == vect_nested_cycle
5825 && ops[i] == reduc_def)
5826 {
5827 found_nested_cycle_def = true;
5828 reduc_index = i;
5829 }
5830
5831 if (code == COND_EXPR)
5832 {
5833 /* Record how the non-reduction-def value of COND_EXPR is defined. */
5834 if (dt == vect_constant_def)
5835 {
5836 cond_reduc_dt = dt;
5837 cond_reduc_val = ops[i];
5838 }
5839 if (dt == vect_induction_def
5840 && def_stmt_info
5841 && is_nonwrapping_integer_induction (def_stmt_info, loop))
5842 {
5843 cond_reduc_dt = dt;
5844 cond_stmt_vinfo = def_stmt_info;
5845 }
5846 }
5847 }
5848 if (!vectype_in)
5849 vectype_in = vectype_out;
5850 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
5851 /* For the SSA cycle we store on each participating stmt the operand index
5852 where the cycle continues. Store the one relevant for the actual
5853 operation in the reduction meta. */
5854 STMT_VINFO_REDUC_IDX (reduc_info) = reduc_index;
5855
5856 if (!(reduc_index == -1
5857 || dts[reduc_index] == vect_reduction_def
5858 || dts[reduc_index] == vect_nested_cycle
5859 || ((dts[reduc_index] == vect_internal_def
5860 || dts[reduc_index] == vect_external_def
5861 || dts[reduc_index] == vect_constant_def
5862 || dts[reduc_index] == vect_induction_def)
5863 && nested_cycle && found_nested_cycle_def)))
5864 {
5865 /* For pattern recognized stmts, orig_stmt might be a reduction,
5866 but some helper statements for the pattern might not, or
5867 might be COND_EXPRs with reduction uses in the condition. */
5868 gcc_assert (orig_stmt_info);
5869 return false;
5870 }
5871
5872 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
5873 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
5874 /* If we have a condition reduction, see if we can simplify it further. */
5875 if (v_reduc_type == COND_REDUCTION)
5876 {
5877 /* TODO: We can't yet handle reduction chains, since we need to treat
5878 each COND_EXPR in the chain specially, not just the last one.
5879 E.g. for:
5880
5881 x_1 = PHI <x_3, ...>
5882 x_2 = a_2 ? ... : x_1;
5883 x_3 = a_3 ? ... : x_2;
5884
5885 we're interested in the last element in x_3 for which a_2 || a_3
5886 is true, whereas the current reduction chain handling would
5887 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
5888 as a reduction operation. */
5889 if (reduc_index == -1)
5890 {
5891 if (dump_enabled_p ())
5892 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5893 "conditional reduction chains not supported\n");
5894 return false;
5895 }
5896
5897 /* When the COND_EXPR condition itself uses the reduction value, fail. */
5898 if (reduc_index == 0)
5899 {
5900 if (dump_enabled_p ())
5901 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5902 "condition depends on previous iteration\n");
5903 return false;
5904 }
5905
5906 if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
5907 vectype_in, OPTIMIZE_FOR_SPEED))
5908 {
5909 if (dump_enabled_p ())
5910 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5911 "optimizing condition reduction with"
5912 " FOLD_EXTRACT_LAST.\n");
5913 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
5914 }
5915 else if (cond_reduc_dt == vect_induction_def)
5916 {
5917 tree base
5918 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
5919 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
5920
5921 gcc_assert (TREE_CODE (base) == INTEGER_CST
5922 && TREE_CODE (step) == INTEGER_CST);
5923 cond_reduc_val = NULL_TREE;
5924 enum tree_code cond_reduc_op_code = ERROR_MARK;
5925 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
5926 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
5927 ;
5928 /* Find a suitable value: below base for MAX_EXPR, above base for
5929 MIN_EXPR; for now punt if base is the minimum value of the type
5930 for MAX_EXPR or the maximum value of the type for MIN_EXPR. */
5931 else if (tree_int_cst_sgn (step) == -1)
5932 {
5933 cond_reduc_op_code = MIN_EXPR;
5934 if (tree_int_cst_sgn (base) == -1)
5935 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
5936 else if (tree_int_cst_lt (base,
5937 TYPE_MAX_VALUE (TREE_TYPE (base))))
5938 cond_reduc_val
5939 = int_const_binop (PLUS_EXPR, base, integer_one_node);
5940 }
5941 else
5942 {
5943 cond_reduc_op_code = MAX_EXPR;
5944 if (tree_int_cst_sgn (base) == 1)
5945 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
5946 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
5947 base))
5948 cond_reduc_val
5949 = int_const_binop (MINUS_EXPR, base, integer_one_node);
5950 }
5951 if (cond_reduc_val)
5952 {
5953 if (dump_enabled_p ())
5954 dump_printf_loc (MSG_NOTE, vect_location,
5955 "condition expression based on "
5956 "integer induction.\n");
5957 STMT_VINFO_VEC_COND_REDUC_CODE (reduc_info) = cond_reduc_op_code;
5958 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
5959 = cond_reduc_val;
5960 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
5961 }
5962 }
5963 else if (cond_reduc_dt == vect_constant_def)
5964 {
5965 enum vect_def_type cond_initial_dt;
5966 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
5967 tree cond_initial_val
5968 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
5969
5970 gcc_assert (cond_reduc_val != NULL_TREE);
5971 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
5972 if (cond_initial_dt == vect_constant_def
5973 && types_compatible_p (TREE_TYPE (cond_initial_val),
5974 TREE_TYPE (cond_reduc_val)))
5975 {
5976 tree e = fold_binary (LE_EXPR, boolean_type_node,
5977 cond_initial_val, cond_reduc_val);
5978 if (e && (integer_onep (e) || integer_zerop (e)))
5979 {
5980 if (dump_enabled_p ())
5981 dump_printf_loc (MSG_NOTE, vect_location,
5982 "condition expression based on "
5983 "compile time constant.\n");
5984 /* Record reduction code at analysis stage. */
5985 STMT_VINFO_VEC_COND_REDUC_CODE (reduc_info)
5986 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
5987 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
5988 }
5989 }
5990 }
5991 }
5992
5993 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5994 /* We changed STMT to be the first stmt in the reduction chain, hence we
5995 check that in this case the first element in the chain is STMT. */
5996 gcc_assert (REDUC_GROUP_FIRST_ELEMENT (STMT_VINFO_REDUC_DEF (phi_info))
5997 == vect_orig_stmt (stmt_info));
5998
5999 if (STMT_VINFO_LIVE_P (phi_info))
6000 return false;
6001
6002 if (slp_node)
6003 ncopies = 1;
6004 else
6005 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6006
6007 gcc_assert (ncopies >= 1);
6008
6009 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6010
6011 if (nested_cycle)
6012 {
6013 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6014 == vect_double_reduction_def);
6015 double_reduc = true;
6016 }
6017
6018 /* 4.2. Check support for the epilog operation.
6019
6020 If STMT represents a reduction pattern, then the type of the
6021 reduction variable may be different than the type of the rest
6022 of the arguments. For example, consider the case of accumulation
6023 of shorts into an int accumulator. The original code:
6024 S1: int_a = (int) short_a;
6025 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6026
6027 was replaced with:
6028 STMT: int_acc = widen_sum <short_a, int_acc>
6029
6030 This means that:
6031 1. The tree-code that is used to create the vector operation in the
6032 epilog code (that reduces the partial results) is not the
6033 tree-code of STMT, but is rather the tree-code of the original
6034 stmt from the pattern that STMT is replacing. I.e., in the example
6035 above we want to use 'widen_sum' in the loop, but 'plus' in the
6036 epilog.
6037 2. The type (mode) we use to check available target support
6038 for the vector operation to be created in the *epilog*, is
6039 determined by the type of the reduction variable (in the example
6040 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6041 However the type (mode) we use to check available target support
6042 for the vector operation to be created *inside the loop*, is
6043 determined by the type of the other arguments to STMT (in the
6044 example we'd check this: optab_handler (widen_sum_optab,
6045 vect_short_mode)).
6046
6047 This is contrary to "regular" reductions, in which the types of all
6048 the arguments are the same as the type of the reduction variable.
6049 For "regular" reductions we can therefore use the same vector type
6050 (and also the same tree-code) when generating the epilog code and
6051 when generating the code inside the loop. */
6052
6053 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6054 enum tree_code orig_code = ERROR_MARK;
6055 if (reduction_type == CONST_COND_REDUCTION
6056 || reduction_type == INTEGER_INDUC_COND_REDUCTION)
6057 {
6058 /* For simple condition reductions, replace with the actual expression
6059 we want to base our reduction around. */
6060 orig_code = STMT_VINFO_VEC_COND_REDUC_CODE (reduc_info);
6061 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6062 }
6063 else if (reduction_type == COND_REDUCTION)
6064 orig_code = COND_EXPR;
6065 else if (reduction_type == TREE_CODE_REDUCTION
6066 || reduction_type == FOLD_LEFT_REDUCTION)
6067 {
6068 if (orig_stmt_info)
6069 orig_code = gimple_assign_rhs_code (orig_stmt_info->stmt);
6070 else
6071 orig_code = code;
6072 gcc_assert (vectype_out);
6073 if (orig_code == MINUS_EXPR)
6074 orig_code = PLUS_EXPR;
6075 }
6076 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6077
6078 if (reduction_type == TREE_CODE_REDUCTION)
6079 {
6080 /* Check whether it's ok to change the order of the computation.
6081 Generally, when vectorizing a reduction we change the order of the
6082 computation. This may change the behavior of the program in some
6083 cases, so we need to check that this is ok. One exception is when
6084 vectorizing an outer-loop: the inner-loop is executed sequentially,
6085 and therefore vectorizing reductions in the inner-loop during
6086 outer-loop vectorization is safe. */
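/* For illustration: a typical case is a float summation compiled without
   -ffast-math; FP addition is not associative, so
   needs_fold_left_reduction_p returns true and the reduction is carried
   out as an in-order FOLD_LEFT_REDUCTION rather than by a reassociating
   tree reduction.  */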
6087 if (needs_fold_left_reduction_p (scalar_type, orig_code))
6088 {
6089 STMT_VINFO_REDUC_TYPE (reduc_info)
6090 = reduction_type = FOLD_LEFT_REDUCTION;
6091 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6092 directly used in stmt. */
6093 if (reduc_index == -1)
6094 {
6095 if (dump_enabled_p ())
6096 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6097 "in-order reduction chain without SLP.\n");
6098 return false;
6099 }
6100 }
6101 else if (!commutative_tree_code (orig_code)
6102 || !associative_tree_code (orig_code))
6103 {
6104 if (dump_enabled_p ())
6105 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6106 "reduction: not commutative/associative");
6107 return false;
6108 }
6109 }
6110
6111 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6112 && ncopies > 1)
6113 {
6114 if (dump_enabled_p ())
6115 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6116 "multiple types in double reduction or condition "
6117 "reduction or fold-left reduction.\n");
6118 return false;
6119 }
6120
6121 internal_fn reduc_fn = IFN_LAST;
6122 if (reduction_type == TREE_CODE_REDUCTION
6123 || reduction_type == FOLD_LEFT_REDUCTION
6124 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6125 || reduction_type == CONST_COND_REDUCTION)
6126 {
6127 if (reduction_type == FOLD_LEFT_REDUCTION
6128 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6129 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6130 {
6131 if (reduc_fn != IFN_LAST
6132 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6133 OPTIMIZE_FOR_SPEED))
6134 {
6135 if (dump_enabled_p ())
6136 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6137 "reduc op not supported by target.\n");
6138
6139 reduc_fn = IFN_LAST;
6140 }
6141 }
6142 else
6143 {
6144 if (!nested_cycle || double_reduc)
6145 {
6146 if (dump_enabled_p ())
6147 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6148 "no reduc code for scalar code.\n");
6149
6150 return false;
6151 }
6152 }
6153 }
6154 else if (reduction_type == COND_REDUCTION)
6155 {
6156 int scalar_precision
6157 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6158 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6159 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6160 nunits_out);
6161
6162 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6163 OPTIMIZE_FOR_SPEED))
6164 reduc_fn = IFN_REDUC_MAX;
6165 }
6166 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
6167
6168 if (reduction_type != EXTRACT_LAST_REDUCTION
6169 && (!nested_cycle || double_reduc)
6170 && reduc_fn == IFN_LAST
6171 && !nunits_out.is_constant ())
6172 {
6173 if (dump_enabled_p ())
6174 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6175 "missing target support for reduction on"
6176 " variable-length vectors.\n");
6177 return false;
6178 }
6179
6180 /* For SLP reductions, see if there is a neutral value we can use. */
6181 tree neutral_op = NULL_TREE;
6182 if (slp_node)
6183 neutral_op = neutral_op_for_slp_reduction
6184 (slp_node_instance->reduc_phis, code,
6185 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6186
6187 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6188 {
6189 /* We can't support in-order reductions of code such as this:
6190
6191 for (int i = 0; i < n1; ++i)
6192 for (int j = 0; j < n2; ++j)
6193 l += a[j];
6194
6195 since GCC effectively transforms the loop when vectorizing:
6196
6197 for (int i = 0; i < n1 / VF; ++i)
6198 for (int j = 0; j < n2; ++j)
6199 for (int k = 0; k < VF; ++k)
6200 l += a[j];
6201
6202 which is a reassociation of the original operation. */
6203 if (dump_enabled_p ())
6204 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6205 "in-order double reduction not supported.\n");
6206
6207 return false;
6208 }
6209
6210 if (reduction_type == FOLD_LEFT_REDUCTION
6211 && slp_node
6212 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6213 {
6214 /* We cannot use in-order reductions in this case because there is
6215 an implicit reassociation of the operations involved. */
6216 if (dump_enabled_p ())
6217 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6218 "in-order unchained SLP reductions not supported.\n");
6219 return false;
6220 }
6221
6222 /* For double reductions, and for SLP reductions with a neutral value,
6223 we construct a variable-length initial vector by loading a vector
6224 full of the neutral value and then shift-and-inserting the start
6225 values into the low-numbered elements. */
6226 if ((double_reduc || neutral_op)
6227 && !nunits_out.is_constant ()
6228 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6229 vectype_out, OPTIMIZE_FOR_SPEED))
6230 {
6231 if (dump_enabled_p ())
6232 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6233 "reduction on variable-length vectors requires"
6234 " target support for a vector-shift-and-insert"
6235 " operation.\n");
6236 return false;
6237 }
6238
6239 /* Check extra constraints for variable-length unchained SLP reductions. */
6240 if (STMT_SLP_TYPE (stmt_info)
6241 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6242 && !nunits_out.is_constant ())
6243 {
6244 /* We checked above that we could build the initial vector when
6245 there's a neutral element value. Check here for the case in
6246 which each SLP statement has its own initial value and in which
6247 that value needs to be repeated for every instance of the
6248 statement within the initial vector. */
6249 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
6250 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6251 if (!neutral_op
6252 && !can_duplicate_and_interleave_p (group_size, elt_mode))
6253 {
6254 if (dump_enabled_p ())
6255 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6256 "unsupported form of SLP reduction for"
6257 " variable-length vectors: cannot build"
6258 " initial vector.\n");
6259 return false;
6260 }
6261 /* The epilogue code relies on the number of elements being a multiple
6262 of the group size. The duplicate-and-interleave approach to setting
6263 up the initial vector does too. */
6264 if (!multiple_p (nunits_out, group_size))
6265 {
6266 if (dump_enabled_p ())
6267 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6268 "unsupported form of SLP reduction for"
6269 " variable-length vectors: the vector size"
6270 " is not a multiple of the number of results.\n");
6271 return false;
6272 }
6273 }
6274
6275 /* In case of widening multiplication by a constant, we update the type
6276 of the constant to be the type of the other operand. We check that the
6277 constant fits the type in the pattern recognition pass. */
6278 if (code == DOT_PROD_EXPR
6279 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6280 /* No testcase for this. PR49478. */
6281 gcc_unreachable ();
6282
6283 if (reduction_type == COND_REDUCTION)
6284 {
6285 widest_int ni;
6286
6287 if (! max_loop_iterations (loop, &ni))
6288 {
6289 if (dump_enabled_p ())
6290 dump_printf_loc (MSG_NOTE, vect_location,
6291 "loop count not known, cannot create cond "
6292 "reduction.\n");
6293 return false;
6294 }
6295 /* Convert backedges to iterations. */
6296 ni += 1;
6297
6298 /* The additional index will have the same type as the condition. Check
6299 that the loop count fits into this type less one (because we'll use up
6300 the zero slot for when there are no matches). */
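/* For illustration (a 16-bit scalar type is assumed only for the example):
   the index elements would then be 16-bit unsigned with maximum value
   65535; since index 0 is reserved for "no match", only loops with fewer
   than 65535 iterations can be handled and larger trip counts are
   rejected here.  */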
6301 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6302 if (wi::geu_p (ni, wi::to_widest (max_index)))
6303 {
6304 if (dump_enabled_p ())
6305 dump_printf_loc (MSG_NOTE, vect_location,
6306 "loop size is greater than data size.\n");
6307 return false;
6308 }
6309 }
6310
6311 /* In case the vectorization factor (VF) is bigger than the number
6312 of elements that we can fit in a vectype (nunits), we have to generate
6313 more than one vector stmt, i.e., we need to "unroll" the
6314 vector stmt by a factor of VF/nunits. For more details see documentation
6315 in vectorizable_operation. */
6316
6317 /* If the reduction is used in an outer loop we need to generate
6318 VF intermediate results, like so (e.g. for ncopies=2):
6319 r0 = phi (init, r0)
6320 r1 = phi (init, r1)
6321 r0 = x0 + r0;
6322 r1 = x1 + r1;
6323 (i.e. we generate VF results in 2 registers).
6324 In this case we have a separate def-use cycle for each copy, and therefore
6325 for each copy we get the vector def for the reduction variable from the
6326 respective phi node created for this copy.
6327
6328 Otherwise (the reduction is unused in the loop nest), we can combine
6329 together intermediate results, like so (e.g. for ncopies=2):
6330 r = phi (init, r)
6331 r = x0 + r;
6332 r = x1 + r;
6333 (i.e. we generate VF/2 results in a single register).
6334 In this case for each copy we get the vector def for the reduction variable
6335 from the vectorized reduction operation generated in the previous iteration.
6336
6337 This only works when we see both the reduction PHI and its only consumer
6338 in vectorizable_reduction and there are no intermediate stmts
6339 participating. */
6340 stmt_vec_info use_stmt_info;
6341 tree reduc_phi_result = gimple_phi_result (reduc_def_phi);
6342 if (ncopies > 1
6343 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6344 && (use_stmt_info = loop_vinfo->lookup_single_use (reduc_phi_result))
6345 && (!STMT_VINFO_IN_PATTERN_P (use_stmt_info)
6346 || !STMT_VINFO_PATTERN_DEF_SEQ (use_stmt_info))
6347 && vect_stmt_to_vectorize (use_stmt_info) == stmt_info)
6348 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle = true;
6349
6350 if (single_defuse_cycle
6351 || code == DOT_PROD_EXPR
6352 || code == WIDEN_SUM_EXPR
6353 || code == SAD_EXPR)
6354 {
6355 gcc_assert (code != COND_EXPR);
6356
6357 /* 4. Supportable by target? */
6358
6359 /* 4.1. check support for the operation in the loop */
6360 optab optab = optab_for_tree_code (code, vectype_in, optab_default);
6361 if (!optab)
6362 {
6363 if (dump_enabled_p ())
6364 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6365 "no optab.\n");
6366
6367 return false;
6368 }
6369
6370 machine_mode vec_mode = TYPE_MODE (vectype_in);
6371 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6372 {
6373 if (dump_enabled_p ())
6374 dump_printf (MSG_NOTE, "op not supported by target.\n");
6375
6376 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6377 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6378 return false;
6379
6380 if (dump_enabled_p ())
6381 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6382 }
6383
6384 /* Worthwhile without SIMD support? */
6385 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6386 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6387 {
6388 if (dump_enabled_p ())
6389 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6390 "not worthwhile without SIMD support.\n");
6391
6392 return false;
6393 }
6394 }
6395
6396 /* If the reduction stmt is one of the patterns that have a lane-reducing
6397 operation embedded, we cannot handle the case of !single_defuse_cycle. */
6398 if ((ncopies > 1
6399 && ! single_defuse_cycle)
6400 && (code == DOT_PROD_EXPR
6401 || code == WIDEN_SUM_EXPR
6402 || code == SAD_EXPR))
6403 {
6404 if (dump_enabled_p ())
6405 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6406 "multi def-use cycle not possible for lane-reducing "
6407 "reduction operation\n");
6408 return false;
6409 }
6410
6411 if (slp_node)
6412 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6413 else
6414 vec_num = 1;
6415
6416 internal_fn cond_fn = get_conditional_internal_fn (code);
6417 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6418 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
6419
6420 vect_model_reduction_cost (stmt_info, reduc_fn, reduction_type, ncopies,
6421 cost_vec);
6422 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
6423 {
6424 if (reduction_type != FOLD_LEFT_REDUCTION
6425 && !mask_by_cond_expr
6426 && (cond_fn == IFN_LAST
6427 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6428 OPTIMIZE_FOR_SPEED)))
6429 {
6430 if (dump_enabled_p ())
6431 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6432 "can't use a fully-masked loop because no"
6433 " conditional operation is available.\n");
6434 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6435 }
6436 else if (reduc_index == -1)
6437 {
6438 if (dump_enabled_p ())
6439 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6440 "can't use a fully-masked loop for chained"
6441 " reductions.\n");
6442 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6443 }
6444 else
6445 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
6446 vectype_in);
6447 }
6448 if (dump_enabled_p ()
6449 && reduction_type == FOLD_LEFT_REDUCTION)
6450 dump_printf_loc (MSG_NOTE, vect_location,
6451 "using an in-order (fold-left) reduction.\n");
6452 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
6453 /* All reductions except single-defuse-cycle optimized, lane-reducing and
6454 fold-left reductions go through their own vectorizable_* routines. */
6455 if (!single_defuse_cycle
6456 && code != DOT_PROD_EXPR
6457 && code != WIDEN_SUM_EXPR
6458 && code != SAD_EXPR
6459 && reduction_type != FOLD_LEFT_REDUCTION)
6460 {
6461 STMT_VINFO_DEF_TYPE (stmt_info) = vect_internal_def;
6462 STMT_VINFO_DEF_TYPE (vect_orig_stmt (stmt_info)) = vect_internal_def;
6463 }
6464 return true;
6465 }
6466
6467 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
6468 value. */
6469
6470 bool
6471 vect_transform_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
6472 stmt_vec_info *vec_stmt, slp_tree slp_node)
6473 {
6474 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6475 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6476 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6477 int i;
6478 int ncopies;
6479 int j;
6480 int vec_num;
6481
6482 stmt_vec_info reduc_info = info_for_reduction (stmt_info);
6483 gcc_assert (reduc_info->is_reduc_info);
6484
6485 if (nested_in_vect_loop_p (loop, stmt_info))
6486 {
6487 loop = loop->inner;
6488 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
6489 }
6490
6491 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6492 enum tree_code code = gimple_assign_rhs_code (stmt);
6493 int op_type = TREE_CODE_LENGTH (code);
6494
6495 /* Flatten RHS. */
6496 tree ops[3];
6497 switch (get_gimple_rhs_class (code))
6498 {
6499 case GIMPLE_TERNARY_RHS:
6500 ops[2] = gimple_assign_rhs3 (stmt);
6501 /* Fall thru. */
6502 case GIMPLE_BINARY_RHS:
6503 ops[0] = gimple_assign_rhs1 (stmt);
6504 ops[1] = gimple_assign_rhs2 (stmt);
6505 break;
6506 default:
6507 gcc_unreachable ();
6508 }
6509
6510 /* All uses but the last are expected to be defined in the loop.
6511 The last use is the reduction variable. In case of nested cycle this
6512 assumption is not true: we use reduc_index to record the index of the
6513 reduction variable. */
6514 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
6515 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6516 int reduc_index = STMT_VINFO_REDUC_IDX (reduc_info);
6517 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
6518
6519 if (slp_node)
6520 {
6521 ncopies = 1;
6522 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6523 }
6524 else
6525 {
6526 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6527 vec_num = 1;
6528 }
6529
6530 internal_fn cond_fn = get_conditional_internal_fn (code);
6531 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6532 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
6533
6534 /* Transform. */
6535 stmt_vec_info new_stmt_info = NULL;
6536 stmt_vec_info prev_stmt_info;
6537 tree new_temp = NULL_TREE;
6538 auto_vec<tree> vec_oprnds0;
6539 auto_vec<tree> vec_oprnds1;
6540 auto_vec<tree> vec_oprnds2;
6541 tree def0;
6542
6543 if (dump_enabled_p ())
6544 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6545
6546 /* FORNOW: Multiple types are not supported for condition. */
6547 if (code == COND_EXPR)
6548 gcc_assert (ncopies == 1);
6549
6550 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
6551
6552 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6553 if (reduction_type == FOLD_LEFT_REDUCTION)
6554 {
6555 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
6556 return vectorize_fold_left_reduction
6557 (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
6558 reduc_fn, ops, vectype_in, reduc_index, masks);
6559 }
6560
6561 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
6562 gcc_assert (single_defuse_cycle
6563 || code == DOT_PROD_EXPR
6564 || code == WIDEN_SUM_EXPR
6565 || code == SAD_EXPR);
6566
6567 /* Create the destination vector */
6568 tree scalar_dest = gimple_assign_lhs (stmt);
6569 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6570
6571 prev_stmt_info = NULL;
6572 if (!slp_node)
6573 {
6574 vec_oprnds0.create (1);
6575 vec_oprnds1.create (1);
6576 if (op_type == ternary_op)
6577 vec_oprnds2.create (1);
6578 }
6579
6580 for (j = 0; j < ncopies; j++)
6581 {
6582 /* Handle uses. */
6583 if (j == 0)
6584 {
6585 if (slp_node)
6586 {
6587 /* Get vec defs for all the operands except the reduction index,
6588 ensuring the ordering of the ops in the vector is kept. */
6589 auto_vec<tree, 3> slp_ops;
6590 auto_vec<vec<tree>, 3> vec_defs;
6591
6592 slp_ops.quick_push (ops[0]);
6593 slp_ops.quick_push (ops[1]);
6594 if (op_type == ternary_op)
6595 slp_ops.quick_push (ops[2]);
6596
6597 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
6598
6599 vec_oprnds0.safe_splice (vec_defs[0]);
6600 vec_defs[0].release ();
6601 vec_oprnds1.safe_splice (vec_defs[1]);
6602 vec_defs[1].release ();
6603 if (op_type == ternary_op)
6604 {
6605 vec_oprnds2.safe_splice (vec_defs[2]);
6606 vec_defs[2].release ();
6607 }
6608 }
6609 else
6610 {
6611 vec_oprnds0.quick_push
6612 (vect_get_vec_def_for_operand (ops[0], stmt_info));
6613 vec_oprnds1.quick_push
6614 (vect_get_vec_def_for_operand (ops[1], stmt_info));
6615 if (op_type == ternary_op)
6616 vec_oprnds2.quick_push
6617 (vect_get_vec_def_for_operand (ops[2], stmt_info));
6618 }
6619 }
6620 else
6621 {
6622 if (!slp_node)
6623 {
6624 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
6625
6626 if (single_defuse_cycle && reduc_index == 0)
6627 vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
6628 else
6629 vec_oprnds0[0]
6630 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
6631 vec_oprnds0[0]);
6632 if (single_defuse_cycle && reduc_index == 1)
6633 vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
6634 else
6635 vec_oprnds1[0]
6636 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
6637 vec_oprnds1[0]);
6638 if (op_type == ternary_op)
6639 {
6640 if (single_defuse_cycle && reduc_index == 2)
6641 vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
6642 else
6643 vec_oprnds2[0]
6644 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
6645 vec_oprnds2[0]);
6646 }
6647 }
6648 }
6649
6650 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6651 {
6652 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
6653 if (masked_loop_p && !mask_by_cond_expr)
6654 {
6655 /* Make sure that the reduction accumulator is vop[0]. */
6656 if (reduc_index == 1)
6657 {
6658 gcc_assert (commutative_tree_code (code));
6659 std::swap (vop[0], vop[1]);
6660 }
6661 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
6662 vectype_in, i * ncopies + j);
6663 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
6664 vop[0], vop[1],
6665 vop[0]);
6666 new_temp = make_ssa_name (vec_dest, call);
6667 gimple_call_set_lhs (call, new_temp);
6668 gimple_call_set_nothrow (call, true);
6669 new_stmt_info
6670 = vect_finish_stmt_generation (stmt_info, call, gsi);
6671 }
6672 else
6673 {
6674 if (op_type == ternary_op)
6675 vop[2] = vec_oprnds2[i];
6676
6677 if (masked_loop_p && mask_by_cond_expr)
6678 {
6679 tree mask = vect_get_loop_mask (gsi, masks,
6680 vec_num * ncopies,
6681 vectype_in, i * ncopies + j);
6682 build_vect_cond_expr (code, vop, mask, gsi);
6683 }
6684
6685 gassign *new_stmt = gimple_build_assign (vec_dest, code,
6686 vop[0], vop[1], vop[2]);
6687 new_temp = make_ssa_name (vec_dest, new_stmt);
6688 gimple_assign_set_lhs (new_stmt, new_temp);
6689 new_stmt_info
6690 = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
6691 }
6692
6693 if (slp_node)
6694 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
6695 }
6696
6697 if (slp_node || single_defuse_cycle)
6698 continue;
6699
6700 if (j == 0)
6701 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
6702 else
6703 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
6704
6705 prev_stmt_info = new_stmt_info;
6706 }
6707
6708 if (single_defuse_cycle && !slp_node)
6709 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
6710
6711 return true;
6712 }
6713
6714 /* Transform phase of a cycle PHI. */
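/* As a rough illustration (names invented for this example): for the scalar
   reduction PHI

     sum_1 = PHI <0.0 (preheader), sum_2 (latch)>

   the code below creates the corresponding vector PHI in the vector loop
   header,

     vect_sum_1 = PHI <vec_initial_def (preheader), ... (latch)>

   where only the loop-entry argument is added at this point; the latch
   argument is filled in later, during epilogue processing.  */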
6715
6716 bool
6717 vect_transform_cycle_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
6718 slp_tree slp_node, slp_instance slp_node_instance)
6719 {
6720 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6721 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6722 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6723 int i;
6724 int ncopies;
6725 stmt_vec_info prev_phi_info;
6726 int j;
6727 bool nested_cycle = false;
6728 int vec_num;
6729
6730 if (nested_in_vect_loop_p (loop, stmt_info))
6731 {
6732 loop = loop->inner;
6733 nested_cycle = true;
6734 }
6735
6736 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
6737 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
6738 stmt_vec_info reduc_info = info_for_reduction (stmt_info);
6739 gcc_assert (reduc_info->is_reduc_info);
6740
6741 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
6742 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
6743 /* Leave the scalar phi in place. */
6744 return true;
6745
6746 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
6747 /* For a nested cycle we do not fill the above. */
6748 if (!vectype_in)
6749 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
6750 gcc_assert (vectype_in);
6751
6752 if (slp_node)
6753 {
6754 /* The size vect_schedule_slp_instance computes is off for us. */
6755 vec_num = vect_get_num_vectors
6756 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6757 * SLP_TREE_SCALAR_STMTS (slp_node).length (), vectype_in);
6758 ncopies = 1;
6759 }
6760 else
6761 {
6762 vec_num = 1;
6763 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6764 }
6765
6766 /* Check whether we should use a single PHI node and accumulate
6767 vectors to one before the backedge. */
6768 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
6769 ncopies = 1;
6770
6771 /* Create the destination vector */
6772 gphi *phi = as_a <gphi *> (stmt_info->stmt);
6773 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
6774 vectype_out);
6775
6776 /* Get the loop-entry arguments. */
6777 tree vec_initial_def;
6778 auto_vec<tree> vec_initial_defs;
6779 if (slp_node)
6780 {
6781 vec_initial_defs.reserve (vec_num);
6782 gcc_assert (slp_node == slp_node_instance->reduc_phis);
6783 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
6784 tree neutral_op
6785 = neutral_op_for_slp_reduction (slp_node,
6786 STMT_VINFO_REDUC_CODE (reduc_info),
6787 first != NULL);
6788 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
6789 &vec_initial_defs, vec_num,
6790 first != NULL, neutral_op);
6791 }
6792 else
6793 {
6794 /* Get at the scalar def before the loop, that defines the initial
6795 value of the reduction variable. */
6796 tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi,
6797 loop_preheader_edge (loop));
6798 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
6799 and we can't use zero for induc_val, use initial_def. Similarly
6800 for REDUC_MIN and initial_def larger than the base. */
6801 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6802 {
6803 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
6804 if (TREE_CODE (initial_def) == INTEGER_CST
6805 && !integer_zerop (induc_val)
6806 && (((STMT_VINFO_VEC_COND_REDUC_CODE (reduc_info) == MAX_EXPR)
6807 && tree_int_cst_lt (initial_def, induc_val))
6808 || ((STMT_VINFO_VEC_COND_REDUC_CODE (reduc_info) == MIN_EXPR)
6809 && tree_int_cst_lt (induc_val, initial_def))))
6810 {
6811 induc_val = initial_def;
6812 /* Communicate we used the initial_def to epilogue
6813 generation. */
6814 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
6815 }
6816 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
6817 }
6818 else if (nested_cycle)
6819 {
6820 /* Do not use an adjustment def as that case is not supported
6821 correctly if ncopies is not one. */
6822 vec_initial_def = vect_get_vec_def_for_operand (initial_def,
6823 reduc_stmt_info);
6824 }
6825 else
6826 {
6827 tree adjustment_def = NULL_TREE;
6828 tree *adjustment_defp = &adjustment_def;
6829 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
6830 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6831 adjustment_defp = NULL;
6832 vec_initial_def
6833 = get_initial_def_for_reduction (reduc_stmt_info, code,
6834 initial_def, adjustment_defp);
6835 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def;
6836 }
6837 vec_initial_defs.create (1);
6838 vec_initial_defs.quick_push (vec_initial_def);
6839 }
6840
6841 /* Generate the reduction PHIs upfront. */
6842 prev_phi_info = NULL;
6843 for (i = 0; i < vec_num; i++)
6844 {
6845 tree vec_init_def = vec_initial_defs[i];
6846 for (j = 0; j < ncopies; j++)
6847 {
6848 /* Create the reduction-phi that defines the reduction
6849 operand. */
6850 gphi *new_phi = create_phi_node (vec_dest, loop->header);
6851 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6852
6853 /* Set the loop-entry arg of the reduction-phi. */
6854 if (j != 0 && nested_cycle)
6855 vec_init_def = vect_get_vec_def_for_stmt_copy (loop_vinfo,
6856 vec_init_def);
6857 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
6858 UNKNOWN_LOCATION);
6859
6860 /* The loop-latch arg is set in epilogue processing. */
6861
6862 if (slp_node)
6863 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
6864 else
6865 {
6866 if (j == 0)
6867 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi_info;
6868 else
6869 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
6870 prev_phi_info = new_phi_info;
6871 }
6872 }
6873 }
6874
6875 return true;
6876 }
6877
6878 /* Vectorizes LC PHIs. */
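/* As a rough illustration (names invented for this example): a loop-closed
   SSA PHI with a single argument such as

     x_lc = PHI <x_3 (loop exit)>

   is vectorized by creating, in the same block, one vector PHI per vector
   statement of the argument, e.g.

     vect_x_lc = PHI <vect_x_3 (loop exit)>

   using the vector definitions obtained for the single PHI argument.  */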
6879
6880 bool
6881 vectorizable_lc_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
6882 slp_tree slp_node)
6883 {
6884 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6885 if (!loop_vinfo
6886 || !is_a <gphi *> (stmt_info->stmt)
6887 || gimple_phi_num_args (stmt_info->stmt) != 1)
6888 return false;
6889
6890 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
6891 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
6892 return false;
6893
6894 if (!vec_stmt) /* transformation not required. */
6895 {
6896 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
6897 return true;
6898 }
6899
6900 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6901 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
6902 basic_block bb = gimple_bb (stmt_info->stmt);
6903 edge e = single_pred_edge (bb);
6904 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
6905 vec<tree> vec_oprnds = vNULL;
6906 vect_get_vec_defs (gimple_phi_arg_def (stmt_info->stmt, 0), NULL_TREE,
6907 stmt_info, &vec_oprnds, NULL, slp_node);
6908 if (slp_node)
6909 {
6910 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6911 gcc_assert (vec_oprnds.length () == vec_num);
6912 for (unsigned i = 0; i < vec_num; i++)
6913 {
6914 /* Create the vectorized LC PHI node. */
6915 gphi *new_phi = create_phi_node (vec_dest, bb);
6916 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
6917 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6918 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
6919 }
6920 }
6921 else
6922 {
6923 unsigned ncopies = vect_get_num_copies (loop_vinfo, vectype);
6924 stmt_vec_info prev_phi_info = NULL;
6925 for (unsigned i = 0; i < ncopies; i++)
6926 {
6927 if (i != 0)
6928 vect_get_vec_defs_for_stmt_copy (loop_vinfo, &vec_oprnds, NULL);
6929 /* Create the vectorized LC PHI node. */
6930 gphi *new_phi = create_phi_node (vec_dest, bb);
6931 add_phi_arg (new_phi, vec_oprnds[0], e, UNKNOWN_LOCATION);
6932 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6933 if (i == 0)
6934 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi_info;
6935 else
6936 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
6937 prev_phi_info = new_phi_info;
6938 }
6939 }
6940 vec_oprnds.release ();
6941
6942 return true;
6943 }
6944
6945
6946 /* Function vect_min_worthwhile_factor.
6947
6948 For a loop where we could vectorize the operation indicated by CODE,
6949 return the minimum vectorization factor that makes it worthwhile
6950 to use generic vectors. */
6951 static unsigned int
6952 vect_min_worthwhile_factor (enum tree_code code)
6953 {
6954 switch (code)
6955 {
6956 case PLUS_EXPR:
6957 case MINUS_EXPR:
6958 case NEGATE_EXPR:
6959 return 4;
6960
6961 case BIT_AND_EXPR:
6962 case BIT_IOR_EXPR:
6963 case BIT_XOR_EXPR:
6964 case BIT_NOT_EXPR:
6965 return 2;
6966
6967 default:
6968 return INT_MAX;
6969 }
6970 }
6971
6972 /* Return true if VINFO indicates we are doing loop vectorization and if
6973 it is worth decomposing CODE operations into scalar operations for
6974 that loop's vectorization factor. */
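/* For example (a worked case, not an additional rule): with a constant
   vectorization factor of 4, decomposing a PLUS_EXPR into scalar operations
   is considered worthwhile (its minimum factor is 4), whereas with a
   vectorization factor of 2 only the bitwise codes above qualify.  */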
6975
6976 bool
6977 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
6978 {
6979 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6980 unsigned HOST_WIDE_INT value;
6981 return (loop_vinfo
6982 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
6983 && value >= vect_min_worthwhile_factor (code));
6984 }
6985
6986 /* Function vectorizable_induction
6987
6988 Check if STMT_INFO performs an induction computation that can be vectorized.
6989 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
6990 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
6991 Return true if STMT_INFO is vectorizable in this way. */
6992
6993 bool
6994 vectorizable_induction (stmt_vec_info stmt_info,
6995 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6996 stmt_vec_info *vec_stmt, slp_tree slp_node,
6997 stmt_vector_for_cost *cost_vec)
6998 {
6999 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7000 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7001 unsigned ncopies;
7002 bool nested_in_vect_loop = false;
7003 class loop *iv_loop;
7004 tree vec_def;
7005 edge pe = loop_preheader_edge (loop);
7006 basic_block new_bb;
7007 tree new_vec, vec_init, vec_step, t;
7008 tree new_name;
7009 gimple *new_stmt;
7010 gphi *induction_phi;
7011 tree induc_def, vec_dest;
7012 tree init_expr, step_expr;
7013 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7014 unsigned i;
7015 tree expr;
7016 gimple_seq stmts;
7017 imm_use_iterator imm_iter;
7018 use_operand_p use_p;
7019 gimple *exit_phi;
7020 edge latch_e;
7021 tree loop_arg;
7022 gimple_stmt_iterator si;
7023
7024 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7025 if (!phi)
7026 return false;
7027
7028 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7029 return false;
7030
7031 /* Make sure it was recognized as induction computation. */
7032 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7033 return false;
7034
7035 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7036 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7037
7038 if (slp_node)
7039 ncopies = 1;
7040 else
7041 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7042 gcc_assert (ncopies >= 1);
7043
7044 /* FORNOW. These restrictions should be relaxed. */
7045 if (nested_in_vect_loop_p (loop, stmt_info))
7046 {
7047 imm_use_iterator imm_iter;
7048 use_operand_p use_p;
7049 gimple *exit_phi;
7050 edge latch_e;
7051 tree loop_arg;
7052
7053 if (ncopies > 1)
7054 {
7055 if (dump_enabled_p ())
7056 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7057 "multiple types in nested loop.\n");
7058 return false;
7059 }
7060
7061 /* FORNOW: outer loop induction with SLP not supported. */
7062 if (STMT_SLP_TYPE (stmt_info))
7063 return false;
7064
7065 exit_phi = NULL;
7066 latch_e = loop_latch_edge (loop->inner);
7067 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7068 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7069 {
7070 gimple *use_stmt = USE_STMT (use_p);
7071 if (is_gimple_debug (use_stmt))
7072 continue;
7073
7074 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7075 {
7076 exit_phi = use_stmt;
7077 break;
7078 }
7079 }
7080 if (exit_phi)
7081 {
7082 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7083 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7084 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7085 {
7086 if (dump_enabled_p ())
7087 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7088 "inner-loop induction only used outside "
7089 "of the outer vectorized loop.\n");
7090 return false;
7091 }
7092 }
7093
7094 nested_in_vect_loop = true;
7095 iv_loop = loop->inner;
7096 }
7097 else
7098 iv_loop = loop;
7099 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7100
7101 if (slp_node && !nunits.is_constant ())
7102 {
7103 /* The current SLP code creates the initial value element-by-element. */
7104 if (dump_enabled_p ())
7105 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7106 "SLP induction not supported for variable-length"
7107 " vectors.\n");
7108 return false;
7109 }
7110
7111 if (!vec_stmt) /* transformation not required. */
7112 {
7113 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7114 DUMP_VECT_SCOPE ("vectorizable_induction");
7115 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7116 return true;
7117 }
7118
7119 /* Transform. */
7120
7121 /* Compute a vector variable, initialized with the first VF values of
7122 the induction variable. E.g., for an iv with IV_PHI='X' and
7123 evolution S, for a vector of 4 units, we want to compute:
7124 [X, X + S, X + 2*S, X + 3*S]. */
7125
7126 if (dump_enabled_p ())
7127 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7128
7129 latch_e = loop_latch_edge (iv_loop);
7130 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7131
7132 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7133 gcc_assert (step_expr != NULL_TREE);
7134 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
7135
7136 pe = loop_preheader_edge (iv_loop);
7137 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7138 loop_preheader_edge (iv_loop));
7139
7140 stmts = NULL;
7141 if (!nested_in_vect_loop)
7142 {
7143 /* Convert the initial value to the IV update type. */
7144 tree new_type = TREE_TYPE (step_expr);
7145 init_expr = gimple_convert (&stmts, new_type, init_expr);
7146
7147 /* If we are using the loop mask to "peel" for alignment then we need
7148 to adjust the start value here. */
7149 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7150 if (skip_niters != NULL_TREE)
7151 {
7152 if (FLOAT_TYPE_P (vectype))
7153 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7154 skip_niters);
7155 else
7156 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7157 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7158 skip_niters, step_expr);
7159 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7160 init_expr, skip_step);
7161 }
7162 }
7163
7164 if (stmts)
7165 {
7166 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7167 gcc_assert (!new_bb);
7168 }
7169
7170 /* Find the first insertion point in the BB. */
7171 basic_block bb = gimple_bb (phi);
7172 si = gsi_after_labels (bb);
7173
7174 /* For SLP induction we have to generate several IVs as for example
7175 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7176 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7177 [VF*S, VF*S, VF*S, VF*S] for all. */
7178 if (slp_node)
7179 {
7180 /* Enforced above. */
7181 unsigned int const_nunits = nunits.to_constant ();
7182
7183 /* Generate [VF*S, VF*S, ... ]. */
7184 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7185 {
7186 expr = build_int_cst (integer_type_node, vf);
7187 expr = fold_convert (TREE_TYPE (step_expr), expr);
7188 }
7189 else
7190 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7191 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7192 expr, step_expr);
7193 if (! CONSTANT_CLASS_P (new_name))
7194 new_name = vect_init_vector (stmt_info, new_name,
7195 TREE_TYPE (step_expr), NULL);
7196 new_vec = build_vector_from_val (step_vectype, new_name);
7197 vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7198
7199 /* Now generate the IVs. */
7200 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7201 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7202 unsigned elts = const_nunits * nvects;
7203 unsigned nivs = least_common_multiple (group_size,
7204 const_nunits) / const_nunits;
7205 gcc_assert (elts % group_size == 0);
7206 tree elt = init_expr;
7207 unsigned ivn;
7208 for (ivn = 0; ivn < nivs; ++ivn)
7209 {
7210 tree_vector_builder elts (step_vectype, const_nunits, 1);
7211 stmts = NULL;
7212 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7213 {
7214 if (ivn*const_nunits + eltn >= group_size
7215 && (ivn * const_nunits + eltn) % group_size == 0)
7216 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7217 elt, step_expr);
7218 elts.quick_push (elt);
7219 }
7220 vec_init = gimple_build_vector (&stmts, &elts);
7221 vec_init = gimple_convert (&stmts, vectype, vec_init);
7222 if (stmts)
7223 {
7224 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7225 gcc_assert (!new_bb);
7226 }
7227
7228 /* Create the induction-phi that defines the induction-operand. */
7229 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7230 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7231 stmt_vec_info induction_phi_info
7232 = loop_vinfo->add_stmt (induction_phi);
7233 induc_def = PHI_RESULT (induction_phi);
7234
7235 /* Create the iv update inside the loop */
7236 gimple_seq stmts = NULL;
7237 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7238 vec_def = gimple_build (&stmts,
7239 PLUS_EXPR, step_vectype, vec_def, vec_step);
7240 vec_def = gimple_convert (&stmts, vectype, vec_def);
7241 loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (vec_def));
7242 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7243
7244 /* Set the arguments of the phi node: */
7245 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7246 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7247 UNKNOWN_LOCATION);
7248
7249 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7250 }
7251
7252 /* Re-use IVs when we can. */
7253 if (ivn < nvects)
7254 {
7255 unsigned vfp
7256 = least_common_multiple (group_size, const_nunits) / group_size;
7257 /* Generate [VF'*S, VF'*S, ... ]. */
7258 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7259 {
7260 expr = build_int_cst (integer_type_node, vfp);
7261 expr = fold_convert (TREE_TYPE (step_expr), expr);
7262 }
7263 else
7264 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7265 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7266 expr, step_expr);
7267 if (! CONSTANT_CLASS_P (new_name))
7268 new_name = vect_init_vector (stmt_info, new_name,
7269 TREE_TYPE (step_expr), NULL);
7270 new_vec = build_vector_from_val (step_vectype, new_name);
7271 vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7272 for (; ivn < nvects; ++ivn)
7273 {
7274 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7275 tree def;
7276 if (gimple_code (iv) == GIMPLE_PHI)
7277 def = gimple_phi_result (iv);
7278 else
7279 def = gimple_assign_lhs (iv);
7280 gimple_seq stmts = NULL;
7281 def = gimple_convert (&stmts, step_vectype, def);
7282 def = gimple_build (&stmts,
7283 PLUS_EXPR, step_vectype, def, vec_step);
7284 def = gimple_convert (&stmts, vectype, def);
7285 if (gimple_code (iv) == GIMPLE_PHI)
7286 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7287 else
7288 {
7289 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7290 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
7291 }
7292 SLP_TREE_VEC_STMTS (slp_node).quick_push
7293 (loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (def)));
7294 }
7295 }
7296
7297 return true;
7298 }
7299
7300 /* Create the vector that holds the initial_value of the induction. */
7301 if (nested_in_vect_loop)
7302 {
7303 /* iv_loop is nested in the loop to be vectorized. init_expr has already
7304 been created during vectorization of previous stmts. We obtain it
7305 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7306 vec_init = vect_get_vec_def_for_operand (init_expr, stmt_info);
7307 /* If the initial value is not of proper type, convert it. */
7308 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7309 {
7310 new_stmt
7311 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7312 vect_simple_var,
7313 "vec_iv_"),
7314 VIEW_CONVERT_EXPR,
7315 build1 (VIEW_CONVERT_EXPR, vectype,
7316 vec_init));
7317 vec_init = gimple_assign_lhs (new_stmt);
7318 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7319 new_stmt);
7320 gcc_assert (!new_bb);
7321 loop_vinfo->add_stmt (new_stmt);
7322 }
7323 }
7324 else
7325 {
7326 /* iv_loop is the loop to be vectorized. Create:
7327 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7328 stmts = NULL;
7329 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
7330
7331 unsigned HOST_WIDE_INT const_nunits;
7332 if (nunits.is_constant (&const_nunits))
7333 {
7334 tree_vector_builder elts (step_vectype, const_nunits, 1);
7335 elts.quick_push (new_name);
7336 for (i = 1; i < const_nunits; i++)
7337 {
7338 /* Create: new_name_i = new_name + step_expr */
7339 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7340 new_name, step_expr);
7341 elts.quick_push (new_name);
7342 }
7343 /* Create a vector from [new_name_0, new_name_1, ...,
7344 new_name_nunits-1] */
7345 vec_init = gimple_build_vector (&stmts, &elts);
7346 }
7347 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7348 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7349 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
7350 new_name, step_expr);
7351 else
7352 {
7353 /* Build:
7354 [base, base, base, ...]
7355 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7356 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7357 gcc_assert (flag_associative_math);
7358 tree index = build_index_vector (step_vectype, 0, 1);
7359 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7360 new_name);
7361 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7362 step_expr);
7363 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
7364 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
7365 vec_init, step_vec);
7366 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
7367 vec_init, base_vec);
7368 }
7369 vec_init = gimple_convert (&stmts, vectype, vec_init);
7370
7371 if (stmts)
7372 {
7373 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7374 gcc_assert (!new_bb);
7375 }
7376 }
7377
7378
7379 /* Create the vector that holds the step of the induction. */
7380 if (nested_in_vect_loop)
7381 /* iv_loop is nested in the loop to be vectorized. Generate:
7382 vec_step = [S, S, S, S] */
7383 new_name = step_expr;
7384 else
7385 {
7386 /* iv_loop is the loop to be vectorized. Generate:
7387 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7388 gimple_seq seq = NULL;
7389 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7390 {
7391 expr = build_int_cst (integer_type_node, vf);
7392 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7393 }
7394 else
7395 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7396 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7397 expr, step_expr);
7398 if (seq)
7399 {
7400 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7401 gcc_assert (!new_bb);
7402 }
7403 }
7404
7405 t = unshare_expr (new_name);
7406 gcc_assert (CONSTANT_CLASS_P (new_name)
7407 || TREE_CODE (new_name) == SSA_NAME);
7408 new_vec = build_vector_from_val (step_vectype, t);
7409 vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7410
7411
7412 /* Create the following def-use cycle:
7413 loop prolog:
7414 vec_init = ...
7415 vec_step = ...
7416 loop:
7417 vec_iv = PHI <vec_init, vec_loop>
7418 ...
7419 STMT
7420 ...
7421 vec_loop = vec_iv + vec_step; */
7422
7423 /* Create the induction-phi that defines the induction-operand. */
7424 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7425 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7426 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7427 induc_def = PHI_RESULT (induction_phi);
7428
7429 /* Create the iv update inside the loop */
7430 stmts = NULL;
7431 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7432 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
7433 vec_def = gimple_convert (&stmts, vectype, vec_def);
7434 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7435 new_stmt = SSA_NAME_DEF_STMT (vec_def);
7436 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7437
7438 /* Set the arguments of the phi node: */
7439 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7440 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7441 UNKNOWN_LOCATION);
7442
7443 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7444
7445 /* In case the vectorization factor (VF) is bigger than the number
7446 of elements that we can fit in a vectype (nunits), we have to generate
7447 more than one vector stmt, i.e. we need to "unroll" the
7448 vector stmt by a factor of VF/nunits. For more details see the
7449 documentation in vectorizable_operation. */
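/* As a rough illustration (numbers chosen only for the example): with VF = 8
   and nunits = 4 we have ncopies = 2; the loop below emits one extra copy

     vec_iv_1 = vec_iv_0 + [4*S, 4*S, 4*S, 4*S];

   so that the two copies together cover the lanes [X, X+S, ..., X+7*S].  */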
7450
7451 if (ncopies > 1)
7452 {
7453 gimple_seq seq = NULL;
7454 stmt_vec_info prev_stmt_vinfo;
7455 /* FORNOW. This restriction should be relaxed. */
7456 gcc_assert (!nested_in_vect_loop);
7457
7458 /* Create the vector that holds the step of the induction. */
7459 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7460 {
7461 expr = build_int_cst (integer_type_node, nunits);
7462 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7463 }
7464 else
7465 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7466 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7467 expr, step_expr);
7468 if (seq)
7469 {
7470 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7471 gcc_assert (!new_bb);
7472 }
7473
7474 t = unshare_expr (new_name);
7475 gcc_assert (CONSTANT_CLASS_P (new_name)
7476 || TREE_CODE (new_name) == SSA_NAME);
7477 new_vec = build_vector_from_val (step_vectype, t);
7478 vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7479
7480 vec_def = induc_def;
7481 prev_stmt_vinfo = induction_phi_info;
7482 for (i = 1; i < ncopies; i++)
7483 {
7484 /* vec_i = vec_prev + vec_step */
7485 gimple_seq stmts = NULL;
7486 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
7487 vec_def = gimple_build (&stmts,
7488 PLUS_EXPR, step_vectype, vec_def, vec_step);
7489 vec_def = gimple_convert (&stmts, vectype, vec_def);
7490
7491 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7492 new_stmt = SSA_NAME_DEF_STMT (vec_def);
7493 new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7494 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7495 prev_stmt_vinfo = new_stmt_info;
7496 }
7497 }
7498
7499 if (nested_in_vect_loop)
7500 {
7501 /* Find the loop-closed exit-phi of the induction, and record
7502 the final vector of induction results: */
7503 exit_phi = NULL;
7504 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7505 {
7506 gimple *use_stmt = USE_STMT (use_p);
7507 if (is_gimple_debug (use_stmt))
7508 continue;
7509
7510 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7511 {
7512 exit_phi = use_stmt;
7513 break;
7514 }
7515 }
7516 if (exit_phi)
7517 {
7518 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7519 /* FORNOW. Currently not supporting the case that an inner-loop induction
7520 is not used in the outer-loop (i.e. only outside the outer-loop). */
7521 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7522 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7523
7524 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7525 if (dump_enabled_p ())
7526 dump_printf_loc (MSG_NOTE, vect_location,
7527 "vector of inductions after inner-loop:%G",
7528 new_stmt);
7529 }
7530 }
7531
7532
7533 if (dump_enabled_p ())
7534 dump_printf_loc (MSG_NOTE, vect_location,
7535 "transform induction: created def-use cycle: %G%G",
7536 induction_phi, SSA_NAME_DEF_STMT (vec_def));
7537
7538 return true;
7539 }
7540
7541 /* Function vectorizable_live_operation.
7542
7543 STMT_INFO computes a value that is used outside the loop. Check if
7544 it can be supported. */
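/* As a rough illustration (names invented for this example): for a value
   that is live after the loop, e.g.

     for (i = 0; i < n; i++)
       last = a[i];
     ... use of last ...

   the scalar result is taken from the final vector of a[i] values: either
   the last lane via a BIT_FIELD_REF or, for a fully-masked loop, via the
   EXTRACT_LAST internal function with the final loop mask, as the code
   below shows.  */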
7545
7546 bool
7547 vectorizable_live_operation (stmt_vec_info stmt_info,
7548 gimple_stmt_iterator *gsi,
7549 slp_tree slp_node, slp_instance slp_node_instance,
7550 int slp_index, bool vec_stmt_p,
7551 stmt_vector_for_cost *)
7552 {
7553 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7554 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7555 imm_use_iterator imm_iter;
7556 tree lhs, lhs_type, bitsize, vec_bitsize;
7557 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7558 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7559 int ncopies;
7560 gimple *use_stmt;
7561 auto_vec<tree> vec_oprnds;
7562 int vec_entry = 0;
7563 poly_uint64 vec_index = 0;
7564
7565 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7566
7567 /* The last stmt of a reduction is live and vectorized via
7568 vect_create_epilog_for_reduction. vectorizable_reduction assessed
7569 validity so just trigger the transform here. */
7570 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
7571 {
7572 if (!vec_stmt_p)
7573 return true;
7574 if (slp_node)
7575 {
7576 /* For reduction chains the meta-info is attached to
7577 the group leader. */
7578 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7579 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7580 /* For SLP reductions we vectorize the epilogue for
7581 all involved stmts together. */
7582 else if (slp_index != 0)
7583 return true;
7584 }
7585 stmt_vec_info reduc_info = info_for_reduction (stmt_info);
7586 gcc_assert (reduc_info->is_reduc_info);
7587 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
7588 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
7589 return true;
7590 vect_create_epilog_for_reduction (stmt_info, slp_node,
7591 slp_node_instance);
7592 return true;
7593 }
7594
7595 /* FORNOW. CHECKME. */
7596 if (nested_in_vect_loop_p (loop, stmt_info))
7597 return false;
7598
7599 /* If STMT is not relevant and it is a simple assignment and its inputs are
7600 invariant then it can remain in place, unvectorized. The original last
7601 scalar value that it computes will be used. */
7602 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7603 {
7604 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
7605 if (dump_enabled_p ())
7606 dump_printf_loc (MSG_NOTE, vect_location,
7607 "statement is simple and uses invariant. Leaving in "
7608 "place.\n");
7609 return true;
7610 }
7611
7612 if (slp_node)
7613 ncopies = 1;
7614 else
7615 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7616
7617 if (slp_node)
7618 {
7619 gcc_assert (slp_index >= 0);
7620
7621 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7622 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7623
7624 /* Get the last occurrence of the scalar index from the concatenation of
7625 all the slp vectors. Calculate which slp vector it is and the index
7626 within. */
7627 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
7628
7629 /* Calculate which vector contains the result, and which lane of
7630 that vector we need. */
7631 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7632 {
7633 if (dump_enabled_p ())
7634 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7635 "Cannot determine which vector holds the"
7636 " final result.\n");
7637 return false;
7638 }
7639 }
7640
7641 if (!vec_stmt_p)
7642 {
7643 /* No transformation required. */
7644 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7645 {
7646 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7647 OPTIMIZE_FOR_SPEED))
7648 {
7649 if (dump_enabled_p ())
7650 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7651 "can't use a fully-masked loop because "
7652 "the target doesn't support extract last "
7653 "reduction.\n");
7654 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7655 }
7656 else if (slp_node)
7657 {
7658 if (dump_enabled_p ())
7659 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7660 "can't use a fully-masked loop because an "
7661 "SLP statement is live after the loop.\n");
7662 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7663 }
7664 else if (ncopies > 1)
7665 {
7666 if (dump_enabled_p ())
7667 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7668 "can't use a fully-masked loop because"
7669 " ncopies is greater than 1.\n");
7670 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7671 }
7672 else
7673 {
7674 gcc_assert (ncopies == 1 && !slp_node);
7675 vect_record_loop_mask (loop_vinfo,
7676 &LOOP_VINFO_MASKS (loop_vinfo),
7677 1, vectype);
7678 }
7679 }
7680 return true;
7681 }
7682
7683 /* Use the lhs of the original scalar statement. */
7684 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
7685
7686 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7687 : gimple_get_lhs (stmt);
7688 lhs_type = TREE_TYPE (lhs);
7689
7690 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7691 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7692 : TYPE_SIZE (TREE_TYPE (vectype)));
7693 vec_bitsize = TYPE_SIZE (vectype);
7694
7695 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
7696 tree vec_lhs, bitstart;
7697 if (slp_node)
7698 {
7699 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7700
7701 /* Get the correct slp vectorized stmt. */
7702 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
7703 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
7704 vec_lhs = gimple_phi_result (phi);
7705 else
7706 vec_lhs = gimple_get_lhs (vec_stmt);
7707
7708 /* Get entry to use. */
7709 bitstart = bitsize_int (vec_index);
7710 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7711 }
7712 else
7713 {
7714 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7715 vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
7716 gcc_checking_assert (ncopies == 1
7717 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7718
7719 /* For multiple copies, get the last copy. */
7720 for (int i = 1; i < ncopies; ++i)
7721 vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
7722
7723 /* Get the last lane in the vector. */
7724 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7725 }
7726
7727 gimple_seq stmts = NULL;
7728 tree new_tree;
7729 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7730 {
7731 /* Emit:
7732
7733 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
7734
7735 where VEC_LHS is the vectorized live-out result and MASK is
7736 the loop mask for the final iteration. */
7737 gcc_assert (ncopies == 1 && !slp_node);
7738 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
7739 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
7740 1, vectype, 0);
7741 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
7742 scalar_type, mask, vec_lhs);
7743
7744 /* Convert the extracted vector element to the required scalar type. */
7745 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
7746 }
7747 else
7748 {
7749 tree bftype = TREE_TYPE (vectype);
7750 if (VECTOR_BOOLEAN_TYPE_P (vectype))
7751 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7752 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
7753 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
7754 &stmts, true, NULL_TREE);
7755 }
7756
7757 if (stmts)
7758 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
7759
7760 /* Replace uses of lhs with the newly computed result. If the use stmt is a
7761 single-arg PHI, just replace all uses of the PHI result. This is necessary
7762 because the lcssa PHI defining lhs may appear before the newly inserted stmt. */
7763 use_operand_p use_p;
7764 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
7765 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
7766 && !is_gimple_debug (use_stmt))
7767 {
7768 if (gimple_code (use_stmt) == GIMPLE_PHI
7769 && gimple_phi_num_args (use_stmt) == 1)
7770 {
7771 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
7772 }
7773 else
7774 {
7775 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
7776 SET_USE (use_p, new_tree);
7777 }
7778 update_stmt (use_stmt);
7779 }
7780
7781 return true;
7782 }
7783
7784 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
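/* As a rough illustration (names invented for this example): a debug bind
   outside the loop such as

     # DEBUG x => x_5

   where x_5 is defined inside the vectorized loop has its bound value
   reset, since the scalar definition of x_5 may no longer exist after
   vectorization and keeping the bind would give wrong debug info.  */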
7785
7786 static void
7787 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
7788 {
7789 ssa_op_iter op_iter;
7790 imm_use_iterator imm_iter;
7791 def_operand_p def_p;
7792 gimple *ustmt;
7793
7794 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
7795 {
7796 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
7797 {
7798 basic_block bb;
7799
7800 if (!is_gimple_debug (ustmt))
7801 continue;
7802
7803 bb = gimple_bb (ustmt);
7804
7805 if (!flow_bb_inside_loop_p (loop, bb))
7806 {
7807 if (gimple_debug_bind_p (ustmt))
7808 {
7809 if (dump_enabled_p ())
7810 dump_printf_loc (MSG_NOTE, vect_location,
7811 "killing debug use\n");
7812
7813 gimple_debug_bind_reset_value (ustmt);
7814 update_stmt (ustmt);
7815 }
7816 else
7817 gcc_unreachable ();
7818 }
7819 }
7820 }
7821 }
7822
7823 /* Given loop represented by LOOP_VINFO, return true if computation of
7824 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
7825 otherwise. */
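/* For example (a worked case, not an additional rule): NITERS = NITERSM1 + 1
   overflows only when NITERSM1 is the maximum value of its type, e.g. a
   32-bit unsigned NITERSM1 of 0xffffffff wrapping NITERS to 0; the checks
   below rule this out either from the constant values themselves or from
   the upper bound on the loop iteration count.  */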
7826
7827 static bool
7828 loop_niters_no_overflow (loop_vec_info loop_vinfo)
7829 {
7830 /* Constant case. */
7831 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7832 {
7833 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
7834 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
7835
7836 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
7837 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
7838 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
7839 return true;
7840 }
7841
7842 widest_int max;
7843 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7844 /* Check the upper bound of loop niters. */
7845 if (get_max_loop_iterations (loop, &max))
7846 {
7847 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
7848 signop sgn = TYPE_SIGN (type);
7849 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
7850 if (max < type_max)
7851 return true;
7852 }
7853 return false;
7854 }
7855
7856 /* Return a mask type with half the number of elements as TYPE. */
7857
7858 tree
7859 vect_halve_mask_nunits (tree type)
7860 {
7861 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
7862 return build_truth_vector_type (nunits, current_vector_size);
7863 }
7864
7865 /* Return a mask type with twice as many elements as TYPE. */
7866
7867 tree
7868 vect_double_mask_nunits (tree type)
7869 {
7870 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
7871 return build_truth_vector_type (nunits, current_vector_size);
7872 }
7873
7874 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
7875 contain a sequence of NVECTORS masks that each control a vector of type
7876 VECTYPE. */
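/* As a rough illustration (numbers chosen only for the example): with a
   vectorization factor of 8, recording NVECTORS = 2 masks for a 4-element
   VECTYPE uses rgroup (*masks)[1] and gives

     nscalars_per_iter = 2 * 4 / 8 = 1

   so max_nscalars_per_iter and mask_type are only updated if a later
   statement mapped to the same rgroup needs more scalars per iteration.  */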
7877
7878 void
7879 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
7880 unsigned int nvectors, tree vectype)
7881 {
7882 gcc_assert (nvectors != 0);
7883 if (masks->length () < nvectors)
7884 masks->safe_grow_cleared (nvectors);
7885 rgroup_masks *rgm = &(*masks)[nvectors - 1];
7886 /* The number of scalars per iteration and the number of vectors are
7887 both compile-time constants. */
7888 unsigned int nscalars_per_iter
7889 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
7890 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
7891 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
7892 {
7893 rgm->max_nscalars_per_iter = nscalars_per_iter;
7894 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
7895 }
7896 }
7897
7898 /* Given a complete set of masks MASKS, extract mask number INDEX
7899 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
7900 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
7901
7902 See the comment above vec_loop_masks for more details about the mask
7903 arrangement. */
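/* As a rough illustration (numbers chosen only for the example): if the
   rgroup's mask type was recorded for 8-element vectors but the caller asks
   for a mask controlling a 4-element VECTYPE, each adjacent pair of mask
   elements is known to be all-zero or all-one, so the stored mask is
   view-converted to a 4-element mask type before being returned, as the
   code below does.  */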
7904
7905 tree
7906 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
7907 unsigned int nvectors, tree vectype, unsigned int index)
7908 {
7909 rgroup_masks *rgm = &(*masks)[nvectors - 1];
7910 tree mask_type = rgm->mask_type;
7911
7912 /* Populate the rgroup's mask array, if this is the first time we've
7913 used it. */
7914 if (rgm->masks.is_empty ())
7915 {
7916 rgm->masks.safe_grow_cleared (nvectors);
7917 for (unsigned int i = 0; i < nvectors; ++i)
7918 {
7919 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
7920 /* Provide a dummy definition until the real one is available. */
7921 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
7922 rgm->masks[i] = mask;
7923 }
7924 }
7925
7926 tree mask = rgm->masks[index];
7927 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
7928 TYPE_VECTOR_SUBPARTS (vectype)))
7929 {
7930 /* A loop mask for data type X can be reused for data type Y
7931 if X has N times more elements than Y and if Y's elements
7932 are N times bigger than X's. In this case each sequence
7933 of N elements in the loop mask will be all-zero or all-one.
7934 We can then view-convert the mask so that each sequence of
7935 N elements is replaced by a single element. */
7936 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
7937 TYPE_VECTOR_SUBPARTS (vectype)));
7938 gimple_seq seq = NULL;
7939 mask_type = build_same_sized_truth_vector_type (vectype);
7940 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
7941 if (seq)
7942 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
7943 }
7944 return mask;
7945 }
7946
7947 /* Scale profiling counters by estimation for LOOP which is vectorized
7948 by factor VF. */
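/* For example (a worked case, not an additional rule): for a loop expected
   to run about 64 iterations that is vectorized with VF = 4, the new
   iteration estimate is roughly 16, so the exit edge probability is scaled
   to about 1/(16 + 1) and the body frequencies are reduced accordingly.  */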
7949
7950 static void
7951 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
7952 {
7953 edge preheader = loop_preheader_edge (loop);
7954 /* Reduce loop iterations by the vectorization factor. */
7955 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
7956 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
7957
7958 if (freq_h.nonzero_p ())
7959 {
7960 profile_probability p;
7961
7962 /* Avoid dropping loop body profile counter to 0 because of zero count
7963 in loop's preheader. */
7964 if (!(freq_e == profile_count::zero ()))
7965 freq_e = freq_e.force_nonzero ();
7966 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
7967 scale_loop_frequencies (loop, p);
7968 }
7969
7970 edge exit_e = single_exit (loop);
7971 exit_e->probability = profile_probability::always ()
7972 .apply_scale (1, new_est_niter + 1);
7973
7974 edge exit_l = single_pred_edge (loop->latch);
7975 profile_probability prob = exit_l->probability;
7976 exit_l->probability = exit_e->probability.invert ();
7977 if (prob.initialized_p () && exit_l->probability.initialized_p ())
7978 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
7979 }
7980
7981 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
7982 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
7983 stmt_vec_info. */
7984
7985 static void
7986 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
7987 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
7988 {
7989 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7990 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7991
7992 if (dump_enabled_p ())
7993 dump_printf_loc (MSG_NOTE, vect_location,
7994 "------>vectorizing statement: %G", stmt_info->stmt);
7995
7996 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
7997 vect_loop_kill_debug_uses (loop, stmt_info);
7998
7999 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8000 && !STMT_VINFO_LIVE_P (stmt_info))
8001 return;
8002
8003 if (STMT_VINFO_VECTYPE (stmt_info))
8004 {
8005 poly_uint64 nunits
8006 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8007 if (!STMT_SLP_TYPE (stmt_info)
8008 && maybe_ne (nunits, vf)
8009 && dump_enabled_p ())
8010 /* For SLP, VF is set according to the unrolling factor and not
8011 to the vector size, hence for SLP this print is not valid. */
8012 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8013 }
8014
8015 /* Pure SLP statements have already been vectorized. We still need
8016 to apply loop vectorization to hybrid SLP statements. */
8017 if (PURE_SLP_STMT (stmt_info))
8018 return;
8019
8020 if (dump_enabled_p ())
8021 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8022
8023 if (vect_transform_stmt (stmt_info, gsi, NULL, NULL))
8024 *seen_store = stmt_info;
8025 }
8026
8027 /* Function vect_transform_loop.
8028
8029 The analysis phase has determined that the loop is vectorizable.
8030 Vectorize the loop - create vectorized stmts to replace the scalar
8031 stmts in the loop, and update the loop exit condition.
8032 Returns the scalar epilogue loop, if any. */
8033
8034 class loop *
8035 vect_transform_loop (loop_vec_info loop_vinfo)
8036 {
8037 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8038 class loop *epilogue = NULL;
8039 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8040 int nbbs = loop->num_nodes;
8041 int i;
8042 tree niters_vector = NULL_TREE;
8043 tree step_vector = NULL_TREE;
8044 tree niters_vector_mult_vf = NULL_TREE;
8045 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8046 unsigned int lowest_vf = constant_lower_bound (vf);
8047 gimple *stmt;
8048 bool check_profitability = false;
8049 unsigned int th;
8050
8051 DUMP_VECT_SCOPE ("vec_transform_loop");
8052
8053 loop_vinfo->shared->check_datarefs ();
8054
8055 /* Use the more conservative vectorization threshold. If the number
8056 of iterations is constant, assume the cost check has been performed
8057 by our caller. If the threshold makes all loops profitable that
8058 run at least the (estimated) vectorization factor number of times,
8059 checking is pointless, too. */
8060 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8061 if (th >= vect_vf_for_cost (loop_vinfo)
8062 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8063 {
8064 if (dump_enabled_p ())
8065 dump_printf_loc (MSG_NOTE, vect_location,
8066 "Profitability threshold is %d loop iterations.\n",
8067 th);
8068 check_profitability = true;
8069 }
8070
8071 /* Make sure there exists a single-predecessor exit bb. Do this before
8072 versioning. */
8073 edge e = single_exit (loop);
8074 if (! single_pred_p (e->dest))
8075 {
8076 split_loop_exit_edge (e, true);
8077 if (dump_enabled_p ())
8078 dump_printf (MSG_NOTE, "split exit edge\n");
8079 }
8080
8081 /* Version the loop first, if required, so the profitability check
8082 comes first. */
8083
8084 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8085 {
8086 poly_uint64 versioning_threshold
8087 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8088 if (check_profitability
8089 && ordered_p (poly_uint64 (th), versioning_threshold))
8090 {
8091 versioning_threshold = ordered_max (poly_uint64 (th),
8092 versioning_threshold);
8093 check_profitability = false;
8094 }
8095 class loop *sloop
8096 = vect_loop_versioning (loop_vinfo, th, check_profitability,
8097 versioning_threshold);
8098 sloop->force_vectorize = false;
8099 check_profitability = false;
8100 }
8101
8102 /* Make sure there exists a single-predecessor exit bb also on the
8103 scalar loop copy. Do this after versioning but before peeling
8104 so the CFG structure is fine for both the scalar and the if-converted
8105 loop, and slpeel_duplicate_current_defs_from_edges sees matched
8106 loop-closed PHI nodes on the exit. */
8107 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8108 {
8109 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8110 if (! single_pred_p (e->dest))
8111 {
8112 split_loop_exit_edge (e, true);
8113 if (dump_enabled_p ())
8114 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8115 }
8116 }
8117
8118 tree niters = vect_build_loop_niters (loop_vinfo);
8119 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8120 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8121 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8122 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8123 &step_vector, &niters_vector_mult_vf, th,
8124 check_profitability, niters_no_overflow);
8125 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
8126 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
8127 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
8128 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
8129
8130 if (niters_vector == NULL_TREE)
8131 {
8132 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8133 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8134 && known_eq (lowest_vf, vf))
8135 {
8136 niters_vector
8137 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8138 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8139 step_vector = build_one_cst (TREE_TYPE (niters));
8140 }
8141 else
8142 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8143 &step_vector, niters_no_overflow);
8144 }
8145
8146 /* 1) Make sure the loop header has exactly two entries
8147 2) Make sure we have a preheader basic block. */
8148
8149 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8150
8151 split_edge (loop_preheader_edge (loop));
8152
8153 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8154 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8155 /* This will deal with any possible peeling. */
8156 vect_prepare_for_masked_peels (loop_vinfo);
8157
8158 /* Schedule the SLP instances first, then handle loop vectorization
8159 below. */
8160 if (!loop_vinfo->slp_instances.is_empty ())
8161 {
8162 DUMP_VECT_SCOPE ("scheduling SLP instances");
8163 vect_schedule_slp (loop_vinfo);
8164 }
8165
8166 /* FORNOW: the vectorizer supports only loops whose body consists
8167 of one basic block (header + empty latch). When the vectorizer
8168 supports more involved loop forms, the order in which the BBs are
8169 traversed will need to be reconsidered. */
8170
8171 for (i = 0; i < nbbs; i++)
8172 {
8173 basic_block bb = bbs[i];
8174 stmt_vec_info stmt_info;
8175
8176 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8177 gsi_next (&si))
8178 {
8179 gphi *phi = si.phi ();
8180 if (dump_enabled_p ())
8181 dump_printf_loc (MSG_NOTE, vect_location,
8182 "------>vectorizing phi: %G", phi);
8183 stmt_info = loop_vinfo->lookup_stmt (phi);
8184 if (!stmt_info)
8185 continue;
8186
8187 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8188 vect_loop_kill_debug_uses (loop, stmt_info);
8189
8190 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8191 && !STMT_VINFO_LIVE_P (stmt_info))
8192 continue;
8193
8194 if (STMT_VINFO_VECTYPE (stmt_info)
8195 && (maybe_ne
8196 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8197 && dump_enabled_p ())
8198 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8199
8200 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8201 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8202 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
8203 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
8204 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
8205 && ! PURE_SLP_STMT (stmt_info))
8206 {
8207 if (dump_enabled_p ())
8208 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8209 vect_transform_stmt (stmt_info, NULL, NULL, NULL);
8210 }
8211 }
8212
8213 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8214 !gsi_end_p (si);)
8215 {
8216 stmt = gsi_stmt (si);
8217 /* During vectorization remove existing clobber stmts. */
8218 if (gimple_clobber_p (stmt))
8219 {
8220 unlink_stmt_vdef (stmt);
8221 gsi_remove (&si, true);
8222 release_defs (stmt);
8223 }
8224 else
8225 {
8226 stmt_info = loop_vinfo->lookup_stmt (stmt);
8227
8228 /* vector stmts created in the outer-loop during vectorization of
8229 stmts in an inner-loop may not have a stmt_info, and do not
8230 need to be vectorized. */
8231 stmt_vec_info seen_store = NULL;
8232 if (stmt_info)
8233 {
8234 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8235 {
8236 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8237 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8238 !gsi_end_p (subsi); gsi_next (&subsi))
8239 {
8240 stmt_vec_info pat_stmt_info
8241 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
8242 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
8243 &si, &seen_store);
8244 }
8245 stmt_vec_info pat_stmt_info
8246 = STMT_VINFO_RELATED_STMT (stmt_info);
8247 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
8248 &seen_store);
8249 }
8250 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
8251 &seen_store);
8252 }
8253 gsi_next (&si);
8254 if (seen_store)
8255 {
8256 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8257 /* Interleaving. The vectorization of the
8258 interleaving chain was completed; free all
8259 the stores in the chain. */
8260 vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8261 else
8262 /* Free the attached stmt_vec_info and remove the stmt. */
8263 loop_vinfo->remove_stmt (stmt_info);
8264 }
8265 }
8266 }
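/* Editorial illustration (an assumed example): when four consecutive
   scalar stores a[4*i] ... a[4*i+3] form one interleaving group and have
   been replaced by a single vector store, vect_remove_stores above
   deletes all four scalar group members; a store that is not part of a
   group is simply removed together with its stmt_vec_info.  */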
8267
8268 /* Stub out scalar statements that must not survive vectorization.
8269 Doing this here helps with grouped statements, or statements that
8270 are involved in patterns. */
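/* Editorial illustration (an assumed example): a scalar load left over
   from if-conversion such as

       _5 = .MASK_LOAD (&a[i], 32B, _mask_7);

   whose value has already been provided by the vector code is replaced
   below by

       _5 = 0;

   so it becomes trivially dead and is cleaned up by later passes.  */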
8271 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8272 !gsi_end_p (gsi); gsi_next (&gsi))
8273 {
8274 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8275 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8276 {
8277 tree lhs = gimple_get_lhs (call);
8278 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8279 {
8280 tree zero = build_zero_cst (TREE_TYPE (lhs));
8281 gimple *new_stmt = gimple_build_assign (lhs, zero);
8282 gsi_replace (&gsi, new_stmt, true);
8283 }
8284 }
8285 }
8286 } /* BBs in loop */
8287
8288 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8289 a zero NITERS becomes a nonzero NITERS_VECTOR. */
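/* Editorial illustration (an assumed example): with a 32-bit unsigned
   NITERS that wrapped around to zero (i.e. 2^32 scalar iterations) and
   VF = 8, NITERS_VECTOR is 2^29, which is nonzero and representable, so
   an IV starting at 0 and stepping by 1 reaches it without wrapping.  */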
8290 if (integer_onep (step_vector))
8291 niters_no_overflow = true;
8292 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8293 niters_vector_mult_vf, !niters_no_overflow);
8294
8295 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8296 scale_profile_for_vect_loop (loop, assumed_vf);
8297
8298 /* True if the final iteration might not handle a full vector's
8299 worth of scalar iterations. */
8300 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8301 /* The minimum number of iterations performed by the epilogue. This
8302 is 1 when peeling for gaps because we always need a final scalar
8303 iteration. */
8304 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8305 /* +1 to convert latch counts to loop iteration counts,
8306 -min_epilogue_iters to remove iterations that cannot be performed
8307 by the vector code. */
8308 int bias_for_lowest = 1 - min_epilogue_iters;
8309 int bias_for_assumed = bias_for_lowest;
8310 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8311 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8312 {
8313 /* When the amount of peeling is known at compile time, the first
8314 iteration will have exactly alignment_npeels active elements.
8315 In the worst case it will have at least one. */
8316 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8317 bias_for_lowest += lowest_vf - min_first_active;
8318 bias_for_assumed += assumed_vf - min_first_active;
8319 }
8320 /* In these calculations the "- 1" converts loop iteration counts
8321 back to latch counts. */
8322 if (loop->any_upper_bound)
8323 loop->nb_iterations_upper_bound
8324 = (final_iter_may_be_partial
8325 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8326 lowest_vf) - 1
8327 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8328 lowest_vf) - 1);
8329 if (loop->any_likely_upper_bound)
8330 loop->nb_iterations_likely_upper_bound
8331 = (final_iter_may_be_partial
8332 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8333 + bias_for_lowest, lowest_vf) - 1
8334 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8335 + bias_for_lowest, lowest_vf) - 1);
8336 if (loop->any_estimate)
8337 loop->nb_iterations_estimate
8338 = (final_iter_may_be_partial
8339 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8340 assumed_vf) - 1
8341 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8342 assumed_vf) - 1);
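/* Worked example (editorial addition, numbers are assumed): for a loop
   whose latch count upper bound is 99 (at most 100 iterations), with
   lowest_vf = assumed_vf = 4, no peeling for gaps and no masking,
   bias_for_lowest is 1 and the new upper bound is
   floor ((99 + 1) / 4) - 1 = 24, i.e. at most 25 vector iterations.
   In the fully-masked case udiv_ceil is used instead, because the final
   vector iteration may cover only part of a vector's worth of scalars.  */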
8343
8344 if (dump_enabled_p ())
8345 {
8346 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8347 {
8348 dump_printf_loc (MSG_NOTE, vect_location,
8349 "LOOP VECTORIZED\n");
8350 if (loop->inner)
8351 dump_printf_loc (MSG_NOTE, vect_location,
8352 "OUTER LOOP VECTORIZED\n");
8353 dump_printf (MSG_NOTE, "\n");
8354 }
8355 else
8356 {
8357 dump_printf_loc (MSG_NOTE, vect_location,
8358 "LOOP EPILOGUE VECTORIZED (VS=");
8359 dump_dec (MSG_NOTE, current_vector_size);
8360 dump_printf (MSG_NOTE, ")\n");
8361 }
8362 }
8363
8364 /* Loops vectorized with a variable factor won't benefit from
8365 unrolling/peeling. */
8366 if (!vf.is_constant ())
8367 {
8368 loop->unroll = 1;
8369 if (dump_enabled_p ())
8370 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
8371 " variable-length vectorization factor\n");
8372 }
8373 /* Free SLP instances here because otherwise stmt reference counting
8374 won't work. */
8375 slp_instance instance;
8376 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8377 vect_free_slp_instance (instance, true);
8378 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8379 /* Clear the safelen field since its value is invalid after vectorization,
8380 as the vectorized loop can have loop-carried dependencies. */
8381 loop->safelen = 0;
8382
8383 /* Don't vectorize the epilogue of an epilogue loop. */
8384 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8385 epilogue = NULL;
8386
8387 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8388 epilogue = NULL;
8389
8390 if (epilogue)
8391 {
8392 auto_vector_sizes vector_sizes;
8393 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes, false);
8394 unsigned int next_size = 0;
8395
8396 /* Note LOOP_VINFO_NITERS_KNOWN_P and LOOP_VINFO_INT_NITERS work
8397 on niters already adjusted for the iterations of the prologue. */
8398 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8399 && known_eq (vf, lowest_vf))
8400 {
8401 unsigned HOST_WIDE_INT eiters
8402 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8403 - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
8404 eiters
8405 = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
8406 epilogue->nb_iterations_upper_bound = eiters - 1;
8407 epilogue->any_upper_bound = true;
8408
8409 unsigned int ratio;
8410 while (next_size < vector_sizes.length ()
8411 && !(constant_multiple_p (current_vector_size,
8412 vector_sizes[next_size], &ratio)
8413 && eiters >= lowest_vf / ratio))
8414 next_size += 1;
8415 }
8416 else
8417 while (next_size < vector_sizes.length ()
8418 && maybe_lt (current_vector_size, vector_sizes[next_size]))
8419 next_size += 1;
8420
8421 if (next_size == vector_sizes.length ())
8422 epilogue = NULL;
8423 }
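/* Editorial illustration (assuming a target that offers 64-, 32- and
   16-byte vectors, listed in that order, with current_vector_size of 64
   bytes): with NITERS = 103, lowest_vf = 16 and no peeling for gaps,
   eiters = 103 % 16 = 7. The 64-byte size (ratio 1, needs at least 16
   scalar iterations) and the 32-byte size (ratio 2, needs at least 8)
   are skipped, so a 16-byte epilogue (ratio 4, needs at least 4) remains
   a candidate.  */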
8424
8425 if (epilogue)
8426 {
8427 epilogue->force_vectorize = loop->force_vectorize;
8428 epilogue->safelen = loop->safelen;
8429 epilogue->dont_vectorize = false;
8430
8431 /* We may need to if-convert epilogue to vectorize it. */
8432 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8433 tree_if_conversion (epilogue);
8434 }
8435
8436 return epilogue;
8437 }
8438
8439 /* The code below performs a simple optimization: it reverts if-conversion
8440 for masked stores, i.e. if the mask of a store is zero, the store is not
8441 performed, nor, where possible, are the producers of the stored values.
8442 For example,
8443 for (i=0; i<n; i++)
8444 if (c[i])
8445 {
8446 p1[i] += 1;
8447 p2[i] = p3[i] +2;
8448 }
8449 this transformation will produce the following semi-hammock:
8450
8451 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
8452 {
8453 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8454 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8455 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8456 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8457 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8458 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8459 }
8460 */
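/* The code below achieves this by sinking each masked store, and where
   safe the producers of its stored values, into a new basic block that
   is entered only when the mask is not all-zero. As a rough editorial
   sketch of the resulting CFG (not from the original sources):

       bb:        if (mask == { 0, ..., 0 }) goto join_bb; else goto store_bb;
       store_bb:  MASK_STOREs and sunk value producers
       join_bb:   PHI merging the virtual operand from both paths  */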
8461
8462 void
8463 optimize_mask_stores (class loop *loop)
8464 {
8465 basic_block *bbs = get_loop_body (loop);
8466 unsigned nbbs = loop->num_nodes;
8467 unsigned i;
8468 basic_block bb;
8469 class loop *bb_loop;
8470 gimple_stmt_iterator gsi;
8471 gimple *stmt;
8472 auto_vec<gimple *> worklist;
8473 auto_purge_vect_location sentinel;
8474
8475 vect_location = find_loop_location (loop);
8476 /* Pick up all masked stores in the loop, if any. */
8477 for (i = 0; i < nbbs; i++)
8478 {
8479 bb = bbs[i];
8480 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8481 gsi_next (&gsi))
8482 {
8483 stmt = gsi_stmt (gsi);
8484 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8485 worklist.safe_push (stmt);
8486 }
8487 }
8488
8489 free (bbs);
8490 if (worklist.is_empty ())
8491 return;
8492
8493 /* Loop has masked stores. */
8494 while (!worklist.is_empty ())
8495 {
8496 gimple *last, *last_store;
8497 edge e, efalse;
8498 tree mask;
8499 basic_block store_bb, join_bb;
8500 gimple_stmt_iterator gsi_to;
8501 tree vdef, new_vdef;
8502 gphi *phi;
8503 tree vectype;
8504 tree zero;
8505
8506 last = worklist.pop ();
8507 mask = gimple_call_arg (last, 2);
8508 bb = gimple_bb (last);
8509 /* Create then_bb and an if-then structure in the CFG; then_bb belongs
8510 to the same loop as if_bb. It can differ from LOOP when a two-level
8511 loop nest is vectorized and the mask_store belongs to the inner
8512 one. */
8513 e = split_block (bb, last);
8514 bb_loop = bb->loop_father;
8515 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8516 join_bb = e->dest;
8517 store_bb = create_empty_bb (bb);
8518 add_bb_to_loop (store_bb, bb_loop);
8519 e->flags = EDGE_TRUE_VALUE;
8520 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8521 /* Put STORE_BB to likely part. */
8522 efalse->probability = profile_probability::unlikely ();
8523 store_bb->count = efalse->count ();
8524 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8525 if (dom_info_available_p (CDI_DOMINATORS))
8526 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8527 if (dump_enabled_p ())
8528 dump_printf_loc (MSG_NOTE, vect_location,
8529 "Create new block %d to sink mask stores.",
8530 store_bb->index);
8531 /* Create vector comparison with boolean result. */
8532 vectype = TREE_TYPE (mask);
8533 zero = build_zero_cst (vectype);
8534 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8535 gsi = gsi_last_bb (bb);
8536 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
8537 /* Create new PHI node for vdef of the last masked store:
8538 .MEM_2 = VDEF <.MEM_1>
8539 will be converted to
8540 .MEM_3 = VDEF <.MEM_1>
8541 and new PHI node will be created in join bb
8542 .MEM_2 = PHI <.MEM_1, .MEM_3>
8543 */
8544 vdef = gimple_vdef (last);
8545 new_vdef = make_ssa_name (gimple_vop (cfun), last);
8546 gimple_set_vdef (last, new_vdef);
8547 phi = create_phi_node (vdef, join_bb);
8548 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8549
8550 /* Put all masked stores with the same mask into STORE_BB if possible. */
8551 while (true)
8552 {
8553 gimple_stmt_iterator gsi_from;
8554 gimple *stmt1 = NULL;
8555
8556 /* Move masked store to STORE_BB. */
8557 last_store = last;
8558 gsi = gsi_for_stmt (last);
8559 gsi_from = gsi;
8560 /* Shift GSI to the previous stmt for further traversal. */
8561 gsi_prev (&gsi);
8562 gsi_to = gsi_start_bb (store_bb);
8563 gsi_move_before (&gsi_from, &gsi_to);
8564 /* Set GSI_TO to the start of the now non-empty block. */
8565 gsi_to = gsi_start_bb (store_bb);
8566 if (dump_enabled_p ())
8567 dump_printf_loc (MSG_NOTE, vect_location,
8568 "Move stmt to created bb\n%G", last);
8569 /* Move all stored value producers if possible. */
8570 while (!gsi_end_p (gsi))
8571 {
8572 tree lhs;
8573 imm_use_iterator imm_iter;
8574 use_operand_p use_p;
8575 bool res;
8576
8577 /* Skip debug statements. */
8578 if (is_gimple_debug (gsi_stmt (gsi)))
8579 {
8580 gsi_prev (&gsi);
8581 continue;
8582 }
8583 stmt1 = gsi_stmt (gsi);
8584 /* Do not consider statements writing to memory or having
8585 a volatile operand. */
8586 if (gimple_vdef (stmt1)
8587 || gimple_has_volatile_ops (stmt1))
8588 break;
8589 gsi_from = gsi;
8590 gsi_prev (&gsi);
8591 lhs = gimple_get_lhs (stmt1);
8592 if (!lhs)
8593 break;
8594
8595 /* LHS of vectorized stmt must be SSA_NAME. */
8596 if (TREE_CODE (lhs) != SSA_NAME)
8597 break;
8598
8599 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8600 {
8601 /* Remove dead scalar statement. */
8602 if (has_zero_uses (lhs))
8603 {
8604 gsi_remove (&gsi_from, true);
8605 continue;
8606 }
8607 }
8608
8609 /* Check that LHS does not have uses outside of STORE_BB. */
8610 res = true;
8611 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
8612 {
8613 gimple *use_stmt;
8614 use_stmt = USE_STMT (use_p);
8615 if (is_gimple_debug (use_stmt))
8616 continue;
8617 if (gimple_bb (use_stmt) != store_bb)
8618 {
8619 res = false;
8620 break;
8621 }
8622 }
8623 if (!res)
8624 break;
8625
8626 if (gimple_vuse (stmt1)
8627 && gimple_vuse (stmt1) != gimple_vuse (last_store))
8628 break;
8629
8630 /* Can move STMT1 to STORE_BB. */
8631 if (dump_enabled_p ())
8632 dump_printf_loc (MSG_NOTE, vect_location,
8633 "Move stmt to created bb\n%G", stmt1);
8634 gsi_move_before (&gsi_from, &gsi_to);
8635 /* Shift GSI_TO for further insertion. */
8636 gsi_prev (&gsi_to);
8637 }
8638 /* Put other masked stores with the same mask into STORE_BB. */
8639 if (worklist.is_empty ()
8640 || gimple_call_arg (worklist.last (), 2) != mask
8641 || worklist.last () != stmt1)
8642 break;
8643 last = worklist.pop ();
8644 }
8645 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
8646 }
8647 }
8648
8649 /* Decide whether it is possible to use a zero-based induction variable
8650 when vectorizing LOOP_VINFO with a fully-masked loop. If it is,
8651 return the value that the induction variable must be able to hold
8652 in order to ensure that the loop ends with an all-false mask.
8653 Return -1 otherwise. */
8654 widest_int
8655 vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo)
8656 {
8657 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
8658 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8659 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
8660
8661 /* Calculate the value that the induction variable must be able
8662 to hit in order to ensure that we end the loop with an all-false mask.
8663 This involves adding the maximum number of inactive trailing scalar
8664 iterations. */
8665 widest_int iv_limit = -1;
8666 if (max_loop_iterations (loop, &iv_limit))
8667 {
8668 if (niters_skip)
8669 {
8670 /* Add the maximum number of skipped iterations to the
8671 maximum iteration count. */
8672 if (TREE_CODE (niters_skip) == INTEGER_CST)
8673 iv_limit += wi::to_widest (niters_skip);
8674 else
8675 iv_limit += max_vf - 1;
8676 }
8677 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
8678 /* Make a conservatively-correct assumption. */
8679 iv_limit += max_vf - 1;
8680
8681 /* IV_LIMIT is the maximum number of latch iterations, which is also
8682 the maximum in-range IV value. Round this value down to the previous
8683 vector alignment boundary and then add an extra full iteration. */
8684 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8685 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
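/* Worked example (editorial addition, numbers are assumed): with a
   maximum latch count of 17, VF = max_vf = 8 and no skipped or peeled
   iterations, this yields (17 & -8) + 8 = 16 + 8 = 24, the smallest
   multiple of VF that is guaranteed to yield an all-false mask.  */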
8686 }
8687 return iv_limit;
8688 }
8689