1 /* Loop Vectorization
2 Copyright (C) 2003-2019 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57
58 /* Loop Vectorization Pass.
59
60 This pass tries to vectorize loops.
61
62 For example, the vectorizer transforms the following simple loop:
63
64 short a[N]; short b[N]; short c[N]; int i;
65
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
68 }
69
70 as if it were manually vectorized by rewriting the source code into:
71
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
76
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
82 }
83
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
95
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
101
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
106
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
117
118 For example, say stmt S1 was vectorized into stmt VS1:
119
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
123
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
128
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
133
134 Operands that are not SSA_NAMEs are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
136
137 Target modeling:
138 =================
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that can support different sizes of vectors will, for now, need
142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
144
145 Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
151
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
154 */
155
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
157 static stmt_vec_info vect_is_simple_reduction (loop_vec_info, stmt_vec_info,
158 bool *);
159
160 /* Subroutine of vect_determine_vf_for_stmt that handles only one
161 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
162 may already be set for general statements (not just data refs). */
163
164 static opt_result
165 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
166 bool vectype_maybe_set_p,
167 poly_uint64 *vf,
168 vec<stmt_vec_info > *mask_producers)
169 {
170 gimple *stmt = stmt_info->stmt;
171
172 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
173 && !STMT_VINFO_LIVE_P (stmt_info))
174 || gimple_clobber_p (stmt))
175 {
176 if (dump_enabled_p ())
177 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
178 return opt_result::success ();
179 }
180
181 tree stmt_vectype, nunits_vectype;
182 opt_result res = vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
183 &nunits_vectype);
184 if (!res)
185 return res;
186
187 if (stmt_vectype)
188 {
189 if (STMT_VINFO_VECTYPE (stmt_info))
190 /* The only case when a vectype had been already set is for stmts
191 that contain a data ref, or for "pattern-stmts" (stmts generated
192 by the vectorizer to represent/replace a certain idiom). */
193 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
194 || vectype_maybe_set_p)
195 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
196 else if (stmt_vectype == boolean_type_node)
197 mask_producers->safe_push (stmt_info);
198 else
199 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
200 }
201
202 if (nunits_vectype)
203 vect_update_max_nunits (vf, nunits_vectype);
204
205 return opt_result::success ();
206 }
207
208 /* Subroutine of vect_determine_vectorization_factor. Set the vector
209 types of STMT_INFO and all attached pattern statements and update
210 the vectorization factor VF accordingly. If some of the statements
211 produce a mask result whose vector type can only be calculated later,
212 add them to MASK_PRODUCERS. Return true on success or false if
213 something prevented vectorization. */
214
215 static opt_result
216 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
217 vec<stmt_vec_info > *mask_producers)
218 {
219 vec_info *vinfo = stmt_info->vinfo;
220 if (dump_enabled_p ())
221 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
222 stmt_info->stmt);
223 opt_result res
224 = vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers);
225 if (!res)
226 return res;
227
228 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
229 && STMT_VINFO_RELATED_STMT (stmt_info))
230 {
231 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
232 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
233
234 /* If a pattern statement has def stmts, analyze them too. */
235 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
236 !gsi_end_p (si); gsi_next (&si))
237 {
238 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
239 if (dump_enabled_p ())
240 dump_printf_loc (MSG_NOTE, vect_location,
241 "==> examining pattern def stmt: %G",
242 def_stmt_info->stmt);
245 res = vect_determine_vf_for_stmt_1 (def_stmt_info, true,
246 vf, mask_producers);
247 if (!res)
248 return res;
249 }
250
251 if (dump_enabled_p ())
252 dump_printf_loc (MSG_NOTE, vect_location,
253 "==> examining pattern statement: %G",
254 stmt_info->stmt);
255 res = vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers);
256 if (!res)
257 return res;
258 }
259
260 return opt_result::success ();
261 }
262
263 /* Function vect_determine_vectorization_factor
264
265 Determine the vectorization factor (VF). VF is the number of data elements
266 that are operated upon in parallel in a single iteration of the vectorized
267 loop. For example, when vectorizing a loop that operates on 4-byte elements,
268 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
269 elements can fit in a single vector register.
270
271 We currently support vectorization of loops in which all types operated upon
272 are of the same size. Therefore this function currently sets VF according to
273 the size of the types operated upon, and fails if there are multiple sizes
274 in the loop.
275
276 VF is also the factor by which the loop iterations are strip-mined, e.g.:
277 original loop:
278 for (i=0; i<N; i++){
279 a[i] = b[i] + c[i];
280 }
281
282 vectorized loop:
283 for (i=0; i<N; i+=VF){
284 a[i:VF] = b[i:VF] + c[i:VF];
285 }
286 */
287
288 static opt_result
289 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
290 {
291 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
292 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
293 unsigned nbbs = loop->num_nodes;
294 poly_uint64 vectorization_factor = 1;
295 tree scalar_type = NULL_TREE;
296 gphi *phi;
297 tree vectype;
298 stmt_vec_info stmt_info;
299 unsigned i;
300 auto_vec<stmt_vec_info> mask_producers;
301
302 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
303
304 for (i = 0; i < nbbs; i++)
305 {
306 basic_block bb = bbs[i];
307
308 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
309 gsi_next (&si))
310 {
311 phi = si.phi ();
312 stmt_info = loop_vinfo->lookup_stmt (phi);
313 if (dump_enabled_p ())
314 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
315 phi);
316
317 gcc_assert (stmt_info);
318
319 if (STMT_VINFO_RELEVANT_P (stmt_info)
320 || STMT_VINFO_LIVE_P (stmt_info))
321 {
322 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
323 scalar_type = TREE_TYPE (PHI_RESULT (phi));
324
325 if (dump_enabled_p ())
326 dump_printf_loc (MSG_NOTE, vect_location,
327 "get vectype for scalar type: %T\n",
328 scalar_type);
329
330 vectype = get_vectype_for_scalar_type (scalar_type);
331 if (!vectype)
332 return opt_result::failure_at (phi,
333 "not vectorized: unsupported "
334 "data-type %T\n",
335 scalar_type);
336 STMT_VINFO_VECTYPE (stmt_info) = vectype;
337
338 if (dump_enabled_p ())
339 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
340 vectype);
341
342 if (dump_enabled_p ())
343 {
344 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
345 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
346 dump_printf (MSG_NOTE, "\n");
347 }
348
349 vect_update_max_nunits (&vectorization_factor, vectype);
350 }
351 }
352
353 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
354 gsi_next (&si))
355 {
356 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
357 opt_result res
358 = vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
359 &mask_producers);
360 if (!res)
361 return res;
362 }
363 }
364
365 /* TODO: Analyze cost. Decide if worth while to vectorize. */
366 if (dump_enabled_p ())
367 {
368 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
369 dump_dec (MSG_NOTE, vectorization_factor);
370 dump_printf (MSG_NOTE, "\n");
371 }
372
373 if (known_le (vectorization_factor, 1U))
374 return opt_result::failure_at (vect_location,
375 "not vectorized: unsupported data-type\n");
376 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
377
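  /* Stmts that produce a boolean result could not be given a vectype
     above; their vector mask type is computed only now, once the
     vectorization factor and the vector types of the other stmts are
     known.  */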
378 for (i = 0; i < mask_producers.length (); i++)
379 {
380 stmt_info = mask_producers[i];
381 opt_tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
382 if (!mask_type)
383 return opt_result::propagate_failure (mask_type);
384 STMT_VINFO_VECTYPE (stmt_info) = mask_type;
385 }
386
387 return opt_result::success ();
388 }
389
390
391 /* Function vect_is_simple_iv_evolution.
392
393 FORNOW: A simple evolution of an induction variable in the loop is
394 considered a polynomial evolution. */
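
/* For example, given "for (i = 0; i < n; i++)", the access function of i
   computed by scev is the chrec {0, +, 1}_1 (assuming the loop has
   number 1): initial_condition_in_loop_num yields the init 0 and
   evolution_part_in_loop_num yields the step 1.  A step that is itself a
   chrec (a polynomial evolution of degree >= 2) is not "simple" and is
   rejected below.  */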
395
396 static bool
397 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
398 tree * step)
399 {
400 tree init_expr;
401 tree step_expr;
402 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
403 basic_block bb;
404
405 /* When there is no evolution in this loop, the evolution function
406 is not "simple". */
407 if (evolution_part == NULL_TREE)
408 return false;
409
410 /* When the evolution is a polynomial of degree >= 2
411 the evolution function is not "simple". */
412 if (tree_is_chrec (evolution_part))
413 return false;
414
415 step_expr = evolution_part;
416 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
417
418 if (dump_enabled_p ())
419 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
420 step_expr, init_expr);
421
422 *init = init_expr;
423 *step = step_expr;
424
425 if (TREE_CODE (step_expr) != INTEGER_CST
426 && (TREE_CODE (step_expr) != SSA_NAME
427 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
428 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
429 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
430 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
431 || !flag_associative_math)))
432 && (TREE_CODE (step_expr) != REAL_CST
433 || !flag_associative_math))
434 {
435 if (dump_enabled_p ())
436 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
437 "step unknown.\n");
438 return false;
439 }
440
441 return true;
442 }
443
444 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
445 what we are assuming is a double reduction. For example, given
446 a structure like this:
447
448 outer1:
449 x_1 = PHI <x_4(outer2), ...>;
450 ...
451
452 inner:
453 x_2 = PHI <x_1(outer1), ...>;
454 ...
455 x_3 = ...;
456 ...
457
458 outer2:
459 x_4 = PHI <x_3(inner)>;
460 ...
461
462 outer loop analysis would treat x_1 as a double reduction phi and
463 this function would then return true for x_2. */
464
465 static bool
466 vect_inner_phi_in_double_reduction_p (stmt_vec_info stmt_info, gphi *phi)
467 {
468 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
469 use_operand_p use_p;
470 ssa_op_iter op_iter;
471 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
472 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
473 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
474 return true;
475 return false;
476 }
477
478 /* Function vect_analyze_scalar_cycles_1.
479
480 Examine the cross iteration def-use cycles of scalar variables
481 in LOOP. LOOP_VINFO represents the loop that is now being
482 considered for vectorization (can be LOOP, or an outer-loop
483 enclosing LOOP). */
484
485 static void
486 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, class loop *loop)
487 {
488 basic_block bb = loop->header;
489 tree init, step;
490 auto_vec<stmt_vec_info, 64> worklist;
491 gphi_iterator gsi;
492 bool double_reduc;
493
494 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
495
496 /* First - identify all inductions. Reduction detection assumes that all the
497 inductions have been identified, therefore, this order must not be
498 changed. */
499 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
500 {
501 gphi *phi = gsi.phi ();
502 tree access_fn = NULL;
503 tree def = PHI_RESULT (phi);
504 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
505
506 if (dump_enabled_p ())
507 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
508
509 /* Skip virtual phis. The data dependences that are associated with
510 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
511 if (virtual_operand_p (def))
512 continue;
513
514 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
515
516 /* Analyze the evolution function. */
517 access_fn = analyze_scalar_evolution (loop, def);
518 if (access_fn)
519 {
520 STRIP_NOPS (access_fn);
521 if (dump_enabled_p ())
522 dump_printf_loc (MSG_NOTE, vect_location,
523 "Access function of PHI: %T\n", access_fn);
524 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
525 = initial_condition_in_loop_num (access_fn, loop->num);
526 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
527 = evolution_part_in_loop_num (access_fn, loop->num);
528 }
529
530 if (!access_fn
531 || vect_inner_phi_in_double_reduction_p (stmt_vinfo, phi)
532 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
533 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
534 && TREE_CODE (step) != INTEGER_CST))
535 {
536 worklist.safe_push (stmt_vinfo);
537 continue;
538 }
539
540 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
541 != NULL_TREE);
542 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
543
544 if (dump_enabled_p ())
545 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
546 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
547 }
548
549
550 /* Second - identify all reductions and nested cycles. */
551 while (worklist.length () > 0)
552 {
553 stmt_vec_info stmt_vinfo = worklist.pop ();
554 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
555 tree def = PHI_RESULT (phi);
556
557 if (dump_enabled_p ())
558 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
559
560 gcc_assert (!virtual_operand_p (def)
561 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
562
563 stmt_vec_info reduc_stmt_info
564 = vect_is_simple_reduction (loop_vinfo, stmt_vinfo, &double_reduc);
565 if (reduc_stmt_info)
566 {
567 STMT_VINFO_REDUC_DEF (stmt_vinfo) = reduc_stmt_info;
568 STMT_VINFO_REDUC_DEF (reduc_stmt_info) = stmt_vinfo;
569 if (double_reduc)
570 {
571 if (dump_enabled_p ())
572 dump_printf_loc (MSG_NOTE, vect_location,
573 "Detected double reduction.\n");
574
575 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
576 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_double_reduction_def;
577 }
578 else
579 {
580 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
581 {
582 if (dump_enabled_p ())
583 dump_printf_loc (MSG_NOTE, vect_location,
584 "Detected vectorizable nested cycle.\n");
585
586 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
587 }
588 else
589 {
590 if (dump_enabled_p ())
591 dump_printf_loc (MSG_NOTE, vect_location,
592 "Detected reduction.\n");
593
594 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
595 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
596 /* Store the reduction cycles for possible vectorization in
597 loop-aware SLP if it was not detected as reduction
598 chain. */
599 if (! REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
600 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
601 (reduc_stmt_info);
602 }
603 }
604 }
605 else
606 if (dump_enabled_p ())
607 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
608 "Unknown def-use cycle pattern.\n");
609 }
610 }
611
612
613 /* Function vect_analyze_scalar_cycles.
614
615 Examine the cross iteration def-use cycles of scalar variables, by
616 analyzing the loop-header PHIs of scalar variables. Classify each
617 cycle as one of the following: invariant, induction, reduction, unknown.
618 We do that for the loop represented by LOOP_VINFO, and also for its
619 inner-loop, if it exists.
620 Examples for scalar cycles:
621
622 Example1: reduction:
623
624 loop1:
625 for (i=0; i<N; i++)
626 sum += a[i];
627
628 Example2: induction:
629
630 loop2:
631 for (i=0; i<N; i++)
632 a[i] = i; */
633
634 static void
635 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
636 {
637 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
638
639 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
640
641 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
642 Reductions in such an inner-loop therefore have different properties than
643 the reductions in the nest that gets vectorized:
644 1. When vectorized, they are executed in the same order as in the original
645 scalar loop, so we can't change the order of computation when
646 vectorizing them.
647 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
648 current checks are too strict. */
649
650 if (loop->inner)
651 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
652 }
653
654 /* Transfer group and reduction information from STMT_INFO to its
655 pattern stmt. */
656
657 static void
658 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
659 {
660 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
661 stmt_vec_info stmtp;
662 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
663 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
664 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
665 do
666 {
667 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
668 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
669 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
670 if (stmt_info)
671 REDUC_GROUP_NEXT_ELEMENT (stmtp)
672 = STMT_VINFO_RELATED_STMT (stmt_info);
673 }
674 while (stmt_info);
675 STMT_VINFO_DEF_TYPE (stmtp) = vect_reduction_def;
676 }
677
678 /* Fixup scalar cycles that now have their stmts detected as patterns. */
679
680 static void
681 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
682 {
683 stmt_vec_info first;
684 unsigned i;
685
686 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
687 if (STMT_VINFO_IN_PATTERN_P (first))
688 {
689 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
690 while (next)
691 {
692 if (! STMT_VINFO_IN_PATTERN_P (next))
693 break;
694 next = REDUC_GROUP_NEXT_ELEMENT (next);
695 }
696 /* If not all stmts in the chain are patterns, try to handle
697 the chain without patterns. */
698 if (! next)
699 {
700 vect_fixup_reduc_chain (first);
701 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
702 = STMT_VINFO_RELATED_STMT (first);
703 }
704 }
705 }
706
707 /* Function vect_get_loop_niters.
708
709 Determine how many times the loop is executed and place the count
710 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
711 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
712 niter information holds in ASSUMPTIONS.
713
714 Return the loop exit condition. */
715
716
717 static gcond *
718 vect_get_loop_niters (class loop *loop, tree *assumptions,
719 tree *number_of_iterations, tree *number_of_iterationsm1)
720 {
721 edge exit = single_exit (loop);
722 class tree_niter_desc niter_desc;
723 tree niter_assumptions, niter, may_be_zero;
724 gcond *cond = get_loop_exit_condition (loop);
725
726 *assumptions = boolean_true_node;
727 *number_of_iterationsm1 = chrec_dont_know;
728 *number_of_iterations = chrec_dont_know;
729 DUMP_VECT_SCOPE ("get_loop_niters");
730
731 if (!exit)
732 return cond;
733
734 may_be_zero = NULL_TREE;
735 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
736 || chrec_contains_undetermined (niter_desc.niter))
737 return cond;
738
739 niter_assumptions = niter_desc.assumptions;
740 may_be_zero = niter_desc.may_be_zero;
741 niter = niter_desc.niter;
742
743 if (may_be_zero && integer_zerop (may_be_zero))
744 may_be_zero = NULL_TREE;
745
746 if (may_be_zero)
747 {
748 if (COMPARISON_CLASS_P (may_be_zero))
749 {
750 /* Try to combine may_be_zero with assumptions, this can simplify
751 computation of niter expression. */
752 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
753 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
754 niter_assumptions,
755 fold_build1 (TRUTH_NOT_EXPR,
756 boolean_type_node,
757 may_be_zero));
758 else
759 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
760 build_int_cst (TREE_TYPE (niter), 0),
761 rewrite_to_non_trapping_overflow (niter));
762
763 may_be_zero = NULL_TREE;
764 }
765 else if (integer_nonzerop (may_be_zero))
766 {
767 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
768 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
769 return cond;
770 }
771 else
772 return cond;
773 }
774
775 *assumptions = niter_assumptions;
776 *number_of_iterationsm1 = niter;
777
778 /* We want the number of loop header executions which is the number
779 of latch executions plus one.
780 ??? For UINT_MAX latch executions this number overflows to zero
781 for loops like do { n++; } while (n != 0); */
782 if (niter && !chrec_contains_undetermined (niter))
783 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
784 build_int_cst (TREE_TYPE (niter), 1));
785 *number_of_iterations = niter;
786
787 return cond;
788 }
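
/* For illustration: for a loop like "for (i = 0; i < n; i++)" with n > 0,
   the latch executes n - 1 times, so NUMBER_OF_ITERATIONSM1 is n - 1 and
   NUMBER_OF_ITERATIONS (the number of header executions) is n.  The "+ 1"
   above is the addition that can overflow to zero when the latch executes
   UINT_MAX times, as the ??? comment notes.  */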
789
790 /* Function bb_in_loop_p
791
792 Used as predicate for dfs order traversal of the loop bbs. */
793
794 static bool
795 bb_in_loop_p (const_basic_block bb, const void *data)
796 {
797 const class loop *const loop = (const class loop *)data;
798 if (flow_bb_inside_loop_p (loop, bb))
799 return true;
800 return false;
801 }
802
803
804 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
805 stmt_vec_info structs for all the stmts in LOOP_IN. */
806
807 _loop_vec_info::_loop_vec_info (class loop *loop_in, vec_info_shared *shared)
808 : vec_info (vec_info::loop, init_cost (loop_in), shared),
809 loop (loop_in),
810 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
811 num_itersm1 (NULL_TREE),
812 num_iters (NULL_TREE),
813 num_iters_unchanged (NULL_TREE),
814 num_iters_assumptions (NULL_TREE),
815 th (0),
816 versioning_threshold (0),
817 vectorization_factor (0),
818 max_vectorization_factor (0),
819 mask_skip_niters (NULL_TREE),
820 mask_compare_type (NULL_TREE),
821 simd_if_cond (NULL_TREE),
822 unaligned_dr (NULL),
823 peeling_for_alignment (0),
824 ptr_mask (0),
825 ivexpr_map (NULL),
826 scan_map (NULL),
827 slp_unrolling_factor (1),
828 single_scalar_iteration_cost (0),
829 vectorizable (false),
830 can_fully_mask_p (true),
831 fully_masked_p (false),
832 peeling_for_gaps (false),
833 peeling_for_niter (false),
834 no_data_dependencies (false),
835 has_mask_store (false),
836 scalar_loop_scaling (profile_probability::uninitialized ()),
837 scalar_loop (NULL),
838 orig_loop_info (NULL)
839 {
840 /* CHECKME: We want to visit all BBs before their successors (except for
841 latch blocks, for which this assertion wouldn't hold). In the simple
842 case of the loop forms we allow, a dfs order of the BBs would be the same
843 as reversed postorder traversal, so we are safe. */
844
845 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
846 bbs, loop->num_nodes, loop);
847 gcc_assert (nbbs == loop->num_nodes);
848
849 for (unsigned int i = 0; i < nbbs; i++)
850 {
851 basic_block bb = bbs[i];
852 gimple_stmt_iterator si;
853
854 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
855 {
856 gimple *phi = gsi_stmt (si);
857 gimple_set_uid (phi, 0);
858 add_stmt (phi);
859 }
860
861 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
862 {
863 gimple *stmt = gsi_stmt (si);
864 gimple_set_uid (stmt, 0);
865 add_stmt (stmt);
866 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
867 third argument is the #pragma omp simd if (x) condition: when it is 0,
868 the loop shouldn't be vectorized; when it is a non-zero constant, it
869 should be vectorized normally; otherwise the loop is versioned, with
870 the vectorized copy taken when the condition is non-zero at runtime.
871 if (loop_in->simduid
872 && is_gimple_call (stmt)
873 && gimple_call_internal_p (stmt)
874 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
875 && gimple_call_num_args (stmt) >= 3
876 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
877 && (loop_in->simduid
878 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
879 {
880 tree arg = gimple_call_arg (stmt, 2);
881 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
882 simd_if_cond = arg;
883 else
884 gcc_assert (integer_nonzerop (arg));
885 }
886 }
887 }
888 }
889
890 /* Free all levels of MASKS. */
891
892 void
893 release_vec_loop_masks (vec_loop_masks *masks)
894 {
895 rgroup_masks *rgm;
896 unsigned int i;
897 FOR_EACH_VEC_ELT (*masks, i, rgm)
898 rgm->masks.release ();
899 masks->release ();
900 }
901
902 /* Free all memory used by the _loop_vec_info, as well as all the
903 stmt_vec_info structs of all the stmts in the loop. */
904
905 _loop_vec_info::~_loop_vec_info ()
906 {
907 free (bbs);
908
909 release_vec_loop_masks (&masks);
910 delete ivexpr_map;
911 delete scan_map;
912
913 loop->aux = NULL;
914 }
915
916 /* Return an invariant or register for EXPR and emit necessary
917 computations in the LOOP_VINFO loop preheader. */
918
919 tree
920 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
921 {
922 if (is_gimple_reg (expr)
923 || is_gimple_min_invariant (expr))
924 return expr;
925
926 if (! loop_vinfo->ivexpr_map)
927 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
928 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
929 if (! cached)
930 {
931 gimple_seq stmts = NULL;
932 cached = force_gimple_operand (unshare_expr (expr),
933 &stmts, true, NULL_TREE);
934 if (stmts)
935 {
936 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
937 gsi_insert_seq_on_edge_immediate (e, stmts);
938 }
939 }
940 return cached;
941 }
942
943 /* Return true if we can use CMP_TYPE as the comparison type to produce
944 all masks required to mask LOOP_VINFO. */
945
946 static bool
947 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
948 {
949 rgroup_masks *rgm;
950 unsigned int i;
951 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
952 if (rgm->mask_type != NULL_TREE
953 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
954 cmp_type, rgm->mask_type,
955 OPTIMIZE_FOR_SPEED))
956 return false;
957 return true;
958 }
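
/* Roughly speaking, IFN_WHILE_ULT (INDEX, LIMIT) computes a mask whose
   element I is true iff INDEX + I < LIMIT, which is exactly the shape of
   mask needed to switch off the excess lanes in the final iterations of a
   fully-masked loop; the check above simply asks whether the target can do
   that comparison in CMP_TYPE for each required mask type.  */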
959
960 /* Return the maximum number of scalars per iteration across all the
961 rgroups in LOOP_VINFO. */
962
963 static unsigned int
964 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
965 {
966 unsigned int res = 1;
967 unsigned int i;
968 rgroup_masks *rgm;
969 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
970 res = MAX (res, rgm->max_nscalars_per_iter);
971 return res;
972 }
973
974 /* Each statement in LOOP_VINFO can be masked where necessary. Check
975 whether we can actually generate the masks required. Return true if so,
976 storing the chosen comparison type in LOOP_VINFO_MASK_COMPARE_TYPE and the IV type in LOOP_VINFO_MASK_IV_TYPE. */
977
978 static bool
979 vect_verify_full_masking (loop_vec_info loop_vinfo)
980 {
981 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
982 unsigned int min_ni_width;
983 unsigned int max_nscalars_per_iter
984 = vect_get_max_nscalars_per_iter (loop_vinfo);
985
986 /* Use a normal loop if there are no statements that need masking.
987 This only happens in rare degenerate cases: it means that the loop
988 has no loads, no stores, and no live-out values. */
989 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
990 return false;
991
992 /* Get the maximum number of iterations that is representable
993 in the counter type. */
994 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
995 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
996
997 /* Get a more refined estimate for the number of iterations. */
998 widest_int max_back_edges;
999 if (max_loop_iterations (loop, &max_back_edges))
1000 max_ni = wi::smin (max_ni, max_back_edges + 1);
1001
1002 /* Account for rgroup masks, in which each bit is replicated N times. */
1003 max_ni *= max_nscalars_per_iter;
1004
1005 /* Work out how many bits we need to represent the limit. */
1006 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1007
1008 /* Find a scalar mode for which WHILE_ULT is supported. */
1009 opt_scalar_int_mode cmp_mode_iter;
1010 tree cmp_type = NULL_TREE;
1011 tree iv_type = NULL_TREE;
1012 widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
1013 unsigned int iv_precision = UINT_MAX;
1014
1015 if (iv_limit != -1)
1016 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1017 UNSIGNED);
1018
1019 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1020 {
1021 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1022 if (cmp_bits >= min_ni_width
1023 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1024 {
1025 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1026 if (this_type
1027 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1028 {
1029 /* Although we could stop as soon as we find a valid mode,
1030 there are at least two reasons why that's not always the
1031 best choice:
1032
1033 - An IV that's Pmode or wider is more likely to be reusable
1034 in address calculations than an IV that's narrower than
1035 Pmode.
1036
1037 - Doing the comparison in IV_PRECISION or wider allows
1038 a natural 0-based IV, whereas using a narrower comparison
1039 type requires mitigations against wrap-around.
1040
1041 Conversely, if the IV limit is variable, doing the comparison
1042 in a wider type than the original type can introduce
1043 unnecessary extensions, so picking the widest valid mode
1044 is not always a good choice either.
1045
1046 Here we prefer the first IV type that's Pmode or wider,
1047 and the first comparison type that's IV_PRECISION or wider.
1048 (The comparison type must be no wider than the IV type,
1049 to avoid extensions in the vector loop.)
1050
1051 ??? We might want to try continuing beyond Pmode for ILP32
1052 targets if CMP_BITS < IV_PRECISION. */
1053 iv_type = this_type;
1054 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1055 cmp_type = this_type;
1056 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1057 break;
1058 }
1059 }
1060 }
1061
1062 if (!cmp_type)
1063 return false;
1064
1065 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1066 LOOP_VINFO_MASK_IV_TYPE (loop_vinfo) = iv_type;
1067 return true;
1068 }
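
/* A small worked example: if the loop is known to run at most 1000
   iterations and the largest rgroup needs 2 scalars per iteration, then
   max_ni is scaled to 2000 and min_ni_width becomes 11 bits (since
   2^10 < 2000 <= 2^11), so the candidate comparison modes considered
   above are the integer modes of at least 11 bits.  */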
1069
1070 /* Calculate the cost of one scalar iteration of the loop. */
1071 static void
1072 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1073 {
1074 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1075 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1076 int nbbs = loop->num_nodes, factor;
1077 int innerloop_iters, i;
1078
1079 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1080
1081 /* Gather costs for statements in the scalar loop. */
1082
1083 /* FORNOW. */
1084 innerloop_iters = 1;
1085 if (loop->inner)
1086 innerloop_iters = 50; /* FIXME */
1087
1088 for (i = 0; i < nbbs; i++)
1089 {
1090 gimple_stmt_iterator si;
1091 basic_block bb = bbs[i];
1092
1093 if (bb->loop_father == loop->inner)
1094 factor = innerloop_iters;
1095 else
1096 factor = 1;
1097
1098 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1099 {
1100 gimple *stmt = gsi_stmt (si);
1101 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1102
1103 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1104 continue;
1105
1106 /* Skip stmts that are not vectorized inside the loop. */
1107 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1108 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1109 && (!STMT_VINFO_LIVE_P (vstmt_info)
1110 || !VECTORIZABLE_CYCLE_DEF
1111 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1112 continue;
1113
1114 vect_cost_for_stmt kind;
1115 if (STMT_VINFO_DATA_REF (stmt_info))
1116 {
1117 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1118 kind = scalar_load;
1119 else
1120 kind = scalar_store;
1121 }
1122 else
1123 kind = scalar_stmt;
1124
1125 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1126 factor, kind, stmt_info, 0, vect_prologue);
1127 }
1128 }
1129
1130 /* Now accumulate cost. */
1131 void *target_cost_data = init_cost (loop);
1132 stmt_info_for_cost *si;
1133 int j;
1134 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1135 j, si)
1136 (void) add_stmt_cost (target_cost_data, si->count,
1137 si->kind, si->stmt_info, si->misalign,
1138 vect_body);
1139 unsigned dummy, body_cost = 0;
1140 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1141 destroy_cost_data (target_cost_data);
1142 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1143 }
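
/* As an illustration: for a single-block loop body containing one load,
   one addition and one store, the code above records one scalar_load, one
   scalar_stmt and one scalar_store entry, each with count 1; with the
   default target cost hooks each of those costs 1, so the recorded single
   scalar iteration cost would be 3.  Statements in an inner loop are
   weighted by the FIXME factor of 50.  */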
1144
1145
1146 /* Function vect_analyze_loop_form_1.
1147
1148 Verify that certain CFG restrictions hold, including:
1149 - the loop has a pre-header
1150 - the loop has a single entry and exit
1151 - the loop exit condition is simple enough
1152 - the number of iterations can be analyzed, i.e., a countable loop. The
1153 niter could be analyzed under some assumptions. */
1154
1155 opt_result
1156 vect_analyze_loop_form_1 (class loop *loop, gcond **loop_cond,
1157 tree *assumptions, tree *number_of_iterationsm1,
1158 tree *number_of_iterations, gcond **inner_loop_cond)
1159 {
1160 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1161
1162 /* Different restrictions apply when we are considering an inner-most loop,
1163 vs. an outer (nested) loop.
1164 (FORNOW. May want to relax some of these restrictions in the future). */
1165
1166 if (!loop->inner)
1167 {
1168 /* Inner-most loop. We currently require that the number of BBs is
1169 exactly 2 (the header and latch). Vectorizable inner-most loops
1170 look like this:
1171
1172 (pre-header)
1173 |
1174 header <--------+
1175 | | |
1176 | +--> latch --+
1177 |
1178 (exit-bb) */
1179
1180 if (loop->num_nodes != 2)
1181 return opt_result::failure_at (vect_location,
1182 "not vectorized:"
1183 " control flow in loop.\n");
1184
1185 if (empty_block_p (loop->header))
1186 return opt_result::failure_at (vect_location,
1187 "not vectorized: empty loop.\n");
1188 }
1189 else
1190 {
1191 class loop *innerloop = loop->inner;
1192 edge entryedge;
1193
1194 /* Nested loop. We currently require that the loop is doubly-nested,
1195 contains a single inner loop, and the number of BBs is exactly 5.
1196 Vectorizable outer-loops look like this:
1197
1198 (pre-header)
1199 |
1200 header <---+
1201 | |
1202 inner-loop |
1203 | |
1204 tail ------+
1205 |
1206 (exit-bb)
1207
1208 The inner-loop has the properties expected of inner-most loops
1209 as described above. */
1210
1211 if ((loop->inner)->inner || (loop->inner)->next)
1212 return opt_result::failure_at (vect_location,
1213 "not vectorized:"
1214 " multiple nested loops.\n");
1215
1216 if (loop->num_nodes != 5)
1217 return opt_result::failure_at (vect_location,
1218 "not vectorized:"
1219 " control flow in loop.\n");
1220
1221 entryedge = loop_preheader_edge (innerloop);
1222 if (entryedge->src != loop->header
1223 || !single_exit (innerloop)
1224 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1225 return opt_result::failure_at (vect_location,
1226 "not vectorized:"
1227 " unsupported outerloop form.\n");
1228
1229 /* Analyze the inner-loop. */
1230 tree inner_niterm1, inner_niter, inner_assumptions;
1231 opt_result res
1232 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1233 &inner_assumptions, &inner_niterm1,
1234 &inner_niter, NULL);
1235 if (!res)
1236 {
1237 if (dump_enabled_p ())
1238 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1239 "not vectorized: Bad inner loop.\n");
1240 return res;
1241 }
1242
1243 /* Don't support analyzing niter under assumptions for inner
1244 loop. */
1245 if (!integer_onep (inner_assumptions))
1246 return opt_result::failure_at (vect_location,
1247 "not vectorized: Bad inner loop.\n");
1248
1249 if (!expr_invariant_in_loop_p (loop, inner_niter))
1250 return opt_result::failure_at (vect_location,
1251 "not vectorized: inner-loop count not"
1252 " invariant.\n");
1253
1254 if (dump_enabled_p ())
1255 dump_printf_loc (MSG_NOTE, vect_location,
1256 "Considering outer-loop vectorization.\n");
1257 }
1258
1259 if (!single_exit (loop))
1260 return opt_result::failure_at (vect_location,
1261 "not vectorized: multiple exits.\n");
1262 if (EDGE_COUNT (loop->header->preds) != 2)
1263 return opt_result::failure_at (vect_location,
1264 "not vectorized:"
1265 " too many incoming edges.\n");
1266
1267 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1268 that the loop is represented as a do-while (with a proper if-guard
1269 before the loop if needed), where the loop header contains all the
1270 executable statements, and the latch is empty. */
1271 if (!empty_block_p (loop->latch)
1272 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1273 return opt_result::failure_at (vect_location,
1274 "not vectorized: latch block not empty.\n");
1275
1276 /* Make sure the exit is not abnormal. */
1277 edge e = single_exit (loop);
1278 if (e->flags & EDGE_ABNORMAL)
1279 return opt_result::failure_at (vect_location,
1280 "not vectorized:"
1281 " abnormal loop exit edge.\n");
1282
1283 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1284 number_of_iterationsm1);
1285 if (!*loop_cond)
1286 return opt_result::failure_at
1287 (vect_location,
1288 "not vectorized: complicated exit condition.\n");
1289
1290 if (integer_zerop (*assumptions)
1291 || !*number_of_iterations
1292 || chrec_contains_undetermined (*number_of_iterations))
1293 return opt_result::failure_at
1294 (*loop_cond,
1295 "not vectorized: number of iterations cannot be computed.\n");
1296
1297 if (integer_zerop (*number_of_iterations))
1298 return opt_result::failure_at
1299 (*loop_cond,
1300 "not vectorized: number of iterations = 0.\n");
1301
1302 return opt_result::success ();
1303 }
1304
1305 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1306
1307 opt_loop_vec_info
1308 vect_analyze_loop_form (class loop *loop, vec_info_shared *shared)
1309 {
1310 tree assumptions, number_of_iterations, number_of_iterationsm1;
1311 gcond *loop_cond, *inner_loop_cond = NULL;
1312
1313 opt_result res
1314 = vect_analyze_loop_form_1 (loop, &loop_cond,
1315 &assumptions, &number_of_iterationsm1,
1316 &number_of_iterations, &inner_loop_cond);
1317 if (!res)
1318 return opt_loop_vec_info::propagate_failure (res);
1319
1320 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1321 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1322 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1323 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1324 if (!integer_onep (assumptions))
1325 {
1326 /* We consider vectorizing this loop by versioning it under
1327 some assumptions. In order to do this, we need to clear
1328 existing information computed by scev and niter analyzer. */
1329 scev_reset_htab ();
1330 free_numbers_of_iterations_estimates (loop);
1331 /* Also set a flag for this loop so that subsequent scev and niter
1332 analyses are done under the assumptions.
1333 loop_constraint_set (loop, LOOP_C_FINITE);
1334 /* Also record the assumptions for versioning. */
1335 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1336 }
1337
1338 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1339 {
1340 if (dump_enabled_p ())
1341 {
1342 dump_printf_loc (MSG_NOTE, vect_location,
1343 "Symbolic number of iterations is ");
1344 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1345 dump_printf (MSG_NOTE, "\n");
1346 }
1347 }
1348
1349 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1350 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1351 if (inner_loop_cond)
1352 {
1353 stmt_vec_info inner_loop_cond_info
1354 = loop_vinfo->lookup_stmt (inner_loop_cond);
1355 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1356 }
1357
1358 gcc_assert (!loop->aux);
1359 loop->aux = loop_vinfo;
1360 return opt_loop_vec_info::success (loop_vinfo);
1361 }
1362
1363
1364
1365 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1366 statements update the vectorization factor. */
1367
1368 static void
1369 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1370 {
1371 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1372 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1373 int nbbs = loop->num_nodes;
1374 poly_uint64 vectorization_factor;
1375 int i;
1376
1377 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1378
1379 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1380 gcc_assert (known_ne (vectorization_factor, 0U));
1381
1382 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1383 vectorization factor of the loop is the unrolling factor required by
1384 the SLP instances. If that unrolling factor is 1, we say that we
1385 perform pure SLP on the loop - cross-iteration parallelism is not
1386 exploited. */
1387 bool only_slp_in_loop = true;
1388 for (i = 0; i < nbbs; i++)
1389 {
1390 basic_block bb = bbs[i];
1391 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1392 gsi_next (&si))
1393 {
1394 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1395 stmt_info = vect_stmt_to_vectorize (stmt_info);
1396 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1397 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1398 && !PURE_SLP_STMT (stmt_info))
1399 /* STMT needs both SLP and loop-based vectorization. */
1400 only_slp_in_loop = false;
1401 }
1402 }
1403
1404 if (only_slp_in_loop)
1405 {
1406 if (dump_enabled_p ())
1407 dump_printf_loc (MSG_NOTE, vect_location,
1408 "Loop contains only SLP stmts\n");
1409 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1410 }
1411 else
1412 {
1413 if (dump_enabled_p ())
1414 dump_printf_loc (MSG_NOTE, vect_location,
1415 "Loop contains SLP and non-SLP stmts\n");
1416 /* Both the vectorization factor and unroll factor have the form
1417 current_vector_size * X for some rational X, so they must have
1418 a common multiple. */
1419 vectorization_factor
1420 = force_common_multiple (vectorization_factor,
1421 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1422 }
1423
1424 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1425 if (dump_enabled_p ())
1426 {
1427 dump_printf_loc (MSG_NOTE, vect_location,
1428 "Updating vectorization factor to ");
1429 dump_dec (MSG_NOTE, vectorization_factor);
1430 dump_printf (MSG_NOTE, ".\n");
1431 }
1432 }
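
/* For example, if the loop-based analysis chose a vectorization factor of
   4 but the SLP instances require an unrolling factor of 8, the common
   multiple is 8 and the loop is vectorized with VF 8; for a pure-SLP loop
   the SLP unrolling factor is simply taken as the new VF.  */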
1433
1434 /* Return true if STMT_INFO describes a double reduction phi and if
1435 the other phi in the reduction is also relevant for vectorization.
1436 This rejects cases such as:
1437
1438 outer1:
1439 x_1 = PHI <x_3(outer2), ...>;
1440 ...
1441
1442 inner:
1443 x_2 = ...;
1444 ...
1445
1446 outer2:
1447 x_3 = PHI <x_2(inner)>;
1448
1449 if nothing in x_2 or elsewhere makes x_1 relevant. */
1450
1451 static bool
1452 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1453 {
1454 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1455 return false;
1456
1457 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1458 }
1459
1460 /* Function vect_analyze_loop_operations.
1461
1462 Scan the loop stmts and make sure they are all vectorizable. */
1463
1464 static opt_result
1465 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1466 {
1467 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1468 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1469 int nbbs = loop->num_nodes;
1470 int i;
1471 stmt_vec_info stmt_info;
1472 bool need_to_vectorize = false;
1473 bool ok;
1474
1475 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1476
1477 auto_vec<stmt_info_for_cost> cost_vec;
1478
1479 for (i = 0; i < nbbs; i++)
1480 {
1481 basic_block bb = bbs[i];
1482
1483 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1484 gsi_next (&si))
1485 {
1486 gphi *phi = si.phi ();
1487 ok = true;
1488
1489 stmt_info = loop_vinfo->lookup_stmt (phi);
1490 if (dump_enabled_p ())
1491 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1492 if (virtual_operand_p (gimple_phi_result (phi)))
1493 continue;
1494
1495 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1496 (i.e., a phi in the tail of the outer-loop). */
1497 if (! is_loop_header_bb_p (bb))
1498 {
1499 /* FORNOW: we currently don't support the case that these phis
1500 are not used in the outer loop (unless it is a double reduction,
1501 i.e., this phi is vect_reduction_def), because this case
1502 requires us to actually do something here. */
1503 if (STMT_VINFO_LIVE_P (stmt_info)
1504 && !vect_active_double_reduction_p (stmt_info))
1505 return opt_result::failure_at (phi,
1506 "Unsupported loop-closed phi"
1507 " in outer-loop.\n");
1508
1509 /* If PHI is used in the outer loop, we check that its operand
1510 is defined in the inner loop. */
1511 if (STMT_VINFO_RELEVANT_P (stmt_info))
1512 {
1513 tree phi_op;
1514
1515 if (gimple_phi_num_args (phi) != 1)
1516 return opt_result::failure_at (phi, "unsupported phi");
1517
1518 phi_op = PHI_ARG_DEF (phi, 0);
1519 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1520 if (!op_def_info)
1521 return opt_result::failure_at (phi, "unsupported phi\n");
1522
1523 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1524 && (STMT_VINFO_RELEVANT (op_def_info)
1525 != vect_used_in_outer_by_reduction))
1526 return opt_result::failure_at (phi, "unsupported phi\n");
1527
1528 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def
1529 || (STMT_VINFO_DEF_TYPE (stmt_info)
1530 == vect_double_reduction_def))
1531 && !vectorizable_lc_phi (stmt_info, NULL, NULL))
1532 return opt_result::failure_at (phi, "unsupported phi\n");
1533 }
1534
1535 continue;
1536 }
1537
1538 gcc_assert (stmt_info);
1539
1540 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1541 || STMT_VINFO_LIVE_P (stmt_info))
1542 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1543 /* A scalar-dependence cycle that we don't support. */
1544 return opt_result::failure_at (phi,
1545 "not vectorized:"
1546 " scalar dependence cycle.\n");
1547
1548 if (STMT_VINFO_RELEVANT_P (stmt_info))
1549 {
1550 need_to_vectorize = true;
1551 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1552 && ! PURE_SLP_STMT (stmt_info))
1553 ok = vectorizable_induction (stmt_info, NULL, NULL, NULL,
1554 &cost_vec);
1555 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1556 || (STMT_VINFO_DEF_TYPE (stmt_info)
1557 == vect_double_reduction_def)
1558 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1559 && ! PURE_SLP_STMT (stmt_info))
1560 ok = vectorizable_reduction (stmt_info, NULL, NULL, &cost_vec);
1561 }
1562
1563 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1564 if (ok
1565 && STMT_VINFO_LIVE_P (stmt_info)
1566 && !PURE_SLP_STMT (stmt_info))
1567 ok = vectorizable_live_operation (stmt_info, NULL, NULL, NULL,
1568 -1, false, &cost_vec);
1569
1570 if (!ok)
1571 return opt_result::failure_at (phi,
1572 "not vectorized: relevant phi not "
1573 "supported: %G",
1574 static_cast <gimple *> (phi));
1575 }
1576
1577 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1578 gsi_next (&si))
1579 {
1580 gimple *stmt = gsi_stmt (si);
1581 if (!gimple_clobber_p (stmt))
1582 {
1583 opt_result res
1584 = vect_analyze_stmt (loop_vinfo->lookup_stmt (stmt),
1585 &need_to_vectorize,
1586 NULL, NULL, &cost_vec);
1587 if (!res)
1588 return res;
1589 }
1590 }
1591 } /* bbs */
1592
1593 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1594
1595 /* All operations in the loop are either irrelevant (they deal with loop
1596 control, or are dead), or are only used outside the loop and can be moved
1597 out of the loop (e.g. invariants, inductions). The loop can be
1598 optimized away by scalar optimizations. We're better off not
1599 touching this loop. */
1600 if (!need_to_vectorize)
1601 {
1602 if (dump_enabled_p ())
1603 dump_printf_loc (MSG_NOTE, vect_location,
1604 "All the computation can be taken out of the loop.\n");
1605 return opt_result::failure_at
1606 (vect_location,
1607 "not vectorized: redundant loop. no profit to vectorize.\n");
1608 }
1609
1610 return opt_result::success ();
1611 }
1612
1613 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1614 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1615 definitely no, or -1 if it's worth retrying. */
1616
1617 static int
1618 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1619 {
1620 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1621 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1622
1623 /* Only fully-masked loops can have iteration counts less than the
1624 vectorization factor. */
1625 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1626 {
1627 HOST_WIDE_INT max_niter;
1628
1629 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1630 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1631 else
1632 max_niter = max_stmt_executions_int (loop);
1633
1634 if (max_niter != -1
1635 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1636 {
1637 if (dump_enabled_p ())
1638 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1639 "not vectorized: iteration count smaller than "
1640 "vectorization factor.\n");
1641 return 0;
1642 }
1643 }
1644
1645 int min_profitable_iters, min_profitable_estimate;
1646 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1647 &min_profitable_estimate);
1648
1649 if (min_profitable_iters < 0)
1650 {
1651 if (dump_enabled_p ())
1652 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1653 "not vectorized: vectorization not profitable.\n");
1654 if (dump_enabled_p ())
1655 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1656 "not vectorized: vector version will never be "
1657 "profitable.\n");
1658 return -1;
1659 }
1660
1661 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1662 * assumed_vf);
1663
1664 /* Use the cost model only if it is more conservative than the user-specified
1665 threshold. */
1666 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1667 min_profitable_iters);
1668
1669 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1670
1671 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1672 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1673 {
1674 if (dump_enabled_p ())
1675 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1676 "not vectorized: vectorization not profitable.\n");
1677 if (dump_enabled_p ())
1678 dump_printf_loc (MSG_NOTE, vect_location,
1679 "not vectorized: iteration count smaller than user "
1680 "specified loop bound parameter or minimum profitable "
1681 "iterations (whichever is more conservative).\n");
1682 return 0;
1683 }
1684
1685 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1686 if (estimated_niter == -1)
1687 estimated_niter = likely_max_stmt_executions_int (loop);
1688 if (estimated_niter != -1
1689 && ((unsigned HOST_WIDE_INT) estimated_niter
1690 < MAX (th, (unsigned) min_profitable_estimate)))
1691 {
1692 if (dump_enabled_p ())
1693 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1694 "not vectorized: estimated iteration count too "
1695 "small.\n");
1696 if (dump_enabled_p ())
1697 dump_printf_loc (MSG_NOTE, vect_location,
1698 "not vectorized: estimated iteration count smaller "
1699 "than specified loop bound parameter or minimum "
1700 "profitable iterations (whichever is more "
1701 "conservative).\n");
1702 return -1;
1703 }
1704
1705 return 1;
1706 }
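
/* As a numeric sketch: with an assumed VF of 4, a min-vect-loop-bound
   parameter of 2 and a computed min_profitable_iters of 12, the threshold
   becomes MAX (2 * 4, 12) = 12, so a loop with a known iteration count of
   10 is rejected as unprofitable, while one iterating 16 times passes this
   particular check.  */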
1707
1708 static opt_result
1709 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1710 vec<data_reference_p> *datarefs,
1711 unsigned int *n_stmts)
1712 {
1713 *n_stmts = 0;
1714 for (unsigned i = 0; i < loop->num_nodes; i++)
1715 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1716 !gsi_end_p (gsi); gsi_next (&gsi))
1717 {
1718 gimple *stmt = gsi_stmt (gsi);
1719 if (is_gimple_debug (stmt))
1720 continue;
1721 ++(*n_stmts);
1722 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs);
1723 if (!res)
1724 {
1725 if (is_gimple_call (stmt) && loop->safelen)
1726 {
1727 tree fndecl = gimple_call_fndecl (stmt), op;
1728 if (fndecl != NULL_TREE)
1729 {
1730 cgraph_node *node = cgraph_node::get (fndecl);
1731 if (node != NULL && node->simd_clones != NULL)
1732 {
1733 unsigned int j, n = gimple_call_num_args (stmt);
1734 for (j = 0; j < n; j++)
1735 {
1736 op = gimple_call_arg (stmt, j);
1737 if (DECL_P (op)
1738 || (REFERENCE_CLASS_P (op)
1739 && get_base_address (op)))
1740 break;
1741 }
1742 op = gimple_call_lhs (stmt);
1743 /* Ignore #pragma omp declare simd functions
1744 if they don't have data references in the
1745 call stmt itself. */
1746 if (j == n
1747 && !(op
1748 && (DECL_P (op)
1749 || (REFERENCE_CLASS_P (op)
1750 && get_base_address (op)))))
1751 continue;
1752 }
1753 }
1754 }
1755 return res;
1756 }
1757 /* If dependence analysis will give up due to the limit on the
1758 number of datarefs, stop here and fail fatally. */
1759 if (datarefs->length ()
1760 > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
1761 return opt_result::failure_at (stmt, "exceeded param "
1762 "loop-max-datarefs-for-datadeps\n");
1763 }
1764 return opt_result::success ();
1765 }
1766
1767 /* Look for SLP-only access groups and turn each individual access into its own
1768 group. */
1769 static void
1770 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
1771 {
1772 unsigned int i;
1773 struct data_reference *dr;
1774
1775 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
1776
1777 vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs;
1778 FOR_EACH_VEC_ELT (datarefs, i, dr)
1779 {
1780 gcc_assert (DR_REF (dr));
1781 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
1782
1783 /* Check if the load is a part of an interleaving chain. */
1784 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1785 {
1786 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
1787 unsigned int group_size = DR_GROUP_SIZE (first_element);
1788
1789 /* Check whether this is an SLP-only group that is not being SLPed. */
1790 if (!STMT_SLP_TYPE (stmt_info)
1791 && STMT_VINFO_SLP_VECT_ONLY (first_element))
1792 {
1793 /* Dissolve the group. */
1794 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
1795
1796 stmt_vec_info vinfo = first_element;
1797 while (vinfo)
1798 {
1799 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
1800 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
1801 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
1802 DR_GROUP_SIZE (vinfo) = 1;
1803 DR_GROUP_GAP (vinfo) = group_size - 1;
1804 vinfo = next;
1805 }
1806 }
1807 }
1808 }
1809 }
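/* Illustrative example only (hypothetical access pattern): for a grouped
   access of size 4, say loads of a[4*i+0] ... a[4*i+3], that was marked
   SLP-only but whose statements did not end up pure SLP, the loop above
   splits the group into four single-element groups, each with
   DR_GROUP_SIZE == 1 and DR_GROUP_GAP == 3, so that loop vectorization
   can handle them as individual accesses.  */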
1810
1811
1812 /* Decides whether we need to create an epilogue loop to handle
1813 remaining scalar iterations and sets PEELING_FOR_NITER accordingly. */
1814
1815 void
1816 determine_peel_for_niter (loop_vec_info loop_vinfo)
1817 {
1818 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
1819
1820 unsigned HOST_WIDE_INT const_vf;
1821 HOST_WIDE_INT max_niter
1822 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1823
1824 unsigned th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
1825 if (!th && LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo))
1826 th = LOOP_VINFO_COST_MODEL_THRESHOLD (LOOP_VINFO_ORIG_LOOP_INFO
1827 (loop_vinfo));
1828
1829 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1830 /* The main loop handles all iterations. */
1831 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
1832 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1833 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
1834 {
1835 /* Work out the (constant) number of iterations that need to be
1836 peeled for reasons other than niters. */
1837 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
1838 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
1839 peel_niter += 1;
1840 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
1841 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1842 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1843 }
1844 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1845 /* ??? When peeling for gaps but not alignment, we could
1846 try to check whether the (variable) niters is known to be
1847 VF * N + 1. That's something of a niche case though. */
1848 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1849 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
1850 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1851 < (unsigned) exact_log2 (const_vf))
1852 /* In case of versioning, check if the maximum number of
1853 iterations is greater than th. If they are identical,
1854 the epilogue is unnecessary. */
1855 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
1856 || ((unsigned HOST_WIDE_INT) max_niter
1857 > (th / const_vf) * const_vf))))
1858 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1859 }
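/* Illustrative example only (hypothetical counts): with known niters == 10,
   a constant vectorization factor of 4, one iteration peeled for alignment
   and no peeling for gaps, peel_niter == 1 and 10 - 1 == 9 is not a
   multiple of 4, so PEELING_FOR_NITER is set and an epilogue loop handles
   the remaining 9 % 4 == 1 scalar iteration.  */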
1860
1861
1862 /* Function vect_analyze_loop_2.
1863
1864 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1865 for it. The different analyses will record information in the
1866 loop_vec_info struct. */
1867 static opt_result
1868 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1869 {
1870 opt_result ok = opt_result::success ();
1871 int res;
1872 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1873 poly_uint64 min_vf = 2;
1874
1875 /* The first group of checks is independent of the vector size. */
1876 fatal = true;
1877
1878 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
1879 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
1880 return opt_result::failure_at (vect_location,
1881 "not vectorized: simd if(0)\n");
1882
1883 /* Find all data references in the loop (which correspond to vdefs/vuses)
1884 and analyze their evolution in the loop. */
1885
1886 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1887
1888 /* Gather the data references and count stmts in the loop. */
1889 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1890 {
1891 opt_result res
1892 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1893 &LOOP_VINFO_DATAREFS (loop_vinfo),
1894 n_stmts);
1895 if (!res)
1896 {
1897 if (dump_enabled_p ())
1898 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1899 "not vectorized: loop contains function "
1900 "calls or data references that cannot "
1901 "be analyzed\n");
1902 return res;
1903 }
1904 loop_vinfo->shared->save_datarefs ();
1905 }
1906 else
1907 loop_vinfo->shared->check_datarefs ();
1908
1909 /* Analyze the data references and also adjust the minimal
1910 vectorization factor according to the loads and stores. */
1911
1912 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
1913 if (!ok)
1914 {
1915 if (dump_enabled_p ())
1916 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1917 "bad data references.\n");
1918 return ok;
1919 }
1920
1921 /* Classify all cross-iteration scalar data-flow cycles.
1922 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1923 vect_analyze_scalar_cycles (loop_vinfo);
1924
1925 vect_pattern_recog (loop_vinfo);
1926
1927 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1928
1929 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1930 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1931
1932 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1933 if (!ok)
1934 {
1935 if (dump_enabled_p ())
1936 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1937 "bad data access.\n");
1938 return ok;
1939 }
1940
1941 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1942
1943 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
1944 if (!ok)
1945 {
1946 if (dump_enabled_p ())
1947 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1948 "unexpected pattern.\n");
1949 return ok;
1950 }
1951
1952 /* The rest of the analysis below depends on the vector size in some way, so from here on failures are not necessarily fatal. */
1953 fatal = false;
1954
1955 /* Analyze data dependences between the data-refs in the loop
1956 and adjust the maximum vectorization factor according to
1957 the dependences.
1958 FORNOW: fail at the first data dependence that we encounter. */
1959
1960 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1961 if (!ok)
1962 {
1963 if (dump_enabled_p ())
1964 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1965 "bad data dependence.\n");
1966 return ok;
1967 }
1968 if (max_vf != MAX_VECTORIZATION_FACTOR
1969 && maybe_lt (max_vf, min_vf))
1970 return opt_result::failure_at (vect_location, "bad data dependence.\n");
1971 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1972
1973 ok = vect_determine_vectorization_factor (loop_vinfo);
1974 if (!ok)
1975 {
1976 if (dump_enabled_p ())
1977 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1978 "can't determine vectorization factor.\n");
1979 return ok;
1980 }
1981 if (max_vf != MAX_VECTORIZATION_FACTOR
1982 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1983 return opt_result::failure_at (vect_location, "bad data dependence.\n");
1984
1985 /* Compute the scalar iteration cost. */
1986 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1987
1988 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1989
1990 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1991 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
1992 if (!ok)
1993 return ok;
1994
1995 /* If there are any SLP instances mark them as pure_slp. */
1996 bool slp = vect_make_slp_decision (loop_vinfo);
1997 if (slp)
1998 {
1999 /* Find stmts that need to be both vectorized and SLPed. */
2000 vect_detect_hybrid_slp (loop_vinfo);
2001
2002 /* Update the vectorization factor based on the SLP decision. */
2003 vect_update_vf_for_slp (loop_vinfo);
2004 }
2005
2006 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
2007
2008 /* We don't expect to have to roll back to anything other than an empty
2009 set of rgroups. */
2010 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2011
2012 /* This is the point where we can re-start analysis with SLP forced off. */
2013 start_over:
2014
2015 /* Now the vectorization factor is final. */
2016 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2017 gcc_assert (known_ne (vectorization_factor, 0U));
2018
2019 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2020 {
2021 dump_printf_loc (MSG_NOTE, vect_location,
2022 "vectorization_factor = ");
2023 dump_dec (MSG_NOTE, vectorization_factor);
2024 dump_printf (MSG_NOTE, ", niters = %wd\n",
2025 LOOP_VINFO_INT_NITERS (loop_vinfo));
2026 }
2027
2028 /* Analyze the alignment of the data-refs in the loop.
2029 Fail if a data reference is found that cannot be vectorized. */
2030
2031 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2032 if (!ok)
2033 {
2034 if (dump_enabled_p ())
2035 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2036 "bad data alignment.\n");
2037 return ok;
2038 }
2039
2040 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2041 It is important to call pruning after vect_analyze_data_ref_accesses,
2042 since we use grouping information gathered by interleaving analysis. */
2043 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2044 if (!ok)
2045 return ok;
2046
2047 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2048 vectorization, since we do not want to add extra peeling or
2049 add versioning for alignment. */
2050 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2051 /* This pass will decide on using loop versioning and/or loop peeling in
2052 order to enhance the alignment of data references in the loop. */
2053 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2054 else
2055 ok = vect_verify_datarefs_alignment (loop_vinfo);
2056 if (!ok)
2057 return ok;
2058
2059 if (slp)
2060 {
2061 /* Analyze operations in the SLP instances. Note this may
2062 remove unsupported SLP instances which makes the above
2063 SLP kind detection invalid. */
2064 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2065 vect_slp_analyze_operations (loop_vinfo);
2066 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2067 {
2068 ok = opt_result::failure_at (vect_location,
2069 "unsupported SLP instances\n");
2070 goto again;
2071 }
2072 }
2073
2074 /* Dissolve SLP-only groups. */
2075 vect_dissolve_slp_only_groups (loop_vinfo);
2076
2077 /* Scan all the remaining operations in the loop that are not subject
2078 to SLP and make sure they are vectorizable. */
2079 ok = vect_analyze_loop_operations (loop_vinfo);
2080 if (!ok)
2081 {
2082 if (dump_enabled_p ())
2083 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2084 "bad operation or unsupported loop bound.\n");
2085 return ok;
2086 }
2087
2088 /* Decide whether to use a fully-masked loop for this vectorization
2089 factor. */
2090 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2091 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2092 && vect_verify_full_masking (loop_vinfo));
2093 if (dump_enabled_p ())
2094 {
2095 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2096 dump_printf_loc (MSG_NOTE, vect_location,
2097 "using a fully-masked loop.\n");
2098 else
2099 dump_printf_loc (MSG_NOTE, vect_location,
2100 "not using a fully-masked loop.\n");
2101 }
2102
2103 /* If an epilogue loop is required because of data accesses with gaps,
2104 one additional iteration needs to be peeled. Check whether there are
2105 enough iterations for vectorization. */
2106 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2107 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2108 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2109 {
2110 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2111 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2112
2113 if (known_lt (wi::to_widest (scalar_niters), vf))
2114 return opt_result::failure_at (vect_location,
2115 "loop has not enough iterations to"
2116 " support peeling for gaps.\n");
2117 }
2118
2119 /* Check the costings of the loop make vectorizing worthwhile. */
2120 res = vect_analyze_loop_costing (loop_vinfo);
2121 if (res < 0)
2122 {
2123 ok = opt_result::failure_at (vect_location,
2124 "Loop costings may not be worthwhile.\n");
2125 goto again;
2126 }
2127 if (!res)
2128 return opt_result::failure_at (vect_location,
2129 "Loop costings not worthwhile.\n");
2130
2131 determine_peel_for_niter (loop_vinfo);
2132 /* If an epilogue loop is required make sure we can create one. */
2133 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2134 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2135 {
2136 if (dump_enabled_p ())
2137 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2138 if (!vect_can_advance_ivs_p (loop_vinfo)
2139 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2140 single_exit (LOOP_VINFO_LOOP
2141 (loop_vinfo))))
2142 {
2143 ok = opt_result::failure_at (vect_location,
2144 "not vectorized: can't create required "
2145 "epilog loop\n");
2146 goto again;
2147 }
2148 }
2149
2150 /* During peeling, we need to check whether the number of loop iterations
2151 is enough for both the peeled prologue loop and the vector loop. This
2152 check can be merged with the threshold check of loop versioning, so
2153 increase the threshold for this case if necessary. */
2154 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2155 {
2156 poly_uint64 niters_th = 0;
2157 unsigned int th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2158
2159 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2160 {
2161 /* Niters for peeled prolog loop. */
2162 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2163 {
2164 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2165 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2166 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2167 }
2168 else
2169 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2170 }
2171
2172 /* Niters for at least one iteration of vectorized loop. */
2173 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2174 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2175 /* One additional iteration because of peeling for gap. */
2176 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2177 niters_th += 1;
2178
2179 /* Use the same condition as vect_transform_loop to decide when to use
2180 the cost to determine a versioning threshold. */
2181 if (th >= vect_vf_for_cost (loop_vinfo)
2182 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2183 && ordered_p (th, niters_th))
2184 niters_th = ordered_max (poly_uint64 (th), niters_th);
2185
2186 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2187 }
2188
2189 gcc_assert (known_eq (vectorization_factor,
2190 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2191
2192 /* Ok to vectorize! */
2193 return opt_result::success ();
2194
2195 again:
2196 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2197 gcc_assert (!ok);
2198
2199 /* Try again with SLP forced off, but if we didn't do any SLP there is
2200 no point in re-trying. */
2201 if (!slp)
2202 return ok;
2203
2204 /* If there are reduction chains re-trying will fail anyway. */
2205 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2206 return ok;
2207
2208 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2209 via interleaving or lane instructions. */
2210 slp_instance instance;
2211 slp_tree node;
2212 unsigned i, j;
2213 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2214 {
2215 stmt_vec_info vinfo;
2216 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2217 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2218 continue;
2219 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2220 unsigned int size = DR_GROUP_SIZE (vinfo);
2221 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2222 if (! vect_store_lanes_supported (vectype, size, false)
2223 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2224 && ! vect_grouped_store_supported (vectype, size))
2225 return opt_result::failure_at (vinfo->stmt,
2226 "unsupported grouped store\n");
2227 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2228 {
2229 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2230 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2231 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2232 size = DR_GROUP_SIZE (vinfo);
2233 vectype = STMT_VINFO_VECTYPE (vinfo);
2234 if (! vect_load_lanes_supported (vectype, size, false)
2235 && ! vect_grouped_load_supported (vectype, single_element_p,
2236 size))
2237 return opt_result::failure_at (vinfo->stmt,
2238 "unsupported grouped load\n");
2239 }
2240 }
2241
2242 if (dump_enabled_p ())
2243 dump_printf_loc (MSG_NOTE, vect_location,
2244 "re-trying with SLP disabled\n");
2245
2246 /* Roll back state appropriately. No SLP this time. */
2247 slp = false;
2248 /* Restore the vectorization factor as it was without SLP. */
2249 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2250 /* Free the SLP instances. */
2251 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2252 vect_free_slp_instance (instance, false);
2253 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2254 /* Reset SLP type to loop_vect on all stmts. */
2255 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2256 {
2257 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2258 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2259 !gsi_end_p (si); gsi_next (&si))
2260 {
2261 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2262 STMT_SLP_TYPE (stmt_info) = loop_vect;
2263 }
2264 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2265 !gsi_end_p (si); gsi_next (&si))
2266 {
2267 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2268 STMT_SLP_TYPE (stmt_info) = loop_vect;
2269 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2270 {
2271 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2272 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2273 STMT_SLP_TYPE (stmt_info) = loop_vect;
2274 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2275 !gsi_end_p (pi); gsi_next (&pi))
2276 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2277 = loop_vect;
2278 }
2279 }
2280 }
2281 /* Free optimized alias test DDRS. */
2282 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2283 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2284 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2285 /* Reset target cost data. */
2286 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2287 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2288 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2289 /* Reset accumulated rgroup information. */
2290 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2291 /* Reset assorted flags. */
2292 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2293 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2294 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2295 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2296 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2297
2298 goto start_over;
2299 }
2300
2301 /* Function vect_analyze_loop.
2302
2303 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2304 for it. The different analyses will record information in the
2305 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL, LOOP is an epilogue
2306 of the loop described by ORIG_LOOP_VINFO and must itself be vectorized. */
2307 opt_loop_vec_info
2308 vect_analyze_loop (class loop *loop, loop_vec_info orig_loop_vinfo,
2309 vec_info_shared *shared)
2310 {
2311 auto_vector_sizes vector_sizes;
2312
2313 /* Autodetect first vector size we try. */
2314 current_vector_size = 0;
2315 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes,
2316 loop->simdlen != 0);
2317 unsigned int next_size = 0;
2318
2319 DUMP_VECT_SCOPE ("analyze_loop_nest");
2320
2321 if (loop_outer (loop)
2322 && loop_vec_info_for_loop (loop_outer (loop))
2323 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2324 return opt_loop_vec_info::failure_at (vect_location,
2325 "outer-loop already vectorized.\n");
2326
2327 if (!find_loop_nest (loop, &shared->loop_nest))
2328 return opt_loop_vec_info::failure_at
2329 (vect_location,
2330 "not vectorized: loop nest containing two or more consecutive inner"
2331 " loops cannot be vectorized\n");
2332
2333 unsigned n_stmts = 0;
2334 poly_uint64 autodetected_vector_size = 0;
2335 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2336 poly_uint64 first_vector_size = 0;
2337 while (1)
2338 {
2339 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2340 opt_loop_vec_info loop_vinfo
2341 = vect_analyze_loop_form (loop, shared);
2342 if (!loop_vinfo)
2343 {
2344 if (dump_enabled_p ())
2345 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2346 "bad loop form.\n");
2347 gcc_checking_assert (first_loop_vinfo == NULL);
2348 return loop_vinfo;
2349 }
2350
2351 bool fatal = false;
2352
2353 if (orig_loop_vinfo)
2354 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2355
2356 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2357 if (res)
2358 {
2359 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2360
2361 if (loop->simdlen
2362 && maybe_ne (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2363 (unsigned HOST_WIDE_INT) loop->simdlen))
2364 {
2365 if (first_loop_vinfo == NULL)
2366 {
2367 first_loop_vinfo = loop_vinfo;
2368 first_vector_size = current_vector_size;
2369 loop->aux = NULL;
2370 }
2371 else
2372 delete loop_vinfo;
2373 }
2374 else
2375 {
2376 delete first_loop_vinfo;
2377 return loop_vinfo;
2378 }
2379 }
2380 else
2381 delete loop_vinfo;
2382
2383 if (next_size == 0)
2384 autodetected_vector_size = current_vector_size;
2385
2386 if (next_size < vector_sizes.length ()
2387 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2388 next_size += 1;
2389
2390 if (fatal)
2391 {
2392 gcc_checking_assert (first_loop_vinfo == NULL);
2393 return opt_loop_vec_info::propagate_failure (res);
2394 }
2395
2396 if (next_size == vector_sizes.length ()
2397 || known_eq (current_vector_size, 0U))
2398 {
2399 if (first_loop_vinfo)
2400 {
2401 current_vector_size = first_vector_size;
2402 loop->aux = (loop_vec_info) first_loop_vinfo;
2403 if (dump_enabled_p ())
2404 {
2405 dump_printf_loc (MSG_NOTE, vect_location,
2406 "***** Choosing vector size ");
2407 dump_dec (MSG_NOTE, current_vector_size);
2408 dump_printf (MSG_NOTE, "\n");
2409 }
2410 return first_loop_vinfo;
2411 }
2412 else
2413 return opt_loop_vec_info::propagate_failure (res);
2414 }
2415
2416 /* Try the next biggest vector size. */
2417 current_vector_size = vector_sizes[next_size++];
2418 if (dump_enabled_p ())
2419 {
2420 dump_printf_loc (MSG_NOTE, vect_location,
2421 "***** Re-trying analysis with "
2422 "vector size ");
2423 dump_dec (MSG_NOTE, current_vector_size);
2424 dump_printf (MSG_NOTE, "\n");
2425 }
2426 }
2427 }
2428
2429 /* Return true if there is an in-order reduction function for CODE, storing
2430 it in *REDUC_FN if so. */
2431
2432 static bool
2433 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2434 {
2435 switch (code)
2436 {
2437 case PLUS_EXPR:
2438 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2439 return true;
2440
2441 default:
2442 return false;
2443 }
2444 }
2445
2446 /* Function reduction_fn_for_scalar_code
2447
2448 Input:
2449 CODE - tree_code of a reduction operation.
2450
2451 Output:
2452 REDUC_FN - the corresponding internal function to be used to reduce the
2453 vector of partial results into a single scalar result, or IFN_LAST
2454 if the operation is a supported reduction operation, but does not have
2455 such an internal function.
2456
2457 Return FALSE if CODE currently cannot be vectorized as reduction. */
2458
2459 static bool
2460 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2461 {
2462 switch (code)
2463 {
2464 case MAX_EXPR:
2465 *reduc_fn = IFN_REDUC_MAX;
2466 return true;
2467
2468 case MIN_EXPR:
2469 *reduc_fn = IFN_REDUC_MIN;
2470 return true;
2471
2472 case PLUS_EXPR:
2473 *reduc_fn = IFN_REDUC_PLUS;
2474 return true;
2475
2476 case BIT_AND_EXPR:
2477 *reduc_fn = IFN_REDUC_AND;
2478 return true;
2479
2480 case BIT_IOR_EXPR:
2481 *reduc_fn = IFN_REDUC_IOR;
2482 return true;
2483
2484 case BIT_XOR_EXPR:
2485 *reduc_fn = IFN_REDUC_XOR;
2486 return true;
2487
2488 case MULT_EXPR:
2489 case MINUS_EXPR:
2490 *reduc_fn = IFN_LAST;
2491 return true;
2492
2493 default:
2494 return false;
2495 }
2496 }
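/* Illustrative example only: for the hypothetical source loop

     int m = a[0];
     for (int i = 1; i < n; i++)
       m = m > a[i] ? m : a[i];

   the reduction operation is MAX_EXPR, so the vector of partial maxima is
   reduced to a scalar with IFN_REDUC_MAX.  A MULT_EXPR reduction is still
   accepted but gets IFN_LAST, meaning the final reduction has to be
   open-coded instead of using a single internal function.  */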
2497
2498 /* If there is a neutral value X such that SLP reduction NODE would not
2499 be affected by the introduction of additional X elements, return that X,
2500 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2501 is true if the SLP statements perform a single reduction, false if each
2502 statement performs an independent reduction. */
2503
2504 static tree
2505 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2506 bool reduc_chain)
2507 {
2508 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2509 stmt_vec_info stmt_vinfo = stmts[0];
2510 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2511 tree scalar_type = TREE_TYPE (vector_type);
2512 class loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2513 gcc_assert (loop);
2514
2515 switch (code)
2516 {
2517 case WIDEN_SUM_EXPR:
2518 case DOT_PROD_EXPR:
2519 case SAD_EXPR:
2520 case PLUS_EXPR:
2521 case MINUS_EXPR:
2522 case BIT_IOR_EXPR:
2523 case BIT_XOR_EXPR:
2524 return build_zero_cst (scalar_type);
2525
2526 case MULT_EXPR:
2527 return build_one_cst (scalar_type);
2528
2529 case BIT_AND_EXPR:
2530 return build_all_ones_cst (scalar_type);
2531
2532 case MAX_EXPR:
2533 case MIN_EXPR:
2534 /* For MIN/MAX the initial values are neutral. A reduction chain
2535 has only a single initial value, so that value is neutral for
2536 all statements. */
2537 if (reduc_chain)
2538 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2539 loop_preheader_edge (loop));
2540 return NULL_TREE;
2541
2542 default:
2543 return NULL_TREE;
2544 }
2545 }
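/* Illustrative example only: for a hypothetical two-lane SLP reduction

     s0 += a[2*i];
     s1 += a[2*i+1];

   padding the vector with the neutral value 0 leaves both partial sums
   unchanged.  Likewise 1 is neutral for MULT_EXPR and all-ones for
   BIT_AND_EXPR, whereas MIN/MAX have no general neutral value and can
   only reuse the single initial value of a reduction chain.  */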
2546
2547 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2548 STMT is printed with a message MSG. */
2549
2550 static void
2551 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2552 {
2553 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
2554 }
2555
2556 /* Return true if we need an in-order reduction for operation CODE
2557 on type TYPE, i.e. whether the reduction must be computed strictly
2558 in the original (left-to-right) evaluation order. */
2559
2560 bool
2561 needs_fold_left_reduction_p (tree type, tree_code code)
2562 {
2563 /* CHECKME: check for !flag_finite_math_only too? */
2564 if (SCALAR_FLOAT_TYPE_P (type))
2565 switch (code)
2566 {
2567 case MIN_EXPR:
2568 case MAX_EXPR:
2569 return false;
2570
2571 default:
2572 return !flag_associative_math;
2573 }
2574
2575 if (INTEGRAL_TYPE_P (type))
2576 {
2577 if (!operation_no_trapping_overflow (type, code))
2578 return true;
2579 return false;
2580 }
2581
2582 if (SAT_FIXED_POINT_TYPE_P (type))
2583 return true;
2584
2585 return false;
2586 }
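/* Illustrative example only: unless -fassociative-math is in effect, the
   hypothetical accumulation

     double s = 0.0;
     for (int i = 0; i < n; i++)
       s += x[i];

   must preserve the original left-to-right evaluation order, so this
   function returns true and the reduction can only be vectorized with the
   in-order strategy selected by fold_left_reduction_fn above
   (IFN_FOLD_LEFT_PLUS for PLUS_EXPR).  */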
2587
2588 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2589 reduction operation CODE has a handled computation expression. */
2590
2591 static bool
2592 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2593 tree loop_arg, enum tree_code code,
2594 vec<std::pair<ssa_op_iter, use_operand_p> > &path)
2595 {
2596 auto_bitmap visited;
2597 tree lookfor = PHI_RESULT (phi);
2598 ssa_op_iter curri;
2599 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2600 while (USE_FROM_PTR (curr) != loop_arg)
2601 curr = op_iter_next_use (&curri);
2602 curri.i = curri.numops;
2603 do
2604 {
2605 path.safe_push (std::make_pair (curri, curr));
2606 tree use = USE_FROM_PTR (curr);
2607 if (use == lookfor)
2608 break;
2609 gimple *def = SSA_NAME_DEF_STMT (use);
2610 if (gimple_nop_p (def)
2611 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2612 {
2613 pop:
2614 do
2615 {
2616 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2617 curri = x.first;
2618 curr = x.second;
2619 do
2620 curr = op_iter_next_use (&curri);
2621 /* Skip already visited or non-SSA operands (from iterating
2622 over PHI args). */
2623 while (curr != NULL_USE_OPERAND_P
2624 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2625 || ! bitmap_set_bit (visited,
2626 SSA_NAME_VERSION
2627 (USE_FROM_PTR (curr)))));
2628 }
2629 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2630 if (curr == NULL_USE_OPERAND_P)
2631 break;
2632 }
2633 else
2634 {
2635 if (gimple_code (def) == GIMPLE_PHI)
2636 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2637 else
2638 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2639 while (curr != NULL_USE_OPERAND_P
2640 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2641 || ! bitmap_set_bit (visited,
2642 SSA_NAME_VERSION
2643 (USE_FROM_PTR (curr)))))
2644 curr = op_iter_next_use (&curri);
2645 if (curr == NULL_USE_OPERAND_P)
2646 goto pop;
2647 }
2648 }
2649 while (1);
2650 if (dump_file && (dump_flags & TDF_DETAILS))
2651 {
2652 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2653 unsigned i;
2654 std::pair<ssa_op_iter, use_operand_p> *x;
2655 FOR_EACH_VEC_ELT (path, i, x)
2656 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
2657 dump_printf (MSG_NOTE, "\n");
2658 }
2659
2660 /* Check whether the reduction path detected is valid. */
2661 bool fail = path.length () == 0;
2662 bool neg = false;
2663 for (unsigned i = 1; i < path.length (); ++i)
2664 {
2665 gimple *use_stmt = USE_STMT (path[i].second);
2666 tree op = USE_FROM_PTR (path[i].second);
2667 if (! has_single_use (op)
2668 || ! is_gimple_assign (use_stmt)
2669 /* The following makes sure we can compute the operand index
2670 easily, and it mostly disallows chaining via COND_EXPR condition
2671 operands. */
2672 || (gimple_assign_rhs1 (use_stmt) != op
2673 && gimple_assign_rhs2 (use_stmt) != op
2674 && gimple_assign_rhs3 (use_stmt) != op))
2675 {
2676 fail = true;
2677 break;
2678 }
2679 if (gimple_assign_rhs_code (use_stmt) != code)
2680 {
2681 if (code == PLUS_EXPR
2682 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2683 {
2684 /* Track whether we negate the reduction value each iteration. */
2685 if (gimple_assign_rhs2 (use_stmt) == op)
2686 neg = ! neg;
2687 }
2688 else
2689 {
2690 fail = true;
2691 break;
2692 }
2693 }
2694 }
2695 return ! fail && ! neg;
2696 }
2697
2698 bool
2699 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2700 tree loop_arg, enum tree_code code)
2701 {
2702 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2703 return check_reduction_path (loc, loop, phi, loop_arg, code, path);
2704 }
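/* Illustrative example only: for the hypothetical loop body

     tmp = sum + a[i];
     sum = tmp + b[i];

   the path walked above leads from the latch definition of sum through
   tmp back to the loop PHI result, and consists of two PLUS_EXPR
   statements each using the previous value exactly once, so it is
   accepted as a valid multi-statement reduction.  */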
2705
2706
2707
2708 /* Function vect_is_simple_reduction
2709
2710 (1) Detect a cross-iteration def-use cycle that represents a simple
2711 reduction computation. We look for the following pattern:
2712
2713 loop_header:
2714 a1 = phi < a0, a2 >
2715 a3 = ...
2716 a2 = operation (a3, a1)
2717
2718 or
2719
2720 a3 = ...
2721 loop_header:
2722 a1 = phi < a0, a2 >
2723 a2 = operation (a3, a1)
2724
2725 such that:
2726 1. operation is commutative and associative and it is safe to
2727 change the order of the computation
2728 2. no uses for a2 in the loop (a2 is used out of the loop)
2729 3. no uses of a1 in the loop besides the reduction operation
2730 4. no uses of a1 outside the loop.
2731
2732 Conditions 1,4 are tested here.
2733 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2734
2735 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2736 nested cycles.
2737
2738 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2739 reductions:
2740
2741 a1 = phi < a0, a2 >
2742 inner loop (def of a3)
2743 a2 = phi < a3 >
2744
2745 (4) Detect condition expressions, i.e.:
2746 for (int i = 0; i < N; i++)
2747 if (a[i] < val)
2748 ret_val = a[i];
2749
2750 */
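/* Illustrative example only: a hypothetical double reduction as in (3),
   written in source form, is

     int sum = 0;
     for (int j = 0; j < m; j++)
       for (int i = 0; i < n; i++)
         sum += a[j][i];

   where the outer-loop PHI for sum receives the inner-loop reduction
   result through a loop-closed PHI of the inner loop.  */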
2751
2752 static stmt_vec_info
2753 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
2754 bool *double_reduc)
2755 {
2756 gphi *phi = as_a <gphi *> (phi_info->stmt);
2757 gimple *phi_use_stmt = NULL;
2758 imm_use_iterator imm_iter;
2759 use_operand_p use_p;
2760
2761 *double_reduc = false;
2762 STMT_VINFO_REDUC_TYPE (phi_info) = TREE_CODE_REDUCTION;
2763
2764 tree phi_name = PHI_RESULT (phi);
2765 /* ??? If there are no uses of the PHI result the inner loop reduction
2766 won't be detected as possibly double-reduction by vectorizable_reduction
2767 because that tries to walk the PHI arg from the preheader edge which
2768 can be constant. See PR60382. */
2769 if (has_zero_uses (phi_name))
2770 return NULL;
2771 class loop *loop = (gimple_bb (phi))->loop_father;
2772 unsigned nphi_def_loop_uses = 0;
2773 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2774 {
2775 gimple *use_stmt = USE_STMT (use_p);
2776 if (is_gimple_debug (use_stmt))
2777 continue;
2778
2779 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2780 {
2781 if (dump_enabled_p ())
2782 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2783 "intermediate value used outside loop.\n");
2784
2785 return NULL;
2786 }
2787
2788 nphi_def_loop_uses++;
2789 phi_use_stmt = use_stmt;
2790 }
2791
2792 tree latch_def = PHI_ARG_DEF_FROM_EDGE (phi, loop_latch_edge (loop));
2793 if (TREE_CODE (latch_def) != SSA_NAME)
2794 {
2795 if (dump_enabled_p ())
2796 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2797 "reduction: not ssa_name: %T\n", latch_def);
2798 return NULL;
2799 }
2800
2801 stmt_vec_info def_stmt_info = loop_info->lookup_def (latch_def);
2802 if (!def_stmt_info
2803 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
2804 return NULL;
2805
2806 bool nested_in_vect_loop
2807 = flow_loop_nested_p (LOOP_VINFO_LOOP (loop_info), loop);
2808 unsigned nlatch_def_loop_uses = 0;
2809 auto_vec<gphi *, 3> lcphis;
2810 bool inner_loop_of_double_reduc = false;
2811 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, latch_def)
2812 {
2813 gimple *use_stmt = USE_STMT (use_p);
2814 if (is_gimple_debug (use_stmt))
2815 continue;
2816 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2817 nlatch_def_loop_uses++;
2818 else
2819 {
2820 /* We can have more than one loop-closed PHI. */
2821 lcphis.safe_push (as_a <gphi *> (use_stmt));
2822 if (nested_in_vect_loop
2823 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
2824 == vect_double_reduction_def))
2825 inner_loop_of_double_reduc = true;
2826 }
2827 }
2828
2829 /* If we are vectorizing an inner reduction, we execute it in the
2830 original order only when we are not dealing with a double
2831 reduction. */
2832 if (nested_in_vect_loop && !inner_loop_of_double_reduc)
2833 {
2834 if (dump_enabled_p ())
2835 report_vect_op (MSG_NOTE, def_stmt_info->stmt,
2836 "detected nested cycle: ");
2837 return def_stmt_info;
2838 }
2839
2840 /* If this isn't a nested cycle, or if the nested cycle reduction value
2841 is used outside of the inner loop, we cannot handle uses of the
2842 reduction value. */
2843 if (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1)
2844 {
2845 if (dump_enabled_p ())
2846 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2847 "reduction used in loop.\n");
2848 return NULL;
2849 }
2850
2851 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2852 defined in the inner loop. */
2853 if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
2854 {
2855 tree op1 = PHI_ARG_DEF (def_stmt, 0);
2856 if (gimple_phi_num_args (def_stmt) != 1
2857 || TREE_CODE (op1) != SSA_NAME)
2858 {
2859 if (dump_enabled_p ())
2860 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2861 "unsupported phi node definition.\n");
2862
2863 return NULL;
2864 }
2865
2866 gimple *def1 = SSA_NAME_DEF_STMT (op1);
2867 if (gimple_bb (def1)
2868 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2869 && loop->inner
2870 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2871 && is_gimple_assign (def1)
2872 && is_a <gphi *> (phi_use_stmt)
2873 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2874 {
2875 if (dump_enabled_p ())
2876 report_vect_op (MSG_NOTE, def_stmt,
2877 "detected double reduction: ");
2878
2879 *double_reduc = true;
2880 return def_stmt_info;
2881 }
2882
2883 return NULL;
2884 }
2885
2886 gassign *def_stmt = dyn_cast <gassign *> (def_stmt_info->stmt);
2887 if (!def_stmt)
2888 {
2889 if (dump_enabled_p ())
2890 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2891 "reduction: unhandled reduction operation: %G",
2892 def_stmt_info->stmt);
2893 return NULL;
2894 }
2895 enum tree_code code = gimple_assign_rhs_code (def_stmt);
2896
2897 /* We can handle "res -= x[i]", which is non-associative, by
2898 simply rewriting it as "res += -x[i]". Avoid changing the
2899 gimple instruction for the first simple tests and only do this
2900 if we're allowed to change the code at all. */
2901 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
2902 code = PLUS_EXPR;
2903
2904 tree op1, op2;
2905 if (code == COND_EXPR)
2906 {
2907 if (! nested_in_vect_loop)
2908 STMT_VINFO_REDUC_TYPE (phi_info) = COND_REDUCTION;
2909 op1 = gimple_assign_rhs2 (def_stmt);
2910 op2 = gimple_assign_rhs3 (def_stmt);
2911 }
2912 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
2913 {
2914 op1 = gimple_assign_rhs1 (def_stmt);
2915 op2 = gimple_assign_rhs2 (def_stmt);
2916 }
2917 else
2918 {
2919 if (dump_enabled_p ())
2920 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2921 "reduction: not handled operation: ");
2922 return NULL;
2923 }
2924
2925 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2926 {
2927 if (dump_enabled_p ())
2928 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2929 "reduction: both uses not ssa_names: ");
2930
2931 return NULL;
2932 }
2933
2934 /* Reduction is safe. We're dealing with one of the following:
2935 1) integer arithmetic and no trapv
2936 2) floating point arithmetic, and special flags permit this optimization
2937 3) nested cycle (i.e., outer loop vectorization). */
2938
2939 /* Check for the simple case that one def is the reduction def,
2940 defined by the PHI node. */
2941 stmt_vec_info def1_info = loop_info->lookup_def (op1);
2942 stmt_vec_info def2_info = loop_info->lookup_def (op2);
2943 if (def2_info && def2_info->stmt == phi)
2944 {
2945 STMT_VINFO_REDUC_IDX (def_stmt_info) = 1 + (code == COND_EXPR ? 1 : 0);
2946 if (dump_enabled_p ())
2947 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2948 return def_stmt_info;
2949 }
2950 else if (def1_info && def1_info->stmt == phi)
2951 {
2952 STMT_VINFO_REDUC_IDX (def_stmt_info) = 0 + (code == COND_EXPR ? 1 : 0);
2953 if (dump_enabled_p ())
2954 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2955 return def_stmt_info;
2956 }
2957
2958 /* Look for the expression computing latch_def from the loop PHI result
2959 in a way involving more than one stmt. */
2960 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2961 if (check_reduction_path (vect_location, loop, phi, latch_def, code,
2962 path))
2963 {
2964 /* Try building an SLP reduction chain for which the additional
2965 restriction is that all operations in the chain are the same. */
2966 auto_vec<stmt_vec_info, 8> reduc_chain;
2967 unsigned i;
2968 bool is_slp_reduc = !nested_in_vect_loop && code != COND_EXPR;
2969 for (i = path.length () - 1; i >= 1; --i)
2970 {
2971 gimple *stmt = USE_STMT (path[i].second);
2972 if (gimple_assign_rhs_code (stmt) != code)
2973 is_slp_reduc = false;
2974 stmt_vec_info stmt_info = loop_info->lookup_stmt (stmt);
2975 STMT_VINFO_REDUC_IDX (stmt_info)
2976 = path[i].second->use - gimple_assign_rhs1_ptr (stmt);
2977 reduc_chain.safe_push (stmt_info);
2978 }
2979 if (is_slp_reduc)
2980 {
2981 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
2982 {
2983 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
2984 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
2985 }
2986 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
2987 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
2988
2989 /* Save the chain for further analysis in SLP detection. */
2990 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
2991 REDUC_GROUP_SIZE (reduc_chain[0]) = reduc_chain.length ();
2992
2993 if (dump_enabled_p ())
2994 report_vect_op (MSG_NOTE, def_stmt,
2995 "reduction: detected reduction chain: ");
2996 }
2997
2998 return def_stmt_info;
2999 }
3000
3001 if (dump_enabled_p ())
3002 {
3003 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3004 "reduction: unknown pattern: ");
3005 }
3006
3007 return NULL;
3008 }
3009
3010 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3011 int
3012 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3013 int *peel_iters_epilogue,
3014 stmt_vector_for_cost *scalar_cost_vec,
3015 stmt_vector_for_cost *prologue_cost_vec,
3016 stmt_vector_for_cost *epilogue_cost_vec)
3017 {
3018 int retval = 0;
3019 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3020
3021 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3022 {
3023 *peel_iters_epilogue = assumed_vf / 2;
3024 if (dump_enabled_p ())
3025 dump_printf_loc (MSG_NOTE, vect_location,
3026 "cost model: epilogue peel iters set to vf/2 "
3027 "because loop iterations are unknown.\n");
3028
3029 /* If peeled iterations are known but the number of scalar loop
3030 iterations is unknown, count a taken branch per peeled loop. */
3031 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3032 NULL, 0, vect_prologue);
3033 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3034 NULL, 0, vect_epilogue);
3035 }
3036 else
3037 {
3038 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3039 peel_iters_prologue = niters < peel_iters_prologue ?
3040 niters : peel_iters_prologue;
3041 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3042 /* If we need to peel for gaps but no epilogue peeling would otherwise
3043 be required, we have to peel VF iterations. */
3044 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3045 *peel_iters_epilogue = assumed_vf;
3046 }
3047
3048 stmt_info_for_cost *si;
3049 int j;
3050 if (peel_iters_prologue)
3051 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3052 retval += record_stmt_cost (prologue_cost_vec,
3053 si->count * peel_iters_prologue,
3054 si->kind, si->stmt_info, si->misalign,
3055 vect_prologue);
3056 if (*peel_iters_epilogue)
3057 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3058 retval += record_stmt_cost (epilogue_cost_vec,
3059 si->count * *peel_iters_epilogue,
3060 si->kind, si->stmt_info, si->misalign,
3061 vect_epilogue);
3062
3063 return retval;
3064 }
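/* Illustrative example only (hypothetical counts): with known niters == 100,
   peel_iters_prologue == 3 and an assumed vectorization factor of 8, the
   epilogue peel count computed above is (100 - 3) % 8 == 1, so the scalar
   single-iteration costs are charged three times to the prologue and once
   to the epilogue.  */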
3065
3066 /* Function vect_estimate_min_profitable_iters
3067
3068 Return the number of iterations required for the vector version of the
3069 loop to be profitable relative to the cost of the scalar version of the
3070 loop.
3071
3072 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3073 of iterations for vectorization. A value of -1 means loop vectorization
3074 is not profitable. This returned value may be used for a dynamic
3075 profitability check.
3076
3077 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3078 for static check against estimated number of iterations. */
3079
3080 static void
3081 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3082 int *ret_min_profitable_niters,
3083 int *ret_min_profitable_estimate)
3084 {
3085 int min_profitable_iters;
3086 int min_profitable_estimate;
3087 int peel_iters_prologue;
3088 int peel_iters_epilogue;
3089 unsigned vec_inside_cost = 0;
3090 int vec_outside_cost = 0;
3091 unsigned vec_prologue_cost = 0;
3092 unsigned vec_epilogue_cost = 0;
3093 int scalar_single_iter_cost = 0;
3094 int scalar_outside_cost = 0;
3095 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3096 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3097 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3098
3099 /* Cost model disabled. */
3100 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3101 {
3102 if (dump_enabled_p ())
3103 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3104 *ret_min_profitable_niters = 0;
3105 *ret_min_profitable_estimate = 0;
3106 return;
3107 }
3108
3109 /* Requires loop versioning tests to handle misalignment. */
3110 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3111 {
3112 /* FIXME: Make cost depend on complexity of individual check. */
3113 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3114 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3115 vect_prologue);
3116 if (dump_enabled_p ())
3117 dump_printf (MSG_NOTE,
3118 "cost model: Adding cost of checks for loop "
3119 "versioning to treat misalignment.\n");
3120 }
3121
3122 /* Requires loop versioning with alias checks. */
3123 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3124 {
3125 /* FIXME: Make cost depend on complexity of individual check. */
3126 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3127 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3128 vect_prologue);
3129 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3130 if (len)
3131 /* Count LEN - 1 ANDs and LEN comparisons. */
3132 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3133 NULL, 0, vect_prologue);
3134 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3135 if (len)
3136 {
3137 /* Count LEN - 1 ANDs and LEN comparisons. */
3138 unsigned int nstmts = len * 2 - 1;
3139 /* +1 for each bias that needs adding. */
3140 for (unsigned int i = 0; i < len; ++i)
3141 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3142 nstmts += 1;
3143 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3144 NULL, 0, vect_prologue);
3145 }
3146 if (dump_enabled_p ())
3147 dump_printf (MSG_NOTE,
3148 "cost model: Adding cost of checks for loop "
3149 "versioning aliasing.\n");
3150 }
3151
3152 /* Requires loop versioning with niter checks. */
3153 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3154 {
3155 /* FIXME: Make cost depend on complexity of individual check. */
3156 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3157 vect_prologue);
3158 if (dump_enabled_p ())
3159 dump_printf (MSG_NOTE,
3160 "cost model: Adding cost of checks for loop "
3161 "versioning niters.\n");
3162 }
3163
3164 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3165 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3166 vect_prologue);
3167
3168 /* Count statements in scalar loop. Using this as scalar cost for a single
3169 iteration for now.
3170
3171 TODO: Add outer loop support.
3172
3173 TODO: Consider assigning different costs to different scalar
3174 statements. */
3175
3176 scalar_single_iter_cost
3177 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3178
3179 /* Add additional cost for the peeled instructions in prologue and epilogue
3180 loop. (For fully-masked loops there will be no peeling.)
3181
3182 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3183 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3184
3185 TODO: Build an expression that represents peel_iters for prologue and
3186 epilogue to be used in a run-time test. */
3187
3188 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3189 {
3190 peel_iters_prologue = 0;
3191 peel_iters_epilogue = 0;
3192
3193 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3194 {
3195 /* We need to peel exactly one iteration. */
3196 peel_iters_epilogue += 1;
3197 stmt_info_for_cost *si;
3198 int j;
3199 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3200 j, si)
3201 (void) add_stmt_cost (target_cost_data, si->count,
3202 si->kind, si->stmt_info, si->misalign,
3203 vect_epilogue);
3204 }
3205 }
3206 else if (npeel < 0)
3207 {
3208 peel_iters_prologue = assumed_vf / 2;
3209 if (dump_enabled_p ())
3210 dump_printf (MSG_NOTE, "cost model: "
3211 "prologue peel iters set to vf/2.\n");
3212
3213 /* If peeling for alignment is unknown, the loop bound of the main loop
3214 becomes unknown. */
3215 peel_iters_epilogue = assumed_vf / 2;
3216 if (dump_enabled_p ())
3217 dump_printf (MSG_NOTE, "cost model: "
3218 "epilogue peel iters set to vf/2 because "
3219 "peeling for alignment is unknown.\n");
3220
3221 /* If peeled iterations are unknown, count a taken branch and a not taken
3222 branch per peeled loop. Even if scalar loop iterations are known,
3223 vector iterations are not known since peeled prologue iterations are
3224 not known. Hence guards remain the same. */
3225 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3226 NULL, 0, vect_prologue);
3227 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3228 NULL, 0, vect_prologue);
3229 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3230 NULL, 0, vect_epilogue);
3231 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3232 NULL, 0, vect_epilogue);
3233 stmt_info_for_cost *si;
3234 int j;
3235 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3236 {
3237 (void) add_stmt_cost (target_cost_data,
3238 si->count * peel_iters_prologue,
3239 si->kind, si->stmt_info, si->misalign,
3240 vect_prologue);
3241 (void) add_stmt_cost (target_cost_data,
3242 si->count * peel_iters_epilogue,
3243 si->kind, si->stmt_info, si->misalign,
3244 vect_epilogue);
3245 }
3246 }
3247 else
3248 {
3249 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3250 stmt_info_for_cost *si;
3251 int j;
3252 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3253
3254 prologue_cost_vec.create (2);
3255 epilogue_cost_vec.create (2);
3256 peel_iters_prologue = npeel;
3257
3258 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3259 &peel_iters_epilogue,
3260 &LOOP_VINFO_SCALAR_ITERATION_COST
3261 (loop_vinfo),
3262 &prologue_cost_vec,
3263 &epilogue_cost_vec);
3264
3265 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3266 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3267 si->misalign, vect_prologue);
3268
3269 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3270 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3271 si->misalign, vect_epilogue);
3272
3273 prologue_cost_vec.release ();
3274 epilogue_cost_vec.release ();
3275 }
3276
3277 /* FORNOW: The scalar outside cost is incremented in one of the
3278 following ways:
3279
3280 1. The vectorizer checks for alignment and aliasing and generates
3281 a condition that allows dynamic vectorization. A cost model
3282 check is ANDED with the versioning condition. Hence scalar code
3283 path now has the added cost of the versioning check.
3284
3285 if (cost > th & versioning_check)
3286 jmp to vector code
3287
3288 Hence the run-time scalar cost is incremented by a not-taken branch cost.
3289
3290 2. The vectorizer then checks if a prologue is required. If the
3291 cost model check was not done before during versioning, it has to
3292 be done before the prologue check.
3293
3294 if (cost <= th)
3295 prologue = scalar_iters
3296 if (prologue == 0)
3297 jmp to vector code
3298 else
3299 execute prologue
3300 if (prologue == num_iters)
3301 go to exit
3302
3303 Hence the run-time scalar cost is incremented by a taken branch,
3304 plus a not-taken branch, plus a taken branch cost.
3305
3306 3. The vectorizer then checks if an epilogue is required. If the
3307 cost model check was not done before during prologue check, it
3308 has to be done with the epilogue check.
3309
3310 if (prologue == 0)
3311 jmp to vector code
3312 else
3313 execute prologue
3314 if (prologue == num_iters)
3315 go to exit
3316 vector code:
3317 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3318 jmp to epilogue
3319
3320 Hence the run-time scalar cost should be incremented by 2 taken
3321 branches.
3322
3323 TODO: The back end may reorder the BBs differently and reverse
3324 conditions/branch directions. Change the estimates below to
3325 something more reasonable. */
3326
3327 /* If the number of iterations is known and we do not do versioning, we can
3328 decide whether to vectorize at compile time. Hence the scalar version
3329 does not carry cost model guard costs. */
3330 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3331 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3332 {
3333 /* Cost model check occurs at versioning. */
3334 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3335 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3336 else
3337 {
3338 /* Cost model check occurs at prologue generation. */
3339 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3340 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3341 + vect_get_stmt_cost (cond_branch_not_taken);
3342 /* Cost model check occurs at epilogue generation. */
3343 else
3344 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3345 }
3346 }
3347
3348 /* Complete the target-specific cost calculations. */
3349 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3350 &vec_inside_cost, &vec_epilogue_cost);
3351
3352 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3353
3354 if (dump_enabled_p ())
3355 {
3356 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3357 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3358 vec_inside_cost);
3359 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3360 vec_prologue_cost);
3361 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3362 vec_epilogue_cost);
3363 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3364 scalar_single_iter_cost);
3365 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3366 scalar_outside_cost);
3367 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3368 vec_outside_cost);
3369 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3370 peel_iters_prologue);
3371 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3372 peel_iters_epilogue);
3373 }
3374
3375 /* Calculate number of iterations required to make the vector version
3376 profitable, relative to the loop bodies only. The following condition
3377 must hold true:
3378 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
3379 where
3380 SIC = scalar iteration cost, VIC = vector iteration cost,
3381 VOC = vector outside cost, VF = vectorization factor,
3382 NPEEL = prologue iterations + epilogue iterations,
3383 SOC = scalar outside cost for run time cost model check. */
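/* Illustrative example only (hypothetical costs): with SIC == 4, VIC == 10,
   VF == 4, VOC == 30, SOC == 6 and NPEEL == 0, the condition above becomes
   4 * niters + 6 > 10 * (niters / 4) + 30, i.e. 1.5 * niters > 24, so the
   vector version starts to pay off at niters >= 17.  */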
3384
3385 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
3386 - vec_inside_cost);
3387 if (saving_per_viter <= 0)
3388 {
3389 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3390 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3391 "vectorization did not happen for a simd loop");
3392
3393 if (dump_enabled_p ())
3394 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3395 "cost model: the vector iteration cost = %d "
3396 "divided by the scalar iteration cost = %d "
3397 "is greater or equal to the vectorization factor = %d"
3398 ".\n",
3399 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3400 *ret_min_profitable_niters = -1;
3401 *ret_min_profitable_estimate = -1;
3402 return;
3403 }
3404
3405 /* ??? The "if" arm is written to handle all cases; see below for what
3406 we would do for !LOOP_VINFO_FULLY_MASKED_P. */
3407 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3408 {
3409 /* Rewriting the condition above in terms of the number of
3410 vector iterations (vniters) rather than the number of
3411 scalar iterations (niters) gives:
3412
3413 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
3414
3415 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
3416
3417 For integer N, X and Y when X > 0:
3418
3419 N * X > Y <==> N >= (Y /[floor] X) + 1. */
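/* For instance, with hypothetical values Y = 10 and X = 3 this gives
N >= 10 / 3 + 1 = 4: indeed 4 * 3 = 12 > 10 while 3 * 3 = 9 is not.  */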
3420 int outside_overhead = (vec_outside_cost
3421 - scalar_single_iter_cost * peel_iters_prologue
3422 - scalar_single_iter_cost * peel_iters_epilogue
3423 - scalar_outside_cost);
3424 /* We're only interested in cases that require at least one
3425 vector iteration. */
3426 int min_vec_niters = 1;
3427 if (outside_overhead > 0)
3428 min_vec_niters = outside_overhead / saving_per_viter + 1;
3429
3430 if (dump_enabled_p ())
3431 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
3432 min_vec_niters);
3433
3434 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3435 {
3436 /* Now that we know the minimum number of vector iterations,
3437 find the minimum niters for which the scalar cost is larger:
3438
3439 SIC * niters > VIC * vniters + VOC - SOC
3440
3441 We know that the minimum niters is no more than
3442 vniters * VF + NPEEL, but it might be (and often is) less
3443 than that if a partial vector iteration is cheaper than the
3444 equivalent scalar code. */
3445 int threshold = (vec_inside_cost * min_vec_niters
3446 + vec_outside_cost
3447 - scalar_outside_cost);
3448 if (threshold <= 0)
3449 min_profitable_iters = 1;
3450 else
3451 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
3452 }
3453 else
3454 /* Convert the number of vector iterations into a number of
3455 scalar iterations. */
3456 min_profitable_iters = (min_vec_niters * assumed_vf
3457 + peel_iters_prologue
3458 + peel_iters_epilogue);
3459 }
3460 else
3461 {
3462 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3463 * assumed_vf
3464 - vec_inside_cost * peel_iters_prologue
3465 - vec_inside_cost * peel_iters_epilogue);
3466 if (min_profitable_iters <= 0)
3467 min_profitable_iters = 0;
3468 else
3469 {
3470 min_profitable_iters /= saving_per_viter;
3471
3472 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3473 <= (((int) vec_inside_cost * min_profitable_iters)
3474 + (((int) vec_outside_cost - scalar_outside_cost)
3475 * assumed_vf)))
3476 min_profitable_iters++;
3477 }
3478 }
3479
3480 if (dump_enabled_p ())
3481 dump_printf (MSG_NOTE,
3482 " Calculated minimum iters for profitability: %d\n",
3483 min_profitable_iters);
3484
3485 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3486 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3487 /* We want the vectorized loop to execute at least once. */
3488 min_profitable_iters = assumed_vf + peel_iters_prologue;
3489
3490 if (dump_enabled_p ())
3491 dump_printf_loc (MSG_NOTE, vect_location,
3492 " Runtime profitability threshold = %d\n",
3493 min_profitable_iters);
3494
3495 *ret_min_profitable_niters = min_profitable_iters;
3496
3497 /* Calculate number of iterations required to make the vector version
3498 profitable, relative to the loop bodies only.
3499
3500 The cost of the non-vectorized variant is SIC * niters and it must
3501 exceed the vector variant's cost at the expected trip count, i.e.,
3502 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
3503
3504 if (vec_outside_cost <= 0)
3505 min_profitable_estimate = 0;
3506 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3507 {
3508 /* This is a repeat of the code above, but with + SOC rather
3509 than - SOC. */
3510 int outside_overhead = (vec_outside_cost
3511 - scalar_single_iter_cost * peel_iters_prologue
3512 - scalar_single_iter_cost * peel_iters_epilogue
3513 + scalar_outside_cost);
3514 int min_vec_niters = 1;
3515 if (outside_overhead > 0)
3516 min_vec_niters = outside_overhead / saving_per_viter + 1;
3517
3518 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3519 {
3520 int threshold = (vec_inside_cost * min_vec_niters
3521 + vec_outside_cost
3522 + scalar_outside_cost);
3523 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
3524 }
3525 else
3526 min_profitable_estimate = (min_vec_niters * assumed_vf
3527 + peel_iters_prologue
3528 + peel_iters_epilogue);
3529 }
3530 else
3531 {
3532 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3533 * assumed_vf
3534 - vec_inside_cost * peel_iters_prologue
3535 - vec_inside_cost * peel_iters_epilogue)
3536 / ((scalar_single_iter_cost * assumed_vf)
3537 - vec_inside_cost);
3538 }
3539 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3540 if (dump_enabled_p ())
3541 dump_printf_loc (MSG_NOTE, vect_location,
3542 " Static estimate profitability threshold = %d\n",
3543 min_profitable_estimate);
3544
3545 *ret_min_profitable_estimate = min_profitable_estimate;
3546 }
3547
3548 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3549 vector elements (not bits) for a vector with NELT elements. */
3550 static void
3551 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3552 vec_perm_builder *sel)
3553 {
3554 /* The encoding is a single stepped pattern. Any wrap-around is handled
3555 by vec_perm_indices. */
3556 sel->new_vector (nelt, 1, 3);
3557 for (unsigned int i = 0; i < 3; i++)
3558 sel->quick_push (i + offset);
3559 }
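/* For example, OFFSET = 2 and NELT = 8 (hypothetical values) encode the
stepped pattern {2, 3, 4}, which expands to the selector
{2, 3, 4, 5, 6, 7, 8, 9}; indices 8 and 9 select from the second
vec_perm operand, which the reduction epilogue below passes as a zero
vector to realize the shift.  */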
3560
3561 /* Checks whether the target supports whole-vector shifts for vectors of mode
3562 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3563 it supports vec_perm_const with masks for all necessary shift amounts. */
3564 static bool
3565 have_whole_vector_shift (machine_mode mode)
3566 {
3567 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3568 return true;
3569
3570 /* Variable-length vectors should be handled via the optab. */
3571 unsigned int nelt;
3572 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3573 return false;
3574
3575 vec_perm_builder sel;
3576 vec_perm_indices indices;
3577 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3578 {
3579 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3580 indices.new_vector (sel, 2, nelt);
3581 if (!can_vec_perm_const_p (mode, indices, false))
3582 return false;
3583 }
3584 return true;
3585 }
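/* For example, with NELT = 8 the loop above checks shifts by 4, 2 and 1
elements, exactly the offsets the shift-based reduction epilogue below
asks for.  */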
3586
3587 /* TODO: There is a close dependency between the vect_model_*_cost and
3588 vectorizable_* functions. Redesign to avoid maintenance issues. */
3589
3590 /* Function vect_model_reduction_cost.
3591
3592 Models cost for a reduction operation, including the vector ops
3593 generated within the strip-mine loop, the initial definition before
3594 the loop, and the epilogue code that must be generated. */
3595
3596 static void
3597 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3598 vect_reduction_type reduction_type,
3599 int ncopies, stmt_vector_for_cost *cost_vec)
3600 {
3601 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3602 enum tree_code code;
3603 optab optab;
3604 tree vectype;
3605 machine_mode mode;
3606 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3607 class loop *loop = NULL;
3608
3609 if (loop_vinfo)
3610 loop = LOOP_VINFO_LOOP (loop_vinfo);
3611
3612 /* Condition reductions generate two reductions in the loop. */
3613 if (reduction_type == COND_REDUCTION)
3614 ncopies *= 2;
3615
3616 vectype = STMT_VINFO_VECTYPE (stmt_info);
3617 mode = TYPE_MODE (vectype);
3618 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3619
3620 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3621
3622 if (reduction_type == EXTRACT_LAST_REDUCTION
3623 || reduction_type == FOLD_LEFT_REDUCTION)
3624 {
3625 /* No extra instructions needed in the prologue. */
3626 prologue_cost = 0;
3627
3628 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3629 /* Count one reduction-like operation per vector. */
3630 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3631 stmt_info, 0, vect_body);
3632 else
3633 {
3634 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3635 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3636 inside_cost = record_stmt_cost (cost_vec, nelements,
3637 vec_to_scalar, stmt_info, 0,
3638 vect_body);
3639 inside_cost += record_stmt_cost (cost_vec, nelements,
3640 scalar_stmt, stmt_info, 0,
3641 vect_body);
3642 }
3643 }
3644 else
3645 {
3646 /* Add in cost for initial definition.
3647 For cond reduction we have four vectors: initial index, step,
3648 initial result of the data reduction, initial value of the index
3649 reduction. */
3650 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3651 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3652 scalar_to_vec, stmt_info, 0,
3653 vect_prologue);
3654
3655 /* Cost of reduction op inside loop. */
3656 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3657 stmt_info, 0, vect_body);
3658 }
3659
3660 /* Determine cost of epilogue code.
3661
3662 We have a reduction operator that will reduce the vector in one statement.
3663 Also requires scalar extract. */
3664
3665 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
3666 {
3667 if (reduc_fn != IFN_LAST)
3668 {
3669 if (reduction_type == COND_REDUCTION)
3670 {
3671 /* An EQ stmt and a COND_EXPR stmt. */
3672 epilogue_cost += record_stmt_cost (cost_vec, 2,
3673 vector_stmt, stmt_info, 0,
3674 vect_epilogue);
3675 /* Reduction of the max index and a reduction of the found
3676 values. */
3677 epilogue_cost += record_stmt_cost (cost_vec, 2,
3678 vec_to_scalar, stmt_info, 0,
3679 vect_epilogue);
3680 /* A broadcast of the max value. */
3681 epilogue_cost += record_stmt_cost (cost_vec, 1,
3682 scalar_to_vec, stmt_info, 0,
3683 vect_epilogue);
3684 }
3685 else
3686 {
3687 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3688 stmt_info, 0, vect_epilogue);
3689 epilogue_cost += record_stmt_cost (cost_vec, 1,
3690 vec_to_scalar, stmt_info, 0,
3691 vect_epilogue);
3692 }
3693 }
3694 else if (reduction_type == COND_REDUCTION)
3695 {
3696 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3697 /* Extraction of scalar elements. */
3698 epilogue_cost += record_stmt_cost (cost_vec,
3699 2 * estimated_nunits,
3700 vec_to_scalar, stmt_info, 0,
3701 vect_epilogue);
3702 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
3703 epilogue_cost += record_stmt_cost (cost_vec,
3704 2 * estimated_nunits - 3,
3705 scalar_stmt, stmt_info, 0,
3706 vect_epilogue);
3707 }
3708 else if (reduction_type == EXTRACT_LAST_REDUCTION
3709 || reduction_type == FOLD_LEFT_REDUCTION)
3710 /* No extra instructions needed in the epilogue. */
3711 ;
3712 else
3713 {
3714 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3715 tree bitsize =
3716 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
3717 int element_bitsize = tree_to_uhwi (bitsize);
3718 int nelements = vec_size_in_bits / element_bitsize;
3719
3720 if (code == COND_EXPR)
3721 code = MAX_EXPR;
3722
3723 optab = optab_for_tree_code (code, vectype, optab_default);
3724
3725 /* We have a whole vector shift available. */
3726 if (optab != unknown_optab
3727 && VECTOR_MODE_P (mode)
3728 && optab_handler (optab, mode) != CODE_FOR_nothing
3729 && have_whole_vector_shift (mode))
3730 {
3731 /* Final reduction via vector shifts and the reduction operator.
3732 Also requires scalar extract. */
3733 epilogue_cost += record_stmt_cost (cost_vec,
3734 exact_log2 (nelements) * 2,
3735 vector_stmt, stmt_info, 0,
3736 vect_epilogue);
3737 epilogue_cost += record_stmt_cost (cost_vec, 1,
3738 vec_to_scalar, stmt_info, 0,
3739 vect_epilogue);
3740 }
3741 else
3742 /* Use extracts and reduction op for final reduction. For N
3743 elements, we have N extracts and N-1 reduction ops. */
3744 epilogue_cost += record_stmt_cost (cost_vec,
3745 nelements + nelements - 1,
3746 vector_stmt, stmt_info, 0,
3747 vect_epilogue);
3748 }
3749 }
3750
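/* Illustrative arithmetic for the epilogue costs above, assuming a
4-element vector: the whole-vector-shift path records
exact_log2 (4) * 2 = 4 vector statements plus one vec_to_scalar for the
final extract, whereas the fallback path records 4 + 4 - 1 = 7 vector
statements.  */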
3751 if (dump_enabled_p ())
3752 dump_printf (MSG_NOTE,
3753 "vect_model_reduction_cost: inside_cost = %d, "
3754 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3755 prologue_cost, epilogue_cost);
3756 }
3757
3758
3759 /* Function vect_model_induction_cost.
3760
3761 Models cost for induction operations. */
3762
3763 static void
3764 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
3765 stmt_vector_for_cost *cost_vec)
3766 {
3767 unsigned inside_cost, prologue_cost;
3768
3769 if (PURE_SLP_STMT (stmt_info))
3770 return;
3771
3772 /* loop cost for vec_loop. */
3773 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3774 stmt_info, 0, vect_body);
3775
3776 /* prologue cost for vec_init and vec_step. */
3777 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
3778 stmt_info, 0, vect_prologue);
3779
3780 if (dump_enabled_p ())
3781 dump_printf_loc (MSG_NOTE, vect_location,
3782 "vect_model_induction_cost: inside_cost = %d, "
3783 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3784 }
3785
3786
3787
3788 /* Function get_initial_def_for_reduction
3789
3790 Input:
3791 STMT_VINFO - a stmt that performs a reduction operation in the loop.
3792 INIT_VAL - the initial value of the reduction variable
3793
3794 Output:
3795 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3796 of the reduction (used for adjusting the epilog - see below).
3797 Return a vector variable, initialized according to the operation that
3798 STMT_VINFO performs. This vector will be used as the initial value
3799 of the vector of partial results.
3800
3801 Option1 (adjust in epilog): Initialize the vector as follows:
3802 add/bit or/xor: [0,0,...,0,0]
3803 mult/bit and: [1,1,...,1,1]
3804 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3805 and when necessary (e.g. add/mult case) let the caller know
3806 that it needs to adjust the result by init_val.
3807
3808 Option2: Initialize the vector as follows:
3809 add/bit or/xor: [init_val,0,0,...,0]
3810 mult/bit and: [init_val,1,1,...,1]
3811 min/max/cond_expr: [init_val,init_val,...,init_val]
3812 and no adjustments are needed.
3813
3814 For example, for the following code:
3815
3816 s = init_val;
3817 for (i=0;i<n;i++)
3818 s = s + a[i];
3819
3820 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
3821 For a vector of 4 units, we want to return either [0,0,0,init_val],
3822 or [0,0,0,0] and let the caller know that it needs to adjust
3823 the result at the end by 'init_val'.
3824
3825 FORNOW, we use Option1 (the 'adjust in epilog' scheme) when ADJUSTMENT_DEF
3826 is not NULL, because this way the initialization vector is simpler
3827 (same element in all entries), and Option2 otherwise.
3828
3829 A cost model should help decide between these two schemes. */
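/* Concretely, for the summation example above with init_val = 5 and a
4-element vector (illustrative values), Option1 returns [0,0,0,0] and
sets ADJUSTMENT_DEF to 5, while Option2 returns a vector with 5 in one
lane and 0 in the others, with no adjustment.  */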
3830
3831 static tree
3832 get_initial_def_for_reduction (stmt_vec_info stmt_vinfo,
3833 enum tree_code code, tree init_val,
3834 tree *adjustment_def)
3835 {
3836 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3837 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3838 tree scalar_type = TREE_TYPE (init_val);
3839 tree vectype = get_vectype_for_scalar_type (scalar_type);
3840 tree def_for_init;
3841 tree init_def;
3842 REAL_VALUE_TYPE real_init_val = dconst0;
3843 int int_init_val = 0;
3844 gimple_seq stmts = NULL;
3845
3846 gcc_assert (vectype);
3847
3848 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3849 || SCALAR_FLOAT_TYPE_P (scalar_type));
3850
3851 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
3852 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
3853
3854 /* ADJUSTMENT_DEF is NULL when called from
3855 vect_create_epilog_for_reduction to vectorize double reduction. */
3856 if (adjustment_def)
3857 *adjustment_def = NULL;
3858
3859 switch (code)
3860 {
3861 case WIDEN_SUM_EXPR:
3862 case DOT_PROD_EXPR:
3863 case SAD_EXPR:
3864 case PLUS_EXPR:
3865 case MINUS_EXPR:
3866 case BIT_IOR_EXPR:
3867 case BIT_XOR_EXPR:
3868 case MULT_EXPR:
3869 case BIT_AND_EXPR:
3870 {
3871 if (code == MULT_EXPR)
3872 {
3873 real_init_val = dconst1;
3874 int_init_val = 1;
3875 }
3876
3877 if (code == BIT_AND_EXPR)
3878 int_init_val = -1;
3879
3880 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3881 def_for_init = build_real (scalar_type, real_init_val);
3882 else
3883 def_for_init = build_int_cst (scalar_type, int_init_val);
3884
3885 if (adjustment_def || operand_equal_p (def_for_init, init_val, 0))
3886 {
3887 /* Option1: the first element is '0' or '1' as well. */
3888 if (!operand_equal_p (def_for_init, init_val, 0))
3889 *adjustment_def = init_val;
3890 init_def = gimple_build_vector_from_val (&stmts, vectype,
3891 def_for_init);
3892 }
3893 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
3894 {
3895 /* Option2 (variable length): the first element is INIT_VAL. */
3896 init_def = gimple_build_vector_from_val (&stmts, vectype,
3897 def_for_init);
3898 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
3899 vectype, init_def, init_val);
3900 }
3901 else
3902 {
3903 /* Option2: the first element is INIT_VAL. */
3904 tree_vector_builder elts (vectype, 1, 2);
3905 elts.quick_push (init_val);
3906 elts.quick_push (def_for_init);
3907 init_def = gimple_build_vector (&stmts, &elts);
3908 }
3909 }
3910 break;
3911
3912 case MIN_EXPR:
3913 case MAX_EXPR:
3914 case COND_EXPR:
3915 {
3916 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
3917 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
3918 }
3919 break;
3920
3921 default:
3922 gcc_unreachable ();
3923 }
3924
3925 if (stmts)
3926 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
3927 return init_def;
3928 }
3929
3930 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
3931 NUMBER_OF_VECTORS is the number of vector defs to create.
3932 If NEUTRAL_OP is nonnull, introducing extra elements of that
3933 value will not change the result. */
3934
3935 static void
3936 get_initial_defs_for_reduction (slp_tree slp_node,
3937 vec<tree> *vec_oprnds,
3938 unsigned int number_of_vectors,
3939 bool reduc_chain, tree neutral_op)
3940 {
3941 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
3942 stmt_vec_info stmt_vinfo = stmts[0];
3943 unsigned HOST_WIDE_INT nunits;
3944 unsigned j, number_of_places_left_in_vector;
3945 tree vector_type;
3946 unsigned int group_size = stmts.length ();
3947 unsigned int i;
3948 class loop *loop;
3949
3950 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
3951
3952 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
3953
3954 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
3955 gcc_assert (loop);
3956 edge pe = loop_preheader_edge (loop);
3957
3958 gcc_assert (!reduc_chain || neutral_op);
3959
3960 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
3961 created vectors. It is greater than 1 if unrolling is performed.
3962
3963 For example, we have two scalar operands, s1 and s2 (e.g., group of
3964 strided accesses of size two), while NUNITS is four (i.e., four scalars
3965 of this type can be packed in a vector). The output vector will contain
3966 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
3967 will be 2).
3968
3969 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
3970 vectors containing the operands.
3971
3972 For example, NUNITS is four as before, and the group size is 8
3973 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
3974 {s5, s6, s7, s8}. */
3975
3976 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
3977 nunits = group_size;
3978
3979 number_of_places_left_in_vector = nunits;
3980 bool constant_p = true;
3981 tree_vector_builder elts (vector_type, nunits, 1);
3982 elts.quick_grow (nunits);
3983 gimple_seq ctor_seq = NULL;
3984 for (j = 0; j < nunits * number_of_vectors; ++j)
3985 {
3986 tree op;
3987 i = j % group_size;
3988 stmt_vinfo = stmts[i];
3989
3990 /* Get the def before the loop. In a reduction chain we have only one
3991 initial value; otherwise we have as many initial values as PHIs in the group. */
3992 if (reduc_chain)
3993 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
3994 else if (((vec_oprnds->length () + 1) * nunits
3995 - number_of_places_left_in_vector >= group_size)
3996 && neutral_op)
3997 op = neutral_op;
3998 else
3999 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4000
4001 /* Create 'vect_ = {op0,op1,...,opn}'. */
4002 number_of_places_left_in_vector--;
4003 elts[nunits - number_of_places_left_in_vector - 1] = op;
4004 if (!CONSTANT_CLASS_P (op))
4005 constant_p = false;
4006
4007 if (number_of_places_left_in_vector == 0)
4008 {
4009 tree init;
4010 if (constant_p && !neutral_op
4011 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4012 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4013 /* Build the vector directly from ELTS. */
4014 init = gimple_build_vector (&ctor_seq, &elts);
4015 else if (neutral_op)
4016 {
4017 /* Build a vector of the neutral value and shift the
4018 other elements into place. */
4019 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4020 neutral_op);
4021 int k = nunits;
4022 while (k > 0 && elts[k - 1] == neutral_op)
4023 k -= 1;
4024 while (k > 0)
4025 {
4026 k -= 1;
4027 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4028 vector_type, init, elts[k]);
4029 }
4030 }
4031 else
4032 {
4033 /* First time round, duplicate ELTS to fill the
4034 required number of vectors. */
4035 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4036 number_of_vectors, *vec_oprnds);
4037 break;
4038 }
4039 vec_oprnds->quick_push (init);
4040
4041 number_of_places_left_in_vector = nunits;
4042 elts.new_vector (vector_type, nunits, 1);
4043 elts.quick_grow (nunits);
4044 constant_p = true;
4045 }
4046 }
4047 if (ctor_seq != NULL)
4048 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4049 }
4050
4051 /* For a statement STMT_INFO taking part in a reduction operation return
4052 the stmt_vec_info the meta information is stored on. */
4053
4054 stmt_vec_info
4055 info_for_reduction (stmt_vec_info stmt_info)
4056 {
4057 stmt_info = vect_orig_stmt (stmt_info);
4058 gcc_assert (STMT_VINFO_REDUC_DEF (stmt_info));
4059 if (!is_a <gphi *> (stmt_info->stmt))
4060 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4061 gphi *phi = as_a <gphi *> (stmt_info->stmt);
4062 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4063 {
4064 if (gimple_phi_num_args (phi) == 1)
4065 stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
4066 }
4067 else if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
4068 {
4069 edge pe = loop_preheader_edge (gimple_bb (phi)->loop_father);
4070 stmt_vec_info info
4071 = stmt_info->vinfo->lookup_def (PHI_ARG_DEF_FROM_EDGE (phi, pe));
4072 if (info && STMT_VINFO_DEF_TYPE (info) == vect_double_reduction_def)
4073 stmt_info = info;
4074 }
4075 return stmt_info;
4076 }
4077
4078 /* Function vect_create_epilog_for_reduction
4079
4080 Create code at the loop-epilog to finalize the result of a reduction
4081 computation.
4082
4083 STMT_INFO is the scalar reduction stmt that is being vectorized.
4084 SLP_NODE is an SLP node containing a group of reduction statements. The
4085 first one in this group is STMT_INFO.
4086 SLP_NODE_INSTANCE is the SLP node instance containing SLP_NODE
4087 REDUC_INDEX says which rhs operand of the STMT_INFO is the reduction phi
4088 (counting from 0)
4089
4090 This function:
4091 1. Completes the reduction def-use cycles.
4092 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4093 by calling the function specified by REDUC_FN if available, or by
4094 other means (whole-vector shifts or a scalar loop).
4095 The function also creates a new phi node at the loop exit to preserve
4096 loop-closed form, as illustrated below.
4097
4098 The flow at the entry to this function:
4099
4100 loop:
4101 vec_def = phi <vec_init, null> # REDUCTION_PHI
4102 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4103 s_loop = scalar_stmt # (scalar) STMT_INFO
4104 loop_exit:
4105 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4106 use <s_out0>
4107 use <s_out0>
4108
4109 The above is transformed by this function into:
4110
4111 loop:
4112 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4113 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4114 s_loop = scalar_stmt # (scalar) STMT_INFO
4115 loop_exit:
4116 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4117 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4118 v_out2 = reduce <v_out1>
4119 s_out3 = extract_field <v_out2, 0>
4120 s_out4 = adjust_result <s_out3>
4121 use <s_out4>
4122 use <s_out4>
4123 */
4124
4125 static void
4126 vect_create_epilog_for_reduction (stmt_vec_info stmt_info,
4127 slp_tree slp_node,
4128 slp_instance slp_node_instance)
4129 {
4130 stmt_vec_info reduc_info = info_for_reduction (stmt_info);
4131 gcc_assert (reduc_info->is_reduc_info);
4132 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4133 /* For double reductions we need to get at the inner loop reduction
4134 stmt which has the meta info attached. Our stmt_info is that of the
4135 loop-closed PHI of the inner loop which we remember as
4136 def for the reduction PHI generation. */
4137 bool double_reduc = false;
4138 stmt_vec_info rdef_info = stmt_info;
4139 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
4140 {
4141 gcc_assert (!slp_node);
4142 double_reduc = true;
4143 stmt_info = loop_vinfo->lookup_def (gimple_phi_arg_def
4144 (stmt_info->stmt, 0));
4145 stmt_info = vect_stmt_to_vectorize (stmt_info);
4146 }
4147 gphi *reduc_def_stmt
4148 = as_a <gphi *> (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info))->stmt);
4149 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
4150 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
4151 tree neutral_op = NULL_TREE;
4152 if (slp_node)
4153 neutral_op
4154 = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis, code,
4155 REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4156 stmt_vec_info prev_phi_info;
4157 tree vectype;
4158 machine_mode mode;
4159 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4160 basic_block exit_bb;
4161 tree scalar_dest;
4162 tree scalar_type;
4163 gimple *new_phi = NULL, *phi;
4164 stmt_vec_info phi_info;
4165 gimple_stmt_iterator exit_gsi;
4166 tree vec_dest;
4167 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4168 gimple *epilog_stmt = NULL;
4169 gimple *exit_phi;
4170 tree bitsize;
4171 tree expr, def;
4172 tree orig_name, scalar_result;
4173 imm_use_iterator imm_iter, phi_imm_iter;
4174 use_operand_p use_p, phi_use_p;
4175 gimple *use_stmt;
4176 bool nested_in_vect_loop = false;
4177 auto_vec<gimple *> new_phis;
4178 int j, i;
4179 auto_vec<tree> scalar_results;
4180 unsigned int group_size = 1, k;
4181 auto_vec<gimple *> phis;
4182 bool slp_reduc = false;
4183 bool direct_slp_reduc;
4184 tree new_phi_result;
4185 tree induction_index = NULL_TREE;
4186
4187 if (slp_node)
4188 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4189
4190 if (nested_in_vect_loop_p (loop, stmt_info))
4191 {
4192 outer_loop = loop;
4193 loop = loop->inner;
4194 nested_in_vect_loop = true;
4195 gcc_assert (!slp_node);
4196 }
4197 gcc_assert (!nested_in_vect_loop || double_reduc);
4198
4199 vectype = STMT_VINFO_VECTYPE (stmt_info);
4200 gcc_assert (vectype);
4201 mode = TYPE_MODE (vectype);
4202
4203 tree initial_def = NULL;
4204 tree induc_val = NULL_TREE;
4205 tree adjustment_def = NULL;
4206 if (slp_node)
4207 ;
4208 else
4209 {
4210 /* Get at the scalar def before the loop that defines the initial value
4211 of the reduction variable. */
4212 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4213 loop_preheader_edge (loop));
4214 /* Optimize: for induction condition reduction, if we can't use zero
4215 for induc_val, use initial_def. */
4216 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
4217 induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
4218 else if (double_reduc)
4219 ;
4220 else if (nested_in_vect_loop)
4221 ;
4222 else
4223 adjustment_def = STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info);
4224 }
4225
4226 unsigned vec_num;
4227 int ncopies;
4228 if (slp_node)
4229 {
4230 vec_num = SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis).length ();
4231 ncopies = 1;
4232 }
4233 else
4234 {
4235 vec_num = 1;
4236 ncopies = 0;
4237 phi_info = STMT_VINFO_VEC_STMT (loop_vinfo->lookup_stmt (reduc_def_stmt));
4238 do
4239 {
4240 ncopies++;
4241 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4242 }
4243 while (phi_info);
4244 }
4245
4246 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4247 which is updated with the current index of the loop for every match of
4248 the original loop's cond_expr (VEC_STMT). This results in a vector
4249 containing the last time the condition passed for that vector lane.
4250 The first match will be a 1 to allow 0 to be used for non-matching
4251 indexes. If there are no matches at all then the vector will be all
4252 zeroes. */
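/* Hypothetical illustration with a 4-lane vector over two vector
iterations: the IV produces {1,2,3,4} and then {5,6,7,8}.  If lanes 1
and 3 match in the first iteration and lane 2 matches in the second,
the index vector ends up as {0,2,7,4}; its maximum, 7, marks the lane
holding the last matching data value.  */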
4253 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION)
4254 {
4255 tree indx_before_incr, indx_after_incr;
4256 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4257
4258 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info)->stmt;
4259 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4260
4261 int scalar_precision
4262 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4263 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4264 tree cr_index_vector_type = build_vector_type
4265 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4266
4267 /* First we create a simple vector induction variable which starts
4268 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4269 vector size (STEP). */
4270
4271 /* Create a {1,2,3,...} vector. */
4272 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4273
4274 /* Create a vector of the step value. */
4275 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4276 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4277
4278 /* Create an induction variable. */
4279 gimple_stmt_iterator incr_gsi;
4280 bool insert_after;
4281 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4282 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4283 insert_after, &indx_before_incr, &indx_after_incr);
4284
4285 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4286 filled with zeros (VEC_ZERO). */
4287
4288 /* Create a vector of 0s. */
4289 tree zero = build_zero_cst (cr_index_scalar_type);
4290 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4291
4292 /* Create a vector phi node. */
4293 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4294 new_phi = create_phi_node (new_phi_tree, loop->header);
4295 loop_vinfo->add_stmt (new_phi);
4296 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4297 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4298
4299 /* Now take the condition from the loop's original cond_expr
4300 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4301 every match uses values from the induction variable
4302 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4303 (NEW_PHI_TREE).
4304 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4305 the new cond_expr (INDEX_COND_EXPR). */
4306
4307 /* Duplicate the condition from vec_stmt. */
4308 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4309
4310 /* Create a conditional, where the condition is taken from vec_stmt
4311 (CCOMPARE). The then and else values mirror the main VEC_COND_EXPR:
4312 the reduction phi corresponds to NEW_PHI_TREE and the new values
4313 correspond to INDEX_BEFORE_INCR. */
4314 gcc_assert (STMT_VINFO_REDUC_IDX (stmt_info) >= 1);
4315 tree index_cond_expr;
4316 if (STMT_VINFO_REDUC_IDX (stmt_info) == 2)
4317 index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4318 ccompare, indx_before_incr, new_phi_tree);
4319 else
4320 index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4321 ccompare, new_phi_tree, indx_before_incr);
4322 induction_index = make_ssa_name (cr_index_vector_type);
4323 gimple *index_condition = gimple_build_assign (induction_index,
4324 index_cond_expr);
4325 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4326 stmt_vec_info index_vec_info = loop_vinfo->add_stmt (index_condition);
4327 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4328
4329 /* Update the phi with the vec cond. */
4330 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4331 loop_latch_edge (loop), UNKNOWN_LOCATION);
4332 }
4333
4334 /* 2. Create epilog code.
4335 The reduction epilog code operates across the elements of the vector
4336 of partial results computed by the vectorized loop.
4337 The reduction epilog code consists of:
4338
4339 step 1: compute the scalar result in a vector (v_out2)
4340 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4341 step 3: adjust the scalar result (s_out3) if needed.
4342
4343 Step 1 can be accomplished using one the following three schemes:
4344 (scheme 1) using reduc_fn, if available.
4345 (scheme 2) using whole-vector shifts, if available.
4346 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4347 combined.
4348
4349 The overall epilog code looks like this:
4350
4351 s_out0 = phi <s_loop> # original EXIT_PHI
4352 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4353 v_out2 = reduce <v_out1> # step 1
4354 s_out3 = extract_field <v_out2, 0> # step 2
4355 s_out4 = adjust_result <s_out3> # step 3
4356
4357 (step 3 is optional, and steps 1 and 2 may be combined).
4358 Lastly, the uses of s_out0 are replaced by s_out4. */
4359
4360
4361 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4362 v_out1 = phi <VECT_DEF>
4363 Store them in NEW_PHIS. */
4364 if (double_reduc)
4365 loop = outer_loop;
4366 exit_bb = single_exit (loop)->dest;
4367 prev_phi_info = NULL;
4368 new_phis.create (slp_node ? vec_num : ncopies);
4369 for (unsigned i = 0; i < vec_num; i++)
4370 {
4371 if (slp_node)
4372 def = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[i]->stmt);
4373 else
4374 def = gimple_get_lhs (STMT_VINFO_VEC_STMT (rdef_info)->stmt);
4375 for (j = 0; j < ncopies; j++)
4376 {
4377 tree new_def = copy_ssa_name (def);
4378 phi = create_phi_node (new_def, exit_bb);
4379 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4380 if (j == 0)
4381 new_phis.quick_push (phi);
4382 else
4383 {
4384 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4385 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4386 }
4387
4388 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4389 prev_phi_info = phi_info;
4390 }
4391 }
4392
4393 exit_gsi = gsi_after_labels (exit_bb);
4394
4395 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4396 (i.e. when reduc_fn is not available) and in the final adjustment
4397 code (if needed). Also get the original scalar reduction variable as
4398 defined in the loop. In case STMT is a "pattern-stmt" (i.e. it
4399 represents a reduction pattern), the tree-code and scalar-def are
4400 taken from the original stmt that the pattern-stmt (STMT) replaces.
4401 Otherwise (it is a regular reduction), the tree-code and scalar-def
4402 are taken from STMT. */
4403
4404 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4405 if (orig_stmt_info != stmt_info)
4406 {
4407 /* Reduction pattern */
4408 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4409 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4410 }
4411
4412 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4413 scalar_type = TREE_TYPE (scalar_dest);
4414 scalar_results.create (group_size);
4415 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4416 bitsize = TYPE_SIZE (scalar_type);
4417
4418 /* SLP reduction without reduction chain, e.g.,
4419 # a1 = phi <a2, a0>
4420 # b1 = phi <b2, b0>
4421 a2 = operation (a1)
4422 b2 = operation (b1) */
4423 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4424
4425 /* True if we should implement SLP_REDUC using native reduction operations
4426 instead of scalar operations. */
4427 direct_slp_reduc = (reduc_fn != IFN_LAST
4428 && slp_reduc
4429 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4430
4431 /* In case of reduction chain, e.g.,
4432 # a1 = phi <a3, a0>
4433 a2 = operation (a1)
4434 a3 = operation (a2),
4435
4436 we may end up with more than one vector result. Here we reduce them to
4437 one vector. */
4438 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
4439 {
4440 tree first_vect = PHI_RESULT (new_phis[0]);
4441 gassign *new_vec_stmt = NULL;
4442 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4443 for (k = 1; k < new_phis.length (); k++)
4444 {
4445 gimple *next_phi = new_phis[k];
4446 tree second_vect = PHI_RESULT (next_phi);
4447 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4448 new_vec_stmt = gimple_build_assign (tem, code,
4449 first_vect, second_vect);
4450 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4451 first_vect = tem;
4452 }
4453
4454 new_phi_result = first_vect;
4455 if (new_vec_stmt)
4456 {
4457 new_phis.truncate (0);
4458 new_phis.safe_push (new_vec_stmt);
4459 }
4460 }
4461 /* Likewise if we couldn't use a single def-use cycle. */
4462 else if (ncopies > 1)
4463 {
4464 gcc_assert (new_phis.length () == 1);
4465 tree first_vect = PHI_RESULT (new_phis[0]);
4466 gassign *new_vec_stmt = NULL;
4467 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4468 stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
4469 for (int k = 1; k < ncopies; ++k)
4470 {
4471 next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
4472 tree second_vect = PHI_RESULT (next_phi_info->stmt);
4473 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4474 new_vec_stmt = gimple_build_assign (tem, code,
4475 first_vect, second_vect);
4476 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4477 first_vect = tem;
4478 }
4479 new_phi_result = first_vect;
4480 new_phis.truncate (0);
4481 new_phis.safe_push (new_vec_stmt);
4482 }
4483 else
4484 new_phi_result = PHI_RESULT (new_phis[0]);
4485
4486 if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
4487 && reduc_fn != IFN_LAST)
4488 {
4489 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4490 various data values where the condition matched and another vector
4491 (INDUCTION_INDEX) containing all the indexes of those matches. We
4492 need to extract the last matching index (which will be the index with
4493 highest value) and use this to index into the data vector.
4494 For the case where there were no matches, the data vector will contain
4495 all default values and the index vector will be all zeros. */
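/* Continuing the hypothetical 4-lane example: with INDUCTION_INDEX =
{0,2,7,4} and data vector {d0,d1,d2,d3}, the max index is 7, the
comparison gives {0,0,1,0}, the VEC_COND selects {0,0,d2,0}, and the
unsigned MAX reduction below extracts d2 as the scalar result.  */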
4496
4497 /* Get various versions of the type of the vector of indexes. */
4498 tree index_vec_type = TREE_TYPE (induction_index);
4499 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4500 tree index_scalar_type = TREE_TYPE (index_vec_type);
4501 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4502 (index_vec_type);
4503
4504 /* Get an unsigned integer version of the type of the data vector. */
4505 int scalar_precision
4506 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4507 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4508 tree vectype_unsigned = build_vector_type
4509 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4510
4511 /* First we need to create a vector (ZERO_VEC) of zeros and another
4512 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4513 can create using a MAX reduction and then expanding.
4514 In the case where the loop never made any matches, the max index will
4515 be zero. */
4516
4517 /* Vector of {0, 0, 0,...}. */
4518 tree zero_vec = make_ssa_name (vectype);
4519 tree zero_vec_rhs = build_zero_cst (vectype);
4520 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4521 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4522
4523 /* Find maximum value from the vector of found indexes. */
4524 tree max_index = make_ssa_name (index_scalar_type);
4525 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4526 1, induction_index);
4527 gimple_call_set_lhs (max_index_stmt, max_index);
4528 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4529
4530 /* Vector of {max_index, max_index, max_index,...}. */
4531 tree max_index_vec = make_ssa_name (index_vec_type);
4532 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4533 max_index);
4534 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4535 max_index_vec_rhs);
4536 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4537
4538 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4539 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4540 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4541 otherwise. Only one value should match, resulting in a vector
4542 (VEC_COND) with one data value and the rest zeros.
4543 In the case where the loop never made any matches, every index will
4544 match, resulting in a vector with all data values (which will all be
4545 the default value). */
4546
4547 /* Compare the max index vector to the vector of found indexes to find
4548 the position of the max value. */
4549 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4550 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4551 induction_index,
4552 max_index_vec);
4553 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4554
4555 /* Use the compare to choose either values from the data vector or
4556 zero. */
4557 tree vec_cond = make_ssa_name (vectype);
4558 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4559 vec_compare, new_phi_result,
4560 zero_vec);
4561 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4562
4563 /* Finally we need to extract the data value from the vector (VEC_COND)
4564 into a scalar (MATCHED_DATA_REDUC). Logically we want to do a OR
4565 reduction, but because this doesn't exist, we can use a MAX reduction
4566 instead. The data value might be signed or a float so we need to cast
4567 it first.
4568 In the case where the loop never made any matches, the data values are
4569 all identical, and so will reduce down correctly. */
4570
4571 /* Make the matched data values unsigned. */
4572 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4573 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4574 vec_cond);
4575 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4576 VIEW_CONVERT_EXPR,
4577 vec_cond_cast_rhs);
4578 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4579
4580 /* Reduce down to a scalar value. */
4581 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4582 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4583 1, vec_cond_cast);
4584 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4585 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4586
4587 /* Convert the reduced value back to the result type and set as the
4588 result. */
4589 gimple_seq stmts = NULL;
4590 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4591 data_reduc);
4592 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4593 scalar_results.safe_push (new_temp);
4594 }
4595 else if (STMT_VINFO_REDUC_TYPE (reduc_info) == COND_REDUCTION
4596 && reduc_fn == IFN_LAST)
4597 {
4598 /* Condition reduction without supported IFN_REDUC_MAX. Generate
4599 idx = 0;
4600 idx_val = induction_index[0];
4601 val = data_reduc[0];
4602 for (idx = 0, val = init, i = 0; i < nelts; ++i)
4603 if (induction_index[i] > idx_val)
4604 val = data_reduc[i], idx_val = induction_index[i];
4605 return val; */
4606
4607 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4608 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4609 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4610 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4611 /* Enforced by vectorizable_reduction, which ensures we have target
4612 support before allowing a conditional reduction on variable-length
4613 vectors. */
4614 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
4615 tree idx_val = NULL_TREE, val = NULL_TREE;
4616 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4617 {
4618 tree old_idx_val = idx_val;
4619 tree old_val = val;
4620 idx_val = make_ssa_name (idx_eltype);
4621 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4622 build3 (BIT_FIELD_REF, idx_eltype,
4623 induction_index,
4624 bitsize_int (el_size),
4625 bitsize_int (off)));
4626 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4627 val = make_ssa_name (data_eltype);
4628 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4629 build3 (BIT_FIELD_REF,
4630 data_eltype,
4631 new_phi_result,
4632 bitsize_int (el_size),
4633 bitsize_int (off)));
4634 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4635 if (off != 0)
4636 {
4637 tree new_idx_val = idx_val;
4638 if (off != v_size - el_size)
4639 {
4640 new_idx_val = make_ssa_name (idx_eltype);
4641 epilog_stmt = gimple_build_assign (new_idx_val,
4642 MAX_EXPR, idx_val,
4643 old_idx_val);
4644 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4645 }
4646 tree new_val = make_ssa_name (data_eltype);
4647 epilog_stmt = gimple_build_assign (new_val,
4648 COND_EXPR,
4649 build2 (GT_EXPR,
4650 boolean_type_node,
4651 idx_val,
4652 old_idx_val),
4653 val, old_val);
4654 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4655 idx_val = new_idx_val;
4656 val = new_val;
4657 }
4658 }
4659 /* Convert the reduced value back to the result type and set as the
4660 result. */
4661 gimple_seq stmts = NULL;
4662 val = gimple_convert (&stmts, scalar_type, val);
4663 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4664 scalar_results.safe_push (val);
4665 }
4666
4667 /* 2.3 Create the reduction code, using one of the three schemes described
4668 above. In SLP we simply need to extract all the elements from the
4669 vector (without reducing them), so we use scalar shifts. */
4670 else if (reduc_fn != IFN_LAST && !slp_reduc)
4671 {
4672 tree tmp;
4673 tree vec_elem_type;
4674
4675 /* Case 1: Create:
4676 v_out2 = reduc_expr <v_out1> */
4677
4678 if (dump_enabled_p ())
4679 dump_printf_loc (MSG_NOTE, vect_location,
4680 "Reduce using direct vector reduction.\n");
4681
4682 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4683 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4684 {
4685 tree tmp_dest
4686 = vect_create_destination_var (scalar_dest, vec_elem_type);
4687 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
4688 new_phi_result);
4689 gimple_set_lhs (epilog_stmt, tmp_dest);
4690 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4691 gimple_set_lhs (epilog_stmt, new_temp);
4692 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4693
4694 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
4695 new_temp);
4696 }
4697 else
4698 {
4699 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
4700 new_phi_result);
4701 gimple_set_lhs (epilog_stmt, new_scalar_dest);
4702 }
4703
4704 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4705 gimple_set_lhs (epilog_stmt, new_temp);
4706 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4707
4708 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
4709 && induc_val)
4710 {
4711 /* Earlier we set the initial value to be a vector of induc_val
4712 values. Check the result and if it is induc_val then replace
4713 it with the original initial value, unless induc_val is
4714 the same as initial_def already. */
4715 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
4716 induc_val);
4717
4718 tmp = make_ssa_name (new_scalar_dest);
4719 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
4720 initial_def, new_temp);
4721 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4722 new_temp = tmp;
4723 }
4724
4725 scalar_results.safe_push (new_temp);
4726 }
4727 else if (direct_slp_reduc)
4728 {
4729 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
4730 with the elements for other SLP statements replaced with the
4731 neutral value. We can then do a normal reduction on each vector. */
4732
4733 /* Enforced by vectorizable_reduction. */
4734 gcc_assert (new_phis.length () == 1);
4735 gcc_assert (pow2p_hwi (group_size));
4736
4737 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
4738 vec<stmt_vec_info> orig_phis
4739 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
4740 gimple_seq seq = NULL;
4741
4742 /* Build a vector {0, 1, 2, ...}, with the same number of elements
4743 and the same element size as VECTYPE. */
4744 tree index = build_index_vector (vectype, 0, 1);
4745 tree index_type = TREE_TYPE (index);
4746 tree index_elt_type = TREE_TYPE (index_type);
4747 tree mask_type = build_same_sized_truth_vector_type (index_type);
4748
4749 /* Create a vector that, for each element, identifies which of
4750 the REDUC_GROUP_SIZE results should use it. */
4751 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
4752 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
4753 build_vector_from_val (index_type, index_mask));
4754
4755 /* Get a neutral vector value. This is simply a splat of the neutral
4756 scalar value if we have one, otherwise the initial scalar value
4757 is itself a neutral value. */
4758 tree vector_identity = NULL_TREE;
4759 if (neutral_op)
4760 vector_identity = gimple_build_vector_from_val (&seq, vectype,
4761 neutral_op);
4762 for (unsigned int i = 0; i < group_size; ++i)
4763 {
4764 /* If there's no universal neutral value, we can use the
4765 initial scalar value from the original PHI. This is used
4766 for MIN and MAX reduction, for example. */
4767 if (!neutral_op)
4768 {
4769 tree scalar_value
4770 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
4771 loop_preheader_edge (loop));
4772 vector_identity = gimple_build_vector_from_val (&seq, vectype,
4773 scalar_value);
4774 }
4775
4776 /* Calculate the equivalent of:
4777
4778 sel[j] = (index[j] == i);
4779
4780 which selects the elements of NEW_PHI_RESULT that should
4781 be included in the result. */
4782 tree compare_val = build_int_cst (index_elt_type, i);
4783 compare_val = build_vector_from_val (index_type, compare_val);
4784 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
4785 index, compare_val);
4786
4787 /* Calculate the equivalent of:
4788
4789 vec = seq ? new_phi_result : vector_identity;
4790
4791 VEC is now suitable for a full vector reduction. */
4792 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
4793 sel, new_phi_result, vector_identity);
4794
4795 /* Do the reduction and convert it to the appropriate type. */
4796 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
4797 TREE_TYPE (vectype), vec);
4798 scalar = gimple_convert (&seq, scalar_type, scalar);
4799 scalar_results.safe_push (scalar);
4800 }
4801 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
4802 }
4803 else
4804 {
4805 bool reduce_with_shift;
4806 tree vec_temp;
4807
4808 /* See if the target wants to do the final (shift) reduction
4809 in a vector mode of smaller size and first reduce upper/lower
4810 halves against each other. */
4811 enum machine_mode mode1 = mode;
4812 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
4813 unsigned sz1 = sz;
4814 if (!slp_reduc
4815 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
4816 sz1 = GET_MODE_SIZE (mode1).to_constant ();
4817
4818 tree vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
4819 reduce_with_shift = have_whole_vector_shift (mode1);
4820 if (!VECTOR_MODE_P (mode1))
4821 reduce_with_shift = false;
4822 else
4823 {
4824 optab optab = optab_for_tree_code (code, vectype1, optab_default);
4825 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
4826 reduce_with_shift = false;
4827 }
4828
4829 /* First reduce the vector to the desired vector size on which we
4830 should do the shift reduction, by combining upper and lower halves. */
4831 new_temp = new_phi_result;
4832 while (sz > sz1)
4833 {
4834 gcc_assert (!slp_reduc);
4835 sz /= 2;
4836 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
4837
4838 /* The target has to make sure we support lowpart/highpart
4839 extraction, either via direct vector extract or through
4840 integer mode punning. */
4841 tree dst1, dst2;
4842 if (convert_optab_handler (vec_extract_optab,
4843 TYPE_MODE (TREE_TYPE (new_temp)),
4844 TYPE_MODE (vectype1))
4845 != CODE_FOR_nothing)
4846 {
4847 /* Extract sub-vectors directly once vec_extract becomes
4848 a conversion optab. */
4849 dst1 = make_ssa_name (vectype1);
4850 epilog_stmt
4851 = gimple_build_assign (dst1, BIT_FIELD_REF,
4852 build3 (BIT_FIELD_REF, vectype1,
4853 new_temp, TYPE_SIZE (vectype1),
4854 bitsize_int (0)));
4855 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4856 dst2 = make_ssa_name (vectype1);
4857 epilog_stmt
4858 = gimple_build_assign (dst2, BIT_FIELD_REF,
4859 build3 (BIT_FIELD_REF, vectype1,
4860 new_temp, TYPE_SIZE (vectype1),
4861 bitsize_int (sz * BITS_PER_UNIT)));
4862 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4863 }
4864 else
4865 {
4866 /* Extract via punning to appropriately sized integer mode
4867 vector. */
4868 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
4869 1);
4870 tree etype = build_vector_type (eltype, 2);
4871 gcc_assert (convert_optab_handler (vec_extract_optab,
4872 TYPE_MODE (etype),
4873 TYPE_MODE (eltype))
4874 != CODE_FOR_nothing);
4875 tree tem = make_ssa_name (etype);
4876 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
4877 build1 (VIEW_CONVERT_EXPR,
4878 etype, new_temp));
4879 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4880 new_temp = tem;
4881 tem = make_ssa_name (eltype);
4882 epilog_stmt
4883 = gimple_build_assign (tem, BIT_FIELD_REF,
4884 build3 (BIT_FIELD_REF, eltype,
4885 new_temp, TYPE_SIZE (eltype),
4886 bitsize_int (0)));
4887 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4888 dst1 = make_ssa_name (vectype1);
4889 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
4890 build1 (VIEW_CONVERT_EXPR,
4891 vectype1, tem));
4892 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4893 tem = make_ssa_name (eltype);
4894 epilog_stmt
4895 = gimple_build_assign (tem, BIT_FIELD_REF,
4896 build3 (BIT_FIELD_REF, eltype,
4897 new_temp, TYPE_SIZE (eltype),
4898 bitsize_int (sz * BITS_PER_UNIT)));
4899 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4900 dst2 = make_ssa_name (vectype1);
4901 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
4902 build1 (VIEW_CONVERT_EXPR,
4903 vectype1, tem));
4904 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4905 }
4906
4907 new_temp = make_ssa_name (vectype1);
4908 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
4909 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4910 }
4911
4912 if (reduce_with_shift && !slp_reduc)
4913 {
4914 int element_bitsize = tree_to_uhwi (bitsize);
4915 /* Enforced by vectorizable_reduction, which disallows SLP reductions
4916 for variable-length vectors and also requires direct target support
4917 for loop reductions. */
4918 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
4919 int nelements = vec_size_in_bits / element_bitsize;
4920 vec_perm_builder sel;
4921 vec_perm_indices indices;
4922
4923 int elt_offset;
4924
4925 tree zero_vec = build_zero_cst (vectype1);
4926 /* Case 2: Create:
4927 for (offset = nelements/2; offset >= 1; offset/=2)
4928 {
4929 Create: va' = vec_shift <va, offset>
4930 Create: va = vop <va, va'>
4931 } */
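/* As a worked illustration (not from the sources; assuming a four-element
   vector and a PLUS reduction), starting from va = { a0, a1, a2, a3 }:
     offset 2: va' = { a2, a3, 0, 0 }    va = { a0+a2, a1+a3, _, _ }
     offset 1: va' = { a1+a3, _, _, 0 }  va = { a0+a1+a2+a3, _, _, _ }
   after which the scalar result sits in element 0 and the remaining
   lanes are don't-care values.  */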
4932
4933 tree rhs;
4934
4935 if (dump_enabled_p ())
4936 dump_printf_loc (MSG_NOTE, vect_location,
4937 "Reduce using vector shifts\n");
4938
4939 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
4940 for (elt_offset = nelements / 2;
4941 elt_offset >= 1;
4942 elt_offset /= 2)
4943 {
4944 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
4945 indices.new_vector (sel, 2, nelements);
4946 tree mask = vect_gen_perm_mask_any (vectype1, indices);
4947 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
4948 new_temp, zero_vec, mask);
4949 new_name = make_ssa_name (vec_dest, epilog_stmt);
4950 gimple_assign_set_lhs (epilog_stmt, new_name);
4951 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4952
4953 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
4954 new_temp);
4955 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4956 gimple_assign_set_lhs (epilog_stmt, new_temp);
4957 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4958 }
4959
4960 /* 2.4 Extract the final scalar result. Create:
4961 s_out3 = extract_field <v_out2, bitpos> */
4962
4963 if (dump_enabled_p ())
4964 dump_printf_loc (MSG_NOTE, vect_location,
4965 "extract scalar result\n");
4966
4967 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
4968 bitsize, bitsize_zero_node);
4969 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4970 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4971 gimple_assign_set_lhs (epilog_stmt, new_temp);
4972 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4973 scalar_results.safe_push (new_temp);
4974 }
4975 else
4976 {
4977 /* Case 3: Create:
4978 s = extract_field <v_out2, 0>
4979 for (offset = element_size;
4980 offset < vector_size;
4981 offset += element_size;)
4982 {
4983 Create: s' = extract_field <v_out2, offset>
4984 Create: s = op <s, s'> // For non-SLP cases
4985 } */
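/* As a worked illustration (assuming a four-element vector of 32-bit
   elements and a PLUS reduction), the loop below emits roughly:
     s  = BIT_FIELD_REF <v_out2, 32, 0>;
     s' = BIT_FIELD_REF <v_out2, 32, 32>;  s = s + s';
     s' = BIT_FIELD_REF <v_out2, 32, 64>;  s = s + s';
     s' = BIT_FIELD_REF <v_out2, 32, 96>;  s = s + s';
   whereas for SLP only the extractions are kept and collected in
   SCALAR_RESULTS.  */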
4986
4987 if (dump_enabled_p ())
4988 dump_printf_loc (MSG_NOTE, vect_location,
4989 "Reduce using scalar code.\n");
4990
4991 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
4992 int element_bitsize = tree_to_uhwi (bitsize);
4993 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4994 {
4995 int bit_offset;
4996 if (gimple_code (new_phi) == GIMPLE_PHI)
4997 vec_temp = PHI_RESULT (new_phi);
4998 else
4999 vec_temp = gimple_assign_lhs (new_phi);
5000 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5001 bitsize_zero_node);
5002 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5003 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5004 gimple_assign_set_lhs (epilog_stmt, new_temp);
5005 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5006
5007 /* In SLP we don't need to apply the reduction operation, so we just
5008 collect the s' values in SCALAR_RESULTS. */
5009 if (slp_reduc)
5010 scalar_results.safe_push (new_temp);
5011
5012 for (bit_offset = element_bitsize;
5013 bit_offset < vec_size_in_bits;
5014 bit_offset += element_bitsize)
5015 {
5016 tree bitpos = bitsize_int (bit_offset);
5017 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5018 bitsize, bitpos);
5019
5020 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5021 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5022 gimple_assign_set_lhs (epilog_stmt, new_name);
5023 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5024
5025 if (slp_reduc)
5026 {
5027 /* In SLP we don't need to apply the reduction operation, so
5028 we just collect the s' values in SCALAR_RESULTS. */
5029 new_temp = new_name;
5030 scalar_results.safe_push (new_name);
5031 }
5032 else
5033 {
5034 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5035 new_name, new_temp);
5036 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5037 gimple_assign_set_lhs (epilog_stmt, new_temp);
5038 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5039 }
5040 }
5041 }
5042
5043 /* The only case where we need to reduce scalar results in SLP is
5044 unrolling. If the size of SCALAR_RESULTS is greater than
5045 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5046 REDUC_GROUP_SIZE. */
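/* Illustration only: with REDUC_GROUP_SIZE == 2 and four collected
   results r0, r1, r2, r3 the loop below produces
     scalar_results[0] = r0 op r2
     scalar_results[1] = r1 op r3
   leaving one result per SLP group member.  */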
5047 if (slp_reduc)
5048 {
5049 tree res, first_res, new_res;
5050 gimple *new_stmt;
5051
5052 /* Reduce multiple scalar results in case of SLP unrolling. */
5053 for (j = group_size; scalar_results.iterate (j, &res);
5054 j++)
5055 {
5056 first_res = scalar_results[j % group_size];
5057 new_stmt = gimple_build_assign (new_scalar_dest, code,
5058 first_res, res);
5059 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5060 gimple_assign_set_lhs (new_stmt, new_res);
5061 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5062 scalar_results[j % group_size] = new_res;
5063 }
5064 }
5065 else
5066 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5067 scalar_results.safe_push (new_temp);
5068 }
5069
5070 if ((STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
5071 && induc_val)
5072 {
5073 /* Earlier we set the initial value to be a vector of induc_val
5074 values. Check the result and if it is induc_val then replace
5075 it with the original initial value, unless induc_val is
5076 the same as initial_def already. */
5077 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5078 induc_val);
5079
5080 tree tmp = make_ssa_name (new_scalar_dest);
5081 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5082 initial_def, new_temp);
5083 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5084 scalar_results[0] = tmp;
5085 }
5086 }
5087
5088 /* 2.5 Adjust the final result by the initial value of the reduction
5089 variable. (When such adjustment is not needed, then
5090 'adjustment_def' is zero). For example, if code is PLUS we create:
5091 new_temp = loop_exit_def + adjustment_def */
5092
5093 if (adjustment_def)
5094 {
5095 gcc_assert (!slp_reduc);
5096 if (nested_in_vect_loop)
5097 {
5098 new_phi = new_phis[0];
5099 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5100 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5101 new_dest = vect_create_destination_var (scalar_dest, vectype);
5102 }
5103 else
5104 {
5105 new_temp = scalar_results[0];
5106 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5107 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5108 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5109 }
5110
5111 epilog_stmt = gimple_build_assign (new_dest, expr);
5112 new_temp = make_ssa_name (new_dest, epilog_stmt);
5113 gimple_assign_set_lhs (epilog_stmt, new_temp);
5114 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5115 if (nested_in_vect_loop)
5116 {
5117 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5118 STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5119 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5120
5121 if (!double_reduc)
5122 scalar_results.quick_push (new_temp);
5123 else
5124 scalar_results[0] = new_temp;
5125 }
5126 else
5127 scalar_results[0] = new_temp;
5128
5129 new_phis[0] = epilog_stmt;
5130 }
5131
5132 if (double_reduc)
5133 loop = loop->inner;
5134
5135 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5136 phis with new adjusted scalar results, i.e., replace use <s_out0>
5137 with use <s_out4>.
5138
5139 Transform:
5140 loop_exit:
5141 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5142 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5143 v_out2 = reduce <v_out1>
5144 s_out3 = extract_field <v_out2, 0>
5145 s_out4 = adjust_result <s_out3>
5146 use <s_out0>
5147 use <s_out0>
5148
5149 into:
5150
5151 loop_exit:
5152 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5153 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5154 v_out2 = reduce <v_out1>
5155 s_out3 = extract_field <v_out2, 0>
5156 s_out4 = adjust_result <s_out3>
5157 use <s_out4>
5158 use <s_out4> */
5159
5160
5161 /* In an SLP reduction chain we reduce the vector results into one vector
5162 if necessary, hence we set REDUC_GROUP_SIZE to 1 here. SCALAR_DEST is
5163 the LHS of the last stmt in the reduction chain, since that is where we
5164 look for the loop exit phi node. */
5165 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5166 {
5167 stmt_vec_info dest_stmt_info
5168 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5169 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5170 group_size = 1;
5171 }
5172
5173 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5174 case REDUC_GROUP_SIZE is greater than the vectorization factor).
5175 Therefore, we need to match SCALAR_RESULTS with the corresponding statements.
5176 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5177 correspond to the first vector stmt, etc.
5178 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
5179 if (group_size > new_phis.length ())
5180 gcc_assert (!(group_size % new_phis.length ()));
5181
5182 for (k = 0; k < group_size; k++)
5183 {
5184 if (slp_reduc)
5185 {
5186 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5187
5188 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5189 /* SLP statements can't participate in patterns. */
5190 gcc_assert (!orig_stmt_info);
5191 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5192 }
5193
5194 if (nested_in_vect_loop)
5195 {
5196 if (double_reduc)
5197 loop = outer_loop;
5198 else
5199 gcc_unreachable ();
5200 }
5201
5202 phis.create (3);
5203 /* Find the loop-closed-use at the loop exit of the original scalar
5204 result. (The reduction result is expected to have two immediate uses,
5205 one at the latch block, and one at the loop exit). For double
5206 reductions we are looking for exit phis of the outer loop. */
5207 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5208 {
5209 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5210 {
5211 if (!is_gimple_debug (USE_STMT (use_p)))
5212 phis.safe_push (USE_STMT (use_p));
5213 }
5214 else
5215 {
5216 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5217 {
5218 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5219
5220 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5221 {
5222 if (!flow_bb_inside_loop_p (loop,
5223 gimple_bb (USE_STMT (phi_use_p)))
5224 && !is_gimple_debug (USE_STMT (phi_use_p)))
5225 phis.safe_push (USE_STMT (phi_use_p));
5226 }
5227 }
5228 }
5229 }
5230
5231 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5232 {
5233 /* Replace the uses: */
5234 orig_name = PHI_RESULT (exit_phi);
5235 scalar_result = scalar_results[k];
5236 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5237 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5238 SET_USE (use_p, scalar_result);
5239 }
5240
5241 phis.release ();
5242 }
5243 }
5244
5245 /* Return a vector of type VECTYPE that is equal to the vector select
5246 operation "MASK ? VEC : IDENTITY". Insert the select statements
5247 before GSI. */
5248
5249 static tree
5250 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5251 tree vec, tree identity)
5252 {
5253 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5254 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5255 mask, vec, identity);
5256 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5257 return cond;
5258 }
5259
5260 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5261 order, starting with LHS. Insert the extraction statements before GSI and
5262 associate the new scalar SSA names with variable SCALAR_DEST.
5263 Return the SSA name for the result. */
5264
5265 static tree
5266 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5267 tree_code code, tree lhs, tree vector_rhs)
5268 {
5269 tree vectype = TREE_TYPE (vector_rhs);
5270 tree scalar_type = TREE_TYPE (vectype);
5271 tree bitsize = TYPE_SIZE (scalar_type);
5272 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5273 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5274
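  /* Illustrative sketch (assuming a four-element vector of 32-bit elements
     and CODE == PLUS_EXPR): the loop below expands into
       s_0 = BIT_FIELD_REF <vector_rhs, 32, 0>;   lhs_1 = lhs + s_0;
       s_1 = BIT_FIELD_REF <vector_rhs, 32, 32>;  lhs_2 = lhs_1 + s_1;
       s_2 = BIT_FIELD_REF <vector_rhs, 32, 64>;  lhs_3 = lhs_2 + s_2;
       s_3 = BIT_FIELD_REF <vector_rhs, 32, 96>;  lhs_4 = lhs_3 + s_3;
     and returns lhs_4, preserving the strict left-to-right order.  */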
5275 for (unsigned HOST_WIDE_INT bit_offset = 0;
5276 bit_offset < vec_size_in_bits;
5277 bit_offset += element_bitsize)
5278 {
5279 tree bitpos = bitsize_int (bit_offset);
5280 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5281 bitsize, bitpos);
5282
5283 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5284 rhs = make_ssa_name (scalar_dest, stmt);
5285 gimple_assign_set_lhs (stmt, rhs);
5286 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5287
5288 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5289 tree new_name = make_ssa_name (scalar_dest, stmt);
5290 gimple_assign_set_lhs (stmt, new_name);
5291 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5292 lhs = new_name;
5293 }
5294 return lhs;
5295 }
5296
5297 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
5298 type of the vector input. */
5299
5300 static internal_fn
5301 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
5302 {
5303 internal_fn mask_reduc_fn;
5304
5305 switch (reduc_fn)
5306 {
5307 case IFN_FOLD_LEFT_PLUS:
5308 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
5309 break;
5310
5311 default:
5312 return IFN_LAST;
5313 }
5314
5315 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
5316 OPTIMIZE_FOR_SPEED))
5317 return mask_reduc_fn;
5318 return IFN_LAST;
5319 }
5320
5321 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5322 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5323 statement. CODE is the operation performed by STMT_INFO and OPS are
5324 its scalar operands. REDUC_INDEX is the index of the operand in
5325 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5326 implements in-order reduction, or IFN_LAST if we should open-code it.
5327 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5328 that should be used to control the operation in a fully-masked loop. */
5329
5330 static bool
5331 vectorize_fold_left_reduction (stmt_vec_info stmt_info,
5332 gimple_stmt_iterator *gsi,
5333 stmt_vec_info *vec_stmt, slp_tree slp_node,
5334 gimple *reduc_def_stmt,
5335 tree_code code, internal_fn reduc_fn,
5336 tree ops[3], tree vectype_in,
5337 int reduc_index, vec_loop_masks *masks)
5338 {
5339 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5340 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5341 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5342 stmt_vec_info new_stmt_info = NULL;
5343 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
5344
5345 int ncopies;
5346 if (slp_node)
5347 ncopies = 1;
5348 else
5349 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5350
5351 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5352 gcc_assert (ncopies == 1);
5353 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5354
5355 if (slp_node)
5356 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5357 TYPE_VECTOR_SUBPARTS (vectype_in)));
5358
5359 tree op0 = ops[1 - reduc_index];
5360
5361 int group_size = 1;
5362 stmt_vec_info scalar_dest_def_info;
5363 auto_vec<tree> vec_oprnds0;
5364 if (slp_node)
5365 {
5366 auto_vec<vec<tree> > vec_defs (2);
5367 auto_vec<tree> sops(2);
5368 sops.quick_push (ops[0]);
5369 sops.quick_push (ops[1]);
5370 vect_get_slp_defs (sops, slp_node, &vec_defs);
5371 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
5372 vec_defs[0].release ();
5373 vec_defs[1].release ();
5374 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5375 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5376 }
5377 else
5378 {
5379 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt_info);
5380 vec_oprnds0.create (1);
5381 vec_oprnds0.quick_push (loop_vec_def0);
5382 scalar_dest_def_info = stmt_info;
5383 }
5384
5385 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5386 tree scalar_type = TREE_TYPE (scalar_dest);
5387 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5388
5389 int vec_num = vec_oprnds0.length ();
5390 gcc_assert (vec_num == 1 || slp_node);
5391 tree vec_elem_type = TREE_TYPE (vectype_out);
5392 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5393
5394 tree vector_identity = NULL_TREE;
5395 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5396 vector_identity = build_zero_cst (vectype_out);
5397
5398 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5399 int i;
5400 tree def0;
5401 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5402 {
5403 gimple *new_stmt;
5404 tree mask = NULL_TREE;
5405 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5406 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5407
5408 /* Handle MINUS by adding the negative. */
5409 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5410 {
5411 tree negated = make_ssa_name (vectype_out);
5412 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5413 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5414 def0 = negated;
5415 }
5416
5417 if (mask && mask_reduc_fn == IFN_LAST)
5418 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5419 vector_identity);
5420
5421 /* On the first iteration the input is simply the scalar phi
5422 result, and for subsequent iterations it is the output of
5423 the preceding operation. */
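      /* For example (a sketch assuming IFN_FOLD_LEFT_PLUS and a chained SLP
         group of two vectors), the emitted sequence is roughly:
           reduc_1 = .FOLD_LEFT_PLUS (reduc_phi, vec_def_0);
           scalar_dest = .FOLD_LEFT_PLUS (reduc_1, vec_def_1);
         with .MASK_FOLD_LEFT_PLUS (reduc, vec_def, loop_mask) used instead
         when the loop is fully masked and the target supports it.  */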
5424 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
5425 {
5426 if (mask && mask_reduc_fn != IFN_LAST)
5427 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
5428 def0, mask);
5429 else
5430 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
5431 def0);
5432 /* For chained SLP reductions the output of the previous reduction
5433 operation serves as the input of the next. For the final statement
5434 the output cannot be a temporary - we reuse the original
5435 scalar destination of the last statement. */
5436 if (i != vec_num - 1)
5437 {
5438 gimple_set_lhs (new_stmt, scalar_dest_var);
5439 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5440 gimple_set_lhs (new_stmt, reduc_var);
5441 }
5442 }
5443 else
5444 {
5445 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5446 reduc_var, def0);
5447 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5448 /* Remove the statement, so that we can use the same code paths
5449 as for statements that we've just created. */
5450 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5451 gsi_remove (&tmp_gsi, true);
5452 }
5453
5454 if (i == vec_num - 1)
5455 {
5456 gimple_set_lhs (new_stmt, scalar_dest);
5457 new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
5458 new_stmt);
5459 }
5460 else
5461 new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
5462 new_stmt, gsi);
5463
5464 if (slp_node)
5465 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
5466 }
5467
5468 if (!slp_node)
5469 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
5470
5471 return true;
5472 }
5473
5474 /* Function is_nonwrapping_integer_induction.
5475
5476 Check if STMT_VINFO (which is part of loop LOOP) describes an integer
5477 induction that both increments and is guaranteed not to overflow. */
5478
5479 static bool
5480 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, class loop *loop)
5481 {
5482 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
5483 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5484 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5485 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
5486 widest_int ni, max_loop_value, lhs_max;
5487 wi::overflow_type overflow = wi::OVF_NONE;
5488
5489 /* Make sure the loop is integer based. */
5490 if (TREE_CODE (base) != INTEGER_CST
5491 || TREE_CODE (step) != INTEGER_CST)
5492 return false;
5493
5494 /* Check that the max size of the loop will not wrap. */
5495
5496 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5497 return true;
5498
5499 if (! max_stmt_executions (loop, &ni))
5500 return false;
5501
5502 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5503 &overflow);
5504 if (overflow)
5505 return false;
5506
5507 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5508 TYPE_SIGN (lhs_type), &overflow);
5509 if (overflow)
5510 return false;
5511
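  /* Illustration (not from the sources): with base 0, step 4 and a loop
     that runs at most 1000 times, the induction never exceeds roughly
     4000, which needs far fewer bits than a 32-bit induction variable
     provides, so it cannot wrap.  */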
5512 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5513 <= TYPE_PRECISION (lhs_type));
5514 }
5515
5516 /* Check if masking can be supported by inserting a conditional expression.
5517 CODE is the code for the operation. COND_FN is the conditional internal
5518 function, if it exists. VECTYPE_IN is the type of the vector input. */
5519 static bool
5520 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
5521 tree vectype_in)
5522 {
5523 if (cond_fn != IFN_LAST
5524 && direct_internal_fn_supported_p (cond_fn, vectype_in,
5525 OPTIMIZE_FOR_SPEED))
5526 return false;
5527
5528 switch (code)
5529 {
5530 case DOT_PROD_EXPR:
5531 case SAD_EXPR:
5532 return true;
5533
5534 default:
5535 return false;
5536 }
5537 }
5538
5539 /* Insert a conditional expression to enable masked vectorization. CODE is the
5540 code for the operation. VOP is the array of operands. MASK is the loop
5541 mask. GSI is a statement iterator used to place the new conditional
5542 expression. */
5543 static void
5544 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
5545 gimple_stmt_iterator *gsi)
5546 {
5547 switch (code)
5548 {
5549 case DOT_PROD_EXPR:
5550 {
5551 tree vectype = TREE_TYPE (vop[1]);
5552 tree zero = build_zero_cst (vectype);
5553 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
5554 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
5555 mask, vop[1], zero);
5556 gsi_insert_before (gsi, select, GSI_SAME_STMT);
5557 vop[1] = masked_op1;
5558 break;
5559 }
5560
5561 case SAD_EXPR:
5562 {
5563 tree vectype = TREE_TYPE (vop[1]);
5564 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
5565 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
5566 mask, vop[1], vop[0]);
5567 gsi_insert_before (gsi, select, GSI_SAME_STMT);
5568 vop[1] = masked_op1;
5569 break;
5570 }
5571
5572 default:
5573 gcc_unreachable ();
5574 }
5575 }
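/* Note (illustrative): for DOT_PROD_EXPR the select above yields
     masked_op1 = loop_mask ? op1 : { 0, ... }
   so inactive lanes contribute op0 * 0 == 0 to the accumulator, while for
   SAD_EXPR selecting op0 itself makes inactive lanes contribute
   |op0 - op0| == 0.  */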
5576
5577 /* Function vectorizable_reduction.
5578
5579 Check if STMT_INFO performs a reduction operation that can be vectorized.
5580 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
5581 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5582 Return true if STMT_INFO is vectorizable in this way.
5583
5584 This function also handles reduction idioms (patterns) that have been
5585 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
5586 may be of this form:
5587 X = pattern_expr (arg0, arg1, ..., X)
5588 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5589 sequence that had been detected and replaced by the pattern-stmt
5590 (STMT_INFO).
5591
5592 This function also handles reduction of condition expressions, for example:
5593 for (int i = 0; i < N; i++)
5594 if (a[i] < value)
5595 last = a[i];
5596 This is handled by vectorising the loop and creating an additional vector
5597 containing the loop indexes for which "a[i] < value" was true. In the
5598 function epilogue this is reduced to a single max value and then used to
5599 index into the vector of results.
5600
5601 In some cases of reduction patterns, the type of the reduction variable X is
5602 different than the type of the other arguments of STMT_INFO.
5603 In such cases, the vectype that is used when transforming STMT_INFO into
5604 a vector stmt is different than the vectype that is used to determine the
5605 vectorization factor, because it consists of a different number of elements
5606 than the actual number of elements that are being operated upon in parallel.
5607
5608 For example, consider an accumulation of shorts into an int accumulator.
5609 On some targets it's possible to vectorize this pattern operating on 8
5610 shorts at a time (hence, the vectype for purposes of determining the
5611 vectorization factor should be V8HI); on the other hand, the vectype that
5612 is used to create the vector form is actually V4SI (the type of the result).
5613
5614 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5615 indicates what is the actual level of parallelism (V8HI in the example), so
5616 that the right vectorization factor would be derived. This vectype
5617 corresponds to the type of arguments to the reduction stmt, and should *NOT*
5618 be used to create the vectorized stmt. The right vectype for the vectorized
5619 stmt is obtained from the type of the result X:
5620 get_vectype_for_scalar_type (TREE_TYPE (X))
5621
5622 This means that, contrary to "regular" reductions (or "regular" stmts in
5623 general), the following equation:
5624 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5625 does *NOT* necessarily hold for reduction patterns. */
5626
5627 bool
5628 vectorizable_reduction (stmt_vec_info stmt_info, slp_tree slp_node,
5629 slp_instance slp_node_instance,
5630 stmt_vector_for_cost *cost_vec)
5631 {
5632 tree scalar_dest;
5633 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5634 tree vectype_in = NULL_TREE;
5635 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5636 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5637 enum tree_code code;
5638 int op_type;
5639 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
5640 stmt_vec_info cond_stmt_vinfo = NULL;
5641 tree scalar_type;
5642 int i;
5643 int ncopies;
5644 bool single_defuse_cycle = false;
5645 tree ops[3];
5646 enum vect_def_type dts[3];
5647 bool nested_cycle = false, found_nested_cycle_def = false;
5648 bool double_reduc = false;
5649 int vec_num;
5650 tree tem;
5651 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5652 tree cond_reduc_val = NULL_TREE;
5653
5654 /* Make sure it was already recognized as a reduction computation. */
5655 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
5656 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def
5657 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
5658 return false;
5659
5660 /* The stmt we store reduction analysis meta on. */
5661 stmt_vec_info reduc_info = info_for_reduction (stmt_info);
5662 reduc_info->is_reduc_info = true;
5663
5664 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
5665 {
5666 if (is_a <gphi *> (stmt_info->stmt))
5667 /* Analysis for double-reduction is done on the outer
5668 loop PHI, nested cycles have no further restrictions. */
5669 STMT_VINFO_TYPE (stmt_info) = cycle_phi_info_type;
5670 else
5671 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5672 return true;
5673 }
5674
5675 stmt_vec_info orig_stmt_of_analysis = stmt_info;
5676 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
5677 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
5678 {
5679 if (!is_a <gphi *> (stmt_info->stmt))
5680 {
5681 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5682 return true;
5683 }
5684 if (slp_node)
5685 {
5686 slp_node_instance->reduc_phis = slp_node;
5687 /* ??? We're leaving slp_node to point to the PHIs; we only
5688 need it to get at the number of vector stmts, which wasn't
5689 yet initialized for the instance root. */
5690 }
5691 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5692 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
5693 else /* STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def */
5694 {
5695 use_operand_p use_p;
5696 gimple *use_stmt;
5697 bool res = single_imm_use (gimple_phi_result (stmt_info->stmt),
5698 &use_p, &use_stmt);
5699 gcc_assert (res);
5700 stmt_info = loop_vinfo->lookup_stmt (use_stmt);
5701 stmt_info = vect_stmt_to_vectorize (STMT_VINFO_REDUC_DEF (stmt_info));
5702 }
5703 /* STMT_VINFO_REDUC_DEF doesn't point to the first but the last
5704 element. */
5705 if (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5706 {
5707 gcc_assert (!REDUC_GROUP_NEXT_ELEMENT (stmt_info));
5708 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
5709 }
5710 }
5711
5712 if (nested_in_vect_loop_p (loop, stmt_info))
5713 {
5714 loop = loop->inner;
5715 nested_cycle = true;
5716 }
5717
5718 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5719 gcc_assert (slp_node
5720 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
5721
5722 /* 1. Is vectorizable reduction? */
5723 /* Not supportable if the reduction variable is used in the loop, unless
5724 it's a reduction chain. */
5725 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
5726 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5727 return false;
5728
5729 /* Reductions that are not used even in an enclosing outer-loop,
5730 are expected to be "live" (used out of the loop). */
5731 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
5732 && !STMT_VINFO_LIVE_P (stmt_info))
5733 return false;
5734
5735 /* 2. Has this been recognized as a reduction pattern?
5736
5737 Check if STMT represents a pattern that has been recognized
5738 in earlier analysis stages. For stmts that represent a pattern,
5739 the STMT_VINFO_RELATED_STMT field records the last stmt in
5740 the original sequence that constitutes the pattern. */
5741
5742 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
5743 if (orig_stmt_info)
5744 {
5745 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5746 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
5747 }
5748
5749 /* 3. Check the operands of the operation. The first operands are defined
5750 inside the loop body. The last operand is the reduction variable,
5751 which is defined by the loop-header-phi. */
5752
5753 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
5754
5755 /* Flatten RHS. */
5756 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
5757 {
5758 case GIMPLE_BINARY_RHS:
5759 code = gimple_assign_rhs_code (stmt);
5760 op_type = TREE_CODE_LENGTH (code);
5761 gcc_assert (op_type == binary_op);
5762 ops[0] = gimple_assign_rhs1 (stmt);
5763 ops[1] = gimple_assign_rhs2 (stmt);
5764 break;
5765
5766 case GIMPLE_TERNARY_RHS:
5767 code = gimple_assign_rhs_code (stmt);
5768 op_type = TREE_CODE_LENGTH (code);
5769 gcc_assert (op_type == ternary_op);
5770 ops[0] = gimple_assign_rhs1 (stmt);
5771 ops[1] = gimple_assign_rhs2 (stmt);
5772 ops[2] = gimple_assign_rhs3 (stmt);
5773 break;
5774
5775 case GIMPLE_UNARY_RHS:
5776 case GIMPLE_SINGLE_RHS:
5777 return false;
5778
5779 default:
5780 gcc_unreachable ();
5781 }
5782
5783 if (code == COND_EXPR && slp_node)
5784 return false;
5785
5786 scalar_dest = gimple_assign_lhs (stmt);
5787 scalar_type = TREE_TYPE (scalar_dest);
5788 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
5789 && !SCALAR_FLOAT_TYPE_P (scalar_type))
5790 return false;
5791
5792 /* Do not try to vectorize bit-precision reductions. */
5793 if (!type_has_mode_precision_p (scalar_type))
5794 return false;
5795
5796 /* All uses but the last are expected to be defined in the loop.
5797 The last use is the reduction variable. In case of a nested cycle this
5798 assumption is not true: we use reduc_index to record the index of the
5799 reduction variable. */
5800 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
5801 /* PHIs should not participate in patterns. */
5802 gcc_assert (!STMT_VINFO_RELATED_STMT (phi_info));
5803 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
5804 tree reduc_def = PHI_RESULT (reduc_def_phi);
5805 int reduc_index = -1;
5806 for (i = 0; i < op_type; i++)
5807 {
5808 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
5809 if (i == 0 && code == COND_EXPR)
5810 continue;
5811
5812 stmt_vec_info def_stmt_info;
5813 if (!vect_is_simple_use (ops[i], loop_vinfo, &dts[i], &tem,
5814 &def_stmt_info))
5815 {
5816 if (dump_enabled_p ())
5817 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5818 "use not simple.\n");
5819 return false;
5820 }
5821 dt = dts[i];
5822 if (dt == vect_reduction_def
5823 && ops[i] == reduc_def)
5824 {
5825 reduc_index = i;
5826 continue;
5827 }
5828 else if (tem)
5829 {
5830 /* To properly compute ncopies we are interested in the widest
5831 input type in case we're looking at a widening accumulation. */
5832 if (!vectype_in
5833 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
5834 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
5835 vectype_in = tem;
5836 }
5837
5838 if (dt != vect_internal_def
5839 && dt != vect_external_def
5840 && dt != vect_constant_def
5841 && dt != vect_induction_def
5842 && !(dt == vect_nested_cycle && nested_cycle))
5843 return false;
5844
5845 if (dt == vect_nested_cycle
5846 && ops[i] == reduc_def)
5847 {
5848 found_nested_cycle_def = true;
5849 reduc_index = i;
5850 }
5851
5852 if (code == COND_EXPR)
5853 {
5854 /* Record how the non-reduction-def value of COND_EXPR is defined. */
5855 if (dt == vect_constant_def)
5856 {
5857 cond_reduc_dt = dt;
5858 cond_reduc_val = ops[i];
5859 }
5860 if (dt == vect_induction_def
5861 && def_stmt_info
5862 && is_nonwrapping_integer_induction (def_stmt_info, loop))
5863 {
5864 cond_reduc_dt = dt;
5865 cond_stmt_vinfo = def_stmt_info;
5866 }
5867 }
5868 }
5869 if (!vectype_in)
5870 vectype_in = vectype_out;
5871 STMT_VINFO_REDUC_VECTYPE_IN (reduc_info) = vectype_in;
5872 /* For the SSA cycle we store on each participating stmt the operand index
5873 where the cycle continues. Store the one relevant for the actual
5874 operation in the reduction meta. */
5875 STMT_VINFO_REDUC_IDX (reduc_info) = reduc_index;
5876
5877 if (!(reduc_index == -1
5878 || dts[reduc_index] == vect_reduction_def
5879 || dts[reduc_index] == vect_nested_cycle
5880 || ((dts[reduc_index] == vect_internal_def
5881 || dts[reduc_index] == vect_external_def
5882 || dts[reduc_index] == vect_constant_def
5883 || dts[reduc_index] == vect_induction_def)
5884 && nested_cycle && found_nested_cycle_def)))
5885 {
5886 /* For pattern-recognized stmts, orig_stmt might be a reduction,
5887 but some helper statements for the pattern might not be, or
5888 might be COND_EXPRs with reduction uses in the condition. */
5889 gcc_assert (orig_stmt_info);
5890 return false;
5891 }
5892
5893 enum vect_reduction_type v_reduc_type = STMT_VINFO_REDUC_TYPE (phi_info);
5894 STMT_VINFO_REDUC_TYPE (reduc_info) = v_reduc_type;
5895 /* If we have a condition reduction, see if we can simplify it further. */
5896 if (v_reduc_type == COND_REDUCTION)
5897 {
5898 /* TODO: We can't yet handle reduction chains, since we need to treat
5899 each COND_EXPR in the chain specially, not just the last one.
5900 E.g. for:
5901
5902 x_1 = PHI <x_3, ...>
5903 x_2 = a_2 ? ... : x_1;
5904 x_3 = a_3 ? ... : x_2;
5905
5906 we're interested in the last element in x_3 for which a_2 || a_3
5907 is true, whereas the current reduction chain handling would
5908 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
5909 as a reduction operation. */
5910 if (reduc_index == -1)
5911 {
5912 if (dump_enabled_p ())
5913 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5914 "conditional reduction chains not supported\n");
5915 return false;
5916 }
5917
5918 /* When the condition uses the reduction value in the condition, fail. */
5919 if (reduc_index == 0)
5920 {
5921 if (dump_enabled_p ())
5922 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5923 "condition depends on previous iteration\n");
5924 return false;
5925 }
5926
5927 if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
5928 vectype_in, OPTIMIZE_FOR_SPEED))
5929 {
5930 if (dump_enabled_p ())
5931 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5932 "optimizing condition reduction with"
5933 " FOLD_EXTRACT_LAST.\n");
5934 STMT_VINFO_REDUC_TYPE (reduc_info) = EXTRACT_LAST_REDUCTION;
5935 }
5936 else if (cond_reduc_dt == vect_induction_def)
5937 {
5938 tree base
5939 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
5940 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
5941
5942 gcc_assert (TREE_CODE (base) == INTEGER_CST
5943 && TREE_CODE (step) == INTEGER_CST);
5944 cond_reduc_val = NULL_TREE;
5945 enum tree_code cond_reduc_op_code = ERROR_MARK;
5946 tree res = PHI_RESULT (STMT_VINFO_STMT (cond_stmt_vinfo));
5947 if (!types_compatible_p (TREE_TYPE (res), TREE_TYPE (base)))
5948 ;
5949 /* Find a suitable value: below base for MAX_EXPR, above base for
5950 MIN_EXPR. For now punt if base is the minimum value of the type
5951 for MAX_EXPR or the maximum value of the type for MIN_EXPR. */
5952 else if (tree_int_cst_sgn (step) == -1)
5953 {
5954 cond_reduc_op_code = MIN_EXPR;
5955 if (tree_int_cst_sgn (base) == -1)
5956 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
5957 else if (tree_int_cst_lt (base,
5958 TYPE_MAX_VALUE (TREE_TYPE (base))))
5959 cond_reduc_val
5960 = int_const_binop (PLUS_EXPR, base, integer_one_node);
5961 }
5962 else
5963 {
5964 cond_reduc_op_code = MAX_EXPR;
5965 if (tree_int_cst_sgn (base) == 1)
5966 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
5967 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
5968 base))
5969 cond_reduc_val
5970 = int_const_binop (MINUS_EXPR, base, integer_one_node);
5971 }
5972 if (cond_reduc_val)
5973 {
5974 if (dump_enabled_p ())
5975 dump_printf_loc (MSG_NOTE, vect_location,
5976 "condition expression based on "
5977 "integer induction.\n");
5978 STMT_VINFO_VEC_COND_REDUC_CODE (reduc_info) = cond_reduc_op_code;
5979 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info)
5980 = cond_reduc_val;
5981 STMT_VINFO_REDUC_TYPE (reduc_info) = INTEGER_INDUC_COND_REDUCTION;
5982 }
5983 }
5984 else if (cond_reduc_dt == vect_constant_def)
5985 {
5986 enum vect_def_type cond_initial_dt;
5987 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
5988 tree cond_initial_val
5989 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
5990
5991 gcc_assert (cond_reduc_val != NULL_TREE);
5992 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
5993 if (cond_initial_dt == vect_constant_def
5994 && types_compatible_p (TREE_TYPE (cond_initial_val),
5995 TREE_TYPE (cond_reduc_val)))
5996 {
5997 tree e = fold_binary (LE_EXPR, boolean_type_node,
5998 cond_initial_val, cond_reduc_val);
5999 if (e && (integer_onep (e) || integer_zerop (e)))
6000 {
6001 if (dump_enabled_p ())
6002 dump_printf_loc (MSG_NOTE, vect_location,
6003 "condition expression based on "
6004 "compile time constant.\n");
6005 /* Record reduction code at analysis stage. */
6006 STMT_VINFO_VEC_COND_REDUC_CODE (reduc_info)
6007 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6008 STMT_VINFO_REDUC_TYPE (reduc_info) = CONST_COND_REDUCTION;
6009 }
6010 }
6011 }
6012 }
6013
6014 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6015 /* We changed STMT to be the first stmt in the reduction chain, hence
6016 we check that in this case the first element in the chain is STMT. */
6017 gcc_assert (REDUC_GROUP_FIRST_ELEMENT (STMT_VINFO_REDUC_DEF (phi_info))
6018 == vect_orig_stmt (stmt_info));
6019
6020 if (STMT_VINFO_LIVE_P (phi_info))
6021 return false;
6022
6023 if (slp_node)
6024 ncopies = 1;
6025 else
6026 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6027
6028 gcc_assert (ncopies >= 1);
6029
6030 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6031
6032 if (nested_cycle)
6033 {
6034 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info)
6035 == vect_double_reduction_def);
6036 double_reduc = true;
6037 }
6038
6039 /* 4.2. Check support for the epilog operation.
6040
6041 If STMT represents a reduction pattern, then the type of the
6042 reduction variable may be different than the type of the rest
6043 of the arguments. For example, consider the case of accumulation
6044 of shorts into an int accumulator. The original code:
6045 S1: int_a = (int) short_a;
6046 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6047
6048 was replaced with:
6049 STMT: int_acc = widen_sum <short_a, int_acc>
6050
6051 This means that:
6052 1. The tree-code that is used to create the vector operation in the
6053 epilog code (that reduces the partial results) is not the
6054 tree-code of STMT, but is rather the tree-code of the original
6055 stmt from the pattern that STMT is replacing. I.e, in the example
6056 above we want to use 'widen_sum' in the loop, but 'plus' in the
6057 epilog.
6058 2. The type (mode) we use to check available target support
6059 for the vector operation to be created in the *epilog*, is
6060 determined by the type of the reduction variable (in the example
6061 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6062 However the type (mode) we use to check available target support
6063 for the vector operation to be created *inside the loop*, is
6064 determined by the type of the other arguments to STMT (in the
6065 example we'd check this: optab_handler (widen_sum_optab,
6066 vect_short_mode)).
6067
6068 This is contrary to "regular" reductions, in which the types of all
6069 the arguments are the same as the type of the reduction variable.
6070 For "regular" reductions we can therefore use the same vector type
6071 (and also the same tree-code) when generating the epilog code and
6072 when generating the code inside the loop. */
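  /* A sketch of the short-into-int example above (illustrative only):
       in the loop:   vec_acc = widen_sum <vec_short, vec_acc>
         -- support is checked on the V8HI input mode;
       in the epilog: sum = reduction of the int elements of vec_acc
         -- support is checked on the V4SI result mode.  */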
6073
6074 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6075 enum tree_code orig_code = ERROR_MARK;
6076 if (reduction_type == CONST_COND_REDUCTION
6077 || reduction_type == INTEGER_INDUC_COND_REDUCTION)
6078 {
6079 /* For simple condition reductions, replace with the actual expression
6080 we want to base our reduction around. */
6081 orig_code = STMT_VINFO_VEC_COND_REDUC_CODE (reduc_info);
6082 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6083 }
6084 else if (reduction_type == COND_REDUCTION)
6085 orig_code = COND_EXPR;
6086 else if (reduction_type == TREE_CODE_REDUCTION
6087 || reduction_type == FOLD_LEFT_REDUCTION)
6088 {
6089 if (orig_stmt_info)
6090 orig_code = gimple_assign_rhs_code (orig_stmt_info->stmt);
6091 else
6092 orig_code = code;
6093 gcc_assert (vectype_out);
6094 if (orig_code == MINUS_EXPR)
6095 orig_code = PLUS_EXPR;
6096 }
6097 STMT_VINFO_REDUC_CODE (reduc_info) = orig_code;
6098
6099 if (reduction_type == TREE_CODE_REDUCTION)
6100 {
6101 /* Check whether it's ok to change the order of the computation.
6102 Generally, when vectorizing a reduction we change the order of the
6103 computation. This may change the behavior of the program in some
6104 cases, so we need to check that this is ok. One exception is when
6105 vectorizing an outer-loop: the inner-loop is executed sequentially,
6106 and therefore vectorizing reductions in the inner-loop during
6107 outer-loop vectorization is safe. */
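      /* For example, a float accumulation such as
           for (i = 0; i < n; i++) s += a[i];
         compiled without -fassociative-math must preserve the left-to-right
         evaluation order and is therefore handled as FOLD_LEFT_REDUCTION
         below.  */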
6108 if (needs_fold_left_reduction_p (scalar_type, orig_code))
6109 {
6110 STMT_VINFO_REDUC_TYPE (reduc_info)
6111 = reduction_type = FOLD_LEFT_REDUCTION;
6112 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6113 directly used in the stmt. */
6114 if (reduc_index == -1)
6115 {
6116 if (dump_enabled_p ())
6117 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6118 "in-order reduction chain without SLP.\n");
6119 return false;
6120 }
6121 }
6122 else if (!commutative_tree_code (orig_code)
6123 || !associative_tree_code (orig_code))
6124 {
6125 if (dump_enabled_p ())
6126 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6127 "reduction: not commutative/associative");
6128 return false;
6129 }
6130 }
6131
6132 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6133 && ncopies > 1)
6134 {
6135 if (dump_enabled_p ())
6136 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6137 "multiple types in double reduction or condition "
6138 "reduction or fold-left reduction.\n");
6139 return false;
6140 }
6141
6142 internal_fn reduc_fn = IFN_LAST;
6143 if (reduction_type == TREE_CODE_REDUCTION
6144 || reduction_type == FOLD_LEFT_REDUCTION
6145 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6146 || reduction_type == CONST_COND_REDUCTION)
6147 {
6148 if (reduction_type == FOLD_LEFT_REDUCTION
6149 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6150 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6151 {
6152 if (reduc_fn != IFN_LAST
6153 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6154 OPTIMIZE_FOR_SPEED))
6155 {
6156 if (dump_enabled_p ())
6157 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6158 "reduc op not supported by target.\n");
6159
6160 reduc_fn = IFN_LAST;
6161 }
6162 }
6163 else
6164 {
6165 if (!nested_cycle || double_reduc)
6166 {
6167 if (dump_enabled_p ())
6168 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6169 "no reduc code for scalar code.\n");
6170
6171 return false;
6172 }
6173 }
6174 }
6175 else if (reduction_type == COND_REDUCTION)
6176 {
6177 int scalar_precision
6178 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6179 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6180 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6181 nunits_out);
6182
6183 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6184 OPTIMIZE_FOR_SPEED))
6185 reduc_fn = IFN_REDUC_MAX;
6186 }
6187 STMT_VINFO_REDUC_FN (reduc_info) = reduc_fn;
6188
6189 if (reduction_type != EXTRACT_LAST_REDUCTION
6190 && (!nested_cycle || double_reduc)
6191 && reduc_fn == IFN_LAST
6192 && !nunits_out.is_constant ())
6193 {
6194 if (dump_enabled_p ())
6195 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6196 "missing target support for reduction on"
6197 " variable-length vectors.\n");
6198 return false;
6199 }
6200
6201 /* For SLP reductions, see if there is a neutral value we can use. */
6202 tree neutral_op = NULL_TREE;
6203 if (slp_node)
6204 neutral_op = neutral_op_for_slp_reduction
6205 (slp_node_instance->reduc_phis, code,
6206 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6207
6208 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6209 {
6210 /* We can't support in-order reductions of code such as this:
6211
6212 for (int i = 0; i < n1; ++i)
6213 for (int j = 0; j < n2; ++j)
6214 l += a[j];
6215
6216 since GCC effectively transforms the loop when vectorizing:
6217
6218 for (int i = 0; i < n1 / VF; ++i)
6219 for (int j = 0; j < n2; ++j)
6220 for (int k = 0; k < VF; ++k)
6221 l += a[j];
6222
6223 which is a reassociation of the original operation. */
6224 if (dump_enabled_p ())
6225 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6226 "in-order double reduction not supported.\n");
6227
6228 return false;
6229 }
6230
6231 if (reduction_type == FOLD_LEFT_REDUCTION
6232 && slp_node
6233 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6234 {
6235 /* We cannot use in-order reductions in this case because there is
6236 an implicit reassociation of the operations involved. */
6237 if (dump_enabled_p ())
6238 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6239 "in-order unchained SLP reductions not supported.\n");
6240 return false;
6241 }
6242
6243 /* For double reductions, and for SLP reductions with a neutral value,
6244 we construct a variable-length initial vector by loading a vector
6245 full of the neutral value and then shift-and-inserting the start
6246 values into the low-numbered elements. */
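  /* Roughly (an illustrative sketch): for a neutral value N and a single
     start value s the initial vector is built as
       init_0 = { N, N, ..., N };
       init_1 = .VEC_SHL_INSERT (init_0, s);
     so that s ends up in a low-numbered element and the remaining lanes
     hold the neutral value.  */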
6247 if ((double_reduc || neutral_op)
6248 && !nunits_out.is_constant ()
6249 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6250 vectype_out, OPTIMIZE_FOR_SPEED))
6251 {
6252 if (dump_enabled_p ())
6253 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6254 "reduction on variable-length vectors requires"
6255 " target support for a vector-shift-and-insert"
6256 " operation.\n");
6257 return false;
6258 }
6259
6260 /* Check extra constraints for variable-length unchained SLP reductions. */
6261 if (STMT_SLP_TYPE (stmt_info)
6262 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6263 && !nunits_out.is_constant ())
6264 {
6265 /* We checked above that we could build the initial vector when
6266 there's a neutral element value. Check here for the case in
6267 which each SLP statement has its own initial value and in which
6268 that value needs to be repeated for every instance of the
6269 statement within the initial vector. */
6270 unsigned int group_size = SLP_INSTANCE_GROUP_SIZE (slp_node_instance);
6271 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6272 if (!neutral_op
6273 && !can_duplicate_and_interleave_p (group_size, elt_mode))
6274 {
6275 if (dump_enabled_p ())
6276 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6277 "unsupported form of SLP reduction for"
6278 " variable-length vectors: cannot build"
6279 " initial vector.\n");
6280 return false;
6281 }
6282 /* The epilogue code relies on the number of elements being a multiple
6283 of the group size. The duplicate-and-interleave approach to setting
6284 up the initial vector does too. */
6285 if (!multiple_p (nunits_out, group_size))
6286 {
6287 if (dump_enabled_p ())
6288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6289 "unsupported form of SLP reduction for"
6290 " variable-length vectors: the vector size"
6291 " is not a multiple of the number of results.\n");
6292 return false;
6293 }
6294 }
6295
6296 /* In case of a widening multiplication by a constant, we update the type
6297 of the constant to be the type of the other operand. We check that the
6298 constant fits the type in the pattern recognition pass. */
6299 if (code == DOT_PROD_EXPR
6300 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6301 /* No testcase for this. PR49478. */
6302 gcc_unreachable ();
6303
6304 if (reduction_type == COND_REDUCTION)
6305 {
6306 widest_int ni;
6307
6308 if (! max_loop_iterations (loop, &ni))
6309 {
6310 if (dump_enabled_p ())
6311 dump_printf_loc (MSG_NOTE, vect_location,
6312 "loop count not known, cannot create cond "
6313 "reduction.\n");
6314 return false;
6315 }
6316 /* Convert backedges to iterations. */
6317 ni += 1;
6318
6319 /* The additional index will be the same type as the condition. Check
6320 that the loop iteration count fits into this type less one (because
6321 we use up the zero slot for when there are no matches). */
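      /* E.g. (illustrative): with a 16-bit index type MAX_INDEX is 65535,
         so only loops with fewer iterations than that can be handled,
         index 0 being reserved for "no match".  */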
6322 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6323 if (wi::geu_p (ni, wi::to_widest (max_index)))
6324 {
6325 if (dump_enabled_p ())
6326 dump_printf_loc (MSG_NOTE, vect_location,
6327 "loop size is greater than data size.\n");
6328 return false;
6329 }
6330 }
6331
6332 /* In case the vectorization factor (VF) is bigger than the number
6333 of elements that we can fit in a vectype (nunits), we have to generate
6334 more than one vector stmt - i.e. - we need to "unroll" the
6335 vector stmt by a factor VF/nunits. For more details see documentation
6336 in vectorizable_operation. */
6337
6338 /* If the reduction is used in an outer loop we need to generate
6339 VF intermediate results, like so (e.g. for ncopies=2):
6340 r0 = phi (init, r0)
6341 r1 = phi (init, r1)
6342 r0 = x0 + r0;
6343 r1 = x1 + r1;
6344 (i.e. we generate VF results in 2 registers).
6345 In this case we have a separate def-use cycle for each copy, and therefore
6346 for each copy we get the vector def for the reduction variable from the
6347 respective phi node created for this copy.
6348
6349 Otherwise (the reduction is unused in the loop nest), we can combine
6350 together intermediate results, like so (e.g. for ncopies=2):
6351 r = phi (init, r)
6352 r = x0 + r;
6353 r = x1 + r;
6354 (i.e. we generate VF/2 results in a single register).
6355 In this case for each copy we get the vector def for the reduction variable
6356 from the vectorized reduction operation generated in the previous iteration.
6357
6358 This only works when we see both the reduction PHI and its only consumer
6359 in vectorizable_reduction and there are no intermediate stmts
6360 participating. */
6361 stmt_vec_info use_stmt_info;
6362 tree reduc_phi_result = gimple_phi_result (reduc_def_phi);
6363 if (ncopies > 1
6364 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6365 && (use_stmt_info = loop_vinfo->lookup_single_use (reduc_phi_result))
6366 && (!STMT_VINFO_IN_PATTERN_P (use_stmt_info)
6367 || !STMT_VINFO_PATTERN_DEF_SEQ (use_stmt_info))
6368 && vect_stmt_to_vectorize (use_stmt_info) == stmt_info)
6369 STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info) = single_defuse_cycle = true;
6370
6371 if (single_defuse_cycle
6372 || code == DOT_PROD_EXPR
6373 || code == WIDEN_SUM_EXPR
6374 || code == SAD_EXPR)
6375 {
6376 gcc_assert (code != COND_EXPR);
6377
6378 /* 4. Supportable by target? */
6379
6380 /* 4.1. check support for the operation in the loop */
6381 optab optab = optab_for_tree_code (code, vectype_in, optab_default);
6382 if (!optab)
6383 {
6384 if (dump_enabled_p ())
6385 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6386 "no optab.\n");
6387
6388 return false;
6389 }
6390
6391 machine_mode vec_mode = TYPE_MODE (vectype_in);
6392 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6393 {
6394 if (dump_enabled_p ())
6395 dump_printf (MSG_NOTE, "op not supported by target.\n");
6396
6397 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6398 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6399 return false;
6400
6401 if (dump_enabled_p ())
6402 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6403 }
6404
6405 /* Worthwhile without SIMD support? */
6406 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6407 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6408 {
6409 if (dump_enabled_p ())
6410 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6411 "not worthwhile without SIMD support.\n");
6412
6413 return false;
6414 }
6415 }
6416
6417 /* If the reduction stmt is one of the patterns that have lane
6418 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
6419 if ((ncopies > 1
6420 && ! single_defuse_cycle)
6421 && (code == DOT_PROD_EXPR
6422 || code == WIDEN_SUM_EXPR
6423 || code == SAD_EXPR))
6424 {
6425 if (dump_enabled_p ())
6426 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6427 "multi def-use cycle not possible for lane-reducing "
6428 "reduction operation\n");
6429 return false;
6430 }
6431
6432 if (slp_node)
6433 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6434 else
6435 vec_num = 1;
6436
6437 internal_fn cond_fn = get_conditional_internal_fn (code);
6438 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6439 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
6440
6441 vect_model_reduction_cost (stmt_info, reduc_fn, reduction_type, ncopies,
6442 cost_vec);
6443 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
6444 {
6445 if (reduction_type != FOLD_LEFT_REDUCTION
6446 && !mask_by_cond_expr
6447 && (cond_fn == IFN_LAST
6448 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6449 OPTIMIZE_FOR_SPEED)))
6450 {
6451 if (dump_enabled_p ())
6452 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6453 "can't use a fully-masked loop because no"
6454 " conditional operation is available.\n");
6455 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6456 }
6457 else if (reduc_index == -1)
6458 {
6459 if (dump_enabled_p ())
6460 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6461 "can't use a fully-masked loop for chained"
6462 " reductions.\n");
6463 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6464 }
6465 else
6466 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
6467 vectype_in);
6468 }
6469 if (dump_enabled_p ()
6470 && reduction_type == FOLD_LEFT_REDUCTION)
6471 dump_printf_loc (MSG_NOTE, vect_location,
6472 "using an in-order (fold-left) reduction.\n");
6473 STMT_VINFO_TYPE (orig_stmt_of_analysis) = cycle_phi_info_type;
6474 /* All but single def-use-cycle optimized, lane-reducing and fold-left
6475 reductions go through their own vectorizable_* routines. */
6476 if (!single_defuse_cycle
6477 && code != DOT_PROD_EXPR
6478 && code != WIDEN_SUM_EXPR
6479 && code != SAD_EXPR
6480 && reduction_type != FOLD_LEFT_REDUCTION)
6481 {
6482 STMT_VINFO_DEF_TYPE (stmt_info) = vect_internal_def;
6483 STMT_VINFO_DEF_TYPE (vect_orig_stmt (stmt_info)) = vect_internal_def;
6484 }
6485 return true;
6486 }
6487
6488 /* Transform the definition stmt STMT_INFO of a reduction PHI backedge
6489 value. */
6490
6491 bool
6492 vect_transform_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
6493 stmt_vec_info *vec_stmt, slp_tree slp_node)
6494 {
6495 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6496 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6497 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6498 int i;
6499 int ncopies;
6500 int j;
6501 int vec_num;
6502
6503 stmt_vec_info reduc_info = info_for_reduction (stmt_info);
6504 gcc_assert (reduc_info->is_reduc_info);
6505
6506 if (nested_in_vect_loop_p (loop, stmt_info))
6507 {
6508 loop = loop->inner;
6509 gcc_assert (STMT_VINFO_DEF_TYPE (reduc_info) == vect_double_reduction_def);
6510 }
6511
6512 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6513 enum tree_code code = gimple_assign_rhs_code (stmt);
6514 int op_type = TREE_CODE_LENGTH (code);
6515
6516 /* Flatten RHS. */
6517 tree ops[3];
6518 switch (get_gimple_rhs_class (code))
6519 {
6520 case GIMPLE_TERNARY_RHS:
6521 ops[2] = gimple_assign_rhs3 (stmt);
6522 /* Fall thru. */
6523 case GIMPLE_BINARY_RHS:
6524 ops[0] = gimple_assign_rhs1 (stmt);
6525 ops[1] = gimple_assign_rhs2 (stmt);
6526 break;
6527 default:
6528 gcc_unreachable ();
6529 }
6530
6531 /* All uses but the last are expected to be defined in the loop.
6532 The last use is the reduction variable. In case of a nested cycle this
6533 assumption does not hold: we use reduc_index to record the index of the
6534 reduction variable. */
6535 stmt_vec_info phi_info = STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info));
6536 gphi *reduc_def_phi = as_a <gphi *> (phi_info->stmt);
6537 int reduc_index = STMT_VINFO_REDUC_IDX (reduc_info);
6538 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
6539
6540 if (slp_node)
6541 {
6542 ncopies = 1;
6543 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6544 }
6545 else
6546 {
6547 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6548 vec_num = 1;
6549 }
6550
6551 internal_fn cond_fn = get_conditional_internal_fn (code);
6552 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6553 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
6554
6555 /* Transform. */
6556 stmt_vec_info new_stmt_info = NULL;
6557 stmt_vec_info prev_stmt_info;
6558 tree new_temp = NULL_TREE;
6559 auto_vec<tree> vec_oprnds0;
6560 auto_vec<tree> vec_oprnds1;
6561 auto_vec<tree> vec_oprnds2;
6562 tree def0;
6563
6564 if (dump_enabled_p ())
6565 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6566
6567 /* FORNOW: Multiple types are not supported for condition. */
6568 if (code == COND_EXPR)
6569 gcc_assert (ncopies == 1);
6570
6571 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
6572
6573 vect_reduction_type reduction_type = STMT_VINFO_REDUC_TYPE (reduc_info);
6574 if (reduction_type == FOLD_LEFT_REDUCTION)
6575 {
6576 internal_fn reduc_fn = STMT_VINFO_REDUC_FN (reduc_info);
6577 return vectorize_fold_left_reduction
6578 (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
6579 reduc_fn, ops, vectype_in, reduc_index, masks);
6580 }
6581
6582 bool single_defuse_cycle = STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info);
6583 gcc_assert (single_defuse_cycle
6584 || code == DOT_PROD_EXPR
6585 || code == WIDEN_SUM_EXPR
6586 || code == SAD_EXPR);
6587
6588 /* Create the destination vector */
6589 tree scalar_dest = gimple_assign_lhs (stmt);
6590 tree vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6591
6592 prev_stmt_info = NULL;
6593 if (!slp_node)
6594 {
6595 vec_oprnds0.create (1);
6596 vec_oprnds1.create (1);
6597 if (op_type == ternary_op)
6598 vec_oprnds2.create (1);
6599 }
6600
6601 for (j = 0; j < ncopies; j++)
6602 {
6603 /* Handle uses. */
6604 if (j == 0)
6605 {
6606 if (slp_node)
6607 {
6608 /* Get vec defs for all the operands except the reduction index,
6609 ensuring the ordering of the ops in the vector is kept. */
6610 auto_vec<tree, 3> slp_ops;
6611 auto_vec<vec<tree>, 3> vec_defs;
6612
6613 slp_ops.quick_push (ops[0]);
6614 slp_ops.quick_push (ops[1]);
6615 if (op_type == ternary_op)
6616 slp_ops.quick_push (ops[2]);
6617
6618 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
6619
6620 vec_oprnds0.safe_splice (vec_defs[0]);
6621 vec_defs[0].release ();
6622 vec_oprnds1.safe_splice (vec_defs[1]);
6623 vec_defs[1].release ();
6624 if (op_type == ternary_op)
6625 {
6626 vec_oprnds2.safe_splice (vec_defs[2]);
6627 vec_defs[2].release ();
6628 }
6629 }
6630 else
6631 {
6632 vec_oprnds0.quick_push
6633 (vect_get_vec_def_for_operand (ops[0], stmt_info));
6634 vec_oprnds1.quick_push
6635 (vect_get_vec_def_for_operand (ops[1], stmt_info));
6636 if (op_type == ternary_op)
6637 vec_oprnds2.quick_push
6638 (vect_get_vec_def_for_operand (ops[2], stmt_info));
6639 }
6640 }
6641 else
6642 {
6643 if (!slp_node)
6644 {
6645 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
6646
6647 if (single_defuse_cycle && reduc_index == 0)
6648 vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
6649 else
6650 vec_oprnds0[0]
6651 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
6652 vec_oprnds0[0]);
6653 if (single_defuse_cycle && reduc_index == 1)
6654 vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
6655 else
6656 vec_oprnds1[0]
6657 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
6658 vec_oprnds1[0]);
6659 if (op_type == ternary_op)
6660 {
6661 if (single_defuse_cycle && reduc_index == 2)
6662 vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
6663 else
6664 vec_oprnds2[0]
6665 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
6666 vec_oprnds2[0]);
6667 }
6668 }
6669 }
6670
6671 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6672 {
6673 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
6674 if (masked_loop_p && !mask_by_cond_expr)
6675 {
6676 /* Make sure that the reduction accumulator is vop[0]. */
6677 if (reduc_index == 1)
6678 {
6679 gcc_assert (commutative_tree_code (code));
6680 std::swap (vop[0], vop[1]);
6681 }
6682 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
6683 vectype_in, i * ncopies + j);
6684 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
6685 vop[0], vop[1],
6686 vop[0]);
6687 new_temp = make_ssa_name (vec_dest, call);
6688 gimple_call_set_lhs (call, new_temp);
6689 gimple_call_set_nothrow (call, true);
6690 new_stmt_info
6691 = vect_finish_stmt_generation (stmt_info, call, gsi);
6692 }
6693 else
6694 {
6695 if (op_type == ternary_op)
6696 vop[2] = vec_oprnds2[i];
6697
6698 if (masked_loop_p && mask_by_cond_expr)
6699 {
6700 tree mask = vect_get_loop_mask (gsi, masks,
6701 vec_num * ncopies,
6702 vectype_in, i * ncopies + j);
6703 build_vect_cond_expr (code, vop, mask, gsi);
6704 }
6705
6706 gassign *new_stmt = gimple_build_assign (vec_dest, code,
6707 vop[0], vop[1], vop[2]);
6708 new_temp = make_ssa_name (vec_dest, new_stmt);
6709 gimple_assign_set_lhs (new_stmt, new_temp);
6710 new_stmt_info
6711 = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
6712 }
6713
6714 if (slp_node)
6715 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
6716 }
6717
6718 if (slp_node || single_defuse_cycle)
6719 continue;
6720
6721 if (j == 0)
6722 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
6723 else
6724 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
6725
6726 prev_stmt_info = new_stmt_info;
6727 }
6728
6729 if (single_defuse_cycle && !slp_node)
6730 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
6731
6732 return true;
6733 }
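
/* Illustrative sketch of the masked code generated above (assumed names,
   shown per lane rather than as vectors): when the loop is fully masked
   and no COND_EXPR-based masking is used, the conditional internal
   function call

     new_acc = IFN_COND_<OP> (loop_mask, acc, x, acc);

   leaves inactive lanes unchanged, element-wise equivalent to

     int cond_op_lane (int active, int acc, int x)
     {
       return active ? acc + x : acc;  // '+' standing in for <OP>.
     }

   Because the "else" value is the accumulator, the accumulator must be
   operand vop[0], which is why the operands are swapped when
   reduc_index == 1 (only valid for commutative codes).  */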
6734
6735 /* Transform phase of a cycle PHI. */
6736
6737 bool
6738 vect_transform_cycle_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
6739 slp_tree slp_node, slp_instance slp_node_instance)
6740 {
6741 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6742 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6743 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6744 int i;
6745 int ncopies;
6746 stmt_vec_info prev_phi_info;
6747 int j;
6748 bool nested_cycle = false;
6749 int vec_num;
6750
6751 if (nested_in_vect_loop_p (loop, stmt_info))
6752 {
6753 loop = loop->inner;
6754 nested_cycle = true;
6755 }
6756
6757 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
6758 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
6759 stmt_vec_info reduc_info = info_for_reduction (stmt_info);
6760 gcc_assert (reduc_info->is_reduc_info);
6761
6762 if (STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION
6763 || STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION)
6764 /* Leave the scalar phi in place. */
6765 return true;
6766
6767 tree vectype_in = STMT_VINFO_REDUC_VECTYPE_IN (reduc_info);
6768 /* For a nested cycle we do not fill the above. */
6769 if (!vectype_in)
6770 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
6771 gcc_assert (vectype_in);
6772
6773 if (slp_node)
6774 {
6775 /* The size vect_schedule_slp_instance computes is off for us. */
6776 vec_num = vect_get_num_vectors
6777 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6778 * SLP_TREE_SCALAR_STMTS (slp_node).length (), vectype_in);
6779 ncopies = 1;
6780 }
6781 else
6782 {
6783 vec_num = 1;
6784 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6785 }
6786
6787 /* Check whether we should use a single PHI node and accumulate
6788 vectors to one before the backedge. */
6789 if (STMT_VINFO_FORCE_SINGLE_CYCLE (reduc_info))
6790 ncopies = 1;
6791
6792 /* Create the destination vector */
6793 gphi *phi = as_a <gphi *> (stmt_info->stmt);
6794 tree vec_dest = vect_create_destination_var (gimple_phi_result (phi),
6795 vectype_out);
6796
6797 /* Get the loop-entry arguments. */
6798 tree vec_initial_def;
6799 auto_vec<tree> vec_initial_defs;
6800 if (slp_node)
6801 {
6802 vec_initial_defs.reserve (vec_num);
6803 gcc_assert (slp_node == slp_node_instance->reduc_phis);
6804 stmt_vec_info first = REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info);
6805 tree neutral_op
6806 = neutral_op_for_slp_reduction (slp_node,
6807 STMT_VINFO_REDUC_CODE (reduc_info),
6808 first != NULL);
6809 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
6810 &vec_initial_defs, vec_num,
6811 first != NULL, neutral_op);
6812 }
6813 else
6814 {
6815 /* Get at the scalar def before the loop, that defines the initial
6816 value of the reduction variable. */
6817 tree initial_def = PHI_ARG_DEF_FROM_EDGE (phi,
6818 loop_preheader_edge (loop));
6819 /* Optimize: if initial_def is a constant smaller than induc_val for a
6820 MAX reduction (and we can't use zero for induc_val), use initial_def
6821 instead; similarly for a MIN reduction and initial_def larger. */
6822 if (STMT_VINFO_REDUC_TYPE (reduc_info) == INTEGER_INDUC_COND_REDUCTION)
6823 {
6824 tree induc_val = STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info);
6825 if (TREE_CODE (initial_def) == INTEGER_CST
6826 && !integer_zerop (induc_val)
6827 && (((STMT_VINFO_VEC_COND_REDUC_CODE (reduc_info) == MAX_EXPR)
6828 && tree_int_cst_lt (initial_def, induc_val))
6829 || ((STMT_VINFO_VEC_COND_REDUC_CODE (reduc_info) == MIN_EXPR)
6830 && tree_int_cst_lt (induc_val, initial_def))))
6831 {
6832 induc_val = initial_def;
6833 /* Communicate to the epilogue generation that we used
6834 the initial_def. */
6835 STMT_VINFO_VEC_INDUC_COND_INITIAL_VAL (reduc_info) = NULL_TREE;
6836 }
6837 vec_initial_def = build_vector_from_val (vectype_out, induc_val);
6838 }
6839 else if (nested_cycle)
6840 {
6841 /* Do not use an adjustment def as that case is not supported
6842 correctly if ncopies is not one. */
6843 vec_initial_def = vect_get_vec_def_for_operand (initial_def,
6844 reduc_stmt_info);
6845 }
6846 else
6847 {
6848 tree adjustment_def = NULL_TREE;
6849 tree *adjustment_defp = &adjustment_def;
6850 enum tree_code code = STMT_VINFO_REDUC_CODE (reduc_info);
6851 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def)
6852 adjustment_defp = NULL;
6853 vec_initial_def
6854 = get_initial_def_for_reduction (reduc_stmt_info, code,
6855 initial_def, adjustment_defp);
6856 STMT_VINFO_REDUC_EPILOGUE_ADJUSTMENT (reduc_info) = adjustment_def;
6857 }
6858 vec_initial_defs.create (1);
6859 vec_initial_defs.quick_push (vec_initial_def);
6860 }
6861
6862 /* Generate the reduction PHIs upfront. */
6863 prev_phi_info = NULL;
6864 for (i = 0; i < vec_num; i++)
6865 {
6866 tree vec_init_def = vec_initial_defs[i];
6867 for (j = 0; j < ncopies; j++)
6868 {
6869 /* Create the reduction-phi that defines the reduction
6870 operand. */
6871 gphi *new_phi = create_phi_node (vec_dest, loop->header);
6872 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6873
6874 /* Set the loop-entry arg of the reduction-phi. */
6875 if (j != 0 && nested_cycle)
6876 vec_init_def = vect_get_vec_def_for_stmt_copy (loop_vinfo,
6877 vec_init_def);
6878 add_phi_arg (new_phi, vec_init_def, loop_preheader_edge (loop),
6879 UNKNOWN_LOCATION);
6880
6881 /* The loop-latch arg is set in epilogue processing. */
6882
6883 if (slp_node)
6884 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
6885 else
6886 {
6887 if (j == 0)
6888 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi_info;
6889 else
6890 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
6891 prev_phi_info = new_phi_info;
6892 }
6893 }
6894 }
6895
6896 return true;
6897 }
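
/* Illustrative example (assumed names) of the loop-entry value built
   above for a non-SLP sum reduction with VF = 4:

     scalar:  sum = init;  loop: sum += a[i];
     vector:  vec_sum = PHI <{ init, 0, 0, 0 } (preheader), ... (latch)>

   get_initial_def_for_reduction either folds the initial value into one
   lane and fills the rest with the neutral value of the reduction code
   (0 for PLUS_EXPR), or seeds all lanes with the neutral value and
   records the initial value as an epilogue adjustment to be applied
   after the lanes have been reduced.  */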
6898
6899 /* Vectorizes LC PHIs. */
6900
6901 bool
6902 vectorizable_lc_phi (stmt_vec_info stmt_info, stmt_vec_info *vec_stmt,
6903 slp_tree slp_node)
6904 {
6905 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6906 if (!loop_vinfo
6907 || !is_a <gphi *> (stmt_info->stmt)
6908 || gimple_phi_num_args (stmt_info->stmt) != 1)
6909 return false;
6910
6911 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_internal_def
6912 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
6913 return false;
6914
6915 if (!vec_stmt) /* transformation not required. */
6916 {
6917 STMT_VINFO_TYPE (stmt_info) = lc_phi_info_type;
6918 return true;
6919 }
6920
6921 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6922 tree scalar_dest = gimple_phi_result (stmt_info->stmt);
6923 basic_block bb = gimple_bb (stmt_info->stmt);
6924 edge e = single_pred_edge (bb);
6925 tree vec_dest = vect_create_destination_var (scalar_dest, vectype);
6926 vec<tree> vec_oprnds = vNULL;
6927 vect_get_vec_defs (gimple_phi_arg_def (stmt_info->stmt, 0), NULL_TREE,
6928 stmt_info, &vec_oprnds, NULL, slp_node);
6929 if (slp_node)
6930 {
6931 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6932 gcc_assert (vec_oprnds.length () == vec_num);
6933 for (unsigned i = 0; i < vec_num; i++)
6934 {
6935 /* Create the vectorized LC PHI node. */
6936 gphi *new_phi = create_phi_node (vec_dest, bb);
6937 add_phi_arg (new_phi, vec_oprnds[i], e, UNKNOWN_LOCATION);
6938 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6939 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
6940 }
6941 }
6942 else
6943 {
6944 unsigned ncopies = vect_get_num_copies (loop_vinfo, vectype);
6945 stmt_vec_info prev_phi_info = NULL;
6946 for (unsigned i = 0; i < ncopies; i++)
6947 {
6948 if (i != 0)
6949 vect_get_vec_defs_for_stmt_copy (loop_vinfo, &vec_oprnds, NULL);
6950 /* Create the vectorized LC PHI node. */
6951 gphi *new_phi = create_phi_node (vec_dest, bb);
6952 add_phi_arg (new_phi, vec_oprnds[0], e, UNKNOWN_LOCATION);
6953 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6954 if (i == 0)
6955 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi_info;
6956 else
6957 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
6958 prev_phi_info = new_phi_info;
6959 }
6960 }
6961 vec_oprnds.release ();
6962
6963 return true;
6964 }
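
/* Illustrative example: in loop-closed SSA an LC PHI has exactly one
   argument, e.g.

     bb (single predecessor edge E):  x_1 = PHI <x_2 (E)>

   and the code above mirrors it with one vector PHI per vector copy
   (or per SLP vector):

     bb (single predecessor edge E):  vx_1 = PHI <vx_2 (E)>  */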
6965
6966
6967 /* Function vect_min_worthwhile_factor.
6968
6969 For a loop where we could vectorize the operation indicated by CODE,
6970 return the minimum vectorization factor that makes it worthwhile
6971 to use generic vectors. */
6972 static unsigned int
6973 vect_min_worthwhile_factor (enum tree_code code)
6974 {
6975 switch (code)
6976 {
6977 case PLUS_EXPR:
6978 case MINUS_EXPR:
6979 case NEGATE_EXPR:
6980 return 4;
6981
6982 case BIT_AND_EXPR:
6983 case BIT_IOR_EXPR:
6984 case BIT_XOR_EXPR:
6985 case BIT_NOT_EXPR:
6986 return 2;
6987
6988 default:
6989 return INT_MAX;
6990 }
6991 }
6992
6993 /* Return true if VINFO indicates we are doing loop vectorization and if
6994 it is worth decomposing CODE operations into scalar operations for
6995 that loop's vectorization factor. */
6996
6997 bool
6998 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
6999 {
7000 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7001 unsigned HOST_WIDE_INT value;
7002 return (loop_vinfo
7003 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7004 && value >= vect_min_worthwhile_factor (code));
7005 }
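
/* Worked example (illustrative): vect_min_worthwhile_factor (PLUS_EXPR)
   is 4 and vect_min_worthwhile_factor (BIT_AND_EXPR) is 2, so decomposing
   the operation into generic word-mode vector operations is considered
   worthwhile for

     VF = 2: BIT_AND_EXPR only (2 >= 2, but 2 < 4)
     VF = 4: both PLUS_EXPR and BIT_AND_EXPR

   vect_worthwhile_without_simd_p additionally requires that we are doing
   loop vectorization and that the vectorization factor is a compile-time
   constant.  */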
7006
7007 /* Function vectorizable_induction
7008
7009 Check if STMT_INFO performs an induction computation that can be vectorized.
7010 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7011 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7012 Return true if STMT_INFO is vectorizable in this way. */
7013
7014 bool
7015 vectorizable_induction (stmt_vec_info stmt_info,
7016 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7017 stmt_vec_info *vec_stmt, slp_tree slp_node,
7018 stmt_vector_for_cost *cost_vec)
7019 {
7020 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7021 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7022 unsigned ncopies;
7023 bool nested_in_vect_loop = false;
7024 class loop *iv_loop;
7025 tree vec_def;
7026 edge pe = loop_preheader_edge (loop);
7027 basic_block new_bb;
7028 tree new_vec, vec_init, vec_step, t;
7029 tree new_name;
7030 gimple *new_stmt;
7031 gphi *induction_phi;
7032 tree induc_def, vec_dest;
7033 tree init_expr, step_expr;
7034 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7035 unsigned i;
7036 tree expr;
7037 gimple_seq stmts;
7038 imm_use_iterator imm_iter;
7039 use_operand_p use_p;
7040 gimple *exit_phi;
7041 edge latch_e;
7042 tree loop_arg;
7043 gimple_stmt_iterator si;
7044
7045 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7046 if (!phi)
7047 return false;
7048
7049 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7050 return false;
7051
7052 /* Make sure it was recognized as induction computation. */
7053 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7054 return false;
7055
7056 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7057 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7058
7059 if (slp_node)
7060 ncopies = 1;
7061 else
7062 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7063 gcc_assert (ncopies >= 1);
7064
7065 /* FORNOW. These restrictions should be relaxed. */
7066 if (nested_in_vect_loop_p (loop, stmt_info))
7067 {
7068 imm_use_iterator imm_iter;
7069 use_operand_p use_p;
7070 gimple *exit_phi;
7071 edge latch_e;
7072 tree loop_arg;
7073
7074 if (ncopies > 1)
7075 {
7076 if (dump_enabled_p ())
7077 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7078 "multiple types in nested loop.\n");
7079 return false;
7080 }
7081
7082 /* FORNOW: outer loop induction with SLP not supported. */
7083 if (STMT_SLP_TYPE (stmt_info))
7084 return false;
7085
7086 exit_phi = NULL;
7087 latch_e = loop_latch_edge (loop->inner);
7088 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7089 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7090 {
7091 gimple *use_stmt = USE_STMT (use_p);
7092 if (is_gimple_debug (use_stmt))
7093 continue;
7094
7095 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7096 {
7097 exit_phi = use_stmt;
7098 break;
7099 }
7100 }
7101 if (exit_phi)
7102 {
7103 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7104 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7105 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7106 {
7107 if (dump_enabled_p ())
7108 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7109 "inner-loop induction only used outside "
7110 "of the outer vectorized loop.\n");
7111 return false;
7112 }
7113 }
7114
7115 nested_in_vect_loop = true;
7116 iv_loop = loop->inner;
7117 }
7118 else
7119 iv_loop = loop;
7120 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7121
7122 if (slp_node && !nunits.is_constant ())
7123 {
7124 /* The current SLP code creates the initial value element-by-element. */
7125 if (dump_enabled_p ())
7126 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7127 "SLP induction not supported for variable-length"
7128 " vectors.\n");
7129 return false;
7130 }
7131
7132 if (!vec_stmt) /* transformation not required. */
7133 {
7134 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7135 DUMP_VECT_SCOPE ("vectorizable_induction");
7136 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7137 return true;
7138 }
7139
7140 /* Transform. */
7141
7142 /* Compute a vector variable, initialized with the first VF values of
7143 the induction variable. E.g., for an iv with IV_PHI='X' and
7144 evolution S, for a vector of 4 units, we want to compute:
7145 [X, X + S, X + 2*S, X + 3*S]. */
7146
7147 if (dump_enabled_p ())
7148 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7149
7150 latch_e = loop_latch_edge (iv_loop);
7151 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7152
7153 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7154 gcc_assert (step_expr != NULL_TREE);
7155 tree step_vectype = get_same_sized_vectype (TREE_TYPE (step_expr), vectype);
7156
7157 pe = loop_preheader_edge (iv_loop);
7158 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7159 loop_preheader_edge (iv_loop));
7160
7161 stmts = NULL;
7162 if (!nested_in_vect_loop)
7163 {
7164 /* Convert the initial value to the IV update type. */
7165 tree new_type = TREE_TYPE (step_expr);
7166 init_expr = gimple_convert (&stmts, new_type, init_expr);
7167
7168 /* If we are using the loop mask to "peel" for alignment then we need
7169 to adjust the start value here. */
7170 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7171 if (skip_niters != NULL_TREE)
7172 {
7173 if (FLOAT_TYPE_P (vectype))
7174 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7175 skip_niters);
7176 else
7177 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7178 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7179 skip_niters, step_expr);
7180 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7181 init_expr, skip_step);
7182 }
7183 }
7184
7185 if (stmts)
7186 {
7187 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7188 gcc_assert (!new_bb);
7189 }
7190
7191 /* Find the first insertion point in the BB. */
7192 basic_block bb = gimple_bb (phi);
7193 si = gsi_after_labels (bb);
7194
7195 /* For SLP induction we have to generate several IVs as for example
7196 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7197 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7198 [VF*S, VF*S, VF*S, VF*S] for all. */
7199 if (slp_node)
7200 {
7201 /* Enforced above. */
7202 unsigned int const_nunits = nunits.to_constant ();
7203
7204 /* Generate [VF*S, VF*S, ... ]. */
7205 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7206 {
7207 expr = build_int_cst (integer_type_node, vf);
7208 expr = fold_convert (TREE_TYPE (step_expr), expr);
7209 }
7210 else
7211 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7212 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7213 expr, step_expr);
7214 if (! CONSTANT_CLASS_P (new_name))
7215 new_name = vect_init_vector (stmt_info, new_name,
7216 TREE_TYPE (step_expr), NULL);
7217 new_vec = build_vector_from_val (step_vectype, new_name);
7218 vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7219
7220 /* Now generate the IVs. */
7221 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7222 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7223 unsigned elts = const_nunits * nvects;
7224 unsigned nivs = least_common_multiple (group_size,
7225 const_nunits) / const_nunits;
7226 gcc_assert (elts % group_size == 0);
7227 tree elt = init_expr;
7228 unsigned ivn;
7229 for (ivn = 0; ivn < nivs; ++ivn)
7230 {
7231 tree_vector_builder elts (step_vectype, const_nunits, 1);
7232 stmts = NULL;
7233 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7234 {
7235 if (ivn*const_nunits + eltn >= group_size
7236 && (ivn * const_nunits + eltn) % group_size == 0)
7237 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7238 elt, step_expr);
7239 elts.quick_push (elt);
7240 }
7241 vec_init = gimple_build_vector (&stmts, &elts);
7242 vec_init = gimple_convert (&stmts, vectype, vec_init);
7243 if (stmts)
7244 {
7245 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7246 gcc_assert (!new_bb);
7247 }
7248
7249 /* Create the induction-phi that defines the induction-operand. */
7250 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7251 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7252 stmt_vec_info induction_phi_info
7253 = loop_vinfo->add_stmt (induction_phi);
7254 induc_def = PHI_RESULT (induction_phi);
7255
7256 /* Create the iv update inside the loop */
7257 gimple_seq stmts = NULL;
7258 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7259 vec_def = gimple_build (&stmts,
7260 PLUS_EXPR, step_vectype, vec_def, vec_step);
7261 vec_def = gimple_convert (&stmts, vectype, vec_def);
7262 loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (vec_def));
7263 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7264
7265 /* Set the arguments of the phi node: */
7266 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7267 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7268 UNKNOWN_LOCATION);
7269
7270 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7271 }
7272
7273 /* Re-use IVs when we can. */
7274 if (ivn < nvects)
7275 {
7276 unsigned vfp
7277 = least_common_multiple (group_size, const_nunits) / group_size;
7278 /* Generate [VF'*S, VF'*S, ... ]. */
7279 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7280 {
7281 expr = build_int_cst (integer_type_node, vfp);
7282 expr = fold_convert (TREE_TYPE (step_expr), expr);
7283 }
7284 else
7285 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7286 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7287 expr, step_expr);
7288 if (! CONSTANT_CLASS_P (new_name))
7289 new_name = vect_init_vector (stmt_info, new_name,
7290 TREE_TYPE (step_expr), NULL);
7291 new_vec = build_vector_from_val (step_vectype, new_name);
7292 vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7293 for (; ivn < nvects; ++ivn)
7294 {
7295 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7296 tree def;
7297 if (gimple_code (iv) == GIMPLE_PHI)
7298 def = gimple_phi_result (iv);
7299 else
7300 def = gimple_assign_lhs (iv);
7301 gimple_seq stmts = NULL;
7302 def = gimple_convert (&stmts, step_vectype, def);
7303 def = gimple_build (&stmts,
7304 PLUS_EXPR, step_vectype, def, vec_step);
7305 def = gimple_convert (&stmts, vectype, def);
7306 if (gimple_code (iv) == GIMPLE_PHI)
7307 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7308 else
7309 {
7310 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7311 gsi_insert_seq_after (&tgsi, stmts, GSI_CONTINUE_LINKING);
7312 }
7313 SLP_TREE_VEC_STMTS (slp_node).quick_push
7314 (loop_vinfo->add_stmt (SSA_NAME_DEF_STMT (def)));
7315 }
7316 }
7317
7318 return true;
7319 }
7320
7321 /* Create the vector that holds the initial_value of the induction. */
7322 if (nested_in_vect_loop)
7323 {
7324 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7325 been created during vectorization of previous stmts. We obtain it
7326 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7327 vec_init = vect_get_vec_def_for_operand (init_expr, stmt_info);
7328 /* If the initial value is not of proper type, convert it. */
7329 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7330 {
7331 new_stmt
7332 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7333 vect_simple_var,
7334 "vec_iv_"),
7335 VIEW_CONVERT_EXPR,
7336 build1 (VIEW_CONVERT_EXPR, vectype,
7337 vec_init));
7338 vec_init = gimple_assign_lhs (new_stmt);
7339 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7340 new_stmt);
7341 gcc_assert (!new_bb);
7342 loop_vinfo->add_stmt (new_stmt);
7343 }
7344 }
7345 else
7346 {
7347 /* iv_loop is the loop to be vectorized. Create:
7348 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7349 stmts = NULL;
7350 new_name = gimple_convert (&stmts, TREE_TYPE (step_expr), init_expr);
7351
7352 unsigned HOST_WIDE_INT const_nunits;
7353 if (nunits.is_constant (&const_nunits))
7354 {
7355 tree_vector_builder elts (step_vectype, const_nunits, 1);
7356 elts.quick_push (new_name);
7357 for (i = 1; i < const_nunits; i++)
7358 {
7359 /* Create: new_name_i = new_name + step_expr */
7360 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7361 new_name, step_expr);
7362 elts.quick_push (new_name);
7363 }
7364 /* Create a vector from [new_name_0, new_name_1, ...,
7365 new_name_nunits-1] */
7366 vec_init = gimple_build_vector (&stmts, &elts);
7367 }
7368 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7369 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7370 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, step_vectype,
7371 new_name, step_expr);
7372 else
7373 {
7374 /* Build:
7375 [base, base, base, ...]
7376 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7377 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7378 gcc_assert (flag_associative_math);
7379 tree index = build_index_vector (step_vectype, 0, 1);
7380 tree base_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7381 new_name);
7382 tree step_vec = gimple_build_vector_from_val (&stmts, step_vectype,
7383 step_expr);
7384 vec_init = gimple_build (&stmts, FLOAT_EXPR, step_vectype, index);
7385 vec_init = gimple_build (&stmts, MULT_EXPR, step_vectype,
7386 vec_init, step_vec);
7387 vec_init = gimple_build (&stmts, PLUS_EXPR, step_vectype,
7388 vec_init, base_vec);
7389 }
7390 vec_init = gimple_convert (&stmts, vectype, vec_init);
7391
7392 if (stmts)
7393 {
7394 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7395 gcc_assert (!new_bb);
7396 }
7397 }
7398
7399
7400 /* Create the vector that holds the step of the induction. */
7401 if (nested_in_vect_loop)
7402 /* iv_loop is nested in the loop to be vectorized. Generate:
7403 vec_step = [S, S, S, S] */
7404 new_name = step_expr;
7405 else
7406 {
7407 /* iv_loop is the loop to be vectorized. Generate:
7408 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7409 gimple_seq seq = NULL;
7410 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7411 {
7412 expr = build_int_cst (integer_type_node, vf);
7413 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7414 }
7415 else
7416 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7417 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7418 expr, step_expr);
7419 if (seq)
7420 {
7421 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7422 gcc_assert (!new_bb);
7423 }
7424 }
7425
7426 t = unshare_expr (new_name);
7427 gcc_assert (CONSTANT_CLASS_P (new_name)
7428 || TREE_CODE (new_name) == SSA_NAME);
7429 new_vec = build_vector_from_val (step_vectype, t);
7430 vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7431
7432
7433 /* Create the following def-use cycle:
7434 loop prolog:
7435 vec_init = ...
7436 vec_step = ...
7437 loop:
7438 vec_iv = PHI <vec_init, vec_loop>
7439 ...
7440 STMT
7441 ...
7442 vec_loop = vec_iv + vec_step; */
7443
7444 /* Create the induction-phi that defines the induction-operand. */
7445 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7446 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7447 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7448 induc_def = PHI_RESULT (induction_phi);
7449
7450 /* Create the iv update inside the loop */
7451 stmts = NULL;
7452 vec_def = gimple_convert (&stmts, step_vectype, induc_def);
7453 vec_def = gimple_build (&stmts, PLUS_EXPR, step_vectype, vec_def, vec_step);
7454 vec_def = gimple_convert (&stmts, vectype, vec_def);
7455 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7456 new_stmt = SSA_NAME_DEF_STMT (vec_def);
7457 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7458
7459 /* Set the arguments of the phi node: */
7460 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7461 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7462 UNKNOWN_LOCATION);
7463
7464 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7465
7466 /* In case the vectorization factor (VF) is bigger than the number
7467 of elements that we can fit in a vectype (nunits), we have to generate
7468 more than one vector stmt - i.e., we need to "unroll" the
7469 vector stmt by a factor of VF/nunits. For more details see the
7470 documentation in vectorizable_operation. */
7471
7472 if (ncopies > 1)
7473 {
7474 gimple_seq seq = NULL;
7475 stmt_vec_info prev_stmt_vinfo;
7476 /* FORNOW. This restriction should be relaxed. */
7477 gcc_assert (!nested_in_vect_loop);
7478
7479 /* Create the vector that holds the step of the induction. */
7480 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7481 {
7482 expr = build_int_cst (integer_type_node, nunits);
7483 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7484 }
7485 else
7486 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7487 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7488 expr, step_expr);
7489 if (seq)
7490 {
7491 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7492 gcc_assert (!new_bb);
7493 }
7494
7495 t = unshare_expr (new_name);
7496 gcc_assert (CONSTANT_CLASS_P (new_name)
7497 || TREE_CODE (new_name) == SSA_NAME);
7498 new_vec = build_vector_from_val (step_vectype, t);
7499 vec_step = vect_init_vector (stmt_info, new_vec, step_vectype, NULL);
7500
7501 vec_def = induc_def;
7502 prev_stmt_vinfo = induction_phi_info;
7503 for (i = 1; i < ncopies; i++)
7504 {
7505 /* vec_i = vec_prev + vec_step */
7506 gimple_seq stmts = NULL;
7507 vec_def = gimple_convert (&stmts, step_vectype, vec_def);
7508 vec_def = gimple_build (&stmts,
7509 PLUS_EXPR, step_vectype, vec_def, vec_step);
7510 vec_def = gimple_convert (&stmts, vectype, vec_def);
7511
7512 gsi_insert_seq_before (&si, stmts, GSI_SAME_STMT);
7513 new_stmt = SSA_NAME_DEF_STMT (vec_def);
7514 new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7515 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7516 prev_stmt_vinfo = new_stmt_info;
7517 }
7518 }
7519
7520 if (nested_in_vect_loop)
7521 {
7522 /* Find the loop-closed exit-phi of the induction, and record
7523 the final vector of induction results: */
7524 exit_phi = NULL;
7525 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7526 {
7527 gimple *use_stmt = USE_STMT (use_p);
7528 if (is_gimple_debug (use_stmt))
7529 continue;
7530
7531 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7532 {
7533 exit_phi = use_stmt;
7534 break;
7535 }
7536 }
7537 if (exit_phi)
7538 {
7539 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7540 /* FORNOW. Currently not supporting the case that an inner-loop induction
7541 is not used in the outer-loop (i.e. only outside the outer-loop). */
7542 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7543 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7544
7545 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7546 if (dump_enabled_p ())
7547 dump_printf_loc (MSG_NOTE, vect_location,
7548 "vector of inductions after inner-loop:%G",
7549 new_stmt);
7550 }
7551 }
7552
7553
7554 if (dump_enabled_p ())
7555 dump_printf_loc (MSG_NOTE, vect_location,
7556 "transform induction: created def-use cycle: %G%G",
7557 induction_phi, SSA_NAME_DEF_STMT (vec_def));
7558
7559 return true;
7560 }
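
/* Illustrative trace (assumed values) of the induction code generated
   above: for an IV with initial value 0, step 1 and VF = 4 the vector IV

     vec_iv   = PHI <{ 0, 1, 2, 3 } (preheader), vec_next (latch)>
     vec_next = vec_iv + { 4, 4, 4, 4 };

   takes the values { 0, 1, 2, 3 }, { 4, 5, 6, 7 }, { 8, 9, 10, 11 }, ...
   across the vector iterations, i.e. the first VF values of the scalar
   IV per iteration.  When ncopies > 1 the additional copies are each
   offset from the previous one by the step times nunits.  */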
7561
7562 /* Function vectorizable_live_operation.
7563
7564 STMT_INFO computes a value that is used outside the loop. Check if
7565 it can be supported. */
7566
7567 bool
7568 vectorizable_live_operation (stmt_vec_info stmt_info,
7569 gimple_stmt_iterator *gsi,
7570 slp_tree slp_node, slp_instance slp_node_instance,
7571 int slp_index, bool vec_stmt_p,
7572 stmt_vector_for_cost *)
7573 {
7574 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7575 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7576 imm_use_iterator imm_iter;
7577 tree lhs, lhs_type, bitsize, vec_bitsize;
7578 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7579 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7580 int ncopies;
7581 gimple *use_stmt;
7582 auto_vec<tree> vec_oprnds;
7583 int vec_entry = 0;
7584 poly_uint64 vec_index = 0;
7585
7586 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7587
7588 /* The last stmt of a reduction is live and vectorized via
7589 vect_create_epilog_for_reduction. vectorizable_reduction assessed
7590 validity so just trigger the transform here. */
7591 if (STMT_VINFO_REDUC_DEF (vect_orig_stmt (stmt_info)))
7592 {
7593 if (!vec_stmt_p)
7594 return true;
7595 if (slp_node)
7596 {
7597 /* For reduction chains the meta-info is attached to
7598 the group leader. */
7599 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
7600 stmt_info = REDUC_GROUP_FIRST_ELEMENT (stmt_info);
7601 /* For SLP reductions we vectorize the epilogue for
7602 all involved stmts together. */
7603 else if (slp_index != 0)
7604 return true;
7605 }
7606 stmt_vec_info reduc_info = info_for_reduction (stmt_info);
7607 gcc_assert (reduc_info->is_reduc_info);
7608 if (STMT_VINFO_REDUC_TYPE (reduc_info) == FOLD_LEFT_REDUCTION
7609 || STMT_VINFO_REDUC_TYPE (reduc_info) == EXTRACT_LAST_REDUCTION)
7610 return true;
7611 vect_create_epilog_for_reduction (stmt_info, slp_node,
7612 slp_node_instance);
7613 return true;
7614 }
7615
7616 /* FORNOW. CHECKME. */
7617 if (nested_in_vect_loop_p (loop, stmt_info))
7618 return false;
7619
7620 /* If STMT is not relevant and it is a simple assignment and its inputs are
7621 invariant then it can remain in place, unvectorized. The original last
7622 scalar value that it computes will be used. */
7623 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7624 {
7625 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
7626 if (dump_enabled_p ())
7627 dump_printf_loc (MSG_NOTE, vect_location,
7628 "statement is simple and uses invariant. Leaving in "
7629 "place.\n");
7630 return true;
7631 }
7632
7633 if (slp_node)
7634 ncopies = 1;
7635 else
7636 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7637
7638 if (slp_node)
7639 {
7640 gcc_assert (slp_index >= 0);
7641
7642 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7643 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7644
7645 /* Get the last occurrence of the scalar index from the concatenation of
7646 all the slp vectors. Calculate which slp vector it is and the index
7647 within. */
7648 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
7649
7650 /* Calculate which vector contains the result, and which lane of
7651 that vector we need. */
7652 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7653 {
7654 if (dump_enabled_p ())
7655 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7656 "Cannot determine which vector holds the"
7657 " final result.\n");
7658 return false;
7659 }
7660 }
7661
7662 if (!vec_stmt_p)
7663 {
7664 /* No transformation required. */
7665 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7666 {
7667 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7668 OPTIMIZE_FOR_SPEED))
7669 {
7670 if (dump_enabled_p ())
7671 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7672 "can't use a fully-masked loop because "
7673 "the target doesn't support extract last "
7674 "reduction.\n");
7675 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7676 }
7677 else if (slp_node)
7678 {
7679 if (dump_enabled_p ())
7680 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7681 "can't use a fully-masked loop because an "
7682 "SLP statement is live after the loop.\n");
7683 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7684 }
7685 else if (ncopies > 1)
7686 {
7687 if (dump_enabled_p ())
7688 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7689 "can't use a fully-masked loop because"
7690 " ncopies is greater than 1.\n");
7691 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7692 }
7693 else
7694 {
7695 gcc_assert (ncopies == 1 && !slp_node);
7696 vect_record_loop_mask (loop_vinfo,
7697 &LOOP_VINFO_MASKS (loop_vinfo),
7698 1, vectype);
7699 }
7700 }
7701 return true;
7702 }
7703
7704 /* Use the lhs of the original scalar statement. */
7705 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
7706
7707 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7708 : gimple_get_lhs (stmt);
7709 lhs_type = TREE_TYPE (lhs);
7710
7711 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7712 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7713 : TYPE_SIZE (TREE_TYPE (vectype)));
7714 vec_bitsize = TYPE_SIZE (vectype);
7715
7716 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
7717 tree vec_lhs, bitstart;
7718 if (slp_node)
7719 {
7720 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7721
7722 /* Get the correct slp vectorized stmt. */
7723 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
7724 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
7725 vec_lhs = gimple_phi_result (phi);
7726 else
7727 vec_lhs = gimple_get_lhs (vec_stmt);
7728
7729 /* Get entry to use. */
7730 bitstart = bitsize_int (vec_index);
7731 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7732 }
7733 else
7734 {
7735 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7736 vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
7737 gcc_checking_assert (ncopies == 1
7738 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7739
7740 /* For multiple copies, get the last copy. */
7741 for (int i = 1; i < ncopies; ++i)
7742 vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
7743
7744 /* Get the last lane in the vector. */
7745 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7746 }
7747
7748 gimple_seq stmts = NULL;
7749 tree new_tree;
7750 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7751 {
7752 /* Emit:
7753
7754 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
7755
7756 where VEC_LHS is the vectorized live-out result and MASK is
7757 the loop mask for the final iteration. */
7758 gcc_assert (ncopies == 1 && !slp_node);
7759 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
7760 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
7761 1, vectype, 0);
7762 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
7763 scalar_type, mask, vec_lhs);
7764
7765 /* Convert the extracted vector element to the required scalar type. */
7766 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
7767 }
7768 else
7769 {
7770 tree bftype = TREE_TYPE (vectype);
7771 if (VECTOR_BOOLEAN_TYPE_P (vectype))
7772 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7773 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
7774 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
7775 &stmts, true, NULL_TREE);
7776 }
7777
7778 if (stmts)
7779 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
7780
7781 /* Replace the use of lhs with the newly computed result. If the use stmt
7782 is a single-argument PHI, just replace all uses of the PHI result; this is
7783 necessary because the lcssa PHI defining lhs may be before the new stmt. */
7784 use_operand_p use_p;
7785 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
7786 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
7787 && !is_gimple_debug (use_stmt))
7788 {
7789 if (gimple_code (use_stmt) == GIMPLE_PHI
7790 && gimple_phi_num_args (use_stmt) == 1)
7791 {
7792 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
7793 }
7794 else
7795 {
7796 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
7797 SET_USE (use_p, new_tree);
7798 }
7799 update_stmt (use_stmt);
7800 }
7801
7802 return true;
7803 }
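
/* Illustrative sketch (assumed names) of the code generated above for a
   value live after the loop: the scalar result is either the last lane
   of the last vector copy,

     res = BIT_FIELD_REF <vec_lhs, bitsize, vec_bitsize - bitsize>;

   or, for a fully-masked loop, the last active lane,

     res = .EXTRACT_LAST (loop_mask, vec_lhs);

   after which every use of the original scalar lhs outside the loop is
   redirected to res.  */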
7804
7805 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
7806
7807 static void
7808 vect_loop_kill_debug_uses (class loop *loop, stmt_vec_info stmt_info)
7809 {
7810 ssa_op_iter op_iter;
7811 imm_use_iterator imm_iter;
7812 def_operand_p def_p;
7813 gimple *ustmt;
7814
7815 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
7816 {
7817 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
7818 {
7819 basic_block bb;
7820
7821 if (!is_gimple_debug (ustmt))
7822 continue;
7823
7824 bb = gimple_bb (ustmt);
7825
7826 if (!flow_bb_inside_loop_p (loop, bb))
7827 {
7828 if (gimple_debug_bind_p (ustmt))
7829 {
7830 if (dump_enabled_p ())
7831 dump_printf_loc (MSG_NOTE, vect_location,
7832 "killing debug use\n");
7833
7834 gimple_debug_bind_reset_value (ustmt);
7835 update_stmt (ustmt);
7836 }
7837 else
7838 gcc_unreachable ();
7839 }
7840 }
7841 }
7842 }
7843
7844 /* Given loop represented by LOOP_VINFO, return true if computation of
7845 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
7846 otherwise. */
7847
7848 static bool
7849 loop_niters_no_overflow (loop_vec_info loop_vinfo)
7850 {
7851 /* Constant case. */
7852 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7853 {
7854 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
7855 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
7856
7857 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
7858 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
7859 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
7860 return true;
7861 }
7862
7863 widest_int max;
7864 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7865 /* Check the upper bound of loop niters. */
7866 if (get_max_loop_iterations (loop, &max))
7867 {
7868 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
7869 signop sgn = TYPE_SIGN (type);
7870 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
7871 if (max < type_max)
7872 return true;
7873 }
7874 return false;
7875 }
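
/* Worked example (illustrative): if the loop's niter expression has an
   8-bit unsigned type and NITERSM1 is 255, then NITERS = 255 + 1 wraps
   to 0, so neither the constant check nor the upper-bound check succeeds
   and the function returns false; if instead the maximum iteration count
   is provably below the type's maximum value, NITERSM1 + 1 cannot wrap
   and the function returns true.  */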
7876
7877 /* Return a mask type with half the number of elements as TYPE. */
7878
7879 tree
7880 vect_halve_mask_nunits (tree type)
7881 {
7882 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
7883 return build_truth_vector_type (nunits, current_vector_size);
7884 }
7885
7886 /* Return a mask type with twice as many elements as TYPE. */
7887
7888 tree
7889 vect_double_mask_nunits (tree type)
7890 {
7891 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
7892 return build_truth_vector_type (nunits, current_vector_size);
7893 }
7894
7895 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
7896 contain a sequence of NVECTORS masks that each control a vector of type
7897 VECTYPE. */
7898
7899 void
7900 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
7901 unsigned int nvectors, tree vectype)
7902 {
7903 gcc_assert (nvectors != 0);
7904 if (masks->length () < nvectors)
7905 masks->safe_grow_cleared (nvectors);
7906 rgroup_masks *rgm = &(*masks)[nvectors - 1];
7907 /* The number of scalars per iteration and the number of vectors are
7908 both compile-time constants. */
7909 unsigned int nscalars_per_iter
7910 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
7911 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
7912 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
7913 {
7914 rgm->max_nscalars_per_iter = nscalars_per_iter;
7915 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
7916 }
7917 }
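
/* Worked example (illustrative numbers): with VF = 8, a request for
   nvectors = 2 masks of a 4-element vector type gives
   nscalars_per_iter = 2 * 4 / 8 = 1 and records the masks in rgroup
   number nvectors - 1 = 1; a later request for the same rgroup with a
   larger per-iteration footprint only bumps max_nscalars_per_iter (and
   the mask type) upwards.  */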
7918
7919 /* Given a complete set of masks MASKS, extract mask number INDEX
7920 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
7921 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
7922
7923 See the comment above vec_loop_masks for more details about the mask
7924 arrangement. */
7925
7926 tree
7927 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
7928 unsigned int nvectors, tree vectype, unsigned int index)
7929 {
7930 rgroup_masks *rgm = &(*masks)[nvectors - 1];
7931 tree mask_type = rgm->mask_type;
7932
7933 /* Populate the rgroup's mask array, if this is the first time we've
7934 used it. */
7935 if (rgm->masks.is_empty ())
7936 {
7937 rgm->masks.safe_grow_cleared (nvectors);
7938 for (unsigned int i = 0; i < nvectors; ++i)
7939 {
7940 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
7941 /* Provide a dummy definition until the real one is available. */
7942 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
7943 rgm->masks[i] = mask;
7944 }
7945 }
7946
7947 tree mask = rgm->masks[index];
7948 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
7949 TYPE_VECTOR_SUBPARTS (vectype)))
7950 {
7951 /* A loop mask for data type X can be reused for data type Y
7952 if X has N times more elements than Y and if Y's elements
7953 are N times bigger than X's. In this case each sequence
7954 of N elements in the loop mask will be all-zero or all-one.
7955 We can then view-convert the mask so that each sequence of
7956 N elements is replaced by a single element. */
7957 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
7958 TYPE_VECTOR_SUBPARTS (vectype)));
7959 gimple_seq seq = NULL;
7960 mask_type = build_same_sized_truth_vector_type (vectype);
7961 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
7962 if (seq)
7963 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
7964 }
7965 return mask;
7966 }
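
/* Illustrative example of the mask reuse above (assumed element counts):
   a loop mask recorded for 8 narrow elements per vector can control a
   vector of 4 elements twice as wide, because each consecutive pair of
   mask lanes is known to be all-zero or all-one; conceptually

     mask8 = { 1, 1, 1, 1, 1, 1, 0, 0 }
     mask4 = VIEW_CONVERT_EXPR <4-lane mask type> (mask8)
           = { 1, 1, 1, 0 }

   which is exactly the mask needed for the elements that are twice as
   wide.  */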
7967
7968 /* Scale profiling counters by estimation for LOOP which is vectorized
7969 by factor VF. */
7970
7971 static void
7972 scale_profile_for_vect_loop (class loop *loop, unsigned vf)
7973 {
7974 edge preheader = loop_preheader_edge (loop);
7975 /* Reduce loop iterations by the vectorization factor. */
7976 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
7977 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
7978
7979 if (freq_h.nonzero_p ())
7980 {
7981 profile_probability p;
7982
7983 /* Avoid dropping loop body profile counter to 0 because of zero count
7984 in loop's preheader. */
7985 if (!(freq_e == profile_count::zero ()))
7986 freq_e = freq_e.force_nonzero ();
7987 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
7988 scale_loop_frequencies (loop, p);
7989 }
7990
7991 edge exit_e = single_exit (loop);
7992 exit_e->probability = profile_probability::always ()
7993 .apply_scale (1, new_est_niter + 1);
7994
7995 edge exit_l = single_pred_edge (loop->latch);
7996 profile_probability prob = exit_l->probability;
7997 exit_l->probability = exit_e->probability.invert ();
7998 if (prob.initialized_p () && exit_l->probability.initialized_p ())
7999 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8000 }
8001
8002 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8003 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8004 stmt_vec_info. */
8005
8006 static void
8007 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8008 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8009 {
8010 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8011 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8012
8013 if (dump_enabled_p ())
8014 dump_printf_loc (MSG_NOTE, vect_location,
8015 "------>vectorizing statement: %G", stmt_info->stmt);
8016
8017 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8018 vect_loop_kill_debug_uses (loop, stmt_info);
8019
8020 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8021 && !STMT_VINFO_LIVE_P (stmt_info))
8022 return;
8023
8024 if (STMT_VINFO_VECTYPE (stmt_info))
8025 {
8026 poly_uint64 nunits
8027 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8028 if (!STMT_SLP_TYPE (stmt_info)
8029 && maybe_ne (nunits, vf)
8030 && dump_enabled_p ())
8031 /* For SLP the VF is set according to the unrolling factor, and not
8032 to the vector size, hence for SLP this print is not valid. */
8033 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8034 }
8035
8036 /* Pure SLP statements have already been vectorized. We still need
8037 to apply loop vectorization to hybrid SLP statements. */
8038 if (PURE_SLP_STMT (stmt_info))
8039 return;
8040
8041 if (dump_enabled_p ())
8042 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8043
8044 if (vect_transform_stmt (stmt_info, gsi, NULL, NULL))
8045 *seen_store = stmt_info;
8046 }
8047
8048 /* Function vect_transform_loop.
8049
8050 The analysis phase has determined that the loop is vectorizable.
8051 Vectorize the loop - create vectorized stmts to replace the scalar
8052 stmts in the loop, and update the loop exit condition.
8053 Returns the scalar epilogue loop, if any. */
8054
8055 class loop *
8056 vect_transform_loop (loop_vec_info loop_vinfo)
8057 {
8058 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8059 class loop *epilogue = NULL;
8060 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8061 int nbbs = loop->num_nodes;
8062 int i;
8063 tree niters_vector = NULL_TREE;
8064 tree step_vector = NULL_TREE;
8065 tree niters_vector_mult_vf = NULL_TREE;
8066 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8067 unsigned int lowest_vf = constant_lower_bound (vf);
8068 gimple *stmt;
8069 bool check_profitability = false;
8070 unsigned int th;
8071
8072 DUMP_VECT_SCOPE ("vec_transform_loop");
8073
8074 loop_vinfo->shared->check_datarefs ();
8075
8076 /* Use the more conservative vectorization threshold. If the number
8077 of iterations is constant assume the cost check has been performed
8078 by our caller. If the threshold makes all loops profitable that
8079 run at least the (estimated) vectorization factor number of times
8080 checking is pointless, too. */
8081 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8082 if (th >= vect_vf_for_cost (loop_vinfo)
8083 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8084 {
8085 if (dump_enabled_p ())
8086 dump_printf_loc (MSG_NOTE, vect_location,
8087 "Profitability threshold is %d loop iterations.\n",
8088 th);
8089 check_profitability = true;
8090 }
8091
8092 /* Make sure there exists a single-predecessor exit bb. Do this before
8093 versioning. */
8094 edge e = single_exit (loop);
8095 if (! single_pred_p (e->dest))
8096 {
8097 split_loop_exit_edge (e, true);
8098 if (dump_enabled_p ())
8099 dump_printf (MSG_NOTE, "split exit edge\n");
8100 }
8101
8102 /* Version the loop first, if required, so the profitability check
8103 comes first. */
8104
8105 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8106 {
8107 class loop *sloop
8108 = vect_loop_versioning (loop_vinfo);
8109 sloop->force_vectorize = false;
8110 check_profitability = false;
8111 }
8112
8113 /* Make sure there exists a single-predecessor exit bb also on the
8114 scalar loop copy. Do this after versioning but before peeling
8115 so CFG structure is fine for both scalar and if-converted loop
8116 to make slpeel_duplicate_current_defs_from_edges face matched
8117 loop closed PHI nodes on the exit. */
8118 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8119 {
8120 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8121 if (! single_pred_p (e->dest))
8122 {
8123 split_loop_exit_edge (e, true);
8124 if (dump_enabled_p ())
8125 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8126 }
8127 }
8128
8129 tree niters = vect_build_loop_niters (loop_vinfo);
8130 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8131 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8132 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8133 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8134 &step_vector, &niters_vector_mult_vf, th,
8135 check_profitability, niters_no_overflow);
8136 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo)
8137 && LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo).initialized_p ())
8138 scale_loop_frequencies (LOOP_VINFO_SCALAR_LOOP (loop_vinfo),
8139 LOOP_VINFO_SCALAR_LOOP_SCALING (loop_vinfo));
8140
8141 if (niters_vector == NULL_TREE)
8142 {
8143 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8144 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8145 && known_eq (lowest_vf, vf))
8146 {
8147 niters_vector
8148 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8149 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8150 step_vector = build_one_cst (TREE_TYPE (niters));
8151 }
8152 else
8153 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8154 &step_vector, niters_no_overflow);
8155 }
8156
8157 /* 1) Make sure the loop header has exactly two entries
8158 2) Make sure we have a preheader basic block. */
8159
8160 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8161
8162 split_edge (loop_preheader_edge (loop));
8163
8164 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8165 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8166 /* This will deal with any possible peeling. */
8167 vect_prepare_for_masked_peels (loop_vinfo);
8168
8169 /* Schedule the SLP instances first, then handle loop vectorization
8170 below. */
8171 if (!loop_vinfo->slp_instances.is_empty ())
8172 {
8173 DUMP_VECT_SCOPE ("scheduling SLP instances");
8174 vect_schedule_slp (loop_vinfo);
8175 }
8176
8177 /* FORNOW: the vectorizer supports only loops whose body consists
8178 of one basic block (header + empty latch). When the vectorizer
8179 supports more involved loop forms, the order in which the BBs are
8180 traversed will need to be reconsidered. */
8181
8182 for (i = 0; i < nbbs; i++)
8183 {
8184 basic_block bb = bbs[i];
8185 stmt_vec_info stmt_info;
8186
8187 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8188 gsi_next (&si))
8189 {
8190 gphi *phi = si.phi ();
8191 if (dump_enabled_p ())
8192 dump_printf_loc (MSG_NOTE, vect_location,
8193 "------>vectorizing phi: %G", phi);
8194 stmt_info = loop_vinfo->lookup_stmt (phi);
8195 if (!stmt_info)
8196 continue;
8197
8198 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8199 vect_loop_kill_debug_uses (loop, stmt_info);
8200
8201 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8202 && !STMT_VINFO_LIVE_P (stmt_info))
8203 continue;
8204
8205 if (STMT_VINFO_VECTYPE (stmt_info)
8206 && (maybe_ne
8207 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8208 && dump_enabled_p ())
8209 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8210
8211 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8212 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8213 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_double_reduction_def
8214 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle
8215 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_internal_def)
8216 && ! PURE_SLP_STMT (stmt_info))
8217 {
8218 if (dump_enabled_p ())
8219 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8220 vect_transform_stmt (stmt_info, NULL, NULL, NULL);
8221 }
8222 }
8223
8224 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8225 !gsi_end_p (si);)
8226 {
8227 stmt = gsi_stmt (si);
8228 /* During vectorization remove existing clobber stmts. */
8229 if (gimple_clobber_p (stmt))
8230 {
8231 unlink_stmt_vdef (stmt);
8232 gsi_remove (&si, true);
8233 release_defs (stmt);
8234 }
8235 else
8236 {
8237 stmt_info = loop_vinfo->lookup_stmt (stmt);
8238
8239 /* vector stmts created in the outer-loop during vectorization of
8240 stmts in an inner-loop may not have a stmt_info, and do not
8241 need to be vectorized. */
8242 stmt_vec_info seen_store = NULL;
8243 if (stmt_info)
8244 {
8245 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8246 {
8247 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8248 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8249 !gsi_end_p (subsi); gsi_next (&subsi))
8250 {
8251 stmt_vec_info pat_stmt_info
8252 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
8253 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
8254 &si, &seen_store);
8255 }
8256 stmt_vec_info pat_stmt_info
8257 = STMT_VINFO_RELATED_STMT (stmt_info);
8258 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
8259 &seen_store);
8260 }
8261 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
8262 &seen_store);
8263 }
8264 gsi_next (&si);
8265 if (seen_store)
8266 {
8267 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8268 /* Interleaving. The vectorization of the
8269 interleaving chain has been completed -
8270 free all the stores in the chain. */
8271 vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8272 else
8273 /* Free the attached stmt_vec_info and remove the stmt. */
8274 loop_vinfo->remove_stmt (stmt_info);
8275 }
8276 }
8277 }
8278
8279 /* Stub out scalar statements that must not survive vectorization.
8280 Doing this here helps with grouped statements, or statements that
8281 are involved in patterns. */
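/* E.g. (illustrative names) a scalar masked load left in place by
   pattern or group handling,

       _5 = MASK_LOAD (ptr_6, 0B, mask_7);

   is replaced below by

       _5 = 0;

   so that no scalar copy of the masked access survives the
   transformation.  */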
8282 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8283 !gsi_end_p (gsi); gsi_next (&gsi))
8284 {
8285 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8286 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8287 {
8288 tree lhs = gimple_get_lhs (call);
8289 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8290 {
8291 tree zero = build_zero_cst (TREE_TYPE (lhs));
8292 gimple *new_stmt = gimple_build_assign (lhs, zero);
8293 gsi_replace (&gsi, new_stmt, true);
8294 }
8295 }
8296 }
8297 } /* BBs in loop */
8298
8299 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8300 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8301 if (integer_onep (step_vector))
8302 niters_no_overflow = true;
8303 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8304 niters_vector_mult_vf, !niters_no_overflow);
8305
8306 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8307 scale_profile_for_vect_loop (loop, assumed_vf);
8308
8309 /* True if the final iteration might not handle a full vector's
8310 worth of scalar iterations. */
8311 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8312 /* The minimum number of iterations performed by the epilogue. This
8313 is 1 when peeling for gaps because we always need a final scalar
8314 iteration. */
8315 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8316 /* +1 to convert latch counts to loop iteration counts,
8317 -min_epilogue_iters to remove iterations that cannot be performed
8318 by the vector code. */
8319 int bias_for_lowest = 1 - min_epilogue_iters;
8320 int bias_for_assumed = bias_for_lowest;
8321 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8322 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8323 {
8324 /* When the amount of peeling is known at compile time, the first
8325 iteration will have exactly alignment_npeels active elements.
8326 In the worst case it will have at least one. */
8327 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8328 bias_for_lowest += lowest_vf - min_first_active;
8329 bias_for_assumed += assumed_vf - min_first_active;
8330 }
8331 /* In these calculations the "- 1" converts loop iteration counts
8332 back to latch counts. */
8333 if (loop->any_upper_bound)
8334 loop->nb_iterations_upper_bound
8335 = (final_iter_may_be_partial
8336 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8337 lowest_vf) - 1
8338 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8339 lowest_vf) - 1);
8340 if (loop->any_likely_upper_bound)
8341 loop->nb_iterations_likely_upper_bound
8342 = (final_iter_may_be_partial
8343 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8344 + bias_for_lowest, lowest_vf) - 1
8345 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8346 + bias_for_lowest, lowest_vf) - 1);
8347 if (loop->any_estimate)
8348 loop->nb_iterations_estimate
8349 = (final_iter_may_be_partial
8350 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8351 assumed_vf) - 1
8352 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8353 assumed_vf) - 1);
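/* Worked example (illustrative values): for a not-fully-masked loop
   with constant VF = 4, no peeling for gaps and no peeling for
   alignment, bias_for_lowest is 1.  A scalar latch bound of 9 (at most
   10 iterations) then becomes udiv_floor (9 + 1, 4) - 1 = 1, i.e. at
   most two vector iterations, leaving at most two scalar iterations
   for the epilogue.  */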
8354
8355 if (dump_enabled_p ())
8356 {
8357 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8358 {
8359 dump_printf_loc (MSG_NOTE, vect_location,
8360 "LOOP VECTORIZED\n");
8361 if (loop->inner)
8362 dump_printf_loc (MSG_NOTE, vect_location,
8363 "OUTER LOOP VECTORIZED\n");
8364 dump_printf (MSG_NOTE, "\n");
8365 }
8366 else
8367 {
8368 dump_printf_loc (MSG_NOTE, vect_location,
8369 "LOOP EPILOGUE VECTORIZED (VS=");
8370 dump_dec (MSG_NOTE, current_vector_size);
8371 dump_printf (MSG_NOTE, ")\n");
8372 }
8373 }
8374
8375 /* Loops vectorized with a variable factor won't benefit from
8376 unrolling/peeling. */
8377 if (!vf.is_constant ())
8378 {
8379 loop->unroll = 1;
8380 if (dump_enabled_p ())
8381 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
8382 " variable-length vectorization factor\n");
8383 }
8384 /* Free SLP instances here because otherwise stmt reference counting
8385 won't work. */
8386 slp_instance instance;
8387 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8388 vect_free_slp_instance (instance, true);
8389 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8390 /* Clear the safelen field since its value is invalid after vectorization:
8391 the vectorized loop can have loop-carried dependencies. */
8392 loop->safelen = 0;
8393
8394 /* Don't vectorize the epilogue of an epilogue loop. */
8395 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8396 epilogue = NULL;
8397
8398 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8399 epilogue = NULL;
8400
8401 if (epilogue)
8402 {
8403 auto_vector_sizes vector_sizes;
8404 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes, false);
8405 unsigned int next_size = 0;
8406
8407 /* Note LOOP_VINFO_NITERS_KNOWN_P and LOOP_VINFO_INT_NITERS work
8408 on niters already adjusted for the iterations of the prologue. */
8409 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8410 && known_eq (vf, lowest_vf))
8411 {
8412 unsigned HOST_WIDE_INT eiters
8413 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8414 - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
8415 eiters
8416 = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
8417 epilogue->nb_iterations_upper_bound = eiters - 1;
8418 epilogue->any_upper_bound = true;
8419
8420 unsigned int ratio;
8421 while (next_size < vector_sizes.length ()
8422 && !(constant_multiple_p (current_vector_size,
8423 vector_sizes[next_size], &ratio)
8424 && eiters >= lowest_vf / ratio))
8425 next_size += 1;
8426 }
8427 else
8428 while (next_size < vector_sizes.length ()
8429 && maybe_lt (current_vector_size, vector_sizes[next_size]))
8430 next_size += 1;
8431
8432 if (next_size == vector_sizes.length ())
8433 epilogue = NULL;
8434 }
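/* Worked example of the size selection above, with illustrative target
   numbers: suppose current_vector_size is 32 bytes, the VF is 8 and the
   scalar loop runs 20 iterations, so eiters = 20 % 8 = 4.  A 32-byte
   candidate (ratio 1) would need eiters >= 8 and is skipped; a 16-byte
   candidate has ratio 2 and only needs eiters >= 8 / 2 = 4, so the
   search stops there and the epilogue remains a candidate for
   vectorization at the smaller size.  */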
8435
8436 if (epilogue)
8437 {
8438 epilogue->force_vectorize = loop->force_vectorize;
8439 epilogue->safelen = loop->safelen;
8440 epilogue->dont_vectorize = false;
8441
8442 /* We may need to if-convert epilogue to vectorize it. */
8443 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8444 tree_if_conversion (epilogue);
8445 }
8446
8447 return epilogue;
8448 }
8449
8450 /* The code below performs a simple optimization - it reverts
8451 if-conversion for masked stores: if the mask of a store is zero,
8452 skip the store and, where possible, the producers of the stored values.
8453 For example,
8454 for (i=0; i<n; i++)
8455 if (c[i])
8456 {
8457 p1[i] += 1;
8458 p2[i] = p3[i] + 2;
8459 }
8460 this transformation will produce the following semi-hammock:
8461
8462 if (mask__ifc__42.18_165 != { 0, 0, 0, 0, 0, 0, 0, 0 })
8463 {
8464 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8465 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8466 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8467 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8468 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8469 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8470 }
8471 */
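/* In CFG terms the function below turns each such block into

       bb:        ... compute mask ...
                  if (mask == { 0, ..., 0 })  goto join_bb;
       store_bb:  the masked stores and, when possible, the statements
                  producing the stored values
       join_bb:   the code following the original masked stores

   (an illustrative sketch of the code below).  */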
8472
8473 void
8474 optimize_mask_stores (class loop *loop)
8475 {
8476 basic_block *bbs = get_loop_body (loop);
8477 unsigned nbbs = loop->num_nodes;
8478 unsigned i;
8479 basic_block bb;
8480 class loop *bb_loop;
8481 gimple_stmt_iterator gsi;
8482 gimple *stmt;
8483 auto_vec<gimple *> worklist;
8484 auto_purge_vect_location sentinel;
8485
8486 vect_location = find_loop_location (loop);
8487 /* Pick up all masked stores in loop if any. */
8488 for (i = 0; i < nbbs; i++)
8489 {
8490 bb = bbs[i];
8491 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8492 gsi_next (&gsi))
8493 {
8494 stmt = gsi_stmt (gsi);
8495 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8496 worklist.safe_push (stmt);
8497 }
8498 }
8499
8500 free (bbs);
8501 if (worklist.is_empty ())
8502 return;
8503
8504 /* Loop has masked stores. */
8505 while (!worklist.is_empty ())
8506 {
8507 gimple *last, *last_store;
8508 edge e, efalse;
8509 tree mask;
8510 basic_block store_bb, join_bb;
8511 gimple_stmt_iterator gsi_to;
8512 tree vdef, new_vdef;
8513 gphi *phi;
8514 tree vectype;
8515 tree zero;
8516
8517 last = worklist.pop ();
8518 mask = gimple_call_arg (last, 2);
8519 bb = gimple_bb (last);
8520 /* Create then_bb and an if-then structure in the CFG; then_bb belongs
8521 to the same loop as if_bb. That loop could differ from LOOP when a
8522 two-level loop nest is vectorized and the mask store belongs to the
8523 inner loop. */
8524 e = split_block (bb, last);
8525 bb_loop = bb->loop_father;
8526 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8527 join_bb = e->dest;
8528 store_bb = create_empty_bb (bb);
8529 add_bb_to_loop (store_bb, bb_loop);
8530 e->flags = EDGE_TRUE_VALUE;
8531 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8532 /* Put STORE_BB on the cold path: its incoming edge EFALSE is given an unlikely probability. */
8533 efalse->probability = profile_probability::unlikely ();
8534 store_bb->count = efalse->count ();
8535 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8536 if (dom_info_available_p (CDI_DOMINATORS))
8537 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8538 if (dump_enabled_p ())
8539 dump_printf_loc (MSG_NOTE, vect_location,
8540 "Create new block %d to sink mask stores.",
8541 store_bb->index);
8542 /* Create vector comparison with boolean result. */
8543 vectype = TREE_TYPE (mask);
8544 zero = build_zero_cst (vectype);
8545 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8546 gsi = gsi_last_bb (bb);
8547 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
8548 /* Create new PHI node for vdef of the last masked store:
8549 .MEM_2 = VDEF <.MEM_1>
8550 will be converted to
8551 .MEM.3 = VDEF <.MEM_1>
8552 and new PHI node will be created in join bb
8553 .MEM_2 = PHI <.MEM_1, .MEM_3>
8554 */
8555 vdef = gimple_vdef (last);
8556 new_vdef = make_ssa_name (gimple_vop (cfun), last);
8557 gimple_set_vdef (last, new_vdef);
8558 phi = create_phi_node (vdef, join_bb);
8559 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8560
8561 /* Put all masked stores with the same mask to STORE_BB if possible. */
8562 while (true)
8563 {
8564 gimple_stmt_iterator gsi_from;
8565 gimple *stmt1 = NULL;
8566
8567 /* Move masked store to STORE_BB. */
8568 last_store = last;
8569 gsi = gsi_for_stmt (last);
8570 gsi_from = gsi;
8571 /* Shift GSI to the previous stmt for further traversal. */
8572 gsi_prev (&gsi);
8573 gsi_to = gsi_start_bb (store_bb);
8574 gsi_move_before (&gsi_from, &gsi_to);
8575 /* Setup GSI_TO to the non-empty block start. */
8576 gsi_to = gsi_start_bb (store_bb);
8577 if (dump_enabled_p ())
8578 dump_printf_loc (MSG_NOTE, vect_location,
8579 "Move stmt to created bb\n%G", last);
8580 /* Move all stored value producers if possible. */
8581 while (!gsi_end_p (gsi))
8582 {
8583 tree lhs;
8584 imm_use_iterator imm_iter;
8585 use_operand_p use_p;
8586 bool res;
8587
8588 /* Skip debug statements. */
8589 if (is_gimple_debug (gsi_stmt (gsi)))
8590 {
8591 gsi_prev (&gsi);
8592 continue;
8593 }
8594 stmt1 = gsi_stmt (gsi);
8595 /* Do not consider statements writing to memory or having
8596 a volatile operand. */
8597 if (gimple_vdef (stmt1)
8598 || gimple_has_volatile_ops (stmt1))
8599 break;
8600 gsi_from = gsi;
8601 gsi_prev (&gsi);
8602 lhs = gimple_get_lhs (stmt1);
8603 if (!lhs)
8604 break;
8605
8606 /* LHS of vectorized stmt must be SSA_NAME. */
8607 if (TREE_CODE (lhs) != SSA_NAME)
8608 break;
8609
8610 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8611 {
8612 /* Remove dead scalar statement. */
8613 if (has_zero_uses (lhs))
8614 {
8615 gsi_remove (&gsi_from, true);
8616 continue;
8617 }
8618 }
8619
8620 /* Check that LHS does not have uses outside of STORE_BB. */
8621 res = true;
8622 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
8623 {
8624 gimple *use_stmt;
8625 use_stmt = USE_STMT (use_p);
8626 if (is_gimple_debug (use_stmt))
8627 continue;
8628 if (gimple_bb (use_stmt) != store_bb)
8629 {
8630 res = false;
8631 break;
8632 }
8633 }
8634 if (!res)
8635 break;
8636
8637 if (gimple_vuse (stmt1)
8638 && gimple_vuse (stmt1) != gimple_vuse (last_store))
8639 break;
8640
8641 /* Can move STMT1 to STORE_BB. */
8642 if (dump_enabled_p ())
8643 dump_printf_loc (MSG_NOTE, vect_location,
8644 "Move stmt to created bb\n%G", stmt1);
8645 gsi_move_before (&gsi_from, &gsi_to);
8646 /* Shift GSI_TO for further insertion. */
8647 gsi_prev (&gsi_to);
8648 }
8649 /* Put other masked stores with the same mask to STORE_BB. */
8650 if (worklist.is_empty ()
8651 || gimple_call_arg (worklist.last (), 2) != mask
8652 || worklist.last () != stmt1)
8653 break;
8654 last = worklist.pop ();
8655 }
8656 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
8657 }
8658 }
8659
8660 /* Decide whether it is possible to use a zero-based induction variable
8661 when vectorizing LOOP_VINFO with a fully-masked loop. If it is,
8662 return the value that the induction variable must be able to hold
8663 in order to ensure that the loop ends with an all-false mask.
8664 Return -1 otherwise. */
8665 widest_int
8666 vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo)
8667 {
8668 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
8669 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8670 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
8671
8672 /* Calculate the value that the induction variable must be able
8673 to hit in order to ensure that we end the loop with an all-false mask.
8674 This involves adding the maximum number of inactive trailing scalar
8675 iterations. */
8676 widest_int iv_limit = -1;
8677 if (max_loop_iterations (loop, &iv_limit))
8678 {
8679 if (niters_skip)
8680 {
8681 /* Add the maximum number of skipped iterations to the
8682 maximum iteration count. */
8683 if (TREE_CODE (niters_skip) == INTEGER_CST)
8684 iv_limit += wi::to_widest (niters_skip);
8685 else
8686 iv_limit += max_vf - 1;
8687 }
8688 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
8689 /* Make a conservatively-correct assumption. */
8690 iv_limit += max_vf - 1;
8691
8692 /* IV_LIMIT is the maximum number of latch iterations, which is also
8693 the maximum in-range IV value. Round this value down to the previous
8694 vector alignment boundary and then add an extra full iteration. */
8695 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8696 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
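/* Worked example (illustrative values): with a constant VF of 8,
   MAX_VF = 8, no peeling and no skipped iterations, a maximum latch
   count of 17 gives (17 & -8) + 8 = 16 + 8 = 24, so the IV must be
   able to reach 24 for the final mask to be all-false.  */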
8697 }
8698 return iv_limit;
8699 }
8700