1 /* Loop Vectorization
2 Copyright (C) 2003-2019 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57
58 /* Loop Vectorization Pass.
59
60 This pass tries to vectorize loops.
61
62 For example, the vectorizer transforms the following simple loop:
63
64 short a[N]; short b[N]; short c[N]; int i;
65
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
68 }
69
70 as if it had been manually vectorized by rewriting the source code into:
71
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
76
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
82 }
83
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
95
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
101
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
106
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
117
118 For example, say stmt S1 was vectorized into stmt VS1:
119
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
123
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
128
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
133
134 Operands that are not SSA_NAMEs, are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
136
137 Target modeling:
138 =================
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that can support different sizes of vectors will, for now, need
142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
144
145 Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
151
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
154 */
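
/* Illustrative sketch only (this is not code the pass emits verbatim):
   when N is not a multiple of the number of elements per vector, the
   vector loop above is conceptually followed by a scalar epilogue that
   handles the remaining iterations, e.g.

     for (i = 0; i + 8 <= N; i += 8)
       pa[i/8] = pb[i/8] + pc[i/8];
     for (; i < N; i++)
       a[i] = b[i] + c[i];

   On targets with predicated vector operations the tail can instead be
   handled by a fully-masked vector loop (see LOOP_VINFO_FULLY_MASKED_P
   below).  */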
155
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
157
158 /* Subroutine of vect_determine_vf_for_stmt that handles only one
159 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
160 may already be set for general statements (not just data refs). */
161
162 static opt_result
163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
164 bool vectype_maybe_set_p,
165 poly_uint64 *vf,
166 vec<stmt_vec_info > *mask_producers)
167 {
168 gimple *stmt = stmt_info->stmt;
169
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
173 {
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return opt_result::success ();
177 }
178
179 tree stmt_vectype, nunits_vectype;
180 opt_result res = vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
181 &nunits_vectype);
182 if (!res)
183 return res;
184
185 if (stmt_vectype)
186 {
187 if (STMT_VINFO_VECTYPE (stmt_info))
188 /* The only case when a vectype had been already set is for stmts
189 that contain a data ref, or for "pattern-stmts" (stmts generated
190 by the vectorizer to represent/replace a certain idiom). */
191 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
192 || vectype_maybe_set_p)
193 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
194 else if (stmt_vectype == boolean_type_node)
195 mask_producers->safe_push (stmt_info);
196 else
197 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
198 }
199
200 if (nunits_vectype)
201 vect_update_max_nunits (vf, nunits_vectype);
202
203 return opt_result::success ();
204 }
205
206 /* Subroutine of vect_determine_vectorization_factor. Set the vector
207 types of STMT_INFO and all attached pattern statements and update
208 the vectorization factor VF accordingly. If some of the statements
209 produce a mask result whose vector type can only be calculated later,
210 add them to MASK_PRODUCERS. Return true on success or false if
211 something prevented vectorization. */
212
213 static opt_result
214 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
215 vec<stmt_vec_info > *mask_producers)
216 {
217 vec_info *vinfo = stmt_info->vinfo;
218 if (dump_enabled_p ())
219 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
220 stmt_info->stmt);
221 opt_result res
222 = vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers);
223 if (!res)
224 return res;
225
226 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
227 && STMT_VINFO_RELATED_STMT (stmt_info))
228 {
229 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
230 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
231
232 /* If a pattern statement has def stmts, analyze them too. */
233 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
234 !gsi_end_p (si); gsi_next (&si))
235 {
236 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
237 if (dump_enabled_p ())
238 dump_printf_loc (MSG_NOTE, vect_location,
239 "==> examining pattern def stmt: %G",
240 def_stmt_info->stmt);
241 res = vect_determine_vf_for_stmt_1 (def_stmt_info, true,
242 vf, mask_producers);
245 if (!res)
246 return res;
247 }
248
249 if (dump_enabled_p ())
250 dump_printf_loc (MSG_NOTE, vect_location,
251 "==> examining pattern statement: %G",
252 stmt_info->stmt);
253 res = vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers);
254 if (!res)
255 return res;
256 }
257
258 return opt_result::success ();
259 }
260
261 /* Function vect_determine_vectorization_factor
262
263 Determine the vectorization factor (VF). VF is the number of data elements
264 that are operated upon in parallel in a single iteration of the vectorized
265 loop. For example, when vectorizing a loop that operates on 4byte elements,
266 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
267 elements can fit in a single vector register.
268
269 We currently support vectorization of loops in which all types operated upon
270 are of the same size. Therefore this function currently sets VF according to
271 the size of the types operated upon, and fails if there are multiple sizes
272 in the loop.
273
274 VF is also the factor by which the loop iterations are strip-mined, e.g.:
275 original loop:
276 for (i=0; i<N; i++){
277 a[i] = b[i] + c[i];
278 }
279
280 vectorized loop:
281 for (i=0; i<N; i+=VF){
282 a[i:VF] = b[i:VF] + c[i:VF];
283 }
284 */
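
/* Worked example (illustrative numbers): with a 16-byte vector size, a
   loop operating on "short" (2-byte) elements uses an 8-unit vector type
   such as V8HI, giving VF = 8, while a loop operating on "int" (4-byte)
   elements uses a 4-unit vector type such as V4SI, giving VF = 4.  The
   code below accumulates the maximum number of units required by any
   statement via vect_update_max_nunits.  */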
285
286 static opt_result
287 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
288 {
289 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
290 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
291 unsigned nbbs = loop->num_nodes;
292 poly_uint64 vectorization_factor = 1;
293 tree scalar_type = NULL_TREE;
294 gphi *phi;
295 tree vectype;
296 stmt_vec_info stmt_info;
297 unsigned i;
298 auto_vec<stmt_vec_info> mask_producers;
299
300 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
301
302 for (i = 0; i < nbbs; i++)
303 {
304 basic_block bb = bbs[i];
305
306 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
307 gsi_next (&si))
308 {
309 phi = si.phi ();
310 stmt_info = loop_vinfo->lookup_stmt (phi);
311 if (dump_enabled_p ())
312 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
313 phi);
314
315 gcc_assert (stmt_info);
316
317 if (STMT_VINFO_RELEVANT_P (stmt_info)
318 || STMT_VINFO_LIVE_P (stmt_info))
319 {
320 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
321 scalar_type = TREE_TYPE (PHI_RESULT (phi));
322
323 if (dump_enabled_p ())
324 dump_printf_loc (MSG_NOTE, vect_location,
325 "get vectype for scalar type: %T\n",
326 scalar_type);
327
328 vectype = get_vectype_for_scalar_type (scalar_type);
329 if (!vectype)
330 return opt_result::failure_at (phi,
331 "not vectorized: unsupported "
332 "data-type %T\n",
333 scalar_type);
334 STMT_VINFO_VECTYPE (stmt_info) = vectype;
335
336 if (dump_enabled_p ())
337 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
338 vectype);
339
340 if (dump_enabled_p ())
341 {
342 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
343 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
344 dump_printf (MSG_NOTE, "\n");
345 }
346
347 vect_update_max_nunits (&vectorization_factor, vectype);
348 }
349 }
350
351 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
352 gsi_next (&si))
353 {
354 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
355 opt_result res
356 = vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
357 &mask_producers);
358 if (!res)
359 return res;
360 }
361 }
362
363 /* TODO: Analyze cost. Decide if worth while to vectorize. */
364 if (dump_enabled_p ())
365 {
366 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
367 dump_dec (MSG_NOTE, vectorization_factor);
368 dump_printf (MSG_NOTE, "\n");
369 }
370
371 if (known_le (vectorization_factor, 1U))
372 return opt_result::failure_at (vect_location,
373 "not vectorized: unsupported data-type\n");
374 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
375
376 for (i = 0; i < mask_producers.length (); i++)
377 {
378 stmt_info = mask_producers[i];
379 opt_tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
380 if (!mask_type)
381 return opt_result::propagate_failure (mask_type);
382 STMT_VINFO_VECTYPE (stmt_info) = mask_type;
383 }
384
385 return opt_result::success ();
386 }
387
388
389 /* Function vect_is_simple_iv_evolution.
390
391 FORNOW: A simple evolution of an induction variable in the loop is
392 considered a polynomial evolution. */
393
394 static bool
395 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
396 tree * step)
397 {
398 tree init_expr;
399 tree step_expr;
400 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
401 basic_block bb;
402
403 /* When there is no evolution in this loop, the evolution function
404 is not "simple". */
405 if (evolution_part == NULL_TREE)
406 return false;
407
408 /* When the evolution is a polynomial of degree >= 2
409 the evolution function is not "simple". */
410 if (tree_is_chrec (evolution_part))
411 return false;
412
413 step_expr = evolution_part;
414 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
415
416 if (dump_enabled_p ())
417 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
418 step_expr, init_expr);
419
420 *init = init_expr;
421 *step = step_expr;
422
423 if (TREE_CODE (step_expr) != INTEGER_CST
424 && (TREE_CODE (step_expr) != SSA_NAME
425 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
426 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
427 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
428 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
429 || !flag_associative_math)))
430 && (TREE_CODE (step_expr) != REAL_CST
431 || !flag_associative_math))
432 {
433 if (dump_enabled_p ())
434 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
435 "step unknown.\n");
436 return false;
437 }
438
439 return true;
440 }
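
/* Illustrative example: for

     for (i = 0; i < n; i++)
       a[i] = ...;

   the scalar evolution of "i" in this loop is the chrec {0, +, 1}, so
   *INIT is 0 and *STEP is 1 and the evolution is considered simple.  An
   IV whose step itself varies across iterations (a chrec of degree two
   or more) is rejected above.  */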
441
442 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
443 what we are assuming is a double reduction. For example, given
444 a structure like this:
445
446 outer1:
447 x_1 = PHI <x_4(outer2), ...>;
448 ...
449
450 inner:
451 x_2 = PHI <x_1(outer1), ...>;
452 ...
453 x_3 = ...;
454 ...
455
456 outer2:
457 x_4 = PHI <x_3(inner)>;
458 ...
459
460 outer loop analysis would treat x_1 as a double reduction phi and
461 this function would then return true for x_2. */
462
463 static bool
464 vect_inner_phi_in_double_reduction_p (stmt_vec_info stmt_info, gphi *phi)
465 {
466 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
467 use_operand_p use_p;
468 ssa_op_iter op_iter;
469 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
470 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
471 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
472 return true;
473 return false;
474 }
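
/* A source-level shape that produces the PHI structure described above
   (illustrative only) is a sum accumulated across a loop nest when the
   outer loop is the one being vectorized:

     int sum = 0;
     for (i = 0; i < n; i++)
       for (j = 0; j < m; j++)
         sum += a[i][j];

   The outer-loop PHI of "sum" corresponds to x_1 (the double reduction
   PHI) and the inner-loop PHI corresponds to x_2, the one detected by
   the function above.  */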
475
476 /* Function vect_analyze_scalar_cycles_1.
477
478 Examine the cross iteration def-use cycles of scalar variables
479 in LOOP. LOOP_VINFO represents the loop that is now being
480 considered for vectorization (can be LOOP, or an outer-loop
481 enclosing LOOP). */
482
483 static void
484 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
485 {
486 basic_block bb = loop->header;
487 tree init, step;
488 auto_vec<stmt_vec_info, 64> worklist;
489 gphi_iterator gsi;
490 bool double_reduc;
491
492 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
493
494 /* First - identify all inductions. Reduction detection assumes that all the
495 inductions have been identified, therefore, this order must not be
496 changed. */
497 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
498 {
499 gphi *phi = gsi.phi ();
500 tree access_fn = NULL;
501 tree def = PHI_RESULT (phi);
502 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
503
504 if (dump_enabled_p ())
505 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
506
507 /* Skip virtual phi's. The data dependences that are associated with
508 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
509 if (virtual_operand_p (def))
510 continue;
511
512 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
513
514 /* Analyze the evolution function. */
515 access_fn = analyze_scalar_evolution (loop, def);
516 if (access_fn)
517 {
518 STRIP_NOPS (access_fn);
519 if (dump_enabled_p ())
520 dump_printf_loc (MSG_NOTE, vect_location,
521 "Access function of PHI: %T\n", access_fn);
522 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
523 = initial_condition_in_loop_num (access_fn, loop->num);
524 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
525 = evolution_part_in_loop_num (access_fn, loop->num);
526 }
527
528 if (!access_fn
529 || vect_inner_phi_in_double_reduction_p (stmt_vinfo, phi)
530 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
531 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
532 && TREE_CODE (step) != INTEGER_CST))
533 {
534 worklist.safe_push (stmt_vinfo);
535 continue;
536 }
537
538 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
539 != NULL_TREE);
540 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
541
542 if (dump_enabled_p ())
543 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
544 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
545 }
546
547
548 /* Second - identify all reductions and nested cycles. */
549 while (worklist.length () > 0)
550 {
551 stmt_vec_info stmt_vinfo = worklist.pop ();
552 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
553 tree def = PHI_RESULT (phi);
554
555 if (dump_enabled_p ())
556 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
557
558 gcc_assert (!virtual_operand_p (def)
559 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
560
561 stmt_vec_info reduc_stmt_info
562 = vect_force_simple_reduction (loop_vinfo, stmt_vinfo,
563 &double_reduc, false);
564 if (reduc_stmt_info)
565 {
566 if (double_reduc)
567 {
568 if (dump_enabled_p ())
569 dump_printf_loc (MSG_NOTE, vect_location,
570 "Detected double reduction.\n");
571
572 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
573 STMT_VINFO_DEF_TYPE (reduc_stmt_info)
574 = vect_double_reduction_def;
575 }
576 else
577 {
578 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
579 {
580 if (dump_enabled_p ())
581 dump_printf_loc (MSG_NOTE, vect_location,
582 "Detected vectorizable nested cycle.\n");
583
584 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
585 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_nested_cycle;
586 }
587 else
588 {
589 if (dump_enabled_p ())
590 dump_printf_loc (MSG_NOTE, vect_location,
591 "Detected reduction.\n");
592
593 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
594 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
595 /* Store the reduction cycles for possible vectorization in
596 loop-aware SLP if it was not detected as reduction
597 chain. */
598 if (! REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
599 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
600 (reduc_stmt_info);
601 }
602 }
603 }
604 else
605 if (dump_enabled_p ())
606 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
607 "Unknown def-use cycle pattern.\n");
608 }
609 }
610
611
612 /* Function vect_analyze_scalar_cycles.
613
614 Examine the cross iteration def-use cycles of scalar variables, by
615 analyzing the loop-header PHIs of scalar variables. Classify each
616 cycle as one of the following: invariant, induction, reduction, unknown.
617 We do that for the loop represented by LOOP_VINFO, and also for its
618 inner-loop, if one exists.
619 Examples for scalar cycles:
620
621 Example1: reduction:
622
623 loop1:
624 for (i=0; i<N; i++)
625 sum += a[i];
626
627 Example2: induction:
628
629 loop2:
630 for (i=0; i<N; i++)
631 a[i] = i; */
632
633 static void
634 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
635 {
636 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
637
638 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
639
640 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
641 Reductions in such inner-loop therefore have different properties than
642 the reductions in the nest that gets vectorized:
643 1. When vectorized, they are executed in the same order as in the original
644 scalar loop, so we can't change the order of computation when
645 vectorizing them.
646 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
647 current checks are too strict. */
648
649 if (loop->inner)
650 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
651 }
652
653 /* Transfer group and reduction information from STMT_INFO to its
654 pattern stmt. */
655
656 static void
657 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
658 {
659 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
660 stmt_vec_info stmtp;
661 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
662 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
663 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
664 do
665 {
666 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
667 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
668 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
669 if (stmt_info)
670 REDUC_GROUP_NEXT_ELEMENT (stmtp)
671 = STMT_VINFO_RELATED_STMT (stmt_info);
672 }
673 while (stmt_info);
674 STMT_VINFO_DEF_TYPE (stmtp) = vect_reduction_def;
675 }
676
677 /* Fixup scalar cycles that now have their stmts detected as patterns. */
678
679 static void
680 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
681 {
682 stmt_vec_info first;
683 unsigned i;
684
685 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
686 if (STMT_VINFO_IN_PATTERN_P (first))
687 {
688 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
689 while (next)
690 {
691 if (! STMT_VINFO_IN_PATTERN_P (next))
692 break;
693 next = REDUC_GROUP_NEXT_ELEMENT (next);
694 }
695 /* If not all stmts in the chain are patterns, try to handle
696 the chain without patterns. */
697 if (! next)
698 {
699 vect_fixup_reduc_chain (first);
700 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
701 = STMT_VINFO_RELATED_STMT (first);
702 }
703 }
704 }
705
706 /* Function vect_get_loop_niters.
707
708 Determine how many iterations the loop is executed and place it
709 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
710 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
711 niter information holds in ASSUMPTIONS.
712
713 Return the loop exit condition. */
714
715
716 static gcond *
717 vect_get_loop_niters (struct loop *loop, tree *assumptions,
718 tree *number_of_iterations, tree *number_of_iterationsm1)
719 {
720 edge exit = single_exit (loop);
721 struct tree_niter_desc niter_desc;
722 tree niter_assumptions, niter, may_be_zero;
723 gcond *cond = get_loop_exit_condition (loop);
724
725 *assumptions = boolean_true_node;
726 *number_of_iterationsm1 = chrec_dont_know;
727 *number_of_iterations = chrec_dont_know;
728 DUMP_VECT_SCOPE ("get_loop_niters");
729
730 if (!exit)
731 return cond;
732
733 may_be_zero = NULL_TREE;
734 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
735 || chrec_contains_undetermined (niter_desc.niter))
736 return cond;
737
738 niter_assumptions = niter_desc.assumptions;
739 may_be_zero = niter_desc.may_be_zero;
740 niter = niter_desc.niter;
741
742 if (may_be_zero && integer_zerop (may_be_zero))
743 may_be_zero = NULL_TREE;
744
745 if (may_be_zero)
746 {
747 if (COMPARISON_CLASS_P (may_be_zero))
748 {
749 /* Try to combine may_be_zero with assumptions, this can simplify
750 computation of niter expression. */
751 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
752 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
753 niter_assumptions,
754 fold_build1 (TRUTH_NOT_EXPR,
755 boolean_type_node,
756 may_be_zero));
757 else
758 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
759 build_int_cst (TREE_TYPE (niter), 0),
760 rewrite_to_non_trapping_overflow (niter));
761
762 may_be_zero = NULL_TREE;
763 }
764 else if (integer_nonzerop (may_be_zero))
765 {
766 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
767 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
768 return cond;
769 }
770 else
771 return cond;
772 }
773
774 *assumptions = niter_assumptions;
775 *number_of_iterationsm1 = niter;
776
777 /* We want the number of loop header executions which is the number
778 of latch executions plus one.
779 ??? For UINT_MAX latch executions this number overflows to zero
780 for loops like do { n++; } while (n != 0); */
781 if (niter && !chrec_contains_undetermined (niter))
782 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
783 build_int_cst (TREE_TYPE (niter), 1));
784 *number_of_iterations = niter;
785
786 return cond;
787 }
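
/* Worked example: for

     for (i = 0; i < n; i++)
       ...

   with n > 0 the latch executes n - 1 times, so NUMBER_OF_ITERATIONSM1
   is n - 1 and NUMBER_OF_ITERATIONS is n.  The two are tracked
   separately because of the overflow noted above: if the latch executes
   UINT_MAX times, adding 1 in a 32-bit unsigned type wraps the header
   count around to 0.  */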
788
789 /* Function bb_in_loop_p
790
791 Used as predicate for dfs order traversal of the loop bbs. */
792
793 static bool
794 bb_in_loop_p (const_basic_block bb, const void *data)
795 {
796 const struct loop *const loop = (const struct loop *)data;
797 if (flow_bb_inside_loop_p (loop, bb))
798 return true;
799 return false;
800 }
801
802
803 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
804 stmt_vec_info structs for all the stmts in LOOP_IN. */
805
806 _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared)
807 : vec_info (vec_info::loop, init_cost (loop_in), shared),
808 loop (loop_in),
809 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
810 num_itersm1 (NULL_TREE),
811 num_iters (NULL_TREE),
812 num_iters_unchanged (NULL_TREE),
813 num_iters_assumptions (NULL_TREE),
814 th (0),
815 versioning_threshold (0),
816 vectorization_factor (0),
817 max_vectorization_factor (0),
818 mask_skip_niters (NULL_TREE),
819 mask_compare_type (NULL_TREE),
820 simd_if_cond (NULL_TREE),
821 unaligned_dr (NULL),
822 peeling_for_alignment (0),
823 ptr_mask (0),
824 ivexpr_map (NULL),
825 scan_map (NULL),
826 slp_unrolling_factor (1),
827 single_scalar_iteration_cost (0),
828 vectorizable (false),
829 can_fully_mask_p (true),
830 fully_masked_p (false),
831 peeling_for_gaps (false),
832 peeling_for_niter (false),
833 operands_swapped (false),
834 no_data_dependencies (false),
835 has_mask_store (false),
836 scalar_loop (NULL),
837 orig_loop_info (NULL)
838 {
839 /* CHECKME: We want to visit all BBs before their successors (except for
840 latch blocks, for which this assertion wouldn't hold). In the simple
841 case of the loop forms we allow, a dfs order of the BBs would be the same
842 as reversed postorder traversal, so we are safe. */
843
844 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
845 bbs, loop->num_nodes, loop);
846 gcc_assert (nbbs == loop->num_nodes);
847
848 for (unsigned int i = 0; i < nbbs; i++)
849 {
850 basic_block bb = bbs[i];
851 gimple_stmt_iterator si;
852
853 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
854 {
855 gimple *phi = gsi_stmt (si);
856 gimple_set_uid (phi, 0);
857 add_stmt (phi);
858 }
859
860 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
861 {
862 gimple *stmt = gsi_stmt (si);
863 gimple_set_uid (stmt, 0);
864 add_stmt (stmt);
865 /* If the .GOMP_SIMD_LANE call for the current loop has 3 arguments, the
866 third argument is the #pragma omp simd if (x) condition.  When it is 0
867 the loop shouldn't be vectorized; when it is a non-zero constant it
868 should be vectorized normally; otherwise the loop is versioned, with
869 the vectorized copy used if the condition is non-zero at runtime. */
870 if (loop_in->simduid
871 && is_gimple_call (stmt)
872 && gimple_call_internal_p (stmt)
873 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
874 && gimple_call_num_args (stmt) >= 3
875 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
876 && (loop_in->simduid
877 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
878 {
879 tree arg = gimple_call_arg (stmt, 2);
880 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
881 simd_if_cond = arg;
882 else
883 gcc_assert (integer_nonzerop (arg));
884 }
885 }
886 }
887 }
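
/* Source-level illustration of the simd_if_cond handling above
   (illustrative only):

     #pragma omp simd if (c)
     for (i = 0; i < n; i++)
       a[i] = b[i];

   The IFN_GOMP_SIMD_LANE call created for this loop carries "c" as its
   third argument: if "c" is the constant 0 the loop is not vectorized,
   if it is a non-zero constant it is vectorized normally, and otherwise
   the vectorized loop is versioned on "c" being non-zero at runtime.  */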
888
889 /* Free all levels of MASKS. */
890
891 void
892 release_vec_loop_masks (vec_loop_masks *masks)
893 {
894 rgroup_masks *rgm;
895 unsigned int i;
896 FOR_EACH_VEC_ELT (*masks, i, rgm)
897 rgm->masks.release ();
898 masks->release ();
899 }
900
901 /* Free all memory used by the _loop_vec_info, as well as all the
902 stmt_vec_info structs of all the stmts in the loop. */
903
904 _loop_vec_info::~_loop_vec_info ()
905 {
906 int nbbs;
907 gimple_stmt_iterator si;
908 int j;
909
910 nbbs = loop->num_nodes;
911 for (j = 0; j < nbbs; j++)
912 {
913 basic_block bb = bbs[j];
914 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
915 {
916 gimple *stmt = gsi_stmt (si);
917
918 /* We may have broken canonical form by moving a constant
919 into RHS1 of a commutative op. Fix such occurrences. */
920 if (operands_swapped && is_gimple_assign (stmt))
921 {
922 enum tree_code code = gimple_assign_rhs_code (stmt);
923
924 if ((code == PLUS_EXPR
925 || code == POINTER_PLUS_EXPR
926 || code == MULT_EXPR)
927 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
928 swap_ssa_operands (stmt,
929 gimple_assign_rhs1_ptr (stmt),
930 gimple_assign_rhs2_ptr (stmt));
931 else if (code == COND_EXPR
932 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
933 {
934 tree cond_expr = gimple_assign_rhs1 (stmt);
935 enum tree_code cond_code = TREE_CODE (cond_expr);
936
937 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
938 {
939 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
940 0));
941 cond_code = invert_tree_comparison (cond_code,
942 honor_nans);
943 if (cond_code != ERROR_MARK)
944 {
945 TREE_SET_CODE (cond_expr, cond_code);
946 swap_ssa_operands (stmt,
947 gimple_assign_rhs2_ptr (stmt),
948 gimple_assign_rhs3_ptr (stmt));
949 }
950 }
951 }
952 }
953 gsi_next (&si);
954 }
955 }
956
957 free (bbs);
958
959 release_vec_loop_masks (&masks);
960 delete ivexpr_map;
961 delete scan_map;
962
963 loop->aux = NULL;
964 }
965
966 /* Return an invariant or register for EXPR and emit necessary
967 computations in the LOOP_VINFO loop preheader. */
968
969 tree
970 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
971 {
972 if (is_gimple_reg (expr)
973 || is_gimple_min_invariant (expr))
974 return expr;
975
976 if (! loop_vinfo->ivexpr_map)
977 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
978 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
979 if (! cached)
980 {
981 gimple_seq stmts = NULL;
982 cached = force_gimple_operand (unshare_expr (expr),
983 &stmts, true, NULL_TREE);
984 if (stmts)
985 {
986 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
987 gsi_insert_seq_on_edge_immediate (e, stmts);
988 }
989 }
990 return cached;
991 }
992
993 /* Return true if we can use CMP_TYPE as the comparison type to produce
994 all masks required to mask LOOP_VINFO. */
995
996 static bool
997 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
998 {
999 rgroup_masks *rgm;
1000 unsigned int i;
1001 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1002 if (rgm->mask_type != NULL_TREE
1003 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1004 cmp_type, rgm->mask_type,
1005 OPTIMIZE_FOR_SPEED))
1006 return false;
1007 return true;
1008 }
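
/* Illustrative example: with a comparison type of unsigned int, a mask
   type of 4 booleans and 10 scalar iterations, the IFN_WHILE_ULT mask
   for the third vector iteration compares { 8, 9, 10, 11 } against 10
   and is therefore { 1, 1, 0, 0 }, i.e. only the first two lanes are
   active in that iteration.  */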
1009
1010 /* Calculate the maximum number of scalars per iteration across all
1011 rgroups in LOOP_VINFO. */
1012
1013 static unsigned int
1014 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1015 {
1016 unsigned int res = 1;
1017 unsigned int i;
1018 rgroup_masks *rgm;
1019 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1020 res = MAX (res, rgm->max_nscalars_per_iter);
1021 return res;
1022 }
1023
1024 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1025 whether we can actually generate the masks required. Return true if so,
1026 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
1027
1028 static bool
1029 vect_verify_full_masking (loop_vec_info loop_vinfo)
1030 {
1031 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1032 unsigned int min_ni_width;
1033 unsigned int max_nscalars_per_iter
1034 = vect_get_max_nscalars_per_iter (loop_vinfo);
1035
1036 /* Use a normal loop if there are no statements that need masking.
1037 This only happens in rare degenerate cases: it means that the loop
1038 has no loads, no stores, and no live-out values. */
1039 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1040 return false;
1041
1042 /* Get the maximum number of iterations that is representable
1043 in the counter type. */
1044 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1045 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1046
1047 /* Get a more refined estimate for the number of iterations. */
1048 widest_int max_back_edges;
1049 if (max_loop_iterations (loop, &max_back_edges))
1050 max_ni = wi::smin (max_ni, max_back_edges + 1);
1051
1052 /* Account for rgroup masks, in which each bit is replicated N times. */
1053 max_ni *= max_nscalars_per_iter;
1054
1055 /* Work out how many bits we need to represent the limit. */
1056 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1057
1058 /* Find a scalar mode for which WHILE_ULT is supported. */
1059 opt_scalar_int_mode cmp_mode_iter;
1060 tree cmp_type = NULL_TREE;
1061 tree iv_type = NULL_TREE;
1062 widest_int iv_limit = vect_iv_limit_for_full_masking (loop_vinfo);
1063 unsigned int iv_precision = UINT_MAX;
1064
1065 if (iv_limit != -1)
1066 iv_precision = wi::min_precision (iv_limit * max_nscalars_per_iter,
1067 UNSIGNED);
1068
1069 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1070 {
1071 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1072 if (cmp_bits >= min_ni_width
1073 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1074 {
1075 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1076 if (this_type
1077 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1078 {
1079 /* Although we could stop as soon as we find a valid mode,
1080 there are at least two reasons why that's not always the
1081 best choice:
1082
1083 - An IV that's Pmode or wider is more likely to be reusable
1084 in address calculations than an IV that's narrower than
1085 Pmode.
1086
1087 - Doing the comparison in IV_PRECISION or wider allows
1088 a natural 0-based IV, whereas using a narrower comparison
1089 type requires mitigations against wrap-around.
1090
1091 Conversely, if the IV limit is variable, doing the comparison
1092 in a wider type than the original type can introduce
1093 unnecessary extensions, so picking the widest valid mode
1094 is not always a good choice either.
1095
1096 Here we prefer the first IV type that's Pmode or wider,
1097 and the first comparison type that's IV_PRECISION or wider.
1098 (The comparison type must be no wider than the IV type,
1099 to avoid extensions in the vector loop.)
1100
1101 ??? We might want to try continuing beyond Pmode for ILP32
1102 targets if CMP_BITS < IV_PRECISION. */
1103 iv_type = this_type;
1104 if (!cmp_type || iv_precision > TYPE_PRECISION (cmp_type))
1105 cmp_type = this_type;
1106 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1107 break;
1108 }
1109 }
1110 }
1111
1112 if (!cmp_type)
1113 return false;
1114
1115 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1116 LOOP_VINFO_MASK_IV_TYPE (loop_vinfo) = iv_type;
1117 return true;
1118 }
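
/* Worked example of the precision computation above (illustrative
   numbers): if the niter type is a 32-bit unsigned type, MAX_NI starts
   as 2^32; with MAX_NSCALARS_PER_ITER equal to 2 the scaled limit is
   2^33, which needs 34 bits, so a 32-bit comparison type is not wide
   enough and a wider mode such as DImode must support WHILE_ULT for
   full masking to be usable.  */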
1119
1120 /* Calculate the cost of one scalar iteration of the loop. */
1121 static void
1122 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1123 {
1124 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1125 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1126 int nbbs = loop->num_nodes, factor;
1127 int innerloop_iters, i;
1128
1129 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1130
1131 /* Gather costs for statements in the scalar loop. */
1132
1133 /* FORNOW. */
1134 innerloop_iters = 1;
1135 if (loop->inner)
1136 innerloop_iters = 50; /* FIXME */
1137
1138 for (i = 0; i < nbbs; i++)
1139 {
1140 gimple_stmt_iterator si;
1141 basic_block bb = bbs[i];
1142
1143 if (bb->loop_father == loop->inner)
1144 factor = innerloop_iters;
1145 else
1146 factor = 1;
1147
1148 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1149 {
1150 gimple *stmt = gsi_stmt (si);
1151 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1152
1153 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1154 continue;
1155
1156 /* Skip stmts that are not vectorized inside the loop. */
1157 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1158 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1159 && (!STMT_VINFO_LIVE_P (vstmt_info)
1160 || !VECTORIZABLE_CYCLE_DEF
1161 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1162 continue;
1163
1164 vect_cost_for_stmt kind;
1165 if (STMT_VINFO_DATA_REF (stmt_info))
1166 {
1167 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1168 kind = scalar_load;
1169 else
1170 kind = scalar_store;
1171 }
1172 else
1173 kind = scalar_stmt;
1174
1175 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1176 factor, kind, stmt_info, 0, vect_prologue);
1177 }
1178 }
1179
1180 /* Now accumulate cost. */
1181 void *target_cost_data = init_cost (loop);
1182 stmt_info_for_cost *si;
1183 int j;
1184 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1185 j, si)
1186 (void) add_stmt_cost (target_cost_data, si->count,
1187 si->kind, si->stmt_info, si->misalign,
1188 vect_body);
1189 unsigned dummy, body_cost = 0;
1190 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1191 destroy_cost_data (target_cost_data);
1192 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1193 }
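
/* Worked example (with made-up target costs): for a loop body containing
   two scalar loads, one scalar store and one scalar arithmetic statement,
   on a target whose add_stmt_cost returns 1 for each of those kinds, the
   single scalar iteration cost recorded here would be 4.  Statements in
   an inner loop are additionally weighted by the FIXME factor of 50
   above.  */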
1194
1195
1196 /* Function vect_analyze_loop_form_1.
1197
1198 Verify that certain CFG restrictions hold, including:
1199 - the loop has a pre-header
1200 - the loop has a single entry and exit
1201 - the loop exit condition is simple enough
1202 - the number of iterations can be analyzed, i.e., a countable loop. The
1203 niter could be analyzed under some assumptions. */
1204
1205 opt_result
1206 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1207 tree *assumptions, tree *number_of_iterationsm1,
1208 tree *number_of_iterations, gcond **inner_loop_cond)
1209 {
1210 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1211
1212 /* Different restrictions apply when we are considering an inner-most loop,
1213 vs. an outer (nested) loop.
1214 (FORNOW. May want to relax some of these restrictions in the future). */
1215
1216 if (!loop->inner)
1217 {
1218 /* Inner-most loop. We currently require that the number of BBs is
1219 exactly 2 (the header and latch). Vectorizable inner-most loops
1220 look like this:
1221
1222 (pre-header)
1223 |
1224 header <--------+
1225 | | |
1226 | +--> latch --+
1227 |
1228 (exit-bb) */
1229
1230 if (loop->num_nodes != 2)
1231 return opt_result::failure_at (vect_location,
1232 "not vectorized:"
1233 " control flow in loop.\n");
1234
1235 if (empty_block_p (loop->header))
1236 return opt_result::failure_at (vect_location,
1237 "not vectorized: empty loop.\n");
1238 }
1239 else
1240 {
1241 struct loop *innerloop = loop->inner;
1242 edge entryedge;
1243
1244 /* Nested loop. We currently require that the loop is doubly-nested,
1245 contains a single inner loop, and the number of BBs is exactly 5.
1246 Vectorizable outer-loops look like this:
1247
1248 (pre-header)
1249 |
1250 header <---+
1251 | |
1252 inner-loop |
1253 | |
1254 tail ------+
1255 |
1256 (exit-bb)
1257
1258 The inner-loop has the properties expected of inner-most loops
1259 as described above. */
1260
1261 if ((loop->inner)->inner || (loop->inner)->next)
1262 return opt_result::failure_at (vect_location,
1263 "not vectorized:"
1264 " multiple nested loops.\n");
1265
1266 if (loop->num_nodes != 5)
1267 return opt_result::failure_at (vect_location,
1268 "not vectorized:"
1269 " control flow in loop.\n");
1270
1271 entryedge = loop_preheader_edge (innerloop);
1272 if (entryedge->src != loop->header
1273 || !single_exit (innerloop)
1274 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1275 return opt_result::failure_at (vect_location,
1276 "not vectorized:"
1277 " unsupported outerloop form.\n");
1278
1279 /* Analyze the inner-loop. */
1280 tree inner_niterm1, inner_niter, inner_assumptions;
1281 opt_result res
1282 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1283 &inner_assumptions, &inner_niterm1,
1284 &inner_niter, NULL);
1285 if (!res)
1286 {
1287 if (dump_enabled_p ())
1288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1289 "not vectorized: Bad inner loop.\n");
1290 return res;
1291 }
1292
1293 /* Don't support analyzing niter under assumptions for inner
1294 loop. */
1295 if (!integer_onep (inner_assumptions))
1296 return opt_result::failure_at (vect_location,
1297 "not vectorized: Bad inner loop.\n");
1298
1299 if (!expr_invariant_in_loop_p (loop, inner_niter))
1300 return opt_result::failure_at (vect_location,
1301 "not vectorized: inner-loop count not"
1302 " invariant.\n");
1303
1304 if (dump_enabled_p ())
1305 dump_printf_loc (MSG_NOTE, vect_location,
1306 "Considering outer-loop vectorization.\n");
1307 }
1308
1309 if (!single_exit (loop))
1310 return opt_result::failure_at (vect_location,
1311 "not vectorized: multiple exits.\n");
1312 if (EDGE_COUNT (loop->header->preds) != 2)
1313 return opt_result::failure_at (vect_location,
1314 "not vectorized:"
1315 " too many incoming edges.\n");
1316
1317 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1318 that the loop is represented as a do-while (with a proper if-guard
1319 before the loop if needed), where the loop header contains all the
1320 executable statements, and the latch is empty. */
1321 if (!empty_block_p (loop->latch)
1322 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1323 return opt_result::failure_at (vect_location,
1324 "not vectorized: latch block not empty.\n");
1325
1326 /* Make sure the exit is not abnormal. */
1327 edge e = single_exit (loop);
1328 if (e->flags & EDGE_ABNORMAL)
1329 return opt_result::failure_at (vect_location,
1330 "not vectorized:"
1331 " abnormal loop exit edge.\n");
1332
1333 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1334 number_of_iterationsm1);
1335 if (!*loop_cond)
1336 return opt_result::failure_at
1337 (vect_location,
1338 "not vectorized: complicated exit condition.\n");
1339
1340 if (integer_zerop (*assumptions)
1341 || !*number_of_iterations
1342 || chrec_contains_undetermined (*number_of_iterations))
1343 return opt_result::failure_at
1344 (*loop_cond,
1345 "not vectorized: number of iterations cannot be computed.\n");
1346
1347 if (integer_zerop (*number_of_iterations))
1348 return opt_result::failure_at
1349 (*loop_cond,
1350 "not vectorized: number of iterations = 0.\n");
1351
1352 return opt_result::success ();
1353 }
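
/* Source-level illustration of the CFG forms checked above (illustrative
   only): a countable loop whose body is straight-line code, such as

     for (i = 0; i < n; i++)
       a[i] = b[i] + c[i];

   matches the two-block inner-most form.  A conditional in the body,
   such as

     for (i = 0; i < n; i++)
       if (b[i] > 0)
         a[i] = b[i];

   introduces extra basic blocks and is only accepted once if-conversion
   has flattened the body back into a single block.  */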
1354
1355 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1356
1357 opt_loop_vec_info
1358 vect_analyze_loop_form (struct loop *loop, vec_info_shared *shared)
1359 {
1360 tree assumptions, number_of_iterations, number_of_iterationsm1;
1361 gcond *loop_cond, *inner_loop_cond = NULL;
1362
1363 opt_result res
1364 = vect_analyze_loop_form_1 (loop, &loop_cond,
1365 &assumptions, &number_of_iterationsm1,
1366 &number_of_iterations, &inner_loop_cond);
1367 if (!res)
1368 return opt_loop_vec_info::propagate_failure (res);
1369
1370 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1371 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1372 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1373 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1374 if (!integer_onep (assumptions))
1375 {
1376 /* We consider to vectorize this loop by versioning it under
1377 some assumptions. In order to do this, we need to clear
1378 existing information computed by scev and niter analyzer. */
1379 scev_reset_htab ();
1380 free_numbers_of_iterations_estimates (loop);
1381 /* Also set flag for this loop so that following scev and niter
1382 analysis are done under the assumptions. */
1383 loop_constraint_set (loop, LOOP_C_FINITE);
1384 /* Also record the assumptions for versioning. */
1385 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1386 }
1387
1388 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1389 {
1390 if (dump_enabled_p ())
1391 {
1392 dump_printf_loc (MSG_NOTE, vect_location,
1393 "Symbolic number of iterations is ");
1394 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1395 dump_printf (MSG_NOTE, "\n");
1396 }
1397 }
1398
1399 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1400 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1401 if (inner_loop_cond)
1402 {
1403 stmt_vec_info inner_loop_cond_info
1404 = loop_vinfo->lookup_stmt (inner_loop_cond);
1405 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1406 }
1407
1408 gcc_assert (!loop->aux);
1409 loop->aux = loop_vinfo;
1410 return opt_loop_vec_info::success (loop_vinfo);
1411 }
1412
1413
1414
1415 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1416 statements, update the vectorization factor. */
1417
1418 static void
1419 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1420 {
1421 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1422 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1423 int nbbs = loop->num_nodes;
1424 poly_uint64 vectorization_factor;
1425 int i;
1426
1427 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1428
1429 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1430 gcc_assert (known_ne (vectorization_factor, 0U));
1431
1432 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1433 vectorization factor of the loop is the unrolling factor required by
1434 the SLP instances. If that unrolling factor is 1, we say that we
1435 perform pure SLP on the loop - cross-iteration parallelism is not
1436 exploited. */
1437 bool only_slp_in_loop = true;
1438 for (i = 0; i < nbbs; i++)
1439 {
1440 basic_block bb = bbs[i];
1441 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1442 gsi_next (&si))
1443 {
1444 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1445 stmt_info = vect_stmt_to_vectorize (stmt_info);
1446 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1447 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1448 && !PURE_SLP_STMT (stmt_info))
1449 /* STMT needs both SLP and loop-based vectorization. */
1450 only_slp_in_loop = false;
1451 }
1452 }
1453
1454 if (only_slp_in_loop)
1455 {
1456 if (dump_enabled_p ())
1457 dump_printf_loc (MSG_NOTE, vect_location,
1458 "Loop contains only SLP stmts\n");
1459 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1460 }
1461 else
1462 {
1463 if (dump_enabled_p ())
1464 dump_printf_loc (MSG_NOTE, vect_location,
1465 "Loop contains SLP and non-SLP stmts\n");
1466 /* Both the vectorization factor and unroll factor have the form
1467 current_vector_size * X for some rational X, so they must have
1468 a common multiple. */
1469 vectorization_factor
1470 = force_common_multiple (vectorization_factor,
1471 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1472 }
1473
1474 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1475 if (dump_enabled_p ())
1476 {
1477 dump_printf_loc (MSG_NOTE, vect_location,
1478 "Updating vectorization factor to ");
1479 dump_dec (MSG_NOTE, vectorization_factor);
1480 dump_printf (MSG_NOTE, ".\n");
1481 }
1482 }
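
/* Worked example: if the loop-based vectorization factor is 2 and the
   SLP instances require an unrolling factor of 4, the updated
   vectorization factor is their least common multiple, 4.  */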
1483
1484 /* Return true if STMT_INFO describes a double reduction phi and if
1485 the other phi in the reduction is also relevant for vectorization.
1486 This rejects cases such as:
1487
1488 outer1:
1489 x_1 = PHI <x_3(outer2), ...>;
1490 ...
1491
1492 inner:
1493 x_2 = ...;
1494 ...
1495
1496 outer2:
1497 x_3 = PHI <x_2(inner)>;
1498
1499 if nothing in x_2 or elsewhere makes x_1 relevant. */
1500
1501 static bool
1502 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1503 {
1504 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1505 return false;
1506
1507 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1508 }
1509
1510 /* Function vect_analyze_loop_operations.
1511
1512 Scan the loop stmts and make sure they are all vectorizable. */
1513
1514 static opt_result
1515 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1516 {
1517 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1518 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1519 int nbbs = loop->num_nodes;
1520 int i;
1521 stmt_vec_info stmt_info;
1522 bool need_to_vectorize = false;
1523 bool ok;
1524
1525 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1526
1527 auto_vec<stmt_info_for_cost> cost_vec;
1528
1529 for (i = 0; i < nbbs; i++)
1530 {
1531 basic_block bb = bbs[i];
1532
1533 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1534 gsi_next (&si))
1535 {
1536 gphi *phi = si.phi ();
1537 ok = true;
1538
1539 stmt_info = loop_vinfo->lookup_stmt (phi);
1540 if (dump_enabled_p ())
1541 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1542 if (virtual_operand_p (gimple_phi_result (phi)))
1543 continue;
1544
1545 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1546 (i.e., a phi in the tail of the outer-loop). */
1547 if (! is_loop_header_bb_p (bb))
1548 {
1549 /* FORNOW: we currently don't support the case that these phis
1550 are not used in the outerloop (unless it is double reduction,
1551 i.e., this phi is vect_reduction_def), because this case
1552 requires us to actually do something here. */
1553 if (STMT_VINFO_LIVE_P (stmt_info)
1554 && !vect_active_double_reduction_p (stmt_info))
1555 return opt_result::failure_at (phi,
1556 "Unsupported loop-closed phi"
1557 " in outer-loop.\n");
1558
1559 /* If PHI is used in the outer loop, we check that its operand
1560 is defined in the inner loop. */
1561 if (STMT_VINFO_RELEVANT_P (stmt_info))
1562 {
1563 tree phi_op;
1564
1565 if (gimple_phi_num_args (phi) != 1)
1566 return opt_result::failure_at (phi, "unsupported phi");
1567
1568 phi_op = PHI_ARG_DEF (phi, 0);
1569 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1570 if (!op_def_info)
1571 return opt_result::failure_at (phi, "unsupported phi");
1572
1573 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1574 && (STMT_VINFO_RELEVANT (op_def_info)
1575 != vect_used_in_outer_by_reduction))
1576 return opt_result::failure_at (phi, "unsupported phi");
1577 }
1578
1579 continue;
1580 }
1581
1582 gcc_assert (stmt_info);
1583
1584 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1585 || STMT_VINFO_LIVE_P (stmt_info))
1586 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1587 /* A scalar-dependence cycle that we don't support. */
1588 return opt_result::failure_at (phi,
1589 "not vectorized:"
1590 " scalar dependence cycle.\n");
1591
1592 if (STMT_VINFO_RELEVANT_P (stmt_info))
1593 {
1594 need_to_vectorize = true;
1595 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1596 && ! PURE_SLP_STMT (stmt_info))
1597 ok = vectorizable_induction (stmt_info, NULL, NULL, NULL,
1598 &cost_vec);
1599 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1600 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1601 && ! PURE_SLP_STMT (stmt_info))
1602 ok = vectorizable_reduction (stmt_info, NULL, NULL, NULL, NULL,
1603 &cost_vec);
1604 }
1605
1606 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1607 if (ok
1608 && STMT_VINFO_LIVE_P (stmt_info)
1609 && !PURE_SLP_STMT (stmt_info))
1610 ok = vectorizable_live_operation (stmt_info, NULL, NULL, -1, NULL,
1611 &cost_vec);
1612
1613 if (!ok)
1614 return opt_result::failure_at (phi,
1615 "not vectorized: relevant phi not "
1616 "supported: %G",
1617 static_cast <gimple *> (phi));
1618 }
1619
1620 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1621 gsi_next (&si))
1622 {
1623 gimple *stmt = gsi_stmt (si);
1624 if (!gimple_clobber_p (stmt))
1625 {
1626 opt_result res
1627 = vect_analyze_stmt (loop_vinfo->lookup_stmt (stmt),
1628 &need_to_vectorize,
1629 NULL, NULL, &cost_vec);
1630 if (!res)
1631 return res;
1632 }
1633 }
1634 } /* bbs */
1635
1636 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1637
1638 /* All operations in the loop are either irrelevant (deal with loop
1639 control, or dead), or only used outside the loop and can be moved
1640 out of the loop (e.g. invariants, inductions). The loop can be
1641 optimized away by scalar optimizations. We're better off not
1642 touching this loop. */
1643 if (!need_to_vectorize)
1644 {
1645 if (dump_enabled_p ())
1646 dump_printf_loc (MSG_NOTE, vect_location,
1647 "All the computation can be taken out of the loop.\n");
1648 return opt_result::failure_at
1649 (vect_location,
1650 "not vectorized: redundant loop. no profit to vectorize.\n");
1651 }
1652
1653 return opt_result::success ();
1654 }
1655
1656 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1657 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1658 definitely no, or -1 if it's worth retrying. */
1659
1660 static int
1661 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1662 {
1663 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1664 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1665
1666 /* Only fully-masked loops can have iteration counts less than the
1667 vectorization factor. */
1668 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1669 {
1670 HOST_WIDE_INT max_niter;
1671
1672 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1673 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1674 else
1675 max_niter = max_stmt_executions_int (loop);
1676
1677 if (max_niter != -1
1678 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1679 {
1680 if (dump_enabled_p ())
1681 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1682 "not vectorized: iteration count smaller than "
1683 "vectorization factor.\n");
1684 return 0;
1685 }
1686 }
1687
1688 int min_profitable_iters, min_profitable_estimate;
1689 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1690 &min_profitable_estimate);
1691
1692 if (min_profitable_iters < 0)
1693 {
1694 if (dump_enabled_p ())
1695 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1696 "not vectorized: vectorization not profitable.\n");
1697 if (dump_enabled_p ())
1698 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1699 "not vectorized: vector version will never be "
1700 "profitable.\n");
1701 return -1;
1702 }
1703
1704 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1705 * assumed_vf);
1706
1707 /* Use the cost model only if it is more conservative than user specified
1708 threshold. */
1709 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1710 min_profitable_iters);
1711
1712 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1713
1714 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1715 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1716 {
1717 if (dump_enabled_p ())
1718 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1719 "not vectorized: vectorization not profitable.\n");
1720 if (dump_enabled_p ())
1721 dump_printf_loc (MSG_NOTE, vect_location,
1722 "not vectorized: iteration count smaller than user "
1723 "specified loop bound parameter or minimum profitable "
1724 "iterations (whichever is more conservative).\n");
1725 return 0;
1726 }
1727
1728 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1729 if (estimated_niter == -1)
1730 estimated_niter = likely_max_stmt_executions_int (loop);
1731 if (estimated_niter != -1
1732 && ((unsigned HOST_WIDE_INT) estimated_niter
1733 < MAX (th, (unsigned) min_profitable_estimate)))
1734 {
1735 if (dump_enabled_p ())
1736 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1737 "not vectorized: estimated iteration count too "
1738 "small.\n");
1739 if (dump_enabled_p ())
1740 dump_printf_loc (MSG_NOTE, vect_location,
1741 "not vectorized: estimated iteration count smaller "
1742 "than specified loop bound parameter or minimum "
1743 "profitable iterations (whichever is more "
1744 "conservative).\n");
1745 return -1;
1746 }
1747
1748 return 1;
1749 }
1750
1751 static opt_result
1752 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1753 vec<data_reference_p> *datarefs,
1754 unsigned int *n_stmts)
1755 {
1756 *n_stmts = 0;
1757 for (unsigned i = 0; i < loop->num_nodes; i++)
1758 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1759 !gsi_end_p (gsi); gsi_next (&gsi))
1760 {
1761 gimple *stmt = gsi_stmt (gsi);
1762 if (is_gimple_debug (stmt))
1763 continue;
1764 ++(*n_stmts);
1765 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs);
1766 if (!res)
1767 {
1768 if (is_gimple_call (stmt) && loop->safelen)
1769 {
1770 tree fndecl = gimple_call_fndecl (stmt), op;
1771 if (fndecl != NULL_TREE)
1772 {
1773 cgraph_node *node = cgraph_node::get (fndecl);
1774 if (node != NULL && node->simd_clones != NULL)
1775 {
1776 unsigned int j, n = gimple_call_num_args (stmt);
1777 for (j = 0; j < n; j++)
1778 {
1779 op = gimple_call_arg (stmt, j);
1780 if (DECL_P (op)
1781 || (REFERENCE_CLASS_P (op)
1782 && get_base_address (op)))
1783 break;
1784 }
1785 op = gimple_call_lhs (stmt);
1786 /* Ignore #pragma omp declare simd functions
1787 if they don't have data references in the
1788 call stmt itself. */
1789 if (j == n
1790 && !(op
1791 && (DECL_P (op)
1792 || (REFERENCE_CLASS_P (op)
1793 && get_base_address (op)))))
1794 continue;
1795 }
1796 }
1797 }
1798 return res;
1799 }
1800 /* If dependence analysis will give up due to the limit on the
1801 number of datarefs stop here and fail fatally. */
1802 if (datarefs->length ()
1803 	  > (unsigned) PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS)
1804 return opt_result::failure_at (stmt, "exceeded param "
1805 "loop-max-datarefs-for-datadeps\n");
1806 }
1807 return opt_result::success ();
1808 }
1809
1810 /* Look for SLP-only access groups and turn each individual access into its own
1811 group. */
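/* For instance (purely illustrative), a group of four accesses
   a[4*i], a[4*i+1], a[4*i+2], a[4*i+3] with DR_GROUP_SIZE 4 that was
   marked SLP-only but did not end up being SLP-vectorized is split into
   four single-element groups, each with DR_GROUP_SIZE 1 and a
   DR_GROUP_GAP of 3 covering the other former group members.  */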
1812 static void
1813 vect_dissolve_slp_only_groups (loop_vec_info loop_vinfo)
1814 {
1815 unsigned int i;
1816 struct data_reference *dr;
1817
1818 DUMP_VECT_SCOPE ("vect_dissolve_slp_only_groups");
1819
1820 vec<data_reference_p> datarefs = loop_vinfo->shared->datarefs;
1821 FOR_EACH_VEC_ELT (datarefs, i, dr)
1822 {
1823 gcc_assert (DR_REF (dr));
1824 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (DR_STMT (dr));
1825
1826 /* Check if the load is a part of an interleaving chain. */
1827 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
1828 {
1829 stmt_vec_info first_element = DR_GROUP_FIRST_ELEMENT (stmt_info);
1830 unsigned int group_size = DR_GROUP_SIZE (first_element);
1831
1832 /* Check if SLP-only groups. */
1833 if (!STMT_SLP_TYPE (stmt_info)
1834 && STMT_VINFO_SLP_VECT_ONLY (first_element))
1835 {
1836 /* Dissolve the group. */
1837 STMT_VINFO_SLP_VECT_ONLY (first_element) = false;
1838
1839 stmt_vec_info vinfo = first_element;
1840 while (vinfo)
1841 {
1842 stmt_vec_info next = DR_GROUP_NEXT_ELEMENT (vinfo);
1843 DR_GROUP_FIRST_ELEMENT (vinfo) = vinfo;
1844 DR_GROUP_NEXT_ELEMENT (vinfo) = NULL;
1845 DR_GROUP_SIZE (vinfo) = 1;
1846 DR_GROUP_GAP (vinfo) = group_size - 1;
1847 vinfo = next;
1848 }
1849 }
1850 }
1851 }
1852 }
1853
1854 /* Function vect_analyze_loop_2.
1855
1856 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1857 for it. The different analyses will record information in the
1858 loop_vec_info struct. */
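/* A rough sketch of the order of the analyses performed below: gather and
   analyze the data references, classify scalar cycles and recognize
   patterns, analyze access patterns, mark the relevant stmts, analyze
   dependences and determine the vectorization factor, detect SLP, analyze
   alignment and prune runtime alias checks, verify the operations, decide
   on full masking, and finally run the cost model.  Some failures after
   the SLP decision jump to the `again' label and restart at `start_over'
   with SLP disabled.  */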
1859 static opt_result
1860 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1861 {
1862 opt_result ok = opt_result::success ();
1863 int res;
1864 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1865 poly_uint64 min_vf = 2;
1866
1867 /* The first group of checks is independent of the vector size. */
1868 fatal = true;
1869
1870 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
1871 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
1872 return opt_result::failure_at (vect_location,
1873 "not vectorized: simd if(0)\n");
1874
1875 /* Find all data references in the loop (which correspond to vdefs/vuses)
1876 and analyze their evolution in the loop. */
1877
1878 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1879
1880 /* Gather the data references and count stmts in the loop. */
1881 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1882 {
1883 opt_result res
1884 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1885 &LOOP_VINFO_DATAREFS (loop_vinfo),
1886 n_stmts);
1887 if (!res)
1888 {
1889 if (dump_enabled_p ())
1890 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1891 "not vectorized: loop contains function "
1892 "calls or data references that cannot "
1893 "be analyzed\n");
1894 return res;
1895 }
1896 loop_vinfo->shared->save_datarefs ();
1897 }
1898 else
1899 loop_vinfo->shared->check_datarefs ();
1900
1901 /* Analyze the data references and also adjust the minimal
1902 vectorization factor according to the loads and stores. */
1903
1904 ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &fatal);
1905 if (!ok)
1906 {
1907 if (dump_enabled_p ())
1908 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1909 "bad data references.\n");
1910 return ok;
1911 }
1912
1913 /* Classify all cross-iteration scalar data-flow cycles.
1914 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1915 vect_analyze_scalar_cycles (loop_vinfo);
1916
1917 vect_pattern_recog (loop_vinfo);
1918
1919 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1920
1921 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1922 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1923
1924 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1925 if (!ok)
1926 {
1927 if (dump_enabled_p ())
1928 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1929 "bad data access.\n");
1930 return ok;
1931 }
1932
1933 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1934
1935 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo, &fatal);
1936 if (!ok)
1937 {
1938 if (dump_enabled_p ())
1939 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1940 "unexpected pattern.\n");
1941 return ok;
1942 }
1943
1944   /* The rest of the analysis below depends on the vector size in some way.  */
1945 fatal = false;
1946
1947 /* Analyze data dependences between the data-refs in the loop
1948 and adjust the maximum vectorization factor according to
1949 the dependences.
1950 FORNOW: fail at the first data dependence that we encounter. */
1951
1952 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1953 if (!ok)
1954 {
1955 if (dump_enabled_p ())
1956 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1957 "bad data dependence.\n");
1958 return ok;
1959 }
1960 if (max_vf != MAX_VECTORIZATION_FACTOR
1961 && maybe_lt (max_vf, min_vf))
1962 return opt_result::failure_at (vect_location, "bad data dependence.\n");
1963 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1964
1965 ok = vect_determine_vectorization_factor (loop_vinfo);
1966 if (!ok)
1967 {
1968 if (dump_enabled_p ())
1969 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1970 "can't determine vectorization factor.\n");
1971 return ok;
1972 }
1973 if (max_vf != MAX_VECTORIZATION_FACTOR
1974 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1975 return opt_result::failure_at (vect_location, "bad data dependence.\n");
1976
1977 /* Compute the scalar iteration cost. */
1978 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1979
1980 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1981 unsigned th;
1982
1983 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1984 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
1985 if (!ok)
1986 return ok;
1987
1988 /* If there are any SLP instances mark them as pure_slp. */
1989 bool slp = vect_make_slp_decision (loop_vinfo);
1990 if (slp)
1991 {
1992 /* Find stmts that need to be both vectorized and SLPed. */
1993 vect_detect_hybrid_slp (loop_vinfo);
1994
1995 /* Update the vectorization factor based on the SLP decision. */
1996 vect_update_vf_for_slp (loop_vinfo);
1997 }
1998
1999 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
2000
2001 /* We don't expect to have to roll back to anything other than an empty
2002 set of rgroups. */
2003 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2004
2005 /* This is the point where we can re-start analysis with SLP forced off. */
2006 start_over:
2007
2008 /* Now the vectorization factor is final. */
2009 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2010 gcc_assert (known_ne (vectorization_factor, 0U));
2011
2012 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2013 {
2014 dump_printf_loc (MSG_NOTE, vect_location,
2015 "vectorization_factor = ");
2016 dump_dec (MSG_NOTE, vectorization_factor);
2017 dump_printf (MSG_NOTE, ", niters = %wd\n",
2018 LOOP_VINFO_INT_NITERS (loop_vinfo));
2019 }
2020
2021 HOST_WIDE_INT max_niter
2022 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2023
2024 /* Analyze the alignment of the data-refs in the loop.
2025 Fail if a data reference is found that cannot be vectorized. */
2026
2027 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2028 if (!ok)
2029 {
2030 if (dump_enabled_p ())
2031 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2032 "bad data alignment.\n");
2033 return ok;
2034 }
2035
2036 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2037 It is important to call pruning after vect_analyze_data_ref_accesses,
2038 since we use grouping information gathered by interleaving analysis. */
2039 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2040 if (!ok)
2041 return ok;
2042
2043 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2044 vectorization, since we do not want to add extra peeling or
2045 add versioning for alignment. */
2046 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2047 /* This pass will decide on using loop versioning and/or loop peeling in
2048 order to enhance the alignment of data references in the loop. */
2049 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2050 else
2051 ok = vect_verify_datarefs_alignment (loop_vinfo);
2052 if (!ok)
2053 return ok;
2054
2055 if (slp)
2056 {
2057 /* Analyze operations in the SLP instances. Note this may
2058 remove unsupported SLP instances which makes the above
2059 SLP kind detection invalid. */
2060 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2061 vect_slp_analyze_operations (loop_vinfo);
2062 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2063 {
2064 ok = opt_result::failure_at (vect_location,
2065 "unsupported SLP instances\n");
2066 goto again;
2067 }
2068 }
2069
2070 /* Dissolve SLP-only groups. */
2071 vect_dissolve_slp_only_groups (loop_vinfo);
2072
2073 /* Scan all the remaining operations in the loop that are not subject
2074 to SLP and make sure they are vectorizable. */
2075 ok = vect_analyze_loop_operations (loop_vinfo);
2076 if (!ok)
2077 {
2078 if (dump_enabled_p ())
2079 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2080 "bad operation or unsupported loop bound.\n");
2081 return ok;
2082 }
2083
2084 /* Decide whether to use a fully-masked loop for this vectorization
2085 factor. */
2086 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2087 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2088 && vect_verify_full_masking (loop_vinfo));
2089 if (dump_enabled_p ())
2090 {
2091 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2092 dump_printf_loc (MSG_NOTE, vect_location,
2093 "using a fully-masked loop.\n");
2094 else
2095 dump_printf_loc (MSG_NOTE, vect_location,
2096 "not using a fully-masked loop.\n");
2097 }
2098
2099 /* If epilog loop is required because of data accesses with gaps,
2100      one additional iteration needs to be peeled.  Check if there are
2101      enough iterations for vectorization.  */
2102 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2103 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2104 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2105 {
2106 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2107 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2108
2109 if (known_lt (wi::to_widest (scalar_niters), vf))
2110 return opt_result::failure_at (vect_location,
2111 					   "loop does not have enough iterations"
2112 					   " to support peeling for gaps.\n");
2113 }
2114
2115 /* Check the costings of the loop make vectorizing worthwhile. */
2116 res = vect_analyze_loop_costing (loop_vinfo);
2117 if (res < 0)
2118 {
2119 ok = opt_result::failure_at (vect_location,
2120 "Loop costings may not be worthwhile.\n");
2121 goto again;
2122 }
2123 if (!res)
2124 return opt_result::failure_at (vect_location,
2125 "Loop costings not worthwhile.\n");
2126
2127 /* Decide whether we need to create an epilogue loop to handle
2128 remaining scalar iterations. */
2129 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2130
2131 unsigned HOST_WIDE_INT const_vf;
2132 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2133 /* The main loop handles all iterations. */
2134 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2135 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2136 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
2137 {
2138 /* Work out the (constant) number of iterations that need to be
2139 peeled for reasons other than niters. */
2140 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2141 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2142 peel_niter += 1;
2143 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
2144 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2145 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2146 }
2147 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2148 /* ??? When peeling for gaps but not alignment, we could
2149 try to check whether the (variable) niters is known to be
2150 VF * N + 1. That's something of a niche case though. */
2151 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2152 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2153 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2154 < (unsigned) exact_log2 (const_vf))
2155 /* In case of versioning, check if the maximum number of
2156 iterations is greater than th. If they are identical,
2157 the epilogue is unnecessary. */
2158 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2159 || ((unsigned HOST_WIDE_INT) max_niter
2160 > (th / const_vf) * const_vf))))
2161 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
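  /* Illustration with assumed numbers: with a known iteration count of 103,
     3 iterations peeled for alignment, no peeling for gaps and a
     vectorization factor of 8, 103 - 3 == 100 is not a multiple of 8, so an
     epilogue loop is needed; with a count of 99 instead, 96 is a multiple
     of 8 and no epilogue is needed.  */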
2162
2163 /* If an epilogue loop is required make sure we can create one. */
2164 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2165 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2166 {
2167 if (dump_enabled_p ())
2168 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2169 if (!vect_can_advance_ivs_p (loop_vinfo)
2170 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2171 single_exit (LOOP_VINFO_LOOP
2172 (loop_vinfo))))
2173 {
2174 ok = opt_result::failure_at (vect_location,
2175 "not vectorized: can't create required "
2176 "epilog loop\n");
2177 goto again;
2178 }
2179 }
2180
2181   /* During peeling, we need to check that the number of loop iterations
2182      is enough for both the peeled prolog loop and the vector loop.  This check
2183 can be merged along with threshold check of loop versioning, so
2184 increase threshold for this case if necessary. */
2185 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2186 {
2187 poly_uint64 niters_th = 0;
2188
2189 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2190 {
2191 /* Niters for peeled prolog loop. */
2192 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2193 {
2194 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2195 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2196 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2197 }
2198 else
2199 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2200 }
2201
2202 /* Niters for at least one iteration of vectorized loop. */
2203 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2204 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2205 /* One additional iteration because of peeling for gap. */
2206 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2207 niters_th += 1;
2208 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2209 }
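  /* Illustration with assumed numbers: for a not-fully-masked loop with an
     unknown (negative) prologue peel amount and a 4-lane vector type for
     the unaligned access, the prologue contributes 4 - 1 == 3, one full
     vector iteration adds VF == 4, and peeling for gaps adds 1, giving a
     versioning threshold of 8.  */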
2210
2211 gcc_assert (known_eq (vectorization_factor,
2212 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2213
2214 /* Ok to vectorize! */
2215 return opt_result::success ();
2216
2217 again:
2218 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2219 gcc_assert (!ok);
2220
2221 /* Try again with SLP forced off but if we didn't do any SLP there is
2222 no point in re-trying. */
2223 if (!slp)
2224 return ok;
2225
2226 /* If there are reduction chains re-trying will fail anyway. */
2227 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2228 return ok;
2229
2230 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2231 via interleaving or lane instructions. */
2232 slp_instance instance;
2233 slp_tree node;
2234 unsigned i, j;
2235 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2236 {
2237 stmt_vec_info vinfo;
2238 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2239 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2240 continue;
2241 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2242 unsigned int size = DR_GROUP_SIZE (vinfo);
2243 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2244 if (! vect_store_lanes_supported (vectype, size, false)
2245 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2246 && ! vect_grouped_store_supported (vectype, size))
2247 return opt_result::failure_at (vinfo->stmt,
2248 "unsupported grouped store\n");
2249 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2250 {
2251 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2252 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2253 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2254 size = DR_GROUP_SIZE (vinfo);
2255 vectype = STMT_VINFO_VECTYPE (vinfo);
2256 if (! vect_load_lanes_supported (vectype, size, false)
2257 && ! vect_grouped_load_supported (vectype, single_element_p,
2258 size))
2259 return opt_result::failure_at (vinfo->stmt,
2260 "unsupported grouped load\n");
2261 }
2262 }
2263
2264 if (dump_enabled_p ())
2265 dump_printf_loc (MSG_NOTE, vect_location,
2266 "re-trying with SLP disabled\n");
2267
2268 /* Roll back state appropriately. No SLP this time. */
2269 slp = false;
2270   /* Restore the vectorization factor to what it was without SLP.  */
2271 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2272 /* Free the SLP instances. */
2273 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2274 vect_free_slp_instance (instance, false);
2275 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2276 /* Reset SLP type to loop_vect on all stmts. */
2277 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2278 {
2279 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2280 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2281 !gsi_end_p (si); gsi_next (&si))
2282 {
2283 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2284 STMT_SLP_TYPE (stmt_info) = loop_vect;
2285 }
2286 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2287 !gsi_end_p (si); gsi_next (&si))
2288 {
2289 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2290 STMT_SLP_TYPE (stmt_info) = loop_vect;
2291 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2292 {
2293 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2294 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2295 STMT_SLP_TYPE (stmt_info) = loop_vect;
2296 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2297 !gsi_end_p (pi); gsi_next (&pi))
2298 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2299 = loop_vect;
2300 }
2301 }
2302 }
2303 /* Free optimized alias test DDRS. */
2304 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2305 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2306 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2307 /* Reset target cost data. */
2308 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2309 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2310 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2311 /* Reset accumulated rgroup information. */
2312 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2313 /* Reset assorted flags. */
2314 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2315 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2316 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2317 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2318 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2319
2320 goto start_over;
2321 }
2322
2323 /* Function vect_analyze_loop.
2324
2325 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2326 for it. The different analyses will record information in the
2327 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL epilogue must
2328 be vectorized. */
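/* Roughly, the analysis below is repeated for each vector size the target
   advertises via autovectorize_vector_sizes, starting from the autodetected
   one.  When the loop has a simdlen clause, a size whose vectorization
   factor matches simdlen is preferred and the first other successful
   analysis is only kept as a fallback; otherwise the first size that
   analyzes successfully is used.  */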
2329 opt_loop_vec_info
2330 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
2331 vec_info_shared *shared)
2332 {
2333 auto_vector_sizes vector_sizes;
2334
2335 /* Autodetect first vector size we try. */
2336 current_vector_size = 0;
2337 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes,
2338 loop->simdlen != 0);
2339 unsigned int next_size = 0;
2340
2341 DUMP_VECT_SCOPE ("analyze_loop_nest");
2342
2343 if (loop_outer (loop)
2344 && loop_vec_info_for_loop (loop_outer (loop))
2345 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2346 return opt_loop_vec_info::failure_at (vect_location,
2347 "outer-loop already vectorized.\n");
2348
2349 if (!find_loop_nest (loop, &shared->loop_nest))
2350 return opt_loop_vec_info::failure_at
2351 (vect_location,
2352 "not vectorized: loop nest containing two or more consecutive inner"
2353 " loops cannot be vectorized\n");
2354
2355 unsigned n_stmts = 0;
2356 poly_uint64 autodetected_vector_size = 0;
2357 opt_loop_vec_info first_loop_vinfo = opt_loop_vec_info::success (NULL);
2358 poly_uint64 first_vector_size = 0;
2359 while (1)
2360 {
2361 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2362 opt_loop_vec_info loop_vinfo
2363 = vect_analyze_loop_form (loop, shared);
2364 if (!loop_vinfo)
2365 {
2366 if (dump_enabled_p ())
2367 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2368 "bad loop form.\n");
2369 gcc_checking_assert (first_loop_vinfo == NULL);
2370 return loop_vinfo;
2371 }
2372
2373 bool fatal = false;
2374
2375 if (orig_loop_vinfo)
2376 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2377
2378 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2379 if (res)
2380 {
2381 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2382
2383 if (loop->simdlen
2384 && maybe_ne (LOOP_VINFO_VECT_FACTOR (loop_vinfo),
2385 (unsigned HOST_WIDE_INT) loop->simdlen))
2386 {
2387 if (first_loop_vinfo == NULL)
2388 {
2389 first_loop_vinfo = loop_vinfo;
2390 first_vector_size = current_vector_size;
2391 loop->aux = NULL;
2392 }
2393 else
2394 delete loop_vinfo;
2395 }
2396 else
2397 {
2398 delete first_loop_vinfo;
2399 return loop_vinfo;
2400 }
2401 }
2402 else
2403 delete loop_vinfo;
2404
2405 if (next_size == 0)
2406 autodetected_vector_size = current_vector_size;
2407
2408 if (next_size < vector_sizes.length ()
2409 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2410 next_size += 1;
2411
2412 if (fatal)
2413 {
2414 gcc_checking_assert (first_loop_vinfo == NULL);
2415 return opt_loop_vec_info::propagate_failure (res);
2416 }
2417
2418 if (next_size == vector_sizes.length ()
2419 || known_eq (current_vector_size, 0U))
2420 {
2421 if (first_loop_vinfo)
2422 {
2423 current_vector_size = first_vector_size;
2424 loop->aux = (loop_vec_info) first_loop_vinfo;
2425 if (dump_enabled_p ())
2426 {
2427 dump_printf_loc (MSG_NOTE, vect_location,
2428 "***** Choosing vector size ");
2429 dump_dec (MSG_NOTE, current_vector_size);
2430 dump_printf (MSG_NOTE, "\n");
2431 }
2432 return first_loop_vinfo;
2433 }
2434 else
2435 return opt_loop_vec_info::propagate_failure (res);
2436 }
2437
2438 /* Try the next biggest vector size. */
2439 current_vector_size = vector_sizes[next_size++];
2440 if (dump_enabled_p ())
2441 {
2442 dump_printf_loc (MSG_NOTE, vect_location,
2443 "***** Re-trying analysis with "
2444 "vector size ");
2445 dump_dec (MSG_NOTE, current_vector_size);
2446 dump_printf (MSG_NOTE, "\n");
2447 }
2448 }
2449 }
2450
2451 /* Return true if there is an in-order reduction function for CODE, storing
2452 it in *REDUC_FN if so. */
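/* For example, a reduction such as

     double s = 0.0;
     for (int i = 0; i < n; i++)
       s += a[i];

   compiled without -fassociative-math must perform the additions in the
   original left-to-right order; IFN_FOLD_LEFT_PLUS expresses that by
   folding the vector elements into the scalar accumulator one by one.  */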
2453
2454 static bool
2455 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2456 {
2457 switch (code)
2458 {
2459 case PLUS_EXPR:
2460 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2461 return true;
2462
2463 default:
2464 return false;
2465 }
2466 }
2467
2468 /* Function reduction_fn_for_scalar_code
2469
2470 Input:
2471 CODE - tree_code of a reduction operations.
2472
2473 Output:
2474 REDUC_FN - the corresponding internal function to be used to reduce the
2475 vector of partial results into a single scalar result, or IFN_LAST
2476 if the operation is a supported reduction operation, but does not have
2477 such an internal function.
2478
2479 Return FALSE if CODE currently cannot be vectorized as reduction. */
2480
2481 static bool
2482 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2483 {
2484 switch (code)
2485 {
2486 case MAX_EXPR:
2487 *reduc_fn = IFN_REDUC_MAX;
2488 return true;
2489
2490 case MIN_EXPR:
2491 *reduc_fn = IFN_REDUC_MIN;
2492 return true;
2493
2494 case PLUS_EXPR:
2495 *reduc_fn = IFN_REDUC_PLUS;
2496 return true;
2497
2498 case BIT_AND_EXPR:
2499 *reduc_fn = IFN_REDUC_AND;
2500 return true;
2501
2502 case BIT_IOR_EXPR:
2503 *reduc_fn = IFN_REDUC_IOR;
2504 return true;
2505
2506 case BIT_XOR_EXPR:
2507 *reduc_fn = IFN_REDUC_XOR;
2508 return true;
2509
2510 case MULT_EXPR:
2511 case MINUS_EXPR:
2512 *reduc_fn = IFN_LAST;
2513 return true;
2514
2515 default:
2516 return false;
2517 }
2518 }
2519
2520 /* If there is a neutral value X such that SLP reduction NODE would not
2521 be affected by the introduction of additional X elements, return that X,
2522 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2523 is true if the SLP statements perform a single reduction, false if each
2524 statement performs an independent reduction. */
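/* For example (illustrative only), when the reduction statements do not
   fill a whole vector, the remaining lanes can be padded with the neutral
   value: 0 for PLUS/IOR/XOR, 1 for MULT, all-ones for AND; padding a
   partial vector of sums with zeros leaves the final result unchanged.  */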
2525
2526 static tree
2527 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2528 bool reduc_chain)
2529 {
2530 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2531 stmt_vec_info stmt_vinfo = stmts[0];
2532 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2533 tree scalar_type = TREE_TYPE (vector_type);
2534 struct loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2535 gcc_assert (loop);
2536
2537 switch (code)
2538 {
2539 case WIDEN_SUM_EXPR:
2540 case DOT_PROD_EXPR:
2541 case SAD_EXPR:
2542 case PLUS_EXPR:
2543 case MINUS_EXPR:
2544 case BIT_IOR_EXPR:
2545 case BIT_XOR_EXPR:
2546 return build_zero_cst (scalar_type);
2547
2548 case MULT_EXPR:
2549 return build_one_cst (scalar_type);
2550
2551 case BIT_AND_EXPR:
2552 return build_all_ones_cst (scalar_type);
2553
2554 case MAX_EXPR:
2555 case MIN_EXPR:
2556 /* For MIN/MAX the initial values are neutral. A reduction chain
2557 has only a single initial value, so that value is neutral for
2558 all statements. */
2559 if (reduc_chain)
2560 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2561 loop_preheader_edge (loop));
2562 return NULL_TREE;
2563
2564 default:
2565 return NULL_TREE;
2566 }
2567 }
2568
2569 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2570 STMT is printed with a message MSG. */
2571
2572 static void
2573 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2574 {
2575 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
2576 }
2577
2578 /* DEF_STMT_INFO occurs in a loop that contains a potential reduction
2579 operation. Return true if the results of DEF_STMT_INFO are something
2580 that can be accumulated by such a reduction. */
2581
2582 static bool
2583 vect_valid_reduction_input_p (stmt_vec_info def_stmt_info)
2584 {
2585 return (is_gimple_assign (def_stmt_info->stmt)
2586 || is_gimple_call (def_stmt_info->stmt)
2587 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_induction_def
2588 || (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI
2589 && STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_internal_def
2590 && !is_loop_header_bb_p (gimple_bb (def_stmt_info->stmt))));
2591 }
2592
2593 /* Detect SLP reduction of the form:
2594
2595 #a1 = phi <a5, a0>
2596 a2 = operation (a1)
2597 a3 = operation (a2)
2598 a4 = operation (a3)
2599 a5 = operation (a4)
2600
2601 #a = phi <a5>
2602
2603 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2604 FIRST_STMT is the first reduction stmt in the chain
2605 (a2 = operation (a1)).
2606
2607 Return TRUE if a reduction chain was detected. */
2608
2609 static bool
2610 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2611 gimple *first_stmt)
2612 {
2613 struct loop *loop = (gimple_bb (phi))->loop_father;
2614 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2615 enum tree_code code;
2616 gimple *loop_use_stmt = NULL;
2617 stmt_vec_info use_stmt_info;
2618 tree lhs;
2619 imm_use_iterator imm_iter;
2620 use_operand_p use_p;
2621 int nloop_uses, size = 0, n_out_of_loop_uses;
2622 bool found = false;
2623
2624 if (loop != vect_loop)
2625 return false;
2626
2627 auto_vec<stmt_vec_info, 8> reduc_chain;
2628 lhs = PHI_RESULT (phi);
2629 code = gimple_assign_rhs_code (first_stmt);
2630 while (1)
2631 {
2632 nloop_uses = 0;
2633 n_out_of_loop_uses = 0;
2634 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2635 {
2636 gimple *use_stmt = USE_STMT (use_p);
2637 if (is_gimple_debug (use_stmt))
2638 continue;
2639
2640 /* Check if we got back to the reduction phi. */
2641 if (use_stmt == phi)
2642 {
2643 loop_use_stmt = use_stmt;
2644 found = true;
2645 break;
2646 }
2647
2648 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2649 {
2650 loop_use_stmt = use_stmt;
2651 nloop_uses++;
2652 }
2653 else
2654 n_out_of_loop_uses++;
2655
2656 	  /* There can be either a single use in the loop or two uses in
2657 phi nodes. */
2658 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2659 return false;
2660 }
2661
2662 if (found)
2663 break;
2664
2665 /* We reached a statement with no loop uses. */
2666 if (nloop_uses == 0)
2667 return false;
2668
2669 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2670 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2671 return false;
2672
2673 if (!is_gimple_assign (loop_use_stmt)
2674 || code != gimple_assign_rhs_code (loop_use_stmt)
2675 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2676 return false;
2677
2678 /* Insert USE_STMT into reduction chain. */
2679 use_stmt_info = loop_info->lookup_stmt (loop_use_stmt);
2680 reduc_chain.safe_push (use_stmt_info);
2681
2682 lhs = gimple_assign_lhs (loop_use_stmt);
2683 size++;
2684 }
2685
2686 if (!found || loop_use_stmt != phi || size < 2)
2687 return false;
2688
2689 /* Swap the operands, if needed, to make the reduction operand be the second
2690 operand. */
2691 lhs = PHI_RESULT (phi);
2692 for (unsigned i = 0; i < reduc_chain.length (); ++i)
2693 {
2694 gassign *next_stmt = as_a <gassign *> (reduc_chain[i]->stmt);
2695 if (gimple_assign_rhs2 (next_stmt) == lhs)
2696 {
2697 tree op = gimple_assign_rhs1 (next_stmt);
2698 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2699
2700 /* Check that the other def is either defined in the loop
2701 ("vect_internal_def"), or it's an induction (defined by a
2702 loop-header phi-node). */
2703 if (def_stmt_info
2704 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2705 && vect_valid_reduction_input_p (def_stmt_info))
2706 {
2707 lhs = gimple_assign_lhs (next_stmt);
2708 continue;
2709 }
2710
2711 return false;
2712 }
2713 else
2714 {
2715 tree op = gimple_assign_rhs2 (next_stmt);
2716 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2717
2718 /* Check that the other def is either defined in the loop
2719 ("vect_internal_def"), or it's an induction (defined by a
2720 loop-header phi-node). */
2721 if (def_stmt_info
2722 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2723 && vect_valid_reduction_input_p (def_stmt_info))
2724 {
2725 if (dump_enabled_p ())
2726 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: %G",
2727 next_stmt);
2728
2729 swap_ssa_operands (next_stmt,
2730 gimple_assign_rhs1_ptr (next_stmt),
2731 gimple_assign_rhs2_ptr (next_stmt));
2732 update_stmt (next_stmt);
2733
2734 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2735 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2736 }
2737 else
2738 return false;
2739 }
2740
2741 lhs = gimple_assign_lhs (next_stmt);
2742 }
2743
2744 /* Build up the actual chain. */
2745 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
2746 {
2747 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
2748 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
2749 }
2750 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
2751 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
2752
2753 /* Save the chain for further analysis in SLP detection. */
2754 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
2755 REDUC_GROUP_SIZE (reduc_chain[0]) = size;
2756
2757 return true;
2758 }
2759
2760 /* Return true if we need an in-order reduction for operation CODE
2761 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2762 overflow must wrap. */
2763
2764 static bool
2765 needs_fold_left_reduction_p (tree type, tree_code code,
2766 bool need_wrapping_integral_overflow)
2767 {
2768 /* CHECKME: check for !flag_finite_math_only too? */
2769 if (SCALAR_FLOAT_TYPE_P (type))
2770 switch (code)
2771 {
2772 case MIN_EXPR:
2773 case MAX_EXPR:
2774 return false;
2775
2776 default:
2777 return !flag_associative_math;
2778 }
2779
2780 if (INTEGRAL_TYPE_P (type))
2781 {
2782 if (!operation_no_trapping_overflow (type, code))
2783 return true;
2784 if (need_wrapping_integral_overflow
2785 && !TYPE_OVERFLOW_WRAPS (type)
2786 && operation_can_overflow (code))
2787 return true;
2788 return false;
2789 }
2790
2791 if (SAT_FIXED_POINT_TYPE_P (type))
2792 return true;
2793
2794 return false;
2795 }
2796
2797 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2798 reduction operation CODE has a handled computation expression. */
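/* As an illustration, for CODE == PLUS_EXPR a path such as

     x_1 = PHI <x_4, ...>
     x_2 = x_1 + a[i];
     x_3 = x_2 - b[i];
     x_4 = x_3 + c[i];

   is accepted (the MINUS keeps the running value as its first operand),
   whereas a statement like x_3 = b[i] - x_2 negates the running value;
   the walk below tracks such negations and rejects the path if the value
   would end up negated.  */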
2799
2800 bool
2801 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2802 tree loop_arg, enum tree_code code)
2803 {
2804 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2805 auto_bitmap visited;
2806 tree lookfor = PHI_RESULT (phi);
2807 ssa_op_iter curri;
2808 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2809 while (USE_FROM_PTR (curr) != loop_arg)
2810 curr = op_iter_next_use (&curri);
2811 curri.i = curri.numops;
2812 do
2813 {
2814 path.safe_push (std::make_pair (curri, curr));
2815 tree use = USE_FROM_PTR (curr);
2816 if (use == lookfor)
2817 break;
2818 gimple *def = SSA_NAME_DEF_STMT (use);
2819 if (gimple_nop_p (def)
2820 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2821 {
2822 pop:
2823 do
2824 {
2825 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2826 curri = x.first;
2827 curr = x.second;
2828 do
2829 curr = op_iter_next_use (&curri);
2830 /* Skip already visited or non-SSA operands (from iterating
2831 over PHI args). */
2832 while (curr != NULL_USE_OPERAND_P
2833 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2834 || ! bitmap_set_bit (visited,
2835 SSA_NAME_VERSION
2836 (USE_FROM_PTR (curr)))));
2837 }
2838 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2839 if (curr == NULL_USE_OPERAND_P)
2840 break;
2841 }
2842 else
2843 {
2844 if (gimple_code (def) == GIMPLE_PHI)
2845 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2846 else
2847 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2848 while (curr != NULL_USE_OPERAND_P
2849 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2850 || ! bitmap_set_bit (visited,
2851 SSA_NAME_VERSION
2852 (USE_FROM_PTR (curr)))))
2853 curr = op_iter_next_use (&curri);
2854 if (curr == NULL_USE_OPERAND_P)
2855 goto pop;
2856 }
2857 }
2858 while (1);
2859 if (dump_file && (dump_flags & TDF_DETAILS))
2860 {
2861 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2862 unsigned i;
2863 std::pair<ssa_op_iter, use_operand_p> *x;
2864 FOR_EACH_VEC_ELT (path, i, x)
2865 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
2866 dump_printf (MSG_NOTE, "\n");
2867 }
2868
2869 /* Check whether the reduction path detected is valid. */
2870 bool fail = path.length () == 0;
2871 bool neg = false;
2872 for (unsigned i = 1; i < path.length (); ++i)
2873 {
2874 gimple *use_stmt = USE_STMT (path[i].second);
2875 tree op = USE_FROM_PTR (path[i].second);
2876 if (! has_single_use (op)
2877 || ! is_gimple_assign (use_stmt))
2878 {
2879 fail = true;
2880 break;
2881 }
2882 if (gimple_assign_rhs_code (use_stmt) != code)
2883 {
2884 if (code == PLUS_EXPR
2885 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2886 {
2887 /* Track whether we negate the reduction value each iteration. */
2888 if (gimple_assign_rhs2 (use_stmt) == op)
2889 neg = ! neg;
2890 }
2891 else
2892 {
2893 fail = true;
2894 break;
2895 }
2896 }
2897 }
2898 return ! fail && ! neg;
2899 }
2900
2901
2902 /* Function vect_is_simple_reduction
2903
2904 (1) Detect a cross-iteration def-use cycle that represents a simple
2905 reduction computation. We look for the following pattern:
2906
2907 loop_header:
2908 a1 = phi < a0, a2 >
2909 a3 = ...
2910 a2 = operation (a3, a1)
2911
2912 or
2913
2914 a3 = ...
2915 loop_header:
2916 a1 = phi < a0, a2 >
2917 a2 = operation (a3, a1)
2918
2919 such that:
2920 1. operation is commutative and associative and it is safe to
2921 change the order of the computation
2922 2. no uses for a2 in the loop (a2 is used out of the loop)
2923 3. no uses of a1 in the loop besides the reduction operation
2924 4. no uses of a1 outside the loop.
2925
2926 Conditions 1,4 are tested here.
2927 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2928
2929 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2930 nested cycles.
2931
2932 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2933 reductions:
2934
2935 a1 = phi < a0, a2 >
2936 inner loop (def of a3)
2937 a2 = phi < a3 >
2938
2939 (4) Detect condition expressions, ie:
2940 for (int i = 0; i < N; i++)
2941 if (a[i] < val)
2942 ret_val = a[i];
2943
2944 */
2945
2946 static stmt_vec_info
2947 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
2948 bool *double_reduc,
2949 bool need_wrapping_integral_overflow,
2950 enum vect_reduction_type *v_reduc_type)
2951 {
2952 gphi *phi = as_a <gphi *> (phi_info->stmt);
2953 struct loop *loop = (gimple_bb (phi))->loop_father;
2954 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2955 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
2956 gimple *phi_use_stmt = NULL;
2957 enum tree_code orig_code, code;
2958 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2959 tree type;
2960 tree name;
2961 imm_use_iterator imm_iter;
2962 use_operand_p use_p;
2963 bool phi_def;
2964
2965 *double_reduc = false;
2966 *v_reduc_type = TREE_CODE_REDUCTION;
2967
2968 tree phi_name = PHI_RESULT (phi);
2969 /* ??? If there are no uses of the PHI result the inner loop reduction
2970 won't be detected as possibly double-reduction by vectorizable_reduction
2971 because that tries to walk the PHI arg from the preheader edge which
2972 can be constant. See PR60382. */
2973 if (has_zero_uses (phi_name))
2974 return NULL;
2975 unsigned nphi_def_loop_uses = 0;
2976 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2977 {
2978 gimple *use_stmt = USE_STMT (use_p);
2979 if (is_gimple_debug (use_stmt))
2980 continue;
2981
2982 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2983 {
2984 if (dump_enabled_p ())
2985 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2986 "intermediate value used outside loop.\n");
2987
2988 return NULL;
2989 }
2990
2991 nphi_def_loop_uses++;
2992 phi_use_stmt = use_stmt;
2993 }
2994
2995 edge latch_e = loop_latch_edge (loop);
2996 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2997 if (TREE_CODE (loop_arg) != SSA_NAME)
2998 {
2999 if (dump_enabled_p ())
3000 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3001 "reduction: not ssa_name: %T\n", loop_arg);
3002 return NULL;
3003 }
3004
3005 stmt_vec_info def_stmt_info = loop_info->lookup_def (loop_arg);
3006 if (!def_stmt_info
3007 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
3008 return NULL;
3009
3010 if (gassign *def_stmt = dyn_cast <gassign *> (def_stmt_info->stmt))
3011 {
3012 name = gimple_assign_lhs (def_stmt);
3013 phi_def = false;
3014 }
3015 else if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
3016 {
3017 name = PHI_RESULT (def_stmt);
3018 phi_def = true;
3019 }
3020 else
3021 {
3022 if (dump_enabled_p ())
3023 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3024 "reduction: unhandled reduction operation: %G",
3025 def_stmt_info->stmt);
3026 return NULL;
3027 }
3028
3029 unsigned nlatch_def_loop_uses = 0;
3030 auto_vec<gphi *, 3> lcphis;
3031 bool inner_loop_of_double_reduc = false;
3032 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
3033 {
3034 gimple *use_stmt = USE_STMT (use_p);
3035 if (is_gimple_debug (use_stmt))
3036 continue;
3037 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3038 nlatch_def_loop_uses++;
3039 else
3040 {
3041 /* We can have more than one loop-closed PHI. */
3042 lcphis.safe_push (as_a <gphi *> (use_stmt));
3043 if (nested_in_vect_loop
3044 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
3045 == vect_double_reduction_def))
3046 inner_loop_of_double_reduc = true;
3047 }
3048 }
3049
3050 /* If this isn't a nested cycle or if the nested cycle reduction value
3051      is used outside of the inner loop we cannot handle uses of the reduction
3052 value. */
3053 if ((!nested_in_vect_loop || inner_loop_of_double_reduc)
3054 && (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1))
3055 {
3056 if (dump_enabled_p ())
3057 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3058 "reduction used in loop.\n");
3059 return NULL;
3060 }
3061
3062 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3063 defined in the inner loop. */
3064 if (phi_def)
3065 {
3066 gphi *def_stmt = as_a <gphi *> (def_stmt_info->stmt);
3067 op1 = PHI_ARG_DEF (def_stmt, 0);
3068
3069 if (gimple_phi_num_args (def_stmt) != 1
3070 || TREE_CODE (op1) != SSA_NAME)
3071 {
3072 if (dump_enabled_p ())
3073 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3074 "unsupported phi node definition.\n");
3075
3076 return NULL;
3077 }
3078
3079 gimple *def1 = SSA_NAME_DEF_STMT (op1);
3080 if (gimple_bb (def1)
3081 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3082 && loop->inner
3083 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3084 && is_gimple_assign (def1)
3085 && is_a <gphi *> (phi_use_stmt)
3086 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3087 {
3088 if (dump_enabled_p ())
3089 report_vect_op (MSG_NOTE, def_stmt,
3090 "detected double reduction: ");
3091
3092 *double_reduc = true;
3093 return def_stmt_info;
3094 }
3095
3096 return NULL;
3097 }
3098
3099   /* If we are vectorizing an inner reduction, it is executed in the
3100      original order only when we are not dealing with a double
3101      reduction.  */
3102 bool check_reduction = true;
3103 if (flow_loop_nested_p (vect_loop, loop))
3104 {
3105 gphi *lcphi;
3106 unsigned i;
3107 check_reduction = false;
3108 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3109 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3110 {
3111 gimple *use_stmt = USE_STMT (use_p);
3112 if (is_gimple_debug (use_stmt))
3113 continue;
3114 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3115 check_reduction = true;
3116 }
3117 }
3118
3119 gassign *def_stmt = as_a <gassign *> (def_stmt_info->stmt);
3120 code = orig_code = gimple_assign_rhs_code (def_stmt);
3121
3122 if (nested_in_vect_loop && !check_reduction)
3123 {
3124 /* FIXME: Even for non-reductions code generation is funneled
3125 through vectorizable_reduction for the stmt defining the
3126 PHI latch value. So we have to artificially restrict ourselves
3127 for the supported operations. */
3128 switch (get_gimple_rhs_class (code))
3129 {
3130 case GIMPLE_BINARY_RHS:
3131 case GIMPLE_TERNARY_RHS:
3132 break;
3133 default:
3134 /* Not supported by vectorizable_reduction. */
3135 if (dump_enabled_p ())
3136 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3137 "nested cycle: not handled operation: ");
3138 return NULL;
3139 }
3140 if (dump_enabled_p ())
3141 report_vect_op (MSG_NOTE, def_stmt, "detected nested cycle: ");
3142 return def_stmt_info;
3143 }
3144
3145 /* We can handle "res -= x[i]", which is non-associative by
3146 simply rewriting this into "res += -x[i]". Avoid changing
3147 gimple instruction for the first simple tests and only do this
3148 if we're allowed to change code at all. */
3149 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3150 code = PLUS_EXPR;
3151
3152 if (code == COND_EXPR)
3153 {
3154 if (! nested_in_vect_loop)
3155 *v_reduc_type = COND_REDUCTION;
3156
3157 op3 = gimple_assign_rhs1 (def_stmt);
3158 if (COMPARISON_CLASS_P (op3))
3159 {
3160 op4 = TREE_OPERAND (op3, 1);
3161 op3 = TREE_OPERAND (op3, 0);
3162 }
3163 if (op3 == phi_name || op4 == phi_name)
3164 {
3165 if (dump_enabled_p ())
3166 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3167 "reduction: condition depends on previous"
3168 " iteration: ");
3169 return NULL;
3170 }
3171
3172 op1 = gimple_assign_rhs2 (def_stmt);
3173 op2 = gimple_assign_rhs3 (def_stmt);
3174 }
3175 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3176 {
3177 if (dump_enabled_p ())
3178 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3179 "reduction: not commutative/associative: ");
3180 return NULL;
3181 }
3182 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3183 {
3184 op1 = gimple_assign_rhs1 (def_stmt);
3185 op2 = gimple_assign_rhs2 (def_stmt);
3186 }
3187 else
3188 {
3189 if (dump_enabled_p ())
3190 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3191 "reduction: not handled operation: ");
3192 return NULL;
3193 }
3194
3195 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3196 {
3197 if (dump_enabled_p ())
3198 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3199 "reduction: both uses not ssa_names: ");
3200
3201 return NULL;
3202 }
3203
3204 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3205 if ((TREE_CODE (op1) == SSA_NAME
3206       && !types_compatible_p (type, TREE_TYPE (op1)))
3207 || (TREE_CODE (op2) == SSA_NAME
3208 && !types_compatible_p (type, TREE_TYPE (op2)))
3209 || (op3 && TREE_CODE (op3) == SSA_NAME
3210 && !types_compatible_p (type, TREE_TYPE (op3)))
3211 || (op4 && TREE_CODE (op4) == SSA_NAME
3212 && !types_compatible_p (type, TREE_TYPE (op4))))
3213 {
3214 if (dump_enabled_p ())
3215 {
3216 dump_printf_loc (MSG_NOTE, vect_location,
3217 "reduction: multiple types: operation type: "
3218 "%T, operands types: %T,%T",
3219 type, TREE_TYPE (op1), TREE_TYPE (op2));
3220 if (op3)
3221 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op3));
3222
3223 if (op4)
3224 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op4));
3225 dump_printf (MSG_NOTE, "\n");
3226 }
3227
3228 return NULL;
3229 }
3230
3231 /* Check whether it's ok to change the order of the computation.
3232 Generally, when vectorizing a reduction we change the order of the
3233 computation. This may change the behavior of the program in some
3234 cases, so we need to check that this is ok. One exception is when
3235 vectorizing an outer-loop: the inner-loop is executed sequentially,
3236 and therefore vectorizing reductions in the inner-loop during
3237 outer-loop vectorization is safe. */
3238 if (check_reduction
3239 && *v_reduc_type == TREE_CODE_REDUCTION
3240 && needs_fold_left_reduction_p (type, code,
3241 need_wrapping_integral_overflow))
3242 *v_reduc_type = FOLD_LEFT_REDUCTION;
3243
3244 /* Reduction is safe. We're dealing with one of the following:
3245 1) integer arithmetic and no trapv
3246 2) floating point arithmetic, and special flags permit this optimization
3247 3) nested cycle (i.e., outer loop vectorization). */
3248 stmt_vec_info def1_info = loop_info->lookup_def (op1);
3249 stmt_vec_info def2_info = loop_info->lookup_def (op2);
3250 if (code != COND_EXPR && !def1_info && !def2_info)
3251 {
3252 if (dump_enabled_p ())
3253 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3254 return NULL;
3255 }
3256
3257 /* Check that one def is the reduction def, defined by PHI,
3258 the other def is either defined in the loop ("vect_internal_def"),
3259 or it's an induction (defined by a loop-header phi-node). */
3260
3261 if (def2_info
3262 && def2_info->stmt == phi
3263 && (code == COND_EXPR
3264 || !def1_info
3265 || !flow_bb_inside_loop_p (loop, gimple_bb (def1_info->stmt))
3266 || vect_valid_reduction_input_p (def1_info)))
3267 {
3268 if (dump_enabled_p ())
3269 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3270 return def_stmt_info;
3271 }
3272
3273 if (def1_info
3274 && def1_info->stmt == phi
3275 && (code == COND_EXPR
3276 || !def2_info
3277 || !flow_bb_inside_loop_p (loop, gimple_bb (def2_info->stmt))
3278 || vect_valid_reduction_input_p (def2_info)))
3279 {
3280 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3281 {
3282 /* Check if we can swap operands (just for simplicity - so that
3283 the rest of the code can assume that the reduction variable
3284 is always the last (second) argument). */
3285 if (code == COND_EXPR)
3286 {
3287 /* Swap cond_expr by inverting the condition. */
3288 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3289 enum tree_code invert_code = ERROR_MARK;
3290 enum tree_code cond_code = TREE_CODE (cond_expr);
3291
3292 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3293 {
3294 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3295 invert_code = invert_tree_comparison (cond_code, honor_nans);
3296 }
3297 if (invert_code != ERROR_MARK)
3298 {
3299 TREE_SET_CODE (cond_expr, invert_code);
3300 swap_ssa_operands (def_stmt,
3301 gimple_assign_rhs2_ptr (def_stmt),
3302 gimple_assign_rhs3_ptr (def_stmt));
3303 }
3304 else
3305 {
3306 if (dump_enabled_p ())
3307 report_vect_op (MSG_NOTE, def_stmt,
3308 "detected reduction: cannot swap operands "
3309 "for cond_expr");
3310 return NULL;
3311 }
3312 }
3313 else
3314 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3315 gimple_assign_rhs2_ptr (def_stmt));
3316
3317 if (dump_enabled_p ())
3318 report_vect_op (MSG_NOTE, def_stmt,
3319 "detected reduction: need to swap operands: ");
3320
3321 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3322 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3323 }
3324 else
3325 {
3326 if (dump_enabled_p ())
3327 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3328 }
3329
3330 return def_stmt_info;
3331 }
3332
3333 /* Try to find SLP reduction chain. */
3334 if (! nested_in_vect_loop
3335 && code != COND_EXPR
3336 && orig_code != MINUS_EXPR
3337 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3338 {
3339 if (dump_enabled_p ())
3340 report_vect_op (MSG_NOTE, def_stmt,
3341 "reduction: detected reduction chain: ");
3342
3343 return def_stmt_info;
3344 }
3345
3346 /* Look for the expression computing loop_arg from loop PHI result. */
3347 if (check_reduction_path (vect_location, loop, phi, loop_arg, code))
3348 return def_stmt_info;
3349
3350 if (dump_enabled_p ())
3351 {
3352 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3353 "reduction: unknown pattern: ");
3354 }
3355
3356 return NULL;
3357 }
3358
3359 /* Wrapper around vect_is_simple_reduction, which will modify code
3360 in-place if it enables detection of more reductions. Arguments
3361 as there. */
3362
3363 stmt_vec_info
3364 vect_force_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3365 bool *double_reduc,
3366 bool need_wrapping_integral_overflow)
3367 {
3368 enum vect_reduction_type v_reduc_type;
3369 stmt_vec_info def_info
3370 = vect_is_simple_reduction (loop_info, phi_info, double_reduc,
3371 need_wrapping_integral_overflow,
3372 &v_reduc_type);
3373 if (def_info)
3374 {
3375 STMT_VINFO_REDUC_TYPE (phi_info) = v_reduc_type;
3376 STMT_VINFO_REDUC_DEF (phi_info) = def_info;
3377 STMT_VINFO_REDUC_TYPE (def_info) = v_reduc_type;
3378 STMT_VINFO_REDUC_DEF (def_info) = phi_info;
3379 }
3380 return def_info;
3381 }
3382
3383 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
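/* Illustration with assumed numbers: with a known iteration count of 100,
   assumed_vf == 4 and peel_iters_prologue == 2, the epilogue gets
   (100 - 2) % 4 == 2 iterations; both peel counts are then costed by
   replaying the scalar cost vector that many times.  When the count is
   unknown, vf/2 is assumed for the epilogue and a taken branch is charged
   for each peel loop.  */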
3384 int
3385 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3386 int *peel_iters_epilogue,
3387 stmt_vector_for_cost *scalar_cost_vec,
3388 stmt_vector_for_cost *prologue_cost_vec,
3389 stmt_vector_for_cost *epilogue_cost_vec)
3390 {
3391 int retval = 0;
3392 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3393
3394 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3395 {
3396 *peel_iters_epilogue = assumed_vf / 2;
3397 if (dump_enabled_p ())
3398 dump_printf_loc (MSG_NOTE, vect_location,
3399 "cost model: epilogue peel iters set to vf/2 "
3400 			 "because loop iterations are unknown.\n");
3401
3402     /* If peeled iterations are known but the number of scalar loop
3403        iterations is unknown, count a taken branch per peeled loop.  */
3404 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3405 NULL, 0, vect_prologue);
3406 retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3407 NULL, 0, vect_epilogue);
3408 }
3409 else
3410 {
3411 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3412 peel_iters_prologue = niters < peel_iters_prologue ?
3413 niters : peel_iters_prologue;
3414 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3415       /* If we need to peel for gaps but the computed epilogue peel count
3416          is zero, we have to peel VF iterations instead.  */
3417 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3418 *peel_iters_epilogue = assumed_vf;
3419 }
3420
3421 stmt_info_for_cost *si;
3422 int j;
3423 if (peel_iters_prologue)
3424 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3425 retval += record_stmt_cost (prologue_cost_vec,
3426 si->count * peel_iters_prologue,
3427 si->kind, si->stmt_info, si->misalign,
3428 vect_prologue);
3429 if (*peel_iters_epilogue)
3430 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3431 retval += record_stmt_cost (epilogue_cost_vec,
3432 si->count * *peel_iters_epilogue,
3433 si->kind, si->stmt_info, si->misalign,
3434 vect_epilogue);
3435
3436 return retval;
3437 }
3438
3439 /* Function vect_estimate_min_profitable_iters
3440
3441 Return the number of iterations required for the vector version of the
3442 loop to be profitable relative to the cost of the scalar version of the
3443 loop.
3444
3445 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3446 of iterations for vectorization. -1 value means loop vectorization
3447 is not profitable. This returned value may be used for dynamic
3448 profitability check.
3449
3450 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3451 for static check against estimated number of iterations. */
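/* As a rough model (ignoring peeling, versioning and masking overhead),
   the vector loop becomes profitable once

     scalar_single_iter_cost * niters
       > vec_outside_cost + vec_inside_cost * (niters / vf)

   so the thresholds computed below are approximately the smallest NITERS
   for which that inequality holds, refined by the prologue/epilogue and
   versioning costs accounted for in this function.  */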
3452
3453 static void
3454 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3455 int *ret_min_profitable_niters,
3456 int *ret_min_profitable_estimate)
3457 {
3458 int min_profitable_iters;
3459 int min_profitable_estimate;
3460 int peel_iters_prologue;
3461 int peel_iters_epilogue;
3462 unsigned vec_inside_cost = 0;
3463 int vec_outside_cost = 0;
3464 unsigned vec_prologue_cost = 0;
3465 unsigned vec_epilogue_cost = 0;
3466 int scalar_single_iter_cost = 0;
3467 int scalar_outside_cost = 0;
3468 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3469 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3470 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3471
3472 /* Cost model disabled. */
3473 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3474 {
3475 if (dump_enabled_p ())
3476 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3477 *ret_min_profitable_niters = 0;
3478 *ret_min_profitable_estimate = 0;
3479 return;
3480 }
3481
3482 /* Requires loop versioning tests to handle misalignment. */
3483 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3484 {
3485 /* FIXME: Make cost depend on complexity of individual check. */
3486 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3487 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3488 vect_prologue);
3489 if (dump_enabled_p ())
3490 dump_printf (MSG_NOTE,
3491 "cost model: Adding cost of checks for loop "
3492 "versioning to treat misalignment.\n");
3493 }
3494
3495 /* Requires loop versioning with alias checks. */
3496 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3497 {
3498 /* FIXME: Make cost depend on complexity of individual check. */
3499 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3500 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3501 vect_prologue);
3502 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3503 if (len)
3504 /* Count LEN - 1 ANDs and LEN comparisons. */
3505 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3506 NULL, 0, vect_prologue);
3507 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3508 if (len)
3509 {
3510 /* Count LEN - 1 ANDs and LEN comparisons. */
3511 unsigned int nstmts = len * 2 - 1;
3512 /* +1 for each bias that needs adding. */
3513 for (unsigned int i = 0; i < len; ++i)
3514 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3515 nstmts += 1;
3516 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3517 NULL, 0, vect_prologue);
3518 }
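/* For example (hypothetical counts): with three address pairs in
   LOOP_VINFO_CHECK_UNEQUAL_ADDRS the code above records 3 * 2 - 1 = 5
   scalar stmts, and with three lower bounds of which two are signed it
   records 3 * 2 - 1 + 2 = 7 scalar stmts.  */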
3519 if (dump_enabled_p ())
3520 dump_printf (MSG_NOTE,
3521 "cost model: Adding cost of checks for loop "
3522 "versioning aliasing.\n");
3523 }
3524
3525 /* Requires loop versioning with niter checks. */
3526 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3527 {
3528 /* FIXME: Make cost depend on complexity of individual check. */
3529 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3530 vect_prologue);
3531 if (dump_enabled_p ())
3532 dump_printf (MSG_NOTE,
3533 "cost model: Adding cost of checks for loop "
3534 "versioning niters.\n");
3535 }
3536
3537 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3538 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3539 vect_prologue);
3540
3541 /* Count statements in the scalar loop. Use this as the scalar cost of a
3542 single iteration for now.
3543
3544 TODO: Add outer loop support.
3545
3546 TODO: Consider assigning different costs to different scalar
3547 statements. */
3548
3549 scalar_single_iter_cost
3550 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3551
3552 /* Add additional cost for the peeled instructions in prologue and epilogue
3553 loop. (For fully-masked loops there will be no peeling.)
3554
3555 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3556 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3557
3558 TODO: Build an expression that represents peel_iters for prologue and
3559 epilogue to be used in a run-time test. */
3560
3561 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3562 {
3563 peel_iters_prologue = 0;
3564 peel_iters_epilogue = 0;
3565
3566 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3567 {
3568 /* We need to peel exactly one iteration. */
3569 peel_iters_epilogue += 1;
3570 stmt_info_for_cost *si;
3571 int j;
3572 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3573 j, si)
3574 (void) add_stmt_cost (target_cost_data, si->count,
3575 si->kind, si->stmt_info, si->misalign,
3576 vect_epilogue);
3577 }
3578 }
3579 else if (npeel < 0)
3580 {
3581 peel_iters_prologue = assumed_vf / 2;
3582 if (dump_enabled_p ())
3583 dump_printf (MSG_NOTE, "cost model: "
3584 "prologue peel iters set to vf/2.\n");
3585
3586 /* If peeling for alignment is unknown, the loop bound of the main loop
3587 becomes unknown. */
3588 peel_iters_epilogue = assumed_vf / 2;
3589 if (dump_enabled_p ())
3590 dump_printf (MSG_NOTE, "cost model: "
3591 "epilogue peel iters set to vf/2 because "
3592 "peeling for alignment is unknown.\n");
3593
3594 /* If peeled iterations are unknown, count a taken branch and a not taken
3595 branch per peeled loop. Even if scalar loop iterations are known,
3596 vector iterations are not known since peeled prologue iterations are
3597 not known. Hence guards remain the same. */
3598 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3599 NULL, 0, vect_prologue);
3600 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3601 NULL, 0, vect_prologue);
3602 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3603 NULL, 0, vect_epilogue);
3604 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3605 NULL, 0, vect_epilogue);
3606 stmt_info_for_cost *si;
3607 int j;
3608 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3609 {
3610 (void) add_stmt_cost (target_cost_data,
3611 si->count * peel_iters_prologue,
3612 si->kind, si->stmt_info, si->misalign,
3613 vect_prologue);
3614 (void) add_stmt_cost (target_cost_data,
3615 si->count * peel_iters_epilogue,
3616 si->kind, si->stmt_info, si->misalign,
3617 vect_epilogue);
3618 }
3619 }
3620 else
3621 {
3622 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3623 stmt_info_for_cost *si;
3624 int j;
3625 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3626
3627 prologue_cost_vec.create (2);
3628 epilogue_cost_vec.create (2);
3629 peel_iters_prologue = npeel;
3630
3631 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3632 &peel_iters_epilogue,
3633 &LOOP_VINFO_SCALAR_ITERATION_COST
3634 (loop_vinfo),
3635 &prologue_cost_vec,
3636 &epilogue_cost_vec);
3637
3638 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3639 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3640 si->misalign, vect_prologue);
3641
3642 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3643 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3644 si->misalign, vect_epilogue);
3645
3646 prologue_cost_vec.release ();
3647 epilogue_cost_vec.release ();
3648 }
3649
3650 /* FORNOW: The scalar outside cost is incremented in one of the
3651 following ways:
3652
3653 1. The vectorizer checks for alignment and aliasing and generates
3654 a condition that allows dynamic vectorization. A cost model
3655 check is ANDed with the versioning condition. Hence the scalar code
3656 path now has the added cost of the versioning check.
3657
3658 if (cost > th & versioning_check)
3659 jmp to vector code
3660
3661 Hence run-time scalar is incremented by not-taken branch cost.
3662
3663 2. The vectorizer then checks if a prologue is required. If the
3664 cost model check was not done before during versioning, it has to
3665 be done before the prologue check.
3666
3667 if (cost <= th)
3668 prologue = scalar_iters
3669 if (prologue == 0)
3670 jmp to vector code
3671 else
3672 execute prologue
3673 if (prologue == num_iters)
3674 go to exit
3675
3676 Hence the run-time scalar cost is incremented by a taken branch,
3677 plus a not-taken branch, plus a taken branch cost.
3678
3679 3. The vectorizer then checks if an epilogue is required. If the
3680 cost model check was not done before during prologue check, it
3681 has to be done with the epilogue check.
3682
3683 if (prologue == 0)
3684 jmp to vector code
3685 else
3686 execute prologue
3687 if (prologue == num_iters)
3688 go to exit
3689 vector code:
3690 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3691 jmp to epilogue
3692
3693 Hence the run-time scalar cost should be incremented by 2 taken
3694 branches.
3695
3696 TODO: The back end may reorder the BBS's differently and reverse
3697 conditions/branch directions. Change the estimates below to
3698 something more reasonable. */
3699
3700 /* If the number of iterations is known and we do not do versioning, we can
3701 decide whether to vectorize at compile time. Hence the scalar version
3702 does not carry cost model guard costs. */
3703 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3704 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3705 {
3706 /* Cost model check occurs at versioning. */
3707 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3708 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3709 else
3710 {
3711 /* Cost model check occurs at prologue generation. */
3712 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3713 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3714 + vect_get_stmt_cost (cond_branch_not_taken);
3715 /* Cost model check occurs at epilogue generation. */
3716 else
3717 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3718 }
3719 }
3720
3721 /* Complete the target-specific cost calculations. */
3722 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3723 &vec_inside_cost, &vec_epilogue_cost);
3724
3725 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3726
3727 if (dump_enabled_p ())
3728 {
3729 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3730 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3731 vec_inside_cost);
3732 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3733 vec_prologue_cost);
3734 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3735 vec_epilogue_cost);
3736 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3737 scalar_single_iter_cost);
3738 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3739 scalar_outside_cost);
3740 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3741 vec_outside_cost);
3742 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3743 peel_iters_prologue);
3744 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3745 peel_iters_epilogue);
3746 }
3747
3748 /* Calculate number of iterations required to make the vector version
3749 profitable, relative to the loop bodies only. The following condition
3750 must hold true:
3751 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
3752 where
3753 SIC = scalar iteration cost, VIC = vector iteration cost,
3754 VOC = vector outside cost, VF = vectorization factor,
3755 NPEEL = prologue iterations + epilogue iterations,
3756 SOC = scalar outside cost for run time cost model check. */
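
/* A worked example with hypothetical costs: assume SIC = 4, VIC = 12,
   VF = 4, VOC = 20, SOC = 0 and NPEEL = 0. The per-vector-iteration
   saving below is 4 * 4 - 12 = 4 and the not-fully-masked branch computes
   (20 - 0) * 4 / 4 = 80 / 4 = 20, bumped to 21 because
   4 * 4 * 20 <= 12 * 20 + 20 * 4. Checking directly:
   4 * 21 = 84 > 12 * (21 / 4) + 20 = 80, while 20 scalar iterations
   would only break even.  */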
3757
3758 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
3759 - vec_inside_cost);
3760 if (saving_per_viter <= 0)
3761 {
3762 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3763 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3764 "vectorization did not happen for a simd loop");
3765
3766 if (dump_enabled_p ())
3767 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3768 "cost model: the vector iteration cost = %d "
3769 "divided by the scalar iteration cost = %d "
3770 "is greater or equal to the vectorization factor = %d"
3771 ".\n",
3772 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3773 *ret_min_profitable_niters = -1;
3774 *ret_min_profitable_estimate = -1;
3775 return;
3776 }
3777
3778 /* ??? The "if" arm is written to handle all cases; see below for what
3779 we would do for !LOOP_VINFO_FULLY_MASKED_P. */
3780 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3781 {
3782 /* Rewriting the condition above in terms of the number of
3783 vector iterations (vniters) rather than the number of
3784 scalar iterations (niters) gives:
3785
3786 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
3787
3788 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
3789
3790 For integer N, X and Y when X > 0:
3791
3792 N * X > Y <==> N >= (Y /[floor] X) + 1. */
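/* E.g. (hypothetical) X = 3, Y = 7: N * 3 > 7 holds exactly for N >= 3,
   and (7 / 3) + 1 = 2 + 1 = 3.  */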
3793 int outside_overhead = (vec_outside_cost
3794 - scalar_single_iter_cost * peel_iters_prologue
3795 - scalar_single_iter_cost * peel_iters_epilogue
3796 - scalar_outside_cost);
3797 /* We're only interested in cases that require at least one
3798 vector iteration. */
3799 int min_vec_niters = 1;
3800 if (outside_overhead > 0)
3801 min_vec_niters = outside_overhead / saving_per_viter + 1;
3802
3803 if (dump_enabled_p ())
3804 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
3805 min_vec_niters);
3806
3807 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3808 {
3809 /* Now that we know the minimum number of vector iterations,
3810 find the minimum niters for which the scalar cost is larger:
3811
3812 SIC * niters > VIC * vniters + VOC - SOC
3813
3814 We know that the minimum niters is no more than
3815 vniters * VF + NPEEL, but it might be (and often is) less
3816 than that if a partial vector iteration is cheaper than the
3817 equivalent scalar code. */
3818 int threshold = (vec_inside_cost * min_vec_niters
3819 + vec_outside_cost
3820 - scalar_outside_cost);
3821 if (threshold <= 0)
3822 min_profitable_iters = 1;
3823 else
3824 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
3825 }
3826 else
3827 /* Convert the number of vector iterations into a number of
3828 scalar iterations. */
3829 min_profitable_iters = (min_vec_niters * assumed_vf
3830 + peel_iters_prologue
3831 + peel_iters_epilogue);
3832 }
3833 else
3834 {
3835 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3836 * assumed_vf
3837 - vec_inside_cost * peel_iters_prologue
3838 - vec_inside_cost * peel_iters_epilogue);
3839 if (min_profitable_iters <= 0)
3840 min_profitable_iters = 0;
3841 else
3842 {
3843 min_profitable_iters /= saving_per_viter;
3844
3845 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3846 <= (((int) vec_inside_cost * min_profitable_iters)
3847 + (((int) vec_outside_cost - scalar_outside_cost)
3848 * assumed_vf)))
3849 min_profitable_iters++;
3850 }
3851 }
3852
3853 if (dump_enabled_p ())
3854 dump_printf (MSG_NOTE,
3855 " Calculated minimum iters for profitability: %d\n",
3856 min_profitable_iters);
3857
3858 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3859 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3860 /* We want the vectorized loop to execute at least once. */
3861 min_profitable_iters = assumed_vf + peel_iters_prologue;
3862
3863 if (dump_enabled_p ())
3864 dump_printf_loc (MSG_NOTE, vect_location,
3865 " Runtime profitability threshold = %d\n",
3866 min_profitable_iters);
3867
3868 *ret_min_profitable_niters = min_profitable_iters;
3869
3870 /* Calculate number of iterations required to make the vector version
3871 profitable, relative to the loop bodies only.
3872
3873 The non-vectorized variant costs SIC * niters and must win over the vector
3874 variant for the expected loop trip count. The following condition must hold:
3875 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
3876
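/* Taking the same hypothetical costs as in the example above but with
   SOC = 2, the not-fully-masked branch below evaluates to
   ((20 + 2) * 4 - 0 - 0) / (4 * 4 - 12) = 88 / 4 = 22, which is then
   raised to at least the runtime threshold computed above.  */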
3877 if (vec_outside_cost <= 0)
3878 min_profitable_estimate = 0;
3879 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3880 {
3881 /* This is a repeat of the code above, but with + SOC rather
3882 than - SOC. */
3883 int outside_overhead = (vec_outside_cost
3884 - scalar_single_iter_cost * peel_iters_prologue
3885 - scalar_single_iter_cost * peel_iters_epilogue
3886 + scalar_outside_cost);
3887 int min_vec_niters = 1;
3888 if (outside_overhead > 0)
3889 min_vec_niters = outside_overhead / saving_per_viter + 1;
3890
3891 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3892 {
3893 int threshold = (vec_inside_cost * min_vec_niters
3894 + vec_outside_cost
3895 + scalar_outside_cost);
3896 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
3897 }
3898 else
3899 min_profitable_estimate = (min_vec_niters * assumed_vf
3900 + peel_iters_prologue
3901 + peel_iters_epilogue);
3902 }
3903 else
3904 {
3905 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3906 * assumed_vf
3907 - vec_inside_cost * peel_iters_prologue
3908 - vec_inside_cost * peel_iters_epilogue)
3909 / ((scalar_single_iter_cost * assumed_vf)
3910 - vec_inside_cost);
3911 }
3912 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3913 if (dump_enabled_p ())
3914 dump_printf_loc (MSG_NOTE, vect_location,
3915 " Static estimate profitability threshold = %d\n",
3916 min_profitable_estimate);
3917
3918 *ret_min_profitable_estimate = min_profitable_estimate;
3919 }
3920
3921 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3922 vector elements (not bits) for a vector with NELT elements. */
3923 static void
3924 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3925 vec_perm_builder *sel)
3926 {
3927 /* The encoding is a single stepped pattern. Any wrap-around is handled
3928 by vec_perm_indices. */
3929 sel->new_vector (nelt, 1, 3);
3930 for (unsigned int i = 0; i < 3; i++)
3931 sel->quick_push (i + offset);
3932 }
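
/* For example (hypothetical OFFSET = 2, NELT = 8) the stepped selector is
   {2, 3, 4, ...}, which vec_perm_indices expands to {2, 3, 4, 5, 6, 7, 8, 9};
   indices >= NELT pick elements of the second permute operand, so the permute
   behaves as a whole-vector shift by two elements.  */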
3933
3934 /* Checks whether the target supports whole-vector shifts for vectors of mode
3935 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3936 it supports vec_perm_const with masks for all necessary shift amounts. */
3937 static bool
3938 have_whole_vector_shift (machine_mode mode)
3939 {
3940 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3941 return true;
3942
3943 /* Variable-length vectors should be handled via the optab. */
3944 unsigned int nelt;
3945 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3946 return false;
3947
3948 vec_perm_builder sel;
3949 vec_perm_indices indices;
3950 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3951 {
3952 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3953 indices.new_vector (sel, 2, nelt);
3954 if (!can_vec_perm_const_p (mode, indices, false))
3955 return false;
3956 }
3957 return true;
3958 }
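
/* For a constant NELT of, say, 8 the loop above checks the shift amounts
   4, 2 and 1, which are the element counts used by the shift-based
   reduction epilogue generated later.  */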
3959
3960 /* TODO: vect_model_*_cost and the corresponding vectorizable_* functions
3961 are tightly coupled. Design this better to avoid maintenance issues. */
3962
3963 /* Function vect_model_reduction_cost.
3964
3965 Models cost for a reduction operation, including the vector ops
3966 generated within the strip-mine loop, the initial definition before
3967 the loop, and the epilogue code that must be generated. */
3968
3969 static void
3970 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3971 int ncopies, stmt_vector_for_cost *cost_vec)
3972 {
3973 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3974 enum tree_code code;
3975 optab optab;
3976 tree vectype;
3977 machine_mode mode;
3978 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3979 struct loop *loop = NULL;
3980
3981 if (loop_vinfo)
3982 loop = LOOP_VINFO_LOOP (loop_vinfo);
3983
3984 /* Condition reductions generate two reductions in the loop. */
3985 vect_reduction_type reduction_type
3986 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3987 if (reduction_type == COND_REDUCTION)
3988 ncopies *= 2;
3989
3990 vectype = STMT_VINFO_VECTYPE (stmt_info);
3991 mode = TYPE_MODE (vectype);
3992 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3993
3994 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3995
3996 if (reduction_type == EXTRACT_LAST_REDUCTION
3997 || reduction_type == FOLD_LEFT_REDUCTION)
3998 {
3999 /* No extra instructions needed in the prologue. */
4000 prologue_cost = 0;
4001
4002 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
4003 /* Count one reduction-like operation per vector. */
4004 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
4005 stmt_info, 0, vect_body);
4006 else
4007 {
4008 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4009 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4010 inside_cost = record_stmt_cost (cost_vec, nelements,
4011 vec_to_scalar, stmt_info, 0,
4012 vect_body);
4013 inside_cost += record_stmt_cost (cost_vec, nelements,
4014 scalar_stmt, stmt_info, 0,
4015 vect_body);
4016 }
4017 }
4018 else
4019 {
4020 /* Add in cost for initial definition.
4021 For cond reduction we have four vectors: initial index, step,
4022 initial result of the data reduction, initial value of the index
4023 reduction. */
4024 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4025 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
4026 scalar_to_vec, stmt_info, 0,
4027 vect_prologue);
4028
4029 /* Cost of reduction op inside loop. */
4030 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4031 stmt_info, 0, vect_body);
4032 }
4033
4034 /* Determine cost of epilogue code.
4035
4036 We have a reduction operator that will reduce the vector in one statement.
4037 Also requires scalar extract. */
4038
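/* As a worked example (hypothetical vector of 8 elements, REDUC_FN not
   available and not a COND_REDUCTION): with a whole-vector shift the
   path below records exact_log2 (8) * 2 = 6 vector stmts plus one
   vec_to_scalar; the fallback records 8 + 8 - 1 = 15 stmts.  */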
4039 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
4040 {
4041 if (reduc_fn != IFN_LAST)
4042 {
4043 if (reduction_type == COND_REDUCTION)
4044 {
4045 /* An EQ stmt and a COND_EXPR stmt. */
4046 epilogue_cost += record_stmt_cost (cost_vec, 2,
4047 vector_stmt, stmt_info, 0,
4048 vect_epilogue);
4049 /* Reduction of the max index and a reduction of the found
4050 values. */
4051 epilogue_cost += record_stmt_cost (cost_vec, 2,
4052 vec_to_scalar, stmt_info, 0,
4053 vect_epilogue);
4054 /* A broadcast of the max value. */
4055 epilogue_cost += record_stmt_cost (cost_vec, 1,
4056 scalar_to_vec, stmt_info, 0,
4057 vect_epilogue);
4058 }
4059 else
4060 {
4061 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
4062 stmt_info, 0, vect_epilogue);
4063 epilogue_cost += record_stmt_cost (cost_vec, 1,
4064 vec_to_scalar, stmt_info, 0,
4065 vect_epilogue);
4066 }
4067 }
4068 else if (reduction_type == COND_REDUCTION)
4069 {
4070 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4071 /* Extraction of scalar elements. */
4072 epilogue_cost += record_stmt_cost (cost_vec,
4073 2 * estimated_nunits,
4074 vec_to_scalar, stmt_info, 0,
4075 vect_epilogue);
4076 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4077 epilogue_cost += record_stmt_cost (cost_vec,
4078 2 * estimated_nunits - 3,
4079 scalar_stmt, stmt_info, 0,
4080 vect_epilogue);
4081 }
4082 else if (reduction_type == EXTRACT_LAST_REDUCTION
4083 || reduction_type == FOLD_LEFT_REDUCTION)
4084 /* No extra instructions are needed in the epilogue. */
4085 ;
4086 else
4087 {
4088 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4089 tree bitsize =
4090 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
4091 int element_bitsize = tree_to_uhwi (bitsize);
4092 int nelements = vec_size_in_bits / element_bitsize;
4093
4094 if (code == COND_EXPR)
4095 code = MAX_EXPR;
4096
4097 optab = optab_for_tree_code (code, vectype, optab_default);
4098
4099 /* We have a whole vector shift available. */
4100 if (optab != unknown_optab
4101 && VECTOR_MODE_P (mode)
4102 && optab_handler (optab, mode) != CODE_FOR_nothing
4103 && have_whole_vector_shift (mode))
4104 {
4105 /* Final reduction via vector shifts and the reduction operator.
4106 Also requires scalar extract. */
4107 epilogue_cost += record_stmt_cost (cost_vec,
4108 exact_log2 (nelements) * 2,
4109 vector_stmt, stmt_info, 0,
4110 vect_epilogue);
4111 epilogue_cost += record_stmt_cost (cost_vec, 1,
4112 vec_to_scalar, stmt_info, 0,
4113 vect_epilogue);
4114 }
4115 else
4116 /* Use extracts and reduction op for final reduction. For N
4117 elements, we have N extracts and N-1 reduction ops. */
4118 epilogue_cost += record_stmt_cost (cost_vec,
4119 nelements + nelements - 1,
4120 vector_stmt, stmt_info, 0,
4121 vect_epilogue);
4122 }
4123 }
4124
4125 if (dump_enabled_p ())
4126 dump_printf (MSG_NOTE,
4127 "vect_model_reduction_cost: inside_cost = %d, "
4128 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4129 prologue_cost, epilogue_cost);
4130 }
4131
4132
4133 /* Function vect_model_induction_cost.
4134
4135 Models cost for induction operations. */
4136
4137 static void
4138 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4139 stmt_vector_for_cost *cost_vec)
4140 {
4141 unsigned inside_cost, prologue_cost;
4142
4143 if (PURE_SLP_STMT (stmt_info))
4144 return;
4145
4146 /* loop cost for vec_loop. */
4147 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4148 stmt_info, 0, vect_body);
4149
4150 /* prologue cost for vec_init and vec_step. */
4151 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4152 stmt_info, 0, vect_prologue);
4153
4154 if (dump_enabled_p ())
4155 dump_printf_loc (MSG_NOTE, vect_location,
4156 "vect_model_induction_cost: inside_cost = %d, "
4157 "prologue_cost = %d .\n", inside_cost, prologue_cost);
4158 }
4159
4160
4161
4162 /* Function get_initial_def_for_reduction
4163
4164 Input:
4165 STMT_VINFO - a stmt that performs a reduction operation in the loop.
4166 INIT_VAL - the initial value of the reduction variable
4167
4168 Output:
4169 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4170 of the reduction (used for adjusting the epilog - see below).
4171 Return a vector variable, initialized according to the operation that
4172 STMT_VINFO performs. This vector will be used as the initial value
4173 of the vector of partial results.
4174
4175 Option1 (adjust in epilog): Initialize the vector as follows:
4176 add/bit or/xor: [0,0,...,0,0]
4177 mult/bit and: [1,1,...,1,1]
4178 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4179 and when necessary (e.g. add/mult case) let the caller know
4180 that it needs to adjust the result by init_val.
4181
4182 Option2: Initialize the vector as follows:
4183 add/bit or/xor: [init_val,0,0,...,0]
4184 mult/bit and: [init_val,1,1,...,1]
4185 min/max/cond_expr: [init_val,init_val,...,init_val]
4186 and no adjustments are needed.
4187
4188 For example, for the following code:
4189
4190 s = init_val;
4191 for (i=0;i<n;i++)
4192 s = s + a[i];
4193
4194 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4195 For a vector of 4 units, we want to return either [0,0,0,init_val],
4196 or [0,0,0,0] and let the caller know that it needs to adjust
4197 the result at the end by 'init_val'.
4198
4199 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
4200 is not NULL, because the initialization vector is then simpler (the same
4201 element in all entries), and Option2 otherwise.
4202
4203 A cost model should help decide between these two schemes. */
4204
4205 tree
4206 get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, tree init_val,
4207 tree *adjustment_def)
4208 {
4209 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4210 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4211 tree scalar_type = TREE_TYPE (init_val);
4212 tree vectype = get_vectype_for_scalar_type (scalar_type);
4213 enum tree_code code = gimple_assign_rhs_code (stmt_vinfo->stmt);
4214 tree def_for_init;
4215 tree init_def;
4216 REAL_VALUE_TYPE real_init_val = dconst0;
4217 int int_init_val = 0;
4218 gimple_seq stmts = NULL;
4219
4220 gcc_assert (vectype);
4221
4222 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4223 || SCALAR_FLOAT_TYPE_P (scalar_type));
4224
4225 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4226 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4227
4228 vect_reduction_type reduction_type
4229 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4230
4231 switch (code)
4232 {
4233 case WIDEN_SUM_EXPR:
4234 case DOT_PROD_EXPR:
4235 case SAD_EXPR:
4236 case PLUS_EXPR:
4237 case MINUS_EXPR:
4238 case BIT_IOR_EXPR:
4239 case BIT_XOR_EXPR:
4240 case MULT_EXPR:
4241 case BIT_AND_EXPR:
4242 {
4243 /* ADJUSTMENT_DEF is NULL when called from
4244 vect_create_epilog_for_reduction to vectorize double reduction. */
4245 if (adjustment_def)
4246 *adjustment_def = init_val;
4247
4248 if (code == MULT_EXPR)
4249 {
4250 real_init_val = dconst1;
4251 int_init_val = 1;
4252 }
4253
4254 if (code == BIT_AND_EXPR)
4255 int_init_val = -1;
4256
4257 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4258 def_for_init = build_real (scalar_type, real_init_val);
4259 else
4260 def_for_init = build_int_cst (scalar_type, int_init_val);
4261
4262 if (adjustment_def)
4263 /* Option1: the first element is '0' or '1' as well. */
4264 init_def = gimple_build_vector_from_val (&stmts, vectype,
4265 def_for_init);
4266 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4267 {
4268 /* Option2 (variable length): the first element is INIT_VAL. */
4269 init_def = gimple_build_vector_from_val (&stmts, vectype,
4270 def_for_init);
4271 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4272 vectype, init_def, init_val);
4273 }
4274 else
4275 {
4276 /* Option2: the first element is INIT_VAL. */
4277 tree_vector_builder elts (vectype, 1, 2);
4278 elts.quick_push (init_val);
4279 elts.quick_push (def_for_init);
4280 init_def = gimple_build_vector (&stmts, &elts);
4281 }
4282 }
4283 break;
4284
4285 case MIN_EXPR:
4286 case MAX_EXPR:
4287 case COND_EXPR:
4288 {
4289 if (adjustment_def)
4290 {
4291 *adjustment_def = NULL_TREE;
4292 if (reduction_type != COND_REDUCTION
4293 && reduction_type != EXTRACT_LAST_REDUCTION)
4294 {
4295 init_def = vect_get_vec_def_for_operand (init_val, stmt_vinfo);
4296 break;
4297 }
4298 }
4299 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4300 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4301 }
4302 break;
4303
4304 default:
4305 gcc_unreachable ();
4306 }
4307
4308 if (stmts)
4309 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4310 return init_def;
4311 }
4312
4313 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4314 NUMBER_OF_VECTORS is the number of vector defs to create.
4315 If NEUTRAL_OP is nonnull, introducing extra elements of that
4316 value will not change the result. */
4317
4318 static void
4319 get_initial_defs_for_reduction (slp_tree slp_node,
4320 vec<tree> *vec_oprnds,
4321 unsigned int number_of_vectors,
4322 bool reduc_chain, tree neutral_op)
4323 {
4324 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4325 stmt_vec_info stmt_vinfo = stmts[0];
4326 unsigned HOST_WIDE_INT nunits;
4327 unsigned j, number_of_places_left_in_vector;
4328 tree vector_type;
4329 unsigned int group_size = stmts.length ();
4330 unsigned int i;
4331 struct loop *loop;
4332
4333 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4334
4335 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4336
4337 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4338 gcc_assert (loop);
4339 edge pe = loop_preheader_edge (loop);
4340
4341 gcc_assert (!reduc_chain || neutral_op);
4342
4343 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4344 created vectors. It is greater than 1 if unrolling is performed.
4345
4346 For example, we have two scalar operands, s1 and s2 (e.g., group of
4347 strided accesses of size two), while NUNITS is four (i.e., four scalars
4348 of this type can be packed in a vector). The output vector will contain
4349 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4350 will be 2).
4351
4352 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4353 vectors containing the operands.
4354
4355 For example, NUNITS is four as before, and the group size is 8
4356 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4357 {s5, s6, s7, s8}. */
4358
4359 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4360 nunits = group_size;
4361
4362 number_of_places_left_in_vector = nunits;
4363 bool constant_p = true;
4364 tree_vector_builder elts (vector_type, nunits, 1);
4365 elts.quick_grow (nunits);
4366 gimple_seq ctor_seq = NULL;
4367 for (j = 0; j < nunits * number_of_vectors; ++j)
4368 {
4369 tree op;
4370 i = j % group_size;
4371 stmt_vinfo = stmts[i];
4372
4373 /* Get the def before the loop. In a reduction chain we have only one
4374 initial value; otherwise we have as many initial values as PHIs in the group. */
4375 if (reduc_chain)
4376 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4377 else if (((vec_oprnds->length () + 1) * nunits
4378 - number_of_places_left_in_vector >= group_size)
4379 && neutral_op)
4380 op = neutral_op;
4381 else
4382 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4383
4384 /* Create 'vect_ = {op0,op1,...,opn}'. */
4385 number_of_places_left_in_vector--;
4386 elts[nunits - number_of_places_left_in_vector - 1] = op;
4387 if (!CONSTANT_CLASS_P (op))
4388 constant_p = false;
4389
4390 if (number_of_places_left_in_vector == 0)
4391 {
4392 tree init;
4393 if (constant_p && !neutral_op
4394 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4395 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4396 /* Build the vector directly from ELTS. */
4397 init = gimple_build_vector (&ctor_seq, &elts);
4398 else if (neutral_op)
4399 {
4400 /* Build a vector of the neutral value and shift the
4401 other elements into place. */
4402 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4403 neutral_op);
4404 int k = nunits;
4405 while (k > 0 && elts[k - 1] == neutral_op)
4406 k -= 1;
4407 while (k > 0)
4408 {
4409 k -= 1;
4410 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4411 vector_type, init, elts[k]);
4412 }
4413 }
4414 else
4415 {
4416 /* First time round, duplicate ELTS to fill the
4417 required number of vectors. */
4418 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4419 number_of_vectors, *vec_oprnds);
4420 break;
4421 }
4422 vec_oprnds->quick_push (init);
4423
4424 number_of_places_left_in_vector = nunits;
4425 elts.new_vector (vector_type, nunits, 1);
4426 elts.quick_grow (nunits);
4427 constant_p = true;
4428 }
4429 }
4430 if (ctor_seq != NULL)
4431 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4432 }
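
/* A worked example of the above (hypothetical): an SLP reduction of two
   plus-accumulators with initial values s1 and s2, NEUTRAL_OP = 0,
   REDUC_CHAIN false, a four-element vector type and one vector to create.
   The loop takes s1 and s2 from the PHIs and switches to the neutral value
   once every scalar accumulator is represented, giving the single initial
   def {s1, s2, 0, 0}.  */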
4433
4434
4435 /* Function vect_create_epilog_for_reduction
4436
4437 Create code at the loop-epilog to finalize the result of a reduction
4438 computation.
4439
4440 VECT_DEFS is the list of vectors of partial results, i.e., the lhs's of the
4441 vector reduction statements.
4442 STMT_INFO is the scalar reduction stmt that is being vectorized.
4443 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4444 number of elements that we can fit in a vectype (nunits). In this case
4445 we have to generate more than one vector stmt - i.e - we need to "unroll"
4446 the vector stmt by a factor VF/nunits. For more details see documentation
4447 in vectorizable_operation.
4448 REDUC_FN is the internal function for the epilog reduction.
4449 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4450 computation.
4451 REDUC_INDEX is the index of the operand in the right hand side of the
4452 statement that is defined by REDUCTION_PHI.
4453 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4454 SLP_NODE is an SLP node containing a group of reduction statements. The
4455 first one in this group is STMT_INFO.
4456 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4457 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4458 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4459 any value of the IV in the loop.
4460 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4461 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4462 null if this is not an SLP reduction.
4463
4464 This function:
4465 1. Creates the reduction def-use cycles: sets the arguments for
4466 REDUCTION_PHIS:
4467 The loop-entry argument is the vectorized initial-value of the reduction.
4468 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4469 sums.
4470 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4471 by calling the function specified by REDUC_FN if available, or by
4472 other means (whole-vector shifts or a scalar loop).
4473 The function also creates a new phi node at the loop exit to preserve
4474 loop-closed form, as illustrated below.
4475
4476 The flow at the entry to this function:
4477
4478 loop:
4479 vec_def = phi <null, null> # REDUCTION_PHI
4480 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4481 s_loop = scalar_stmt # (scalar) STMT_INFO
4482 loop_exit:
4483 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4484 use <s_out0>
4485 use <s_out0>
4486
4487 The above is transformed by this function into:
4488
4489 loop:
4490 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4491 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4492 s_loop = scalar_stmt # (scalar) STMT_INFO
4493 loop_exit:
4494 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4495 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4496 v_out2 = reduce <v_out1>
4497 s_out3 = extract_field <v_out2, 0>
4498 s_out4 = adjust_result <s_out3>
4499 use <s_out4>
4500 use <s_out4>
4501 */
4502
4503 static void
4504 vect_create_epilog_for_reduction (vec<tree> vect_defs,
4505 stmt_vec_info stmt_info,
4506 gimple *reduc_def_stmt,
4507 int ncopies, internal_fn reduc_fn,
4508 vec<stmt_vec_info> reduction_phis,
4509 bool double_reduc,
4510 slp_tree slp_node,
4511 slp_instance slp_node_instance,
4512 tree induc_val, enum tree_code induc_code,
4513 tree neutral_op)
4514 {
4515 stmt_vec_info prev_phi_info;
4516 tree vectype;
4517 machine_mode mode;
4518 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4519 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4520 basic_block exit_bb;
4521 tree scalar_dest;
4522 tree scalar_type;
4523 gimple *new_phi = NULL, *phi;
4524 stmt_vec_info phi_info;
4525 gimple_stmt_iterator exit_gsi;
4526 tree vec_dest;
4527 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4528 gimple *epilog_stmt = NULL;
4529 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
4530 gimple *exit_phi;
4531 tree bitsize;
4532 tree adjustment_def = NULL;
4533 tree vec_initial_def = NULL;
4534 tree expr, def, initial_def = NULL;
4535 tree orig_name, scalar_result;
4536 imm_use_iterator imm_iter, phi_imm_iter;
4537 use_operand_p use_p, phi_use_p;
4538 gimple *use_stmt;
4539 stmt_vec_info reduction_phi_info = NULL;
4540 bool nested_in_vect_loop = false;
4541 auto_vec<gimple *> new_phis;
4542 auto_vec<stmt_vec_info> inner_phis;
4543 int j, i;
4544 auto_vec<tree> scalar_results;
4545 unsigned int group_size = 1, k, ratio;
4546 auto_vec<tree> vec_initial_defs;
4547 auto_vec<gimple *> phis;
4548 bool slp_reduc = false;
4549 bool direct_slp_reduc;
4550 tree new_phi_result;
4551 stmt_vec_info inner_phi = NULL;
4552 tree induction_index = NULL_TREE;
4553
4554 if (slp_node)
4555 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4556
4557 if (nested_in_vect_loop_p (loop, stmt_info))
4558 {
4559 outer_loop = loop;
4560 loop = loop->inner;
4561 nested_in_vect_loop = true;
4562 gcc_assert (!slp_node);
4563 }
4564
4565 vectype = STMT_VINFO_VECTYPE (stmt_info);
4566 gcc_assert (vectype);
4567 mode = TYPE_MODE (vectype);
4568
4569 /* 1. Create the reduction def-use cycle:
4570 Set the arguments of REDUCTION_PHIS, i.e., transform
4571
4572 loop:
4573 vec_def = phi <null, null> # REDUCTION_PHI
4574 VECT_DEF = vector_stmt # vectorized form of STMT
4575 ...
4576
4577 into:
4578
4579 loop:
4580 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4581 VECT_DEF = vector_stmt # vectorized form of STMT
4582 ...
4583
4584 (in case of SLP, do it for all the phis). */
4585
4586 /* Get the loop-entry arguments. */
4587 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4588 if (slp_node)
4589 {
4590 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4591 vec_initial_defs.reserve (vec_num);
4592 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4593 &vec_initial_defs, vec_num,
4594 REDUC_GROUP_FIRST_ELEMENT (stmt_info),
4595 neutral_op);
4596 }
4597 else
4598 {
4599 /* Get the scalar def before the loop that defines the initial value
4600 of the reduction variable. */
4601 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4602 loop_preheader_edge (loop));
4603 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4604 and we can't use zero for induc_val, use initial_def. Similarly
4605 for REDUC_MIN and initial_def larger than the base. */
4606 if (TREE_CODE (initial_def) == INTEGER_CST
4607 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4608 == INTEGER_INDUC_COND_REDUCTION)
4609 && !integer_zerop (induc_val)
4610 && ((induc_code == MAX_EXPR
4611 && tree_int_cst_lt (initial_def, induc_val))
4612 || (induc_code == MIN_EXPR
4613 && tree_int_cst_lt (induc_val, initial_def))))
4614 induc_val = initial_def;
4615
4616 if (double_reduc)
4617 /* In case of double reduction we only create a vector variable
4618 to be put in the reduction phi node. The actual statement
4619 creation is done later in this function. */
4620 vec_initial_def = vect_create_destination_var (initial_def, vectype);
4621 else if (nested_in_vect_loop)
4622 {
4623 /* Do not use an adjustment def as that case is not supported
4624 correctly if ncopies is not one. */
4625 vect_is_simple_use (initial_def, loop_vinfo, &initial_def_dt);
4626 vec_initial_def = vect_get_vec_def_for_operand (initial_def,
4627 stmt_info);
4628 }
4629 else
4630 vec_initial_def
4631 = get_initial_def_for_reduction (stmt_info, initial_def,
4632 &adjustment_def);
4633 vec_initial_defs.create (1);
4634 vec_initial_defs.quick_push (vec_initial_def);
4635 }
4636
4637 /* Set phi nodes arguments. */
4638 FOR_EACH_VEC_ELT (reduction_phis, i, phi_info)
4639 {
4640 tree vec_init_def = vec_initial_defs[i];
4641 tree def = vect_defs[i];
4642 for (j = 0; j < ncopies; j++)
4643 {
4644 if (j != 0)
4645 {
4646 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4647 if (nested_in_vect_loop)
4648 vec_init_def
4649 = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_init_def);
4650 }
4651
4652 /* Set the loop-entry arg of the reduction-phi. */
4653
4654 gphi *phi = as_a <gphi *> (phi_info->stmt);
4655 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4656 == INTEGER_INDUC_COND_REDUCTION)
4657 {
4658 /* Initialise the reduction phi to zero. This prevents non-zero
4659 initial values from interfering with the reduction op. */
4660 gcc_assert (ncopies == 1);
4661 gcc_assert (i == 0);
4662
4663 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4664 tree induc_val_vec
4665 = build_vector_from_val (vec_init_def_type, induc_val);
4666
4667 add_phi_arg (phi, induc_val_vec, loop_preheader_edge (loop),
4668 UNKNOWN_LOCATION);
4669 }
4670 else
4671 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
4672 UNKNOWN_LOCATION);
4673
4674 /* Set the loop-latch arg for the reduction-phi. */
4675 if (j > 0)
4676 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4677
4678 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
4679
4680 if (dump_enabled_p ())
4681 dump_printf_loc (MSG_NOTE, vect_location,
4682 "transform reduction: created def-use cycle: %G%G",
4683 phi, SSA_NAME_DEF_STMT (def));
4684 }
4685 }
4686
4687 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4688 which is updated with the current index of the loop for every match of
4689 the original loop's cond_expr (VEC_STMT). This results in a vector
4690 containing the last time the condition passed for that vector lane.
4691 The first match will be a 1 to allow 0 to be used for non-matching
4692 indexes. If there are no matches at all then the vector will be all
4693 zeroes. */
4694 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4695 {
4696 tree indx_before_incr, indx_after_incr;
4697 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4698
4699 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info)->stmt;
4700 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4701
4702 int scalar_precision
4703 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4704 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4705 tree cr_index_vector_type = build_vector_type
4706 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4707
4708 /* First we create a simple vector induction variable which starts
4709 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4710 vector size (STEP). */
4711
4712 /* Create a {1,2,3,...} vector. */
4713 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4714
4715 /* Create a vector of the step value. */
4716 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4717 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4718
4719 /* Create an induction variable. */
4720 gimple_stmt_iterator incr_gsi;
4721 bool insert_after;
4722 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4723 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4724 insert_after, &indx_before_incr, &indx_after_incr);
4725
4726 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4727 filled with zeros (VEC_ZERO). */
4728
4729 /* Create a vector of 0s. */
4730 tree zero = build_zero_cst (cr_index_scalar_type);
4731 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4732
4733 /* Create a vector phi node. */
4734 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4735 new_phi = create_phi_node (new_phi_tree, loop->header);
4736 loop_vinfo->add_stmt (new_phi);
4737 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4738 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4739
4740 /* Now take the condition from the loop's original cond_expr
4741 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4742 every match uses values from the induction variable
4743 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4744 (NEW_PHI_TREE).
4745 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4746 the new cond_expr (INDEX_COND_EXPR). */
4747
4748 /* Duplicate the condition from vec_stmt. */
4749 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4750
4751 /* Create a conditional, where the condition is taken from vec_stmt
4752 (CCOMPARE), the "then" value is the induction index (INDEX_BEFORE_INCR)
4753 and the "else" value is the phi (NEW_PHI_TREE). */
4754 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4755 ccompare, indx_before_incr,
4756 new_phi_tree);
4757 induction_index = make_ssa_name (cr_index_vector_type);
4758 gimple *index_condition = gimple_build_assign (induction_index,
4759 index_cond_expr);
4760 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4761 stmt_vec_info index_vec_info = loop_vinfo->add_stmt (index_condition);
4762 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4763
4764 /* Update the phi with the vec cond. */
4765 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4766 loop_latch_edge (loop), UNKNOWN_LOCATION);
4767 }
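
/* For a hypothetical four-lane vector the induction index above starts as
   {1, 2, 3, 4} and is incremented by 4 every vector iteration, so a lane
   whose condition last matched in (zero-based) vector iteration K holds
   4 * K + lane + 1 afterwards, and a lane that never matched holds 0.  */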
4768
4769 /* 2. Create epilog code.
4770 The reduction epilog code operates across the elements of the vector
4771 of partial results computed by the vectorized loop.
4772 The reduction epilog code consists of:
4773
4774 step 1: compute the scalar result in a vector (v_out2)
4775 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4776 step 3: adjust the scalar result (s_out3) if needed.
4777
4778 Step 1 can be accomplished using one of the following three schemes:
4779 (scheme 1) using reduc_fn, if available.
4780 (scheme 2) using whole-vector shifts, if available.
4781 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4782 combined.
4783
4784 The overall epilog code looks like this:
4785
4786 s_out0 = phi <s_loop> # original EXIT_PHI
4787 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4788 v_out2 = reduce <v_out1> # step 1
4789 s_out3 = extract_field <v_out2, 0> # step 2
4790 s_out4 = adjust_result <s_out3> # step 3
4791
4792 (step 3 is optional, and steps 1 and 2 may be combined).
4793 Lastly, the uses of s_out0 are replaced by s_out4. */
4794
4795
4796 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4797 v_out1 = phi <VECT_DEF>
4798 Store them in NEW_PHIS. */
4799
4800 exit_bb = single_exit (loop)->dest;
4801 prev_phi_info = NULL;
4802 new_phis.create (vect_defs.length ());
4803 FOR_EACH_VEC_ELT (vect_defs, i, def)
4804 {
4805 for (j = 0; j < ncopies; j++)
4806 {
4807 tree new_def = copy_ssa_name (def);
4808 phi = create_phi_node (new_def, exit_bb);
4809 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4810 if (j == 0)
4811 new_phis.quick_push (phi);
4812 else
4813 {
4814 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4815 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4816 }
4817
4818 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4819 prev_phi_info = phi_info;
4820 }
4821 }
4822
4823 /* The epilogue is created for the outer-loop, i.e., for the loop being
4824 vectorized. Create exit phis for the outer loop. */
4825 if (double_reduc)
4826 {
4827 loop = outer_loop;
4828 exit_bb = single_exit (loop)->dest;
4829 inner_phis.create (vect_defs.length ());
4830 FOR_EACH_VEC_ELT (new_phis, i, phi)
4831 {
4832 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
4833 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4834 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4835 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4836 PHI_RESULT (phi));
4837 prev_phi_info = loop_vinfo->add_stmt (outer_phi);
4838 inner_phis.quick_push (phi_info);
4839 new_phis[i] = outer_phi;
4840 while (STMT_VINFO_RELATED_STMT (phi_info))
4841 {
4842 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4843 new_result = copy_ssa_name (PHI_RESULT (phi_info->stmt));
4844 outer_phi = create_phi_node (new_result, exit_bb);
4845 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4846 PHI_RESULT (phi_info->stmt));
4847 stmt_vec_info outer_phi_info = loop_vinfo->add_stmt (outer_phi);
4848 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi_info;
4849 prev_phi_info = outer_phi_info;
4850 }
4851 }
4852 }
4853
4854 exit_gsi = gsi_after_labels (exit_bb);
4855
4856 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4857 (i.e. when reduc_fn is not available) and in the final adjustment
4858 code (if needed). Also get the original scalar reduction variable as
4859 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4860 represents a reduction pattern), the tree-code and scalar-def are
4861 taken from the original stmt that the pattern-stmt (STMT) replaces.
4862 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4863 are taken from STMT. */
4864
4865 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4866 if (orig_stmt_info != stmt_info)
4867 {
4868 /* Reduction pattern */
4869 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4870 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4871 }
4872
4873 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4874 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4875 partial results are added and not subtracted. */
4876 if (code == MINUS_EXPR)
4877 code = PLUS_EXPR;
4878
4879 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4880 scalar_type = TREE_TYPE (scalar_dest);
4881 scalar_results.create (group_size);
4882 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4883 bitsize = TYPE_SIZE (scalar_type);
4884
4885 /* In case this is a reduction in an inner-loop while vectorizing an outer
4886 loop - we don't need to extract a single scalar result at the end of the
4887 inner-loop (unless it is double reduction, i.e., the use of reduction is
4888 outside the outer-loop). The final vector of partial results will be used
4889 in the vectorized outer-loop, or reduced to a scalar result at the end of
4890 the outer-loop. */
4891 if (nested_in_vect_loop && !double_reduc)
4892 goto vect_finalize_reduction;
4893
4894 /* SLP reduction without reduction chain, e.g.,
4895 # a1 = phi <a2, a0>
4896 # b1 = phi <b2, b0>
4897 a2 = operation (a1)
4898 b2 = operation (b1) */
4899 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4900
4901 /* True if we should implement SLP_REDUC using native reduction operations
4902 instead of scalar operations. */
4903 direct_slp_reduc = (reduc_fn != IFN_LAST
4904 && slp_reduc
4905 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4906
4907 /* In case of reduction chain, e.g.,
4908 # a1 = phi <a3, a0>
4909 a2 = operation (a1)
4910 a3 = operation (a2),
4911
4912 we may end up with more than one vector result. Here we reduce them to
4913 one vector. */
4914 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
4915 {
4916 tree first_vect = PHI_RESULT (new_phis[0]);
4917 gassign *new_vec_stmt = NULL;
4918 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4919 for (k = 1; k < new_phis.length (); k++)
4920 {
4921 gimple *next_phi = new_phis[k];
4922 tree second_vect = PHI_RESULT (next_phi);
4923 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4924 new_vec_stmt = gimple_build_assign (tem, code,
4925 first_vect, second_vect);
4926 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4927 first_vect = tem;
4928 }
4929
4930 new_phi_result = first_vect;
4931 if (new_vec_stmt)
4932 {
4933 new_phis.truncate (0);
4934 new_phis.safe_push (new_vec_stmt);
4935 }
4936 }
4937 /* Likewise if we couldn't use a single def-use cycle. */
4938 else if (ncopies > 1)
4939 {
4940 gcc_assert (new_phis.length () == 1);
4941 tree first_vect = PHI_RESULT (new_phis[0]);
4942 gassign *new_vec_stmt = NULL;
4943 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4944 stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
4945 for (int k = 1; k < ncopies; ++k)
4946 {
4947 next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
4948 tree second_vect = PHI_RESULT (next_phi_info->stmt);
4949 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4950 new_vec_stmt = gimple_build_assign (tem, code,
4951 first_vect, second_vect);
4952 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4953 first_vect = tem;
4954 }
4955 new_phi_result = first_vect;
4956 new_phis.truncate (0);
4957 new_phis.safe_push (new_vec_stmt);
4958 }
4959 else
4960 new_phi_result = PHI_RESULT (new_phis[0]);
4961
4962 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4963 && reduc_fn != IFN_LAST)
4964 {
4965 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4966 various data values where the condition matched and another vector
4967 (INDUCTION_INDEX) containing all the indexes of those matches. We
4968 need to extract the last matching index (which will be the index with
4969 highest value) and use this to index into the data vector.
4970 For the case where there were no matches, the data vector will contain
4971 all default values and the index vector will be all zeros. */
4972
4973 /* Get various versions of the type of the vector of indexes. */
4974 tree index_vec_type = TREE_TYPE (induction_index);
4975 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4976 tree index_scalar_type = TREE_TYPE (index_vec_type);
4977 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4978 (index_vec_type);
4979
4980 /* Get an unsigned integer version of the type of the data vector. */
4981 int scalar_precision
4982 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4983 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4984 tree vectype_unsigned = build_vector_type
4985 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4986
4987 /* First we need to create a vector (ZERO_VEC) of zeros and another
4988 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4989 can create using a MAX reduction and then expanding.
4990 In the case where the loop never made any matches, the max index will
4991 be zero. */
4992
4993 /* Vector of {0, 0, 0,...}. */
4994 tree zero_vec = make_ssa_name (vectype);
4995 tree zero_vec_rhs = build_zero_cst (vectype);
4996 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4997 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4998
4999 /* Find maximum value from the vector of found indexes. */
5000 tree max_index = make_ssa_name (index_scalar_type);
5001 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5002 1, induction_index);
5003 gimple_call_set_lhs (max_index_stmt, max_index);
5004 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5005
5006 /* Vector of {max_index, max_index, max_index,...}. */
5007 tree max_index_vec = make_ssa_name (index_vec_type);
5008 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5009 max_index);
5010 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5011 max_index_vec_rhs);
5012 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5013
5014 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5015 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5016 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
5017 otherwise. Only one value should match, resulting in a vector
5018 (VEC_COND) with one data value and the rest zeros.
5019 In the case where the loop never made any matches, every index will
5020 match, resulting in a vector with all data values (which will all be
5021 the default value). */
5022
5023 /* Compare the max index vector to the vector of found indexes to find
5024 the position of the max value. */
5025 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5026 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5027 induction_index,
5028 max_index_vec);
5029 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5030
5031 /* Use the compare to choose either values from the data vector or
5032 zero. */
5033 tree vec_cond = make_ssa_name (vectype);
5034 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5035 vec_compare, new_phi_result,
5036 zero_vec);
5037 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5038
5039 /* Finally we need to extract the data value from the vector (VEC_COND)
5040 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5041 reduction, but because this doesn't exist, we can use a MAX reduction
5042 instead. The data value might be signed or a float so we need to cast
5043 it first.
5044 In the case where the loop never made any matches, the data values are
5045 all identical, and so will reduce down correctly. */
5046
5047 /* Make the matched data values unsigned. */
5048 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5049 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5050 vec_cond);
5051 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5052 VIEW_CONVERT_EXPR,
5053 vec_cond_cast_rhs);
5054 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5055
5056 /* Reduce down to a scalar value. */
5057 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5058 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5059 1, vec_cond_cast);
5060 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5061 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5062
5063 /* Convert the reduced value back to the result type and set as the
5064 result. */
5065 gimple_seq stmts = NULL;
5066 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5067 data_reduc);
5068 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5069 scalar_results.safe_push (new_temp);
5070 }
5071 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5072 && reduc_fn == IFN_LAST)
5073 {
5074 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5075 idx = 0;
5076 idx_val = induction_index[0];
5077 val = data_reduc[0];
5078 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5079 if (induction_index[i] > idx_val)
5080 val = data_reduc[i], idx_val = induction_index[i];
5081 return val; */
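/* A minimal sketch of the statements built below, assuming a hypothetical
   two-element vector with element size SZ:
     idx_0 = BIT_FIELD_REF <induction_index, SZ, 0>;
     val_0 = BIT_FIELD_REF <new_phi_result, SZ, 0>;
     idx_1 = BIT_FIELD_REF <induction_index, SZ, SZ>;
     val_1 = BIT_FIELD_REF <new_phi_result, SZ, SZ>;
     val   = idx_1 > idx_0 ? val_1 : val_0;
   For the last element no MAX of the indexes is needed, which is why the
   MAX_EXPR below is skipped when OFF == V_SIZE - EL_SIZE.  */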
5082
5083 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5084 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5085 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5086 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5087 /* Enforced by vectorizable_reduction, which ensures we have target
5088 support before allowing a conditional reduction on variable-length
5089 vectors. */
5090 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5091 tree idx_val = NULL_TREE, val = NULL_TREE;
5092 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5093 {
5094 tree old_idx_val = idx_val;
5095 tree old_val = val;
5096 idx_val = make_ssa_name (idx_eltype);
5097 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5098 build3 (BIT_FIELD_REF, idx_eltype,
5099 induction_index,
5100 bitsize_int (el_size),
5101 bitsize_int (off)));
5102 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5103 val = make_ssa_name (data_eltype);
5104 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5105 build3 (BIT_FIELD_REF,
5106 data_eltype,
5107 new_phi_result,
5108 bitsize_int (el_size),
5109 bitsize_int (off)));
5110 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5111 if (off != 0)
5112 {
5113 tree new_idx_val = idx_val;
5114 if (off != v_size - el_size)
5115 {
5116 new_idx_val = make_ssa_name (idx_eltype);
5117 epilog_stmt = gimple_build_assign (new_idx_val,
5118 MAX_EXPR, idx_val,
5119 old_idx_val);
5120 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5121 }
5122 tree new_val = make_ssa_name (data_eltype);
5123 epilog_stmt = gimple_build_assign (new_val,
5124 COND_EXPR,
5125 build2 (GT_EXPR,
5126 boolean_type_node,
5127 idx_val,
5128 old_idx_val),
5129 val, old_val);
5130 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5131 idx_val = new_idx_val;
5132 val = new_val;
5133 }
5134 }
5135 /* Convert the reduced value back to the result type and set as the
5136 result. */
5137 gimple_seq stmts = NULL;
5138 val = gimple_convert (&stmts, scalar_type, val);
5139 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5140 scalar_results.safe_push (val);
5141 }
5142
5143 /* 2.3 Create the reduction code, using one of the three schemes described
5144 above. In SLP we simply need to extract all the elements from the
5145 vector (without reducing them), so we use scalar shifts. */
5146 else if (reduc_fn != IFN_LAST && !slp_reduc)
5147 {
5148 tree tmp;
5149 tree vec_elem_type;
5150
5151 /* Case 1: Create:
5152 v_out2 = reduc_expr <v_out1> */
5153
5154 if (dump_enabled_p ())
5155 dump_printf_loc (MSG_NOTE, vect_location,
5156 "Reduce using direct vector reduction.\n");
5157
5158 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5159 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5160 {
5161 tree tmp_dest
5162 = vect_create_destination_var (scalar_dest, vec_elem_type);
5163 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5164 new_phi_result);
5165 gimple_set_lhs (epilog_stmt, tmp_dest);
5166 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5167 gimple_set_lhs (epilog_stmt, new_temp);
5168 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5169
5170 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5171 new_temp);
5172 }
5173 else
5174 {
5175 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5176 new_phi_result);
5177 gimple_set_lhs (epilog_stmt, new_scalar_dest);
5178 }
5179
5180 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5181 gimple_set_lhs (epilog_stmt, new_temp);
5182 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5183
5184 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5185 == INTEGER_INDUC_COND_REDUCTION)
5186 && !operand_equal_p (initial_def, induc_val, 0))
5187 {
5188 /* Earlier we set the initial value to be a vector of induc_val
5189 values. Check the result and, if it is induc_val, replace it
5190 with the original initial value, unless induc_val is
5191 the same as initial_def already. */
5192 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5193 induc_val);
5194
5195 tmp = make_ssa_name (new_scalar_dest);
5196 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5197 initial_def, new_temp);
5198 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5199 new_temp = tmp;
5200 }
5201
5202 scalar_results.safe_push (new_temp);
5203 }
5204 else if (direct_slp_reduc)
5205 {
5206 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5207 with the elements for other SLP statements replaced with the
5208 neutral value. We can then do a normal reduction on each vector. */
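/* Illustration (hypothetical values, assuming REDUC_GROUP_SIZE == 2, a
   four-element vector NEW_PHI_RESULT = { a0, b0, a1, b1 } and a PLUS
   reduction with neutral value 0).  The masked index vector built below is
   { 0, 1, 0, 1 }, so the loop generates:
     i == 0:  vec = { a0, 0, a1, 0 }  -->  scalar result a0 + a1
     i == 1:  vec = { 0, b0, 0, b1 }  -->  scalar result b0 + b1
   i.e. one scalar result per SLP statement.  */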
5209
5210 /* Enforced by vectorizable_reduction. */
5211 gcc_assert (new_phis.length () == 1);
5212 gcc_assert (pow2p_hwi (group_size));
5213
5214 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5215 vec<stmt_vec_info> orig_phis
5216 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5217 gimple_seq seq = NULL;
5218
5219 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5220 and the same element size as VECTYPE. */
5221 tree index = build_index_vector (vectype, 0, 1);
5222 tree index_type = TREE_TYPE (index);
5223 tree index_elt_type = TREE_TYPE (index_type);
5224 tree mask_type = build_same_sized_truth_vector_type (index_type);
5225
5226 /* Create a vector that, for each element, identifies which of
5227 the REDUC_GROUP_SIZE results should use it. */
5228 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5229 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5230 build_vector_from_val (index_type, index_mask));
5231
5232 /* Get a neutral vector value. This is simply a splat of the neutral
5233 scalar value if we have one, otherwise the initial scalar value
5234 is itself a neutral value. */
5235 tree vector_identity = NULL_TREE;
5236 if (neutral_op)
5237 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5238 neutral_op);
5239 for (unsigned int i = 0; i < group_size; ++i)
5240 {
5241 /* If there's no universal neutral value, we can use the
5242 initial scalar value from the original PHI. This is used
5243 for MIN and MAX reduction, for example. */
5244 if (!neutral_op)
5245 {
5246 tree scalar_value
5247 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5248 loop_preheader_edge (loop));
5249 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5250 scalar_value);
5251 }
5252
5253 /* Calculate the equivalent of:
5254
5255 sel[j] = (index[j] == i);
5256
5257 which selects the elements of NEW_PHI_RESULT that should
5258 be included in the result. */
5259 tree compare_val = build_int_cst (index_elt_type, i);
5260 compare_val = build_vector_from_val (index_type, compare_val);
5261 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5262 index, compare_val);
5263
5264 /* Calculate the equivalent of:
5265
5266 vec = sel ? new_phi_result : vector_identity;
5267
5268 VEC is now suitable for a full vector reduction. */
5269 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5270 sel, new_phi_result, vector_identity);
5271
5272 /* Do the reduction and convert it to the appropriate type. */
5273 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5274 TREE_TYPE (vectype), vec);
5275 scalar = gimple_convert (&seq, scalar_type, scalar);
5276 scalar_results.safe_push (scalar);
5277 }
5278 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5279 }
5280 else
5281 {
5282 bool reduce_with_shift;
5283 tree vec_temp;
5284
5285 /* COND reductions all do the final reduction with MAX_EXPR
5286 or MIN_EXPR. */
5287 if (code == COND_EXPR)
5288 {
5289 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5290 == INTEGER_INDUC_COND_REDUCTION)
5291 code = induc_code;
5292 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5293 == CONST_COND_REDUCTION)
5294 code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
5295 else
5296 code = MAX_EXPR;
5297 }
5298
5299 /* See if the target wants to do the final (shift) reduction
5300 in a vector mode of smaller size and first reduce upper/lower
5301 halves against each other. */
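/* Sketch of the effect (target-dependent; shown for a hypothetical 256-bit
   vector on a target whose split_reduction hook prefers 128-bit vectors):
   the loop below first emits
     dst1     = lowpart  <new_temp>   (128 bits)
     dst2     = highpart <new_temp>   (128 bits)
     new_temp = dst1 CODE dst2
   and only the resulting narrower vector is fed to the shift-based or
   scalar reduction that follows.  */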
5302 enum machine_mode mode1 = mode;
5303 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5304 unsigned sz1 = sz;
5305 if (!slp_reduc
5306 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5307 sz1 = GET_MODE_SIZE (mode1).to_constant ();
5308
5309 tree vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5310 reduce_with_shift = have_whole_vector_shift (mode1);
5311 if (!VECTOR_MODE_P (mode1))
5312 reduce_with_shift = false;
5313 else
5314 {
5315 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5316 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5317 reduce_with_shift = false;
5318 }
5319
5320 /* First reduce the vector to the vector size we want to do the shift
5321 reduction on, by repeatedly combining the upper and lower halves. */
5322 new_temp = new_phi_result;
5323 while (sz > sz1)
5324 {
5325 gcc_assert (!slp_reduc);
5326 sz /= 2;
5327 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5328
5329 /* The target has to make sure we support lowpart/highpart
5330 extraction, either via direct vector extract or through
5331 integer mode punning. */
5332 tree dst1, dst2;
5333 if (convert_optab_handler (vec_extract_optab,
5334 TYPE_MODE (TREE_TYPE (new_temp)),
5335 TYPE_MODE (vectype1))
5336 != CODE_FOR_nothing)
5337 {
5338 /* Extract sub-vectors directly once vec_extract becomes
5339 a conversion optab. */
5340 dst1 = make_ssa_name (vectype1);
5341 epilog_stmt
5342 = gimple_build_assign (dst1, BIT_FIELD_REF,
5343 build3 (BIT_FIELD_REF, vectype1,
5344 new_temp, TYPE_SIZE (vectype1),
5345 bitsize_int (0)));
5346 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5347 dst2 = make_ssa_name (vectype1);
5348 epilog_stmt
5349 = gimple_build_assign (dst2, BIT_FIELD_REF,
5350 build3 (BIT_FIELD_REF, vectype1,
5351 new_temp, TYPE_SIZE (vectype1),
5352 bitsize_int (sz * BITS_PER_UNIT)));
5353 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5354 }
5355 else
5356 {
5357 /* Extract via punning to appropriately sized integer mode
5358 vector. */
5359 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5360 1);
5361 tree etype = build_vector_type (eltype, 2);
5362 gcc_assert (convert_optab_handler (vec_extract_optab,
5363 TYPE_MODE (etype),
5364 TYPE_MODE (eltype))
5365 != CODE_FOR_nothing);
5366 tree tem = make_ssa_name (etype);
5367 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5368 build1 (VIEW_CONVERT_EXPR,
5369 etype, new_temp));
5370 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5371 new_temp = tem;
5372 tem = make_ssa_name (eltype);
5373 epilog_stmt
5374 = gimple_build_assign (tem, BIT_FIELD_REF,
5375 build3 (BIT_FIELD_REF, eltype,
5376 new_temp, TYPE_SIZE (eltype),
5377 bitsize_int (0)));
5378 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5379 dst1 = make_ssa_name (vectype1);
5380 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5381 build1 (VIEW_CONVERT_EXPR,
5382 vectype1, tem));
5383 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5384 tem = make_ssa_name (eltype);
5385 epilog_stmt
5386 = gimple_build_assign (tem, BIT_FIELD_REF,
5387 build3 (BIT_FIELD_REF, eltype,
5388 new_temp, TYPE_SIZE (eltype),
5389 bitsize_int (sz * BITS_PER_UNIT)));
5390 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5391 dst2 = make_ssa_name (vectype1);
5392 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5393 build1 (VIEW_CONVERT_EXPR,
5394 vectype1, tem));
5395 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5396 }
5397
5398 new_temp = make_ssa_name (vectype1);
5399 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5400 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5401 }
5402
5403 if (reduce_with_shift && !slp_reduc)
5404 {
5405 int element_bitsize = tree_to_uhwi (bitsize);
5406 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5407 for variable-length vectors and also requires direct target support
5408 for loop reductions. */
5409 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5410 int nelements = vec_size_in_bits / element_bitsize;
5411 vec_perm_builder sel;
5412 vec_perm_indices indices;
5413
5414 int elt_offset;
5415
5416 tree zero_vec = build_zero_cst (vectype1);
5417 /* Case 2: Create:
5418 for (offset = nelements/2; offset >= 1; offset/=2)
5419 {
5420 Create: va' = vec_shift <va, offset>
5421 Create: va = vop <va, va'>
5422 } */
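/* Worked example (hypothetical four-element vector { a, b, c, d } with
   CODE == PLUS_EXPR, "_" marking don't-care lanes):
     offset 2:  va' = { c, d, 0, 0 }    va = { a+c, b+d, _, _ }
     offset 1:  va' = { b+d, _, 0, 0 }  va = { a+b+c+d, _, _, _ }
   The final scalar result is then element 0 of VA (see 2.4 below).  */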
5423
5424 tree rhs;
5425
5426 if (dump_enabled_p ())
5427 dump_printf_loc (MSG_NOTE, vect_location,
5428 "Reduce using vector shifts\n");
5429
5430 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5431 for (elt_offset = nelements / 2;
5432 elt_offset >= 1;
5433 elt_offset /= 2)
5434 {
5435 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5436 indices.new_vector (sel, 2, nelements);
5437 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5438 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5439 new_temp, zero_vec, mask);
5440 new_name = make_ssa_name (vec_dest, epilog_stmt);
5441 gimple_assign_set_lhs (epilog_stmt, new_name);
5442 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5443
5444 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5445 new_temp);
5446 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5447 gimple_assign_set_lhs (epilog_stmt, new_temp);
5448 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5449 }
5450
5451 /* 2.4 Extract the final scalar result. Create:
5452 s_out3 = extract_field <v_out2, bitpos> */
5453
5454 if (dump_enabled_p ())
5455 dump_printf_loc (MSG_NOTE, vect_location,
5456 "extract scalar result\n");
5457
5458 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5459 bitsize, bitsize_zero_node);
5460 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5461 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5462 gimple_assign_set_lhs (epilog_stmt, new_temp);
5463 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5464 scalar_results.safe_push (new_temp);
5465 }
5466 else
5467 {
5468 /* Case 3: Create:
5469 s = extract_field <v_out2, 0>
5470 for (offset = element_size;
5471 offset < vector_size;
5472 offset += element_size;)
5473 {
5474 Create: s' = extract_field <v_out2, offset>
5475 Create: s = op <s, s'> // For non SLP cases
5476 } */
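/* Illustration for the non-SLP case, assuming a hypothetical four-element
   vector accumulator { a, b, c, d }:
     s = a;  s = op <s, b>;  s = op <s, c>;  s = op <s, d>;
   For SLP the four extracted values are instead pushed to SCALAR_RESULTS
   without being combined.  */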
5477
5478 if (dump_enabled_p ())
5479 dump_printf_loc (MSG_NOTE, vect_location,
5480 "Reduce using scalar code.\n");
5481
5482 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5483 int element_bitsize = tree_to_uhwi (bitsize);
5484 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5485 {
5486 int bit_offset;
5487 if (gimple_code (new_phi) == GIMPLE_PHI)
5488 vec_temp = PHI_RESULT (new_phi);
5489 else
5490 vec_temp = gimple_assign_lhs (new_phi);
5491 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5492 bitsize_zero_node);
5493 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5494 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5495 gimple_assign_set_lhs (epilog_stmt, new_temp);
5496 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5497
5498 /* In SLP we don't need to apply the reduction operation, so we just
5499 collect s' values in SCALAR_RESULTS. */
5500 if (slp_reduc)
5501 scalar_results.safe_push (new_temp);
5502
5503 for (bit_offset = element_bitsize;
5504 bit_offset < vec_size_in_bits;
5505 bit_offset += element_bitsize)
5506 {
5507 tree bitpos = bitsize_int (bit_offset);
5508 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5509 bitsize, bitpos);
5510
5511 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5512 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5513 gimple_assign_set_lhs (epilog_stmt, new_name);
5514 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5515
5516 if (slp_reduc)
5517 {
5518 /* In SLP we don't need to apply the reduction operation, so
5519 we just collect s' values in SCALAR_RESULTS. */
5520 new_temp = new_name;
5521 scalar_results.safe_push (new_name);
5522 }
5523 else
5524 {
5525 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5526 new_name, new_temp);
5527 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5528 gimple_assign_set_lhs (epilog_stmt, new_temp);
5529 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5530 }
5531 }
5532 }
5533
5534 /* The only case where we need to reduce scalar results in SLP is
5535 unrolling. If the size of SCALAR_RESULTS is greater than
5536 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5537 REDUC_GROUP_SIZE. */
5538 if (slp_reduc)
5539 {
5540 tree res, first_res, new_res;
5541 gimple *new_stmt;
5542
5543 /* Reduce multiple scalar results in case of SLP unrolling. */
5544 for (j = group_size; scalar_results.iterate (j, &res);
5545 j++)
5546 {
5547 first_res = scalar_results[j % group_size];
5548 new_stmt = gimple_build_assign (new_scalar_dest, code,
5549 first_res, res);
5550 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5551 gimple_assign_set_lhs (new_stmt, new_res);
5552 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5553 scalar_results[j % group_size] = new_res;
5554 }
5555 }
5556 else
5557 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5558 scalar_results.safe_push (new_temp);
5559 }
5560
5561 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5562 == INTEGER_INDUC_COND_REDUCTION)
5563 && !operand_equal_p (initial_def, induc_val, 0))
5564 {
5565 /* Earlier we set the initial value to be a vector of induc_val
5566 values. Check the result and, if it is induc_val, replace it
5567 with the original initial value, unless induc_val is
5568 the same as initial_def already. */
5569 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5570 induc_val);
5571
5572 tree tmp = make_ssa_name (new_scalar_dest);
5573 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5574 initial_def, new_temp);
5575 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5576 scalar_results[0] = tmp;
5577 }
5578 }
5579
5580 vect_finalize_reduction:
5581
5582 if (double_reduc)
5583 loop = loop->inner;
5584
5585 /* 2.5 Adjust the final result by the initial value of the reduction
5586 variable. (When such adjustment is not needed, then
5587 'adjustment_def' is zero). For example, if code is PLUS we create:
5588 new_temp = loop_exit_def + adjustment_def */
5589
5590 if (adjustment_def)
5591 {
5592 gcc_assert (!slp_reduc);
5593 if (nested_in_vect_loop)
5594 {
5595 new_phi = new_phis[0];
5596 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5597 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5598 new_dest = vect_create_destination_var (scalar_dest, vectype);
5599 }
5600 else
5601 {
5602 new_temp = scalar_results[0];
5603 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5604 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5605 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5606 }
5607
5608 epilog_stmt = gimple_build_assign (new_dest, expr);
5609 new_temp = make_ssa_name (new_dest, epilog_stmt);
5610 gimple_assign_set_lhs (epilog_stmt, new_temp);
5611 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5612 if (nested_in_vect_loop)
5613 {
5614 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5615 STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5616 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5617
5618 if (!double_reduc)
5619 scalar_results.quick_push (new_temp);
5620 else
5621 scalar_results[0] = new_temp;
5622 }
5623 else
5624 scalar_results[0] = new_temp;
5625
5626 new_phis[0] = epilog_stmt;
5627 }
5628
5629 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5630 phis with new adjusted scalar results, i.e., replace use <s_out0>
5631 with use <s_out4>.
5632
5633 Transform:
5634 loop_exit:
5635 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5636 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5637 v_out2 = reduce <v_out1>
5638 s_out3 = extract_field <v_out2, 0>
5639 s_out4 = adjust_result <s_out3>
5640 use <s_out0>
5641 use <s_out0>
5642
5643 into:
5644
5645 loop_exit:
5646 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5647 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5648 v_out2 = reduce <v_out1>
5649 s_out3 = extract_field <v_out2, 0>
5650 s_out4 = adjust_result <s_out3>
5651 use <s_out4>
5652 use <s_out4> */
5653
5654
5655 /* In an SLP reduction chain we reduce the vector results into one vector if
5656 necessary, hence we set REDUC_GROUP_SIZE to 1 here. SCALAR_DEST is the
5657 LHS of the last stmt in the reduction chain, since we are looking for
5658 the loop exit phi node. */
5659 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5660 {
5661 stmt_vec_info dest_stmt_info
5662 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5663 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5664 group_size = 1;
5665 }
5666
5667 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5668 case REDUC_GROUP_SIZE is greater than the vectorization factor).
5669 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5670 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5671 correspond to the first vector stmt, etc.
5672 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
5673 if (group_size > new_phis.length ())
5674 {
5675 ratio = group_size / new_phis.length ();
5676 gcc_assert (!(group_size % new_phis.length ()));
5677 }
5678 else
5679 ratio = 1;
5680
5681 stmt_vec_info epilog_stmt_info = NULL;
5682 for (k = 0; k < group_size; k++)
5683 {
5684 if (k % ratio == 0)
5685 {
5686 epilog_stmt_info = loop_vinfo->lookup_stmt (new_phis[k / ratio]);
5687 reduction_phi_info = reduction_phis[k / ratio];
5688 if (double_reduc)
5689 inner_phi = inner_phis[k / ratio];
5690 }
5691
5692 if (slp_reduc)
5693 {
5694 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5695
5696 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5697 /* SLP statements can't participate in patterns. */
5698 gcc_assert (!orig_stmt_info);
5699 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5700 }
5701
5702 phis.create (3);
5703 /* Find the loop-closed-use at the loop exit of the original scalar
5704 result. (The reduction result is expected to have two immediate uses -
5705 one at the latch block, and one at the loop exit). */
5706 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5707 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5708 && !is_gimple_debug (USE_STMT (use_p)))
5709 phis.safe_push (USE_STMT (use_p));
5710
5711 /* While we expect to have found an exit_phi because of loop-closed-ssa
5712 form we can end up without one if the scalar cycle is dead. */
5713
5714 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5715 {
5716 if (outer_loop)
5717 {
5718 stmt_vec_info exit_phi_vinfo
5719 = loop_vinfo->lookup_stmt (exit_phi);
5720 gphi *vect_phi;
5721
5722 if (double_reduc)
5723 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5724 else
5725 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt_info;
5726 if (!double_reduc
5727 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5728 != vect_double_reduction_def)
5729 continue;
5730
5731 /* Handle double reduction:
5732
5733 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5734 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5735 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5736 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5737
5738 At that point the regular reduction (stmt2 and stmt3) is
5739 already vectorized, as well as the exit phi node, stmt4.
5740 Here we vectorize the phi node of double reduction, stmt1, and
5741 update all relevant statements. */
5742
5743 /* Go through all the uses of s2 to find double reduction phi
5744 node, i.e., stmt1 above. */
5745 orig_name = PHI_RESULT (exit_phi);
5746 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5747 {
5748 stmt_vec_info use_stmt_vinfo;
5749 tree vect_phi_init, preheader_arg, vect_phi_res;
5750 basic_block bb = gimple_bb (use_stmt);
5751
5752 /* Check that USE_STMT is really a double reduction phi
5753 node. */
5754 if (gimple_code (use_stmt) != GIMPLE_PHI
5755 || gimple_phi_num_args (use_stmt) != 2
5756 || bb->loop_father != outer_loop)
5757 continue;
5758 use_stmt_vinfo = loop_vinfo->lookup_stmt (use_stmt);
5759 if (!use_stmt_vinfo
5760 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5761 != vect_double_reduction_def)
5762 continue;
5763
5764 /* Create vector phi node for double reduction:
5765 vs1 = phi <vs0, vs2>
5766 vs1 was created previously in this function by a call to
5767 vect_get_vec_def_for_operand and is stored in
5768 vec_initial_def;
5769 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5770 vs0 is created here. */
5771
5772 /* Create vector phi node. */
5773 vect_phi = create_phi_node (vec_initial_def, bb);
5774 loop_vec_info_for_loop (outer_loop)->add_stmt (vect_phi);
5775
5776 /* Create vs0 - initial def of the double reduction phi. */
5777 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5778 loop_preheader_edge (outer_loop));
5779 vect_phi_init = get_initial_def_for_reduction
5780 (stmt_info, preheader_arg, NULL);
5781
5782 /* Update phi node arguments with vs0 and vs2. */
5783 add_phi_arg (vect_phi, vect_phi_init,
5784 loop_preheader_edge (outer_loop),
5785 UNKNOWN_LOCATION);
5786 add_phi_arg (vect_phi, PHI_RESULT (inner_phi->stmt),
5787 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5788 if (dump_enabled_p ())
5789 dump_printf_loc (MSG_NOTE, vect_location,
5790 "created double reduction phi node: %G",
5791 vect_phi);
5792
5793 vect_phi_res = PHI_RESULT (vect_phi);
5794
5795 /* Replace the use, i.e., set the correct vs1 in the regular
5796 reduction phi node. FORNOW, NCOPIES is always 1, so the
5797 loop is redundant. */
5798 stmt_vec_info use_info = reduction_phi_info;
5799 for (j = 0; j < ncopies; j++)
5800 {
5801 edge pr_edge = loop_preheader_edge (loop);
5802 SET_PHI_ARG_DEF (as_a <gphi *> (use_info->stmt),
5803 pr_edge->dest_idx, vect_phi_res);
5804 use_info = STMT_VINFO_RELATED_STMT (use_info);
5805 }
5806 }
5807 }
5808 }
5809
5810 phis.release ();
5811 if (nested_in_vect_loop)
5812 {
5813 if (double_reduc)
5814 loop = outer_loop;
5815 else
5816 continue;
5817 }
5818
5819 phis.create (3);
5820 /* Find the loop-closed-use at the loop exit of the original scalar
5821 result. (The reduction result is expected to have two immediate uses,
5822 one at the latch block, and one at the loop exit). For double
5823 reductions we are looking for exit phis of the outer loop. */
5824 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5825 {
5826 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5827 {
5828 if (!is_gimple_debug (USE_STMT (use_p)))
5829 phis.safe_push (USE_STMT (use_p));
5830 }
5831 else
5832 {
5833 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5834 {
5835 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5836
5837 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5838 {
5839 if (!flow_bb_inside_loop_p (loop,
5840 gimple_bb (USE_STMT (phi_use_p)))
5841 && !is_gimple_debug (USE_STMT (phi_use_p)))
5842 phis.safe_push (USE_STMT (phi_use_p));
5843 }
5844 }
5845 }
5846 }
5847
5848 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5849 {
5850 /* Replace the uses: */
5851 orig_name = PHI_RESULT (exit_phi);
5852 scalar_result = scalar_results[k];
5853 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5854 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5855 SET_USE (use_p, scalar_result);
5856 }
5857
5858 phis.release ();
5859 }
5860 }
5861
5862 /* Return a vector of type VECTYPE that is equal to the vector select
5863 operation "MASK ? VEC : IDENTITY". Insert the select statements
5864 before GSI. */
5865
5866 static tree
5867 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5868 tree vec, tree identity)
5869 {
5870 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5871 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5872 mask, vec, identity);
5873 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5874 return cond;
5875 }
5876
5877 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5878 order, starting with LHS. Insert the extraction statements before GSI and
5879 associate the new scalar SSA names with variable SCALAR_DEST.
5880 Return the SSA name for the result. */
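/* For instance (a sketch only), with CODE == PLUS_EXPR and a four-element
   VECTOR_RHS this expands to:
     s_0 = BIT_FIELD_REF <vector_rhs, bitsize, 0>;
     l_0 = lhs + s_0;
     s_1 = BIT_FIELD_REF <vector_rhs, bitsize, bitsize>;
     l_1 = l_0 + s_1;
     ...
   and the SSA name of the final accumulator is returned.  */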
5881
5882 static tree
5883 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5884 tree_code code, tree lhs, tree vector_rhs)
5885 {
5886 tree vectype = TREE_TYPE (vector_rhs);
5887 tree scalar_type = TREE_TYPE (vectype);
5888 tree bitsize = TYPE_SIZE (scalar_type);
5889 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5890 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5891
5892 for (unsigned HOST_WIDE_INT bit_offset = 0;
5893 bit_offset < vec_size_in_bits;
5894 bit_offset += element_bitsize)
5895 {
5896 tree bitpos = bitsize_int (bit_offset);
5897 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5898 bitsize, bitpos);
5899
5900 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5901 rhs = make_ssa_name (scalar_dest, stmt);
5902 gimple_assign_set_lhs (stmt, rhs);
5903 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5904
5905 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5906 tree new_name = make_ssa_name (scalar_dest, stmt);
5907 gimple_assign_set_lhs (stmt, new_name);
5908 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5909 lhs = new_name;
5910 }
5911 return lhs;
5912 }
5913
5914 /* Get a masked internal function equivalent to REDUC_FN. VECTYPE_IN is the
5915 type of the vector input. */
5916
5917 static internal_fn
5918 get_masked_reduction_fn (internal_fn reduc_fn, tree vectype_in)
5919 {
5920 internal_fn mask_reduc_fn;
5921
5922 switch (reduc_fn)
5923 {
5924 case IFN_FOLD_LEFT_PLUS:
5925 mask_reduc_fn = IFN_MASK_FOLD_LEFT_PLUS;
5926 break;
5927
5928 default:
5929 return IFN_LAST;
5930 }
5931
5932 if (direct_internal_fn_supported_p (mask_reduc_fn, vectype_in,
5933 OPTIMIZE_FOR_SPEED))
5934 return mask_reduc_fn;
5935 return IFN_LAST;
5936 }
5937
5938 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5939 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5940 statement. CODE is the operation performed by STMT_INFO and OPS are
5941 its scalar operands. REDUC_INDEX is the index of the operand in
5942 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5943 implements in-order reduction, or IFN_LAST if we should open-code it.
5944 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5945 that should be used to control the operation in a fully-masked loop. */
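/* For example (sketch of the common non-SLP, unmasked case with
   REDUC_FN == IFN_FOLD_LEFT_PLUS), the transformation below emits a single
     res = FOLD_LEFT_PLUS <phi_result, vec_def>;
   which the target expands as the strictly in-order sum
     ((...((phi_result + v[0]) + v[1])...) + v[n-1]).  */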
5946
5947 static bool
5948 vectorize_fold_left_reduction (stmt_vec_info stmt_info,
5949 gimple_stmt_iterator *gsi,
5950 stmt_vec_info *vec_stmt, slp_tree slp_node,
5951 gimple *reduc_def_stmt,
5952 tree_code code, internal_fn reduc_fn,
5953 tree ops[3], tree vectype_in,
5954 int reduc_index, vec_loop_masks *masks)
5955 {
5956 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5957 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5958 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5959 stmt_vec_info new_stmt_info = NULL;
5960 internal_fn mask_reduc_fn = get_masked_reduction_fn (reduc_fn, vectype_in);
5961
5962 int ncopies;
5963 if (slp_node)
5964 ncopies = 1;
5965 else
5966 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5967
5968 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5969 gcc_assert (ncopies == 1);
5970 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5971 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5972 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5973 == FOLD_LEFT_REDUCTION);
5974
5975 if (slp_node)
5976 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5977 TYPE_VECTOR_SUBPARTS (vectype_in)));
5978
5979 tree op0 = ops[1 - reduc_index];
5980
5981 int group_size = 1;
5982 stmt_vec_info scalar_dest_def_info;
5983 auto_vec<tree> vec_oprnds0;
5984 if (slp_node)
5985 {
5986 auto_vec<vec<tree> > vec_defs (2);
5987 auto_vec<tree> sops (2);
5988 sops.quick_push (ops[0]);
5989 sops.quick_push (ops[1]);
5990 vect_get_slp_defs (sops, slp_node, &vec_defs);
5991 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
5992 vec_defs[0].release ();
5993 vec_defs[1].release ();
5994 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5995 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5996 }
5997 else
5998 {
5999 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt_info);
6000 vec_oprnds0.create (1);
6001 vec_oprnds0.quick_push (loop_vec_def0);
6002 scalar_dest_def_info = stmt_info;
6003 }
6004
6005 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
6006 tree scalar_type = TREE_TYPE (scalar_dest);
6007 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6008
6009 int vec_num = vec_oprnds0.length ();
6010 gcc_assert (vec_num == 1 || slp_node);
6011 tree vec_elem_type = TREE_TYPE (vectype_out);
6012 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6013
6014 tree vector_identity = NULL_TREE;
6015 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6016 vector_identity = build_zero_cst (vectype_out);
6017
6018 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6019 int i;
6020 tree def0;
6021 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6022 {
6023 gimple *new_stmt;
6024 tree mask = NULL_TREE;
6025 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6026 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6027
6028 /* Handle MINUS by adding the negative. */
6029 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6030 {
6031 tree negated = make_ssa_name (vectype_out);
6032 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6033 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6034 def0 = negated;
6035 }
6036
6037 if (mask && mask_reduc_fn == IFN_LAST)
6038 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6039 vector_identity);
6040
6041 /* On the first iteration the input is simply the scalar phi
6042 result, and for subsequent iterations it is the output of
6043 the preceding operation. */
6044 if (reduc_fn != IFN_LAST || (mask && mask_reduc_fn != IFN_LAST))
6045 {
6046 if (mask && mask_reduc_fn != IFN_LAST)
6047 new_stmt = gimple_build_call_internal (mask_reduc_fn, 3, reduc_var,
6048 def0, mask);
6049 else
6050 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var,
6051 def0);
6052 /* For chained SLP reductions the output of the previous reduction
6053 operation serves as the input of the next. For the final statement
6054 the output cannot be a temporary - we reuse the original
6055 scalar destination of the last statement. */
6056 if (i != vec_num - 1)
6057 {
6058 gimple_set_lhs (new_stmt, scalar_dest_var);
6059 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6060 gimple_set_lhs (new_stmt, reduc_var);
6061 }
6062 }
6063 else
6064 {
6065 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6066 reduc_var, def0);
6067 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6068 /* Remove the statement, so that we can use the same code paths
6069 as for statements that we've just created. */
6070 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6071 gsi_remove (&tmp_gsi, true);
6072 }
6073
6074 if (i == vec_num - 1)
6075 {
6076 gimple_set_lhs (new_stmt, scalar_dest);
6077 new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
6078 new_stmt);
6079 }
6080 else
6081 new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
6082 new_stmt, gsi);
6083
6084 if (slp_node)
6085 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
6086 }
6087
6088 if (!slp_node)
6089 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
6090
6091 return true;
6092 }
6093
6094 /* Function is_nonwrapping_integer_induction.
6095
6096 Check if STMT_VINFO (which is part of loop LOOP) both increments and
6097 does not cause overflow. */
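/* Illustrative numbers (not taken from any particular testcase): for an
   unsigned char IV with BASE == 10, STEP == 3 and at most NI == 50
   iterations, the largest value is 10 + 3 * 50 == 160, which needs 8 bits
   and therefore fits; with NI == 100 the maximum would be 310, which needs
   9 bits, so the function would return false.  */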
6098
6099 static bool
6100 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, struct loop *loop)
6101 {
6102 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
6103 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6104 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6105 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
6106 widest_int ni, max_loop_value, lhs_max;
6107 wi::overflow_type overflow = wi::OVF_NONE;
6108
6109 /* Make sure the loop is integer based. */
6110 if (TREE_CODE (base) != INTEGER_CST
6111 || TREE_CODE (step) != INTEGER_CST)
6112 return false;
6113
6114 /* Check that the max size of the loop will not wrap. */
6115
6116 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6117 return true;
6118
6119 if (! max_stmt_executions (loop, &ni))
6120 return false;
6121
6122 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6123 &overflow);
6124 if (overflow)
6125 return false;
6126
6127 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6128 TYPE_SIGN (lhs_type), &overflow);
6129 if (overflow)
6130 return false;
6131
6132 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6133 <= TYPE_PRECISION (lhs_type));
6134 }
6135
6136 /* Check if masking can be supported by inserting a conditional expression.
6137 CODE is the code for the operation. COND_FN is the conditional internal
6138 function, if it exists. VECTYPE_IN is the type of the vector input. */
6139 static bool
6140 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
6141 tree vectype_in)
6142 {
6143 if (cond_fn != IFN_LAST
6144 && direct_internal_fn_supported_p (cond_fn, vectype_in,
6145 OPTIMIZE_FOR_SPEED))
6146 return false;
6147
6148 switch (code)
6149 {
6150 case DOT_PROD_EXPR:
6151 case SAD_EXPR:
6152 return true;
6153
6154 default:
6155 return false;
6156 }
6157 }
6158
6159 /* Insert a conditional expression to enable masked vectorization. CODE is the
6160 code for the operation. VOP is the array of operands. MASK is the loop
6161 mask. GSI is a statement iterator used to place the new conditional
6162 expression. */
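/* For instance, for DOT_PROD_EXPR the statement built below replaces VOP[1]
   by (MASK ? VOP[1] : { 0, ... }), so inactive lanes add zero to the
   accumulator; for SAD_EXPR VOP[1] is replaced by (MASK ? VOP[1] : VOP[0]),
   making the absolute difference of inactive lanes zero.  */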
6163 static void
6164 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
6165 gimple_stmt_iterator *gsi)
6166 {
6167 switch (code)
6168 {
6169 case DOT_PROD_EXPR:
6170 {
6171 tree vectype = TREE_TYPE (vop[1]);
6172 tree zero = build_zero_cst (vectype);
6173 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6174 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6175 mask, vop[1], zero);
6176 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6177 vop[1] = masked_op1;
6178 break;
6179 }
6180
6181 case SAD_EXPR:
6182 {
6183 tree vectype = TREE_TYPE (vop[1]);
6184 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6185 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6186 mask, vop[1], vop[0]);
6187 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6188 vop[1] = masked_op1;
6189 break;
6190 }
6191
6192 default:
6193 gcc_unreachable ();
6194 }
6195 }
6196
6197 /* Function vectorizable_reduction.
6198
6199 Check if STMT_INFO performs a reduction operation that can be vectorized.
6200 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6201 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6202 Return true if STMT_INFO is vectorizable in this way.
6203
6204 This function also handles reduction idioms (patterns) that have been
6205 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6206 may be of this form:
6207 X = pattern_expr (arg0, arg1, ..., X)
6208 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6209 sequence that had been detected and replaced by the pattern-stmt
6210 (STMT_INFO).
6211
6212 This function also handles reduction of condition expressions, for example:
6213 for (int i = 0; i < N; i++)
6214 if (a[i] < value)
6215 last = a[i];
6216 This is handled by vectorising the loop and creating an additional vector
6217 containing the loop indexes for which "a[i] < value" was true. In the
6218 function epilogue this is reduced to a single max value and then used to
6219 index into the vector of results.
6220
6221 In some cases of reduction patterns, the type of the reduction variable X is
6222 different than the type of the other arguments of STMT_INFO.
6223 In such cases, the vectype that is used when transforming STMT_INFO into
6224 a vector stmt is different than the vectype that is used to determine the
6225 vectorization factor, because it consists of a different number of elements
6226 than the actual number of elements that are being operated upon in parallel.
6227
6228 For example, consider an accumulation of shorts into an int accumulator.
6229 On some targets it's possible to vectorize this pattern operating on 8
6230 shorts at a time (hence, the vectype for purposes of determining the
6231 vectorization factor should be V8HI); on the other hand, the vectype that
6232 is used to create the vector form is actually V4SI (the type of the result).
6233
6234 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6235 indicates what is the actual level of parallelism (V8HI in the example), so
6236 that the right vectorization factor would be derived. This vectype
6237 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6238 be used to create the vectorized stmt. The right vectype for the vectorized
6239 stmt is obtained from the type of the result X:
6240 get_vectype_for_scalar_type (TREE_TYPE (X))
6241
6242 This means that, contrary to "regular" reductions (or "regular" stmts in
6243 general), the following equation:
6244 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6245 does *NOT* necessarily hold for reduction patterns. */
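/* A concrete (purely illustrative) instance of the short-into-int
   accumulation discussed above:

     short s[N]; int sum = 0;
     for (i = 0; i < N; i++)
       sum += s[i];

   Here STMT_VINFO_VECTYPE would be V8HI (eight shorts per vector iteration,
   determining the vectorization factor), while the vectorized statement
   itself produces V4SI accumulators, derived from the type of SUM.  */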
6246
6247 bool
6248 vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
6249 stmt_vec_info *vec_stmt, slp_tree slp_node,
6250 slp_instance slp_node_instance,
6251 stmt_vector_for_cost *cost_vec)
6252 {
6253 tree vec_dest;
6254 tree scalar_dest;
6255 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6256 tree vectype_in = NULL_TREE;
6257 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6258 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6259 enum tree_code code, orig_code;
6260 internal_fn reduc_fn;
6261 machine_mode vec_mode;
6262 int op_type;
6263 optab optab;
6264 tree new_temp = NULL_TREE;
6265 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6266 stmt_vec_info cond_stmt_vinfo = NULL;
6267 enum tree_code cond_reduc_op_code = ERROR_MARK;
6268 tree scalar_type;
6269 bool is_simple_use;
6270 int i;
6271 int ncopies;
6272 int epilog_copies;
6273 stmt_vec_info prev_stmt_info, prev_phi_info;
6274 bool single_defuse_cycle = false;
6275 stmt_vec_info new_stmt_info = NULL;
6276 int j;
6277 tree ops[3];
6278 enum vect_def_type dts[3];
6279 bool nested_cycle = false, found_nested_cycle_def = false;
6280 bool double_reduc = false;
6281 basic_block def_bb;
6282 struct loop * def_stmt_loop;
6283 tree def_arg;
6284 auto_vec<tree> vec_oprnds0;
6285 auto_vec<tree> vec_oprnds1;
6286 auto_vec<tree> vec_oprnds2;
6287 auto_vec<tree> vect_defs;
6288 auto_vec<stmt_vec_info> phis;
6289 int vec_num;
6290 tree def0, tem;
6291 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6292 tree cond_reduc_val = NULL_TREE;
6293
6294 /* Make sure it was already recognized as a reduction computation. */
6295 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6296 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6297 return false;
6298
6299 if (nested_in_vect_loop_p (loop, stmt_info))
6300 {
6301 loop = loop->inner;
6302 nested_cycle = true;
6303 }
6304
6305 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6306 gcc_assert (slp_node
6307 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6308
6309 if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
6310 {
6311 tree phi_result = gimple_phi_result (phi);
6312 /* Analysis is fully done on the reduction stmt invocation. */
6313 if (! vec_stmt)
6314 {
6315 if (slp_node)
6316 slp_node_instance->reduc_phis = slp_node;
6317
6318 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6319 return true;
6320 }
6321
6322 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6323 /* Leave the scalar phi in place. Note that checking
6324 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6325 for reductions involving a single statement. */
6326 return true;
6327
6328 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
6329 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
6330
6331 if (STMT_VINFO_VEC_REDUCTION_TYPE (reduc_stmt_info)
6332 == EXTRACT_LAST_REDUCTION)
6333 /* Leave the scalar phi in place. */
6334 return true;
6335
6336 gassign *reduc_stmt = as_a <gassign *> (reduc_stmt_info->stmt);
6337 code = gimple_assign_rhs_code (reduc_stmt);
6338 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6339 {
6340 tree op = gimple_op (reduc_stmt, k);
6341 if (op == phi_result)
6342 continue;
6343 if (k == 1 && code == COND_EXPR)
6344 continue;
6345 bool is_simple_use = vect_is_simple_use (op, loop_vinfo, &dt);
6346 gcc_assert (is_simple_use);
6347 if (dt == vect_constant_def || dt == vect_external_def)
6348 continue;
6349 if (!vectype_in
6350 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6351 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6352 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6353 break;
6354 }
6355 /* For a nested cycle we might end up with an operation like
6356 phi_result * phi_result. */
6357 if (!vectype_in)
6358 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
6359 gcc_assert (vectype_in);
6360
6361 if (slp_node)
6362 ncopies = 1;
6363 else
6364 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6365
6366 stmt_vec_info use_stmt_info;
6367 if (ncopies > 1
6368 && STMT_VINFO_RELEVANT (reduc_stmt_info) <= vect_used_only_live
6369 && (use_stmt_info = loop_vinfo->lookup_single_use (phi_result))
6370 && vect_stmt_to_vectorize (use_stmt_info) == reduc_stmt_info)
6371 single_defuse_cycle = true;
6372
6373 /* Create the destination vector */
6374 scalar_dest = gimple_assign_lhs (reduc_stmt);
6375 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6376
6377 if (slp_node)
6378 /* The size vect_schedule_slp_instance computes is off for us. */
6379 vec_num = vect_get_num_vectors
6380 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6381 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6382 vectype_in);
6383 else
6384 vec_num = 1;
6385
6386 /* Generate the reduction PHIs upfront. */
6387 prev_phi_info = NULL;
6388 for (j = 0; j < ncopies; j++)
6389 {
6390 if (j == 0 || !single_defuse_cycle)
6391 {
6392 for (i = 0; i < vec_num; i++)
6393 {
6394 /* Create the reduction-phi that defines the reduction
6395 operand. */
6396 gimple *new_phi = create_phi_node (vec_dest, loop->header);
6397 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6398
6399 if (slp_node)
6400 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
6401 else
6402 {
6403 if (j == 0)
6404 STMT_VINFO_VEC_STMT (stmt_info)
6405 = *vec_stmt = new_phi_info;
6406 else
6407 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
6408 prev_phi_info = new_phi_info;
6409 }
6410 }
6411 }
6412 }
6413
6414 return true;
6415 }
6416
6417 /* 1. Is vectorizable reduction? */
6418 /* Not supportable if the reduction variable is used in the loop, unless
6419 it's a reduction chain. */
6420 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6421 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6422 return false;
6423
6424 /* Reductions that are not used even in an enclosing outer-loop
6425 are expected to be "live" (used out of the loop). */
6426 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6427 && !STMT_VINFO_LIVE_P (stmt_info))
6428 return false;
6429
6430 /* 2. Has this been recognized as a reduction pattern?
6431
6432 Check if STMT represents a pattern that has been recognized
6433 in earlier analysis stages. For stmts that represent a pattern,
6434 the STMT_VINFO_RELATED_STMT field records the last stmt in
6435 the original sequence that constitutes the pattern. */
6436
6437 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6438 if (orig_stmt_info)
6439 {
6440 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6441 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6442 }
6443
6444 /* 3. Check the operands of the operation. The first operands are defined
6445 inside the loop body. The last operand is the reduction variable,
6446 which is defined by the loop-header-phi. */
6447
6448 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6449
6450 /* Flatten RHS. */
6451 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6452 {
6453 case GIMPLE_BINARY_RHS:
6454 code = gimple_assign_rhs_code (stmt);
6455 op_type = TREE_CODE_LENGTH (code);
6456 gcc_assert (op_type == binary_op);
6457 ops[0] = gimple_assign_rhs1 (stmt);
6458 ops[1] = gimple_assign_rhs2 (stmt);
6459 break;
6460
6461 case GIMPLE_TERNARY_RHS:
6462 code = gimple_assign_rhs_code (stmt);
6463 op_type = TREE_CODE_LENGTH (code);
6464 gcc_assert (op_type == ternary_op);
6465 ops[0] = gimple_assign_rhs1 (stmt);
6466 ops[1] = gimple_assign_rhs2 (stmt);
6467 ops[2] = gimple_assign_rhs3 (stmt);
6468 break;
6469
6470 case GIMPLE_UNARY_RHS:
6471 return false;
6472
6473 default:
6474 gcc_unreachable ();
6475 }
6476
6477 if (code == COND_EXPR && slp_node)
6478 return false;
6479
6480 scalar_dest = gimple_assign_lhs (stmt);
6481 scalar_type = TREE_TYPE (scalar_dest);
6482 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6483 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6484 return false;
6485
6486 /* Do not try to vectorize bit-precision reductions. */
6487 if (!type_has_mode_precision_p (scalar_type))
6488 return false;
6489
6490 /* All uses but the last are expected to be defined in the loop.
6491 The last use is the reduction variable. In case of a nested cycle this
6492 assumption is not true: we use reduc_index to record the index of the
6493 reduction variable. */
6494 stmt_vec_info reduc_def_info;
6495 if (orig_stmt_info)
6496 reduc_def_info = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6497 else
6498 reduc_def_info = STMT_VINFO_REDUC_DEF (stmt_info);
6499 gcc_assert (reduc_def_info);
6500 gphi *reduc_def_phi = as_a <gphi *> (reduc_def_info->stmt);
6501 tree reduc_def = PHI_RESULT (reduc_def_phi);
6502 int reduc_index = -1;
6503 for (i = 0; i < op_type; i++)
6504 {
6505 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6506 if (i == 0 && code == COND_EXPR)
6507 continue;
6508
6509 stmt_vec_info def_stmt_info;
6510 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &dts[i], &tem,
6511 &def_stmt_info);
6512 dt = dts[i];
6513 gcc_assert (is_simple_use);
6514 if (dt == vect_reduction_def
6515 && ops[i] == reduc_def)
6516 {
6517 reduc_index = i;
6518 continue;
6519 }
6520 else if (tem)
6521 {
6522 /* To properly compute ncopies we are interested in the widest
6523 input type in case we're looking at a widening accumulation. */
6524 if (!vectype_in
6525 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6526 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6527 vectype_in = tem;
6528 }
6529
6530 if (dt != vect_internal_def
6531 && dt != vect_external_def
6532 && dt != vect_constant_def
6533 && dt != vect_induction_def
6534 && !(dt == vect_nested_cycle && nested_cycle))
6535 return false;
6536
6537 if (dt == vect_nested_cycle
6538 && ops[i] == reduc_def)
6539 {
6540 found_nested_cycle_def = true;
6541 reduc_index = i;
6542 }
6543
6544 if (i == 1 && code == COND_EXPR)
6545 {
6546 /* Record how value of COND_EXPR is defined. */
6547 if (dt == vect_constant_def)
6548 {
6549 cond_reduc_dt = dt;
6550 cond_reduc_val = ops[i];
6551 }
6552 if (dt == vect_induction_def
6553 && def_stmt_info
6554 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6555 {
6556 cond_reduc_dt = dt;
6557 cond_stmt_vinfo = def_stmt_info;
6558 }
6559 }
6560 }
6561
6562 if (!vectype_in)
6563 vectype_in = vectype_out;
6564
6565 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6566 directly used in stmt. */
6567 if (reduc_index == -1)
6568 {
6569 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6570 {
6571 if (dump_enabled_p ())
6572 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6573 "in-order reduction chain without SLP.\n");
6574 return false;
6575 }
6576 }
6577
6578 if (!(reduc_index == -1
6579 || dts[reduc_index] == vect_reduction_def
6580 || dts[reduc_index] == vect_nested_cycle
6581 || ((dts[reduc_index] == vect_internal_def
6582 || dts[reduc_index] == vect_external_def
6583 || dts[reduc_index] == vect_constant_def
6584 || dts[reduc_index] == vect_induction_def)
6585 && nested_cycle && found_nested_cycle_def)))
6586 {
6587 /* For pattern recognized stmts, orig_stmt might be a reduction,
6588 but some helper statements for the pattern might not, or
6589 might be COND_EXPRs with reduction uses in the condition. */
6590 gcc_assert (orig_stmt_info);
6591 return false;
6592 }
6593
6594 /* PHIs should not participate in patterns. */
6595 gcc_assert (!STMT_VINFO_RELATED_STMT (reduc_def_info));
6596 enum vect_reduction_type v_reduc_type
6597 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6598 stmt_vec_info tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6599
6600 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6601 /* If we have a condition reduction, see if we can simplify it further. */
6602 if (v_reduc_type == COND_REDUCTION)
6603 {
6604 /* TODO: We can't yet handle reduction chains, since we need to treat
6605 each COND_EXPR in the chain specially, not just the last one.
6606 E.g. for:
6607
6608 x_1 = PHI <x_3, ...>
6609 x_2 = a_2 ? ... : x_1;
6610 x_3 = a_3 ? ... : x_2;
6611
6612 we're interested in the last element in x_3 for which a_2 || a_3
6613 is true, whereas the current reduction chain handling would
6614 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6615 as a reduction operation. */
6616 if (reduc_index == -1)
6617 {
6618 if (dump_enabled_p ())
6619 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6620 "conditional reduction chains not supported\n");
6621 return false;
6622 }
6623
6624 /* vect_is_simple_reduction ensured that operand 2 is the
6625 loop-carried operand. */
6626 gcc_assert (reduc_index == 2);
6627
6628 	      /* Loop peeling modifies the initial value of the reduction PHI, which
6629 	         makes the reduction stmt that is transformed differ from the
6630 	         original stmt analyzed.  We therefore need to record the reduction
6631 	         code for a CONST_COND_REDUCTION at analysis time, so that it can
6632 	         be used directly at transform time.  */
6633 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6634 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6635 {
6636 /* Also set the reduction type to CONST_COND_REDUCTION. */
6637 gcc_assert (cond_reduc_dt == vect_constant_def);
6638 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6639 }
6640 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6641 vectype_in, OPTIMIZE_FOR_SPEED))
6642 {
6643 if (dump_enabled_p ())
6644 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6645 "optimizing condition reduction with"
6646 " FOLD_EXTRACT_LAST.\n");
6647 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6648 }
6649 else if (cond_reduc_dt == vect_induction_def)
6650 {
6651 tree base
6652 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6653 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6654
6655 gcc_assert (TREE_CODE (base) == INTEGER_CST
6656 && TREE_CODE (step) == INTEGER_CST);
6657 cond_reduc_val = NULL_TREE;
6658 	          /* Find a suitable value: below BASE for MAX_EXPR, above BASE for
6659 	             MIN_EXPR; for now punt if BASE is the minimum value of the type
6660 	             for MAX_EXPR or the maximum value of the type for MIN_EXPR.  */
6661 if (tree_int_cst_sgn (step) == -1)
6662 {
6663 cond_reduc_op_code = MIN_EXPR;
6664 if (tree_int_cst_sgn (base) == -1)
6665 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6666 else if (tree_int_cst_lt (base,
6667 TYPE_MAX_VALUE (TREE_TYPE (base))))
6668 cond_reduc_val
6669 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6670 }
6671 else
6672 {
6673 cond_reduc_op_code = MAX_EXPR;
6674 if (tree_int_cst_sgn (base) == 1)
6675 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6676 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6677 base))
6678 cond_reduc_val
6679 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6680 }
6681 if (cond_reduc_val)
6682 {
6683 if (dump_enabled_p ())
6684 dump_printf_loc (MSG_NOTE, vect_location,
6685 "condition expression based on "
6686 "integer induction.\n");
6687 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6688 = INTEGER_INDUC_COND_REDUCTION;
6689 }
6690 }
6691 else if (cond_reduc_dt == vect_constant_def)
6692 {
6693 enum vect_def_type cond_initial_dt;
6694 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6695 tree cond_initial_val
6696 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6697
6698 gcc_assert (cond_reduc_val != NULL_TREE);
6699 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6700 if (cond_initial_dt == vect_constant_def
6701 && types_compatible_p (TREE_TYPE (cond_initial_val),
6702 TREE_TYPE (cond_reduc_val)))
6703 {
6704 tree e = fold_binary (LE_EXPR, boolean_type_node,
6705 cond_initial_val, cond_reduc_val);
6706 if (e && (integer_onep (e) || integer_zerop (e)))
6707 {
6708 if (dump_enabled_p ())
6709 dump_printf_loc (MSG_NOTE, vect_location,
6710 "condition expression based on "
6711 "compile time constant.\n");
6712 /* Record reduction code at analysis stage. */
6713 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6714 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6715 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6716 = CONST_COND_REDUCTION;
6717 }
6718 }
6719 }
6720 }
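  /* For illustration (an assumed source form, not taken from any particular
     testcase): a COND_REDUCTION typically comes from a loop that keeps the
     last value satisfying a condition, e.g.

	int last = -1;
	for (int i = 0; i < n; i++)
	  if (a[i] < val)
	    last = i;

     Here LAST is the reduction and I is a nonwrapping integer induction,
     which corresponds to the INTEGER_INDUC_COND_REDUCTION case handled
     above; if LAST were assigned a compile-time constant instead, the
     CONST_COND_REDUCTION handling would apply.  */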
6721
6722 if (orig_stmt_info)
6723 gcc_assert (tmp == orig_stmt_info
6724 || REDUC_GROUP_FIRST_ELEMENT (tmp) == orig_stmt_info);
6725 else
6726 /* We changed STMT to be the first stmt in reduction chain, hence we
6727 check that in this case the first element in the chain is STMT. */
6728 gcc_assert (tmp == stmt_info
6729 || REDUC_GROUP_FIRST_ELEMENT (tmp) == stmt_info);
6730
6731 if (STMT_VINFO_LIVE_P (reduc_def_info))
6732 return false;
6733
6734 if (slp_node)
6735 ncopies = 1;
6736 else
6737 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6738
6739 gcc_assert (ncopies >= 1);
6740
6741 vec_mode = TYPE_MODE (vectype_in);
6742 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6743
6744 if (nested_cycle)
6745 {
6746 def_bb = gimple_bb (reduc_def_phi);
6747 def_stmt_loop = def_bb->loop_father;
6748 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6749 loop_preheader_edge (def_stmt_loop));
6750 stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg);
6751 if (def_arg_stmt_info
6752 && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info)
6753 == vect_double_reduction_def))
6754 double_reduc = true;
6755 }
6756
6757 vect_reduction_type reduction_type
6758 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6759 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6760 && ncopies > 1)
6761 {
6762 if (dump_enabled_p ())
6763 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6764 "multiple types in double reduction or condition "
6765 "reduction.\n");
6766 return false;
6767 }
6768
6769 if (code == COND_EXPR)
6770 {
6771 /* Only call during the analysis stage, otherwise we'll lose
6772 STMT_VINFO_TYPE. */
6773 if (!vec_stmt && !vectorizable_condition (stmt_info, gsi, NULL,
6774 true, NULL, cost_vec))
6775 {
6776 if (dump_enabled_p ())
6777 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6778 "unsupported condition in reduction\n");
6779 return false;
6780 }
6781 }
6782 else if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6783 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6784 {
6785 /* Only call during the analysis stage, otherwise we'll lose
6786 STMT_VINFO_TYPE. We only support this for nested cycles
6787 without double reductions at the moment. */
6788 if (!nested_cycle
6789 || double_reduc
6790 || (!vec_stmt && !vectorizable_shift (stmt_info, gsi, NULL,
6791 NULL, cost_vec)))
6792 {
6793 if (dump_enabled_p ())
6794 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6795 "unsupported shift or rotation in reduction\n");
6796 return false;
6797 }
6798 }
6799 else
6800 {
6801 /* 4. Supportable by target? */
6802
6803 /* 4.1. check support for the operation in the loop */
6804 optab = optab_for_tree_code (code, vectype_in, optab_default);
6805 if (!optab)
6806 {
6807 if (dump_enabled_p ())
6808 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6809 "no optab.\n");
6810
6811 return false;
6812 }
6813
6814 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6815 {
6816 if (dump_enabled_p ())
6817 dump_printf (MSG_NOTE, "op not supported by target.\n");
6818
6819 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6820 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6821 return false;
6822
6823 if (dump_enabled_p ())
6824 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6825 }
6826
6827 /* Worthwhile without SIMD support? */
6828 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6829 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6830 {
6831 if (dump_enabled_p ())
6832 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6833 "not worthwhile without SIMD support.\n");
6834
6835 return false;
6836 }
6837 }
6838
6839 /* 4.2. Check support for the epilog operation.
6840
6841 If STMT represents a reduction pattern, then the type of the
6842 reduction variable may be different than the type of the rest
6843 of the arguments. For example, consider the case of accumulation
6844 	     of shorts into an int accumulator.  The original code:
6845 S1: int_a = (int) short_a;
6846 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6847
6848 was replaced with:
6849 STMT: int_acc = widen_sum <short_a, int_acc>
6850
6851 This means that:
6852 1. The tree-code that is used to create the vector operation in the
6853 epilog code (that reduces the partial results) is not the
6854 tree-code of STMT, but is rather the tree-code of the original
6855 	        stmt from the pattern that STMT is replacing.  I.e., in the example
6856 above we want to use 'widen_sum' in the loop, but 'plus' in the
6857 epilog.
6858 2. The type (mode) we use to check available target support
6859 for the vector operation to be created in the *epilog*, is
6860 determined by the type of the reduction variable (in the example
6861 	        above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6862 However the type (mode) we use to check available target support
6863 for the vector operation to be created *inside the loop*, is
6864 determined by the type of the other arguments to STMT (in the
6865 example we'd check this: optab_handler (widen_sum_optab,
6866 vect_short_mode)).
6867
6868 This is contrary to "regular" reductions, in which the types of all
6869 the arguments are the same as the type of the reduction variable.
6870 For "regular" reductions we can therefore use the same vector type
6871 (and also the same tree-code) when generating the epilog code and
6872 when generating the code inside the loop. */
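  /* As a concrete (illustrative) source-level example of such a pattern,
     assume the scalar loop

	short b[N];  int sum = 0;
	for (i = 0; i < N; i++)
	  sum += b[i];

     which the pattern recognizer may turn into a widen_sum: the loop body
     then operates on the narrow (short) elements while the epilog reduces
     the int partial sums with a plain plus.  */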
6873
6874 if (orig_stmt_info
6875 && (reduction_type == TREE_CODE_REDUCTION
6876 || reduction_type == FOLD_LEFT_REDUCTION))
6877 {
6878 /* This is a reduction pattern: get the vectype from the type of the
6879 reduction variable, and get the tree-code from orig_stmt. */
6880 orig_code = gimple_assign_rhs_code (orig_stmt_info->stmt);
6881 gcc_assert (vectype_out);
6882 vec_mode = TYPE_MODE (vectype_out);
6883 }
6884 else
6885 {
6886 	      /* Regular reduction: the same vectype and tree-code used for the
6887 	         vector code inside the loop can also be used for the epilog code.  */
6888 orig_code = code;
6889
6890 if (code == MINUS_EXPR)
6891 orig_code = PLUS_EXPR;
6892
6893 /* For simple condition reductions, replace with the actual expression
6894 we want to base our reduction around. */
6895 if (reduction_type == CONST_COND_REDUCTION)
6896 {
6897 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6898 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6899 }
6900 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6901 orig_code = cond_reduc_op_code;
6902 }
6903
6904 reduc_fn = IFN_LAST;
6905
6906 if (reduction_type == TREE_CODE_REDUCTION
6907 || reduction_type == FOLD_LEFT_REDUCTION
6908 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6909 || reduction_type == CONST_COND_REDUCTION)
6910 {
6911 if (reduction_type == FOLD_LEFT_REDUCTION
6912 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6913 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6914 {
6915 if (reduc_fn != IFN_LAST
6916 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6917 OPTIMIZE_FOR_SPEED))
6918 {
6919 if (dump_enabled_p ())
6920 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6921 "reduc op not supported by target.\n");
6922
6923 reduc_fn = IFN_LAST;
6924 }
6925 }
6926 else
6927 {
6928 if (!nested_cycle || double_reduc)
6929 {
6930 if (dump_enabled_p ())
6931 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6932 "no reduc code for scalar code.\n");
6933
6934 return false;
6935 }
6936 }
6937 }
6938 else if (reduction_type == COND_REDUCTION)
6939 {
6940 int scalar_precision
6941 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6942 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6943 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6944 nunits_out);
6945
6946 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6947 OPTIMIZE_FOR_SPEED))
6948 reduc_fn = IFN_REDUC_MAX;
6949 }
6950
6951 if (reduction_type != EXTRACT_LAST_REDUCTION
6952 && (!nested_cycle || double_reduc)
6953 && reduc_fn == IFN_LAST
6954 && !nunits_out.is_constant ())
6955 {
6956 if (dump_enabled_p ())
6957 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6958 "missing target support for reduction on"
6959 " variable-length vectors.\n");
6960 return false;
6961 }
6962
6963 /* For SLP reductions, see if there is a neutral value we can use. */
6964 tree neutral_op = NULL_TREE;
6965 if (slp_node)
6966 neutral_op = neutral_op_for_slp_reduction
6967 (slp_node_instance->reduc_phis, code,
6968 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6969
6970 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6971 {
6972 /* We can't support in-order reductions of code such as this:
6973
6974 for (int i = 0; i < n1; ++i)
6975 for (int j = 0; j < n2; ++j)
6976 l += a[j];
6977
6978 since GCC effectively transforms the loop when vectorizing:
6979
6980 for (int i = 0; i < n1 / VF; ++i)
6981 for (int j = 0; j < n2; ++j)
6982 for (int k = 0; k < VF; ++k)
6983 l += a[j];
6984
6985 which is a reassociation of the original operation. */
6986 if (dump_enabled_p ())
6987 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6988 "in-order double reduction not supported.\n");
6989
6990 return false;
6991 }
6992
6993 if (reduction_type == FOLD_LEFT_REDUCTION
6994 && slp_node
6995 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6996 {
6997 /* We cannot use in-order reductions in this case because there is
6998 an implicit reassociation of the operations involved. */
6999 if (dump_enabled_p ())
7000 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7001 "in-order unchained SLP reductions not supported.\n");
7002 return false;
7003 }
7004
7005 /* For double reductions, and for SLP reductions with a neutral value,
7006 we construct a variable-length initial vector by loading a vector
7007 full of the neutral value and then shift-and-inserting the start
7008 values into the low-numbered elements. */
7009 if ((double_reduc || neutral_op)
7010 && !nunits_out.is_constant ()
7011 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7012 vectype_out, OPTIMIZE_FOR_SPEED))
7013 {
7014 if (dump_enabled_p ())
7015 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7016 "reduction on variable-length vectors requires"
7017 " target support for a vector-shift-and-insert"
7018 " operation.\n");
7019 return false;
7020 }
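  /* For illustration: with a variable-length vector and a sum reduction with
     start value INIT, the initial vector is built (roughly) as

	tmp      = { 0, 0, ..., 0 }		    neutral-value splat
	init_vec = VEC_SHL_INSERT (tmp, INIT)	    => { INIT, 0, ..., 0 }

     so only the low-numbered lane carries the start value and the rest hold
     the neutral element; this is why IFN_VEC_SHL_INSERT support is required
     above.  */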
7021
7022 /* Check extra constraints for variable-length unchained SLP reductions. */
7023 if (STMT_SLP_TYPE (stmt_info)
7024 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
7025 && !nunits_out.is_constant ())
7026 {
7027 /* We checked above that we could build the initial vector when
7028 there's a neutral element value. Check here for the case in
7029 which each SLP statement has its own initial value and in which
7030 that value needs to be repeated for every instance of the
7031 statement within the initial vector. */
7032 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7033 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
7034 if (!neutral_op
7035 && !can_duplicate_and_interleave_p (group_size, elt_mode))
7036 {
7037 if (dump_enabled_p ())
7038 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7039 "unsupported form of SLP reduction for"
7040 " variable-length vectors: cannot build"
7041 " initial vector.\n");
7042 return false;
7043 }
7044 /* The epilogue code relies on the number of elements being a multiple
7045 of the group size. The duplicate-and-interleave approach to setting
7046 	         up the initial vector does too.  */
7047 if (!multiple_p (nunits_out, group_size))
7048 {
7049 if (dump_enabled_p ())
7050 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7051 "unsupported form of SLP reduction for"
7052 " variable-length vectors: the vector size"
7053 " is not a multiple of the number of results.\n");
7054 return false;
7055 }
7056 }
7057
7058 	  /* In case of a widening multiplication by a constant, we update the type
7059 of the constant to be the type of the other operand. We check that the
7060 constant fits the type in the pattern recognition pass. */
7061 if (code == DOT_PROD_EXPR
7062 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
7063 {
7064 if (TREE_CODE (ops[0]) == INTEGER_CST)
7065 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
7066 else if (TREE_CODE (ops[1]) == INTEGER_CST)
7067 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
7068 else
7069 {
7070 if (dump_enabled_p ())
7071 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7072 "invalid types in dot-prod\n");
7073
7074 return false;
7075 }
7076 }
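  /* A hypothetical (assumed) source form for the constant case: with short
     elements accumulated into an int, e.g.

	for (i = 0; i < N; i++)
	  sum += a[i] * 4;

     the constant 4 does not share a type with a[i], so it is converted here
     to the type of the other multiplication operand.  */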
7077
7078 if (reduction_type == COND_REDUCTION)
7079 {
7080 widest_int ni;
7081
7082 if (! max_loop_iterations (loop, &ni))
7083 {
7084 if (dump_enabled_p ())
7085 dump_printf_loc (MSG_NOTE, vect_location,
7086 "loop count not known, cannot create cond "
7087 "reduction.\n");
7088 return false;
7089 }
7090 	      /* Convert the back-edge count to an iteration count.  */
7091 ni += 1;
7092
7093 	      /* The additional index will be the same type as the condition.  Check
7094 	         that the loop iteration count fits into this type less one (index
7095 	         zero is reserved for the case in which there are no matches).  */
7096 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7097 if (wi::geu_p (ni, wi::to_widest (max_index)))
7098 {
7099 if (dump_enabled_p ())
7100 dump_printf_loc (MSG_NOTE, vect_location,
7101 "loop size is greater than data size.\n");
7102 return false;
7103 }
7104 }
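  /* For illustration: if CR_INDEX_SCALAR_TYPE ends up as a 16-bit unsigned
     type, the loop may run at most 65534 iterations (the type's maximum
     value less one), since index zero is reserved for the "no match" case;
     loops with more, or an unknown number of, iterations are rejected
     above.  */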
7105
7106 /* In case the vectorization factor (VF) is bigger than the number
7107 of elements that we can fit in a vectype (nunits), we have to generate
7108 	     more than one vector stmt, i.e. we need to "unroll" the
7109 vector stmt by a factor VF/nunits. For more details see documentation
7110 in vectorizable_operation. */
7111
7112 /* If the reduction is used in an outer loop we need to generate
7113 VF intermediate results, like so (e.g. for ncopies=2):
7114 r0 = phi (init, r0)
7115 r1 = phi (init, r1)
7116 r0 = x0 + r0;
7117 r1 = x1 + r1;
7118 (i.e. we generate VF results in 2 registers).
7119 In this case we have a separate def-use cycle for each copy, and therefore
7120 for each copy we get the vector def for the reduction variable from the
7121 respective phi node created for this copy.
7122
7123 Otherwise (the reduction is unused in the loop nest), we can combine
7124 together intermediate results, like so (e.g. for ncopies=2):
7125 r = phi (init, r)
7126 r = x0 + r;
7127 r = x1 + r;
7128 (i.e. we generate VF/2 results in a single register).
7129 In this case for each copy we get the vector def for the reduction variable
7130 from the vectorized reduction operation generated in the previous iteration.
7131
7132 This only works when we see both the reduction PHI and its only consumer
7133 in vectorizable_reduction and there are no intermediate stmts
7134 participating. */
7135 stmt_vec_info use_stmt_info;
7136 tree reduc_phi_result = gimple_phi_result (reduc_def_phi);
7137 if (ncopies > 1
7138 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7139 && (use_stmt_info = loop_vinfo->lookup_single_use (reduc_phi_result))
7140 && vect_stmt_to_vectorize (use_stmt_info) == stmt_info)
7141 {
7142 single_defuse_cycle = true;
7143 epilog_copies = 1;
7144 }
7145 else
7146 epilog_copies = ncopies;
7147
7148 	  /* If the reduction stmt is one of the patterns that have an embedded
7149 	     lane reduction, we cannot handle the !single_defuse_cycle case.  */
7150 if ((ncopies > 1
7151 && ! single_defuse_cycle)
7152 && (code == DOT_PROD_EXPR
7153 || code == WIDEN_SUM_EXPR
7154 || code == SAD_EXPR))
7155 {
7156 if (dump_enabled_p ())
7157 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7158 "multi def-use cycle not possible for lane-reducing "
7159 "reduction operation\n");
7160 return false;
7161 }
7162
7163 if (slp_node)
7164 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7165 else
7166 vec_num = 1;
7167
7168 internal_fn cond_fn = get_conditional_internal_fn (code);
7169 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7170 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7171
7172 if (!vec_stmt) /* transformation not required. */
7173 {
7174 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
7175 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7176 {
7177 if (reduction_type != FOLD_LEFT_REDUCTION
7178 && !mask_by_cond_expr
7179 && (cond_fn == IFN_LAST
7180 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7181 OPTIMIZE_FOR_SPEED)))
7182 {
7183 if (dump_enabled_p ())
7184 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7185 "can't use a fully-masked loop because no"
7186 " conditional operation is available.\n");
7187 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7188 }
7189 else if (reduc_index == -1)
7190 {
7191 if (dump_enabled_p ())
7192 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7193 "can't use a fully-masked loop for chained"
7194 " reductions.\n");
7195 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7196 }
7197 else
7198 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7199 vectype_in);
7200 }
7201 if (dump_enabled_p ()
7202 && reduction_type == FOLD_LEFT_REDUCTION)
7203 dump_printf_loc (MSG_NOTE, vect_location,
7204 "using an in-order (fold-left) reduction.\n");
7205 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7206 return true;
7207 }
7208
7209 /* Transform. */
7210
7211 if (dump_enabled_p ())
7212 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7213
7214 /* FORNOW: Multiple types are not supported for condition. */
7215 if (code == COND_EXPR)
7216 gcc_assert (ncopies == 1);
7217
7218 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7219
7220 if (reduction_type == FOLD_LEFT_REDUCTION)
7221 return vectorize_fold_left_reduction
7222 (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7223 reduc_fn, ops, vectype_in, reduc_index, masks);
7224
7225 if (reduction_type == EXTRACT_LAST_REDUCTION)
7226 {
7227 gcc_assert (!slp_node);
7228 return vectorizable_condition (stmt_info, gsi, vec_stmt,
7229 true, NULL, NULL);
7230 }
7231
7232 /* Create the destination vector */
7233 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7234
7235 prev_stmt_info = NULL;
7236 prev_phi_info = NULL;
7237 if (!slp_node)
7238 {
7239 vec_oprnds0.create (1);
7240 vec_oprnds1.create (1);
7241 if (op_type == ternary_op)
7242 vec_oprnds2.create (1);
7243 }
7244
7245 phis.create (vec_num);
7246 vect_defs.create (vec_num);
7247 if (!slp_node)
7248 vect_defs.quick_push (NULL_TREE);
7249
7250 if (slp_node)
7251 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7252 else
7253 phis.quick_push (STMT_VINFO_VEC_STMT (reduc_def_info));
7254
7255 for (j = 0; j < ncopies; j++)
7256 {
7257 if (code == COND_EXPR)
7258 {
7259 gcc_assert (!slp_node);
7260 vectorizable_condition (stmt_info, gsi, vec_stmt,
7261 true, NULL, NULL);
7262 break;
7263 }
7264 if (code == LSHIFT_EXPR
7265 || code == RSHIFT_EXPR)
7266 {
7267 vectorizable_shift (stmt_info, gsi, vec_stmt, slp_node, NULL);
7268 break;
7269 }
7270
7271 /* Handle uses. */
7272 if (j == 0)
7273 {
7274 if (slp_node)
7275 {
7276 /* Get vec defs for all the operands except the reduction index,
7277 ensuring the ordering of the ops in the vector is kept. */
7278 auto_vec<tree, 3> slp_ops;
7279 auto_vec<vec<tree>, 3> vec_defs;
7280
7281 slp_ops.quick_push (ops[0]);
7282 slp_ops.quick_push (ops[1]);
7283 if (op_type == ternary_op)
7284 slp_ops.quick_push (ops[2]);
7285
7286 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7287
7288 vec_oprnds0.safe_splice (vec_defs[0]);
7289 vec_defs[0].release ();
7290 vec_oprnds1.safe_splice (vec_defs[1]);
7291 vec_defs[1].release ();
7292 if (op_type == ternary_op)
7293 {
7294 vec_oprnds2.safe_splice (vec_defs[2]);
7295 vec_defs[2].release ();
7296 }
7297 }
7298 else
7299 {
7300 vec_oprnds0.quick_push
7301 (vect_get_vec_def_for_operand (ops[0], stmt_info));
7302 vec_oprnds1.quick_push
7303 (vect_get_vec_def_for_operand (ops[1], stmt_info));
7304 if (op_type == ternary_op)
7305 vec_oprnds2.quick_push
7306 (vect_get_vec_def_for_operand (ops[2], stmt_info));
7307 }
7308 }
7309 else
7310 {
7311 if (!slp_node)
7312 {
7313 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7314
7315 if (single_defuse_cycle && reduc_index == 0)
7316 vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
7317 else
7318 vec_oprnds0[0]
7319 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7320 vec_oprnds0[0]);
7321 if (single_defuse_cycle && reduc_index == 1)
7322 vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
7323 else
7324 vec_oprnds1[0]
7325 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7326 vec_oprnds1[0]);
7327 if (op_type == ternary_op)
7328 {
7329 if (single_defuse_cycle && reduc_index == 2)
7330 vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
7331 else
7332 vec_oprnds2[0]
7333 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7334 vec_oprnds2[0]);
7335 }
7336 }
7337 }
7338
7339 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7340 {
7341 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7342 if (masked_loop_p && !mask_by_cond_expr)
7343 {
7344 /* Make sure that the reduction accumulator is vop[0]. */
7345 if (reduc_index == 1)
7346 {
7347 gcc_assert (commutative_tree_code (code));
7348 std::swap (vop[0], vop[1]);
7349 }
7350 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7351 vectype_in, i * ncopies + j);
7352 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7353 vop[0], vop[1],
7354 vop[0]);
7355 new_temp = make_ssa_name (vec_dest, call);
7356 gimple_call_set_lhs (call, new_temp);
7357 gimple_call_set_nothrow (call, true);
7358 new_stmt_info
7359 = vect_finish_stmt_generation (stmt_info, call, gsi);
7360 }
7361 else
7362 {
7363 if (op_type == ternary_op)
7364 vop[2] = vec_oprnds2[i];
7365
7366 if (masked_loop_p && mask_by_cond_expr)
7367 {
7368 tree mask = vect_get_loop_mask (gsi, masks,
7369 vec_num * ncopies,
7370 vectype_in, i * ncopies + j);
7371 build_vect_cond_expr (code, vop, mask, gsi);
7372 }
7373
7374 gassign *new_stmt = gimple_build_assign (vec_dest, code,
7375 vop[0], vop[1], vop[2]);
7376 new_temp = make_ssa_name (vec_dest, new_stmt);
7377 gimple_assign_set_lhs (new_stmt, new_temp);
7378 new_stmt_info
7379 = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
7380 }
7381
7382 if (slp_node)
7383 {
7384 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
7385 vect_defs.quick_push (new_temp);
7386 }
7387 else
7388 vect_defs[0] = new_temp;
7389 }
7390
7391 if (slp_node)
7392 continue;
7393
7394 if (j == 0)
7395 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7396 else
7397 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
7398
7399 prev_stmt_info = new_stmt_info;
7400 }
7401
7402 /* Finalize the reduction-phi (set its arguments) and create the
7403 epilog reduction code. */
7404 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7405 vect_defs[0] = gimple_get_lhs ((*vec_stmt)->stmt);
7406
7407 vect_create_epilog_for_reduction (vect_defs, stmt_info, reduc_def_phi,
7408 epilog_copies, reduc_fn, phis,
7409 double_reduc, slp_node, slp_node_instance,
7410 cond_reduc_val, cond_reduc_op_code,
7411 neutral_op);
7412
7413 return true;
7414 }
7415
7416 /* Function vect_min_worthwhile_factor.
7417
7418 For a loop where we could vectorize the operation indicated by CODE,
7419 return the minimum vectorization factor that makes it worthwhile
7420 to use generic vectors. */
7421 static unsigned int
7422 vect_min_worthwhile_factor (enum tree_code code)
7423 {
7424 switch (code)
7425 {
7426 case PLUS_EXPR:
7427 case MINUS_EXPR:
7428 case NEGATE_EXPR:
7429 return 4;
7430
7431 case BIT_AND_EXPR:
7432 case BIT_IOR_EXPR:
7433 case BIT_XOR_EXPR:
7434 case BIT_NOT_EXPR:
7435 return 2;
7436
7437 default:
7438 return INT_MAX;
7439 }
7440 }
7441
7442 /* Return true if VINFO indicates we are doing loop vectorization and if
7443 it is worth decomposing CODE operations into scalar operations for
7444 that loop's vectorization factor. */
7445
7446 bool
7447 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7448 {
7449 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7450 unsigned HOST_WIDE_INT value;
7451 return (loop_vinfo
7452 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7453 && value >= vect_min_worthwhile_factor (code));
7454 }
7455
7456 /* Function vectorizable_induction
7457
7458 Check if STMT_INFO performs an induction computation that can be vectorized.
7459 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7460 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7461 Return true if STMT_INFO is vectorizable in this way. */
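/* As an illustrative example (an assumed source form), the variable J in

	for (i = 0; i < n; i++)
	  {
	    a[i] = j;
	    j += s;
	  }

   is an induction with step S; for VF = 4 it is vectorized using a vector
   IV [J, J+S, J+2*S, J+3*S] that is bumped by [4*S, 4*S, 4*S, 4*S] on each
   vector iteration, as described in the transform code below.  */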
7462
7463 bool
7464 vectorizable_induction (stmt_vec_info stmt_info,
7465 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7466 stmt_vec_info *vec_stmt, slp_tree slp_node,
7467 stmt_vector_for_cost *cost_vec)
7468 {
7469 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7470 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7471 unsigned ncopies;
7472 bool nested_in_vect_loop = false;
7473 struct loop *iv_loop;
7474 tree vec_def;
7475 edge pe = loop_preheader_edge (loop);
7476 basic_block new_bb;
7477 tree new_vec, vec_init, vec_step, t;
7478 tree new_name;
7479 gimple *new_stmt;
7480 gphi *induction_phi;
7481 tree induc_def, vec_dest;
7482 tree init_expr, step_expr;
7483 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7484 unsigned i;
7485 tree expr;
7486 gimple_seq stmts;
7487 imm_use_iterator imm_iter;
7488 use_operand_p use_p;
7489 gimple *exit_phi;
7490 edge latch_e;
7491 tree loop_arg;
7492 gimple_stmt_iterator si;
7493
7494 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7495 if (!phi)
7496 return false;
7497
7498 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7499 return false;
7500
7501 /* Make sure it was recognized as induction computation. */
7502 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7503 return false;
7504
7505 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7506 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7507
7508 if (slp_node)
7509 ncopies = 1;
7510 else
7511 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7512 gcc_assert (ncopies >= 1);
7513
7514 /* FORNOW. These restrictions should be relaxed. */
7515 if (nested_in_vect_loop_p (loop, stmt_info))
7516 {
7517 imm_use_iterator imm_iter;
7518 use_operand_p use_p;
7519 gimple *exit_phi;
7520 edge latch_e;
7521 tree loop_arg;
7522
7523 if (ncopies > 1)
7524 {
7525 if (dump_enabled_p ())
7526 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7527 "multiple types in nested loop.\n");
7528 return false;
7529 }
7530
7531 /* FORNOW: outer loop induction with SLP not supported. */
7532 if (STMT_SLP_TYPE (stmt_info))
7533 return false;
7534
7535 exit_phi = NULL;
7536 latch_e = loop_latch_edge (loop->inner);
7537 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7538 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7539 {
7540 gimple *use_stmt = USE_STMT (use_p);
7541 if (is_gimple_debug (use_stmt))
7542 continue;
7543
7544 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7545 {
7546 exit_phi = use_stmt;
7547 break;
7548 }
7549 }
7550 if (exit_phi)
7551 {
7552 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7553 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7554 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7555 {
7556 if (dump_enabled_p ())
7557 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7558 "inner-loop induction only used outside "
7559 "of the outer vectorized loop.\n");
7560 return false;
7561 }
7562 }
7563
7564 nested_in_vect_loop = true;
7565 iv_loop = loop->inner;
7566 }
7567 else
7568 iv_loop = loop;
7569 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7570
7571 if (slp_node && !nunits.is_constant ())
7572 {
7573 /* The current SLP code creates the initial value element-by-element. */
7574 if (dump_enabled_p ())
7575 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7576 "SLP induction not supported for variable-length"
7577 " vectors.\n");
7578 return false;
7579 }
7580
7581 if (!vec_stmt) /* transformation not required. */
7582 {
7583 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7584 DUMP_VECT_SCOPE ("vectorizable_induction");
7585 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7586 return true;
7587 }
7588
7589 /* Transform. */
7590
7591 /* Compute a vector variable, initialized with the first VF values of
7592 the induction variable. E.g., for an iv with IV_PHI='X' and
7593 evolution S, for a vector of 4 units, we want to compute:
7594 [X, X + S, X + 2*S, X + 3*S]. */
7595
7596 if (dump_enabled_p ())
7597 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7598
7599 latch_e = loop_latch_edge (iv_loop);
7600 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7601
7602 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7603 gcc_assert (step_expr != NULL_TREE);
7604
7605 pe = loop_preheader_edge (iv_loop);
7606 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7607 loop_preheader_edge (iv_loop));
7608
7609 stmts = NULL;
7610 if (!nested_in_vect_loop)
7611 {
7612 /* Convert the initial value to the desired type. */
7613 tree new_type = TREE_TYPE (vectype);
7614 init_expr = gimple_convert (&stmts, new_type, init_expr);
7615
7616 /* If we are using the loop mask to "peel" for alignment then we need
7617 to adjust the start value here. */
7618 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7619 if (skip_niters != NULL_TREE)
7620 {
7621 if (FLOAT_TYPE_P (vectype))
7622 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7623 skip_niters);
7624 else
7625 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7626 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7627 skip_niters, step_expr);
7628 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7629 init_expr, skip_step);
7630 }
7631 }
7632
7633 /* Convert the step to the desired type. */
7634 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7635
7636 if (stmts)
7637 {
7638 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7639 gcc_assert (!new_bb);
7640 }
7641
7642 /* Find the first insertion point in the BB. */
7643 basic_block bb = gimple_bb (phi);
7644 si = gsi_after_labels (bb);
7645
7646 /* For SLP induction we have to generate several IVs as for example
7647 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7648 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7649 [VF*S, VF*S, VF*S, VF*S] for all. */
7650 if (slp_node)
7651 {
7652 /* Enforced above. */
7653 unsigned int const_nunits = nunits.to_constant ();
7654
7655 /* Generate [VF*S, VF*S, ... ]. */
7656 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7657 {
7658 expr = build_int_cst (integer_type_node, vf);
7659 expr = fold_convert (TREE_TYPE (step_expr), expr);
7660 }
7661 else
7662 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7663 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7664 expr, step_expr);
7665 if (! CONSTANT_CLASS_P (new_name))
7666 new_name = vect_init_vector (stmt_info, new_name,
7667 TREE_TYPE (step_expr), NULL);
7668 new_vec = build_vector_from_val (vectype, new_name);
7669 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7670
7671 /* Now generate the IVs. */
7672 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7673 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7674 unsigned elts = const_nunits * nvects;
7675 unsigned nivs = least_common_multiple (group_size,
7676 const_nunits) / const_nunits;
7677 gcc_assert (elts % group_size == 0);
7678 tree elt = init_expr;
7679 unsigned ivn;
7680 for (ivn = 0; ivn < nivs; ++ivn)
7681 {
7682 tree_vector_builder elts (vectype, const_nunits, 1);
7683 stmts = NULL;
7684 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7685 {
7686 if (ivn*const_nunits + eltn >= group_size
7687 && (ivn * const_nunits + eltn) % group_size == 0)
7688 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7689 elt, step_expr);
7690 elts.quick_push (elt);
7691 }
7692 vec_init = gimple_build_vector (&stmts, &elts);
7693 if (stmts)
7694 {
7695 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7696 gcc_assert (!new_bb);
7697 }
7698
7699 /* Create the induction-phi that defines the induction-operand. */
7700 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7701 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7702 stmt_vec_info induction_phi_info
7703 = loop_vinfo->add_stmt (induction_phi);
7704 induc_def = PHI_RESULT (induction_phi);
7705
7706 /* Create the iv update inside the loop */
7707 vec_def = make_ssa_name (vec_dest);
7708 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7709 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7710 loop_vinfo->add_stmt (new_stmt);
7711
7712 /* Set the arguments of the phi node: */
7713 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7714 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7715 UNKNOWN_LOCATION);
7716
7717 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7718 }
7719
7720 /* Re-use IVs when we can. */
7721 if (ivn < nvects)
7722 {
7723 unsigned vfp
7724 = least_common_multiple (group_size, const_nunits) / group_size;
7725 /* Generate [VF'*S, VF'*S, ... ]. */
7726 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7727 {
7728 expr = build_int_cst (integer_type_node, vfp);
7729 expr = fold_convert (TREE_TYPE (step_expr), expr);
7730 }
7731 else
7732 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7733 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7734 expr, step_expr);
7735 if (! CONSTANT_CLASS_P (new_name))
7736 new_name = vect_init_vector (stmt_info, new_name,
7737 TREE_TYPE (step_expr), NULL);
7738 new_vec = build_vector_from_val (vectype, new_name);
7739 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7740 for (; ivn < nvects; ++ivn)
7741 {
7742 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7743 tree def;
7744 if (gimple_code (iv) == GIMPLE_PHI)
7745 def = gimple_phi_result (iv);
7746 else
7747 def = gimple_assign_lhs (iv);
7748 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7749 PLUS_EXPR,
7750 def, vec_step);
7751 if (gimple_code (iv) == GIMPLE_PHI)
7752 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7753 else
7754 {
7755 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7756 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7757 }
7758 SLP_TREE_VEC_STMTS (slp_node).quick_push
7759 (loop_vinfo->add_stmt (new_stmt));
7760 }
7761 }
7762
7763 return true;
7764 }
7765
7766 /* Create the vector that holds the initial_value of the induction. */
7767 if (nested_in_vect_loop)
7768 {
7769 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7770 been created during vectorization of previous stmts. We obtain it
7771 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7772 vec_init = vect_get_vec_def_for_operand (init_expr, stmt_info);
7773 /* If the initial value is not of proper type, convert it. */
7774 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7775 {
7776 new_stmt
7777 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7778 vect_simple_var,
7779 "vec_iv_"),
7780 VIEW_CONVERT_EXPR,
7781 build1 (VIEW_CONVERT_EXPR, vectype,
7782 vec_init));
7783 vec_init = gimple_assign_lhs (new_stmt);
7784 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7785 new_stmt);
7786 gcc_assert (!new_bb);
7787 loop_vinfo->add_stmt (new_stmt);
7788 }
7789 }
7790 else
7791 {
7792 /* iv_loop is the loop to be vectorized. Create:
7793 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7794 stmts = NULL;
7795 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7796
7797 unsigned HOST_WIDE_INT const_nunits;
7798 if (nunits.is_constant (&const_nunits))
7799 {
7800 tree_vector_builder elts (vectype, const_nunits, 1);
7801 elts.quick_push (new_name);
7802 for (i = 1; i < const_nunits; i++)
7803 {
7804 /* Create: new_name_i = new_name + step_expr */
7805 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7806 new_name, step_expr);
7807 elts.quick_push (new_name);
7808 }
7809 /* Create a vector from [new_name_0, new_name_1, ...,
7810 new_name_nunits-1] */
7811 vec_init = gimple_build_vector (&stmts, &elts);
7812 }
7813 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7814 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7815 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7816 new_name, step_expr);
7817 else
7818 {
7819 /* Build:
7820 [base, base, base, ...]
7821 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7822 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7823 gcc_assert (flag_associative_math);
7824 tree index = build_index_vector (vectype, 0, 1);
7825 tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7826 new_name);
7827 tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7828 step_expr);
7829 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7830 vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7831 vec_init, step_vec);
7832 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7833 vec_init, base_vec);
7834 }
7835
7836 if (stmts)
7837 {
7838 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7839 gcc_assert (!new_bb);
7840 }
7841 }
7842
7843
7844 /* Create the vector that holds the step of the induction. */
7845 if (nested_in_vect_loop)
7846 /* iv_loop is nested in the loop to be vectorized. Generate:
7847 vec_step = [S, S, S, S] */
7848 new_name = step_expr;
7849 else
7850 {
7851 /* iv_loop is the loop to be vectorized. Generate:
7852 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7853 gimple_seq seq = NULL;
7854 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7855 {
7856 expr = build_int_cst (integer_type_node, vf);
7857 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7858 }
7859 else
7860 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7861 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7862 expr, step_expr);
7863 if (seq)
7864 {
7865 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7866 gcc_assert (!new_bb);
7867 }
7868 }
7869
7870 t = unshare_expr (new_name);
7871 gcc_assert (CONSTANT_CLASS_P (new_name)
7872 || TREE_CODE (new_name) == SSA_NAME);
7873 new_vec = build_vector_from_val (vectype, t);
7874 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7875
7876
7877 /* Create the following def-use cycle:
7878 loop prolog:
7879 vec_init = ...
7880 vec_step = ...
7881 loop:
7882 vec_iv = PHI <vec_init, vec_loop>
7883 ...
7884 STMT
7885 ...
7886 vec_loop = vec_iv + vec_step; */
7887
7888 /* Create the induction-phi that defines the induction-operand. */
7889 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7890 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7891 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7892 induc_def = PHI_RESULT (induction_phi);
7893
7894 /* Create the iv update inside the loop */
7895 vec_def = make_ssa_name (vec_dest);
7896 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7897 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7898 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7899
7900 /* Set the arguments of the phi node: */
7901 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7902 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7903 UNKNOWN_LOCATION);
7904
7905 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7906
7907 	  /* In case the vectorization factor (VF) is bigger than the number
7908 	     of elements that we can fit in a vectype (nunits), we have to generate
7909 	     more than one vector stmt, i.e. we need to "unroll" the
7910 vector stmt by a factor VF/nunits. For more details see documentation
7911 in vectorizable_operation. */
7912
7913 if (ncopies > 1)
7914 {
7915 gimple_seq seq = NULL;
7916 stmt_vec_info prev_stmt_vinfo;
7917 /* FORNOW. This restriction should be relaxed. */
7918 gcc_assert (!nested_in_vect_loop);
7919
7920 /* Create the vector that holds the step of the induction. */
7921 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7922 {
7923 expr = build_int_cst (integer_type_node, nunits);
7924 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7925 }
7926 else
7927 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7928 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7929 expr, step_expr);
7930 if (seq)
7931 {
7932 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7933 gcc_assert (!new_bb);
7934 }
7935
7936 t = unshare_expr (new_name);
7937 gcc_assert (CONSTANT_CLASS_P (new_name)
7938 || TREE_CODE (new_name) == SSA_NAME);
7939 new_vec = build_vector_from_val (vectype, t);
7940 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7941
7942 vec_def = induc_def;
7943 prev_stmt_vinfo = induction_phi_info;
7944 for (i = 1; i < ncopies; i++)
7945 {
7946 /* vec_i = vec_prev + vec_step */
7947 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7948 vec_def, vec_step);
7949 vec_def = make_ssa_name (vec_dest, new_stmt);
7950 gimple_assign_set_lhs (new_stmt, vec_def);
7951
7952 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7953 new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7954 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7955 prev_stmt_vinfo = new_stmt_info;
7956 }
7957 }
7958
7959 if (nested_in_vect_loop)
7960 {
7961 /* Find the loop-closed exit-phi of the induction, and record
7962 the final vector of induction results: */
7963 exit_phi = NULL;
7964 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7965 {
7966 gimple *use_stmt = USE_STMT (use_p);
7967 if (is_gimple_debug (use_stmt))
7968 continue;
7969
7970 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7971 {
7972 exit_phi = use_stmt;
7973 break;
7974 }
7975 }
7976 if (exit_phi)
7977 {
7978 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7979 	      /* FORNOW.  We do not yet support an inner-loop induction that is used
7980 	         only outside the outer loop (i.e. not inside the outer loop itself).  */
7981 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7982 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7983
7984 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7985 if (dump_enabled_p ())
7986 dump_printf_loc (MSG_NOTE, vect_location,
7987 "vector of inductions after inner-loop:%G",
7988 new_stmt);
7989 }
7990 }
7991
7992
7993 if (dump_enabled_p ())
7994 dump_printf_loc (MSG_NOTE, vect_location,
7995 "transform induction: created def-use cycle: %G%G",
7996 induction_phi, SSA_NAME_DEF_STMT (vec_def));
7997
7998 return true;
7999 }
8000
8001 /* Function vectorizable_live_operation.
8002
8003 STMT_INFO computes a value that is used outside the loop. Check if
8004 it can be supported. */
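/* For illustration (an assumed source form): in

	for (i = 0; i < n; i++)
	  last = a[i];
	use (last);

   LAST is computed inside the loop but used after it, so it is a "live"
   operation; the vectorized loop must extract the final scalar value from
   the last vector of results.  */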
8005
8006 bool
8007 vectorizable_live_operation (stmt_vec_info stmt_info,
8008 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
8009 slp_tree slp_node, int slp_index,
8010 stmt_vec_info *vec_stmt,
8011 stmt_vector_for_cost *)
8012 {
8013 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
8014 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8015 imm_use_iterator imm_iter;
8016 tree lhs, lhs_type, bitsize, vec_bitsize;
8017 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8018 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8019 int ncopies;
8020 gimple *use_stmt;
8021 auto_vec<tree> vec_oprnds;
8022 int vec_entry = 0;
8023 poly_uint64 vec_index = 0;
8024
8025 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8026
8027 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
8028 return false;
8029
8030 /* FORNOW. CHECKME. */
8031 if (nested_in_vect_loop_p (loop, stmt_info))
8032 return false;
8033
8034 /* If STMT is not relevant and it is a simple assignment and its inputs are
8035 invariant then it can remain in place, unvectorized. The original last
8036 scalar value that it computes will be used. */
8037 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8038 {
8039 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
8040 if (dump_enabled_p ())
8041 dump_printf_loc (MSG_NOTE, vect_location,
8042 "statement is simple and uses invariant. Leaving in "
8043 "place.\n");
8044 return true;
8045 }
8046
8047 if (slp_node)
8048 ncopies = 1;
8049 else
8050 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8051
8052 if (slp_node)
8053 {
8054 gcc_assert (slp_index >= 0);
8055
8056 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
8057 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8058
8059 	      /* Get the last occurrence of the scalar index from the concatenation of
8060 	         all the SLP vectors.  Calculate which SLP vector it is and the index
8061 	         within it.  */
8062 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
8063
8064 /* Calculate which vector contains the result, and which lane of
8065 that vector we need. */
8066 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8067 {
8068 if (dump_enabled_p ())
8069 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8070 "Cannot determine which vector holds the"
8071 " final result.\n");
8072 return false;
8073 }
8074 }
8075
8076 if (!vec_stmt)
8077 {
8078 /* No transformation required. */
8079 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
8080 {
8081 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8082 OPTIMIZE_FOR_SPEED))
8083 {
8084 if (dump_enabled_p ())
8085 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8086 "can't use a fully-masked loop because "
8087 "the target doesn't support extract last "
8088 "reduction.\n");
8089 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8090 }
8091 else if (slp_node)
8092 {
8093 if (dump_enabled_p ())
8094 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8095 "can't use a fully-masked loop because an "
8096 "SLP statement is live after the loop.\n");
8097 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8098 }
8099 else if (ncopies > 1)
8100 {
8101 if (dump_enabled_p ())
8102 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8103 "can't use a fully-masked loop because"
8104 " ncopies is greater than 1.\n");
8105 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8106 }
8107 else
8108 {
8109 gcc_assert (ncopies == 1 && !slp_node);
8110 vect_record_loop_mask (loop_vinfo,
8111 &LOOP_VINFO_MASKS (loop_vinfo),
8112 1, vectype);
8113 }
8114 }
8115 return true;
8116 }
8117
8118 /* Use the lhs of the original scalar statement. */
8119 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
8120
8121 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
8122 : gimple_get_lhs (stmt);
8123 lhs_type = TREE_TYPE (lhs);
8124
8125 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
8126 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
8127 : TYPE_SIZE (TREE_TYPE (vectype)));
8128 vec_bitsize = TYPE_SIZE (vectype);
8129
8130 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8131 tree vec_lhs, bitstart;
8132 if (slp_node)
8133 {
8134 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8135
8136 /* Get the correct slp vectorized stmt. */
8137 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
8138 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
8139 vec_lhs = gimple_phi_result (phi);
8140 else
8141 vec_lhs = gimple_get_lhs (vec_stmt);
8142
8143 /* Get entry to use. */
8144 bitstart = bitsize_int (vec_index);
8145 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8146 }
8147 else
8148 {
8149 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
8150 vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
8151 gcc_checking_assert (ncopies == 1
8152 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8153
8154 /* For multiple copies, get the last copy. */
8155 for (int i = 1; i < ncopies; ++i)
8156 vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
8157
8158 /* Get the last lane in the vector. */
8159 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
8160 }
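  /* For illustration: with a V4SI result vector, VEC_BITSIZE is 128 and
     BITSIZE is 32, so the non-SLP case above selects BITSTART = 96, i.e.
     the last (highest-numbered) lane of the final copy.  */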
8161
8162 gimple_seq stmts = NULL;
8163 tree new_tree;
8164 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8165 {
8166 /* Emit:
8167
8168 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8169
8170 where VEC_LHS is the vectorized live-out result and MASK is
8171 the loop mask for the final iteration. */
8172 gcc_assert (ncopies == 1 && !slp_node);
8173 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8174 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8175 1, vectype, 0);
8176 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
8177 scalar_type, mask, vec_lhs);
8178
8179 /* Convert the extracted vector element to the required scalar type. */
8180 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8181 }
8182 else
8183 {
8184 tree bftype = TREE_TYPE (vectype);
8185 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8186 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8187 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
8188 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8189 &stmts, true, NULL_TREE);
8190 }
8191
8192 if (stmts)
8193 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
8194
8195 	  /* Replace the use of LHS with the newly computed result.  If the use stmt is a
8196 	     single-argument PHI, just replace all uses of the PHI result.  This is necessary
8197 	     because the LC SSA PHI defining LHS may come before the newly inserted stmt.  */
8198 use_operand_p use_p;
8199 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8200 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8201 && !is_gimple_debug (use_stmt))
8202 {
8203 if (gimple_code (use_stmt) == GIMPLE_PHI
8204 && gimple_phi_num_args (use_stmt) == 1)
8205 {
8206 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8207 }
8208 else
8209 {
8210 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8211 SET_USE (use_p, new_tree);
8212 }
8213 update_stmt (use_stmt);
8214 }
8215
8216 return true;
8217 }
8218
8219 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8220
8221 static void
8222 vect_loop_kill_debug_uses (struct loop *loop, stmt_vec_info stmt_info)
8223 {
8224 ssa_op_iter op_iter;
8225 imm_use_iterator imm_iter;
8226 def_operand_p def_p;
8227 gimple *ustmt;
8228
8229 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8230 {
8231 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8232 {
8233 basic_block bb;
8234
8235 if (!is_gimple_debug (ustmt))
8236 continue;
8237
8238 bb = gimple_bb (ustmt);
8239
8240 if (!flow_bb_inside_loop_p (loop, bb))
8241 {
8242 if (gimple_debug_bind_p (ustmt))
8243 {
8244 if (dump_enabled_p ())
8245 dump_printf_loc (MSG_NOTE, vect_location,
8246 "killing debug use\n");
8247
8248 gimple_debug_bind_reset_value (ustmt);
8249 update_stmt (ustmt);
8250 }
8251 else
8252 gcc_unreachable ();
8253 }
8254 }
8255 }
8256 }
8257
8258 /* Given loop represented by LOOP_VINFO, return true if computation of
8259 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8260 otherwise. */
8261
8262 static bool
8263 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8264 {
8265 /* Constant case. */
8266 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8267 {
8268 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8269 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8270
8271 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8272 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8273 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8274 return true;
8275 }
8276
8277 widest_int max;
8278 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8279 /* Check the upper bound of loop niters. */
8280 if (get_max_loop_iterations (loop, &max))
8281 {
8282 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8283 signop sgn = TYPE_SIGN (type);
8284 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8285 if (max < type_max)
8286 return true;
8287 }
8288 return false;
8289 }
8290
8291 /* Return a mask type with half the number of elements as TYPE. */
8292
8293 tree
8294 vect_halve_mask_nunits (tree type)
8295 {
8296 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8297 return build_truth_vector_type (nunits, current_vector_size);
8298 }
8299
8300 /* Return a mask type with twice as many elements as TYPE. */
8301
8302 tree
8303 vect_double_mask_nunits (tree type)
8304 {
8305 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8306 return build_truth_vector_type (nunits, current_vector_size);
8307 }
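/* Purely as an illustration of the two helpers above: with a
   current_vector_size of 16 bytes, halving an 8-element mask type (as used
   for V8HI data) gives a 4-element mask type (for V4SI data), while
   doubling it gives a 16-element mask type (for V16QI data). */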
8308
8309 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8310 contain a sequence of NVECTORS masks that each control a vector of type
8311 VECTYPE. */
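/* A hypothetical example: with a vectorization factor of 8 and an rgroup
   needing NVECTORS = 2 masks for a 4-element VECTYPE, the number of scalars
   controlled per iteration is 2 * 4 / 8 = 1, so (*MASKS)[1] records
   max_nscalars_per_iter = 1 and a 4-element mask type, unless another
   statement has already recorded a larger value for that rgroup. */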
8312
8313 void
8314 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8315 unsigned int nvectors, tree vectype)
8316 {
8317 gcc_assert (nvectors != 0);
8318 if (masks->length () < nvectors)
8319 masks->safe_grow_cleared (nvectors);
8320 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8321 /* The number of scalars per iteration and the number of vectors are
8322 both compile-time constants. */
8323 unsigned int nscalars_per_iter
8324 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8325 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8326 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8327 {
8328 rgm->max_nscalars_per_iter = nscalars_per_iter;
8329 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8330 }
8331 }
8332
8333 /* Given a complete set of masks MASKS, extract mask number INDEX
8334 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8335 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8336
8337 See the comment above vec_loop_masks for more details about the mask
8338 arrangement. */
8339
8340 tree
8341 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8342 unsigned int nvectors, tree vectype, unsigned int index)
8343 {
8344 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8345 tree mask_type = rgm->mask_type;
8346
8347 /* Populate the rgroup's mask array, if this is the first time we've
8348 used it. */
8349 if (rgm->masks.is_empty ())
8350 {
8351 rgm->masks.safe_grow_cleared (nvectors);
8352 for (unsigned int i = 0; i < nvectors; ++i)
8353 {
8354 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8355 /* Provide a dummy definition until the real one is available. */
8356 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8357 rgm->masks[i] = mask;
8358 }
8359 }
8360
8361 tree mask = rgm->masks[index];
8362 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8363 TYPE_VECTOR_SUBPARTS (vectype)))
8364 {
8365 /* A loop mask for data type X can be reused for data type Y
8366 if X has N times more elements than Y and if Y's elements
8367 are N times bigger than X's. In this case each sequence
8368 of N elements in the loop mask will be all-zero or all-one.
8369 We can then view-convert the mask so that each sequence of
8370 N elements is replaced by a single element. */
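/* For instance (illustrative only): a mask recorded for vectors of eight
   16-bit elements can serve a request for vectors of four 32-bit elements;
   each aligned pair of mask elements is known to be identical, so the
   VIEW_CONVERT_EXPR below reinterprets the 8-element mask as an equivalent
   4-element mask. */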
8371 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8372 TYPE_VECTOR_SUBPARTS (vectype)));
8373 gimple_seq seq = NULL;
8374 mask_type = build_same_sized_truth_vector_type (vectype);
8375 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8376 if (seq)
8377 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8378 }
8379 return mask;
8380 }
8381
8382 /* Scale the profiling counters of LOOP, which is vectorized by factor VF,
8383 according to the new estimated number of iterations. */
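/* Purely as an illustration, with made-up numbers: if the preheader count
   is 10, the header count is 1000 and VF is 4, niter_for_unrolled_loop
   gives an estimated latch count of about 24, so the body counts are scaled
   by roughly 10 * (24 + 1) / 1000 = 1/4 and the exit edge is given a
   probability of about 1 / (24 + 1). */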
8384
8385 static void
8386 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8387 {
8388 edge preheader = loop_preheader_edge (loop);
8389 /* Reduce loop iterations by the vectorization factor. */
8390 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8391 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8392
8393 if (freq_h.nonzero_p ())
8394 {
8395 profile_probability p;
8396
8397 /* Avoid dropping the loop body profile counter to 0 because of a zero
8398 count in the loop's preheader. */
8399 if (!(freq_e == profile_count::zero ()))
8400 freq_e = freq_e.force_nonzero ();
8401 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8402 scale_loop_frequencies (loop, p);
8403 }
8404
8405 edge exit_e = single_exit (loop);
8406 exit_e->probability = profile_probability::always ()
8407 .apply_scale (1, new_est_niter + 1);
8408
8409 edge exit_l = single_pred_edge (loop->latch);
8410 profile_probability prob = exit_l->probability;
8411 exit_l->probability = exit_e->probability.invert ();
8412 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8413 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8414 }
8415
8416 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8417 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8418 stmt_vec_info. */
8419
8420 static void
8421 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8422 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8423 {
8424 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8425 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8426
8427 if (dump_enabled_p ())
8428 dump_printf_loc (MSG_NOTE, vect_location,
8429 "------>vectorizing statement: %G", stmt_info->stmt);
8430
8431 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8432 vect_loop_kill_debug_uses (loop, stmt_info);
8433
8434 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8435 && !STMT_VINFO_LIVE_P (stmt_info))
8436 return;
8437
8438 if (STMT_VINFO_VECTYPE (stmt_info))
8439 {
8440 poly_uint64 nunits
8441 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8442 if (!STMT_SLP_TYPE (stmt_info)
8443 && maybe_ne (nunits, vf)
8444 && dump_enabled_p ())
8445 /* For SLP, VF is set according to the unrolling factor, not to
8446 the vector size, hence this diagnostic does not apply to SLP. */
8447 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8448 }
8449
8450 /* Pure SLP statements have already been vectorized. We still need
8451 to apply loop vectorization to hybrid SLP statements. */
8452 if (PURE_SLP_STMT (stmt_info))
8453 return;
8454
8455 if (dump_enabled_p ())
8456 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8457
8458 if (vect_transform_stmt (stmt_info, gsi, NULL, NULL))
8459 *seen_store = stmt_info;
8460 }
8461
8462 /* Function vect_transform_loop.
8463
8464 The analysis phase has determined that the loop is vectorizable.
8465 Vectorize the loop - create vectorized stmts to replace the scalar
8466 stmts in the loop, and update the loop exit condition.
8467 Return the scalar epilogue loop, if any. */
8468
8469 struct loop *
8470 vect_transform_loop (loop_vec_info loop_vinfo)
8471 {
8472 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8473 struct loop *epilogue = NULL;
8474 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8475 int nbbs = loop->num_nodes;
8476 int i;
8477 tree niters_vector = NULL_TREE;
8478 tree step_vector = NULL_TREE;
8479 tree niters_vector_mult_vf = NULL_TREE;
8480 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8481 unsigned int lowest_vf = constant_lower_bound (vf);
8482 gimple *stmt;
8483 bool check_profitability = false;
8484 unsigned int th;
8485
8486 DUMP_VECT_SCOPE ("vec_transform_loop");
8487
8488 loop_vinfo->shared->check_datarefs ();
8489
8490 /* Use the more conservative vectorization threshold. If the number
8491 of iterations is constant, assume the cost check has been performed
8492 by our caller. If the threshold makes all loops profitable that
8493 run at least the (estimated) vectorization factor number of times,
8494 checking is pointless, too. */
8495 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8496 if (th >= vect_vf_for_cost (loop_vinfo)
8497 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8498 {
8499 if (dump_enabled_p ())
8500 dump_printf_loc (MSG_NOTE, vect_location,
8501 "Profitability threshold is %d loop iterations.\n",
8502 th);
8503 check_profitability = true;
8504 }
8505
8506 /* Make sure there exists a single-predecessor exit bb. Do this before
8507 versioning. */
8508 edge e = single_exit (loop);
8509 if (! single_pred_p (e->dest))
8510 {
8511 split_loop_exit_edge (e, true);
8512 if (dump_enabled_p ())
8513 dump_printf (MSG_NOTE, "split exit edge\n");
8514 }
8515
8516 /* Version the loop first, if required, so the profitability check
8517 comes first. */
8518
8519 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8520 {
8521 poly_uint64 versioning_threshold
8522 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8523 if (check_profitability
8524 && ordered_p (poly_uint64 (th), versioning_threshold))
8525 {
8526 versioning_threshold = ordered_max (poly_uint64 (th),
8527 versioning_threshold);
8528 check_profitability = false;
8529 }
8530 struct loop *sloop
8531 = vect_loop_versioning (loop_vinfo, th, check_profitability,
8532 versioning_threshold);
8533 sloop->force_vectorize = false;
8534 check_profitability = false;
8535 }
8536
8537 /* Make sure there exists a single-predecessor exit bb also on the
8538 scalar loop copy. Do this after versioning but before peeling
8539 so the CFG structure is fine for both the scalar and the if-converted
8540 loop, letting slpeel_duplicate_current_defs_from_edges face matched
8541 loop-closed PHI nodes on the exit. */
8542 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8543 {
8544 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8545 if (! single_pred_p (e->dest))
8546 {
8547 split_loop_exit_edge (e, true);
8548 if (dump_enabled_p ())
8549 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8550 }
8551 }
8552
8553 tree niters = vect_build_loop_niters (loop_vinfo);
8554 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8555 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8556 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8557 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8558 &step_vector, &niters_vector_mult_vf, th,
8559 check_profitability, niters_no_overflow);
8560
8561 if (niters_vector == NULL_TREE)
8562 {
8563 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8564 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8565 && known_eq (lowest_vf, vf))
8566 {
8567 niters_vector
8568 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8569 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8570 step_vector = build_one_cst (TREE_TYPE (niters));
8571 }
8572 else
8573 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8574 &step_vector, niters_no_overflow);
8575 }
8576
8577 /* 1) Make sure the loop header has exactly two entries
8578 2) Make sure we have a preheader basic block. */
8579
8580 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8581
8582 split_edge (loop_preheader_edge (loop));
8583
8584 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8585 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8586 /* This will deal with any possible peeling. */
8587 vect_prepare_for_masked_peels (loop_vinfo);
8588
8589 /* Schedule the SLP instances first, then handle loop vectorization
8590 below. */
8591 if (!loop_vinfo->slp_instances.is_empty ())
8592 {
8593 DUMP_VECT_SCOPE ("scheduling SLP instances");
8594 vect_schedule_slp (loop_vinfo);
8595 }
8596
8597 /* FORNOW: the vectorizer supports only loops whose body consists
8598 of one basic block (header + empty latch). When the vectorizer
8599 supports more involved loop forms, the order in which the BBs are
8600 traversed will need to be reconsidered. */
8601
8602 for (i = 0; i < nbbs; i++)
8603 {
8604 basic_block bb = bbs[i];
8605 stmt_vec_info stmt_info;
8606
8607 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8608 gsi_next (&si))
8609 {
8610 gphi *phi = si.phi ();
8611 if (dump_enabled_p ())
8612 dump_printf_loc (MSG_NOTE, vect_location,
8613 "------>vectorizing phi: %G", phi);
8614 stmt_info = loop_vinfo->lookup_stmt (phi);
8615 if (!stmt_info)
8616 continue;
8617
8618 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8619 vect_loop_kill_debug_uses (loop, stmt_info);
8620
8621 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8622 && !STMT_VINFO_LIVE_P (stmt_info))
8623 continue;
8624
8625 if (STMT_VINFO_VECTYPE (stmt_info)
8626 && (maybe_ne
8627 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8628 && dump_enabled_p ())
8629 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8630
8631 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8632 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8633 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8634 && ! PURE_SLP_STMT (stmt_info))
8635 {
8636 if (dump_enabled_p ())
8637 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8638 vect_transform_stmt (stmt_info, NULL, NULL, NULL);
8639 }
8640 }
8641
8642 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8643 !gsi_end_p (si);)
8644 {
8645 stmt = gsi_stmt (si);
8646 /* During vectorization remove existing clobber stmts. */
8647 if (gimple_clobber_p (stmt))
8648 {
8649 unlink_stmt_vdef (stmt);
8650 gsi_remove (&si, true);
8651 release_defs (stmt);
8652 }
8653 else
8654 {
8655 stmt_info = loop_vinfo->lookup_stmt (stmt);
8656
8657 /* Vector stmts created in the outer-loop during vectorization of
8658 stmts in an inner-loop may not have a stmt_info, and do not
8659 need to be vectorized. */
8660 stmt_vec_info seen_store = NULL;
8661 if (stmt_info)
8662 {
8663 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8664 {
8665 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8666 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8667 !gsi_end_p (subsi); gsi_next (&subsi))
8668 {
8669 stmt_vec_info pat_stmt_info
8670 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
8671 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
8672 &si, &seen_store);
8673 }
8674 stmt_vec_info pat_stmt_info
8675 = STMT_VINFO_RELATED_STMT (stmt_info);
8676 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
8677 &seen_store);
8678 }
8679 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
8680 &seen_store);
8681 }
8682 gsi_next (&si);
8683 if (seen_store)
8684 {
8685 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8686 /* Interleaving: the vectorization of the
8687 interleaving chain has been completed -
8688 free all the stores in the chain. */
8689 vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8690 else
8691 /* Free the attached stmt_vec_info and remove the stmt. */
8692 loop_vinfo->remove_stmt (stmt_info);
8693 }
8694 }
8695 }
8696
8697 /* Stub out scalar statements that must not survive vectorization.
8698 Doing this here helps with grouped statements, or statements that
8699 are involved in patterns. */
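/* A hypothetical example: an if-converted scalar load such as

     _5 = MASK_LOAD (p_2, 0B, cond_3);

   whose result is no longer needed once its consumers have been vectorized
   still has a non-vector LHS, so the loop below replaces it with _5 = 0 and
   leaves the dead assignment for later cleanup. */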
8700 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8701 !gsi_end_p (gsi); gsi_next (&gsi))
8702 {
8703 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8704 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8705 {
8706 tree lhs = gimple_get_lhs (call);
8707 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8708 {
8709 tree zero = build_zero_cst (TREE_TYPE (lhs));
8710 gimple *new_stmt = gimple_build_assign (lhs, zero);
8711 gsi_replace (&gsi, new_stmt, true);
8712 }
8713 }
8714 }
8715 } /* BBs in loop */
8716
8717 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8718 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8719 if (integer_onep (step_vector))
8720 niters_no_overflow = true;
8721 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8722 niters_vector_mult_vf, !niters_no_overflow);
8723
8724 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8725 scale_profile_for_vect_loop (loop, assumed_vf);
8726
8727 /* True if the final iteration might not handle a full vector's
8728 worth of scalar iterations. */
8729 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8730 /* The minimum number of iterations performed by the epilogue. This
8731 is 1 when peeling for gaps because we always need a final scalar
8732 iteration. */
8733 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8734 /* +1 to convert latch counts to loop iteration counts,
8735 -min_epilogue_iters to remove iterations that cannot be performed
8736 by the vector code. */
8737 int bias_for_lowest = 1 - min_epilogue_iters;
8738 int bias_for_assumed = bias_for_lowest;
8739 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8740 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8741 {
8742 /* When the amount of peeling is known at compile time, the first
8743 iteration will have exactly alignment_npeels active elements.
8744 In the worst case it will have at least one. */
8745 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8746 bias_for_lowest += lowest_vf - min_first_active;
8747 bias_for_assumed += assumed_vf - min_first_active;
8748 }
8749 /* In these calculations the "- 1" converts loop iteration counts
8750 back to latch counts. */
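/* For example, with made-up numbers: if the scalar loop's
   nb_iterations_upper_bound is 99 (at most 100 iterations), LOWEST_VF is 4,
   there is no peeling for gaps and the loop is not fully masked, then
   bias_for_lowest is 1 and the new bound is floor ((99 + 1) / 4) - 1 = 24,
   i.e. at most 25 vector iterations. */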
8751 if (loop->any_upper_bound)
8752 loop->nb_iterations_upper_bound
8753 = (final_iter_may_be_partial
8754 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8755 lowest_vf) - 1
8756 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8757 lowest_vf) - 1);
8758 if (loop->any_likely_upper_bound)
8759 loop->nb_iterations_likely_upper_bound
8760 = (final_iter_may_be_partial
8761 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8762 + bias_for_lowest, lowest_vf) - 1
8763 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8764 + bias_for_lowest, lowest_vf) - 1);
8765 if (loop->any_estimate)
8766 loop->nb_iterations_estimate
8767 = (final_iter_may_be_partial
8768 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8769 assumed_vf) - 1
8770 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8771 assumed_vf) - 1);
8772
8773 if (dump_enabled_p ())
8774 {
8775 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8776 {
8777 dump_printf_loc (MSG_NOTE, vect_location,
8778 "LOOP VECTORIZED\n");
8779 if (loop->inner)
8780 dump_printf_loc (MSG_NOTE, vect_location,
8781 "OUTER LOOP VECTORIZED\n");
8782 dump_printf (MSG_NOTE, "\n");
8783 }
8784 else
8785 {
8786 dump_printf_loc (MSG_NOTE, vect_location,
8787 "LOOP EPILOGUE VECTORIZED (VS=");
8788 dump_dec (MSG_NOTE, current_vector_size);
8789 dump_printf (MSG_NOTE, ")\n");
8790 }
8791 }
8792
8793 /* Loops vectorized with a variable factor won't benefit from
8794 unrolling/peeling. */
8795 if (!vf.is_constant ())
8796 {
8797 loop->unroll = 1;
8798 if (dump_enabled_p ())
8799 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
8800 " variable-length vectorization factor\n");
8801 }
8802 /* Free SLP instances here because otherwise stmt reference counting
8803 won't work. */
8804 slp_instance instance;
8805 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8806 vect_free_slp_instance (instance, true);
8807 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8808 /* Clear the safelen field since its value is invalid after vectorization:
8809 the vectorized loop can have loop-carried dependencies. */
8810 loop->safelen = 0;
8811
8812 /* Don't vectorize the epilogue of an epilogue loop. */
8813 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8814 epilogue = NULL;
8815
8816 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8817 epilogue = NULL;
8818
8819 if (epilogue)
8820 {
8821 auto_vector_sizes vector_sizes;
8822 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes, false);
8823 unsigned int next_size = 0;
8824
8825 /* Note LOOP_VINFO_NITERS_KNOWN_P and LOOP_VINFO_INT_NITERS work
8826 on niters already adjusted for the iterations of the prologue. */
8827 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8828 && known_eq (vf, lowest_vf))
8829 {
8830 unsigned HOST_WIDE_INT eiters
8831 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8832 - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
8833 eiters
8834 = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
8835 epilogue->nb_iterations_upper_bound = eiters - 1;
8836 epilogue->any_upper_bound = true;
8837
8838 unsigned int ratio;
8839 while (next_size < vector_sizes.length ()
8840 && !(constant_multiple_p (current_vector_size,
8841 vector_sizes[next_size], &ratio)
8842 && eiters >= lowest_vf / ratio))
8843 next_size += 1;
8844 }
8845 else
8846 while (next_size < vector_sizes.length ()
8847 && maybe_lt (current_vector_size, vector_sizes[next_size]))
8848 next_size += 1;
8849
8850 if (next_size == vector_sizes.length ())
8851 epilogue = NULL;
8852 }
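/* A hypothetical illustration of the vector-size selection above: assume
   the target offers vector sizes of 32, 16 and 8 bytes, current_vector_size
   is 32 bytes, lowest_vf is 8 and only 5 scalar iterations remain for the
   epilogue. The 32-byte entry is rejected because 5 < 8, but the 16-byte
   entry gives ratio = 2 and 5 >= 8 / 2, so the epilogue is retried with
   16-byte vectors. */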
8853
8854 if (epilogue)
8855 {
8856 epilogue->force_vectorize = loop->force_vectorize;
8857 epilogue->safelen = loop->safelen;
8858 epilogue->dont_vectorize = false;
8859
8860 /* We may need to if-convert epilogue to vectorize it. */
8861 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8862 tree_if_conversion (epilogue);
8863 }
8864
8865 return epilogue;
8866 }
8867
8868 /* The code below performs a simple optimization - it reverts
8869 if-conversion for masked stores: if the mask of a store is all-zero,
8870 skip the store and, if possible, the producers of the stored values too.
8871 For example,
8872 for (i=0; i<n; i++)
8873 if (c[i])
8874 {
8875 p1[i] += 1;
8876 p2[i] = p3[i] + 2;
8877 }
8878 this transformation will produce the following semi-hammock:
8879
8880 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
8881 {
8882 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8883 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8884 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8885 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8886 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8887 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8888 }
8889 */
8890
8891 void
8892 optimize_mask_stores (struct loop *loop)
8893 {
8894 basic_block *bbs = get_loop_body (loop);
8895 unsigned nbbs = loop->num_nodes;
8896 unsigned i;
8897 basic_block bb;
8898 struct loop *bb_loop;
8899 gimple_stmt_iterator gsi;
8900 gimple *stmt;
8901 auto_vec<gimple *> worklist;
8902 auto_purge_vect_location sentinel;
8903
8904 vect_location = find_loop_location (loop);
8905 /* Pick up all masked stores in loop if any. */
8906 for (i = 0; i < nbbs; i++)
8907 {
8908 bb = bbs[i];
8909 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8910 gsi_next (&gsi))
8911 {
8912 stmt = gsi_stmt (gsi);
8913 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8914 worklist.safe_push (stmt);
8915 }
8916 }
8917
8918 free (bbs);
8919 if (worklist.is_empty ())
8920 return;
8921
8922 /* Loop has masked stores. */
8923 while (!worklist.is_empty ())
8924 {
8925 gimple *last, *last_store;
8926 edge e, efalse;
8927 tree mask;
8928 basic_block store_bb, join_bb;
8929 gimple_stmt_iterator gsi_to;
8930 tree vdef, new_vdef;
8931 gphi *phi;
8932 tree vectype;
8933 tree zero;
8934
8935 last = worklist.pop ();
8936 mask = gimple_call_arg (last, 2);
8937 bb = gimple_bb (last);
8938 /* Create then_bb and the if-then structure in the CFG; then_bb belongs
8939 to the same loop as if_bb. That loop can be different from LOOP when
8940 a two-level loop nest is vectorized and the mask_store belongs to the
8941 inner loop. */
8942 e = split_block (bb, last);
8943 bb_loop = bb->loop_father;
8944 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8945 join_bb = e->dest;
8946 store_bb = create_empty_bb (bb);
8947 add_bb_to_loop (store_bb, bb_loop);
8948 e->flags = EDGE_TRUE_VALUE;
8949 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8950 /* Mark the edge into STORE_BB as unlikely. */
8951 efalse->probability = profile_probability::unlikely ();
8952 store_bb->count = efalse->count ();
8953 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8954 if (dom_info_available_p (CDI_DOMINATORS))
8955 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8956 if (dump_enabled_p ())
8957 dump_printf_loc (MSG_NOTE, vect_location,
8958 "Create new block %d to sink mask stores.",
8959 store_bb->index);
8960 /* Create vector comparison with boolean result. */
8961 vectype = TREE_TYPE (mask);
8962 zero = build_zero_cst (vectype);
8963 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8964 gsi = gsi_last_bb (bb);
8965 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
8966 /* Create new PHI node for vdef of the last masked store:
8967 .MEM_2 = VDEF <.MEM_1>
8968 will be converted to
8969 .MEM.3 = VDEF <.MEM_1>
8970 and new PHI node will be created in join bb
8971 .MEM_2 = PHI <.MEM_1, .MEM_3>
8972 */
8973 vdef = gimple_vdef (last);
8974 new_vdef = make_ssa_name (gimple_vop (cfun), last);
8975 gimple_set_vdef (last, new_vdef);
8976 phi = create_phi_node (vdef, join_bb);
8977 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8978
8979 /* Put all masked stores with the same mask to STORE_BB if possible. */
8980 while (true)
8981 {
8982 gimple_stmt_iterator gsi_from;
8983 gimple *stmt1 = NULL;
8984
8985 /* Move masked store to STORE_BB. */
8986 last_store = last;
8987 gsi = gsi_for_stmt (last);
8988 gsi_from = gsi;
8989 /* Shift GSI to the previous stmt for further traversal. */
8990 gsi_prev (&gsi);
8991 gsi_to = gsi_start_bb (store_bb);
8992 gsi_move_before (&gsi_from, &gsi_to);
8993 /* Reset GSI_TO to the start of the now non-empty block. */
8994 gsi_to = gsi_start_bb (store_bb);
8995 if (dump_enabled_p ())
8996 dump_printf_loc (MSG_NOTE, vect_location,
8997 "Move stmt to created bb\n%G", last);
8998 /* Move all stored value producers if possible. */
8999 while (!gsi_end_p (gsi))
9000 {
9001 tree lhs;
9002 imm_use_iterator imm_iter;
9003 use_operand_p use_p;
9004 bool res;
9005
9006 /* Skip debug statements. */
9007 if (is_gimple_debug (gsi_stmt (gsi)))
9008 {
9009 gsi_prev (&gsi);
9010 continue;
9011 }
9012 stmt1 = gsi_stmt (gsi);
9013 /* Do not consider statements that write to memory or have
9014 a volatile operand. */
9015 if (gimple_vdef (stmt1)
9016 || gimple_has_volatile_ops (stmt1))
9017 break;
9018 gsi_from = gsi;
9019 gsi_prev (&gsi);
9020 lhs = gimple_get_lhs (stmt1);
9021 if (!lhs)
9022 break;
9023
9024 /* LHS of vectorized stmt must be SSA_NAME. */
9025 if (TREE_CODE (lhs) != SSA_NAME)
9026 break;
9027
9028 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9029 {
9030 /* Remove dead scalar statement. */
9031 if (has_zero_uses (lhs))
9032 {
9033 gsi_remove (&gsi_from, true);
9034 continue;
9035 }
9036 }
9037
9038 /* Check that LHS does not have uses outside of STORE_BB. */
9039 res = true;
9040 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9041 {
9042 gimple *use_stmt;
9043 use_stmt = USE_STMT (use_p);
9044 if (is_gimple_debug (use_stmt))
9045 continue;
9046 if (gimple_bb (use_stmt) != store_bb)
9047 {
9048 res = false;
9049 break;
9050 }
9051 }
9052 if (!res)
9053 break;
9054
9055 if (gimple_vuse (stmt1)
9056 && gimple_vuse (stmt1) != gimple_vuse (last_store))
9057 break;
9058
9059 /* Can move STMT1 to STORE_BB. */
9060 if (dump_enabled_p ())
9061 dump_printf_loc (MSG_NOTE, vect_location,
9062 "Move stmt to created bb\n%G", stmt1);
9063 gsi_move_before (&gsi_from, &gsi_to);
9064 /* Shift GSI_TO for further insertion. */
9065 gsi_prev (&gsi_to);
9066 }
9067 /* Put other masked stores with the same mask to STORE_BB. */
9068 if (worklist.is_empty ()
9069 || gimple_call_arg (worklist.last (), 2) != mask
9070 || worklist.last () != stmt1)
9071 break;
9072 last = worklist.pop ();
9073 }
9074 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
9075 }
9076 }
9077
9078 /* Decide whether it is possible to use a zero-based induction variable
9079 when vectorizing LOOP_VINFO with a fully-masked loop. If it is,
9080 return the value that the induction variable must be able to hold
9081 in order to ensure that the loop ends with an all-false mask.
9082 Return -1 otherwise. */
9083 widest_int
9084 vect_iv_limit_for_full_masking (loop_vec_info loop_vinfo)
9085 {
9086 tree niters_skip = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
9087 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
9088 unsigned HOST_WIDE_INT max_vf = vect_max_vf (loop_vinfo);
9089
9090 /* Calculate the value that the induction variable must be able
9091 to hit in order to ensure that we end the loop with an all-false mask.
9092 This involves adding the maximum number of inactive trailing scalar
9093 iterations. */
9094 widest_int iv_limit = -1;
9095 if (max_loop_iterations (loop, &iv_limit))
9096 {
9097 if (niters_skip)
9098 {
9099 /* Add the maximum number of skipped iterations to the
9100 maximum iteration count. */
9101 if (TREE_CODE (niters_skip) == INTEGER_CST)
9102 iv_limit += wi::to_widest (niters_skip);
9103 else
9104 iv_limit += max_vf - 1;
9105 }
9106 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
9107 /* Make a conservatively-correct assumption. */
9108 iv_limit += max_vf - 1;
9109
9110 /* IV_LIMIT is the maximum number of latch iterations, which is also
9111 the maximum in-range IV value. Round this value down to the previous
9112 vector alignment boundary and then add an extra full iteration. */
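/* Purely as a numeric illustration: with a constant VF of 4 (so MAX_VF is
   also 4), no skipped or peeled iterations and a maximum latch count of 21,
   the limit is (21 & -4) + 4 = 20 + 4 = 24. */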
9113 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
9114 iv_limit = (iv_limit & -(int) known_alignment (vf)) + max_vf;
9115 }
9116 return iv_limit;
9117 }
9118