1 /* Loop Vectorization
2 Copyright (C) 2003-2019 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57
58 /* Loop Vectorization Pass.
59
60 This pass tries to vectorize loops.
61
62 For example, the vectorizer transforms the following simple loop:
63
64 short a[N]; short b[N]; short c[N]; int i;
65
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
68 }
69
70 as if it was manually vectorized by rewriting the source code into:
71
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
76
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
82 }
83
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
95
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
101
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
106
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
117
118 For example, say stmt S1 was vectorized into stmt VS1:
119
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
123
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
128
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
133
134 Operands that are not SSA_NAMEs are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
136
137 Target modeling:
138 =================
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that can support different sizes of vectors will, for now, need
142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
144
145 Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
151
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
154 */
155
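/* A minimal, self-contained sketch (in GNU C, using the vector_size
   extension) of the transformation described above.  It is illustrative
   only, not vectorizer code: the example_* names are made up, and it
   assumes N is a multiple of 8 and that the data is suitably aligned.  */

typedef short example_v8hi __attribute__ ((vector_size (16))); /* 8 lanes.  */

static void
example_scalar_add (short *a, const short *b, const short *c, int n)
{
  for (int i = 0; i < n; i++)
    a[i] = b[i] + c[i];
}

static void
example_vector_add (short *a, const short *b, const short *c, int n)
{
  example_v8hi *pa = (example_v8hi *) a;
  const example_v8hi *pb = (const example_v8hi *) b;
  const example_v8hi *pc = (const example_v8hi *) c;

  /* Each iteration now operates on VF = 8 elements at once.  */
  for (int i = 0; i < n / 8; i++)
    pa[i] = pb[i] + pc[i];
}
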
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
157
158 /* Subroutine of vect_determine_vf_for_stmt that handles only one
159 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
160 may already be set for general statements (not just data refs). */
161
162 static opt_result
163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
164 bool vectype_maybe_set_p,
165 poly_uint64 *vf,
166 vec<stmt_vec_info > *mask_producers)
167 {
168 gimple *stmt = stmt_info->stmt;
169
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
173 {
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return opt_result::success ();
177 }
178
179 tree stmt_vectype, nunits_vectype;
180 opt_result res = vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
181 &nunits_vectype);
182 if (!res)
183 return res;
184
185 if (stmt_vectype)
186 {
187 if (STMT_VINFO_VECTYPE (stmt_info))
188 /* The only case when a vectype has already been set is for stmts
189 that contain a data ref, or for "pattern-stmts" (stmts generated
190 by the vectorizer to represent/replace a certain idiom). */
191 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
192 || vectype_maybe_set_p)
193 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
194 else if (stmt_vectype == boolean_type_node)
195 mask_producers->safe_push (stmt_info);
196 else
197 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
198 }
199
200 if (nunits_vectype)
201 vect_update_max_nunits (vf, nunits_vectype);
202
203 return opt_result::success ();
204 }
205
206 /* Subroutine of vect_determine_vectorization_factor. Set the vector
207 types of STMT_INFO and all attached pattern statements and update
208 the vectorization factor VF accordingly. If some of the statements
209 produce a mask result whose vector type can only be calculated later,
210 add them to MASK_PRODUCERS. Return true on success or false if
211 something prevented vectorization. */
212
213 static opt_result
214 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
215 vec<stmt_vec_info > *mask_producers)
216 {
217 vec_info *vinfo = stmt_info->vinfo;
218 if (dump_enabled_p ())
219 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
220 stmt_info->stmt);
221 opt_result res
222 = vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers);
223 if (!res)
224 return res;
225
226 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
227 && STMT_VINFO_RELATED_STMT (stmt_info))
228 {
229 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
230 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
231
232 /* If a pattern statement has def stmts, analyze them too. */
233 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
234 !gsi_end_p (si); gsi_next (&si))
235 {
236 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
237 if (dump_enabled_p ())
238 dump_printf_loc (MSG_NOTE, vect_location,
239 "==> examining pattern def stmt: %G",
240 def_stmt_info->stmt);
241 res = vect_determine_vf_for_stmt_1 (def_stmt_info, true,
242 vf, mask_producers);
245 if (!res)
246 return res;
247 }
248
249 if (dump_enabled_p ())
250 dump_printf_loc (MSG_NOTE, vect_location,
251 "==> examining pattern statement: %G",
252 stmt_info->stmt);
253 res = vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers);
254 if (!res)
255 return res;
256 }
257
258 return opt_result::success ();
259 }
260
261 /* Function vect_determine_vectorization_factor
262
263 Determine the vectorization factor (VF). VF is the number of data elements
264 that are operated upon in parallel in a single iteration of the vectorized
265 loop. For example, when vectorizing a loop that operates on 4-byte elements,
266 on a target with a 16-byte vector size (VS), the VF is set to 4, since 4
267 elements can fit in a single vector register.
268
269 We currently support vectorization of loops in which all types operated upon
270 are of the same size. Therefore this function currently sets VF according to
271 the size of the types operated upon, and fails if there are multiple sizes
272 in the loop.
273
274 VF is also the factor by which the loop iterations are strip-mined, e.g.:
275 original loop:
276 for (i=0; i<N; i++){
277 a[i] = b[i] + c[i];
278 }
279
280 vectorized loop:
281 for (i=0; i<N; i+=VF){
282 a[i:VF] = b[i:VF] + c[i:VF];
283 }
284 */
285
286 static opt_result
287 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
288 {
289 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
290 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
291 unsigned nbbs = loop->num_nodes;
292 poly_uint64 vectorization_factor = 1;
293 tree scalar_type = NULL_TREE;
294 gphi *phi;
295 tree vectype;
296 stmt_vec_info stmt_info;
297 unsigned i;
298 auto_vec<stmt_vec_info> mask_producers;
299
300 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
301
302 for (i = 0; i < nbbs; i++)
303 {
304 basic_block bb = bbs[i];
305
306 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
307 gsi_next (&si))
308 {
309 phi = si.phi ();
310 stmt_info = loop_vinfo->lookup_stmt (phi);
311 if (dump_enabled_p ())
312 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
313 phi);
314
315 gcc_assert (stmt_info);
316
317 if (STMT_VINFO_RELEVANT_P (stmt_info)
318 || STMT_VINFO_LIVE_P (stmt_info))
319 {
320 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
321 scalar_type = TREE_TYPE (PHI_RESULT (phi));
322
323 if (dump_enabled_p ())
324 dump_printf_loc (MSG_NOTE, vect_location,
325 "get vectype for scalar type: %T\n",
326 scalar_type);
327
328 vectype = get_vectype_for_scalar_type (scalar_type);
329 if (!vectype)
330 return opt_result::failure_at (phi,
331 "not vectorized: unsupported "
332 "data-type %T\n",
333 scalar_type);
334 STMT_VINFO_VECTYPE (stmt_info) = vectype;
335
336 if (dump_enabled_p ())
337 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
338 vectype);
339
340 if (dump_enabled_p ())
341 {
342 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
343 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
344 dump_printf (MSG_NOTE, "\n");
345 }
346
347 vect_update_max_nunits (&vectorization_factor, vectype);
348 }
349 }
350
351 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
352 gsi_next (&si))
353 {
354 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
355 opt_result res
356 = vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
357 &mask_producers);
358 if (!res)
359 return res;
360 }
361 }
362
363 /* TODO: Analyze cost. Decide if worth while to vectorize. */
364 if (dump_enabled_p ())
365 {
366 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
367 dump_dec (MSG_NOTE, vectorization_factor);
368 dump_printf (MSG_NOTE, "\n");
369 }
370
371 if (known_le (vectorization_factor, 1U))
372 return opt_result::failure_at (vect_location,
373 "not vectorized: unsupported data-type\n");
374 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
375
376 for (i = 0; i < mask_producers.length (); i++)
377 {
378 stmt_info = mask_producers[i];
379 opt_tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
380 if (!mask_type)
381 return opt_result::propagate_failure (mask_type);
382 STMT_VINFO_VECTYPE (stmt_info) = mask_type;
383 }
384
385 return opt_result::success ();
386 }
387
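/* Illustrative arithmetic (not vectorizer code) behind the computation above
   for a fixed vector size: each scalar type contributes
   vector_bytes / sizeof (type) lanes, and the final VF must be a common
   multiple of all of those lane counts.  The example_* helpers are made up
   for the sketch.  */

static unsigned int
example_gcd (unsigned int a, unsigned int b)
{
  while (b != 0)
    {
      unsigned int t = a % b;
      a = b;
      b = t;
    }
  return a;
}

/* Return the VF for a loop whose statements use the NELTS element sizes in
   ELT_SIZE[] (in bytes), on a target with VECTOR_BYTES-byte vectors.  */

static unsigned int
example_vectorization_factor (const unsigned int *elt_size, unsigned int nelts,
                              unsigned int vector_bytes)
{
  unsigned int vf = 1;
  for (unsigned int i = 0; i < nelts; i++)
    {
      unsigned int nunits = vector_bytes / elt_size[i];
      /* Least common multiple: a plain-integer analogue of the
         common-multiple update done by vect_update_max_nunits.  */
      vf = vf / example_gcd (vf, nunits) * nunits;
    }
  /* E.g. 4-byte ints and 2-byte shorts with 16-byte vectors give VF = 8.  */
  return vf;
}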
388
389 /* Function vect_is_simple_iv_evolution.
390
391 FORNOW: A simple evolution of an induction variable in the loop is
392 considered a polynomial evolution. */
393
394 static bool
395 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
396 tree * step)
397 {
398 tree init_expr;
399 tree step_expr;
400 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
401 basic_block bb;
402
403 /* When there is no evolution in this loop, the evolution function
404 is not "simple". */
405 if (evolution_part == NULL_TREE)
406 return false;
407
408 /* When the evolution is a polynomial of degree >= 2
409 the evolution function is not "simple". */
410 if (tree_is_chrec (evolution_part))
411 return false;
412
413 step_expr = evolution_part;
414 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
415
416 if (dump_enabled_p ())
417 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
418 step_expr, init_expr);
419
420 *init = init_expr;
421 *step = step_expr;
422
423 if (TREE_CODE (step_expr) != INTEGER_CST
424 && (TREE_CODE (step_expr) != SSA_NAME
425 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
426 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
427 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
428 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
429 || !flag_associative_math)))
430 && (TREE_CODE (step_expr) != REAL_CST
431 || !flag_associative_math))
432 {
433 if (dump_enabled_p ())
434 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
435 "step unknown.\n");
436 return false;
437 }
438
439 return true;
440 }
441
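/* Source-level illustration (not vectorizer code) of the distinction made by
   the function above: P and I below step by a loop-invariant amount each
   iteration, so their evolutions are simple ({p_0, +, sizeof (int)} and
   {0, +, 1}), whereas Q's step is I itself, giving a degree-2 polynomial
   evolution that is rejected.  */

static int
example_iv_evolutions (const int *p, int n)
{
  int sum = 0;
  int q = 0;
  for (int i = 0; i < n; i++)
    {
      sum += *p;
      p++;      /* constant step -> simple IV  */
      q += i;   /* step varies with I -> not a simple evolution  */
    }
  return sum + q;
}
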
442 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
443 what we are assuming is a double reduction. For example, given
444 a structure like this:
445
446 outer1:
447 x_1 = PHI <x_4(outer2), ...>;
448 ...
449
450 inner:
451 x_2 = PHI <x_1(outer1), ...>;
452 ...
453 x_3 = ...;
454 ...
455
456 outer2:
457 x_4 = PHI <x_3(inner)>;
458 ...
459
460 outer loop analysis would treat x_1 as a double reduction phi and
461 this function would then return true for x_2. */
462
463 static bool
464 vect_inner_phi_in_double_reduction_p (stmt_vec_info stmt_info, gphi *phi)
465 {
466 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
467 use_operand_p use_p;
468 ssa_op_iter op_iter;
469 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
470 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
471 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
472 return true;
473 return false;
474 }
475
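/* Source-level shape (illustrative only) that typically gives rise to the
   double-reduction structure shown above when the outer loop is analyzed:
   the inner-loop PHI for SUM plays the role of x_2 and the outer-loop PHIs
   the roles of x_1 and x_4.  A is assumed to hold N * M elements.  */

static int
example_double_reduction (const int *a, int n, int m)
{
  int sum = 0;                    /* outer1: x_1 = PHI <x_4 (outer2), 0>  */
  for (int i = 0; i < n; i++)
    for (int j = 0; j < m; j++)   /* inner: x_2 = PHI <x_1 (outer1), x_3>  */
      sum += a[i * m + j];        /* x_3 = x_2 + ...  */
  return sum;                     /* outer2: x_4 = PHI <x_3 (inner)>  */
}
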
476 /* Function vect_analyze_scalar_cycles_1.
477
478 Examine the cross iteration def-use cycles of scalar variables
479 in LOOP. LOOP_VINFO represents the loop that is now being
480 considered for vectorization (can be LOOP, or an outer-loop
481 enclosing LOOP). */
482
483 static void
484 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
485 {
486 basic_block bb = loop->header;
487 tree init, step;
488 auto_vec<stmt_vec_info, 64> worklist;
489 gphi_iterator gsi;
490 bool double_reduc;
491
492 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
493
494 /* First - identify all inductions. Reduction detection assumes that all the
495 inductions have been identified; therefore, this order must not be
496 changed. */
497 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
498 {
499 gphi *phi = gsi.phi ();
500 tree access_fn = NULL;
501 tree def = PHI_RESULT (phi);
502 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
503
504 if (dump_enabled_p ())
505 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
506
507 /* Skip virtual phi's. The data dependences that are associated with
508 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
509 if (virtual_operand_p (def))
510 continue;
511
512 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
513
514 /* Analyze the evolution function. */
515 access_fn = analyze_scalar_evolution (loop, def);
516 if (access_fn)
517 {
518 STRIP_NOPS (access_fn);
519 if (dump_enabled_p ())
520 dump_printf_loc (MSG_NOTE, vect_location,
521 "Access function of PHI: %T\n", access_fn);
522 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
523 = initial_condition_in_loop_num (access_fn, loop->num);
524 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
525 = evolution_part_in_loop_num (access_fn, loop->num);
526 }
527
528 if (!access_fn
529 || vect_inner_phi_in_double_reduction_p (stmt_vinfo, phi)
530 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
531 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
532 && TREE_CODE (step) != INTEGER_CST))
533 {
534 worklist.safe_push (stmt_vinfo);
535 continue;
536 }
537
538 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
539 != NULL_TREE);
540 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
541
542 if (dump_enabled_p ())
543 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
544 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
545 }
546
547
548 /* Second - identify all reductions and nested cycles. */
549 while (worklist.length () > 0)
550 {
551 stmt_vec_info stmt_vinfo = worklist.pop ();
552 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
553 tree def = PHI_RESULT (phi);
554
555 if (dump_enabled_p ())
556 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
557
558 gcc_assert (!virtual_operand_p (def)
559 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
560
561 stmt_vec_info reduc_stmt_info
562 = vect_force_simple_reduction (loop_vinfo, stmt_vinfo,
563 &double_reduc, false);
564 if (reduc_stmt_info)
565 {
566 if (double_reduc)
567 {
568 if (dump_enabled_p ())
569 dump_printf_loc (MSG_NOTE, vect_location,
570 "Detected double reduction.\n");
571
572 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
573 STMT_VINFO_DEF_TYPE (reduc_stmt_info)
574 = vect_double_reduction_def;
575 }
576 else
577 {
578 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
579 {
580 if (dump_enabled_p ())
581 dump_printf_loc (MSG_NOTE, vect_location,
582 "Detected vectorizable nested cycle.\n");
583
584 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
585 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_nested_cycle;
586 }
587 else
588 {
589 if (dump_enabled_p ())
590 dump_printf_loc (MSG_NOTE, vect_location,
591 "Detected reduction.\n");
592
593 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
594 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
595 /* Store the reduction cycles for possible vectorization in
596 loop-aware SLP if it was not detected as a reduction
597 chain. */
598 if (! REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
599 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
600 (reduc_stmt_info);
601 }
602 }
603 }
604 else
605 if (dump_enabled_p ())
606 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
607 "Unknown def-use cycle pattern.\n");
608 }
609 }
610
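/* Source-level shapes (illustrative only) behind the classification above,
   assuming B holds 2 * N elements: SUM1 forms a plain reduction cycle, while
   SUM2 is updated by two statements in a row and is typically detected as a
   reduction chain (REDUC_GROUP_*), which is recorded separately from
   LOOP_VINFO_REDUCTIONS.  */

static int
example_reduction_forms (const int *a, const int *b, int n)
{
  int sum1 = 0, sum2 = 0;
  for (int i = 0; i < n; i++)
    {
      sum1 += a[i];             /* single reduction statement  */
      sum2 += b[2 * i];         /* two chained statements feeding the  */
      sum2 += b[2 * i + 1];     /* same accumulator -> reduction chain  */
    }
  return sum1 + sum2;
}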
611
612 /* Function vect_analyze_scalar_cycles.
613
614 Examine the cross iteration def-use cycles of scalar variables, by
615 analyzing the loop-header PHIs of scalar variables. Classify each
616 cycle as one of the following: invariant, induction, reduction, unknown.
617 We do that for the loop represented by LOOP_VINFO, and also for its
618 inner-loop, if one exists.
619 Examples for scalar cycles:
620
621 Example1: reduction:
622
623 loop1:
624 for (i=0; i<N; i++)
625 sum += a[i];
626
627 Example2: induction:
628
629 loop2:
630 for (i=0; i<N; i++)
631 a[i] = i; */
632
633 static void
634 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
635 {
636 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
637
638 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
639
640 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
641 Reductions in such an inner-loop therefore have different properties than
642 the reductions in the nest that gets vectorized:
643 1. When vectorized, they are executed in the same order as in the original
644 scalar loop, so we can't change the order of computation when
645 vectorizing them.
646 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
647 current checks are too strict. */
648
649 if (loop->inner)
650 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
651 }
652
653 /* Transfer group and reduction information from STMT_INFO to its
654 pattern stmt. */
655
656 static void
657 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
658 {
659 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
660 stmt_vec_info stmtp;
661 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
662 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
663 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
664 do
665 {
666 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
667 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
668 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
669 if (stmt_info)
670 REDUC_GROUP_NEXT_ELEMENT (stmtp)
671 = STMT_VINFO_RELATED_STMT (stmt_info);
672 }
673 while (stmt_info);
674 STMT_VINFO_DEF_TYPE (stmtp) = vect_reduction_def;
675 }
676
677 /* Fixup scalar cycles that now have their stmts detected as patterns. */
678
679 static void
680 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
681 {
682 stmt_vec_info first;
683 unsigned i;
684
685 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
686 if (STMT_VINFO_IN_PATTERN_P (first))
687 {
688 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
689 while (next)
690 {
691 if (! STMT_VINFO_IN_PATTERN_P (next))
692 break;
693 next = REDUC_GROUP_NEXT_ELEMENT (next);
694 }
695 /* If not all stmts in the chain are patterns, try to handle
696 the chain without patterns. */
697 if (! next)
698 {
699 vect_fixup_reduc_chain (first);
700 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
701 = STMT_VINFO_RELATED_STMT (first);
702 }
703 }
704 }
705
706 /* Function vect_get_loop_niters.
707
708 Determine how many iterations the loop executes and place the count
709 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
710 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
711 niter information holds in ASSUMPTIONS.
712
713 Return the loop exit condition. */
714
715
716 static gcond *
717 vect_get_loop_niters (struct loop *loop, tree *assumptions,
718 tree *number_of_iterations, tree *number_of_iterationsm1)
719 {
720 edge exit = single_exit (loop);
721 struct tree_niter_desc niter_desc;
722 tree niter_assumptions, niter, may_be_zero;
723 gcond *cond = get_loop_exit_condition (loop);
724
725 *assumptions = boolean_true_node;
726 *number_of_iterationsm1 = chrec_dont_know;
727 *number_of_iterations = chrec_dont_know;
728 DUMP_VECT_SCOPE ("get_loop_niters");
729
730 if (!exit)
731 return cond;
732
733 niter = chrec_dont_know;
734 may_be_zero = NULL_TREE;
735 niter_assumptions = boolean_true_node;
736 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
737 || chrec_contains_undetermined (niter_desc.niter))
738 return cond;
739
740 niter_assumptions = niter_desc.assumptions;
741 may_be_zero = niter_desc.may_be_zero;
742 niter = niter_desc.niter;
743
744 if (may_be_zero && integer_zerop (may_be_zero))
745 may_be_zero = NULL_TREE;
746
747 if (may_be_zero)
748 {
749 if (COMPARISON_CLASS_P (may_be_zero))
750 {
751 /* Try to combine may_be_zero with assumptions, this can simplify
752 computation of niter expression. */
753 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
754 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
755 niter_assumptions,
756 fold_build1 (TRUTH_NOT_EXPR,
757 boolean_type_node,
758 may_be_zero));
759 else
760 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
761 build_int_cst (TREE_TYPE (niter), 0),
762 rewrite_to_non_trapping_overflow (niter));
763
764 may_be_zero = NULL_TREE;
765 }
766 else if (integer_nonzerop (may_be_zero))
767 {
768 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
769 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
770 return cond;
771 }
772 else
773 return cond;
774 }
775
776 *assumptions = niter_assumptions;
777 *number_of_iterationsm1 = niter;
778
779 /* We want the number of loop header executions which is the number
780 of latch executions plus one.
781 ??? For UINT_MAX latch executions this number overflows to zero
782 for loops like do { n++; } while (n != 0); */
783 if (niter && !chrec_contains_undetermined (niter))
784 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
785 build_int_cst (TREE_TYPE (niter), 1));
786 *number_of_iterations = niter;
787
788 return cond;
789 }
790
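/* Illustrative arithmetic (not vectorizer code) for the calculation above:
   the number of iterations wanted here is the number of header executions,
   i.e. the latch execution count plus one, and MAY_BE_ZERO means the exit
   can be taken before the latch ever runs.  Doing the addition in a wider
   type sidesteps the UINT_MAX wrap-around noted in the ??? comment.  */

static unsigned long long
example_header_executions (unsigned int latch_executions, int may_be_zero)
{
  if (may_be_zero)
    return 1;   /* the header runs exactly once  */
  return (unsigned long long) latch_executions + 1;
}
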
791 /* Function bb_in_loop_p
792
793 Used as predicate for dfs order traversal of the loop bbs. */
794
795 static bool
796 bb_in_loop_p (const_basic_block bb, const void *data)
797 {
798 const struct loop *const loop = (const struct loop *)data;
799 return flow_bb_inside_loop_p (loop, bb);
802 }
803
804
805 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
806 stmt_vec_info structs for all the stmts in LOOP_IN. */
807
808 _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared)
809 : vec_info (vec_info::loop, init_cost (loop_in), shared),
810 loop (loop_in),
811 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
812 num_itersm1 (NULL_TREE),
813 num_iters (NULL_TREE),
814 num_iters_unchanged (NULL_TREE),
815 num_iters_assumptions (NULL_TREE),
816 th (0),
817 versioning_threshold (0),
818 vectorization_factor (0),
819 max_vectorization_factor (0),
820 mask_skip_niters (NULL_TREE),
821 mask_compare_type (NULL_TREE),
822 unaligned_dr (NULL),
823 peeling_for_alignment (0),
824 ptr_mask (0),
825 ivexpr_map (NULL),
826 slp_unrolling_factor (1),
827 single_scalar_iteration_cost (0),
828 vectorizable (false),
829 can_fully_mask_p (true),
830 fully_masked_p (false),
831 peeling_for_gaps (false),
832 peeling_for_niter (false),
833 operands_swapped (false),
834 no_data_dependencies (false),
835 has_mask_store (false),
836 scalar_loop (NULL),
837 orig_loop_info (NULL)
838 {
839 /* CHECKME: We want to visit all BBs before their successors (except for
840 latch blocks, for which this assertion wouldn't hold). In the simple
841 case of the loop forms we allow, a dfs order of the BBs would be the same
842 as reversed postorder traversal, so we are safe. */
843
844 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
845 bbs, loop->num_nodes, loop);
846 gcc_assert (nbbs == loop->num_nodes);
847
848 for (unsigned int i = 0; i < nbbs; i++)
849 {
850 basic_block bb = bbs[i];
851 gimple_stmt_iterator si;
852
853 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
854 {
855 gimple *phi = gsi_stmt (si);
856 gimple_set_uid (phi, 0);
857 add_stmt (phi);
858 }
859
860 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
861 {
862 gimple *stmt = gsi_stmt (si);
863 gimple_set_uid (stmt, 0);
864 add_stmt (stmt);
865 }
866 }
867 }
868
869 /* Free all levels of MASKS. */
870
871 void
872 release_vec_loop_masks (vec_loop_masks *masks)
873 {
874 rgroup_masks *rgm;
875 unsigned int i;
876 FOR_EACH_VEC_ELT (*masks, i, rgm)
877 rgm->masks.release ();
878 masks->release ();
879 }
880
881 /* Free all memory used by the _loop_vec_info, as well as all the
882 stmt_vec_info structs of all the stmts in the loop. */
883
884 _loop_vec_info::~_loop_vec_info ()
885 {
886 int nbbs;
887 gimple_stmt_iterator si;
888 int j;
889
890 nbbs = loop->num_nodes;
891 for (j = 0; j < nbbs; j++)
892 {
893 basic_block bb = bbs[j];
894 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
895 {
896 gimple *stmt = gsi_stmt (si);
897
898 /* We may have broken canonical form by moving a constant
899 into RHS1 of a commutative op. Fix such occurrences. */
900 if (operands_swapped && is_gimple_assign (stmt))
901 {
902 enum tree_code code = gimple_assign_rhs_code (stmt);
903
904 if ((code == PLUS_EXPR
905 || code == POINTER_PLUS_EXPR
906 || code == MULT_EXPR)
907 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
908 swap_ssa_operands (stmt,
909 gimple_assign_rhs1_ptr (stmt),
910 gimple_assign_rhs2_ptr (stmt));
911 else if (code == COND_EXPR
912 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
913 {
914 tree cond_expr = gimple_assign_rhs1 (stmt);
915 enum tree_code cond_code = TREE_CODE (cond_expr);
916
917 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
918 {
919 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
920 0));
921 cond_code = invert_tree_comparison (cond_code,
922 honor_nans);
923 if (cond_code != ERROR_MARK)
924 {
925 TREE_SET_CODE (cond_expr, cond_code);
926 swap_ssa_operands (stmt,
927 gimple_assign_rhs2_ptr (stmt),
928 gimple_assign_rhs3_ptr (stmt));
929 }
930 }
931 }
932 }
933 gsi_next (&si);
934 }
935 }
936
937 free (bbs);
938
939 release_vec_loop_masks (&masks);
940 delete ivexpr_map;
941
942 loop->aux = NULL;
943 }
944
945 /* Return an invariant or register for EXPR and emit necessary
946 computations in the LOOP_VINFO loop preheader. */
947
948 tree
949 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
950 {
951 if (is_gimple_reg (expr)
952 || is_gimple_min_invariant (expr))
953 return expr;
954
955 if (! loop_vinfo->ivexpr_map)
956 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
957 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
958 if (! cached)
959 {
960 gimple_seq stmts = NULL;
961 cached = force_gimple_operand (unshare_expr (expr),
962 &stmts, true, NULL_TREE);
963 if (stmts)
964 {
965 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
966 gsi_insert_seq_on_edge_immediate (e, stmts);
967 }
968 }
969 return cached;
970 }
971
972 /* Return true if we can use CMP_TYPE as the comparison type to produce
973 all masks required to mask LOOP_VINFO. */
974
975 static bool
976 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
977 {
978 rgroup_masks *rgm;
979 unsigned int i;
980 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
981 if (rgm->mask_type != NULL_TREE
982 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
983 cmp_type, rgm->mask_type,
984 OPTIMIZE_FOR_SPEED))
985 return false;
986 return true;
987 }
988
989 /* Calculate the maximum number of scalars per iteration for every
990 rgroup in LOOP_VINFO. */
991
992 static unsigned int
993 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
994 {
995 unsigned int res = 1;
996 unsigned int i;
997 rgroup_masks *rgm;
998 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
999 res = MAX (res, rgm->max_nscalars_per_iter);
1000 return res;
1001 }
1002
1003 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1004 whether we can actually generate the masks required. Return true if so,
1005 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
1006
1007 static bool
1008 vect_verify_full_masking (loop_vec_info loop_vinfo)
1009 {
1010 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1011 unsigned int min_ni_width;
1012
1013 /* Use a normal loop if there are no statements that need masking.
1014 This only happens in rare degenerate cases: it means that the loop
1015 has no loads, no stores, and no live-out values. */
1016 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1017 return false;
1018
1019 /* Get the maximum number of iterations that is representable
1020 in the counter type. */
1021 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1022 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1023
1024 /* Get a more refined estimate for the number of iterations. */
1025 widest_int max_back_edges;
1026 if (max_loop_iterations (loop, &max_back_edges))
1027 max_ni = wi::smin (max_ni, max_back_edges + 1);
1028
1029 /* Account for rgroup masks, in which each bit is replicated N times. */
1030 max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1031
1032 /* Work out how many bits we need to represent the limit. */
1033 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1034
1035 /* Find a scalar mode for which WHILE_ULT is supported. */
1036 opt_scalar_int_mode cmp_mode_iter;
1037 tree cmp_type = NULL_TREE;
1038 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1039 {
1040 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1041 if (cmp_bits >= min_ni_width
1042 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1043 {
1044 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1045 if (this_type
1046 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1047 {
1048 /* Although we could stop as soon as we find a valid mode,
1049 it's often better to continue until we hit Pmode, since the
1050 operands to the WHILE are more likely to be reusable in
1051 address calculations. */
1052 cmp_type = this_type;
1053 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1054 break;
1055 }
1056 }
1057 }
1058
1059 if (!cmp_type)
1060 return false;
1061
1062 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1063 return true;
1064 }
1065
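/* Plain-C illustration (not vectorizer or target code) of the two ingredients
   checked above: a WHILE_ULT-style comparison keeps lane L of an rgroup mask
   active while START + L is still below the (replicated) bound, and the
   scalar comparison type only needs enough bits to hold
   max_ni * max_nscalars_per_iter.  The example_* helpers are stand-ins.  */

static void
example_while_ult (unsigned long long start, unsigned long long limit,
                   unsigned int nlanes, unsigned char *mask)
{
  for (unsigned int l = 0; l < nlanes; l++)
    mask[l] = (start + l < limit);      /* lane active while in bounds  */
}

/* Bits needed to represent a nonzero VALUE as an unsigned number; a rough
   stand-in for wi::min_precision (..., UNSIGNED).  */

static unsigned int
example_min_precision (unsigned long long value)
{
  unsigned int bits = 1;
  while (value >>= 1)
    bits++;
  return bits;
}
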
1066 /* Calculate the cost of one scalar iteration of the loop. */
1067 static void
1068 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1069 {
1070 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1071 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1072 int nbbs = loop->num_nodes, factor;
1073 int innerloop_iters, i;
1074
1075 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1076
1077 /* Gather costs for statements in the scalar loop. */
1078
1079 /* FORNOW. */
1080 innerloop_iters = 1;
1081 if (loop->inner)
1082 innerloop_iters = 50; /* FIXME */
1083
1084 for (i = 0; i < nbbs; i++)
1085 {
1086 gimple_stmt_iterator si;
1087 basic_block bb = bbs[i];
1088
1089 if (bb->loop_father == loop->inner)
1090 factor = innerloop_iters;
1091 else
1092 factor = 1;
1093
1094 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1095 {
1096 gimple *stmt = gsi_stmt (si);
1097 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1098
1099 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1100 continue;
1101
1102 /* Skip stmts that are not vectorized inside the loop. */
1103 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1104 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1105 && (!STMT_VINFO_LIVE_P (vstmt_info)
1106 || !VECTORIZABLE_CYCLE_DEF
1107 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1108 continue;
1109
1110 vect_cost_for_stmt kind;
1111 if (STMT_VINFO_DATA_REF (stmt_info))
1112 {
1113 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1114 kind = scalar_load;
1115 else
1116 kind = scalar_store;
1117 }
1118 else
1119 kind = scalar_stmt;
1120
1121 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1122 factor, kind, stmt_info, 0, vect_prologue);
1123 }
1124 }
1125
1126 /* Now accumulate cost. */
1127 void *target_cost_data = init_cost (loop);
1128 stmt_info_for_cost *si;
1129 int j;
1130 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1131 j, si)
1132 (void) add_stmt_cost (target_cost_data, si->count,
1133 si->kind, si->stmt_info, si->misalign,
1134 vect_body);
1135 unsigned dummy, body_cost = 0;
1136 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1137 destroy_cost_data (target_cost_data);
1138 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1139 }
1140
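/* Illustrative cost arithmetic (not the target cost hooks): the scalar
   iteration cost computed above is a weighted sum over the loop body, where
   a statement inside the inner loop is counted FACTOR times (the FIXME
   value 50 above) and contributes a load, store or plain-statement cost.
   The example_* names are made up.  */

struct example_stmt_cost
{
  unsigned int count;   /* how many times the statement is counted  */
  unsigned int cost;    /* target-supplied cost of one instance  */
};

static unsigned int
example_scalar_iteration_cost (const struct example_stmt_cost *stmts,
                               unsigned int nstmts)
{
  unsigned int total = 0;
  for (unsigned int i = 0; i < nstmts; i++)
    total += stmts[i].count * stmts[i].cost;
  return total;
}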
1141
1142 /* Function vect_analyze_loop_form_1.
1143
1144 Verify that certain CFG restrictions hold, including:
1145 - the loop has a pre-header
1146 - the loop has a single entry and exit
1147 - the loop exit condition is simple enough
1148 - the number of iterations can be analyzed, i.e., a countable loop. The
1149 niter could be analyzed under some assumptions. */
1150
1151 opt_result
1152 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1153 tree *assumptions, tree *number_of_iterationsm1,
1154 tree *number_of_iterations, gcond **inner_loop_cond)
1155 {
1156 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1157
1158 /* Different restrictions apply when we are considering an inner-most loop,
1159 vs. an outer (nested) loop.
1160 (FORNOW. May want to relax some of these restrictions in the future). */
1161
1162 if (!loop->inner)
1163 {
1164 /* Inner-most loop. We currently require that the number of BBs is
1165 exactly 2 (the header and latch). Vectorizable inner-most loops
1166 look like this:
1167
1168 (pre-header)
1169 |
1170 header <--------+
1171 | | |
1172 | +--> latch --+
1173 |
1174 (exit-bb) */
1175
1176 if (loop->num_nodes != 2)
1177 return opt_result::failure_at (vect_location,
1178 "not vectorized:"
1179 " control flow in loop.\n");
1180
1181 if (empty_block_p (loop->header))
1182 return opt_result::failure_at (vect_location,
1183 "not vectorized: empty loop.\n");
1184 }
1185 else
1186 {
1187 struct loop *innerloop = loop->inner;
1188 edge entryedge;
1189
1190 /* Nested loop. We currently require that the loop is doubly-nested,
1191 contains a single inner loop, and the number of BBs is exactly 5.
1192 Vectorizable outer-loops look like this:
1193
1194 (pre-header)
1195 |
1196 header <---+
1197 | |
1198 inner-loop |
1199 | |
1200 tail ------+
1201 |
1202 (exit-bb)
1203
1204 The inner-loop has the properties expected of inner-most loops
1205 as described above. */
1206
1207 if ((loop->inner)->inner || (loop->inner)->next)
1208 return opt_result::failure_at (vect_location,
1209 "not vectorized:"
1210 " multiple nested loops.\n");
1211
1212 if (loop->num_nodes != 5)
1213 return opt_result::failure_at (vect_location,
1214 "not vectorized:"
1215 " control flow in loop.\n");
1216
1217 entryedge = loop_preheader_edge (innerloop);
1218 if (entryedge->src != loop->header
1219 || !single_exit (innerloop)
1220 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1221 return opt_result::failure_at (vect_location,
1222 "not vectorized:"
1223 " unsupported outerloop form.\n");
1224
1225 /* Analyze the inner-loop. */
1226 tree inner_niterm1, inner_niter, inner_assumptions;
1227 opt_result res
1228 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1229 &inner_assumptions, &inner_niterm1,
1230 &inner_niter, NULL);
1231 if (!res)
1232 {
1233 if (dump_enabled_p ())
1234 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1235 "not vectorized: Bad inner loop.\n");
1236 return res;
1237 }
1238
1239 /* Don't support analyzing niter under assumptions for inner
1240 loop. */
1241 if (!integer_onep (inner_assumptions))
1242 return opt_result::failure_at (vect_location,
1243 "not vectorized: Bad inner loop.\n");
1244
1245 if (!expr_invariant_in_loop_p (loop, inner_niter))
1246 return opt_result::failure_at (vect_location,
1247 "not vectorized: inner-loop count not"
1248 " invariant.\n");
1249
1250 if (dump_enabled_p ())
1251 dump_printf_loc (MSG_NOTE, vect_location,
1252 "Considering outer-loop vectorization.\n");
1253 }
1254
1255 if (!single_exit (loop))
1256 return opt_result::failure_at (vect_location,
1257 "not vectorized: multiple exits.\n");
1258 if (EDGE_COUNT (loop->header->preds) != 2)
1259 return opt_result::failure_at (vect_location,
1260 "not vectorized:"
1261 " too many incoming edges.\n");
1262
1263 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1264 that the loop is represented as a do-while (with a proper if-guard
1265 before the loop if needed), where the loop header contains all the
1266 executable statements, and the latch is empty. */
1267 if (!empty_block_p (loop->latch)
1268 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1269 return opt_result::failure_at (vect_location,
1270 "not vectorized: latch block not empty.\n");
1271
1272 /* Make sure the exit is not abnormal. */
1273 edge e = single_exit (loop);
1274 if (e->flags & EDGE_ABNORMAL)
1275 return opt_result::failure_at (vect_location,
1276 "not vectorized:"
1277 " abnormal loop exit edge.\n");
1278
1279 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1280 number_of_iterationsm1);
1281 if (!*loop_cond)
1282 return opt_result::failure_at
1283 (vect_location,
1284 "not vectorized: complicated exit condition.\n");
1285
1286 if (integer_zerop (*assumptions)
1287 || !*number_of_iterations
1288 || chrec_contains_undetermined (*number_of_iterations))
1289 return opt_result::failure_at
1290 (*loop_cond,
1291 "not vectorized: number of iterations cannot be computed.\n");
1292
1293 if (integer_zerop (*number_of_iterations))
1294 return opt_result::failure_at
1295 (*loop_cond,
1296 "not vectorized: number of iterations = 0.\n");
1297
1298 return opt_result::success ();
1299 }
1300
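/* Source-level intuition (illustrative only) for the checks above: a plain
   counted loop like the first function normally satisfies them (single exit,
   no control flow in the body, empty latch, countable niter), while the
   second has an early return and therefore multiple exits and extra control
   flow, so it is rejected.  */

static int
example_acceptable_form (const int *a, int n)
{
  int s = 0;
  for (int i = 0; i < n; i++)
    s += a[i];
  return s;
}

static int
example_rejected_form (const int *a, int n)
{
  int s = 0;
  for (int i = 0; i < n; i++)
    {
      if (a[i] < 0)     /* second way out of the loop  */
        return -1;
      s += a[i];
    }
  return s;
}
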
1301 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1302
1303 opt_loop_vec_info
1304 vect_analyze_loop_form (struct loop *loop, vec_info_shared *shared)
1305 {
1306 tree assumptions, number_of_iterations, number_of_iterationsm1;
1307 gcond *loop_cond, *inner_loop_cond = NULL;
1308
1309 opt_result res
1310 = vect_analyze_loop_form_1 (loop, &loop_cond,
1311 &assumptions, &number_of_iterationsm1,
1312 &number_of_iterations, &inner_loop_cond);
1313 if (!res)
1314 return opt_loop_vec_info::propagate_failure (res);
1315
1316 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1317 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1318 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1319 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1320 if (!integer_onep (assumptions))
1321 {
1322 /* We consider vectorizing this loop by versioning it under
1323 some assumptions. In order to do this, we need to clear
1324 existing information computed by scev and niter analyzer. */
1325 scev_reset_htab ();
1326 free_numbers_of_iterations_estimates (loop);
1327 /* Also set flag for this loop so that following scev and niter
1328 analysis are done under the assumptions. */
1329 loop_constraint_set (loop, LOOP_C_FINITE);
1330 /* Also record the assumptions for versioning. */
1331 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1332 }
1333
1334 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1335 {
1336 if (dump_enabled_p ())
1337 {
1338 dump_printf_loc (MSG_NOTE, vect_location,
1339 "Symbolic number of iterations is ");
1340 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1341 dump_printf (MSG_NOTE, "\n");
1342 }
1343 }
1344
1345 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1346 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1347 if (inner_loop_cond)
1348 {
1349 stmt_vec_info inner_loop_cond_info
1350 = loop_vinfo->lookup_stmt (inner_loop_cond);
1351 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1352 }
1353
1354 gcc_assert (!loop->aux);
1355 loop->aux = loop_vinfo;
1356 return opt_loop_vec_info::success (loop_vinfo);
1357 }
1358
1359
1360
1361 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1362 statements, update the vectorization factor. */
1363
1364 static void
1365 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1366 {
1367 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1368 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1369 int nbbs = loop->num_nodes;
1370 poly_uint64 vectorization_factor;
1371 int i;
1372
1373 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1374
1375 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1376 gcc_assert (known_ne (vectorization_factor, 0U));
1377
1378 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1379 the vectorization factor of the loop is the unrolling factor required by
1380 the SLP instances. If that unrolling factor is 1, we say that we
1381 perform pure SLP on the loop - cross-iteration parallelism is not
1382 exploited. */
1383 bool only_slp_in_loop = true;
1384 for (i = 0; i < nbbs; i++)
1385 {
1386 basic_block bb = bbs[i];
1387 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1388 gsi_next (&si))
1389 {
1390 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1391 stmt_info = vect_stmt_to_vectorize (stmt_info);
1392 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1393 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1394 && !PURE_SLP_STMT (stmt_info))
1395 /* STMT needs both SLP and loop-based vectorization. */
1396 only_slp_in_loop = false;
1397 }
1398 }
1399
1400 if (only_slp_in_loop)
1401 {
1402 if (dump_enabled_p ())
1403 dump_printf_loc (MSG_NOTE, vect_location,
1404 "Loop contains only SLP stmts\n");
1405 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1406 }
1407 else
1408 {
1409 if (dump_enabled_p ())
1410 dump_printf_loc (MSG_NOTE, vect_location,
1411 "Loop contains SLP and non-SLP stmts\n");
1412 /* Both the vectorization factor and unroll factor have the form
1413 current_vector_size * X for some rational X, so they must have
1414 a common multiple. */
1415 vectorization_factor
1416 = force_common_multiple (vectorization_factor,
1417 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1418 }
1419
1420 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1421 if (dump_enabled_p ())
1422 {
1423 dump_printf_loc (MSG_NOTE, vect_location,
1424 "Updating vectorization factor to ");
1425 dump_dec (MSG_NOTE, vectorization_factor);
1426 dump_printf (MSG_NOTE, ".\n");
1427 }
1428 }
1429
1430 /* Return true if STMT_INFO describes a double reduction phi and if
1431 the other phi in the reduction is also relevant for vectorization.
1432 This rejects cases such as:
1433
1434 outer1:
1435 x_1 = PHI <x_3(outer2), ...>;
1436 ...
1437
1438 inner:
1439 x_2 = ...;
1440 ...
1441
1442 outer2:
1443 x_3 = PHI <x_2(inner)>;
1444
1445 if nothing in x_2 or elsewhere makes x_1 relevant. */
1446
1447 static bool
1448 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1449 {
1450 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1451 return false;
1452
1453 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1454 }
1455
1456 /* Function vect_analyze_loop_operations.
1457
1458 Scan the loop stmts and make sure they are all vectorizable. */
1459
1460 static opt_result
1461 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1462 {
1463 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1464 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1465 int nbbs = loop->num_nodes;
1466 int i;
1467 stmt_vec_info stmt_info;
1468 bool need_to_vectorize = false;
1469 bool ok;
1470
1471 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1472
1473 auto_vec<stmt_info_for_cost> cost_vec;
1474
1475 for (i = 0; i < nbbs; i++)
1476 {
1477 basic_block bb = bbs[i];
1478
1479 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1480 gsi_next (&si))
1481 {
1482 gphi *phi = si.phi ();
1483 ok = true;
1484
1485 stmt_info = loop_vinfo->lookup_stmt (phi);
1486 if (dump_enabled_p ())
1487 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1488 if (virtual_operand_p (gimple_phi_result (phi)))
1489 continue;
1490
1491 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1492 (i.e., a phi in the tail of the outer-loop). */
1493 if (! is_loop_header_bb_p (bb))
1494 {
1495 /* FORNOW: we currently don't support the case that these phis
1496 are not used in the outerloop (unless it is double reduction,
1497 i.e., this phi is vect_reduction_def), because this case
1498 would require actually doing something here. */
1499 if (STMT_VINFO_LIVE_P (stmt_info)
1500 && !vect_active_double_reduction_p (stmt_info))
1501 return opt_result::failure_at (phi,
1502 "Unsupported loop-closed phi"
1503 " in outer-loop.\n");
1504
1505 /* If PHI is used in the outer loop, we check that its operand
1506 is defined in the inner loop. */
1507 if (STMT_VINFO_RELEVANT_P (stmt_info))
1508 {
1509 tree phi_op;
1510
1511 if (gimple_phi_num_args (phi) != 1)
1512 return opt_result::failure_at (phi, "unsupported phi");
1513
1514 phi_op = PHI_ARG_DEF (phi, 0);
1515 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1516 if (!op_def_info)
1517 return opt_result::failure_at (phi, "unsupported phi");
1518
1519 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1520 && (STMT_VINFO_RELEVANT (op_def_info)
1521 != vect_used_in_outer_by_reduction))
1522 return opt_result::failure_at (phi, "unsupported phi");
1523 }
1524
1525 continue;
1526 }
1527
1528 gcc_assert (stmt_info);
1529
1530 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1531 || STMT_VINFO_LIVE_P (stmt_info))
1532 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1533 /* A scalar-dependence cycle that we don't support. */
1534 return opt_result::failure_at (phi,
1535 "not vectorized:"
1536 " scalar dependence cycle.\n");
1537
1538 if (STMT_VINFO_RELEVANT_P (stmt_info))
1539 {
1540 need_to_vectorize = true;
1541 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1542 && ! PURE_SLP_STMT (stmt_info))
1543 ok = vectorizable_induction (stmt_info, NULL, NULL, NULL,
1544 &cost_vec);
1545 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1546 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1547 && ! PURE_SLP_STMT (stmt_info))
1548 ok = vectorizable_reduction (stmt_info, NULL, NULL, NULL, NULL,
1549 &cost_vec);
1550 }
1551
1552 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1553 if (ok
1554 && STMT_VINFO_LIVE_P (stmt_info)
1555 && !PURE_SLP_STMT (stmt_info))
1556 ok = vectorizable_live_operation (stmt_info, NULL, NULL, -1, NULL,
1557 &cost_vec);
1558
1559 if (!ok)
1560 return opt_result::failure_at (phi,
1561 "not vectorized: relevant phi not "
1562 "supported: %G",
1563 static_cast <gimple *> (phi));
1564 }
1565
1566 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1567 gsi_next (&si))
1568 {
1569 gimple *stmt = gsi_stmt (si);
1570 if (!gimple_clobber_p (stmt))
1571 {
1572 opt_result res
1573 = vect_analyze_stmt (loop_vinfo->lookup_stmt (stmt),
1574 &need_to_vectorize,
1575 NULL, NULL, &cost_vec);
1576 if (!res)
1577 return res;
1578 }
1579 }
1580 } /* bbs */
1581
1582 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1583
1584 /* All operations in the loop are either irrelevant (deal with loop
1585 control, or dead), or only used outside the loop and can be moved
1586 out of the loop (e.g. invariants, inductions). The loop can be
1587 optimized away by scalar optimizations. We're better off not
1588 touching this loop. */
1589 if (!need_to_vectorize)
1590 {
1591 if (dump_enabled_p ())
1592 dump_printf_loc (MSG_NOTE, vect_location,
1593 "All the computation can be taken out of the loop.\n");
1594 return opt_result::failure_at
1595 (vect_location,
1596 "not vectorized: redundant loop. no profit to vectorize.\n");
1597 }
1598
1599 return opt_result::success ();
1600 }
1601
1602 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1603 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1604 definitely no, or -1 if it's worth retrying. */
1605
1606 static int
1607 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1608 {
1609 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1610 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1611
1612 /* Only fully-masked loops can have iteration counts less than the
1613 vectorization factor. */
1614 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1615 {
1616 HOST_WIDE_INT max_niter;
1617
1618 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1619 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1620 else
1621 max_niter = max_stmt_executions_int (loop);
1622
1623 if (max_niter != -1
1624 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1625 {
1626 if (dump_enabled_p ())
1627 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1628 "not vectorized: iteration count smaller than "
1629 "vectorization factor.\n");
1630 return 0;
1631 }
1632 }
1633
1634 int min_profitable_iters, min_profitable_estimate;
1635 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1636 &min_profitable_estimate);
1637
1638 if (min_profitable_iters < 0)
1639 {
1640 if (dump_enabled_p ())
1641 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1642 "not vectorized: vectorization not profitable.\n");
1643 if (dump_enabled_p ())
1644 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1645 "not vectorized: vector version will never be "
1646 "profitable.\n");
1647 return -1;
1648 }
1649
1650 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1651 * assumed_vf);
1652
1653 /* Use the cost model only if it is more conservative than the user-specified
1654 threshold. */
1655 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1656 min_profitable_iters);
1657
1658 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1659
1660 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1661 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1662 {
1663 if (dump_enabled_p ())
1664 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1665 "not vectorized: vectorization not profitable.\n");
1666 if (dump_enabled_p ())
1667 dump_printf_loc (MSG_NOTE, vect_location,
1668 "not vectorized: iteration count smaller than user "
1669 "specified loop bound parameter or minimum profitable "
1670 "iterations (whichever is more conservative).\n");
1671 return 0;
1672 }
1673
1674 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1675 if (estimated_niter == -1)
1676 estimated_niter = likely_max_stmt_executions_int (loop);
1677 if (estimated_niter != -1
1678 && ((unsigned HOST_WIDE_INT) estimated_niter
1679 < MAX (th, (unsigned) min_profitable_estimate)))
1680 {
1681 if (dump_enabled_p ())
1682 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1683 "not vectorized: estimated iteration count too "
1684 "small.\n");
1685 if (dump_enabled_p ())
1686 dump_printf_loc (MSG_NOTE, vect_location,
1687 "not vectorized: estimated iteration count smaller "
1688 "than specified loop bound parameter or minimum "
1689 "profitable iterations (whichever is more "
1690 "conservative).\n");
1691 return -1;
1692 }
1693
1694 return 1;
1695 }
1696
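/* Illustrative decision logic (not the real cost model): the checks above
   compare the known or estimated iteration count against the vectorization
   factor and a profitability threshold.  All inputs are stand-ins supplied
   by the caller; -1 means "unknown".  */

static int
example_costing_decision (long long known_niter, long long estimated_niter,
                          unsigned int vf, unsigned int min_profitable_iters,
                          unsigned int user_min_loop_bound)
{
  unsigned int th = (min_profitable_iters > user_min_loop_bound
                     ? min_profitable_iters : user_min_loop_bound);

  if (known_niter >= 0 && known_niter < vf)
    return 0;   /* definitely not worthwhile  */
  if (known_niter >= 0 && (unsigned long long) known_niter < th)
    return 0;
  if (estimated_niter >= 0 && (unsigned long long) estimated_niter < th)
    return -1;  /* might be worth retrying with a different vector size  */
  return 1;     /* worthwhile  */
}
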
1697 static opt_result
1698 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1699 vec<data_reference_p> *datarefs,
1700 unsigned int *n_stmts)
1701 {
1702 *n_stmts = 0;
1703 for (unsigned i = 0; i < loop->num_nodes; i++)
1704 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1705 !gsi_end_p (gsi); gsi_next (&gsi))
1706 {
1707 gimple *stmt = gsi_stmt (gsi);
1708 if (is_gimple_debug (stmt))
1709 continue;
1710 ++(*n_stmts);
1711 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs);
1712 if (!res)
1713 {
1714 if (is_gimple_call (stmt) && loop->safelen)
1715 {
1716 tree fndecl = gimple_call_fndecl (stmt), op;
1717 if (fndecl != NULL_TREE)
1718 {
1719 cgraph_node *node = cgraph_node::get (fndecl);
1720 if (node != NULL && node->simd_clones != NULL)
1721 {
1722 unsigned int j, n = gimple_call_num_args (stmt);
1723 for (j = 0; j < n; j++)
1724 {
1725 op = gimple_call_arg (stmt, j);
1726 if (DECL_P (op)
1727 || (REFERENCE_CLASS_P (op)
1728 && get_base_address (op)))
1729 break;
1730 }
1731 op = gimple_call_lhs (stmt);
1732 /* Ignore #pragma omp declare simd functions
1733 if they don't have data references in the
1734 call stmt itself. */
1735 if (j == n
1736 && !(op
1737 && (DECL_P (op)
1738 || (REFERENCE_CLASS_P (op)
1739 && get_base_address (op)))))
1740 continue;
1741 }
1742 }
1743 }
1744 return res;
1745 }
1746 /* If dependence analysis will give up due to the limit on the
1747 number of datarefs, stop here and fail fatally. */
1748 if (datarefs->length ()
1749 > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
1750 return opt_result::failure_at (stmt, "exceeded param "
1751 "loop-max-datarefs-for-datadeps\n");
1752 }
1753 return opt_result::success ();
1754 }
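
/* A minimal sketch of the kind of call the simd-clone escape hatch above
   is meant to tolerate (illustrative only, not a specific testcase):

     #pragma omp declare simd
     extern int f (int);

     void
     foo (int *a, int n)
     {
     #pragma omp simd
       for (int i = 0; i < n; i++)
         a[i] = f (a[i]);
     }

   The argument and result of the call are SSA names rather than memory
   references, so even though the call itself cannot be analyzed as a
   data reference, the loop (whose safelen is set by the simd pragma) is
   not rejected at this point.  */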
1755
1756 /* Function vect_analyze_loop_2.
1757
1758 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1759 for it. The different analyses will record information in the
1760 loop_vec_info struct. */
1761 static opt_result
1762 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1763 {
1764 opt_result ok = opt_result::success ();
1765 int res;
1766 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1767 poly_uint64 min_vf = 2;
1768
1769 /* The first group of checks is independent of the vector size. */
1770 fatal = true;
1771
1772 /* Find all data references in the loop (which correspond to vdefs/vuses)
1773 and analyze their evolution in the loop. */
1774
1775 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1776
1777 /* Gather the data references and count stmts in the loop. */
1778 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1779 {
1780 opt_result res
1781 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1782 &LOOP_VINFO_DATAREFS (loop_vinfo),
1783 n_stmts);
1784 if (!res)
1785 {
1786 if (dump_enabled_p ())
1787 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1788 "not vectorized: loop contains function "
1789 "calls or data references that cannot "
1790 "be analyzed\n");
1791 return res;
1792 }
1793 loop_vinfo->shared->save_datarefs ();
1794 }
1795 else
1796 loop_vinfo->shared->check_datarefs ();
1797
1798 /* Analyze the data references and also adjust the minimal
1799 vectorization factor according to the loads and stores. */
1800
1801 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1802 if (!ok)
1803 {
1804 if (dump_enabled_p ())
1805 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1806 "bad data references.\n");
1807 return ok;
1808 }
1809
1810 /* Classify all cross-iteration scalar data-flow cycles.
1811 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1812 vect_analyze_scalar_cycles (loop_vinfo);
1813
1814 vect_pattern_recog (loop_vinfo);
1815
1816 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1817
1818 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1819 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1820
1821 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1822 if (!ok)
1823 {
1824 if (dump_enabled_p ())
1825 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1826 "bad data access.\n");
1827 return ok;
1828 }
1829
1830 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1831
1832 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1833 if (!ok)
1834 {
1835 if (dump_enabled_p ())
1836 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1837 "unexpected pattern.\n");
1838 return ok;
1839 }
1840
1841 /* The rest of the analysis below depends on the vector size in some way. */
1842 fatal = false;
1843
1844 /* Analyze data dependences between the data-refs in the loop
1845 and adjust the maximum vectorization factor according to
1846 the dependences.
1847 FORNOW: fail at the first data dependence that we encounter. */
1848
1849 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1850 if (!ok)
1851 {
1852 if (dump_enabled_p ())
1853 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1854 "bad data dependence.\n");
1855 return ok;
1856 }
1857 if (max_vf != MAX_VECTORIZATION_FACTOR
1858 && maybe_lt (max_vf, min_vf))
1859 return opt_result::failure_at (vect_location, "bad data dependence.\n");
1860 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1861
1862 ok = vect_determine_vectorization_factor (loop_vinfo);
1863 if (!ok)
1864 {
1865 if (dump_enabled_p ())
1866 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1867 "can't determine vectorization factor.\n");
1868 return ok;
1869 }
1870 if (max_vf != MAX_VECTORIZATION_FACTOR
1871 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1872 return opt_result::failure_at (vect_location, "bad data dependence.\n");
1873
1874 /* Compute the scalar iteration cost. */
1875 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1876
1877 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1878 unsigned th;
1879
1880 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1881 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
1882 if (!ok)
1883 return ok;
1884
1885 /* If there are any SLP instances mark them as pure_slp. */
1886 bool slp = vect_make_slp_decision (loop_vinfo);
1887 if (slp)
1888 {
1889 /* Find stmts that need to be both vectorized and SLPed. */
1890 vect_detect_hybrid_slp (loop_vinfo);
1891
1892 /* Update the vectorization factor based on the SLP decision. */
1893 vect_update_vf_for_slp (loop_vinfo);
1894 }
1895
1896 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
1897
1898 /* We don't expect to have to roll back to anything other than an empty
1899 set of rgroups. */
1900 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
1901
1902 /* This is the point where we can re-start analysis with SLP forced off. */
1903 start_over:
1904
1905 /* Now the vectorization factor is final. */
1906 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1907 gcc_assert (known_ne (vectorization_factor, 0U));
1908
1909 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1910 {
1911 dump_printf_loc (MSG_NOTE, vect_location,
1912 "vectorization_factor = ");
1913 dump_dec (MSG_NOTE, vectorization_factor);
1914 dump_printf (MSG_NOTE, ", niters = %wd\n",
1915 LOOP_VINFO_INT_NITERS (loop_vinfo));
1916 }
1917
1918 HOST_WIDE_INT max_niter
1919 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1920
1921 /* Analyze the alignment of the data-refs in the loop.
1922 Fail if a data reference is found that cannot be vectorized. */
1923
1924 ok = vect_analyze_data_refs_alignment (loop_vinfo);
1925 if (!ok)
1926 {
1927 if (dump_enabled_p ())
1928 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1929 "bad data alignment.\n");
1930 return ok;
1931 }
1932
1933 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1934 It is important to call pruning after vect_analyze_data_ref_accesses,
1935 since we use grouping information gathered by interleaving analysis. */
1936 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1937 if (!ok)
1938 return ok;
1939
1940 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
1941 vectorization, since we do not want to add extra peeling or
1942 add versioning for alignment. */
1943 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1944 /* This pass will decide on using loop versioning and/or loop peeling in
1945 order to enhance the alignment of data references in the loop. */
1946 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1947 else
1948 ok = vect_verify_datarefs_alignment (loop_vinfo);
1949 if (!ok)
1950 return ok;
1951
1952 if (slp)
1953 {
1954 /* Analyze operations in the SLP instances. Note this may
1955 remove unsupported SLP instances which makes the above
1956 SLP kind detection invalid. */
1957 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
1958 vect_slp_analyze_operations (loop_vinfo);
1959 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
1960 {
1961 ok = opt_result::failure_at (vect_location,
1962 "unsupported SLP instances\n");
1963 goto again;
1964 }
1965 }
1966
1967 /* Scan all the remaining operations in the loop that are not subject
1968 to SLP and make sure they are vectorizable. */
1969 ok = vect_analyze_loop_operations (loop_vinfo);
1970 if (!ok)
1971 {
1972 if (dump_enabled_p ())
1973 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1974 "bad operation or unsupported loop bound.\n");
1975 return ok;
1976 }
1977
1978 /* Decide whether to use a fully-masked loop for this vectorization
1979 factor. */
1980 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
1981 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
1982 && vect_verify_full_masking (loop_vinfo));
1983 if (dump_enabled_p ())
1984 {
1985 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1986 dump_printf_loc (MSG_NOTE, vect_location,
1987 "using a fully-masked loop.\n");
1988 else
1989 dump_printf_loc (MSG_NOTE, vect_location,
1990 "not using a fully-masked loop.\n");
1991 }
1992
1993 /* If an epilog loop is required because of data accesses with gaps,
1994 one additional iteration needs to be peeled. Check if there are
1995 enough iterations for vectorization. */
1996 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1997 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1998 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1999 {
2000 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2001 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2002
2003 if (known_lt (wi::to_widest (scalar_niters), vf))
2004 return opt_result::failure_at (vect_location,
2005 "loop does not have enough iterations to"
2006 " support peeling for gaps.\n");
2007 }
2008
2009 /* Check that the costings of the loop make vectorizing worthwhile. */
2010 res = vect_analyze_loop_costing (loop_vinfo);
2011 if (res < 0)
2012 {
2013 ok = opt_result::failure_at (vect_location,
2014 "Loop costings may not be worthwhile.\n");
2015 goto again;
2016 }
2017 if (!res)
2018 return opt_result::failure_at (vect_location,
2019 "Loop costings not worthwhile.\n");
2020
2021 /* Decide whether we need to create an epilogue loop to handle
2022 remaining scalar iterations. */
2023 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2024
2025 unsigned HOST_WIDE_INT const_vf;
2026 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2027 /* The main loop handles all iterations. */
2028 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2029 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2030 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
2031 {
2032 /* Work out the (constant) number of iterations that need to be
2033 peeled for reasons other than niters. */
2034 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2035 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2036 peel_niter += 1;
2037 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
2038 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2039 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2040 }
2041 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2042 /* ??? When peeling for gaps but not alignment, we could
2043 try to check whether the (variable) niters is known to be
2044 VF * N + 1. That's something of a niche case though. */
2045 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2046 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2047 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2048 < (unsigned) exact_log2 (const_vf))
2049 /* In case of versioning, check if the maximum number of
2050 iterations is greater than th. If they are identical,
2051 the epilogue is unnecessary. */
2052 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2053 || ((unsigned HOST_WIDE_INT) max_niter
2054 > (th / const_vf) * const_vf))))
2055 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
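
/* For example (with hypothetical numbers): given a constant VF of 4 and a
   variable iteration count N, tree_ctz (N) is usually 0, which is smaller
   than exact_log2 (4) = 2, so, in the absence of the other conditions
   above, an epilogue loop is required.  If the iteration count is instead
   known to be a multiple of 4 (say 4 * N), the divisibility is visible to
   tree_ctz and no peeling for niters is needed.  */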
2056
2057 /* If an epilogue loop is required make sure we can create one. */
2058 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2059 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2060 {
2061 if (dump_enabled_p ())
2062 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2063 if (!vect_can_advance_ivs_p (loop_vinfo)
2064 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2065 single_exit (LOOP_VINFO_LOOP
2066 (loop_vinfo))))
2067 {
2068 ok = opt_result::failure_at (vect_location,
2069 "not vectorized: can't create required "
2070 "epilog loop\n");
2071 goto again;
2072 }
2073 }
2074
2075 /* During peeling, we need to check if the number of loop iterations is
2076 enough for both the peeled prolog loop and the vector loop. This check
2077 can be merged with the threshold check of loop versioning, so
2078 increase the threshold for this case if necessary. */
2079 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2080 {
2081 poly_uint64 niters_th = 0;
2082
2083 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2084 {
2085 /* Niters for peeled prolog loop. */
2086 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2087 {
2088 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2089 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2090 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2091 }
2092 else
2093 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2094 }
2095
2096 /* Niters for at least one iteration of vectorized loop. */
2097 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2098 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2099 /* One additional iteration because of peeling for gaps. */
2100 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2101 niters_th += 1;
2102 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2103 }
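
/* A hypothetical instance of the computation above: with a vectorization
   factor of 4, a known prologue peel of 3 iterations for alignment and
   peeling for gaps required, the versioning threshold is

     niters_th = 3 + 4 + 1 = 8

   i.e. roughly, the scalar version of the loop is used at run time when
   fewer than 8 iterations are available.  */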
2104
2105 gcc_assert (known_eq (vectorization_factor,
2106 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2107
2108 /* Ok to vectorize! */
2109 return opt_result::success ();
2110
2111 again:
2112 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2113 gcc_assert (!ok);
2114
2115 /* Try again with SLP forced off but if we didn't do any SLP there is
2116 no point in re-trying. */
2117 if (!slp)
2118 return ok;
2119
2120 /* If there are reduction chains re-trying will fail anyway. */
2121 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2122 return ok;
2123
2124 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2125 via interleaving or lane instructions. */
2126 slp_instance instance;
2127 slp_tree node;
2128 unsigned i, j;
2129 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2130 {
2131 stmt_vec_info vinfo;
2132 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2133 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2134 continue;
2135 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2136 unsigned int size = DR_GROUP_SIZE (vinfo);
2137 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2138 if (! vect_store_lanes_supported (vectype, size, false)
2139 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2140 && ! vect_grouped_store_supported (vectype, size))
2141 return opt_result::failure_at (vinfo->stmt,
2142 "unsupported grouped store\n");
2143 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2144 {
2145 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2146 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2147 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2148 size = DR_GROUP_SIZE (vinfo);
2149 vectype = STMT_VINFO_VECTYPE (vinfo);
2150 if (! vect_load_lanes_supported (vectype, size, false)
2151 && ! vect_grouped_load_supported (vectype, single_element_p,
2152 size))
2153 return opt_result::failure_at (vinfo->stmt,
2154 "unsupported grouped load\n");
2155 }
2156 }
2157
2158 if (dump_enabled_p ())
2159 dump_printf_loc (MSG_NOTE, vect_location,
2160 "re-trying with SLP disabled\n");
2161
2162 /* Roll back state appropriately. No SLP this time. */
2163 slp = false;
2164 /* Restore vectorization factor as it were without SLP. */
2165 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2166 /* Free the SLP instances. */
2167 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2168 vect_free_slp_instance (instance, false);
2169 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2170 /* Reset SLP type to loop_vect on all stmts. */
2171 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2172 {
2173 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2174 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2175 !gsi_end_p (si); gsi_next (&si))
2176 {
2177 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2178 STMT_SLP_TYPE (stmt_info) = loop_vect;
2179 }
2180 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2181 !gsi_end_p (si); gsi_next (&si))
2182 {
2183 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2184 STMT_SLP_TYPE (stmt_info) = loop_vect;
2185 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2186 {
2187 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2188 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2189 STMT_SLP_TYPE (stmt_info) = loop_vect;
2190 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2191 !gsi_end_p (pi); gsi_next (&pi))
2192 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2193 = loop_vect;
2194 }
2195 }
2196 }
2197 /* Free optimized alias test DDRS. */
2198 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2199 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2200 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2201 /* Reset target cost data. */
2202 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2203 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2204 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2205 /* Reset accumulated rgroup information. */
2206 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2207 /* Reset assorted flags. */
2208 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2209 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2210 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2211 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2212 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2213
2214 goto start_over;
2215 }
2216
2217 /* Function vect_analyze_loop.
2218
2219 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2220 for it. The different analyses will record information in the
2221 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL epilogue must
2222 be vectorized. */
2223 opt_loop_vec_info
2224 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
2225 vec_info_shared *shared)
2226 {
2227 auto_vector_sizes vector_sizes;
2228
2229 /* Autodetect first vector size we try. */
2230 current_vector_size = 0;
2231 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2232 unsigned int next_size = 0;
2233
2234 DUMP_VECT_SCOPE ("analyze_loop_nest");
2235
2236 if (loop_outer (loop)
2237 && loop_vec_info_for_loop (loop_outer (loop))
2238 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2239 return opt_loop_vec_info::failure_at (vect_location,
2240 "outer-loop already vectorized.\n");
2241
2242 if (!find_loop_nest (loop, &shared->loop_nest))
2243 return opt_loop_vec_info::failure_at
2244 (vect_location,
2245 "not vectorized: loop nest containing two or more consecutive inner"
2246 " loops cannot be vectorized\n");
2247
2248 unsigned n_stmts = 0;
2249 poly_uint64 autodetected_vector_size = 0;
2250 while (1)
2251 {
2252 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2253 opt_loop_vec_info loop_vinfo
2254 = vect_analyze_loop_form (loop, shared);
2255 if (!loop_vinfo)
2256 {
2257 if (dump_enabled_p ())
2258 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2259 "bad loop form.\n");
2260 return loop_vinfo;
2261 }
2262
2263 bool fatal = false;
2264
2265 if (orig_loop_vinfo)
2266 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2267
2268 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2269 if (res)
2270 {
2271 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2272
2273 return loop_vinfo;
2274 }
2275
2276 delete loop_vinfo;
2277
2278 if (next_size == 0)
2279 autodetected_vector_size = current_vector_size;
2280
2281 if (next_size < vector_sizes.length ()
2282 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2283 next_size += 1;
2284
2285 if (fatal
2286 || next_size == vector_sizes.length ()
2287 || known_eq (current_vector_size, 0U))
2288 return opt_loop_vec_info::propagate_failure (res);
2289
2290 /* Try the next biggest vector size. */
2291 current_vector_size = vector_sizes[next_size++];
2292 if (dump_enabled_p ())
2293 {
2294 dump_printf_loc (MSG_NOTE, vect_location,
2295 "***** Re-trying analysis with "
2296 "vector size ");
2297 dump_dec (MSG_NOTE, current_vector_size);
2298 dump_printf (MSG_NOTE, "\n");
2299 }
2300 }
2301 }
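
/* To illustrate the retry loop above with made-up numbers: if the target
   advertises vector sizes of { 64, 32, 16 } bytes and the autodetected
   current_vector_size is 64, a non-fatal analysis failure causes the loop
   to be re-analyzed with a 32-byte and then a 16-byte vector size before
   vectorization is finally given up on.  */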
2302
2303 /* Return true if there is an in-order reduction function for CODE, storing
2304 it in *REDUC_FN if so. */
2305
2306 static bool
2307 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2308 {
2309 switch (code)
2310 {
2311 case PLUS_EXPR:
2312 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2313 return true;
2314
2315 default:
2316 return false;
2317 }
2318 }
2319
2320 /* Function reduction_fn_for_scalar_code
2321
2322 Input:
2323 CODE - tree_code of a reduction operation.
2324
2325 Output:
2326 REDUC_FN - the corresponding internal function to be used to reduce the
2327 vector of partial results into a single scalar result, or IFN_LAST
2328 if the operation is a supported reduction operation, but does not have
2329 such an internal function.
2330
2331 Return FALSE if CODE currently cannot be vectorized as a reduction. */
2332
2333 static bool
2334 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2335 {
2336 switch (code)
2337 {
2338 case MAX_EXPR:
2339 *reduc_fn = IFN_REDUC_MAX;
2340 return true;
2341
2342 case MIN_EXPR:
2343 *reduc_fn = IFN_REDUC_MIN;
2344 return true;
2345
2346 case PLUS_EXPR:
2347 *reduc_fn = IFN_REDUC_PLUS;
2348 return true;
2349
2350 case BIT_AND_EXPR:
2351 *reduc_fn = IFN_REDUC_AND;
2352 return true;
2353
2354 case BIT_IOR_EXPR:
2355 *reduc_fn = IFN_REDUC_IOR;
2356 return true;
2357
2358 case BIT_XOR_EXPR:
2359 *reduc_fn = IFN_REDUC_XOR;
2360 return true;
2361
2362 case MULT_EXPR:
2363 case MINUS_EXPR:
2364 *reduc_fn = IFN_LAST;
2365 return true;
2366
2367 default:
2368 return false;
2369 }
2370 }
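
/* For instance, a max reduction such as (illustrative source only)

     int m = x[0];
     for (int i = 1; i < n; i++)
       m = m > x[i] ? m : x[i];

   maps MAX_EXPR to IFN_REDUC_MAX for the final cross-lane reduction,
   whereas a product reduction (MULT_EXPR) gets IFN_LAST and its epilogue
   reduction has to be open-coded instead.  */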
2371
2372 /* If there is a neutral value X such that SLP reduction NODE would not
2373 be affected by the introduction of additional X elements, return that X,
2374 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2375 is true if the SLP statements perform a single reduction, false if each
2376 statement performs an independent reduction. */
2377
2378 static tree
2379 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2380 bool reduc_chain)
2381 {
2382 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2383 stmt_vec_info stmt_vinfo = stmts[0];
2384 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2385 tree scalar_type = TREE_TYPE (vector_type);
2386 struct loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2387 gcc_assert (loop);
2388
2389 switch (code)
2390 {
2391 case WIDEN_SUM_EXPR:
2392 case DOT_PROD_EXPR:
2393 case SAD_EXPR:
2394 case PLUS_EXPR:
2395 case MINUS_EXPR:
2396 case BIT_IOR_EXPR:
2397 case BIT_XOR_EXPR:
2398 return build_zero_cst (scalar_type);
2399
2400 case MULT_EXPR:
2401 return build_one_cst (scalar_type);
2402
2403 case BIT_AND_EXPR:
2404 return build_all_ones_cst (scalar_type);
2405
2406 case MAX_EXPR:
2407 case MIN_EXPR:
2408 /* For MIN/MAX the initial values are neutral. A reduction chain
2409 has only a single initial value, so that value is neutral for
2410 all statements. */
2411 if (reduc_chain)
2412 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2413 loop_preheader_edge (loop));
2414 return NULL_TREE;
2415
2416 default:
2417 return NULL_TREE;
2418 }
2419 }
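
/* For example (numbers purely illustrative): if an SLP reduction over a
   group of 3 sums is widened to a vector of 4 elements, one extra element
   must be introduced; for PLUS_EXPR the neutral value 0 leaves the result
   unchanged, for MULT_EXPR it is 1 and for BIT_AND_EXPR an all-ones
   value.  MIN and MAX have no such universal value, which is why the
   initial value of a reduction chain (or NULL_TREE) is returned for them
   instead.  */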
2420
2421 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2422 STMT is printed with a message MSG. */
2423
2424 static void
2425 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2426 {
2427 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
2428 }
2429
2430 /* DEF_STMT_INFO occurs in a loop that contains a potential reduction
2431 operation. Return true if the results of DEF_STMT_INFO are something
2432 that can be accumulated by such a reduction. */
2433
2434 static bool
2435 vect_valid_reduction_input_p (stmt_vec_info def_stmt_info)
2436 {
2437 return (is_gimple_assign (def_stmt_info->stmt)
2438 || is_gimple_call (def_stmt_info->stmt)
2439 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_induction_def
2440 || (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI
2441 && STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_internal_def
2442 && !is_loop_header_bb_p (gimple_bb (def_stmt_info->stmt))));
2443 }
2444
2445 /* Detect SLP reduction of the form:
2446
2447 #a1 = phi <a5, a0>
2448 a2 = operation (a1)
2449 a3 = operation (a2)
2450 a4 = operation (a3)
2451 a5 = operation (a4)
2452
2453 #a = phi <a5>
2454
2455 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2456 FIRST_STMT is the first reduction stmt in the chain
2457 (a2 = operation (a1)).
2458
2459 Return TRUE if a reduction chain was detected. */
2460
2461 static bool
2462 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2463 gimple *first_stmt)
2464 {
2465 struct loop *loop = (gimple_bb (phi))->loop_father;
2466 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2467 enum tree_code code;
2468 gimple *loop_use_stmt = NULL;
2469 stmt_vec_info use_stmt_info;
2470 tree lhs;
2471 imm_use_iterator imm_iter;
2472 use_operand_p use_p;
2473 int nloop_uses, size = 0, n_out_of_loop_uses;
2474 bool found = false;
2475
2476 if (loop != vect_loop)
2477 return false;
2478
2479 auto_vec<stmt_vec_info, 8> reduc_chain;
2480 lhs = PHI_RESULT (phi);
2481 code = gimple_assign_rhs_code (first_stmt);
2482 while (1)
2483 {
2484 nloop_uses = 0;
2485 n_out_of_loop_uses = 0;
2486 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2487 {
2488 gimple *use_stmt = USE_STMT (use_p);
2489 if (is_gimple_debug (use_stmt))
2490 continue;
2491
2492 /* Check if we got back to the reduction phi. */
2493 if (use_stmt == phi)
2494 {
2495 loop_use_stmt = use_stmt;
2496 found = true;
2497 break;
2498 }
2499
2500 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2501 {
2502 loop_use_stmt = use_stmt;
2503 nloop_uses++;
2504 }
2505 else
2506 n_out_of_loop_uses++;
2507
2508 /* There can be either a single use in the loop or two uses in
2509 phi nodes. */
2510 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2511 return false;
2512 }
2513
2514 if (found)
2515 break;
2516
2517 /* We reached a statement with no loop uses. */
2518 if (nloop_uses == 0)
2519 return false;
2520
2521 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2522 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2523 return false;
2524
2525 if (!is_gimple_assign (loop_use_stmt)
2526 || code != gimple_assign_rhs_code (loop_use_stmt)
2527 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2528 return false;
2529
2530 /* Insert USE_STMT into reduction chain. */
2531 use_stmt_info = loop_info->lookup_stmt (loop_use_stmt);
2532 reduc_chain.safe_push (use_stmt_info);
2533
2534 lhs = gimple_assign_lhs (loop_use_stmt);
2535 size++;
2536 }
2537
2538 if (!found || loop_use_stmt != phi || size < 2)
2539 return false;
2540
2541 /* Swap the operands, if needed, to make the reduction operand be the second
2542 operand. */
2543 lhs = PHI_RESULT (phi);
2544 for (unsigned i = 0; i < reduc_chain.length (); ++i)
2545 {
2546 gassign *next_stmt = as_a <gassign *> (reduc_chain[i]->stmt);
2547 if (gimple_assign_rhs2 (next_stmt) == lhs)
2548 {
2549 tree op = gimple_assign_rhs1 (next_stmt);
2550 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2551
2552 /* Check that the other def is either defined in the loop
2553 ("vect_internal_def"), or it's an induction (defined by a
2554 loop-header phi-node). */
2555 if (def_stmt_info
2556 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2557 && vect_valid_reduction_input_p (def_stmt_info))
2558 {
2559 lhs = gimple_assign_lhs (next_stmt);
2560 continue;
2561 }
2562
2563 return false;
2564 }
2565 else
2566 {
2567 tree op = gimple_assign_rhs2 (next_stmt);
2568 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2569
2570 /* Check that the other def is either defined in the loop
2571 ("vect_internal_def"), or it's an induction (defined by a
2572 loop-header phi-node). */
2573 if (def_stmt_info
2574 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2575 && vect_valid_reduction_input_p (def_stmt_info))
2576 {
2577 if (dump_enabled_p ())
2578 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: %G",
2579 next_stmt);
2580
2581 swap_ssa_operands (next_stmt,
2582 gimple_assign_rhs1_ptr (next_stmt),
2583 gimple_assign_rhs2_ptr (next_stmt));
2584 update_stmt (next_stmt);
2585
2586 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2587 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2588 }
2589 else
2590 return false;
2591 }
2592
2593 lhs = gimple_assign_lhs (next_stmt);
2594 }
2595
2596 /* Build up the actual chain. */
2597 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
2598 {
2599 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
2600 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
2601 }
2602 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
2603 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
2604
2605 /* Save the chain for further analysis in SLP detection. */
2606 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
2607 REDUC_GROUP_SIZE (reduc_chain[0]) = size;
2608
2609 return true;
2610 }
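
/* A source-level loop that typically produces the chain shape handled
   above (a sketch, not a particular testcase):

     int s = 0;
     for (int i = 0; i < n; i++)
       {
         s += a[4 * i];
         s += a[4 * i + 1];
         s += a[4 * i + 2];
         s += a[4 * i + 3];
       }

   Each statement accumulates into the result of the previous one, giving
   the #a1 = phi <a5, a0>; a2 = a1 + ...; ...; a5 = a4 + ... form with a
   group size of 4.  */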
2611
2612 /* Return true if we need an in-order reduction for operation CODE
2613 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2614 overflow must wrap. */
2615
2616 static bool
2617 needs_fold_left_reduction_p (tree type, tree_code code,
2618 bool need_wrapping_integral_overflow)
2619 {
2620 /* CHECKME: check for !flag_finite_math_only too? */
2621 if (SCALAR_FLOAT_TYPE_P (type))
2622 switch (code)
2623 {
2624 case MIN_EXPR:
2625 case MAX_EXPR:
2626 return false;
2627
2628 default:
2629 return !flag_associative_math;
2630 }
2631
2632 if (INTEGRAL_TYPE_P (type))
2633 {
2634 if (!operation_no_trapping_overflow (type, code))
2635 return true;
2636 if (need_wrapping_integral_overflow
2637 && !TYPE_OVERFLOW_WRAPS (type)
2638 && operation_can_overflow (code))
2639 return true;
2640 return false;
2641 }
2642
2643 if (SAT_FIXED_POINT_TYPE_P (type))
2644 return true;
2645
2646 return false;
2647 }
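
/* Two illustrative cases for the test above: a double-precision sum

     double s = 0.0;
     for (int i = 0; i < n; i++)
       s += a[i];

   must be kept in order unless -fassociative-math is in effect, and the
   same loop over signed ints compiled with -ftrapv cannot be reassociated
   either, because the intermediate sums might trap on overflow.  Float
   MIN/MAX reductions, by contrast, are safe in any order.  */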
2648
2649 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2650 reduction operation CODE has a handled computation expression. */
2651
2652 bool
2653 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2654 tree loop_arg, enum tree_code code)
2655 {
2656 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2657 auto_bitmap visited;
2658 tree lookfor = PHI_RESULT (phi);
2659 ssa_op_iter curri;
2660 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2661 while (USE_FROM_PTR (curr) != loop_arg)
2662 curr = op_iter_next_use (&curri);
2663 curri.i = curri.numops;
2664 do
2665 {
2666 path.safe_push (std::make_pair (curri, curr));
2667 tree use = USE_FROM_PTR (curr);
2668 if (use == lookfor)
2669 break;
2670 gimple *def = SSA_NAME_DEF_STMT (use);
2671 if (gimple_nop_p (def)
2672 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2673 {
2674 pop:
2675 do
2676 {
2677 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2678 curri = x.first;
2679 curr = x.second;
2680 do
2681 curr = op_iter_next_use (&curri);
2682 /* Skip already visited or non-SSA operands (from iterating
2683 over PHI args). */
2684 while (curr != NULL_USE_OPERAND_P
2685 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2686 || ! bitmap_set_bit (visited,
2687 SSA_NAME_VERSION
2688 (USE_FROM_PTR (curr)))));
2689 }
2690 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2691 if (curr == NULL_USE_OPERAND_P)
2692 break;
2693 }
2694 else
2695 {
2696 if (gimple_code (def) == GIMPLE_PHI)
2697 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2698 else
2699 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2700 while (curr != NULL_USE_OPERAND_P
2701 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2702 || ! bitmap_set_bit (visited,
2703 SSA_NAME_VERSION
2704 (USE_FROM_PTR (curr)))))
2705 curr = op_iter_next_use (&curri);
2706 if (curr == NULL_USE_OPERAND_P)
2707 goto pop;
2708 }
2709 }
2710 while (1);
2711 if (dump_file && (dump_flags & TDF_DETAILS))
2712 {
2713 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2714 unsigned i;
2715 std::pair<ssa_op_iter, use_operand_p> *x;
2716 FOR_EACH_VEC_ELT (path, i, x)
2717 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
2718 dump_printf (MSG_NOTE, "\n");
2719 }
2720
2721 /* Check whether the reduction path detected is valid. */
2722 bool fail = path.length () == 0;
2723 bool neg = false;
2724 for (unsigned i = 1; i < path.length (); ++i)
2725 {
2726 gimple *use_stmt = USE_STMT (path[i].second);
2727 tree op = USE_FROM_PTR (path[i].second);
2728 if (! has_single_use (op)
2729 || ! is_gimple_assign (use_stmt))
2730 {
2731 fail = true;
2732 break;
2733 }
2734 if (gimple_assign_rhs_code (use_stmt) != code)
2735 {
2736 if (code == PLUS_EXPR
2737 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2738 {
2739 /* Track whether we negate the reduction value each iteration. */
2740 if (gimple_assign_rhs2 (use_stmt) == op)
2741 neg = ! neg;
2742 }
2743 else
2744 {
2745 fail = true;
2746 break;
2747 }
2748 }
2749 }
2750 return ! fail && ! neg;
2751 }
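
/* As an illustration of the walk above (the SSA names are hypothetical),
   consider a PLUS_EXPR reduction of the form

     res_1 = PHI <res_0, res_4>
     res_2 = res_1 + a_5;
     res_3 = res_2 - b_6;
     res_4 = res_3 + c_7;

   The path from res_4 back to the PHI result is accepted: the MINUS_EXPR
   is tolerated because the reduction value is its first operand, so "neg"
   is never set.  If an intermediate value had another use, or some
   statement used an unsupported code, the check would fail.  */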
2752
2753
2754 /* Function vect_is_simple_reduction
2755
2756 (1) Detect a cross-iteration def-use cycle that represents a simple
2757 reduction computation. We look for the following pattern:
2758
2759 loop_header:
2760 a1 = phi < a0, a2 >
2761 a3 = ...
2762 a2 = operation (a3, a1)
2763
2764 or
2765
2766 a3 = ...
2767 loop_header:
2768 a1 = phi < a0, a2 >
2769 a2 = operation (a3, a1)
2770
2771 such that:
2772 1. operation is commutative and associative and it is safe to
2773 change the order of the computation
2774 2. no uses for a2 in the loop (a2 is used out of the loop)
2775 3. no uses of a1 in the loop besides the reduction operation
2776 4. no uses of a1 outside the loop.
2777
2778 Conditions 1,4 are tested here.
2779 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2780
2781 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2782 nested cycles.
2783
2784 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2785 reductions:
2786
2787 a1 = phi < a0, a2 >
2788 inner loop (def of a3)
2789 a2 = phi < a3 >
2790
2791 (4) Detect condition expressions, i.e.:
2792 for (int i = 0; i < N; i++)
2793 if (a[i] < val)
2794 ret_val = a[i];
2795
2796 */
2797
2798 static stmt_vec_info
2799 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
2800 bool *double_reduc,
2801 bool need_wrapping_integral_overflow,
2802 enum vect_reduction_type *v_reduc_type)
2803 {
2804 gphi *phi = as_a <gphi *> (phi_info->stmt);
2805 struct loop *loop = (gimple_bb (phi))->loop_father;
2806 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2807 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
2808 gimple *phi_use_stmt = NULL;
2809 enum tree_code orig_code, code;
2810 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2811 tree type;
2812 tree name;
2813 imm_use_iterator imm_iter;
2814 use_operand_p use_p;
2815 bool phi_def;
2816
2817 *double_reduc = false;
2818 *v_reduc_type = TREE_CODE_REDUCTION;
2819
2820 tree phi_name = PHI_RESULT (phi);
2821 /* ??? If there are no uses of the PHI result the inner loop reduction
2822 won't be detected as possibly double-reduction by vectorizable_reduction
2823 because that tries to walk the PHI arg from the preheader edge which
2824 can be constant. See PR60382. */
2825 if (has_zero_uses (phi_name))
2826 return NULL;
2827 unsigned nphi_def_loop_uses = 0;
2828 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2829 {
2830 gimple *use_stmt = USE_STMT (use_p);
2831 if (is_gimple_debug (use_stmt))
2832 continue;
2833
2834 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2835 {
2836 if (dump_enabled_p ())
2837 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2838 "intermediate value used outside loop.\n");
2839
2840 return NULL;
2841 }
2842
2843 nphi_def_loop_uses++;
2844 phi_use_stmt = use_stmt;
2845 }
2846
2847 edge latch_e = loop_latch_edge (loop);
2848 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2849 if (TREE_CODE (loop_arg) != SSA_NAME)
2850 {
2851 if (dump_enabled_p ())
2852 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2853 "reduction: not ssa_name: %T\n", loop_arg);
2854 return NULL;
2855 }
2856
2857 stmt_vec_info def_stmt_info = loop_info->lookup_def (loop_arg);
2858 if (!def_stmt_info
2859 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
2860 return NULL;
2861
2862 if (gassign *def_stmt = dyn_cast <gassign *> (def_stmt_info->stmt))
2863 {
2864 name = gimple_assign_lhs (def_stmt);
2865 phi_def = false;
2866 }
2867 else if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
2868 {
2869 name = PHI_RESULT (def_stmt);
2870 phi_def = true;
2871 }
2872 else
2873 {
2874 if (dump_enabled_p ())
2875 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2876 "reduction: unhandled reduction operation: %G",
2877 def_stmt_info->stmt);
2878 return NULL;
2879 }
2880
2881 unsigned nlatch_def_loop_uses = 0;
2882 auto_vec<gphi *, 3> lcphis;
2883 bool inner_loop_of_double_reduc = false;
2884 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2885 {
2886 gimple *use_stmt = USE_STMT (use_p);
2887 if (is_gimple_debug (use_stmt))
2888 continue;
2889 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2890 nlatch_def_loop_uses++;
2891 else
2892 {
2893 /* We can have more than one loop-closed PHI. */
2894 lcphis.safe_push (as_a <gphi *> (use_stmt));
2895 if (nested_in_vect_loop
2896 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
2897 == vect_double_reduction_def))
2898 inner_loop_of_double_reduc = true;
2899 }
2900 }
2901
2902 /* If this isn't a nested cycle or if the nested cycle reduction value
2903 is used outside of the inner loop, we cannot handle uses of the reduction
2904 value. */
2905 if ((!nested_in_vect_loop || inner_loop_of_double_reduc)
2906 && (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1))
2907 {
2908 if (dump_enabled_p ())
2909 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2910 "reduction used in loop.\n");
2911 return NULL;
2912 }
2913
2914 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2915 defined in the inner loop. */
2916 if (phi_def)
2917 {
2918 gphi *def_stmt = as_a <gphi *> (def_stmt_info->stmt);
2919 op1 = PHI_ARG_DEF (def_stmt, 0);
2920
2921 if (gimple_phi_num_args (def_stmt) != 1
2922 || TREE_CODE (op1) != SSA_NAME)
2923 {
2924 if (dump_enabled_p ())
2925 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2926 "unsupported phi node definition.\n");
2927
2928 return NULL;
2929 }
2930
2931 gimple *def1 = SSA_NAME_DEF_STMT (op1);
2932 if (gimple_bb (def1)
2933 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2934 && loop->inner
2935 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2936 && is_gimple_assign (def1)
2937 && is_a <gphi *> (phi_use_stmt)
2938 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2939 {
2940 if (dump_enabled_p ())
2941 report_vect_op (MSG_NOTE, def_stmt,
2942 "detected double reduction: ");
2943
2944 *double_reduc = true;
2945 return def_stmt_info;
2946 }
2947
2948 return NULL;
2949 }
2950
2951 /* If we are vectorizing an inner reduction, we execute it in the
2952 original order only when we are not dealing with a
2953 double reduction. */
2954 bool check_reduction = true;
2955 if (flow_loop_nested_p (vect_loop, loop))
2956 {
2957 gphi *lcphi;
2958 unsigned i;
2959 check_reduction = false;
2960 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
2961 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
2962 {
2963 gimple *use_stmt = USE_STMT (use_p);
2964 if (is_gimple_debug (use_stmt))
2965 continue;
2966 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
2967 check_reduction = true;
2968 }
2969 }
2970
2971 gassign *def_stmt = as_a <gassign *> (def_stmt_info->stmt);
2972 code = orig_code = gimple_assign_rhs_code (def_stmt);
2973
2974 if (nested_in_vect_loop && !check_reduction)
2975 {
2976 /* FIXME: Even for non-reductions code generation is funneled
2977 through vectorizable_reduction for the stmt defining the
2978 PHI latch value. So we have to artificially restrict ourselves
2979 for the supported operations. */
2980 switch (get_gimple_rhs_class (code))
2981 {
2982 case GIMPLE_BINARY_RHS:
2983 case GIMPLE_TERNARY_RHS:
2984 break;
2985 default:
2986 /* Not supported by vectorizable_reduction. */
2987 if (dump_enabled_p ())
2988 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2989 "nested cycle: not handled operation: ");
2990 return NULL;
2991 }
2992 if (dump_enabled_p ())
2993 report_vect_op (MSG_NOTE, def_stmt, "detected nested cycle: ");
2994 return def_stmt_info;
2995 }
2996
2997 /* We can handle "res -= x[i]", which is non-associative, by
2998 simply rewriting it into "res += -x[i]". Avoid changing the
2999 gimple instruction for the first simple tests and only do this
3000 if we're allowed to change code at all. */
3001 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3002 code = PLUS_EXPR;
3003
3004 if (code == COND_EXPR)
3005 {
3006 if (! nested_in_vect_loop)
3007 *v_reduc_type = COND_REDUCTION;
3008
3009 op3 = gimple_assign_rhs1 (def_stmt);
3010 if (COMPARISON_CLASS_P (op3))
3011 {
3012 op4 = TREE_OPERAND (op3, 1);
3013 op3 = TREE_OPERAND (op3, 0);
3014 }
3015 if (op3 == phi_name || op4 == phi_name)
3016 {
3017 if (dump_enabled_p ())
3018 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3019 "reduction: condition depends on previous"
3020 " iteration: ");
3021 return NULL;
3022 }
3023
3024 op1 = gimple_assign_rhs2 (def_stmt);
3025 op2 = gimple_assign_rhs3 (def_stmt);
3026 }
3027 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3028 {
3029 if (dump_enabled_p ())
3030 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3031 "reduction: not commutative/associative: ");
3032 return NULL;
3033 }
3034 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3035 {
3036 op1 = gimple_assign_rhs1 (def_stmt);
3037 op2 = gimple_assign_rhs2 (def_stmt);
3038 }
3039 else
3040 {
3041 if (dump_enabled_p ())
3042 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3043 "reduction: not handled operation: ");
3044 return NULL;
3045 }
3046
3047 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3048 {
3049 if (dump_enabled_p ())
3050 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3051 "reduction: both uses not ssa_names: ");
3052
3053 return NULL;
3054 }
3055
3056 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3057 if ((TREE_CODE (op1) == SSA_NAME
3058 && !types_compatible_p (type,TREE_TYPE (op1)))
3059 || (TREE_CODE (op2) == SSA_NAME
3060 && !types_compatible_p (type, TREE_TYPE (op2)))
3061 || (op3 && TREE_CODE (op3) == SSA_NAME
3062 && !types_compatible_p (type, TREE_TYPE (op3)))
3063 || (op4 && TREE_CODE (op4) == SSA_NAME
3064 && !types_compatible_p (type, TREE_TYPE (op4))))
3065 {
3066 if (dump_enabled_p ())
3067 {
3068 dump_printf_loc (MSG_NOTE, vect_location,
3069 "reduction: multiple types: operation type: "
3070 "%T, operands types: %T,%T",
3071 type, TREE_TYPE (op1), TREE_TYPE (op2));
3072 if (op3)
3073 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op3));
3074
3075 if (op4)
3076 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op4));
3077 dump_printf (MSG_NOTE, "\n");
3078 }
3079
3080 return NULL;
3081 }
3082
3083 /* Check whether it's ok to change the order of the computation.
3084 Generally, when vectorizing a reduction we change the order of the
3085 computation. This may change the behavior of the program in some
3086 cases, so we need to check that this is ok. One exception is when
3087 vectorizing an outer-loop: the inner-loop is executed sequentially,
3088 and therefore vectorizing reductions in the inner-loop during
3089 outer-loop vectorization is safe. */
3090 if (check_reduction
3091 && *v_reduc_type == TREE_CODE_REDUCTION
3092 && needs_fold_left_reduction_p (type, code,
3093 need_wrapping_integral_overflow))
3094 *v_reduc_type = FOLD_LEFT_REDUCTION;
3095
3096 /* Reduction is safe. We're dealing with one of the following:
3097 1) integer arithmetic and no trapv
3098 2) floating point arithmetic, and special flags permit this optimization
3099 3) nested cycle (i.e., outer loop vectorization). */
3100 stmt_vec_info def1_info = loop_info->lookup_def (op1);
3101 stmt_vec_info def2_info = loop_info->lookup_def (op2);
3102 if (code != COND_EXPR && !def1_info && !def2_info)
3103 {
3104 if (dump_enabled_p ())
3105 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3106 return NULL;
3107 }
3108
3109 /* Check that one def is the reduction def, defined by PHI,
3110 the other def is either defined in the loop ("vect_internal_def"),
3111 or it's an induction (defined by a loop-header phi-node). */
3112
3113 if (def2_info
3114 && def2_info->stmt == phi
3115 && (code == COND_EXPR
3116 || !def1_info
3117 || !flow_bb_inside_loop_p (loop, gimple_bb (def1_info->stmt))
3118 || vect_valid_reduction_input_p (def1_info)))
3119 {
3120 if (dump_enabled_p ())
3121 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3122 return def_stmt_info;
3123 }
3124
3125 if (def1_info
3126 && def1_info->stmt == phi
3127 && (code == COND_EXPR
3128 || !def2_info
3129 || !flow_bb_inside_loop_p (loop, gimple_bb (def2_info->stmt))
3130 || vect_valid_reduction_input_p (def2_info)))
3131 {
3132 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3133 {
3134 /* Check if we can swap operands (just for simplicity - so that
3135 the rest of the code can assume that the reduction variable
3136 is always the last (second) argument). */
3137 if (code == COND_EXPR)
3138 {
3139 /* Swap cond_expr by inverting the condition. */
3140 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3141 enum tree_code invert_code = ERROR_MARK;
3142 enum tree_code cond_code = TREE_CODE (cond_expr);
3143
3144 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3145 {
3146 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3147 invert_code = invert_tree_comparison (cond_code, honor_nans);
3148 }
3149 if (invert_code != ERROR_MARK)
3150 {
3151 TREE_SET_CODE (cond_expr, invert_code);
3152 swap_ssa_operands (def_stmt,
3153 gimple_assign_rhs2_ptr (def_stmt),
3154 gimple_assign_rhs3_ptr (def_stmt));
3155 }
3156 else
3157 {
3158 if (dump_enabled_p ())
3159 report_vect_op (MSG_NOTE, def_stmt,
3160 "detected reduction: cannot swap operands "
3161 "for cond_expr");
3162 return NULL;
3163 }
3164 }
3165 else
3166 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3167 gimple_assign_rhs2_ptr (def_stmt));
3168
3169 if (dump_enabled_p ())
3170 report_vect_op (MSG_NOTE, def_stmt,
3171 "detected reduction: need to swap operands: ");
3172
3173 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3174 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3175 }
3176 else
3177 {
3178 if (dump_enabled_p ())
3179 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3180 }
3181
3182 return def_stmt_info;
3183 }
3184
3185 /* Try to find SLP reduction chain. */
3186 if (! nested_in_vect_loop
3187 && code != COND_EXPR
3188 && orig_code != MINUS_EXPR
3189 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3190 {
3191 if (dump_enabled_p ())
3192 report_vect_op (MSG_NOTE, def_stmt,
3193 "reduction: detected reduction chain: ");
3194
3195 return def_stmt_info;
3196 }
3197
3198 /* Look for the expression computing loop_arg from loop PHI result. */
3199 if (check_reduction_path (vect_location, loop, phi, loop_arg, code))
3200 return def_stmt_info;
3201
3202 if (dump_enabled_p ())
3203 {
3204 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3205 "reduction: unknown pattern: ");
3206 }
3207
3208 return NULL;
3209 }
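
/* A sketch of case (3) above, a double reduction, in source form
   (illustrative only):

     int s = 0;
     for (int i = 0; i < n; i++)
       for (int j = 0; j < m; j++)
         s += a[i][j];

   When the i loop is the one being vectorized, the outer-loop PHI for s
   and the PHI that closes the inner loop form the a1 = phi <a0, a2> /
   a2 = phi <a3> cycle described above, and the cycle is reported with
   *double_reduc set to true.  */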
3210
3211 /* Wrapper around vect_is_simple_reduction, which will modify code
3212 in-place if it enables detection of more reductions. The arguments
3213 are as for vect_is_simple_reduction. */
3214
3215 stmt_vec_info
3216 vect_force_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3217 bool *double_reduc,
3218 bool need_wrapping_integral_overflow)
3219 {
3220 enum vect_reduction_type v_reduc_type;
3221 stmt_vec_info def_info
3222 = vect_is_simple_reduction (loop_info, phi_info, double_reduc,
3223 need_wrapping_integral_overflow,
3224 &v_reduc_type);
3225 if (def_info)
3226 {
3227 STMT_VINFO_REDUC_TYPE (phi_info) = v_reduc_type;
3228 STMT_VINFO_REDUC_DEF (phi_info) = def_info;
3229 STMT_VINFO_REDUC_TYPE (def_info) = v_reduc_type;
3230 STMT_VINFO_REDUC_DEF (def_info) = phi_info;
3231 }
3232 return def_info;
3233 }
3234
3235 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3236 int
3237 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3238 int *peel_iters_epilogue,
3239 stmt_vector_for_cost *scalar_cost_vec,
3240 stmt_vector_for_cost *prologue_cost_vec,
3241 stmt_vector_for_cost *epilogue_cost_vec)
3242 {
3243 int retval = 0;
3244 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3245
3246 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3247 {
3248 *peel_iters_epilogue = assumed_vf / 2;
3249 if (dump_enabled_p ())
3250 dump_printf_loc (MSG_NOTE, vect_location,
3251 "cost model: epilogue peel iters set to vf/2 "
3252 "because loop iterations are unknown.\n");
3253
3254 /* If peeled iterations are known but the number of scalar loop
3255 iterations is unknown, count a taken branch per peeled loop. */
3256 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3257 NULL, 0, vect_prologue);
3258 retval += record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3259 NULL, 0, vect_epilogue);
3260 }
3261 else
3262 {
3263 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3264 peel_iters_prologue = niters < peel_iters_prologue ?
3265 niters : peel_iters_prologue;
3266 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3267 /* If we need to peel for gaps but no epilogue peeling would otherwise
3268 be required, we have to peel VF iterations. */
3269 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3270 *peel_iters_epilogue = assumed_vf;
3271 }
3272
3273 stmt_info_for_cost *si;
3274 int j;
3275 if (peel_iters_prologue)
3276 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3277 retval += record_stmt_cost (prologue_cost_vec,
3278 si->count * peel_iters_prologue,
3279 si->kind, si->stmt_info, si->misalign,
3280 vect_prologue);
3281 if (*peel_iters_epilogue)
3282 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3283 retval += record_stmt_cost (epilogue_cost_vec,
3284 si->count * *peel_iters_epilogue,
3285 si->kind, si->stmt_info, si->misalign,
3286 vect_epilogue);
3287
3288 return retval;
3289 }
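
/* A worked example with hypothetical numbers: for a loop with 100 known
   iterations, a prologue peel of 3 iterations and an assumed VF of 8, the
   epilogue peel count is (100 - 3) % 8 = 1, so the prologue is charged
   3 times the scalar single-iteration cost and the epilogue once.  If the
   iteration count is unknown, the epilogue peel count defaults to
   VF / 2 = 4 and a taken branch is charged per peeled loop.  */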
3290
3291 /* Function vect_estimate_min_profitable_iters
3292
3293 Return the number of iterations required for the vector version of the
3294 loop to be profitable relative to the cost of the scalar version of the
3295 loop.
3296
3297 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3298 of iterations for vectorization. -1 value means loop vectorization
3299 is not profitable. This returned value may be used for dynamic
3300 profitability check.
3301
3302 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3303 for static check against estimated number of iterations. */
3304
3305 static void
3306 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3307 int *ret_min_profitable_niters,
3308 int *ret_min_profitable_estimate)
3309 {
3310 int min_profitable_iters;
3311 int min_profitable_estimate;
3312 int peel_iters_prologue;
3313 int peel_iters_epilogue;
3314 unsigned vec_inside_cost = 0;
3315 int vec_outside_cost = 0;
3316 unsigned vec_prologue_cost = 0;
3317 unsigned vec_epilogue_cost = 0;
3318 int scalar_single_iter_cost = 0;
3319 int scalar_outside_cost = 0;
3320 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3321 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3322 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3323
3324 /* Cost model disabled. */
3325 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3326 {
3327 if (dump_enabled_p ())
3328 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3329 *ret_min_profitable_niters = 0;
3330 *ret_min_profitable_estimate = 0;
3331 return;
3332 }
3333
3334 /* Requires loop versioning tests to handle misalignment. */
3335 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3336 {
3337 /* FIXME: Make cost depend on complexity of individual check. */
3338 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3339 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3340 vect_prologue);
3341 if (dump_enabled_p ())
3342 dump_printf (MSG_NOTE,
3343 "cost model: Adding cost of checks for loop "
3344 "versioning to treat misalignment.\n");
3345 }
3346
3347 /* Requires loop versioning with alias checks. */
3348 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3349 {
3350 /* FIXME: Make cost depend on complexity of individual check. */
3351 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3352 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3353 vect_prologue);
3354 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3355 if (len)
3356 /* Count LEN - 1 ANDs and LEN comparisons. */
3357 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3358 NULL, 0, vect_prologue);
3359 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3360 if (len)
3361 {
3362 /* Count LEN - 1 ANDs and LEN comparisons. */
3363 unsigned int nstmts = len * 2 - 1;
3364 /* +1 for each bias that needs adding. */
3365 for (unsigned int i = 0; i < len; ++i)
3366 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3367 nstmts += 1;
3368 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3369 NULL, 0, vect_prologue);
3370 }
3371 if (dump_enabled_p ())
3372 dump_printf (MSG_NOTE,
3373 "cost model: Adding cost of checks for loop "
3374 "versioning aliasing.\n");
3375 }
3376
3377 /* Requires loop versioning with niter checks. */
3378 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3379 {
3380 /* FIXME: Make cost depend on complexity of individual check. */
3381 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3382 vect_prologue);
3383 if (dump_enabled_p ())
3384 dump_printf (MSG_NOTE,
3385 "cost model: Adding cost of checks for loop "
3386 "versioning niters.\n");
3387 }
3388
3389 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3390 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3391 vect_prologue);
3392
3393 /* Count statements in the scalar loop. Use this as the scalar cost of a
3394 single iteration for now.
3395
3396 TODO: Add outer loop support.
3397
3398 TODO: Consider assigning different costs to different scalar
3399 statements. */
3400
3401 scalar_single_iter_cost
3402 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3403
3404 /* Add additional cost for the peeled instructions in prologue and epilogue
3405 loop. (For fully-masked loops there will be no peeling.)
3406
3407 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3408 at compile time, we assume it's vf/2 (the worst case would be vf-1).
3409
3410 TODO: Build an expression that represents peel_iters for prologue and
3411 epilogue to be used in a run-time test. */
3412
3413 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3414 {
3415 peel_iters_prologue = 0;
3416 peel_iters_epilogue = 0;
3417
3418 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3419 {
3420 /* We need to peel exactly one iteration. */
3421 peel_iters_epilogue += 1;
3422 stmt_info_for_cost *si;
3423 int j;
3424 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3425 j, si)
3426 (void) add_stmt_cost (target_cost_data, si->count,
3427 si->kind, si->stmt_info, si->misalign,
3428 vect_epilogue);
3429 }
3430 }
3431 else if (npeel < 0)
3432 {
3433 peel_iters_prologue = assumed_vf / 2;
3434 if (dump_enabled_p ())
3435 dump_printf (MSG_NOTE, "cost model: "
3436 "prologue peel iters set to vf/2.\n");
3437
3438 /* If peeling for alignment is unknown, the loop bound of the main
3439 loop becomes unknown. */
3440 peel_iters_epilogue = assumed_vf / 2;
3441 if (dump_enabled_p ())
3442 dump_printf (MSG_NOTE, "cost model: "
3443 "epilogue peel iters set to vf/2 because "
3444 "peeling for alignment is unknown.\n");
3445
3446 /* If peeled iterations are unknown, count a taken branch and a not taken
3447 branch per peeled loop. Even if scalar loop iterations are known,
3448 vector iterations are not known since peeled prologue iterations are
3449 not known. Hence guards remain the same. */
3450 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3451 NULL, 0, vect_prologue);
3452 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3453 NULL, 0, vect_prologue);
3454 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3455 NULL, 0, vect_epilogue);
3456 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3457 NULL, 0, vect_epilogue);
3458 stmt_info_for_cost *si;
3459 int j;
3460 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3461 {
3462 (void) add_stmt_cost (target_cost_data,
3463 si->count * peel_iters_prologue,
3464 si->kind, si->stmt_info, si->misalign,
3465 vect_prologue);
3466 (void) add_stmt_cost (target_cost_data,
3467 si->count * peel_iters_epilogue,
3468 si->kind, si->stmt_info, si->misalign,
3469 vect_epilogue);
3470 }
3471 }
3472 else
3473 {
3474 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3475 stmt_info_for_cost *si;
3476 int j;
3477 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3478
3479 prologue_cost_vec.create (2);
3480 epilogue_cost_vec.create (2);
3481 peel_iters_prologue = npeel;
3482
3483 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3484 &peel_iters_epilogue,
3485 &LOOP_VINFO_SCALAR_ITERATION_COST
3486 (loop_vinfo),
3487 &prologue_cost_vec,
3488 &epilogue_cost_vec);
3489
3490 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3491 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3492 si->misalign, vect_prologue);
3493
3494 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3495 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3496 si->misalign, vect_epilogue);
3497
3498 prologue_cost_vec.release ();
3499 epilogue_cost_vec.release ();
3500 }
3501
3502 /* FORNOW: The scalar outside cost is incremented in one of the
3503 following ways:
3504
3505 1. The vectorizer checks for alignment and aliasing and generates
3506 a condition that allows dynamic vectorization. A cost model
3507 check is ANDED with the versioning condition. Hence the scalar code
3508 path now has the added cost of the versioning check.
3509
3510 if (cost > th & versioning_check)
3511 jmp to vector code
3512
3513 Hence the run-time scalar cost is incremented by the not-taken branch cost.
3514
3515 2. The vectorizer then checks if a prologue is required. If the
3516 cost model check was not done before during versioning, it has to
3517 be done before the prologue check.
3518
3519 if (cost <= th)
3520 prologue = scalar_iters
3521 if (prologue == 0)
3522 jmp to vector code
3523 else
3524 execute prologue
3525 if (prologue == num_iters)
3526 go to exit
3527
3528 Hence the run-time scalar cost is incremented by a taken branch,
3529 plus a not-taken branch, plus a taken branch cost.
3530
3531 3. The vectorizer then checks if an epilogue is required. If the
3532 cost model check was not done before during prologue check, it
3533 has to be done with the epilogue check.
3534
3535 if (prologue == 0)
3536 jmp to vector code
3537 else
3538 execute prologue
3539 if (prologue == num_iters)
3540 go to exit
3541 vector code:
3542 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3543 jmp to epilogue
3544
3545 Hence the run-time scalar cost should be incremented by 2 taken
3546 branches.
3547
3548 TODO: The back end may reorder the BBs differently and reverse
3549 conditions/branch directions. Change the estimates below to
3550 something more reasonable. */
3551
3552 /* If the number of iterations is known and we do not do versioning, we can
3553 decide whether to vectorize at compile time. Hence the scalar version
3554 does not carry cost model guard costs. */
3555 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3556 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3557 {
3558 /* Cost model check occurs at versioning. */
3559 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3560 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3561 else
3562 {
3563 /* Cost model check occurs at prologue generation. */
3564 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3565 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3566 + vect_get_stmt_cost (cond_branch_not_taken);
3567 /* Cost model check occurs at epilogue generation. */
3568 else
3569 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3570 }
3571 }
3572
3573 /* Complete the target-specific cost calculations. */
3574 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3575 &vec_inside_cost, &vec_epilogue_cost);
3576
3577 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3578
3579 if (dump_enabled_p ())
3580 {
3581 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3582 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3583 vec_inside_cost);
3584 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3585 vec_prologue_cost);
3586 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3587 vec_epilogue_cost);
3588 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3589 scalar_single_iter_cost);
3590 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3591 scalar_outside_cost);
3592 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3593 vec_outside_cost);
3594 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3595 peel_iters_prologue);
3596 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3597 peel_iters_epilogue);
3598 }
3599
3600 /* Calculate number of iterations required to make the vector version
3601 profitable, relative to the loop bodies only. The following condition
3602 must hold true:
3603 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
3604 where
3605 SIC = scalar iteration cost, VIC = vector iteration cost,
3606 VOC = vector outside cost, VF = vectorization factor,
3607 NPEEL = prologue iterations + epilogue iterations,
3608 SOC = scalar outside cost for run time cost model check. */
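   /* For example, with SIC = 4, VF = 4 and VIC = 6, each vector iteration
      saves 4 * 4 - 6 = 10 cost units over the four scalar iterations it
      replaces; VOC, SOC and the NPEEL peeled iterations then determine how
      many vector iterations are needed to break even.  */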
3609
3610 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
3611 - vec_inside_cost);
3612 if (saving_per_viter <= 0)
3613 {
3614 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3615 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3616 "vectorization did not happen for a simd loop");
3617
3618 if (dump_enabled_p ())
3619 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3620 "cost model: the vector iteration cost = %d "
3621 "divided by the scalar iteration cost = %d "
3622 "is greater or equal to the vectorization factor = %d"
3623 ".\n",
3624 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3625 *ret_min_profitable_niters = -1;
3626 *ret_min_profitable_estimate = -1;
3627 return;
3628 }
3629
3630 /* ??? The "if" arm is written to handle all cases; see below for what
3631 we would do for !LOOP_VINFO_FULLY_MASKED_P. */
3632 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3633 {
3634 /* Rewriting the condition above in terms of the number of
3635 vector iterations (vniters) rather than the number of
3636 scalar iterations (niters) gives:
3637
3638 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
3639
3640 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
3641
3642 For integer N, X and Y when X > 0:
3643
3644 N * X > Y <==> N >= (Y /[floor] X) + 1. */
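      /* E.g. with X = 10 and Y = 25, N * 10 > 25 first holds at
	 N = (25 /[floor] 10) + 1 = 3.  */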
3645 int outside_overhead = (vec_outside_cost
3646 - scalar_single_iter_cost * peel_iters_prologue
3647 - scalar_single_iter_cost * peel_iters_epilogue
3648 - scalar_outside_cost);
3649 /* We're only interested in cases that require at least one
3650 vector iteration. */
3651 int min_vec_niters = 1;
3652 if (outside_overhead > 0)
3653 min_vec_niters = outside_overhead / saving_per_viter + 1;
3654
3655 if (dump_enabled_p ())
3656 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
3657 min_vec_niters);
3658
3659 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3660 {
3661 /* Now that we know the minimum number of vector iterations,
3662 find the minimum niters for which the scalar cost is larger:
3663
3664 SIC * niters > VIC * vniters + VOC - SOC
3665
3666 We know that the minimum niters is no more than
3667 vniters * VF + NPEEL, but it might be (and often is) less
3668 than that if a partial vector iteration is cheaper than the
3669 equivalent scalar code. */
3670 int threshold = (vec_inside_cost * min_vec_niters
3671 + vec_outside_cost
3672 - scalar_outside_cost);
3673 if (threshold <= 0)
3674 min_profitable_iters = 1;
3675 else
3676 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
3677 }
3678 else
3679 /* Convert the number of vector iterations into a number of
3680 scalar iterations. */
3681 min_profitable_iters = (min_vec_niters * assumed_vf
3682 + peel_iters_prologue
3683 + peel_iters_epilogue);
3684 }
3685 else
3686 {
3687 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3688 * assumed_vf
3689 - vec_inside_cost * peel_iters_prologue
3690 - vec_inside_cost * peel_iters_epilogue);
3691 if (min_profitable_iters <= 0)
3692 min_profitable_iters = 0;
3693 else
3694 {
3695 min_profitable_iters /= saving_per_viter;
3696
3697 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3698 <= (((int) vec_inside_cost * min_profitable_iters)
3699 + (((int) vec_outside_cost - scalar_outside_cost)
3700 * assumed_vf)))
3701 min_profitable_iters++;
3702 }
3703 }
3704
3705 if (dump_enabled_p ())
3706 dump_printf (MSG_NOTE,
3707 " Calculated minimum iters for profitability: %d\n",
3708 min_profitable_iters);
3709
3710 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3711 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3712 /* We want the vectorized loop to execute at least once. */
3713 min_profitable_iters = assumed_vf + peel_iters_prologue;
3714
3715 if (dump_enabled_p ())
3716 dump_printf_loc (MSG_NOTE, vect_location,
3717 " Runtime profitability threshold = %d\n",
3718 min_profitable_iters);
3719
3720 *ret_min_profitable_niters = min_profitable_iters;
3721
3722 /* Calculate number of iterations required to make the vector version
3723 profitable, relative to the loop bodies only.
3724
3725 The non-vectorized variant costs SIC * niters, and the vector variant must
3726 win over it at the expected loop trip count. The following condition must hold true:
3727 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
3728
3729 if (vec_outside_cost <= 0)
3730 min_profitable_estimate = 0;
3731 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3732 {
3733 /* This is a repeat of the code above, but with + SOC rather
3734 than - SOC. */
3735 int outside_overhead = (vec_outside_cost
3736 - scalar_single_iter_cost * peel_iters_prologue
3737 - scalar_single_iter_cost * peel_iters_epilogue
3738 + scalar_outside_cost);
3739 int min_vec_niters = 1;
3740 if (outside_overhead > 0)
3741 min_vec_niters = outside_overhead / saving_per_viter + 1;
3742
3743 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3744 {
3745 int threshold = (vec_inside_cost * min_vec_niters
3746 + vec_outside_cost
3747 + scalar_outside_cost);
3748 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
3749 }
3750 else
3751 min_profitable_estimate = (min_vec_niters * assumed_vf
3752 + peel_iters_prologue
3753 + peel_iters_epilogue);
3754 }
3755 else
3756 {
3757 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3758 * assumed_vf
3759 - vec_inside_cost * peel_iters_prologue
3760 - vec_inside_cost * peel_iters_epilogue)
3761 / ((scalar_single_iter_cost * assumed_vf)
3762 - vec_inside_cost);
3763 }
3764 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3765 if (dump_enabled_p ())
3766 dump_printf_loc (MSG_NOTE, vect_location,
3767 " Static estimate profitability threshold = %d\n",
3768 min_profitable_estimate);
3769
3770 *ret_min_profitable_estimate = min_profitable_estimate;
3771 }
3772
3773 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3774 vector elements (not bits) for a vector with NELT elements. */
3775 static void
3776 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3777 vec_perm_builder *sel)
3778 {
3779 /* The encoding is a single stepped pattern. Any wrap-around is handled
3780 by vec_perm_indices. */
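  /* For example, OFFSET == 2 and NELT == 8 give the stepped encoding
     {2, 3, 4}, which expands to the full selector {2, 3, ..., 9}; the
     indices greater than 7 pick elements from the permute's second
     input vector.  */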
3781 sel->new_vector (nelt, 1, 3);
3782 for (unsigned int i = 0; i < 3; i++)
3783 sel->quick_push (i + offset);
3784 }
3785
3786 /* Checks whether the target supports whole-vector shifts for vectors of mode
3787 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3788 it supports vec_perm_const with masks for all necessary shift amounts. */
3789 static bool
3790 have_whole_vector_shift (machine_mode mode)
3791 {
3792 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3793 return true;
3794
3795 /* Variable-length vectors should be handled via the optab. */
3796 unsigned int nelt;
3797 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3798 return false;
3799
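  /* E.g. for an eight-element vector the loop below checks permutes
     equivalent to shifts by 4, 2 and 1 elements -- the halving steps
     that a log2-style final reduction needs.  */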
3800 vec_perm_builder sel;
3801 vec_perm_indices indices;
3802 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3803 {
3804 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3805 indices.new_vector (sel, 2, nelt);
3806 if (!can_vec_perm_const_p (mode, indices, false))
3807 return false;
3808 }
3809 return true;
3810 }
3811
3812 /* TODO: There is a close dependency between vect_model_*_cost and
3813 vectorizable_* functions. Design this better to avoid maintenance issues. */
3814
3815 /* Function vect_model_reduction_cost.
3816
3817 Models cost for a reduction operation, including the vector ops
3818 generated within the strip-mine loop, the initial definition before
3819 the loop, and the epilogue code that must be generated. */
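/* For example, a COND_REDUCTION handled by a direct internal reduction
   function is charged four scalar_to_vec statements in the prologue, two
   vector statements per copy in the body (NCOPIES is doubled below), and,
   when not nested in an outer vectorized loop, an epilogue of two vector
   statements, two vec_to_scalar extractions and one scalar_to_vec
   broadcast.  */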
3820
3821 static void
3822 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3823 int ncopies, stmt_vector_for_cost *cost_vec)
3824 {
3825 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3826 enum tree_code code;
3827 optab optab;
3828 tree vectype;
3829 machine_mode mode;
3830 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3831 struct loop *loop = NULL;
3832
3833 if (loop_vinfo)
3834 loop = LOOP_VINFO_LOOP (loop_vinfo);
3835
3836 /* Condition reductions generate two reductions in the loop. */
3837 vect_reduction_type reduction_type
3838 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3839 if (reduction_type == COND_REDUCTION)
3840 ncopies *= 2;
3841
3842 vectype = STMT_VINFO_VECTYPE (stmt_info);
3843 mode = TYPE_MODE (vectype);
3844 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3845
3846 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3847
3848 if (reduction_type == EXTRACT_LAST_REDUCTION
3849 || reduction_type == FOLD_LEFT_REDUCTION)
3850 {
3851 /* No extra instructions needed in the prologue. */
3852 prologue_cost = 0;
3853
3854 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3855 /* Count one reduction-like operation per vector. */
3856 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3857 stmt_info, 0, vect_body);
3858 else
3859 {
3860 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3861 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3862 inside_cost = record_stmt_cost (cost_vec, nelements,
3863 vec_to_scalar, stmt_info, 0,
3864 vect_body);
3865 inside_cost += record_stmt_cost (cost_vec, nelements,
3866 scalar_stmt, stmt_info, 0,
3867 vect_body);
3868 }
3869 }
3870 else
3871 {
3872 /* Add in cost for initial definition.
3873 For cond reduction we have four vectors: initial index, step,
3874 initial result of the data reduction, initial value of the index
3875 reduction. */
3876 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3877 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3878 scalar_to_vec, stmt_info, 0,
3879 vect_prologue);
3880
3881 /* Cost of reduction op inside loop. */
3882 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3883 stmt_info, 0, vect_body);
3884 }
3885
3886 /* Determine cost of epilogue code.
3887
3888 We have a reduction operator that will reduce the vector in one statement.
3889 Also requires scalar extract. */
3890
3891 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
3892 {
3893 if (reduc_fn != IFN_LAST)
3894 {
3895 if (reduction_type == COND_REDUCTION)
3896 {
3897 /* An EQ stmt and a COND_EXPR stmt. */
3898 epilogue_cost += record_stmt_cost (cost_vec, 2,
3899 vector_stmt, stmt_info, 0,
3900 vect_epilogue);
3901 /* Reduction of the max index and a reduction of the found
3902 values. */
3903 epilogue_cost += record_stmt_cost (cost_vec, 2,
3904 vec_to_scalar, stmt_info, 0,
3905 vect_epilogue);
3906 /* A broadcast of the max value. */
3907 epilogue_cost += record_stmt_cost (cost_vec, 1,
3908 scalar_to_vec, stmt_info, 0,
3909 vect_epilogue);
3910 }
3911 else
3912 {
3913 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3914 stmt_info, 0, vect_epilogue);
3915 epilogue_cost += record_stmt_cost (cost_vec, 1,
3916 vec_to_scalar, stmt_info, 0,
3917 vect_epilogue);
3918 }
3919 }
3920 else if (reduction_type == COND_REDUCTION)
3921 {
3922 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3923 /* Extraction of scalar elements. */
3924 epilogue_cost += record_stmt_cost (cost_vec,
3925 2 * estimated_nunits,
3926 vec_to_scalar, stmt_info, 0,
3927 vect_epilogue);
3928 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
3929 epilogue_cost += record_stmt_cost (cost_vec,
3930 2 * estimated_nunits - 3,
3931 scalar_stmt, stmt_info, 0,
3932 vect_epilogue);
3933 }
3934 else if (reduction_type == EXTRACT_LAST_REDUCTION
3935 || reduction_type == FOLD_LEFT_REDUCTION)
3936 /* No extra instructions are needed in the epilogue. */
3937 ;
3938 else
3939 {
3940 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3941 tree bitsize =
3942 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
3943 int element_bitsize = tree_to_uhwi (bitsize);
3944 int nelements = vec_size_in_bits / element_bitsize;
3945
3946 if (code == COND_EXPR)
3947 code = MAX_EXPR;
3948
3949 optab = optab_for_tree_code (code, vectype, optab_default);
3950
3951 /* We have a whole vector shift available. */
3952 if (optab != unknown_optab
3953 && VECTOR_MODE_P (mode)
3954 && optab_handler (optab, mode) != CODE_FOR_nothing
3955 && have_whole_vector_shift (mode))
3956 {
3957 /* Final reduction via vector shifts and the reduction operator.
3958 Also requires scalar extract. */
3959 epilogue_cost += record_stmt_cost (cost_vec,
3960 exact_log2 (nelements) * 2,
3961 vector_stmt, stmt_info, 0,
3962 vect_epilogue);
3963 epilogue_cost += record_stmt_cost (cost_vec, 1,
3964 vec_to_scalar, stmt_info, 0,
3965 vect_epilogue);
3966 }
3967 else
3968 /* Use extracts and reduction op for final reduction. For N
3969 elements, we have N extracts and N-1 reduction ops. */
3970 epilogue_cost += record_stmt_cost (cost_vec,
3971 nelements + nelements - 1,
3972 vector_stmt, stmt_info, 0,
3973 vect_epilogue);
3974 }
3975 }
3976
3977 if (dump_enabled_p ())
3978 dump_printf (MSG_NOTE,
3979 "vect_model_reduction_cost: inside_cost = %d, "
3980 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3981 prologue_cost, epilogue_cost);
3982 }
3983
3984
3985 /* Function vect_model_induction_cost.
3986
3987 Models cost for induction operations. */
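/* For example, an induction vectorized with NCOPIES == 2 is charged two
   vector statements in the loop body plus two scalar_to_vec statements in
   the prologue (one for the initial vector, one for the step vector).  */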
3988
3989 static void
3990 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
3991 stmt_vector_for_cost *cost_vec)
3992 {
3993 unsigned inside_cost, prologue_cost;
3994
3995 if (PURE_SLP_STMT (stmt_info))
3996 return;
3997
3998 /* loop cost for vec_loop. */
3999 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4000 stmt_info, 0, vect_body);
4001
4002 /* prologue cost for vec_init and vec_step. */
4003 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4004 stmt_info, 0, vect_prologue);
4005
4006 if (dump_enabled_p ())
4007 dump_printf_loc (MSG_NOTE, vect_location,
4008 "vect_model_induction_cost: inside_cost = %d, "
4009 "prologue_cost = %d .\n", inside_cost, prologue_cost);
4010 }
4011
4012
4013
4014 /* Function get_initial_def_for_reduction
4015
4016 Input:
4017 STMT_VINFO - a stmt that performs a reduction operation in the loop.
4018 INIT_VAL - the initial value of the reduction variable
4019
4020 Output:
4021 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4022 of the reduction (used for adjusting the epilog - see below).
4023 Return a vector variable, initialized according to the operation that
4024 STMT_VINFO performs. This vector will be used as the initial value
4025 of the vector of partial results.
4026
4027 Option1 (adjust in epilog): Initialize the vector as follows:
4028 add/bit or/xor: [0,0,...,0,0]
4029 mult/bit and: [1,1,...,1,1]
4030 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4031 and when necessary (e.g. add/mult case) let the caller know
4032 that it needs to adjust the result by init_val.
4033
4034 Option2: Initialize the vector as follows:
4035 add/bit or/xor: [init_val,0,0,...,0]
4036 mult/bit and: [init_val,1,1,...,1]
4037 min/max/cond_expr: [init_val,init_val,...,init_val]
4038 and no adjustments are needed.
4039
4040 For example, for the following code:
4041
4042 s = init_val;
4043 for (i=0;i<n;i++)
4044 s = s + a[i];
4045
4046 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4047 For a vector of 4 units, we want to return either [0,0,0,init_val],
4048 or [0,0,0,0] and let the caller know that it needs to adjust
4049 the result at the end by 'init_val'.
4050
4051 FORNOW, we use Option1 (the 'adjust in epilog' scheme) when ADJUSTMENT_DEF
4052 is not NULL, because then the initialization vector is simpler (the same
4053 element in all entries), and Option2 otherwise.
4054
4055 A cost model should help decide between these two schemes. */
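/* For example, for a multiplication reduction with init_val = 5, Option1
   returns [1,1,...,1] and reports 5 as ADJUSTMENT_DEF for the caller to
   fold into the final result, whereas Option2 returns [5,1,...,1] and
   needs no adjustment.  */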
4056
4057 tree
4058 get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, tree init_val,
4059 tree *adjustment_def)
4060 {
4061 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4062 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4063 tree scalar_type = TREE_TYPE (init_val);
4064 tree vectype = get_vectype_for_scalar_type (scalar_type);
4065 enum tree_code code = gimple_assign_rhs_code (stmt_vinfo->stmt);
4066 tree def_for_init;
4067 tree init_def;
4068 REAL_VALUE_TYPE real_init_val = dconst0;
4069 int int_init_val = 0;
4070 gimple_seq stmts = NULL;
4071
4072 gcc_assert (vectype);
4073
4074 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4075 || SCALAR_FLOAT_TYPE_P (scalar_type));
4076
4077 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4078 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4079
4080 vect_reduction_type reduction_type
4081 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4082
4083 switch (code)
4084 {
4085 case WIDEN_SUM_EXPR:
4086 case DOT_PROD_EXPR:
4087 case SAD_EXPR:
4088 case PLUS_EXPR:
4089 case MINUS_EXPR:
4090 case BIT_IOR_EXPR:
4091 case BIT_XOR_EXPR:
4092 case MULT_EXPR:
4093 case BIT_AND_EXPR:
4094 {
4095 /* ADJUSTMENT_DEF is NULL when called from
4096 vect_create_epilog_for_reduction to vectorize a double reduction. */
4097 if (adjustment_def)
4098 *adjustment_def = init_val;
4099
4100 if (code == MULT_EXPR)
4101 {
4102 real_init_val = dconst1;
4103 int_init_val = 1;
4104 }
4105
4106 if (code == BIT_AND_EXPR)
4107 int_init_val = -1;
4108
4109 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4110 def_for_init = build_real (scalar_type, real_init_val);
4111 else
4112 def_for_init = build_int_cst (scalar_type, int_init_val);
4113
4114 if (adjustment_def)
4115 /* Option1: the first element is '0' or '1' as well. */
4116 init_def = gimple_build_vector_from_val (&stmts, vectype,
4117 def_for_init);
4118 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4119 {
4120 /* Option2 (variable length): the first element is INIT_VAL. */
4121 init_def = gimple_build_vector_from_val (&stmts, vectype,
4122 def_for_init);
4123 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4124 vectype, init_def, init_val);
4125 }
4126 else
4127 {
4128 /* Option2: the first element is INIT_VAL. */
4129 tree_vector_builder elts (vectype, 1, 2);
4130 elts.quick_push (init_val);
4131 elts.quick_push (def_for_init);
4132 init_def = gimple_build_vector (&stmts, &elts);
4133 }
4134 }
4135 break;
4136
4137 case MIN_EXPR:
4138 case MAX_EXPR:
4139 case COND_EXPR:
4140 {
4141 if (adjustment_def)
4142 {
4143 *adjustment_def = NULL_TREE;
4144 if (reduction_type != COND_REDUCTION
4145 && reduction_type != EXTRACT_LAST_REDUCTION)
4146 {
4147 init_def = vect_get_vec_def_for_operand (init_val, stmt_vinfo);
4148 break;
4149 }
4150 }
4151 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4152 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4153 }
4154 break;
4155
4156 default:
4157 gcc_unreachable ();
4158 }
4159
4160 if (stmts)
4161 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4162 return init_def;
4163 }
4164
4165 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4166 NUMBER_OF_VECTORS is the number of vector defs to create.
4167 If NEUTRAL_OP is nonnull, introducing extra elements of that
4168 value will not change the result. */
4169
4170 static void
4171 get_initial_defs_for_reduction (slp_tree slp_node,
4172 vec<tree> *vec_oprnds,
4173 unsigned int number_of_vectors,
4174 bool reduc_chain, tree neutral_op)
4175 {
4176 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4177 stmt_vec_info stmt_vinfo = stmts[0];
4178 unsigned HOST_WIDE_INT nunits;
4179 unsigned j, number_of_places_left_in_vector;
4180 tree vector_type;
4181 unsigned int group_size = stmts.length ();
4182 unsigned int i;
4183 struct loop *loop;
4184
4185 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4186
4187 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4188
4189 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4190 gcc_assert (loop);
4191 edge pe = loop_preheader_edge (loop);
4192
4193 gcc_assert (!reduc_chain || neutral_op);
4194
4195 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4196 created vectors. It is greater than 1 if unrolling is performed.
4197
4198 For example, we have two scalar operands, s1 and s2 (e.g., group of
4199 strided accesses of size two), while NUNITS is four (i.e., four scalars
4200 of this type can be packed in a vector). The output vector will contain
4201 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4202 will be 2).
4203
4204 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4205 vectors containing the operands.
4206
4207 For example, NUNITS is four as before, and the group size is 8
4208 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4209 {s5, s6, s7, s8}. */
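  /* With a neutral value N available, any element that is not needed for
     the GROUP_SIZE initial values is filled with N; e.g. two reduction
     PHIs with initial values a0 and b0 and four-element vectors give the
     single initial vector {a0, b0, N, N}.  */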
4210
4211 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4212 nunits = group_size;
4213
4214 number_of_places_left_in_vector = nunits;
4215 bool constant_p = true;
4216 tree_vector_builder elts (vector_type, nunits, 1);
4217 elts.quick_grow (nunits);
4218 gimple_seq ctor_seq = NULL;
4219 for (j = 0; j < nunits * number_of_vectors; ++j)
4220 {
4221 tree op;
4222 i = j % group_size;
4223 stmt_vinfo = stmts[i];
4224
4225 /* Get the def before the loop. In a reduction chain we have only
4226 one initial value; otherwise we have as many as there are PHIs in the group. */
4227 if (reduc_chain)
4228 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4229 else if (((vec_oprnds->length () + 1) * nunits
4230 - number_of_places_left_in_vector >= group_size)
4231 && neutral_op)
4232 op = neutral_op;
4233 else
4234 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4235
4236 /* Create 'vect_ = {op0,op1,...,opn}'. */
4237 number_of_places_left_in_vector--;
4238 elts[nunits - number_of_places_left_in_vector - 1] = op;
4239 if (!CONSTANT_CLASS_P (op))
4240 constant_p = false;
4241
4242 if (number_of_places_left_in_vector == 0)
4243 {
4244 tree init;
4245 if (constant_p && !neutral_op
4246 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4247 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4248 /* Build the vector directly from ELTS. */
4249 init = gimple_build_vector (&ctor_seq, &elts);
4250 else if (neutral_op)
4251 {
4252 /* Build a vector of the neutral value and shift the
4253 other elements into place. */
4254 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4255 neutral_op);
4256 int k = nunits;
4257 while (k > 0 && elts[k - 1] == neutral_op)
4258 k -= 1;
4259 while (k > 0)
4260 {
4261 k -= 1;
4262 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4263 vector_type, init, elts[k]);
4264 }
4265 }
4266 else
4267 {
4268 /* First time round, duplicate ELTS to fill the
4269 required number of vectors. */
4270 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4271 number_of_vectors, *vec_oprnds);
4272 break;
4273 }
4274 vec_oprnds->quick_push (init);
4275
4276 number_of_places_left_in_vector = nunits;
4277 elts.new_vector (vector_type, nunits, 1);
4278 elts.quick_grow (nunits);
4279 constant_p = true;
4280 }
4281 }
4282 if (ctor_seq != NULL)
4283 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4284 }
4285
4286
4287 /* Function vect_create_epilog_for_reduction
4288
4289 Create code at the loop-epilog to finalize the result of a reduction
4290 computation.
4291
4292 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4293 reduction statements.
4294 STMT_INFO is the scalar reduction stmt that is being vectorized.
4295 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4296 number of elements that we can fit in a vectype (nunits). In this case
4297 we have to generate more than one vector stmt - i.e - we need to "unroll"
4298 the vector stmt by a factor VF/nunits. For more details see documentation
4299 in vectorizable_operation.
4300 REDUC_FN is the internal function for the epilog reduction.
4301 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4302 computation.
4303 REDUC_INDEX is the index of the operand in the right hand side of the
4304 statement that is defined by REDUCTION_PHI.
4305 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4306 SLP_NODE is an SLP node containing a group of reduction statements. The
4307 first one in this group is STMT_INFO.
4308 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4309 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4310 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4311 any value of the IV in the loop.
4312 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4313 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4314 null if this is not an SLP reduction.
4315
4316 This function:
4317 1. Creates the reduction def-use cycles: sets the arguments for
4318 REDUCTION_PHIS:
4319 The loop-entry argument is the vectorized initial-value of the reduction.
4320 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4321 sums.
4322 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4323 by calling the function specified by REDUC_FN if available, or by
4324 other means (whole-vector shifts or a scalar loop).
4325 The function also creates a new phi node at the loop exit to preserve
4326 loop-closed form, as illustrated below.
4327
4328 The flow at the entry to this function:
4329
4330 loop:
4331 vec_def = phi <null, null> # REDUCTION_PHI
4332 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4333 s_loop = scalar_stmt # (scalar) STMT_INFO
4334 loop_exit:
4335 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4336 use <s_out0>
4337 use <s_out0>
4338
4339 The above is transformed by this function into:
4340
4341 loop:
4342 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4343 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4344 s_loop = scalar_stmt # (scalar) STMT_INFO
4345 loop_exit:
4346 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4347 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4348 v_out2 = reduce <v_out1>
4349 s_out3 = extract_field <v_out2, 0>
4350 s_out4 = adjust_result <s_out3>
4351 use <s_out4>
4352 use <s_out4>
4353 */
4354
4355 static void
4356 vect_create_epilog_for_reduction (vec<tree> vect_defs,
4357 stmt_vec_info stmt_info,
4358 gimple *reduc_def_stmt,
4359 int ncopies, internal_fn reduc_fn,
4360 vec<stmt_vec_info> reduction_phis,
4361 bool double_reduc,
4362 slp_tree slp_node,
4363 slp_instance slp_node_instance,
4364 tree induc_val, enum tree_code induc_code,
4365 tree neutral_op)
4366 {
4367 stmt_vec_info prev_phi_info;
4368 tree vectype;
4369 machine_mode mode;
4370 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4371 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4372 basic_block exit_bb;
4373 tree scalar_dest;
4374 tree scalar_type;
4375 gimple *new_phi = NULL, *phi;
4376 stmt_vec_info phi_info;
4377 gimple_stmt_iterator exit_gsi;
4378 tree vec_dest;
4379 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4380 gimple *epilog_stmt = NULL;
4381 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
4382 gimple *exit_phi;
4383 tree bitsize;
4384 tree adjustment_def = NULL;
4385 tree vec_initial_def = NULL;
4386 tree expr, def, initial_def = NULL;
4387 tree orig_name, scalar_result;
4388 imm_use_iterator imm_iter, phi_imm_iter;
4389 use_operand_p use_p, phi_use_p;
4390 gimple *use_stmt;
4391 stmt_vec_info reduction_phi_info = NULL;
4392 bool nested_in_vect_loop = false;
4393 auto_vec<gimple *> new_phis;
4394 auto_vec<stmt_vec_info> inner_phis;
4395 int j, i;
4396 auto_vec<tree> scalar_results;
4397 unsigned int group_size = 1, k, ratio;
4398 auto_vec<tree> vec_initial_defs;
4399 auto_vec<gimple *> phis;
4400 bool slp_reduc = false;
4401 bool direct_slp_reduc;
4402 tree new_phi_result;
4403 stmt_vec_info inner_phi = NULL;
4404 tree induction_index = NULL_TREE;
4405
4406 if (slp_node)
4407 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4408
4409 if (nested_in_vect_loop_p (loop, stmt_info))
4410 {
4411 outer_loop = loop;
4412 loop = loop->inner;
4413 nested_in_vect_loop = true;
4414 gcc_assert (!slp_node);
4415 }
4416
4417 vectype = STMT_VINFO_VECTYPE (stmt_info);
4418 gcc_assert (vectype);
4419 mode = TYPE_MODE (vectype);
4420
4421 /* 1. Create the reduction def-use cycle:
4422 Set the arguments of REDUCTION_PHIS, i.e., transform
4423
4424 loop:
4425 vec_def = phi <null, null> # REDUCTION_PHI
4426 VECT_DEF = vector_stmt # vectorized form of STMT
4427 ...
4428
4429 into:
4430
4431 loop:
4432 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4433 VECT_DEF = vector_stmt # vectorized form of STMT
4434 ...
4435
4436 (in case of SLP, do it for all the phis). */
4437
4438 /* Get the loop-entry arguments. */
4439 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4440 if (slp_node)
4441 {
4442 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4443 vec_initial_defs.reserve (vec_num);
4444 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4445 &vec_initial_defs, vec_num,
4446 REDUC_GROUP_FIRST_ELEMENT (stmt_info),
4447 neutral_op);
4448 }
4449 else
4450 {
4451 /* Get at the scalar def before the loop, that defines the initial value
4452 of the reduction variable. */
4453 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4454 loop_preheader_edge (loop));
4455 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4456 and we can't use zero for induc_val, use initial_def. Similarly
4457 for REDUC_MIN and initial_def larger than the base. */
4458 if (TREE_CODE (initial_def) == INTEGER_CST
4459 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4460 == INTEGER_INDUC_COND_REDUCTION)
4461 && !integer_zerop (induc_val)
4462 && ((induc_code == MAX_EXPR
4463 && tree_int_cst_lt (initial_def, induc_val))
4464 || (induc_code == MIN_EXPR
4465 && tree_int_cst_lt (induc_val, initial_def))))
4466 induc_val = initial_def;
4467
4468 if (double_reduc)
4469 /* In case of double reduction we only create a vector variable
4470 to be put in the reduction phi node. The actual statement
4471 creation is done later in this function. */
4472 vec_initial_def = vect_create_destination_var (initial_def, vectype);
4473 else if (nested_in_vect_loop)
4474 {
4475 /* Do not use an adjustment def as that case is not supported
4476 correctly if ncopies is not one. */
4477 vect_is_simple_use (initial_def, loop_vinfo, &initial_def_dt);
4478 vec_initial_def = vect_get_vec_def_for_operand (initial_def,
4479 stmt_info);
4480 }
4481 else
4482 vec_initial_def
4483 = get_initial_def_for_reduction (stmt_info, initial_def,
4484 &adjustment_def);
4485 vec_initial_defs.create (1);
4486 vec_initial_defs.quick_push (vec_initial_def);
4487 }
4488
4489 /* Set phi nodes arguments. */
4490 FOR_EACH_VEC_ELT (reduction_phis, i, phi_info)
4491 {
4492 tree vec_init_def = vec_initial_defs[i];
4493 tree def = vect_defs[i];
4494 for (j = 0; j < ncopies; j++)
4495 {
4496 if (j != 0)
4497 {
4498 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4499 if (nested_in_vect_loop)
4500 vec_init_def
4501 = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_init_def);
4502 }
4503
4504 /* Set the loop-entry arg of the reduction-phi. */
4505
4506 gphi *phi = as_a <gphi *> (phi_info->stmt);
4507 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4508 == INTEGER_INDUC_COND_REDUCTION)
4509 {
4510 /* Initialise the reduction phi to zero. This prevents non-zero
4511 initial values from interfering with the reduction op. */
4512 gcc_assert (ncopies == 1);
4513 gcc_assert (i == 0);
4514
4515 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4516 tree induc_val_vec
4517 = build_vector_from_val (vec_init_def_type, induc_val);
4518
4519 add_phi_arg (phi, induc_val_vec, loop_preheader_edge (loop),
4520 UNKNOWN_LOCATION);
4521 }
4522 else
4523 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
4524 UNKNOWN_LOCATION);
4525
4526 /* Set the loop-latch arg for the reduction-phi. */
4527 if (j > 0)
4528 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4529
4530 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
4531
4532 if (dump_enabled_p ())
4533 dump_printf_loc (MSG_NOTE, vect_location,
4534 "transform reduction: created def-use cycle: %G%G",
4535 phi, SSA_NAME_DEF_STMT (def));
4536 }
4537 }
4538
4539 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4540 which is updated with the current index of the loop for every match of
4541 the original loop's cond_expr (VEC_STMT). This results in a vector
4542 containing the last time the condition passed for that vector lane.
4543 The first match will be a 1 to allow 0 to be used for non-matching
4544 indexes. If there are no matches at all then the vector will be all
4545 zeroes. */
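  /* For instance, with four lanes and the condition holding in lanes 0
     and 2 during the first vector iteration (indexes {1, 2, 3, 4}) and in
     lane 2 again during the second (indexes {5, 6, 7, 8}), the final
     index vector is {1, 0, 7, 0}.  */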
4546 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4547 {
4548 tree indx_before_incr, indx_after_incr;
4549 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4550
4551 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info)->stmt;
4552 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4553
4554 int scalar_precision
4555 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4556 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4557 tree cr_index_vector_type = build_vector_type
4558 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4559
4560 /* First we create a simple vector induction variable which starts
4561 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4562 vector size (STEP). */
4563
4564 /* Create a {1,2,3,...} vector. */
4565 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4566
4567 /* Create a vector of the step value. */
4568 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4569 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4570
4571 /* Create an induction variable. */
4572 gimple_stmt_iterator incr_gsi;
4573 bool insert_after;
4574 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4575 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4576 insert_after, &indx_before_incr, &indx_after_incr);
4577
4578 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4579 filled with zeros (VEC_ZERO). */
4580
4581 /* Create a vector of 0s. */
4582 tree zero = build_zero_cst (cr_index_scalar_type);
4583 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4584
4585 /* Create a vector phi node. */
4586 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4587 new_phi = create_phi_node (new_phi_tree, loop->header);
4588 loop_vinfo->add_stmt (new_phi);
4589 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4590 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4591
4592 /* Now take the condition from the loop's original cond_expr
4593 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4594 every match uses values from the induction variable
4595 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4596 (NEW_PHI_TREE).
4597 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4598 the new cond_expr (INDEX_COND_EXPR). */
4599
4600 /* Duplicate the condition from vec_stmt. */
4601 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4602
4603 /* Create a conditional, where the condition is taken from vec_stmt
4604 (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4605 else is the phi (NEW_PHI_TREE). */
4606 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4607 ccompare, indx_before_incr,
4608 new_phi_tree);
4609 induction_index = make_ssa_name (cr_index_vector_type);
4610 gimple *index_condition = gimple_build_assign (induction_index,
4611 index_cond_expr);
4612 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4613 stmt_vec_info index_vec_info = loop_vinfo->add_stmt (index_condition);
4614 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4615
4616 /* Update the phi with the vec cond. */
4617 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4618 loop_latch_edge (loop), UNKNOWN_LOCATION);
4619 }
4620
4621 /* 2. Create epilog code.
4622 The reduction epilog code operates across the elements of the vector
4623 of partial results computed by the vectorized loop.
4624 The reduction epilog code consists of:
4625
4626 step 1: compute the scalar result in a vector (v_out2)
4627 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4628 step 3: adjust the scalar result (s_out3) if needed.
4629
4630 Step 1 can be accomplished using one the following three schemes:
4631 (scheme 1) using reduc_fn, if available.
4632 (scheme 2) using whole-vector shifts, if available.
4633 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4634 combined.
4635
4636 The overall epilog code looks like this:
4637
4638 s_out0 = phi <s_loop> # original EXIT_PHI
4639 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4640 v_out2 = reduce <v_out1> # step 1
4641 s_out3 = extract_field <v_out2, 0> # step 2
4642 s_out4 = adjust_result <s_out3> # step 3
4643
4644 (step 3 is optional, and steps 1 and 2 may be combined).
4645 Lastly, the uses of s_out0 are replaced by s_out4. */
4646
4647
4648 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4649 v_out1 = phi <VECT_DEF>
4650 Store them in NEW_PHIS. */
4651
4652 exit_bb = single_exit (loop)->dest;
4653 prev_phi_info = NULL;
4654 new_phis.create (vect_defs.length ());
4655 FOR_EACH_VEC_ELT (vect_defs, i, def)
4656 {
4657 for (j = 0; j < ncopies; j++)
4658 {
4659 tree new_def = copy_ssa_name (def);
4660 phi = create_phi_node (new_def, exit_bb);
4661 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4662 if (j == 0)
4663 new_phis.quick_push (phi);
4664 else
4665 {
4666 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4667 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4668 }
4669
4670 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4671 prev_phi_info = phi_info;
4672 }
4673 }
4674
4675 /* The epilogue is created for the outer-loop, i.e., for the loop being
4676 vectorized. Create exit phis for the outer loop. */
4677 if (double_reduc)
4678 {
4679 loop = outer_loop;
4680 exit_bb = single_exit (loop)->dest;
4681 inner_phis.create (vect_defs.length ());
4682 FOR_EACH_VEC_ELT (new_phis, i, phi)
4683 {
4684 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
4685 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4686 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4687 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4688 PHI_RESULT (phi));
4689 prev_phi_info = loop_vinfo->add_stmt (outer_phi);
4690 inner_phis.quick_push (phi_info);
4691 new_phis[i] = outer_phi;
4692 while (STMT_VINFO_RELATED_STMT (phi_info))
4693 {
4694 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4695 new_result = copy_ssa_name (PHI_RESULT (phi_info->stmt));
4696 outer_phi = create_phi_node (new_result, exit_bb);
4697 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4698 PHI_RESULT (phi_info->stmt));
4699 stmt_vec_info outer_phi_info = loop_vinfo->add_stmt (outer_phi);
4700 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi_info;
4701 prev_phi_info = outer_phi_info;
4702 }
4703 }
4704 }
4705
4706 exit_gsi = gsi_after_labels (exit_bb);
4707
4708 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4709 (i.e. when reduc_fn is not available) and in the final adjustment
4710 code (if needed). Also get the original scalar reduction variable as
4711 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4712 represents a reduction pattern), the tree-code and scalar-def are
4713 taken from the original stmt that the pattern-stmt (STMT) replaces.
4714 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4715 are taken from STMT. */
4716
4717 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4718 if (orig_stmt_info != stmt_info)
4719 {
4720 /* Reduction pattern */
4721 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4722 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4723 }
4724
4725 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4726 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4727 partial results are added and not subtracted. */
4728 if (code == MINUS_EXPR)
4729 code = PLUS_EXPR;
4730
4731 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4732 scalar_type = TREE_TYPE (scalar_dest);
4733 scalar_results.create (group_size);
4734 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4735 bitsize = TYPE_SIZE (scalar_type);
4736
4737 /* In case this is a reduction in an inner-loop while vectorizing an outer
4738 loop - we don't need to extract a single scalar result at the end of the
4739 inner-loop (unless it is a double reduction, i.e., the use of the reduction is
4740 outside the outer-loop). The final vector of partial results will be used
4741 in the vectorized outer-loop, or reduced to a scalar result at the end of
4742 the outer-loop. */
4743 if (nested_in_vect_loop && !double_reduc)
4744 goto vect_finalize_reduction;
4745
4746 /* SLP reduction without reduction chain, e.g.,
4747 # a1 = phi <a2, a0>
4748 # b1 = phi <b2, b0>
4749 a2 = operation (a1)
4750 b2 = operation (b1) */
4751 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4752
4753 /* True if we should implement SLP_REDUC using native reduction operations
4754 instead of scalar operations. */
4755 direct_slp_reduc = (reduc_fn != IFN_LAST
4756 && slp_reduc
4757 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4758
4759 /* In case of reduction chain, e.g.,
4760 # a1 = phi <a3, a0>
4761 a2 = operation (a1)
4762 a3 = operation (a2),
4763
4764 we may end up with more than one vector result. Here we reduce them to
4765 one vector. */
4766 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
4767 {
4768 tree first_vect = PHI_RESULT (new_phis[0]);
4769 gassign *new_vec_stmt = NULL;
4770 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4771 for (k = 1; k < new_phis.length (); k++)
4772 {
4773 gimple *next_phi = new_phis[k];
4774 tree second_vect = PHI_RESULT (next_phi);
4775 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4776 new_vec_stmt = gimple_build_assign (tem, code,
4777 first_vect, second_vect);
4778 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4779 first_vect = tem;
4780 }
4781
4782 new_phi_result = first_vect;
4783 if (new_vec_stmt)
4784 {
4785 new_phis.truncate (0);
4786 new_phis.safe_push (new_vec_stmt);
4787 }
4788 }
4789 /* Likewise if we couldn't use a single defuse cycle. */
4790 else if (ncopies > 1)
4791 {
4792 gcc_assert (new_phis.length () == 1);
4793 tree first_vect = PHI_RESULT (new_phis[0]);
4794 gassign *new_vec_stmt = NULL;
4795 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4796 stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
4797 for (int k = 1; k < ncopies; ++k)
4798 {
4799 next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
4800 tree second_vect = PHI_RESULT (next_phi_info->stmt);
4801 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4802 new_vec_stmt = gimple_build_assign (tem, code,
4803 first_vect, second_vect);
4804 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4805 first_vect = tem;
4806 }
4807 new_phi_result = first_vect;
4808 new_phis.truncate (0);
4809 new_phis.safe_push (new_vec_stmt);
4810 }
4811 else
4812 new_phi_result = PHI_RESULT (new_phis[0]);
4813
4814 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4815 && reduc_fn != IFN_LAST)
4816 {
4817 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4818 various data values where the condition matched and another vector
4819 (INDUCTION_INDEX) containing all the indexes of those matches. We
4820 need to extract the last matching index (which will be the index with
4821 highest value) and use this to index into the data vector.
4822 For the case where there were no matches, the data vector will contain
4823 all default values and the index vector will be all zeros. */
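  /* E.g. data values {7, 3, 9, 4} with index vector {2, 0, 5, 0}: the
     maximum index 5 identifies lane 2, so the extracted result is 9.  */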
4824
4825 /* Get various versions of the type of the vector of indexes. */
4826 tree index_vec_type = TREE_TYPE (induction_index);
4827 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4828 tree index_scalar_type = TREE_TYPE (index_vec_type);
4829 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4830 (index_vec_type);
4831
4832 /* Get an unsigned integer version of the type of the data vector. */
4833 int scalar_precision
4834 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4835 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4836 tree vectype_unsigned = build_vector_type
4837 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4838
4839 /* First we need to create a vector (ZERO_VEC) of zeros and another
4840 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4841 can create using a MAX reduction and then expanding.
4842 In the case where the loop never made any matches, the max index will
4843 be zero. */
4844
4845 /* Vector of {0, 0, 0,...}. */
4846 tree zero_vec = make_ssa_name (vectype);
4847 tree zero_vec_rhs = build_zero_cst (vectype);
4848 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4849 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4850
4851 /* Find maximum value from the vector of found indexes. */
4852 tree max_index = make_ssa_name (index_scalar_type);
4853 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4854 1, induction_index);
4855 gimple_call_set_lhs (max_index_stmt, max_index);
4856 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4857
4858 /* Vector of {max_index, max_index, max_index,...}. */
4859 tree max_index_vec = make_ssa_name (index_vec_type);
4860 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4861 max_index);
4862 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4863 max_index_vec_rhs);
4864 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4865
4866 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4867 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4868 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4869 otherwise. Only one value should match, resulting in a vector
4870 (VEC_COND) with one data value and the rest zeros.
4871 In the case where the loop never made any matches, every index will
4872 match, resulting in a vector with all data values (which will all be
4873 the default value). */
4874
4875 /* Compare the max index vector to the vector of found indexes to find
4876 the position of the max value. */
4877 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4878 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4879 induction_index,
4880 max_index_vec);
4881 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4882
4883 /* Use the compare to choose either values from the data vector or
4884 zero. */
4885 tree vec_cond = make_ssa_name (vectype);
4886 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4887 vec_compare, new_phi_result,
4888 zero_vec);
4889 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4890
4891 /* Finally we need to extract the data value from the vector (VEC_COND)
4892 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
4893 reduction, but because this doesn't exist, we can use a MAX reduction
4894 instead. The data value might be signed or a float so we need to cast
4895 it first.
4896 In the case where the loop never made any matches, the data values are
4897 all identical, and so will reduce down correctly. */
4898
4899 /* Make the matched data values unsigned. */
4900 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4901 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4902 vec_cond);
4903 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4904 VIEW_CONVERT_EXPR,
4905 vec_cond_cast_rhs);
4906 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4907
4908 /* Reduce down to a scalar value. */
4909 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4910 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4911 1, vec_cond_cast);
4912 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4913 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4914
4915 /* Convert the reduced value back to the result type and set as the
4916 result. */
4917 gimple_seq stmts = NULL;
4918 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4919 data_reduc);
4920 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4921 scalar_results.safe_push (new_temp);
4922 }
4923 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4924 && reduc_fn == IFN_LAST)
4925 {
4926 /* Condition reduction without supported IFN_REDUC_MAX. Generate
4927 idx = 0;
4928 idx_val = induction_index[0];
4929 val = data_reduc[0];
4930 for (idx = 0, val = init, i = 0; i < nelts; ++i)
4931 if (induction_index[i] > idx_val)
4932 val = data_reduc[i], idx_val = induction_index[i];
4933 return val; */
4934
4935 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4936 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4937 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4938 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4939 /* Enforced by vectorizable_reduction, which ensures we have target
4940 support before allowing a conditional reduction on variable-length
4941 vectors. */
4942 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
4943 tree idx_val = NULL_TREE, val = NULL_TREE;
4944 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4945 {
4946 tree old_idx_val = idx_val;
4947 tree old_val = val;
4948 idx_val = make_ssa_name (idx_eltype);
4949 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4950 build3 (BIT_FIELD_REF, idx_eltype,
4951 induction_index,
4952 bitsize_int (el_size),
4953 bitsize_int (off)));
4954 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4955 val = make_ssa_name (data_eltype);
4956 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4957 build3 (BIT_FIELD_REF,
4958 data_eltype,
4959 new_phi_result,
4960 bitsize_int (el_size),
4961 bitsize_int (off)));
4962 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4963 if (off != 0)
4964 {
4965 tree new_idx_val = idx_val;
4966 tree new_val = val;
4967 if (off != v_size - el_size)
4968 {
4969 new_idx_val = make_ssa_name (idx_eltype);
4970 epilog_stmt = gimple_build_assign (new_idx_val,
4971 MAX_EXPR, idx_val,
4972 old_idx_val);
4973 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4974 }
4975 new_val = make_ssa_name (data_eltype);
4976 epilog_stmt = gimple_build_assign (new_val,
4977 COND_EXPR,
4978 build2 (GT_EXPR,
4979 boolean_type_node,
4980 idx_val,
4981 old_idx_val),
4982 val, old_val);
4983 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4984 idx_val = new_idx_val;
4985 val = new_val;
4986 }
4987 }
4988 /* Convert the reduced value back to the result type and set as the
4989 result. */
4990 gimple_seq stmts = NULL;
4991 val = gimple_convert (&stmts, scalar_type, val);
4992 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4993 scalar_results.safe_push (val);
4994 }
4995
4996 /* 2.3 Create the reduction code, using one of the three schemes described
4997 above. In SLP we simply need to extract all the elements from the
4998 vector (without reducing them), so we use scalar shifts. */
4999 else if (reduc_fn != IFN_LAST && !slp_reduc)
5000 {
5001 tree tmp;
5002 tree vec_elem_type;
5003
5004 /* Case 1: Create:
5005 v_out2 = reduc_expr <v_out1> */
5006
5007 if (dump_enabled_p ())
5008 dump_printf_loc (MSG_NOTE, vect_location,
5009 "Reduce using direct vector reduction.\n");
5010
5011 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5012 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5013 {
5014 tree tmp_dest
5015 = vect_create_destination_var (scalar_dest, vec_elem_type);
5016 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5017 new_phi_result);
5018 gimple_set_lhs (epilog_stmt, tmp_dest);
5019 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5020 gimple_set_lhs (epilog_stmt, new_temp);
5021 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5022
5023 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5024 new_temp);
5025 }
5026 else
5027 {
5028 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5029 new_phi_result);
5030 gimple_set_lhs (epilog_stmt, new_scalar_dest);
5031 }
5032
5033 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5034 gimple_set_lhs (epilog_stmt, new_temp);
5035 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5036
5037 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5038 == INTEGER_INDUC_COND_REDUCTION)
5039 && !operand_equal_p (initial_def, induc_val, 0))
5040 {
5041 /* Earlier we set the initial value to be a vector of induc_val
5042 values. Check the result and if it is induc_val then replace
5043 it with the original initial value, unless induc_val is
5044 the same as initial_def already. */
5045 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5046 induc_val);
5047
5048 tmp = make_ssa_name (new_scalar_dest);
5049 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5050 initial_def, new_temp);
5051 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5052 new_temp = tmp;
5053 }
5054
5055 scalar_results.safe_push (new_temp);
5056 }
5057 else if (direct_slp_reduc)
5058 {
5059 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5060 with the elements for other SLP statements replaced with the
5061 neutral value. We can then do a normal reduction on each vector. */
5062
5063 /* Enforced by vectorizable_reduction. */
5064 gcc_assert (new_phis.length () == 1);
5065 gcc_assert (pow2p_hwi (group_size));
5066
5067 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5068 vec<stmt_vec_info> orig_phis
5069 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5070 gimple_seq seq = NULL;
5071
5072 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5073 and the same element size as VECTYPE. */
5074 tree index = build_index_vector (vectype, 0, 1);
5075 tree index_type = TREE_TYPE (index);
5076 tree index_elt_type = TREE_TYPE (index_type);
5077 tree mask_type = build_same_sized_truth_vector_type (index_type);
5078
5079 /* Create a vector that, for each element, identifies which of
5080 the REDUC_GROUP_SIZE results should use it. */
5081 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5082 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5083 build_vector_from_val (index_type, index_mask));
5084
5085 /* Get a neutral vector value. This is simply a splat of the neutral
5086 scalar value if we have one, otherwise the initial scalar value
5087 is itself a neutral value. */
5088 tree vector_identity = NULL_TREE;
5089 if (neutral_op)
5090 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5091 neutral_op);
5092 for (unsigned int i = 0; i < group_size; ++i)
5093 {
5094 /* If there's no universal neutral value, we can use the
5095 initial scalar value from the original PHI. This is used
5096 for MIN and MAX reductions, for example. */
5097 if (!neutral_op)
5098 {
5099 tree scalar_value
5100 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5101 loop_preheader_edge (loop));
5102 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5103 scalar_value);
5104 }
5105
5106 /* Calculate the equivalent of:
5107
5108 sel[j] = (index[j] == i);
5109
5110 which selects the elements of NEW_PHI_RESULT that should
5111 be included in the result. */
5112 tree compare_val = build_int_cst (index_elt_type, i);
5113 compare_val = build_vector_from_val (index_type, compare_val);
5114 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5115 index, compare_val);
5116
5117 /* Calculate the equivalent of:
5118
5119 vec = sel ? new_phi_result : vector_identity;
5120
5121 VEC is now suitable for a full vector reduction. */
5122 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5123 sel, new_phi_result, vector_identity);
5124
5125 /* Do the reduction and convert it to the appropriate type. */
5126 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5127 TREE_TYPE (vectype), vec);
5128 scalar = gimple_convert (&seq, scalar_type, scalar);
5129 scalar_results.safe_push (scalar);
5130 }
5131 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5132 }
5133 else
5134 {
5135 bool reduce_with_shift;
5136 tree vec_temp;
5137
5138 /* COND reductions all do the final reduction with MAX_EXPR
5139 or MIN_EXPR. */
5140 if (code == COND_EXPR)
5141 {
5142 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5143 == INTEGER_INDUC_COND_REDUCTION)
5144 code = induc_code;
5145 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5146 == CONST_COND_REDUCTION)
5147 code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
5148 else
5149 code = MAX_EXPR;
5150 }
5151
5152 /* See if the target wants to do the final (shift) reduction
5153 in a vector mode of smaller size and first reduce upper/lower
5154 halves against each other. */
5155 enum machine_mode mode1 = mode;
5156 tree vectype1 = vectype;
5157 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5158 unsigned sz1 = sz;
5159 if (!slp_reduc
5160 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5161 sz1 = GET_MODE_SIZE (mode1).to_constant ();
5162
5163 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5164 reduce_with_shift = have_whole_vector_shift (mode1);
5165 if (!VECTOR_MODE_P (mode1))
5166 reduce_with_shift = false;
5167 else
5168 {
5169 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5170 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5171 reduce_with_shift = false;
5172 }
5173
5174 /* First reduce the vector to the vector size we should do the
5175 shift reduction on, by repeatedly combining upper and lower halves. */
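/* For example (a sketch): reducing a 512-bit accumulator down to a
   128-bit vector takes two halving steps, 512 -> 256 -> 128 bits, each
   step combining the extracted halves DST1 and DST2 with CODE.  */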
5176 new_temp = new_phi_result;
5177 while (sz > sz1)
5178 {
5179 gcc_assert (!slp_reduc);
5180 sz /= 2;
5181 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5182
5183 /* The target has to make sure we support lowpart/highpart
5184 extraction, either via direct vector extract or through
5185 integer mode punning. */
5186 tree dst1, dst2;
5187 if (convert_optab_handler (vec_extract_optab,
5188 TYPE_MODE (TREE_TYPE (new_temp)),
5189 TYPE_MODE (vectype1))
5190 != CODE_FOR_nothing)
5191 {
5192 /* Extract sub-vectors directly once vec_extract becomes
5193 a conversion optab. */
5194 dst1 = make_ssa_name (vectype1);
5195 epilog_stmt
5196 = gimple_build_assign (dst1, BIT_FIELD_REF,
5197 build3 (BIT_FIELD_REF, vectype1,
5198 new_temp, TYPE_SIZE (vectype1),
5199 bitsize_int (0)));
5200 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5201 dst2 = make_ssa_name (vectype1);
5202 epilog_stmt
5203 = gimple_build_assign (dst2, BIT_FIELD_REF,
5204 build3 (BIT_FIELD_REF, vectype1,
5205 new_temp, TYPE_SIZE (vectype1),
5206 bitsize_int (sz * BITS_PER_UNIT)));
5207 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5208 }
5209 else
5210 {
5211 /* Extract via punning to an appropriately sized integer mode
5212 vector. */
5213 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5214 1);
5215 tree etype = build_vector_type (eltype, 2);
5216 gcc_assert (convert_optab_handler (vec_extract_optab,
5217 TYPE_MODE (etype),
5218 TYPE_MODE (eltype))
5219 != CODE_FOR_nothing);
5220 tree tem = make_ssa_name (etype);
5221 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5222 build1 (VIEW_CONVERT_EXPR,
5223 etype, new_temp));
5224 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5225 new_temp = tem;
5226 tem = make_ssa_name (eltype);
5227 epilog_stmt
5228 = gimple_build_assign (tem, BIT_FIELD_REF,
5229 build3 (BIT_FIELD_REF, eltype,
5230 new_temp, TYPE_SIZE (eltype),
5231 bitsize_int (0)));
5232 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5233 dst1 = make_ssa_name (vectype1);
5234 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5235 build1 (VIEW_CONVERT_EXPR,
5236 vectype1, tem));
5237 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5238 tem = make_ssa_name (eltype);
5239 epilog_stmt
5240 = gimple_build_assign (tem, BIT_FIELD_REF,
5241 build3 (BIT_FIELD_REF, eltype,
5242 new_temp, TYPE_SIZE (eltype),
5243 bitsize_int (sz * BITS_PER_UNIT)));
5244 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5245 dst2 = make_ssa_name (vectype1);
5246 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5247 build1 (VIEW_CONVERT_EXPR,
5248 vectype1, tem));
5249 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5250 }
5251
5252 new_temp = make_ssa_name (vectype1);
5253 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5254 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5255 }
5256
5257 if (reduce_with_shift && !slp_reduc)
5258 {
5259 int element_bitsize = tree_to_uhwi (bitsize);
5260 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5261 for variable-length vectors and also requires direct target support
5262 for loop reductions. */
5263 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5264 int nelements = vec_size_in_bits / element_bitsize;
5265 vec_perm_builder sel;
5266 vec_perm_indices indices;
5267
5268 int elt_offset;
5269
5270 tree zero_vec = build_zero_cst (vectype1);
5271 /* Case 2: Create:
5272 for (offset = nelements/2; offset >= 1; offset/=2)
5273 {
5274 Create: va' = vec_shift <va, offset>
5275 Create: va = vop <va, va'>
5276 } */
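/* Concretely (a sketch): for a four-element vector {a0, a1, a2, a3} and
   CODE PLUS, shifting by 2 and adding gives lane 0 = a0+a2 and
   lane 1 = a1+a3; shifting by 1 and adding then gives
   lane 0 = a0+a1+a2+a3, which is the lane extracted in step 2.4 below
   (the remaining lanes are don't-cares).  */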
5277
5278 tree rhs;
5279
5280 if (dump_enabled_p ())
5281 dump_printf_loc (MSG_NOTE, vect_location,
5282 "Reduce using vector shifts\n");
5283
5284 mode1 = TYPE_MODE (vectype1);
5285 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5286 for (elt_offset = nelements / 2;
5287 elt_offset >= 1;
5288 elt_offset /= 2)
5289 {
5290 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5291 indices.new_vector (sel, 2, nelements);
5292 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5293 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5294 new_temp, zero_vec, mask);
5295 new_name = make_ssa_name (vec_dest, epilog_stmt);
5296 gimple_assign_set_lhs (epilog_stmt, new_name);
5297 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5298
5299 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5300 new_temp);
5301 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5302 gimple_assign_set_lhs (epilog_stmt, new_temp);
5303 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5304 }
5305
5306 /* 2.4 Extract the final scalar result. Create:
5307 s_out3 = extract_field <v_out2, bitpos> */
5308
5309 if (dump_enabled_p ())
5310 dump_printf_loc (MSG_NOTE, vect_location,
5311 "extract scalar result\n");
5312
5313 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5314 bitsize, bitsize_zero_node);
5315 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5316 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5317 gimple_assign_set_lhs (epilog_stmt, new_temp);
5318 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5319 scalar_results.safe_push (new_temp);
5320 }
5321 else
5322 {
5323 /* Case 3: Create:
5324 s = extract_field <v_out2, 0>
5325 for (offset = element_size;
5326 offset < vector_size;
5327 offset += element_size)
5328 {
5329 Create: s' = extract_field <v_out2, offset>
5330 Create: s = op <s, s'> // For non SLP cases
5331 } */
5332
5333 if (dump_enabled_p ())
5334 dump_printf_loc (MSG_NOTE, vect_location,
5335 "Reduce using scalar code.\n");
5336
5337 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5338 int element_bitsize = tree_to_uhwi (bitsize);
5339 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5340 {
5341 int bit_offset;
5342 if (gimple_code (new_phi) == GIMPLE_PHI)
5343 vec_temp = PHI_RESULT (new_phi);
5344 else
5345 vec_temp = gimple_assign_lhs (new_phi);
5346 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5347 bitsize_zero_node);
5348 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5349 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5350 gimple_assign_set_lhs (epilog_stmt, new_temp);
5351 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5352
5353 /* In SLP we don't need to apply the reduction operation, so we just
5354 collect s' values in SCALAR_RESULTS. */
5355 if (slp_reduc)
5356 scalar_results.safe_push (new_temp);
5357
5358 for (bit_offset = element_bitsize;
5359 bit_offset < vec_size_in_bits;
5360 bit_offset += element_bitsize)
5361 {
5362 tree bitpos = bitsize_int (bit_offset);
5363 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5364 bitsize, bitpos);
5365
5366 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5367 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5368 gimple_assign_set_lhs (epilog_stmt, new_name);
5369 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5370
5371 if (slp_reduc)
5372 {
5373 /* In SLP we don't need to apply the reduction operation, so
5374 we just collect s' values in SCALAR_RESULTS. */
5375 new_temp = new_name;
5376 scalar_results.safe_push (new_name);
5377 }
5378 else
5379 {
5380 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5381 new_name, new_temp);
5382 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5383 gimple_assign_set_lhs (epilog_stmt, new_temp);
5384 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5385 }
5386 }
5387 }
5388
5389 /* The only case where we need to reduce scalar results in SLP is
5390 unrolling. If the size of SCALAR_RESULTS is greater than
5391 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5392 REDUC_GROUP_SIZE. */
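/* For instance (a sketch): with REDUC_GROUP_SIZE 2 and four scalar
   results, result 2 is folded into result 0 and result 3 into result 1,
   leaving one scalar result per group member.  */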
5393 if (slp_reduc)
5394 {
5395 tree res, first_res, new_res;
5396 gimple *new_stmt;
5397
5398 /* Reduce multiple scalar results in case of SLP unrolling. */
5399 for (j = group_size; scalar_results.iterate (j, &res);
5400 j++)
5401 {
5402 first_res = scalar_results[j % group_size];
5403 new_stmt = gimple_build_assign (new_scalar_dest, code,
5404 first_res, res);
5405 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5406 gimple_assign_set_lhs (new_stmt, new_res);
5407 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5408 scalar_results[j % group_size] = new_res;
5409 }
5410 }
5411 else
5412 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5413 scalar_results.safe_push (new_temp);
5414 }
5415
5416 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5417 == INTEGER_INDUC_COND_REDUCTION)
5418 && !operand_equal_p (initial_def, induc_val, 0))
5419 {
5420 /* Earlier we set the initial value to be a vector of induc_val
5421 values. Check the result and if it is induc_val then replace
5422 it with the original initial value, unless induc_val is
5423 the same as initial_def already. */
5424 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5425 induc_val);
5426
5427 tree tmp = make_ssa_name (new_scalar_dest);
5428 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5429 initial_def, new_temp);
5430 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5431 scalar_results[0] = tmp;
5432 }
5433 }
5434
5435 vect_finalize_reduction:
5436
5437 if (double_reduc)
5438 loop = loop->inner;
5439
5440 /* 2.5 Adjust the final result by the initial value of the reduction
5441 variable. (When such adjustment is not needed, then
5442 'adjustment_def' is zero). For example, if code is PLUS we create:
5443 new_temp = loop_exit_def + adjustment_def */
5444
5445 if (adjustment_def)
5446 {
5447 gcc_assert (!slp_reduc);
5448 if (nested_in_vect_loop)
5449 {
5450 new_phi = new_phis[0];
5451 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5452 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5453 new_dest = vect_create_destination_var (scalar_dest, vectype);
5454 }
5455 else
5456 {
5457 new_temp = scalar_results[0];
5458 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5459 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5460 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5461 }
5462
5463 epilog_stmt = gimple_build_assign (new_dest, expr);
5464 new_temp = make_ssa_name (new_dest, epilog_stmt);
5465 gimple_assign_set_lhs (epilog_stmt, new_temp);
5466 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5467 if (nested_in_vect_loop)
5468 {
5469 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5470 STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5471 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5472
5473 if (!double_reduc)
5474 scalar_results.quick_push (new_temp);
5475 else
5476 scalar_results[0] = new_temp;
5477 }
5478 else
5479 scalar_results[0] = new_temp;
5480
5481 new_phis[0] = epilog_stmt;
5482 }
5483
5484 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5485 phis with new adjusted scalar results, i.e., replace use <s_out0>
5486 with use <s_out4>.
5487
5488 Transform:
5489 loop_exit:
5490 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5491 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5492 v_out2 = reduce <v_out1>
5493 s_out3 = extract_field <v_out2, 0>
5494 s_out4 = adjust_result <s_out3>
5495 use <s_out0>
5496 use <s_out0>
5497
5498 into:
5499
5500 loop_exit:
5501 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5502 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5503 v_out2 = reduce <v_out1>
5504 s_out3 = extract_field <v_out2, 0>
5505 s_out4 = adjust_result <s_out3>
5506 use <s_out4>
5507 use <s_out4> */
5508
5509
5510 /* In an SLP reduction chain we reduce the vector results into one vector
5511 if necessary, hence here we set REDUC_GROUP_SIZE to 1. SCALAR_DEST is
5512 the LHS of the last stmt in the reduction chain, since we are looking
5513 for the loop exit phi node. */
5514 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5515 {
5516 stmt_vec_info dest_stmt_info
5517 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5518 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5519 group_size = 1;
5520 }
5521
5522 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5523 case REDUC_GROUP_SIZE is greater than the vectorization factor).
5524 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5525 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5526 correspond to the first vector stmt, etc.
5527 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
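/* For example (a sketch): with REDUC_GROUP_SIZE 4 and two new vector
   stmts, RATIO is 2, so scalar results 0 and 1 belong to the first
   vector stmt and results 2 and 3 to the second.  */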
5528 if (group_size > new_phis.length ())
5529 {
5530 ratio = group_size / new_phis.length ();
5531 gcc_assert (!(group_size % new_phis.length ()));
5532 }
5533 else
5534 ratio = 1;
5535
5536 stmt_vec_info epilog_stmt_info = NULL;
5537 for (k = 0; k < group_size; k++)
5538 {
5539 if (k % ratio == 0)
5540 {
5541 epilog_stmt_info = loop_vinfo->lookup_stmt (new_phis[k / ratio]);
5542 reduction_phi_info = reduction_phis[k / ratio];
5543 if (double_reduc)
5544 inner_phi = inner_phis[k / ratio];
5545 }
5546
5547 if (slp_reduc)
5548 {
5549 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5550
5551 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5552 /* SLP statements can't participate in patterns. */
5553 gcc_assert (!orig_stmt_info);
5554 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5555 }
5556
5557 phis.create (3);
5558 /* Find the loop-closed-use at the loop exit of the original scalar
5559 result. (The reduction result is expected to have two immediate uses -
5560 one at the latch block, and one at the loop exit). */
5561 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5562 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5563 && !is_gimple_debug (USE_STMT (use_p)))
5564 phis.safe_push (USE_STMT (use_p));
5565
5566 /* While we expect to have found an exit_phi because of loop-closed-ssa
5567 form, we can end up without one if the scalar cycle is dead. */
5568
5569 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5570 {
5571 if (outer_loop)
5572 {
5573 stmt_vec_info exit_phi_vinfo
5574 = loop_vinfo->lookup_stmt (exit_phi);
5575 gphi *vect_phi;
5576
5577 if (double_reduc)
5578 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5579 else
5580 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt_info;
5581 if (!double_reduc
5582 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5583 != vect_double_reduction_def)
5584 continue;
5585
5586 /* Handle double reduction:
5587
5588 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5589 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5590 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5591 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5592
5593 At that point the regular reduction (stmt2 and stmt3) is
5594 already vectorized, as well as the exit phi node, stmt4.
5595 Here we vectorize the phi node of double reduction, stmt1, and
5596 update all relevant statements. */
5597
5598 /* Go through all the uses of s2 to find the double reduction phi
5599 node, i.e., stmt1 above. */
5600 orig_name = PHI_RESULT (exit_phi);
5601 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5602 {
5603 stmt_vec_info use_stmt_vinfo;
5604 tree vect_phi_init, preheader_arg, vect_phi_res;
5605 basic_block bb = gimple_bb (use_stmt);
5606
5607 /* Check that USE_STMT is really a double reduction phi
5608 node. */
5609 if (gimple_code (use_stmt) != GIMPLE_PHI
5610 || gimple_phi_num_args (use_stmt) != 2
5611 || bb->loop_father != outer_loop)
5612 continue;
5613 use_stmt_vinfo = loop_vinfo->lookup_stmt (use_stmt);
5614 if (!use_stmt_vinfo
5615 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5616 != vect_double_reduction_def)
5617 continue;
5618
5619 /* Create vector phi node for double reduction:
5620 vs1 = phi <vs0, vs2>
5621 vs1 was created previously in this function by a call to
5622 vect_get_vec_def_for_operand and is stored in
5623 vec_initial_def;
5624 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5625 vs0 is created here. */
5626
5627 /* Create vector phi node. */
5628 vect_phi = create_phi_node (vec_initial_def, bb);
5629 loop_vec_info_for_loop (outer_loop)->add_stmt (vect_phi);
5630
5631 /* Create vs0 - initial def of the double reduction phi. */
5632 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5633 loop_preheader_edge (outer_loop));
5634 vect_phi_init = get_initial_def_for_reduction
5635 (stmt_info, preheader_arg, NULL);
5636
5637 /* Update phi node arguments with vs0 and vs2. */
5638 add_phi_arg (vect_phi, vect_phi_init,
5639 loop_preheader_edge (outer_loop),
5640 UNKNOWN_LOCATION);
5641 add_phi_arg (vect_phi, PHI_RESULT (inner_phi->stmt),
5642 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5643 if (dump_enabled_p ())
5644 dump_printf_loc (MSG_NOTE, vect_location,
5645 "created double reduction phi node: %G",
5646 vect_phi);
5647
5648 vect_phi_res = PHI_RESULT (vect_phi);
5649
5650 /* Replace the use, i.e., set the correct vs1 in the regular
5651 reduction phi node. FORNOW, NCOPIES is always 1, so the
5652 loop is redundant. */
5653 stmt_vec_info use_info = reduction_phi_info;
5654 for (j = 0; j < ncopies; j++)
5655 {
5656 edge pr_edge = loop_preheader_edge (loop);
5657 SET_PHI_ARG_DEF (as_a <gphi *> (use_info->stmt),
5658 pr_edge->dest_idx, vect_phi_res);
5659 use_info = STMT_VINFO_RELATED_STMT (use_info);
5660 }
5661 }
5662 }
5663 }
5664
5665 phis.release ();
5666 if (nested_in_vect_loop)
5667 {
5668 if (double_reduc)
5669 loop = outer_loop;
5670 else
5671 continue;
5672 }
5673
5674 phis.create (3);
5675 /* Find the loop-closed-use at the loop exit of the original scalar
5676 result. (The reduction result is expected to have two immediate uses,
5677 one at the latch block, and one at the loop exit). For double
5678 reductions we are looking for exit phis of the outer loop. */
5679 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5680 {
5681 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5682 {
5683 if (!is_gimple_debug (USE_STMT (use_p)))
5684 phis.safe_push (USE_STMT (use_p));
5685 }
5686 else
5687 {
5688 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5689 {
5690 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5691
5692 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5693 {
5694 if (!flow_bb_inside_loop_p (loop,
5695 gimple_bb (USE_STMT (phi_use_p)))
5696 && !is_gimple_debug (USE_STMT (phi_use_p)))
5697 phis.safe_push (USE_STMT (phi_use_p));
5698 }
5699 }
5700 }
5701 }
5702
5703 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5704 {
5705 /* Replace the uses: */
5706 orig_name = PHI_RESULT (exit_phi);
5707 scalar_result = scalar_results[k];
5708 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5709 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5710 SET_USE (use_p, scalar_result);
5711 }
5712
5713 phis.release ();
5714 }
5715 }
5716
5717 /* Return a vector of type VECTYPE that is equal to the vector select
5718 operation "MASK ? VEC : IDENTITY". Insert the select statements
5719 before GSI. */
5720
5721 static tree
5722 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5723 tree vec, tree identity)
5724 {
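/* Lanes where MASK is false receive IDENTITY, so in the fully-masked
   fold-left reduction below they leave the accumulated result
   unchanged (IDENTITY is a zero vector there).  */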
5725 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5726 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5727 mask, vec, identity);
5728 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5729 return cond;
5730 }
5731
5732 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5733 order, starting with LHS. Insert the extraction statements before GSI and
5734 associate the new scalar SSA names with variable SCALAR_DEST.
5735 Return the SSA name for the result. */
5736
5737 static tree
5738 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5739 tree_code code, tree lhs, tree vector_rhs)
5740 {
5741 tree vectype = TREE_TYPE (vector_rhs);
5742 tree scalar_type = TREE_TYPE (vectype);
5743 tree bitsize = TYPE_SIZE (scalar_type);
5744 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5745 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5746
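/* Extract the elements one by one and chain CODE left to right; for a
   four-element vector this builds (a sketch)
   lhs = (((lhs CODE v[0]) CODE v[1]) CODE v[2]) CODE v[3].  */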
5747 for (unsigned HOST_WIDE_INT bit_offset = 0;
5748 bit_offset < vec_size_in_bits;
5749 bit_offset += element_bitsize)
5750 {
5751 tree bitpos = bitsize_int (bit_offset);
5752 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5753 bitsize, bitpos);
5754
5755 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5756 rhs = make_ssa_name (scalar_dest, stmt);
5757 gimple_assign_set_lhs (stmt, rhs);
5758 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5759
5760 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5761 tree new_name = make_ssa_name (scalar_dest, stmt);
5762 gimple_assign_set_lhs (stmt, new_name);
5763 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5764 lhs = new_name;
5765 }
5766 return lhs;
5767 }
5768
5769 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5770 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5771 statement. CODE is the operation performed by STMT_INFO and OPS are
5772 its scalar operands. REDUC_INDEX is the index of the operand in
5773 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5774 implements in-order reduction, or IFN_LAST if we should open-code it.
5775 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5776 that should be used to control the operation in a fully-masked loop. */
5777
5778 static bool
5779 vectorize_fold_left_reduction (stmt_vec_info stmt_info,
5780 gimple_stmt_iterator *gsi,
5781 stmt_vec_info *vec_stmt, slp_tree slp_node,
5782 gimple *reduc_def_stmt,
5783 tree_code code, internal_fn reduc_fn,
5784 tree ops[3], tree vectype_in,
5785 int reduc_index, vec_loop_masks *masks)
5786 {
5787 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5788 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5789 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5790 stmt_vec_info new_stmt_info = NULL;
5791
5792 int ncopies;
5793 if (slp_node)
5794 ncopies = 1;
5795 else
5796 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5797
5798 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5799 gcc_assert (ncopies == 1);
5800 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5801 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5802 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5803 == FOLD_LEFT_REDUCTION);
5804
5805 if (slp_node)
5806 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5807 TYPE_VECTOR_SUBPARTS (vectype_in)));
5808
5809 tree op0 = ops[1 - reduc_index];
5810
5811 int group_size = 1;
5812 stmt_vec_info scalar_dest_def_info;
5813 auto_vec<tree> vec_oprnds0;
5814 if (slp_node)
5815 {
5816 auto_vec<vec<tree> > vec_defs (2);
5817 auto_vec<tree> sops (2);
5818 sops.quick_push (ops[0]);
5819 sops.quick_push (ops[1]);
5820 vect_get_slp_defs (sops, slp_node, &vec_defs);
5821 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
5822 vec_defs[0].release ();
5823 vec_defs[1].release ();
5824 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5825 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5826 }
5827 else
5828 {
5829 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt_info);
5830 vec_oprnds0.create (1);
5831 vec_oprnds0.quick_push (loop_vec_def0);
5832 scalar_dest_def_info = stmt_info;
5833 }
5834
5835 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5836 tree scalar_type = TREE_TYPE (scalar_dest);
5837 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5838
5839 int vec_num = vec_oprnds0.length ();
5840 gcc_assert (vec_num == 1 || slp_node);
5841 tree vec_elem_type = TREE_TYPE (vectype_out);
5842 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5843
5844 tree vector_identity = NULL_TREE;
5845 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5846 vector_identity = build_zero_cst (vectype_out);
5847
5848 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5849 int i;
5850 tree def0;
5851 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5852 {
5853 gimple *new_stmt;
5854 tree mask = NULL_TREE;
5855 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5856 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5857
5858 /* Handle MINUS by adding the negative. */
5859 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5860 {
5861 tree negated = make_ssa_name (vectype_out);
5862 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5863 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5864 def0 = negated;
5865 }
5866
5867 if (mask)
5868 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5869 vector_identity);
5870
5871 /* On the first iteration the input is simply the scalar phi
5872 result, and for subsequent iterations it is the output of
5873 the preceding operation. */
5874 if (reduc_fn != IFN_LAST)
5875 {
5876 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
5877 /* For chained SLP reductions the output of the previous reduction
5878 operation serves as the input of the next. For the final statement
5879 the output cannot be a temporary - we reuse the original
5880 scalar destination of the last statement. */
5881 if (i != vec_num - 1)
5882 {
5883 gimple_set_lhs (new_stmt, scalar_dest_var);
5884 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5885 gimple_set_lhs (new_stmt, reduc_var);
5886 }
5887 }
5888 else
5889 {
5890 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5891 reduc_var, def0);
5892 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5893 /* Remove the statement, so that we can use the same code paths
5894 as for statements that we've just created. */
5895 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5896 gsi_remove (&tmp_gsi, true);
5897 }
5898
5899 if (i == vec_num - 1)
5900 {
5901 gimple_set_lhs (new_stmt, scalar_dest);
5902 new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
5903 new_stmt);
5904 }
5905 else
5906 new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
5907 new_stmt, gsi);
5908
5909 if (slp_node)
5910 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
5911 }
5912
5913 if (!slp_node)
5914 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
5915
5916 return true;
5917 }
5918
5919 /* Function is_nonwrapping_integer_induction.
5920
5921 Check if STMT_VINFO (which is part of loop LOOP) describes an integer
5922 induction whose value does not wrap (overflow). */
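/* For example (an illustrative sketch): an unsigned char IV with base 0
   and step 1 in a loop executing at most 300 times can reach 300, which
   needs 9 bits and therefore wraps, so the check fails; with at most
   200 iterations the value fits in 8 bits and the check succeeds.  */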
5923
5924 static bool
5925 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, struct loop *loop)
5926 {
5927 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
5928 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5929 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5930 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
5931 widest_int ni, max_loop_value, lhs_max;
5932 wi::overflow_type overflow = wi::OVF_NONE;
5933
5934 /* Make sure the loop is integer based. */
5935 if (TREE_CODE (base) != INTEGER_CST
5936 || TREE_CODE (step) != INTEGER_CST)
5937 return false;
5938
5939 /* Check that the maximum value reached by the induction will not wrap. */
5940
5941 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5942 return true;
5943
5944 if (! max_stmt_executions (loop, &ni))
5945 return false;
5946
5947 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5948 &overflow);
5949 if (overflow)
5950 return false;
5951
5952 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5953 TYPE_SIGN (lhs_type), &overflow);
5954 if (overflow)
5955 return false;
5956
5957 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5958 <= TYPE_PRECISION (lhs_type));
5959 }
5960
5961 /* Check if masking can be supported by inserting a conditional expression.
5962 CODE is the code for the operation. COND_FN is the conditional internal
5963 function, if it exists. VECTYPE_IN is the type of the vector input. */
5964 static bool
5965 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
5966 tree vectype_in)
5967 {
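/* If the target supports the conditional internal function directly
   there is no need to emulate masking with a VEC_COND_EXPR.  */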
5968 if (cond_fn != IFN_LAST
5969 && direct_internal_fn_supported_p (cond_fn, vectype_in,
5970 OPTIMIZE_FOR_SPEED))
5971 return false;
5972
5973 switch (code)
5974 {
5975 case DOT_PROD_EXPR:
5976 case SAD_EXPR:
5977 return true;
5978
5979 default:
5980 return false;
5981 }
5982 }
5983
5984 /* Insert a conditional expression to enable masked vectorization. CODE is the
5985 code for the operation. VOP is the array of operands. MASK is the loop
5986 mask. GSI is a statement iterator used to place the new conditional
5987 expression. */
5988 static void
5989 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
5990 gimple_stmt_iterator *gsi)
5991 {
5992 switch (code)
5993 {
5994 case DOT_PROD_EXPR:
5995 {
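/* Zeroing the masked-out lanes of operand 1 makes their products zero,
   so they add nothing to the dot-product accumulator.  */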
5996 tree vectype = TREE_TYPE (vop[1]);
5997 tree zero = build_zero_cst (vectype);
5998 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
5999 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6000 mask, vop[1], zero);
6001 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6002 vop[1] = masked_op1;
6003 break;
6004 }
6005
6006 case SAD_EXPR:
6007 {
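/* Making the masked-out lanes of operand 1 equal operand 0 makes their
   absolute differences zero, so they add nothing to the accumulator.  */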
6008 tree vectype = TREE_TYPE (vop[1]);
6009 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6010 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6011 mask, vop[1], vop[0]);
6012 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6013 vop[1] = masked_op1;
6014 break;
6015 }
6016
6017 default:
6018 gcc_unreachable ();
6019 }
6020 }
6021
6022 /* Function vectorizable_reduction.
6023
6024 Check if STMT_INFO performs a reduction operation that can be vectorized.
6025 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6026 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6027 Return true if STMT_INFO is vectorizable in this way.
6028
6029 This function also handles reduction idioms (patterns) that have been
6030 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6031 may be of this form:
6032 X = pattern_expr (arg0, arg1, ..., X)
6033 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6034 sequence that had been detected and replaced by the pattern-stmt
6035 (STMT_INFO).
6036
6037 This function also handles reduction of condition expressions, for example:
6038 for (int i = 0; i < N; i++)
6039 if (a[i] < value)
6040 last = a[i];
6041 This is handled by vectorising the loop and creating an additional vector
6042 containing the loop indexes for which "a[i] < value" was true. In the
6043 function epilogue this is reduced to a single max value and then used to
6044 index into the vector of results.
6045
6046 In some cases of reduction patterns, the type of the reduction variable X is
6047 different than the type of the other arguments of STMT_INFO.
6048 In such cases, the vectype that is used when transforming STMT_INFO into
6049 a vector stmt is different than the vectype that is used to determine the
6050 vectorization factor, because it consists of a different number of elements
6051 than the actual number of elements that are being operated upon in parallel.
6052
6053 For example, consider an accumulation of shorts into an int accumulator.
6054 On some targets it's possible to vectorize this pattern operating on 8
6055 shorts at a time (hence, the vectype for purposes of determining the
6056 vectorization factor should be V8HI); on the other hand, the vectype that
6057 is used to create the vector form is actually V4SI (the type of the result).
6058
6059 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6060 indicates what is the actual level of parallelism (V8HI in the example), so
6061 that the right vectorization factor would be derived. This vectype
6062 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6063 be used to create the vectorized stmt. The right vectype for the vectorized
6064 stmt is obtained from the type of the result X:
6065 get_vectype_for_scalar_type (TREE_TYPE (X))
6066
6067 This means that, contrary to "regular" reductions (or "regular" stmts in
6068 general), the following equation:
6069 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6070 does *NOT* necessarily hold for reduction patterns. */
6071
6072 bool
6073 vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
6074 stmt_vec_info *vec_stmt, slp_tree slp_node,
6075 slp_instance slp_node_instance,
6076 stmt_vector_for_cost *cost_vec)
6077 {
6078 tree vec_dest;
6079 tree scalar_dest;
6080 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6081 tree vectype_in = NULL_TREE;
6082 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6083 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6084 enum tree_code code, orig_code;
6085 internal_fn reduc_fn;
6086 machine_mode vec_mode;
6087 int op_type;
6088 optab optab;
6089 tree new_temp = NULL_TREE;
6090 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6091 stmt_vec_info cond_stmt_vinfo = NULL;
6092 enum tree_code cond_reduc_op_code = ERROR_MARK;
6093 tree scalar_type;
6094 bool is_simple_use;
6095 int i;
6096 int ncopies;
6097 int epilog_copies;
6098 stmt_vec_info prev_stmt_info, prev_phi_info;
6099 bool single_defuse_cycle = false;
6100 stmt_vec_info new_stmt_info = NULL;
6101 int j;
6102 tree ops[3];
6103 enum vect_def_type dts[3];
6104 bool nested_cycle = false, found_nested_cycle_def = false;
6105 bool double_reduc = false;
6106 basic_block def_bb;
6107 struct loop * def_stmt_loop;
6108 tree def_arg;
6109 auto_vec<tree> vec_oprnds0;
6110 auto_vec<tree> vec_oprnds1;
6111 auto_vec<tree> vec_oprnds2;
6112 auto_vec<tree> vect_defs;
6113 auto_vec<stmt_vec_info> phis;
6114 int vec_num;
6115 tree def0, tem;
6116 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6117 tree cond_reduc_val = NULL_TREE;
6118
6119 /* Make sure it was already recognized as a reduction computation. */
6120 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6121 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6122 return false;
6123
6124 if (nested_in_vect_loop_p (loop, stmt_info))
6125 {
6126 loop = loop->inner;
6127 nested_cycle = true;
6128 }
6129
6130 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6131 gcc_assert (slp_node
6132 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6133
6134 if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
6135 {
6136 tree phi_result = gimple_phi_result (phi);
6137 /* Analysis is fully done on the reduction stmt invocation. */
6138 if (! vec_stmt)
6139 {
6140 if (slp_node)
6141 slp_node_instance->reduc_phis = slp_node;
6142
6143 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6144 return true;
6145 }
6146
6147 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6148 /* Leave the scalar phi in place. Note that checking
6149 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6150 for reductions involving a single statement. */
6151 return true;
6152
6153 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
6154 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
6155
6156 if (STMT_VINFO_VEC_REDUCTION_TYPE (reduc_stmt_info)
6157 == EXTRACT_LAST_REDUCTION)
6158 /* Leave the scalar phi in place. */
6159 return true;
6160
6161 gassign *reduc_stmt = as_a <gassign *> (reduc_stmt_info->stmt);
6162 code = gimple_assign_rhs_code (reduc_stmt);
6163 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6164 {
6165 tree op = gimple_op (reduc_stmt, k);
6166 if (op == phi_result)
6167 continue;
6168 if (k == 1 && code == COND_EXPR)
6169 continue;
6170 bool is_simple_use = vect_is_simple_use (op, loop_vinfo, &dt);
6171 gcc_assert (is_simple_use);
6172 if (dt == vect_constant_def || dt == vect_external_def)
6173 continue;
6174 if (!vectype_in
6175 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6176 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6177 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6178 break;
6179 }
6180 /* For a nested cycle we might end up with an operation like
6181 phi_result * phi_result. */
6182 if (!vectype_in)
6183 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
6184 gcc_assert (vectype_in);
6185
6186 if (slp_node)
6187 ncopies = 1;
6188 else
6189 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6190
6191 stmt_vec_info use_stmt_info;
6192 if (ncopies > 1
6193 && STMT_VINFO_RELEVANT (reduc_stmt_info) <= vect_used_only_live
6194 && (use_stmt_info = loop_vinfo->lookup_single_use (phi_result))
6195 && vect_stmt_to_vectorize (use_stmt_info) == reduc_stmt_info)
6196 single_defuse_cycle = true;
6197
6198 /* Create the destination vector */
6199 scalar_dest = gimple_assign_lhs (reduc_stmt);
6200 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6201
6202 if (slp_node)
6203 /* The size vect_schedule_slp_instance computes is off for us. */
6204 vec_num = vect_get_num_vectors
6205 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6206 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6207 vectype_in);
6208 else
6209 vec_num = 1;
6210
6211 /* Generate the reduction PHIs upfront. */
6212 prev_phi_info = NULL;
6213 for (j = 0; j < ncopies; j++)
6214 {
6215 if (j == 0 || !single_defuse_cycle)
6216 {
6217 for (i = 0; i < vec_num; i++)
6218 {
6219 /* Create the reduction-phi that defines the reduction
6220 operand. */
6221 gimple *new_phi = create_phi_node (vec_dest, loop->header);
6222 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6223
6224 if (slp_node)
6225 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
6226 else
6227 {
6228 if (j == 0)
6229 STMT_VINFO_VEC_STMT (stmt_info)
6230 = *vec_stmt = new_phi_info;
6231 else
6232 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
6233 prev_phi_info = new_phi_info;
6234 }
6235 }
6236 }
6237 }
6238
6239 return true;
6240 }
6241
6242 /* 1. Is vectorizable reduction? */
6243 /* Not supportable if the reduction variable is used in the loop, unless
6244 it's a reduction chain. */
6245 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6246 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6247 return false;
6248
6249 /* Reductions that are not used even in an enclosing outer-loop
6250 are expected to be "live" (used out of the loop). */
6251 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6252 && !STMT_VINFO_LIVE_P (stmt_info))
6253 return false;
6254
6255 /* 2. Has this been recognized as a reduction pattern?
6256
6257 Check if STMT represents a pattern that has been recognized
6258 in earlier analysis stages. For stmts that represent a pattern,
6259 the STMT_VINFO_RELATED_STMT field records the last stmt in
6260 the original sequence that constitutes the pattern. */
6261
6262 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6263 if (orig_stmt_info)
6264 {
6265 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6266 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6267 }
6268
6269 /* 3. Check the operands of the operation. The first operands are defined
6270 inside the loop body. The last operand is the reduction variable,
6271 which is defined by the loop-header-phi. */
6272
6273 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6274
6275 /* Flatten RHS. */
6276 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6277 {
6278 case GIMPLE_BINARY_RHS:
6279 code = gimple_assign_rhs_code (stmt);
6280 op_type = TREE_CODE_LENGTH (code);
6281 gcc_assert (op_type == binary_op);
6282 ops[0] = gimple_assign_rhs1 (stmt);
6283 ops[1] = gimple_assign_rhs2 (stmt);
6284 break;
6285
6286 case GIMPLE_TERNARY_RHS:
6287 code = gimple_assign_rhs_code (stmt);
6288 op_type = TREE_CODE_LENGTH (code);
6289 gcc_assert (op_type == ternary_op);
6290 ops[0] = gimple_assign_rhs1 (stmt);
6291 ops[1] = gimple_assign_rhs2 (stmt);
6292 ops[2] = gimple_assign_rhs3 (stmt);
6293 break;
6294
6295 case GIMPLE_UNARY_RHS:
6296 return false;
6297
6298 default:
6299 gcc_unreachable ();
6300 }
6301
6302 if (code == COND_EXPR && slp_node)
6303 return false;
6304
6305 scalar_dest = gimple_assign_lhs (stmt);
6306 scalar_type = TREE_TYPE (scalar_dest);
6307 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6308 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6309 return false;
6310
6311 /* Do not try to vectorize bit-precision reductions. */
6312 if (!type_has_mode_precision_p (scalar_type))
6313 return false;
6314
6315 /* All uses but the last are expected to be defined in the loop.
6316 The last use is the reduction variable. In case of a nested cycle this
6317 assumption is not true: we use reduc_index to record the index of the
6318 reduction variable. */
6319 stmt_vec_info reduc_def_info;
6320 if (orig_stmt_info)
6321 reduc_def_info = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6322 else
6323 reduc_def_info = STMT_VINFO_REDUC_DEF (stmt_info);
6324 gcc_assert (reduc_def_info);
6325 gphi *reduc_def_phi = as_a <gphi *> (reduc_def_info->stmt);
6326 tree reduc_def = PHI_RESULT (reduc_def_phi);
6327 int reduc_index = -1;
6328 for (i = 0; i < op_type; i++)
6329 {
6330 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6331 if (i == 0 && code == COND_EXPR)
6332 continue;
6333
6334 stmt_vec_info def_stmt_info;
6335 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &dts[i], &tem,
6336 &def_stmt_info);
6337 dt = dts[i];
6338 gcc_assert (is_simple_use);
6339 if (dt == vect_reduction_def
6340 && ops[i] == reduc_def)
6341 {
6342 reduc_index = i;
6343 continue;
6344 }
6345 else if (tem)
6346 {
6347 /* To properly compute ncopies we are interested in the widest
6348 input type in case we're looking at a widening accumulation. */
6349 if (!vectype_in
6350 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6351 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6352 vectype_in = tem;
6353 }
6354
6355 if (dt != vect_internal_def
6356 && dt != vect_external_def
6357 && dt != vect_constant_def
6358 && dt != vect_induction_def
6359 && !(dt == vect_nested_cycle && nested_cycle))
6360 return false;
6361
6362 if (dt == vect_nested_cycle
6363 && ops[i] == reduc_def)
6364 {
6365 found_nested_cycle_def = true;
6366 reduc_index = i;
6367 }
6368
6369 if (i == 1 && code == COND_EXPR)
6370 {
6371 /* Record how value of COND_EXPR is defined. */
6372 if (dt == vect_constant_def)
6373 {
6374 cond_reduc_dt = dt;
6375 cond_reduc_val = ops[i];
6376 }
6377 if (dt == vect_induction_def
6378 && def_stmt_info
6379 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6380 {
6381 cond_reduc_dt = dt;
6382 cond_stmt_vinfo = def_stmt_info;
6383 }
6384 }
6385 }
6386
6387 if (!vectype_in)
6388 vectype_in = vectype_out;
6389
6390 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6391 directly used in the stmt. */
6392 if (reduc_index == -1)
6393 {
6394 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6395 {
6396 if (dump_enabled_p ())
6397 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6398 "in-order reduction chain without SLP.\n");
6399 return false;
6400 }
6401 }
6402
6403 if (!(reduc_index == -1
6404 || dts[reduc_index] == vect_reduction_def
6405 || dts[reduc_index] == vect_nested_cycle
6406 || ((dts[reduc_index] == vect_internal_def
6407 || dts[reduc_index] == vect_external_def
6408 || dts[reduc_index] == vect_constant_def
6409 || dts[reduc_index] == vect_induction_def)
6410 && nested_cycle && found_nested_cycle_def)))
6411 {
6412 /* For pattern recognized stmts, orig_stmt might be a reduction,
6413 but some helper statements for the pattern might not, or
6414 might be COND_EXPRs with reduction uses in the condition. */
6415 gcc_assert (orig_stmt_info);
6416 return false;
6417 }
6418
6419 /* PHIs should not participate in patterns. */
6420 gcc_assert (!STMT_VINFO_RELATED_STMT (reduc_def_info));
6421 enum vect_reduction_type v_reduc_type
6422 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6423 stmt_vec_info tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6424
6425 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6426 /* If we have a condition reduction, see if we can simplify it further. */
6427 if (v_reduc_type == COND_REDUCTION)
6428 {
6429 /* TODO: We can't yet handle reduction chains, since we need to treat
6430 each COND_EXPR in the chain specially, not just the last one.
6431 E.g. for:
6432
6433 x_1 = PHI <x_3, ...>
6434 x_2 = a_2 ? ... : x_1;
6435 x_3 = a_3 ? ... : x_2;
6436
6437 we're interested in the last element in x_3 for which a_2 || a_3
6438 is true, whereas the current reduction chain handling would
6439 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6440 as a reduction operation. */
6441 if (reduc_index == -1)
6442 {
6443 if (dump_enabled_p ())
6444 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6445 "conditional reduction chains not supported\n");
6446 return false;
6447 }
6448
6449 /* vect_is_simple_reduction ensured that operand 2 is the
6450 loop-carried operand. */
6451 gcc_assert (reduc_index == 2);
6452
6453 /* Loop peeling modifies the initial value of the reduction PHI, which
6454 makes the reduction stmt to be transformed different from the
6455 original stmt analyzed. We need to record the reduction code for a
6456 CONST_COND_REDUCTION type reduction at the analysis stage, so that
6457 it can be used directly at the transform stage. */
6458 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6459 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6460 {
6461 /* Also set the reduction type to CONST_COND_REDUCTION. */
6462 gcc_assert (cond_reduc_dt == vect_constant_def);
6463 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6464 }
6465 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6466 vectype_in, OPTIMIZE_FOR_SPEED))
6467 {
6468 if (dump_enabled_p ())
6469 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6470 "optimizing condition reduction with"
6471 " FOLD_EXTRACT_LAST.\n");
6472 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6473 }
6474 else if (cond_reduc_dt == vect_induction_def)
6475 {
6476 tree base
6477 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6478 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6479
6480 gcc_assert (TREE_CODE (base) == INTEGER_CST
6481 && TREE_CODE (step) == INTEGER_CST);
6482 cond_reduc_val = NULL_TREE;
6483 /* Find a suitable value: below base for MAX_EXPR, above base for
6484 MIN_EXPR; for now punt if base is the minimum value of the type
6485 for MAX_EXPR or the maximum value of the type for MIN_EXPR. */
6486 if (tree_int_cst_sgn (step) == -1)
6487 {
6488 cond_reduc_op_code = MIN_EXPR;
6489 if (tree_int_cst_sgn (base) == -1)
6490 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6491 else if (tree_int_cst_lt (base,
6492 TYPE_MAX_VALUE (TREE_TYPE (base))))
6493 cond_reduc_val
6494 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6495 }
6496 else
6497 {
6498 cond_reduc_op_code = MAX_EXPR;
6499 if (tree_int_cst_sgn (base) == 1)
6500 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6501 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6502 base))
6503 cond_reduc_val
6504 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6505 }
6506 if (cond_reduc_val)
6507 {
6508 if (dump_enabled_p ())
6509 dump_printf_loc (MSG_NOTE, vect_location,
6510 "condition expression based on "
6511 "integer induction.\n");
6512 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6513 = INTEGER_INDUC_COND_REDUCTION;
6514 }
6515 }
6516 else if (cond_reduc_dt == vect_constant_def)
6517 {
6518 enum vect_def_type cond_initial_dt;
6519 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6520 tree cond_initial_val
6521 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6522
6523 gcc_assert (cond_reduc_val != NULL_TREE);
6524 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6525 if (cond_initial_dt == vect_constant_def
6526 && types_compatible_p (TREE_TYPE (cond_initial_val),
6527 TREE_TYPE (cond_reduc_val)))
6528 {
6529 tree e = fold_binary (LE_EXPR, boolean_type_node,
6530 cond_initial_val, cond_reduc_val);
6531 if (e && (integer_onep (e) || integer_zerop (e)))
6532 {
6533 if (dump_enabled_p ())
6534 dump_printf_loc (MSG_NOTE, vect_location,
6535 "condition expression based on "
6536 "compile time constant.\n");
6537 /* Record reduction code at analysis stage. */
6538 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6539 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6540 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6541 = CONST_COND_REDUCTION;
6542 }
6543 }
6544 }
6545 }
6546
6547 if (orig_stmt_info)
6548 gcc_assert (tmp == orig_stmt_info
6549 || REDUC_GROUP_FIRST_ELEMENT (tmp) == orig_stmt_info);
6550 else
6551 /* We changed STMT to be the first stmt in reduction chain, hence we
6552 check that in this case the first element in the chain is STMT. */
6553 gcc_assert (tmp == stmt_info
6554 || REDUC_GROUP_FIRST_ELEMENT (tmp) == stmt_info);
6555
6556 if (STMT_VINFO_LIVE_P (reduc_def_info))
6557 return false;
6558
6559 if (slp_node)
6560 ncopies = 1;
6561 else
6562 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6563
6564 gcc_assert (ncopies >= 1);
6565
6566 vec_mode = TYPE_MODE (vectype_in);
6567 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6568
6569 if (nested_cycle)
6570 {
6571 def_bb = gimple_bb (reduc_def_phi);
6572 def_stmt_loop = def_bb->loop_father;
6573 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6574 loop_preheader_edge (def_stmt_loop));
6575 stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg);
6576 if (def_arg_stmt_info
6577 && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info)
6578 == vect_double_reduction_def))
6579 double_reduc = true;
6580 }
6581
6582 vect_reduction_type reduction_type
6583 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6584 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6585 && ncopies > 1)
6586 {
6587 if (dump_enabled_p ())
6588 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6589 "multiple types in double reduction or condition "
6590 "reduction.\n");
6591 return false;
6592 }
6593
6594 if (code == COND_EXPR)
6595 {
6596 /* Only call during the analysis stage, otherwise we'll lose
6597 STMT_VINFO_TYPE. */
6598 if (!vec_stmt && !vectorizable_condition (stmt_info, gsi, NULL,
6599 true, NULL, cost_vec))
6600 {
6601 if (dump_enabled_p ())
6602 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6603 "unsupported condition in reduction\n");
6604 return false;
6605 }
6606 }
6607 else if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6608 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6609 {
6610 /* Only call during the analysis stage, otherwise we'll lose
6611 STMT_VINFO_TYPE. We only support this for nested cycles
6612 without double reductions at the moment. */
6613 if (!nested_cycle
6614 || double_reduc
6615 || (!vec_stmt && !vectorizable_shift (stmt_info, gsi, NULL,
6616 NULL, cost_vec)))
6617 {
6618 if (dump_enabled_p ())
6619 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6620 "unsupported shift or rotation in reduction\n");
6621 return false;
6622 }
6623 }
6624 else
6625 {
6626 /* 4. Supportable by target? */
6627
6628 /* 4.1. check support for the operation in the loop */
6629 optab = optab_for_tree_code (code, vectype_in, optab_default);
6630 if (!optab)
6631 {
6632 if (dump_enabled_p ())
6633 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6634 "no optab.\n");
6635
6636 return false;
6637 }
6638
6639 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6640 {
6641 if (dump_enabled_p ())
6642 dump_printf (MSG_NOTE, "op not supported by target.\n");
6643
6644 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6645 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6646 return false;
6647
6648 if (dump_enabled_p ())
6649 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6650 }
6651
6652 /* Worthwhile without SIMD support? */
6653 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6654 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6655 {
6656 if (dump_enabled_p ())
6657 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6658 "not worthwhile without SIMD support.\n");
6659
6660 return false;
6661 }
6662 }
6663
6664 /* 4.2. Check support for the epilog operation.
6665
6666 If STMT represents a reduction pattern, then the type of the
6667 reduction variable may be different than the type of the rest
6668 of the arguments. For example, consider the case of accumulation
6669 of shorts into an int accumulator.  The original code:
6670 S1: int_a = (int) short_a;
6671 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6672
6673 was replaced with:
6674 STMT: int_acc = widen_sum <short_a, int_acc>
6675
6676 This means that:
6677 1. The tree-code that is used to create the vector operation in the
6678 epilog code (that reduces the partial results) is not the
6679 tree-code of STMT, but is rather the tree-code of the original
6680 stmt from the pattern that STMT is replacing. I.e., in the example
6681 above we want to use 'widen_sum' in the loop, but 'plus' in the
6682 epilog.
6683 2. The type (mode) we use to check available target support
6684 for the vector operation to be created in the *epilog*, is
6685 determined by the type of the reduction variable (in the example
6686 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6687 However the type (mode) we use to check available target support
6688 for the vector operation to be created *inside the loop*, is
6689 determined by the type of the other arguments to STMT (in the
6690 example we'd check this: optab_handler (widen_sum_optab,
6691 vect_short_mode)).
6692
6693 This is contrary to "regular" reductions, in which the types of all
6694 the arguments are the same as the type of the reduction variable.
6695 For "regular" reductions we can therefore use the same vector type
6696 (and also the same tree-code) when generating the epilog code and
6697 when generating the code inside the loop. */
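   /* As an illustrative sketch of the pattern case above (types and array
      names chosen arbitrarily), the scalar source could be:

	  short short_a[N];
	  int int_acc = 0;
	  for (int i = 0; i < N; i++)
	    int_acc += (int) short_a[i];      <-- corresponds to S1 + S2 above

      inside the vectorized loop the partial sums are formed by a widening
      sum over the narrow (short) vector type, while the epilog that reduces
      the partial results uses a plain PLUS on the wide (int) vector type.  */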
6698
6699 if (orig_stmt_info
6700 && (reduction_type == TREE_CODE_REDUCTION
6701 || reduction_type == FOLD_LEFT_REDUCTION))
6702 {
6703 /* This is a reduction pattern: get the vectype from the type of the
6704 reduction variable, and get the tree-code from orig_stmt. */
6705 orig_code = gimple_assign_rhs_code (orig_stmt_info->stmt);
6706 gcc_assert (vectype_out);
6707 vec_mode = TYPE_MODE (vectype_out);
6708 }
6709 else
6710 {
6711 /* Regular reduction: the same vectype and tree-code as used for the
6712 vector code inside the loop can also be used for the epilog code. */
6713 orig_code = code;
6714
6715 if (code == MINUS_EXPR)
6716 orig_code = PLUS_EXPR;
6717
6718 /* For simple condition reductions, replace with the actual expression
6719 we want to base our reduction around. */
6720 if (reduction_type == CONST_COND_REDUCTION)
6721 {
6722 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6723 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6724 }
6725 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6726 orig_code = cond_reduc_op_code;
6727 }
6728
6729 reduc_fn = IFN_LAST;
6730
6731 if (reduction_type == TREE_CODE_REDUCTION
6732 || reduction_type == FOLD_LEFT_REDUCTION
6733 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6734 || reduction_type == CONST_COND_REDUCTION)
6735 {
6736 if (reduction_type == FOLD_LEFT_REDUCTION
6737 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6738 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6739 {
6740 if (reduc_fn != IFN_LAST
6741 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6742 OPTIMIZE_FOR_SPEED))
6743 {
6744 if (dump_enabled_p ())
6745 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6746 "reduc op not supported by target.\n");
6747
6748 reduc_fn = IFN_LAST;
6749 }
6750 }
6751 else
6752 {
6753 if (!nested_cycle || double_reduc)
6754 {
6755 if (dump_enabled_p ())
6756 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6757 "no reduc code for scalar code.\n");
6758
6759 return false;
6760 }
6761 }
6762 }
6763 else if (reduction_type == COND_REDUCTION)
6764 {
6765 int scalar_precision
6766 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6767 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6768 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6769 nunits_out);
6770
6771 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6772 OPTIMIZE_FOR_SPEED))
6773 reduc_fn = IFN_REDUC_MAX;
6774 }
6775
6776 if (reduction_type != EXTRACT_LAST_REDUCTION
6777 && (!nested_cycle || double_reduc)
6778 && reduc_fn == IFN_LAST
6779 && !nunits_out.is_constant ())
6780 {
6781 if (dump_enabled_p ())
6782 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6783 "missing target support for reduction on"
6784 " variable-length vectors.\n");
6785 return false;
6786 }
6787
6788 /* For SLP reductions, see if there is a neutral value we can use. */
6789 tree neutral_op = NULL_TREE;
6790 if (slp_node)
6791 neutral_op = neutral_op_for_slp_reduction
6792 (slp_node_instance->reduc_phis, code,
6793 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6794
6795 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6796 {
6797 /* We can't support in-order reductions of code such as this:
6798
6799 for (int i = 0; i < n1; ++i)
6800 for (int j = 0; j < n2; ++j)
6801 l += a[j];
6802
6803 since GCC effectively transforms the loop when vectorizing:
6804
6805 for (int i = 0; i < n1 / VF; ++i)
6806 for (int j = 0; j < n2; ++j)
6807 for (int k = 0; k < VF; ++k)
6808 l += a[j];
6809
6810 which is a reassociation of the original operation. */
6811 if (dump_enabled_p ())
6812 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6813 "in-order double reduction not supported.\n");
6814
6815 return false;
6816 }
6817
6818 if (reduction_type == FOLD_LEFT_REDUCTION
6819 && slp_node
6820 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6821 {
6822 /* We cannot use in-order reductions in this case because there is
6823 an implicit reassociation of the operations involved. */
6824 if (dump_enabled_p ())
6825 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6826 "in-order unchained SLP reductions not supported.\n");
6827 return false;
6828 }
6829
6830 /* For double reductions, and for SLP reductions with a neutral value,
6831 we construct a variable-length initial vector by loading a vector
6832 full of the neutral value and then shift-and-inserting the start
6833 values into the low-numbered elements. */
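   /* A sketch for a single start value, assuming a PLUS reduction whose
      neutral value is 0:
	  { 0, 0, ..., 0 }        -- vector of the neutral value
	  shift up one element and insert the start value at the low end
	  { init, 0, ..., 0 }
      which is why IFN_VEC_SHL_INSERT support is checked below.  */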
6834 if ((double_reduc || neutral_op)
6835 && !nunits_out.is_constant ()
6836 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6837 vectype_out, OPTIMIZE_FOR_SPEED))
6838 {
6839 if (dump_enabled_p ())
6840 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6841 "reduction on variable-length vectors requires"
6842 " target support for a vector-shift-and-insert"
6843 " operation.\n");
6844 return false;
6845 }
6846
6847 /* Check extra constraints for variable-length unchained SLP reductions. */
6848 if (STMT_SLP_TYPE (stmt_info)
6849 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6850 && !nunits_out.is_constant ())
6851 {
6852 /* We checked above that we could build the initial vector when
6853 there's a neutral element value. Check here for the case in
6854 which each SLP statement has its own initial value and in which
6855 that value needs to be repeated for every instance of the
6856 statement within the initial vector. */
6857 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6858 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6859 if (!neutral_op
6860 && !can_duplicate_and_interleave_p (group_size, elt_mode))
6861 {
6862 if (dump_enabled_p ())
6863 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6864 "unsupported form of SLP reduction for"
6865 " variable-length vectors: cannot build"
6866 " initial vector.\n");
6867 return false;
6868 }
6869 /* The epilogue code relies on the number of elements being a multiple
6870 of the group size. The duplicate-and-interleave approach to setting
6871 up the initial vector does too. */
6872 if (!multiple_p (nunits_out, group_size))
6873 {
6874 if (dump_enabled_p ())
6875 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6876 "unsupported form of SLP reduction for"
6877 " variable-length vectors: the vector size"
6878 " is not a multiple of the number of results.\n");
6879 return false;
6880 }
6881 }
6882
6883 /* In case of widening multiplication by a constant, we update the type
6884 of the constant to be the type of the other operand. We check that the
6885 constant fits the type in the pattern recognition pass. */
6886 if (code == DOT_PROD_EXPR
6887 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6888 {
6889 if (TREE_CODE (ops[0]) == INTEGER_CST)
6890 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6891 else if (TREE_CODE (ops[1]) == INTEGER_CST)
6892 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6893 else
6894 {
6895 if (dump_enabled_p ())
6896 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6897 "invalid types in dot-prod\n");
6898
6899 return false;
6900 }
6901 }
6902
6903 if (reduction_type == COND_REDUCTION)
6904 {
6905 widest_int ni;
6906
6907 if (! max_loop_iterations (loop, &ni))
6908 {
6909 if (dump_enabled_p ())
6910 dump_printf_loc (MSG_NOTE, vect_location,
6911 "loop count not known, cannot create cond "
6912 "reduction.\n");
6913 return false;
6914 }
6915 /* Convert backedges to iterations. */
6916 ni += 1;
6917
6918 /* The additional index will be the same type as the condition. Check
6919 that the loop iteration count fits into this type less one (the zero
6920 slot is reserved for when there are no matches). */
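      /* For example (illustrative): if the condition type is unsigned char,
	 max_index is 255, so the check below allows at most 254 iterations;
	 index 0 is reserved for the "no match" case.  */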
6921 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6922 if (wi::geu_p (ni, wi::to_widest (max_index)))
6923 {
6924 if (dump_enabled_p ())
6925 dump_printf_loc (MSG_NOTE, vect_location,
6926 "loop size is greater than data size.\n");
6927 return false;
6928 }
6929 }
6930
6931 /* In case the vectorization factor (VF) is bigger than the number
6932 of elements that we can fit in a vectype (nunits), we have to generate
6933 more than one vector stmt - i.e. we need to "unroll" the
6934 vector stmt by a factor VF/nunits. For more details see documentation
6935 in vectorizable_operation. */
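   /* For instance (illustrative numbers): with a vectorization factor of 8
      and a vectype holding 4 elements, ncopies is 8/4 = 2, i.e. each scalar
      reduction stmt is replaced by two vector stmts.  */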
6936
6937 /* If the reduction is used in an outer loop we need to generate
6938 VF intermediate results, like so (e.g. for ncopies=2):
6939 r0 = phi (init, r0)
6940 r1 = phi (init, r1)
6941 r0 = x0 + r0;
6942 r1 = x1 + r1;
6943 (i.e. we generate VF results in 2 registers).
6944 In this case we have a separate def-use cycle for each copy, and therefore
6945 for each copy we get the vector def for the reduction variable from the
6946 respective phi node created for this copy.
6947
6948 Otherwise (the reduction is unused in the loop nest), we can combine
6949 together intermediate results, like so (e.g. for ncopies=2):
6950 r = phi (init, r)
6951 r = x0 + r;
6952 r = x1 + r;
6953 (i.e. we generate VF/2 results in a single register).
6954 In this case for each copy we get the vector def for the reduction variable
6955 from the vectorized reduction operation generated in the previous iteration.
6956
6957 This only works when we see both the reduction PHI and its only consumer
6958 in vectorizable_reduction and there are no intermediate stmts
6959 participating. */
6960 stmt_vec_info use_stmt_info;
6961 tree reduc_phi_result = gimple_phi_result (reduc_def_phi);
6962 if (ncopies > 1
6963 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6964 && (use_stmt_info = loop_vinfo->lookup_single_use (reduc_phi_result))
6965 && vect_stmt_to_vectorize (use_stmt_info) == stmt_info)
6966 {
6967 single_defuse_cycle = true;
6968 epilog_copies = 1;
6969 }
6970 else
6971 epilog_copies = ncopies;
6972
6973 /* If the reduction stmt is one of the patterns that have a lane-reducing
6974 operation embedded, we cannot handle the case of ! single_defuse_cycle. */
6975 if ((ncopies > 1
6976 && ! single_defuse_cycle)
6977 && (code == DOT_PROD_EXPR
6978 || code == WIDEN_SUM_EXPR
6979 || code == SAD_EXPR))
6980 {
6981 if (dump_enabled_p ())
6982 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6983 "multi def-use cycle not possible for lane-reducing "
6984 "reduction operation\n");
6985 return false;
6986 }
6987
6988 if (slp_node)
6989 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6990 else
6991 vec_num = 1;
6992
6993 internal_fn cond_fn = get_conditional_internal_fn (code);
6994 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6995 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
6996
6997 if (!vec_stmt) /* transformation not required. */
6998 {
6999 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
7000 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7001 {
7002 if (reduction_type != FOLD_LEFT_REDUCTION
7003 && !mask_by_cond_expr
7004 && (cond_fn == IFN_LAST
7005 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7006 OPTIMIZE_FOR_SPEED)))
7007 {
7008 if (dump_enabled_p ())
7009 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7010 "can't use a fully-masked loop because no"
7011 " conditional operation is available.\n");
7012 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7013 }
7014 else if (reduc_index == -1)
7015 {
7016 if (dump_enabled_p ())
7017 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7018 "can't use a fully-masked loop for chained"
7019 " reductions.\n");
7020 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7021 }
7022 else
7023 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7024 vectype_in);
7025 }
7026 if (dump_enabled_p ()
7027 && reduction_type == FOLD_LEFT_REDUCTION)
7028 dump_printf_loc (MSG_NOTE, vect_location,
7029 "using an in-order (fold-left) reduction.\n");
7030 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7031 return true;
7032 }
7033
7034 /* Transform. */
7035
7036 if (dump_enabled_p ())
7037 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7038
7039 /* FORNOW: Multiple types are not supported for condition. */
7040 if (code == COND_EXPR)
7041 gcc_assert (ncopies == 1);
7042
7043 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7044
7045 if (reduction_type == FOLD_LEFT_REDUCTION)
7046 return vectorize_fold_left_reduction
7047 (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7048 reduc_fn, ops, vectype_in, reduc_index, masks);
7049
7050 if (reduction_type == EXTRACT_LAST_REDUCTION)
7051 {
7052 gcc_assert (!slp_node);
7053 return vectorizable_condition (stmt_info, gsi, vec_stmt,
7054 true, NULL, NULL);
7055 }
7056
7057 /* Create the destination vector */
7058 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7059
7060 prev_stmt_info = NULL;
7061 prev_phi_info = NULL;
7062 if (!slp_node)
7063 {
7064 vec_oprnds0.create (1);
7065 vec_oprnds1.create (1);
7066 if (op_type == ternary_op)
7067 vec_oprnds2.create (1);
7068 }
7069
7070 phis.create (vec_num);
7071 vect_defs.create (vec_num);
7072 if (!slp_node)
7073 vect_defs.quick_push (NULL_TREE);
7074
7075 if (slp_node)
7076 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7077 else
7078 phis.quick_push (STMT_VINFO_VEC_STMT (reduc_def_info));
7079
7080 for (j = 0; j < ncopies; j++)
7081 {
7082 if (code == COND_EXPR)
7083 {
7084 gcc_assert (!slp_node);
7085 vectorizable_condition (stmt_info, gsi, vec_stmt,
7086 true, NULL, NULL);
7087 break;
7088 }
7089 if (code == LSHIFT_EXPR
7090 || code == RSHIFT_EXPR)
7091 {
7092 vectorizable_shift (stmt_info, gsi, vec_stmt, slp_node, NULL);
7093 break;
7094 }
7095
7096 /* Handle uses. */
7097 if (j == 0)
7098 {
7099 if (slp_node)
7100 {
7101 /* Get vec defs for all the operands except the reduction index,
7102 ensuring the ordering of the ops in the vector is kept. */
7103 auto_vec<tree, 3> slp_ops;
7104 auto_vec<vec<tree>, 3> vec_defs;
7105
7106 slp_ops.quick_push (ops[0]);
7107 slp_ops.quick_push (ops[1]);
7108 if (op_type == ternary_op)
7109 slp_ops.quick_push (ops[2]);
7110
7111 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7112
7113 vec_oprnds0.safe_splice (vec_defs[0]);
7114 vec_defs[0].release ();
7115 vec_oprnds1.safe_splice (vec_defs[1]);
7116 vec_defs[1].release ();
7117 if (op_type == ternary_op)
7118 {
7119 vec_oprnds2.safe_splice (vec_defs[2]);
7120 vec_defs[2].release ();
7121 }
7122 }
7123 else
7124 {
7125 vec_oprnds0.quick_push
7126 (vect_get_vec_def_for_operand (ops[0], stmt_info));
7127 vec_oprnds1.quick_push
7128 (vect_get_vec_def_for_operand (ops[1], stmt_info));
7129 if (op_type == ternary_op)
7130 vec_oprnds2.quick_push
7131 (vect_get_vec_def_for_operand (ops[2], stmt_info));
7132 }
7133 }
7134 else
7135 {
7136 if (!slp_node)
7137 {
7138 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7139
7140 if (single_defuse_cycle && reduc_index == 0)
7141 vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
7142 else
7143 vec_oprnds0[0]
7144 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7145 vec_oprnds0[0]);
7146 if (single_defuse_cycle && reduc_index == 1)
7147 vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
7148 else
7149 vec_oprnds1[0]
7150 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7151 vec_oprnds1[0]);
7152 if (op_type == ternary_op)
7153 {
7154 if (single_defuse_cycle && reduc_index == 2)
7155 vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
7156 else
7157 vec_oprnds2[0]
7158 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7159 vec_oprnds2[0]);
7160 }
7161 }
7162 }
7163
7164 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7165 {
7166 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7167 if (masked_loop_p && !mask_by_cond_expr)
7168 {
7169 /* Make sure that the reduction accumulator is vop[0]. */
7170 if (reduc_index == 1)
7171 {
7172 gcc_assert (commutative_tree_code (code));
7173 std::swap (vop[0], vop[1]);
7174 }
7175 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7176 vectype_in, i * ncopies + j);
7177 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7178 vop[0], vop[1],
7179 vop[0]);
7180 new_temp = make_ssa_name (vec_dest, call);
7181 gimple_call_set_lhs (call, new_temp);
7182 gimple_call_set_nothrow (call, true);
7183 new_stmt_info
7184 = vect_finish_stmt_generation (stmt_info, call, gsi);
7185 }
7186 else
7187 {
7188 if (op_type == ternary_op)
7189 vop[2] = vec_oprnds2[i];
7190
7191 if (masked_loop_p && mask_by_cond_expr)
7192 {
7193 tree mask = vect_get_loop_mask (gsi, masks,
7194 vec_num * ncopies,
7195 vectype_in, i * ncopies + j);
7196 build_vect_cond_expr (code, vop, mask, gsi);
7197 }
7198
7199 gassign *new_stmt = gimple_build_assign (vec_dest, code,
7200 vop[0], vop[1], vop[2]);
7201 new_temp = make_ssa_name (vec_dest, new_stmt);
7202 gimple_assign_set_lhs (new_stmt, new_temp);
7203 new_stmt_info
7204 = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
7205 }
7206
7207 if (slp_node)
7208 {
7209 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
7210 vect_defs.quick_push (new_temp);
7211 }
7212 else
7213 vect_defs[0] = new_temp;
7214 }
7215
7216 if (slp_node)
7217 continue;
7218
7219 if (j == 0)
7220 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7221 else
7222 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
7223
7224 prev_stmt_info = new_stmt_info;
7225 }
7226
7227 /* Finalize the reduction-phi (set its arguments) and create the
7228 epilog reduction code. */
7229 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7230 vect_defs[0] = gimple_get_lhs ((*vec_stmt)->stmt);
7231
7232 vect_create_epilog_for_reduction (vect_defs, stmt_info, reduc_def_phi,
7233 epilog_copies, reduc_fn, phis,
7234 double_reduc, slp_node, slp_node_instance,
7235 cond_reduc_val, cond_reduc_op_code,
7236 neutral_op);
7237
7238 return true;
7239 }
7240
7241 /* Function vect_min_worthwhile_factor.
7242
7243 For a loop where we could vectorize the operation indicated by CODE,
7244 return the minimum vectorization factor that makes it worthwhile
7245 to use generic vectors. */
7246 static unsigned int
7247 vect_min_worthwhile_factor (enum tree_code code)
7248 {
7249 switch (code)
7250 {
7251 case PLUS_EXPR:
7252 case MINUS_EXPR:
7253 case NEGATE_EXPR:
7254 return 4;
7255
7256 case BIT_AND_EXPR:
7257 case BIT_IOR_EXPR:
7258 case BIT_XOR_EXPR:
7259 case BIT_NOT_EXPR:
7260 return 2;
7261
7262 default:
7263 return INT_MAX;
7264 }
7265 }
7266
7267 /* Return true if VINFO indicates we are doing loop vectorization and if
7268 it is worth decomposing CODE operations into scalar operations for
7269 that loop's vectorization factor. */
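/* For example (illustrative): for CODE == PLUS_EXPR the function below
   returns true only when loop vectorization is being done with a
   compile-time constant vectorization factor of at least 4
   (see vect_min_worthwhile_factor), so a two-lane loop would not be
   considered worthwhile.  */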
7270
7271 bool
7272 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7273 {
7274 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7275 unsigned HOST_WIDE_INT value;
7276 return (loop_vinfo
7277 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7278 && value >= vect_min_worthwhile_factor (code));
7279 }
7280
7281 /* Function vectorizable_induction
7282
7283 Check if STMT_INFO performs an induction computation that can be vectorized.
7284 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7285 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7286 Return true if STMT_INFO is vectorizable in this way. */
7287
7288 bool
7289 vectorizable_induction (stmt_vec_info stmt_info,
7290 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7291 stmt_vec_info *vec_stmt, slp_tree slp_node,
7292 stmt_vector_for_cost *cost_vec)
7293 {
7294 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7295 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7296 unsigned ncopies;
7297 bool nested_in_vect_loop = false;
7298 struct loop *iv_loop;
7299 tree vec_def;
7300 edge pe = loop_preheader_edge (loop);
7301 basic_block new_bb;
7302 tree new_vec, vec_init, vec_step, t;
7303 tree new_name;
7304 gimple *new_stmt;
7305 gphi *induction_phi;
7306 tree induc_def, vec_dest;
7307 tree init_expr, step_expr;
7308 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7309 unsigned i;
7310 tree expr;
7311 gimple_seq stmts;
7312 imm_use_iterator imm_iter;
7313 use_operand_p use_p;
7314 gimple *exit_phi;
7315 edge latch_e;
7316 tree loop_arg;
7317 gimple_stmt_iterator si;
7318
7319 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7320 if (!phi)
7321 return false;
7322
7323 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7324 return false;
7325
7326 /* Make sure it was recognized as induction computation. */
7327 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7328 return false;
7329
7330 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7331 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7332
7333 if (slp_node)
7334 ncopies = 1;
7335 else
7336 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7337 gcc_assert (ncopies >= 1);
7338
7339 /* FORNOW. These restrictions should be relaxed. */
7340 if (nested_in_vect_loop_p (loop, stmt_info))
7341 {
7342 imm_use_iterator imm_iter;
7343 use_operand_p use_p;
7344 gimple *exit_phi;
7345 edge latch_e;
7346 tree loop_arg;
7347
7348 if (ncopies > 1)
7349 {
7350 if (dump_enabled_p ())
7351 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7352 "multiple types in nested loop.\n");
7353 return false;
7354 }
7355
7356 /* FORNOW: outer loop induction with SLP not supported. */
7357 if (STMT_SLP_TYPE (stmt_info))
7358 return false;
7359
7360 exit_phi = NULL;
7361 latch_e = loop_latch_edge (loop->inner);
7362 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7363 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7364 {
7365 gimple *use_stmt = USE_STMT (use_p);
7366 if (is_gimple_debug (use_stmt))
7367 continue;
7368
7369 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7370 {
7371 exit_phi = use_stmt;
7372 break;
7373 }
7374 }
7375 if (exit_phi)
7376 {
7377 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7378 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7379 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7380 {
7381 if (dump_enabled_p ())
7382 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7383 "inner-loop induction only used outside "
7384 "of the outer vectorized loop.\n");
7385 return false;
7386 }
7387 }
7388
7389 nested_in_vect_loop = true;
7390 iv_loop = loop->inner;
7391 }
7392 else
7393 iv_loop = loop;
7394 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7395
7396 if (slp_node && !nunits.is_constant ())
7397 {
7398 /* The current SLP code creates the initial value element-by-element. */
7399 if (dump_enabled_p ())
7400 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7401 "SLP induction not supported for variable-length"
7402 " vectors.\n");
7403 return false;
7404 }
7405
7406 if (!vec_stmt) /* transformation not required. */
7407 {
7408 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7409 DUMP_VECT_SCOPE ("vectorizable_induction");
7410 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7411 return true;
7412 }
7413
7414 /* Transform. */
7415
7416 /* Compute a vector variable, initialized with the first VF values of
7417 the induction variable. E.g., for an iv with IV_PHI='X' and
7418 evolution S, for a vector of 4 units, we want to compute:
7419 [X, X + S, X + 2*S, X + 3*S]. */
7420
7421 if (dump_enabled_p ())
7422 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7423
7424 latch_e = loop_latch_edge (iv_loop);
7425 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7426
7427 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7428 gcc_assert (step_expr != NULL_TREE);
7429
7430 pe = loop_preheader_edge (iv_loop);
7431 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7432 loop_preheader_edge (iv_loop));
7433
7434 stmts = NULL;
7435 if (!nested_in_vect_loop)
7436 {
7437 /* Convert the initial value to the desired type. */
7438 tree new_type = TREE_TYPE (vectype);
7439 init_expr = gimple_convert (&stmts, new_type, init_expr);
7440
7441 /* If we are using the loop mask to "peel" for alignment then we need
7442 to adjust the start value here. */
7443 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7444 if (skip_niters != NULL_TREE)
7445 {
7446 if (FLOAT_TYPE_P (vectype))
7447 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7448 skip_niters);
7449 else
7450 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7451 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7452 skip_niters, step_expr);
7453 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7454 init_expr, skip_step);
7455 }
7456 }
7457
7458 /* Convert the step to the desired type. */
7459 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7460
7461 if (stmts)
7462 {
7463 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7464 gcc_assert (!new_bb);
7465 }
7466
7467 /* Find the first insertion point in the BB. */
7468 basic_block bb = gimple_bb (phi);
7469 si = gsi_after_labels (bb);
7470
7471 /* For SLP induction we have to generate several IVs as for example
7472 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7473 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7474 [VF*S, VF*S, VF*S, VF*S] for all. */
7475 if (slp_node)
7476 {
7477 /* Enforced above. */
7478 unsigned int const_nunits = nunits.to_constant ();
7479
7480 /* Generate [VF*S, VF*S, ... ]. */
7481 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7482 {
7483 expr = build_int_cst (integer_type_node, vf);
7484 expr = fold_convert (TREE_TYPE (step_expr), expr);
7485 }
7486 else
7487 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7488 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7489 expr, step_expr);
7490 if (! CONSTANT_CLASS_P (new_name))
7491 new_name = vect_init_vector (stmt_info, new_name,
7492 TREE_TYPE (step_expr), NULL);
7493 new_vec = build_vector_from_val (vectype, new_name);
7494 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7495
7496 /* Now generate the IVs. */
7497 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7498 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7499 unsigned elts = const_nunits * nvects;
7500 unsigned nivs = least_common_multiple (group_size,
7501 const_nunits) / const_nunits;
7502 gcc_assert (elts % group_size == 0);
7503 tree elt = init_expr;
7504 unsigned ivn;
7505 for (ivn = 0; ivn < nivs; ++ivn)
7506 {
7507 tree_vector_builder elts (vectype, const_nunits, 1);
7508 stmts = NULL;
7509 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7510 {
7511 if (ivn*const_nunits + eltn >= group_size
7512 && (ivn * const_nunits + eltn) % group_size == 0)
7513 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7514 elt, step_expr);
7515 elts.quick_push (elt);
7516 }
7517 vec_init = gimple_build_vector (&stmts, &elts);
7518 if (stmts)
7519 {
7520 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7521 gcc_assert (!new_bb);
7522 }
7523
7524 /* Create the induction-phi that defines the induction-operand. */
7525 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7526 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7527 stmt_vec_info induction_phi_info
7528 = loop_vinfo->add_stmt (induction_phi);
7529 induc_def = PHI_RESULT (induction_phi);
7530
7531 /* Create the iv update inside the loop */
7532 vec_def = make_ssa_name (vec_dest);
7533 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7534 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7535 loop_vinfo->add_stmt (new_stmt);
7536
7537 /* Set the arguments of the phi node: */
7538 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7539 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7540 UNKNOWN_LOCATION);
7541
7542 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7543 }
7544
7545 /* Re-use IVs when we can. */
7546 if (ivn < nvects)
7547 {
7548 unsigned vfp
7549 = least_common_multiple (group_size, const_nunits) / group_size;
7550 /* Generate [VF'*S, VF'*S, ... ]. */
7551 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7552 {
7553 expr = build_int_cst (integer_type_node, vfp);
7554 expr = fold_convert (TREE_TYPE (step_expr), expr);
7555 }
7556 else
7557 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7558 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7559 expr, step_expr);
7560 if (! CONSTANT_CLASS_P (new_name))
7561 new_name = vect_init_vector (stmt_info, new_name,
7562 TREE_TYPE (step_expr), NULL);
7563 new_vec = build_vector_from_val (vectype, new_name);
7564 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7565 for (; ivn < nvects; ++ivn)
7566 {
7567 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7568 tree def;
7569 if (gimple_code (iv) == GIMPLE_PHI)
7570 def = gimple_phi_result (iv);
7571 else
7572 def = gimple_assign_lhs (iv);
7573 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7574 PLUS_EXPR,
7575 def, vec_step);
7576 if (gimple_code (iv) == GIMPLE_PHI)
7577 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7578 else
7579 {
7580 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7581 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7582 }
7583 SLP_TREE_VEC_STMTS (slp_node).quick_push
7584 (loop_vinfo->add_stmt (new_stmt));
7585 }
7586 }
7587
7588 return true;
7589 }
7590
7591 /* Create the vector that holds the initial_value of the induction. */
7592 if (nested_in_vect_loop)
7593 {
7594 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7595 been created during vectorization of previous stmts. We obtain it
7596 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7597 vec_init = vect_get_vec_def_for_operand (init_expr, stmt_info);
7598 /* If the initial value is not of proper type, convert it. */
7599 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7600 {
7601 new_stmt
7602 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7603 vect_simple_var,
7604 "vec_iv_"),
7605 VIEW_CONVERT_EXPR,
7606 build1 (VIEW_CONVERT_EXPR, vectype,
7607 vec_init));
7608 vec_init = gimple_assign_lhs (new_stmt);
7609 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7610 new_stmt);
7611 gcc_assert (!new_bb);
7612 loop_vinfo->add_stmt (new_stmt);
7613 }
7614 }
7615 else
7616 {
7617 /* iv_loop is the loop to be vectorized. Create:
7618 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7619 stmts = NULL;
7620 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7621
7622 unsigned HOST_WIDE_INT const_nunits;
7623 if (nunits.is_constant (&const_nunits))
7624 {
7625 tree_vector_builder elts (vectype, const_nunits, 1);
7626 elts.quick_push (new_name);
7627 for (i = 1; i < const_nunits; i++)
7628 {
7629 /* Create: new_name_i = new_name + step_expr */
7630 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7631 new_name, step_expr);
7632 elts.quick_push (new_name);
7633 }
7634 /* Create a vector from [new_name_0, new_name_1, ...,
7635 new_name_nunits-1] */
7636 vec_init = gimple_build_vector (&stmts, &elts);
7637 }
7638 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7639 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7640 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7641 new_name, step_expr);
7642 else
7643 {
7644 /* Build:
7645 [base, base, base, ...]
7646 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7647 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7648 gcc_assert (flag_associative_math);
7649 tree index = build_index_vector (vectype, 0, 1);
7650 tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7651 new_name);
7652 tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7653 step_expr);
7654 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7655 vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7656 vec_init, step_vec);
7657 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7658 vec_init, base_vec);
7659 }
7660
7661 if (stmts)
7662 {
7663 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7664 gcc_assert (!new_bb);
7665 }
7666 }
7667
7668
7669 /* Create the vector that holds the step of the induction. */
7670 if (nested_in_vect_loop)
7671 /* iv_loop is nested in the loop to be vectorized. Generate:
7672 vec_step = [S, S, S, S] */
7673 new_name = step_expr;
7674 else
7675 {
7676 /* iv_loop is the loop to be vectorized. Generate:
7677 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7678 gimple_seq seq = NULL;
7679 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7680 {
7681 expr = build_int_cst (integer_type_node, vf);
7682 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7683 }
7684 else
7685 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7686 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7687 expr, step_expr);
7688 if (seq)
7689 {
7690 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7691 gcc_assert (!new_bb);
7692 }
7693 }
7694
7695 t = unshare_expr (new_name);
7696 gcc_assert (CONSTANT_CLASS_P (new_name)
7697 || TREE_CODE (new_name) == SSA_NAME);
7698 new_vec = build_vector_from_val (vectype, t);
7699 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7700
7701
7702 /* Create the following def-use cycle:
7703 loop prolog:
7704 vec_init = ...
7705 vec_step = ...
7706 loop:
7707 vec_iv = PHI <vec_init, vec_loop>
7708 ...
7709 STMT
7710 ...
7711 vec_loop = vec_iv + vec_step; */
7712
7713 /* Create the induction-phi that defines the induction-operand. */
7714 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7715 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7716 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7717 induc_def = PHI_RESULT (induction_phi);
7718
7719 /* Create the iv update inside the loop */
7720 vec_def = make_ssa_name (vec_dest);
7721 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7722 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7723 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7724
7725 /* Set the arguments of the phi node: */
7726 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7727 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7728 UNKNOWN_LOCATION);
7729
7730 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7731
7732 /* In case the vectorization factor (VF) is bigger than the number
7733 of elements that we can fit in a vectype (nunits), we have to generate
7734 more than one vector stmt - i.e. we need to "unroll" the
7735 vector stmt by a factor VF/nunits. For more details see documentation
7736 in vectorizable_operation. */
7737
7738 if (ncopies > 1)
7739 {
7740 gimple_seq seq = NULL;
7741 stmt_vec_info prev_stmt_vinfo;
7742 /* FORNOW. This restriction should be relaxed. */
7743 gcc_assert (!nested_in_vect_loop);
7744
7745 /* Create the vector that holds the step of the induction. */
7746 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7747 {
7748 expr = build_int_cst (integer_type_node, nunits);
7749 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7750 }
7751 else
7752 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7753 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7754 expr, step_expr);
7755 if (seq)
7756 {
7757 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7758 gcc_assert (!new_bb);
7759 }
7760
7761 t = unshare_expr (new_name);
7762 gcc_assert (CONSTANT_CLASS_P (new_name)
7763 || TREE_CODE (new_name) == SSA_NAME);
7764 new_vec = build_vector_from_val (vectype, t);
7765 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7766
7767 vec_def = induc_def;
7768 prev_stmt_vinfo = induction_phi_info;
7769 for (i = 1; i < ncopies; i++)
7770 {
7771 /* vec_i = vec_prev + vec_step */
7772 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7773 vec_def, vec_step);
7774 vec_def = make_ssa_name (vec_dest, new_stmt);
7775 gimple_assign_set_lhs (new_stmt, vec_def);
7776
7777 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7778 new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7779 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7780 prev_stmt_vinfo = new_stmt_info;
7781 }
7782 }
7783
7784 if (nested_in_vect_loop)
7785 {
7786 /* Find the loop-closed exit-phi of the induction, and record
7787 the final vector of induction results: */
7788 exit_phi = NULL;
7789 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7790 {
7791 gimple *use_stmt = USE_STMT (use_p);
7792 if (is_gimple_debug (use_stmt))
7793 continue;
7794
7795 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7796 {
7797 exit_phi = use_stmt;
7798 break;
7799 }
7800 }
7801 if (exit_phi)
7802 {
7803 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7804 /* FORNOW. Currently not supporting the case that an inner-loop induction
7805 is not used in the outer-loop (i.e. only outside the outer-loop). */
7806 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7807 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7808
7809 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7810 if (dump_enabled_p ())
7811 dump_printf_loc (MSG_NOTE, vect_location,
7812 "vector of inductions after inner-loop:%G",
7813 new_stmt);
7814 }
7815 }
7816
7817
7818 if (dump_enabled_p ())
7819 dump_printf_loc (MSG_NOTE, vect_location,
7820 "transform induction: created def-use cycle: %G%G",
7821 induction_phi, SSA_NAME_DEF_STMT (vec_def));
7822
7823 return true;
7824 }
7825
7826 /* Function vectorizable_live_operation.
7827
7828 STMT_INFO computes a value that is used outside the loop. Check if
7829 it can be supported. */
7830
7831 bool
7832 vectorizable_live_operation (stmt_vec_info stmt_info,
7833 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7834 slp_tree slp_node, int slp_index,
7835 stmt_vec_info *vec_stmt,
7836 stmt_vector_for_cost *)
7837 {
7838 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7839 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7840 imm_use_iterator imm_iter;
7841 tree lhs, lhs_type, bitsize, vec_bitsize;
7842 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7843 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7844 int ncopies;
7845 gimple *use_stmt;
7846 auto_vec<tree> vec_oprnds;
7847 int vec_entry = 0;
7848 poly_uint64 vec_index = 0;
7849
7850 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7851
7852 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7853 return false;
7854
7855 /* FORNOW. CHECKME. */
7856 if (nested_in_vect_loop_p (loop, stmt_info))
7857 return false;
7858
7859 /* If STMT is not relevant and it is a simple assignment and its inputs are
7860 invariant then it can remain in place, unvectorized. The original last
7861 scalar value that it computes will be used. */
7862 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7863 {
7864 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
7865 if (dump_enabled_p ())
7866 dump_printf_loc (MSG_NOTE, vect_location,
7867 "statement is simple and uses invariant. Leaving in "
7868 "place.\n");
7869 return true;
7870 }
7871
7872 if (slp_node)
7873 ncopies = 1;
7874 else
7875 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7876
7877 if (slp_node)
7878 {
7879 gcc_assert (slp_index >= 0);
7880
7881 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7882 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7883
7884 /* Get the last occurrence of the scalar index from the concatenation of
7885 all the slp vectors. Calculate which slp vector it is and the index
7886 within. */
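	 /* Worked example with made-up numbers: num_vec = 2, nunits = 4,
	    num_scalar = 3 and slp_index = 1 give pos = 2*4 - 3 + 1 = 6,
	    i.e. lane 2 of the second SLP vector (vec_entry 1, vec_index 2).  */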
7887 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
7888
7889 /* Calculate which vector contains the result, and which lane of
7890 that vector we need. */
7891 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7892 {
7893 if (dump_enabled_p ())
7894 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7895 "Cannot determine which vector holds the"
7896 " final result.\n");
7897 return false;
7898 }
7899 }
7900
7901 if (!vec_stmt)
7902 {
7903 /* No transformation required. */
7904 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7905 {
7906 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7907 OPTIMIZE_FOR_SPEED))
7908 {
7909 if (dump_enabled_p ())
7910 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7911 "can't use a fully-masked loop because "
7912 "the target doesn't support extract last "
7913 "reduction.\n");
7914 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7915 }
7916 else if (slp_node)
7917 {
7918 if (dump_enabled_p ())
7919 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7920 "can't use a fully-masked loop because an "
7921 "SLP statement is live after the loop.\n");
7922 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7923 }
7924 else if (ncopies > 1)
7925 {
7926 if (dump_enabled_p ())
7927 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7928 "can't use a fully-masked loop because"
7929 " ncopies is greater than 1.\n");
7930 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7931 }
7932 else
7933 {
7934 gcc_assert (ncopies == 1 && !slp_node);
7935 vect_record_loop_mask (loop_vinfo,
7936 &LOOP_VINFO_MASKS (loop_vinfo),
7937 1, vectype);
7938 }
7939 }
7940 return true;
7941 }
7942
7943 /* Use the lhs of the original scalar statement. */
7944 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
7945
7946 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7947 : gimple_get_lhs (stmt);
7948 lhs_type = TREE_TYPE (lhs);
7949
7950 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7951 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7952 : TYPE_SIZE (TREE_TYPE (vectype)));
7953 vec_bitsize = TYPE_SIZE (vectype);
7954
7955 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
7956 tree vec_lhs, bitstart;
7957 if (slp_node)
7958 {
7959 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7960
7961 /* Get the correct slp vectorized stmt. */
7962 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
7963 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
7964 vec_lhs = gimple_phi_result (phi);
7965 else
7966 vec_lhs = gimple_get_lhs (vec_stmt);
7967
7968 /* Get entry to use. */
7969 bitstart = bitsize_int (vec_index);
7970 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7971 }
7972 else
7973 {
7974 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7975 vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
7976 gcc_checking_assert (ncopies == 1
7977 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7978
7979 /* For multiple copies, get the last copy. */
7980 for (int i = 1; i < ncopies; ++i)
7981 vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
7982
7983 /* Get the last lane in the vector. */
7984 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7985 }
7986
7987 gimple_seq stmts = NULL;
7988 tree new_tree;
7989 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7990 {
7991 /* Emit:
7992
7993 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
7994
7995 where VEC_LHS is the vectorized live-out result and MASK is
7996 the loop mask for the final iteration. */
7997 gcc_assert (ncopies == 1 && !slp_node);
7998 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
7999 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8000 1, vectype, 0);
8001 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
8002 scalar_type, mask, vec_lhs);
8003
8004 /* Convert the extracted vector element to the required scalar type. */
8005 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8006 }
8007 else
8008 {
8009 tree bftype = TREE_TYPE (vectype);
8010 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8011 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8012 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
8013 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8014 &stmts, true, NULL_TREE);
8015 }
8016
8017 if (stmts)
8018 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
8019
8020 /* Replace uses of lhs with the newly computed result. If the use stmt is a
8021 single-argument PHI, just replace all uses of the PHI result. This is needed
8022 because the lcssa PHI defining lhs may appear before the newly inserted stmt. */
8023 use_operand_p use_p;
8024 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8025 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8026 && !is_gimple_debug (use_stmt))
8027 {
8028 if (gimple_code (use_stmt) == GIMPLE_PHI
8029 && gimple_phi_num_args (use_stmt) == 1)
8030 {
8031 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8032 }
8033 else
8034 {
8035 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8036 SET_USE (use_p, new_tree);
8037 }
8038 update_stmt (use_stmt);
8039 }
8040
8041 return true;
8042 }
8043
8044 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8045
8046 static void
8047 vect_loop_kill_debug_uses (struct loop *loop, stmt_vec_info stmt_info)
8048 {
8049 ssa_op_iter op_iter;
8050 imm_use_iterator imm_iter;
8051 def_operand_p def_p;
8052 gimple *ustmt;
8053
8054 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8055 {
8056 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8057 {
8058 basic_block bb;
8059
8060 if (!is_gimple_debug (ustmt))
8061 continue;
8062
8063 bb = gimple_bb (ustmt);
8064
8065 if (!flow_bb_inside_loop_p (loop, bb))
8066 {
8067 if (gimple_debug_bind_p (ustmt))
8068 {
8069 if (dump_enabled_p ())
8070 dump_printf_loc (MSG_NOTE, vect_location,
8071 "killing debug use\n");
8072
8073 gimple_debug_bind_reset_value (ustmt);
8074 update_stmt (ustmt);
8075 }
8076 else
8077 gcc_unreachable ();
8078 }
8079 }
8080 }
8081 }
8082
8083 /* Given loop represented by LOOP_VINFO, return true if computation of
8084 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8085 otherwise. */
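/* For example (illustrative): if the niters type is 32-bit unsigned and the
   loop can execute 2^32 iterations, LOOP_VINFO_NITERSM1 is 0xffffffff and
   adding 1 wraps to 0, so the function below returns false.  */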
8086
8087 static bool
8088 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8089 {
8090 /* Constant case. */
8091 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8092 {
8093 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8094 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8095
8096 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8097 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8098 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8099 return true;
8100 }
8101
8102 widest_int max;
8103 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8104 /* Check the upper bound of loop niters. */
8105 if (get_max_loop_iterations (loop, &max))
8106 {
8107 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8108 signop sgn = TYPE_SIGN (type);
8109 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8110 if (max < type_max)
8111 return true;
8112 }
8113 return false;
8114 }
8115
8116 /* Return a mask type with half the number of elements as TYPE. */
8117
8118 tree
8119 vect_halve_mask_nunits (tree type)
8120 {
8121 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8122 return build_truth_vector_type (nunits, current_vector_size);
8123 }
8124
8125 /* Return a mask type with twice as many elements as TYPE. */
8126
8127 tree
8128 vect_double_mask_nunits (tree type)
8129 {
8130 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8131 return build_truth_vector_type (nunits, current_vector_size);
8132 }
8133
8134 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8135 contain a sequence of NVECTORS masks that each control a vector of type
8136 VECTYPE. */
8137
8138 void
8139 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8140 unsigned int nvectors, tree vectype)
8141 {
8142 gcc_assert (nvectors != 0);
8143 if (masks->length () < nvectors)
8144 masks->safe_grow_cleared (nvectors);
8145 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8146 /* The number of scalars per iteration and the number of vectors are
8147 both compile-time constants. */
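  /* For instance (illustrative): with NVECTORS = 2, an 8-element VECTYPE and
     a vectorization factor of 16, the rgroup's masks together control
     2 * 8 / 16 = 1 scalar per iteration.  */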
8148 unsigned int nscalars_per_iter
8149 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8150 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8151 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8152 {
8153 rgm->max_nscalars_per_iter = nscalars_per_iter;
8154 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8155 }
8156 }
8157
8158 /* Given a complete set of masks MASKS, extract mask number INDEX
8159 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8160 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8161
8162 See the comment above vec_loop_masks for more details about the mask
8163 arrangement. */
8164
8165 tree
8166 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8167 unsigned int nvectors, tree vectype, unsigned int index)
8168 {
8169 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8170 tree mask_type = rgm->mask_type;
8171
8172 /* Populate the rgroup's mask array, if this is the first time we've
8173 used it. */
8174 if (rgm->masks.is_empty ())
8175 {
8176 rgm->masks.safe_grow_cleared (nvectors);
8177 for (unsigned int i = 0; i < nvectors; ++i)
8178 {
8179 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8180 /* Provide a dummy definition until the real one is available. */
8181 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8182 rgm->masks[i] = mask;
8183 }
8184 }
8185
8186 tree mask = rgm->masks[index];
8187 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8188 TYPE_VECTOR_SUBPARTS (vectype)))
8189 {
8190 /* A loop mask for data type X can be reused for data type Y
8191 if X has N times more elements than Y and if Y's elements
8192 are N times bigger than X's. In this case each sequence
8193 of N elements in the loop mask will be all-zero or all-one.
8194 We can then view-convert the mask so that each sequence of
8195 N elements is replaced by a single element. */
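      /* For instance (illustrative): a mask of 8 elements controlling 8
	 short elements can be reused for 4 int elements; each pair of mask
	 elements is known to be all-zero or all-one, so the VIEW_CONVERT
	 below collapses every pair into a single wider element.  */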
8196 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8197 TYPE_VECTOR_SUBPARTS (vectype)));
8198 gimple_seq seq = NULL;
8199 mask_type = build_same_sized_truth_vector_type (vectype);
8200 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8201 if (seq)
8202 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8203 }
8204 return mask;
8205 }
8206
8207 /* Scale profiling counters by estimation for LOOP which is vectorized
8208 by factor VF. */
8209
8210 static void
8211 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8212 {
8213 edge preheader = loop_preheader_edge (loop);
8214 /* Reduce loop iterations by the vectorization factor. */
8215 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8216 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8217
8218 if (freq_h.nonzero_p ())
8219 {
8220 profile_probability p;
8221
8222 /* Avoid dropping loop body profile counter to 0 because of zero count
8223 in loop's preheader. */
8224 if (!(freq_e == profile_count::zero ()))
8225 freq_e = freq_e.force_nonzero ();
8226 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8227 scale_loop_frequencies (loop, p);
8228 }
8229
8230 edge exit_e = single_exit (loop);
8231 exit_e->probability = profile_probability::always ()
8232 .apply_scale (1, new_est_niter + 1);
8233
8234 edge exit_l = single_pred_edge (loop->latch);
8235 profile_probability prob = exit_l->probability;
8236 exit_l->probability = exit_e->probability.invert ();
8237 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8238 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8239 }
8240
8241 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8242 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8243 stmt_vec_info. */
8244
8245 static void
8246 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8247 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8248 {
8249 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8250 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8251
8252 if (dump_enabled_p ())
8253 dump_printf_loc (MSG_NOTE, vect_location,
8254 "------>vectorizing statement: %G", stmt_info->stmt);
8255
8256 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8257 vect_loop_kill_debug_uses (loop, stmt_info);
8258
8259 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8260 && !STMT_VINFO_LIVE_P (stmt_info))
8261 return;
8262
8263 if (STMT_VINFO_VECTYPE (stmt_info))
8264 {
8265 poly_uint64 nunits
8266 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8267 if (!STMT_SLP_TYPE (stmt_info)
8268 && maybe_ne (nunits, vf)
8269 && dump_enabled_p ())
8270 /* For SLP VF is set according to unrolling factor, and not
8271 to vector size, hence for SLP this print is not valid. */
8272 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8273 }
8274
8275 /* Pure SLP statements have already been vectorized. We still need
8276 to apply loop vectorization to hybrid SLP statements. */
8277 if (PURE_SLP_STMT (stmt_info))
8278 return;
8279
8280 if (dump_enabled_p ())
8281 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8282
8283 if (vect_transform_stmt (stmt_info, gsi, NULL, NULL))
8284 *seen_store = stmt_info;
8285 }
8286
8287 /* Function vect_transform_loop.
8288
8289 The analysis phase has determined that the loop is vectorizable.
8290 Vectorize the loop - create vectorized stmts to replace the scalar
8291 stmts in the loop, and update the loop exit condition.
8292 Returns the scalar epilogue loop, if any. */
8293
8294 struct loop *
8295 vect_transform_loop (loop_vec_info loop_vinfo)
8296 {
8297 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8298 struct loop *epilogue = NULL;
8299 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8300 int nbbs = loop->num_nodes;
8301 int i;
8302 tree niters_vector = NULL_TREE;
8303 tree step_vector = NULL_TREE;
8304 tree niters_vector_mult_vf = NULL_TREE;
8305 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8306 unsigned int lowest_vf = constant_lower_bound (vf);
8307 gimple *stmt;
8308 bool check_profitability = false;
8309 unsigned int th;
8310
8311 DUMP_VECT_SCOPE ("vec_transform_loop");
8312
8313 loop_vinfo->shared->check_datarefs ();
8314
8315 /* Use the more conservative vectorization threshold. If the number
8316 of iterations is constant, assume the cost check has been performed
8317 by our caller. If the threshold makes all loops profitable that
8318 run at least the (estimated) vectorization factor number of times,
8319 checking is pointless, too. */
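/* For instance (illustrative figures): with a cost-model threshold of 12
   and an estimated vectorization factor of 4, a loop whose trip count is
   unknown at compile time still gets a runtime profitability check
   (roughly "niters >= 12").  With a threshold of 3 no check is needed,
   because any execution that reaches the vector code runs at least VF
   iterations.  */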
8320 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8321 if (th >= vect_vf_for_cost (loop_vinfo)
8322 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8323 {
8324 if (dump_enabled_p ())
8325 dump_printf_loc (MSG_NOTE, vect_location,
8326 "Profitability threshold is %d loop iterations.\n",
8327 th);
8328 check_profitability = true;
8329 }
8330
8331 /* Make sure there exists a single-predecessor exit bb. Do this before
8332 versioning. */
8333 edge e = single_exit (loop);
8334 if (! single_pred_p (e->dest))
8335 {
8336 split_loop_exit_edge (e, true);
8337 if (dump_enabled_p ())
8338 dump_printf (MSG_NOTE, "split exit edge\n");
8339 }
8340
8341 /* Version the loop first, if required, so the profitability check
8342 comes first. */
8343
8344 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8345 {
8346 poly_uint64 versioning_threshold
8347 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8348 if (check_profitability
8349 && ordered_p (poly_uint64 (th), versioning_threshold))
8350 {
8351 versioning_threshold = ordered_max (poly_uint64 (th),
8352 versioning_threshold);
8353 check_profitability = false;
8354 }
8355 struct loop *sloop
8356 = vect_loop_versioning (loop_vinfo, th, check_profitability,
8357 versioning_threshold);
8358 sloop->force_vectorize = false;
8359 check_profitability = false;
8360 }
8361
8362 /* Make sure there exists a single-predecessor exit bb also on the
8363 scalar loop copy. Do this after versioning but before peeling
8364 so that the CFG structure is fine for both the scalar and the
8365 if-converted loop, and so that slpeel_duplicate_current_defs_from_edges
8366 sees matched loop-closed PHI nodes on the exit. */
8367 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8368 {
8369 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8370 if (! single_pred_p (e->dest))
8371 {
8372 split_loop_exit_edge (e, true);
8373 if (dump_enabled_p ())
8374 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8375 }
8376 }
8377
8378 tree niters = vect_build_loop_niters (loop_vinfo);
8379 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8380 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8381 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8382 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8383 &step_vector, &niters_vector_mult_vf, th,
8384 check_profitability, niters_no_overflow);
8385
8386 if (niters_vector == NULL_TREE)
8387 {
8388 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8389 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8390 && known_eq (lowest_vf, vf))
8391 {
8392 niters_vector
8393 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8394 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8395 step_vector = build_one_cst (TREE_TYPE (niters));
8396 }
8397 else
8398 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8399 &step_vector, niters_no_overflow);
8400 }
8401
8402 /* 1) Make sure the loop header has exactly two entries
8403 2) Make sure we have a preheader basic block. */
8404
8405 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8406
8407 split_edge (loop_preheader_edge (loop));
8408
8409 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8410 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8411 /* This will deal with any possible peeling. */
8412 vect_prepare_for_masked_peels (loop_vinfo);
8413
8414 /* Schedule the SLP instances first, then handle loop vectorization
8415 below. */
8416 if (!loop_vinfo->slp_instances.is_empty ())
8417 {
8418 DUMP_VECT_SCOPE ("scheduling SLP instances");
8419 vect_schedule_slp (loop_vinfo);
8420 }
8421
8422 /* FORNOW: the vectorizer supports only loops whose body consists
8423 of one basic block (header + empty latch). When the vectorizer
8424 supports more involved loop forms, the order in which the BBs are
8425 traversed will need to be reconsidered. */
8426
8427 for (i = 0; i < nbbs; i++)
8428 {
8429 basic_block bb = bbs[i];
8430 stmt_vec_info stmt_info;
8431
8432 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8433 gsi_next (&si))
8434 {
8435 gphi *phi = si.phi ();
8436 if (dump_enabled_p ())
8437 dump_printf_loc (MSG_NOTE, vect_location,
8438 "------>vectorizing phi: %G", phi);
8439 stmt_info = loop_vinfo->lookup_stmt (phi);
8440 if (!stmt_info)
8441 continue;
8442
8443 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8444 vect_loop_kill_debug_uses (loop, stmt_info);
8445
8446 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8447 && !STMT_VINFO_LIVE_P (stmt_info))
8448 continue;
8449
8450 if (STMT_VINFO_VECTYPE (stmt_info)
8451 && (maybe_ne
8452 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8453 && dump_enabled_p ())
8454 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8455
8456 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8457 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8458 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8459 && ! PURE_SLP_STMT (stmt_info))
8460 {
8461 if (dump_enabled_p ())
8462 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8463 vect_transform_stmt (stmt_info, NULL, NULL, NULL);
8464 }
8465 }
8466
8467 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8468 !gsi_end_p (si);)
8469 {
8470 stmt = gsi_stmt (si);
8471 /* During vectorization remove existing clobber stmts. */
8472 if (gimple_clobber_p (stmt))
8473 {
8474 unlink_stmt_vdef (stmt);
8475 gsi_remove (&si, true);
8476 release_defs (stmt);
8477 }
8478 else
8479 {
8480 stmt_info = loop_vinfo->lookup_stmt (stmt);
8481
8482 /* Vector stmts created in the outer loop during vectorization of
8483 stmts in an inner loop may not have a stmt_info and do not
8484 need to be vectorized. */
8485 stmt_vec_info seen_store = NULL;
8486 if (stmt_info)
8487 {
8488 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8489 {
8490 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8491 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8492 !gsi_end_p (subsi); gsi_next (&subsi))
8493 {
8494 stmt_vec_info pat_stmt_info
8495 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
8496 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
8497 &si, &seen_store);
8498 }
8499 stmt_vec_info pat_stmt_info
8500 = STMT_VINFO_RELATED_STMT (stmt_info);
8501 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
8502 &seen_store);
8503 }
8504 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
8505 &seen_store);
8506 }
8507 gsi_next (&si);
8508 if (seen_store)
8509 {
8510 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8511 /* Interleaving. The vectorization of the interleaving
8512 chain was completed - free all the stores in
8513 the chain. */
8514 vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8515 else
8516 /* Free the attached stmt_vec_info and remove the stmt. */
8517 loop_vinfo->remove_stmt (stmt_info);
8518 }
8519 }
8520 }
8521
8522 /* Stub out scalar statements that must not survive vectorization.
8523 Doing this here helps with grouped statements, or statements that
8524 are involved in patterns. */
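/* For example (the names below are illustrative): a scalar call left over
   from if-conversion such as
     _5 = .MASK_LOAD (p_3, 32B, _ifc__42);
   must not survive as a scalar statement once the loop is vectorized, so
   the walk below turns it into the trivial assignment
     _5 = 0;  */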
8525 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8526 !gsi_end_p (gsi); gsi_next (&gsi))
8527 {
8528 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8529 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8530 {
8531 tree lhs = gimple_get_lhs (call);
8532 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8533 {
8534 tree zero = build_zero_cst (TREE_TYPE (lhs));
8535 gimple *new_stmt = gimple_build_assign (lhs, zero);
8536 gsi_replace (&gsi, new_stmt, true);
8537 }
8538 }
8539 }
8540 } /* BBs in loop */
8541
8542 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8543 a zero NITERS becomes a nonzero NITERS_VECTOR. */
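/* For instance (illustrative, assuming a NITERS of 0 denotes a wrapped
   2^32 count for a 32-bit type): with VF = 4, NITERS_VECTOR is 2^30,
   which is nonzero and representable, so an IV stepping by 1 up to
   NITERS_VECTOR cannot wrap.  */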
8544 if (integer_onep (step_vector))
8545 niters_no_overflow = true;
8546 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8547 niters_vector_mult_vf, !niters_no_overflow);
8548
8549 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8550 scale_profile_for_vect_loop (loop, assumed_vf);
8551
8552 /* True if the final iteration might not handle a full vector's
8553 worth of scalar iterations. */
8554 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8555 /* The minimum number of iterations performed by the epilogue. This
8556 is 1 when peeling for gaps because we always need a final scalar
8557 iteration. */
8558 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8559 /* +1 to convert latch counts to loop iteration counts,
8560 -min_epilogue_iters to remove iterations that cannot be performed
8561 by the vector code. */
8562 int bias_for_lowest = 1 - min_epilogue_iters;
8563 int bias_for_assumed = bias_for_lowest;
8564 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8565 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8566 {
8567 /* When the amount of peeling is known at compile time, the first
8568 iteration will have exactly alignment_npeels active elements.
8569 In the worst case it will have at least one. */
8570 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8571 bias_for_lowest += lowest_vf - min_first_active;
8572 bias_for_assumed += assumed_vf - min_first_active;
8573 }
8574 /* In these calculations the "- 1" converts loop iteration counts
8575 back to latch counts. */
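/* A worked example of the adjustment below (illustrative numbers): with
   100 scalar iterations the latch-based upper bound is 99.  For VF = 4,
   no peeling for gaps and no full masking, bias_for_lowest is 1, so the
   new bound becomes (99 + 1) / 4 - 1 = 24, i.e. 25 vector iterations.  */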
8576 if (loop->any_upper_bound)
8577 loop->nb_iterations_upper_bound
8578 = (final_iter_may_be_partial
8579 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8580 lowest_vf) - 1
8581 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8582 lowest_vf) - 1);
8583 if (loop->any_likely_upper_bound)
8584 loop->nb_iterations_likely_upper_bound
8585 = (final_iter_may_be_partial
8586 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8587 + bias_for_lowest, lowest_vf) - 1
8588 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8589 + bias_for_lowest, lowest_vf) - 1);
8590 if (loop->any_estimate)
8591 loop->nb_iterations_estimate
8592 = (final_iter_may_be_partial
8593 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8594 assumed_vf) - 1
8595 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8596 assumed_vf) - 1);
8597
8598 if (dump_enabled_p ())
8599 {
8600 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8601 {
8602 dump_printf_loc (MSG_NOTE, vect_location,
8603 "LOOP VECTORIZED\n");
8604 if (loop->inner)
8605 dump_printf_loc (MSG_NOTE, vect_location,
8606 "OUTER LOOP VECTORIZED\n");
8607 dump_printf (MSG_NOTE, "\n");
8608 }
8609 else
8610 {
8611 dump_printf_loc (MSG_NOTE, vect_location,
8612 "LOOP EPILOGUE VECTORIZED (VS=");
8613 dump_dec (MSG_NOTE, current_vector_size);
8614 dump_printf (MSG_NOTE, ")\n");
8615 }
8616 }
8617
8618 /* Loops vectorized with a variable factor won't benefit from
8619 unrolling/peeling. */
8620 if (!vf.is_constant ())
8621 {
8622 loop->unroll = 1;
8623 if (dump_enabled_p ())
8624 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
8625 " variable-length vectorization factor\n");
8626 }
8627 /* Free SLP instances here because otherwise stmt reference counting
8628 won't work. */
8629 slp_instance instance;
8630 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8631 vect_free_slp_instance (instance, true);
8632 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8633 /* Clear the safelen field since its value is invalid after vectorization:
8634 the vectorized loop can have loop-carried dependencies. */
8635 loop->safelen = 0;
8636
8637 /* Don't vectorize epilogue for epilogue. */
8638 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8639 epilogue = NULL;
8640
8641 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8642 epilogue = NULL;
8643
8644 if (epilogue)
8645 {
8646 auto_vector_sizes vector_sizes;
8647 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
8648 unsigned int next_size = 0;
8649
8650 /* Note LOOP_VINFO_NITERS_KNOWN_P and LOOP_VINFO_INT_NITERS work
8651 on niters already adjusted for the iterations of the prologue. */
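/* For example (a hypothetical target offering 64-, 32- and 16-byte
   vectors): with 103 known iterations, 64-byte current_vector_size,
   lowest_vf = 16 and no peeling for gaps, eiters = 103 % 16 = 7.  The
   64- and 32-byte candidates are rejected because their epilogue VFs
   (16 and 8) exceed 7, while for the 16-byte size ratio = 4 and
   lowest_vf / ratio = 4 <= 7, so it is chosen for the epilogue.  */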
8652 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8653 && known_eq (vf, lowest_vf))
8654 {
8655 unsigned HOST_WIDE_INT eiters
8656 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8657 - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
8658 eiters
8659 = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
8660 epilogue->nb_iterations_upper_bound = eiters - 1;
8661 epilogue->any_upper_bound = true;
8662
8663 unsigned int ratio;
8664 while (next_size < vector_sizes.length ()
8665 && !(constant_multiple_p (current_vector_size,
8666 vector_sizes[next_size], &ratio)
8667 && eiters >= lowest_vf / ratio))
8668 next_size += 1;
8669 }
8670 else
8671 while (next_size < vector_sizes.length ()
8672 && maybe_lt (current_vector_size, vector_sizes[next_size]))
8673 next_size += 1;
8674
8675 if (next_size == vector_sizes.length ())
8676 epilogue = NULL;
8677 }
8678
8679 if (epilogue)
8680 {
8681 epilogue->force_vectorize = loop->force_vectorize;
8682 epilogue->safelen = loop->safelen;
8683 epilogue->dont_vectorize = false;
8684
8685 /* We may need to if-convert the epilogue to vectorize it. */
8686 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8687 tree_if_conversion (epilogue);
8688 }
8689
8690 return epilogue;
8691 }
8692
8693 /* The code below tries to perform a simple optimization - reverting
8694 if-conversion for masked stores: if the mask of a store is zero,
8695 do not perform the store and, if possible, skip the producers of the
8696 stored values as well. For example,
8697 for (i=0; i<n; i++)
8698 if (c[i])
8699 {
8700 p1[i] += 1;
8701 p2[i] = p3[i] +2;
8702 }
8703 this transformation will produce the following semi-hammock:
8704
8705 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
8706 {
8707 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8708 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8709 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8710 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8711 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8712 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8713 }
8714 */
8715
8716 void
8717 optimize_mask_stores (struct loop *loop)
8718 {
8719 basic_block *bbs = get_loop_body (loop);
8720 unsigned nbbs = loop->num_nodes;
8721 unsigned i;
8722 basic_block bb;
8723 struct loop *bb_loop;
8724 gimple_stmt_iterator gsi;
8725 gimple *stmt;
8726 auto_vec<gimple *> worklist;
8727 auto_purge_vect_location sentinel;
8728
8729 vect_location = find_loop_location (loop);
8730 /* Pick up all masked stores in the loop, if any. */
8731 for (i = 0; i < nbbs; i++)
8732 {
8733 bb = bbs[i];
8734 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8735 gsi_next (&gsi))
8736 {
8737 stmt = gsi_stmt (gsi);
8738 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8739 worklist.safe_push (stmt);
8740 }
8741 }
8742
8743 free (bbs);
8744 if (worklist.is_empty ())
8745 return;
8746
8747 /* Loop has masked stores. */
8748 while (!worklist.is_empty ())
8749 {
8750 gimple *last, *last_store;
8751 edge e, efalse;
8752 tree mask;
8753 basic_block store_bb, join_bb;
8754 gimple_stmt_iterator gsi_to;
8755 tree vdef, new_vdef;
8756 gphi *phi;
8757 tree vectype;
8758 tree zero;
8759
8760 last = worklist.pop ();
8761 mask = gimple_call_arg (last, 2);
8762 bb = gimple_bb (last);
8763 /* Create then_bb and an if-then structure in the CFG; then_bb belongs
8764 to the same loop as if_bb. That loop can be different from LOOP when
8765 a two-level loop nest is vectorized and the mask_store belongs to the
8766 inner loop. */
8767 e = split_block (bb, last);
8768 bb_loop = bb->loop_father;
8769 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8770 join_bb = e->dest;
8771 store_bb = create_empty_bb (bb);
8772 add_bb_to_loop (store_bb, bb_loop);
8773 e->flags = EDGE_TRUE_VALUE;
8774 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8775 /* Put STORE_BB on the unlikely path. */
8776 efalse->probability = profile_probability::unlikely ();
8777 store_bb->count = efalse->count ();
8778 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8779 if (dom_info_available_p (CDI_DOMINATORS))
8780 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8781 if (dump_enabled_p ())
8782 dump_printf_loc (MSG_NOTE, vect_location,
8783 "Create new block %d to sink mask stores.",
8784 store_bb->index);
8785 /* Create vector comparison with boolean result. */
8786 vectype = TREE_TYPE (mask);
8787 zero = build_zero_cst (vectype);
8788 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8789 gsi = gsi_last_bb (bb);
8790 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
8791 /* Create new PHI node for vdef of the last masked store:
8792 .MEM_2 = VDEF <.MEM_1>
8793 will be converted to
8794 .MEM.3 = VDEF <.MEM_1>
8795 and new PHI node will be created in join bb
8796 .MEM_2 = PHI <.MEM_1, .MEM_3>
8797 */
8798 vdef = gimple_vdef (last);
8799 new_vdef = make_ssa_name (gimple_vop (cfun), last);
8800 gimple_set_vdef (last, new_vdef);
8801 phi = create_phi_node (vdef, join_bb);
8802 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8803
8804 /* Put all masked stores with the same mask into STORE_BB if possible. */
8805 while (true)
8806 {
8807 gimple_stmt_iterator gsi_from;
8808 gimple *stmt1 = NULL;
8809
8810 /* Move masked store to STORE_BB. */
8811 last_store = last;
8812 gsi = gsi_for_stmt (last);
8813 gsi_from = gsi;
8814 /* Shift GSI to the previous stmt for further traversal. */
8815 gsi_prev (&gsi);
8816 gsi_to = gsi_start_bb (store_bb);
8817 gsi_move_before (&gsi_from, &gsi_to);
8818 /* Set up GSI_TO at the start of the now non-empty block. */
8819 gsi_to = gsi_start_bb (store_bb);
8820 if (dump_enabled_p ())
8821 dump_printf_loc (MSG_NOTE, vect_location,
8822 "Move stmt to created bb\n%G", last);
8823 /* Move all stored value producers if possible. */
8824 while (!gsi_end_p (gsi))
8825 {
8826 tree lhs;
8827 imm_use_iterator imm_iter;
8828 use_operand_p use_p;
8829 bool res;
8830
8831 /* Skip debug statements. */
8832 if (is_gimple_debug (gsi_stmt (gsi)))
8833 {
8834 gsi_prev (&gsi);
8835 continue;
8836 }
8837 stmt1 = gsi_stmt (gsi);
8838 /* Do not consider statements writing to memory or having
8839 a volatile operand. */
8840 if (gimple_vdef (stmt1)
8841 || gimple_has_volatile_ops (stmt1))
8842 break;
8843 gsi_from = gsi;
8844 gsi_prev (&gsi);
8845 lhs = gimple_get_lhs (stmt1);
8846 if (!lhs)
8847 break;
8848
8849 /* LHS of vectorized stmt must be SSA_NAME. */
8850 if (TREE_CODE (lhs) != SSA_NAME)
8851 break;
8852
8853 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8854 {
8855 /* Remove dead scalar statement. */
8856 if (has_zero_uses (lhs))
8857 {
8858 gsi_remove (&gsi_from, true);
8859 continue;
8860 }
8861 }
8862
8863 /* Check that LHS does not have uses outside of STORE_BB. */
8864 res = true;
8865 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
8866 {
8867 gimple *use_stmt;
8868 use_stmt = USE_STMT (use_p);
8869 if (is_gimple_debug (use_stmt))
8870 continue;
8871 if (gimple_bb (use_stmt) != store_bb)
8872 {
8873 res = false;
8874 break;
8875 }
8876 }
8877 if (!res)
8878 break;
8879
8880 if (gimple_vuse (stmt1)
8881 && gimple_vuse (stmt1) != gimple_vuse (last_store))
8882 break;
8883
8884 /* Can move STMT1 to STORE_BB. */
8885 if (dump_enabled_p ())
8886 dump_printf_loc (MSG_NOTE, vect_location,
8887 "Move stmt to created bb\n%G", stmt1);
8888 gsi_move_before (&gsi_from, &gsi_to);
8889 /* Shift GSI_TO for further insertion. */
8890 gsi_prev (&gsi_to);
8891 }
8892 /* Put other masked stores with the same mask into STORE_BB. */
8893 if (worklist.is_empty ()
8894 || gimple_call_arg (worklist.last (), 2) != mask
8895 || worklist.last () != stmt1)
8896 break;
8897 last = worklist.pop ();
8898 }
8899 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
8900 }
8901 }