1 /* Loop Vectorization
2 Copyright (C) 2003-2019 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57
58 /* Loop Vectorization Pass.
59
60 This pass tries to vectorize loops.
61
62 For example, the vectorizer transforms the following simple loop:
63
64 short a[N]; short b[N]; short c[N]; int i;
65
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
68 }
69
70 as if it was manually vectorized by rewriting the source code into:
71
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
76
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
82 }
83
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
95
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
101
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
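
     For example (an illustrative fragment, not code from a particular
     function in this pass), the recorded information is accessed through
     wrappers such as:

       stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
       tree vectype = STMT_VINFO_VECTYPE (stmt_info);
       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);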
106
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
117
118 For example, say stmt S1 was vectorized into stmt VS1:
119
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
123
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
128
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
133
134 Operands that are not SSA_NAMEs, are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
136
137 Target modeling:
138 =================
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that can support different vector sizes will, for now, need
142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
144
145 Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
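
     For instance (an illustrative fragment, not code taken from this pass),
     the check described above is conceptually:

       if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
         ; /* no V8HImode addition in the target - can't vectorize */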
151
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
154 */
155
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
157
158 /* Subroutine of vect_determine_vf_for_stmt that handles only one
159 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
160 may already be set for general statements (not just data refs). */
161
162 static opt_result
163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
164 bool vectype_maybe_set_p,
165 poly_uint64 *vf,
166 vec<stmt_vec_info > *mask_producers)
167 {
168 gimple *stmt = stmt_info->stmt;
169
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
173 {
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return opt_result::success ();
177 }
178
179 tree stmt_vectype, nunits_vectype;
180 opt_result res = vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
181 &nunits_vectype);
182 if (!res)
183 return res;
184
185 if (stmt_vectype)
186 {
187 if (STMT_VINFO_VECTYPE (stmt_info))
188 /* The only case when a vectype had been already set is for stmts
189 that contain a data ref, or for "pattern-stmts" (stmts generated
190 by the vectorizer to represent/replace a certain idiom). */
191 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
192 || vectype_maybe_set_p)
193 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
194 else if (stmt_vectype == boolean_type_node)
195 mask_producers->safe_push (stmt_info);
196 else
197 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
198 }
199
200 if (nunits_vectype)
201 vect_update_max_nunits (vf, nunits_vectype);
202
203 return opt_result::success ();
204 }
205
206 /* Subroutine of vect_determine_vectorization_factor. Set the vector
207 types of STMT_INFO and all attached pattern statements and update
208 the vectorization factor VF accordingly. If some of the statements
209 produce a mask result whose vector type can only be calculated later,
210 add them to MASK_PRODUCERS. Return true on success or false if
211 something prevented vectorization. */
212
213 static opt_result
214 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
215 vec<stmt_vec_info > *mask_producers)
216 {
217 vec_info *vinfo = stmt_info->vinfo;
218 if (dump_enabled_p ())
219 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
220 stmt_info->stmt);
221 opt_result res
222 = vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers);
223 if (!res)
224 return res;
225
226 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
227 && STMT_VINFO_RELATED_STMT (stmt_info))
228 {
229 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
230 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
231
232 /* If a pattern statement has def stmts, analyze them too. */
233 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
234 !gsi_end_p (si); gsi_next (&si))
235 {
236 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
237 if (dump_enabled_p ())
238 dump_printf_loc (MSG_NOTE, vect_location,
239 "==> examining pattern def stmt: %G",
240 def_stmt_info->stmt);
241 res = vect_determine_vf_for_stmt_1 (def_stmt_info, true,
242 vf, mask_producers);
245 if (!res)
246 return res;
247 }
248
249 if (dump_enabled_p ())
250 dump_printf_loc (MSG_NOTE, vect_location,
251 "==> examining pattern statement: %G",
252 stmt_info->stmt);
253 res = vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers);
254 if (!res)
255 return res;
256 }
257
258 return opt_result::success ();
259 }
260
261 /* Function vect_determine_vectorization_factor
262
263 Determine the vectorization factor (VF). VF is the number of data elements
264 that are operated upon in parallel in a single iteration of the vectorized
265 loop. For example, when vectorizing a loop that operates on 4byte elements,
266 on a target with vector size (VS) 16byte, the VF is set to 4, since 4
267 elements can fit in a single vector register.
268
269 We currently support vectorization of loops in which all types operated upon
270 are of the same size. Therefore this function currently sets VF according to
271 the size of the types operated upon, and fails if there are multiple sizes
272 in the loop.
273
274 VF is also the factor by which the loop iterations are strip-mined, e.g.:
275 original loop:
276 for (i=0; i<N; i++){
277 a[i] = b[i] + c[i];
278 }
279
280 vectorized loop:
281 for (i=0; i<N; i+=VF){
282 a[i:VF] = b[i:VF] + c[i:VF];
283 }
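
     For instance, with the 16-byte vectors and 2-byte "short" elements of
     the example at the top of this file, VF would be 16 / 2 = 8, and the
     strip-mined loop above would advance i by 8 on every iteration.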
284 */
285
286 static opt_result
287 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
288 {
289 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
290 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
291 unsigned nbbs = loop->num_nodes;
292 poly_uint64 vectorization_factor = 1;
293 tree scalar_type = NULL_TREE;
294 gphi *phi;
295 tree vectype;
296 stmt_vec_info stmt_info;
297 unsigned i;
298 auto_vec<stmt_vec_info> mask_producers;
299
300 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
301
302 for (i = 0; i < nbbs; i++)
303 {
304 basic_block bb = bbs[i];
305
306 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
307 gsi_next (&si))
308 {
309 phi = si.phi ();
310 stmt_info = loop_vinfo->lookup_stmt (phi);
311 if (dump_enabled_p ())
312 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
313 phi);
314
315 gcc_assert (stmt_info);
316
317 if (STMT_VINFO_RELEVANT_P (stmt_info)
318 || STMT_VINFO_LIVE_P (stmt_info))
319 {
320 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
321 scalar_type = TREE_TYPE (PHI_RESULT (phi));
322
323 if (dump_enabled_p ())
324 dump_printf_loc (MSG_NOTE, vect_location,
325 "get vectype for scalar type: %T\n",
326 scalar_type);
327
328 vectype = get_vectype_for_scalar_type (scalar_type);
329 if (!vectype)
330 return opt_result::failure_at (phi,
331 "not vectorized: unsupported "
332 "data-type %T\n",
333 scalar_type);
334 STMT_VINFO_VECTYPE (stmt_info) = vectype;
335
336 if (dump_enabled_p ())
337 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
338 vectype);
339
340 if (dump_enabled_p ())
341 {
342 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
343 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
344 dump_printf (MSG_NOTE, "\n");
345 }
346
347 vect_update_max_nunits (&vectorization_factor, vectype);
348 }
349 }
350
351 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
352 gsi_next (&si))
353 {
354 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
355 opt_result res
356 = vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
357 &mask_producers);
358 if (!res)
359 return res;
360 }
361 }
362
363 /* TODO: Analyze cost. Decide if worth while to vectorize. */
364 if (dump_enabled_p ())
365 {
366 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
367 dump_dec (MSG_NOTE, vectorization_factor);
368 dump_printf (MSG_NOTE, "\n");
369 }
370
371 if (known_le (vectorization_factor, 1U))
372 return opt_result::failure_at (vect_location,
373 "not vectorized: unsupported data-type\n");
374 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
375
376 for (i = 0; i < mask_producers.length (); i++)
377 {
378 stmt_info = mask_producers[i];
379 opt_tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
380 if (!mask_type)
381 return opt_result::propagate_failure (mask_type);
382 STMT_VINFO_VECTYPE (stmt_info) = mask_type;
383 }
384
385 return opt_result::success ();
386 }
387
388
389 /* Function vect_is_simple_iv_evolution.
390
391 FORNOW: A simple evolution of an induction variable in the loop is
392 considered a polynomial evolution. */
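/* As an illustrative example (hypothetical loop, not from a particular
   caller): for a counter such as "for (i = 0; i < n; i++)", scev describes
   i by the chrec {0, +, 1}_loop, so *INIT is set to 0 and *STEP to 1.  */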
393
394 static bool
395 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
396 tree * step)
397 {
398 tree init_expr;
399 tree step_expr;
400 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
401 basic_block bb;
402
403 /* When there is no evolution in this loop, the evolution function
404 is not "simple". */
405 if (evolution_part == NULL_TREE)
406 return false;
407
408 /* When the evolution is a polynomial of degree >= 2
409 the evolution function is not "simple". */
410 if (tree_is_chrec (evolution_part))
411 return false;
412
413 step_expr = evolution_part;
414 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
415
416 if (dump_enabled_p ())
417 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
418 step_expr, init_expr);
419
420 *init = init_expr;
421 *step = step_expr;
422
423 if (TREE_CODE (step_expr) != INTEGER_CST
424 && (TREE_CODE (step_expr) != SSA_NAME
425 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
426 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
427 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
428 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
429 || !flag_associative_math)))
430 && (TREE_CODE (step_expr) != REAL_CST
431 || !flag_associative_math))
432 {
433 if (dump_enabled_p ())
434 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
435 "step unknown.\n");
436 return false;
437 }
438
439 return true;
440 }
441
442 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
443 what we are assuming is a double reduction. For example, given
444 a structure like this:
445
446 outer1:
447 x_1 = PHI <x_4(outer2), ...>;
448 ...
449
450 inner:
451 x_2 = PHI <x_1(outer1), ...>;
452 ...
453 x_3 = ...;
454 ...
455
456 outer2:
457 x_4 = PHI <x_3(inner)>;
458 ...
459
460 outer loop analysis would treat x_1 as a double reduction phi and
461 this function would then return true for x_2. */
462
463 static bool
464 vect_inner_phi_in_double_reduction_p (stmt_vec_info stmt_info, gphi *phi)
465 {
466 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
467 use_operand_p use_p;
468 ssa_op_iter op_iter;
469 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
470 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
471 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
472 return true;
473 return false;
474 }
475
476 /* Function vect_analyze_scalar_cycles_1.
477
478 Examine the cross iteration def-use cycles of scalar variables
479 in LOOP. LOOP_VINFO represents the loop that is now being
480 considered for vectorization (can be LOOP, or an outer-loop
481 enclosing LOOP). */
482
483 static void
484 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
485 {
486 basic_block bb = loop->header;
487 tree init, step;
488 auto_vec<stmt_vec_info, 64> worklist;
489 gphi_iterator gsi;
490 bool double_reduc;
491
492 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
493
494 /* First - identify all inductions. Reduction detection assumes that all the
495 inductions have been identified; therefore, this order must not be
496 changed. */
497 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
498 {
499 gphi *phi = gsi.phi ();
500 tree access_fn = NULL;
501 tree def = PHI_RESULT (phi);
502 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
503
504 if (dump_enabled_p ())
505 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
506
507 /* Skip virtual phi's. The data dependences that are associated with
508 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
509 if (virtual_operand_p (def))
510 continue;
511
512 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
513
514 /* Analyze the evolution function. */
515 access_fn = analyze_scalar_evolution (loop, def);
516 if (access_fn)
517 {
518 STRIP_NOPS (access_fn);
519 if (dump_enabled_p ())
520 dump_printf_loc (MSG_NOTE, vect_location,
521 "Access function of PHI: %T\n", access_fn);
522 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
523 = initial_condition_in_loop_num (access_fn, loop->num);
524 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
525 = evolution_part_in_loop_num (access_fn, loop->num);
526 }
527
528 if (!access_fn
529 || vect_inner_phi_in_double_reduction_p (stmt_vinfo, phi)
530 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
531 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
532 && TREE_CODE (step) != INTEGER_CST))
533 {
534 worklist.safe_push (stmt_vinfo);
535 continue;
536 }
537
538 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
539 != NULL_TREE);
540 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
541
542 if (dump_enabled_p ())
543 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
544 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
545 }
546
547
548 /* Second - identify all reductions and nested cycles. */
549 while (worklist.length () > 0)
550 {
551 stmt_vec_info stmt_vinfo = worklist.pop ();
552 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
553 tree def = PHI_RESULT (phi);
554
555 if (dump_enabled_p ())
556 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
557
558 gcc_assert (!virtual_operand_p (def)
559 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
560
561 stmt_vec_info reduc_stmt_info
562 = vect_force_simple_reduction (loop_vinfo, stmt_vinfo,
563 &double_reduc, false);
564 if (reduc_stmt_info)
565 {
566 if (double_reduc)
567 {
568 if (dump_enabled_p ())
569 dump_printf_loc (MSG_NOTE, vect_location,
570 "Detected double reduction.\n");
571
572 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
573 STMT_VINFO_DEF_TYPE (reduc_stmt_info)
574 = vect_double_reduction_def;
575 }
576 else
577 {
578 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
579 {
580 if (dump_enabled_p ())
581 dump_printf_loc (MSG_NOTE, vect_location,
582 "Detected vectorizable nested cycle.\n");
583
584 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
585 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_nested_cycle;
586 }
587 else
588 {
589 if (dump_enabled_p ())
590 dump_printf_loc (MSG_NOTE, vect_location,
591 "Detected reduction.\n");
592
593 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
594 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
595 /* Store the reduction cycles for possible vectorization in
596 loop-aware SLP if it was not detected as reduction
597 chain. */
598 if (! REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
599 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
600 (reduc_stmt_info);
601 }
602 }
603 }
604 else
605 if (dump_enabled_p ())
606 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
607 "Unknown def-use cycle pattern.\n");
608 }
609 }
610
611
612 /* Function vect_analyze_scalar_cycles.
613
614 Examine the cross iteration def-use cycles of scalar variables, by
615 analyzing the loop-header PHIs of scalar variables. Classify each
616 cycle as one of the following: invariant, induction, reduction, unknown.
617 We do that for the loop represented by LOOP_VINFO, and also for its
618 inner-loop, if one exists.
619 Examples for scalar cycles:
620
621 Example1: reduction:
622
623 loop1:
624 for (i=0; i<N; i++)
625 sum += a[i];
626
627 Example2: induction:
628
629 loop2:
630 for (i=0; i<N; i++)
631 a[i] = i; */
632
633 static void
634 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
635 {
636 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
637
638 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
639
640 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
641 Reductions in such inner-loop therefore have different properties than
642 the reductions in the nest that gets vectorized:
643 1. When vectorized, they are executed in the same order as in the original
644 scalar loop, so we can't change the order of computation when
645 vectorizing them.
646 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
647 current checks are too strict. */
648
649 if (loop->inner)
650 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
651 }
652
653 /* Transfer group and reduction information from STMT_INFO to its
654 pattern stmt. */
655
656 static void
657 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
658 {
659 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
660 stmt_vec_info stmtp;
661 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
662 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
663 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
664 do
665 {
666 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
667 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
668 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
669 if (stmt_info)
670 REDUC_GROUP_NEXT_ELEMENT (stmtp)
671 = STMT_VINFO_RELATED_STMT (stmt_info);
672 }
673 while (stmt_info);
674 STMT_VINFO_DEF_TYPE (stmtp) = vect_reduction_def;
675 }
676
677 /* Fixup scalar cycles that now have their stmts detected as patterns. */
678
679 static void
680 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
681 {
682 stmt_vec_info first;
683 unsigned i;
684
685 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
686 if (STMT_VINFO_IN_PATTERN_P (first))
687 {
688 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
689 while (next)
690 {
691 if (! STMT_VINFO_IN_PATTERN_P (next))
692 break;
693 next = REDUC_GROUP_NEXT_ELEMENT (next);
694 }
695 /* If not all stmts in the chain are patterns, try to handle
696 the chain without patterns. */
697 if (! next)
698 {
699 vect_fixup_reduc_chain (first);
700 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
701 = STMT_VINFO_RELATED_STMT (first);
702 }
703 }
704 }
705
706 /* Function vect_get_loop_niters.
707
708 Determine how many iterations the loop is executed and place it
709 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
710 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
711 niter information holds in ASSUMPTIONS.
712
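   For example (an illustrative loop, not from a particular caller): for
   "for (i = 0; i < n; i++)" with n > 0, the latch runs n - 1 times, so
   NUMBER_OF_ITERATIONSM1 would be n - 1 and NUMBER_OF_ITERATIONS n.
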
713 Return the loop exit condition. */
714
715
716 static gcond *
717 vect_get_loop_niters (struct loop *loop, tree *assumptions,
718 tree *number_of_iterations, tree *number_of_iterationsm1)
719 {
720 edge exit = single_exit (loop);
721 struct tree_niter_desc niter_desc;
722 tree niter_assumptions, niter, may_be_zero;
723 gcond *cond = get_loop_exit_condition (loop);
724
725 *assumptions = boolean_true_node;
726 *number_of_iterationsm1 = chrec_dont_know;
727 *number_of_iterations = chrec_dont_know;
728 DUMP_VECT_SCOPE ("get_loop_niters");
729
730 if (!exit)
731 return cond;
732
733 niter = chrec_dont_know;
734 may_be_zero = NULL_TREE;
735 niter_assumptions = boolean_true_node;
736 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
737 || chrec_contains_undetermined (niter_desc.niter))
738 return cond;
739
740 niter_assumptions = niter_desc.assumptions;
741 may_be_zero = niter_desc.may_be_zero;
742 niter = niter_desc.niter;
743
744 if (may_be_zero && integer_zerop (may_be_zero))
745 may_be_zero = NULL_TREE;
746
747 if (may_be_zero)
748 {
749 if (COMPARISON_CLASS_P (may_be_zero))
750 {
751 /* Try to combine may_be_zero with assumptions, this can simplify
752 computation of niter expression. */
753 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
754 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
755 niter_assumptions,
756 fold_build1 (TRUTH_NOT_EXPR,
757 boolean_type_node,
758 may_be_zero));
759 else
760 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
761 build_int_cst (TREE_TYPE (niter), 0),
762 rewrite_to_non_trapping_overflow (niter));
763
764 may_be_zero = NULL_TREE;
765 }
766 else if (integer_nonzerop (may_be_zero))
767 {
768 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
769 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
770 return cond;
771 }
772 else
773 return cond;
774 }
775
776 *assumptions = niter_assumptions;
777 *number_of_iterationsm1 = niter;
778
779 /* We want the number of loop header executions which is the number
780 of latch executions plus one.
781 ??? For UINT_MAX latch executions this number overflows to zero
782 for loops like do { n++; } while (n != 0); */
783 if (niter && !chrec_contains_undetermined (niter))
784 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
785 build_int_cst (TREE_TYPE (niter), 1));
786 *number_of_iterations = niter;
787
788 return cond;
789 }
790
791 /* Function bb_in_loop_p
792
793 Used as predicate for dfs order traversal of the loop bbs. */
794
795 static bool
796 bb_in_loop_p (const_basic_block bb, const void *data)
797 {
798 const struct loop *const loop = (const struct loop *)data;
799 if (flow_bb_inside_loop_p (loop, bb))
800 return true;
801 return false;
802 }
803
804
805 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
806 stmt_vec_info structs for all the stmts in LOOP_IN. */
807
808 _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared)
809 : vec_info (vec_info::loop, init_cost (loop_in), shared),
810 loop (loop_in),
811 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
812 num_itersm1 (NULL_TREE),
813 num_iters (NULL_TREE),
814 num_iters_unchanged (NULL_TREE),
815 num_iters_assumptions (NULL_TREE),
816 th (0),
817 versioning_threshold (0),
818 vectorization_factor (0),
819 max_vectorization_factor (0),
820 mask_skip_niters (NULL_TREE),
821 mask_compare_type (NULL_TREE),
822 unaligned_dr (NULL),
823 peeling_for_alignment (0),
824 ptr_mask (0),
825 ivexpr_map (NULL),
826 slp_unrolling_factor (1),
827 single_scalar_iteration_cost (0),
828 vectorizable (false),
829 can_fully_mask_p (true),
830 fully_masked_p (false),
831 peeling_for_gaps (false),
832 peeling_for_niter (false),
833 operands_swapped (false),
834 no_data_dependencies (false),
835 has_mask_store (false),
836 scalar_loop (NULL),
837 orig_loop_info (NULL)
838 {
839 /* CHECKME: We want to visit all BBs before their successors (except for
840 latch blocks, for which this assertion wouldn't hold). In the simple
841 case of the loop forms we allow, a dfs order of the BBs would be the same
842 as reversed postorder traversal, so we are safe. */
843
844 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
845 bbs, loop->num_nodes, loop);
846 gcc_assert (nbbs == loop->num_nodes);
847
848 for (unsigned int i = 0; i < nbbs; i++)
849 {
850 basic_block bb = bbs[i];
851 gimple_stmt_iterator si;
852
853 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
854 {
855 gimple *phi = gsi_stmt (si);
856 gimple_set_uid (phi, 0);
857 add_stmt (phi);
858 }
859
860 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
861 {
862 gimple *stmt = gsi_stmt (si);
863 gimple_set_uid (stmt, 0);
864 add_stmt (stmt);
865 }
866 }
867 }
868
869 /* Free all levels of MASKS. */
870
871 void
872 release_vec_loop_masks (vec_loop_masks *masks)
873 {
874 rgroup_masks *rgm;
875 unsigned int i;
876 FOR_EACH_VEC_ELT (*masks, i, rgm)
877 rgm->masks.release ();
878 masks->release ();
879 }
880
881 /* Free all memory used by the _loop_vec_info, as well as all the
882 stmt_vec_info structs of all the stmts in the loop. */
883
884 _loop_vec_info::~_loop_vec_info ()
885 {
886 int nbbs;
887 gimple_stmt_iterator si;
888 int j;
889
890 nbbs = loop->num_nodes;
891 for (j = 0; j < nbbs; j++)
892 {
893 basic_block bb = bbs[j];
894 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
895 {
896 gimple *stmt = gsi_stmt (si);
897
898 /* We may have broken canonical form by moving a constant
899 into RHS1 of a commutative op. Fix such occurrences. */
900 if (operands_swapped && is_gimple_assign (stmt))
901 {
902 enum tree_code code = gimple_assign_rhs_code (stmt);
903
904 if ((code == PLUS_EXPR
905 || code == POINTER_PLUS_EXPR
906 || code == MULT_EXPR)
907 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
908 swap_ssa_operands (stmt,
909 gimple_assign_rhs1_ptr (stmt),
910 gimple_assign_rhs2_ptr (stmt));
911 else if (code == COND_EXPR
912 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
913 {
914 tree cond_expr = gimple_assign_rhs1 (stmt);
915 enum tree_code cond_code = TREE_CODE (cond_expr);
916
917 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
918 {
919 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
920 0));
921 cond_code = invert_tree_comparison (cond_code,
922 honor_nans);
923 if (cond_code != ERROR_MARK)
924 {
925 TREE_SET_CODE (cond_expr, cond_code);
926 swap_ssa_operands (stmt,
927 gimple_assign_rhs2_ptr (stmt),
928 gimple_assign_rhs3_ptr (stmt));
929 }
930 }
931 }
932 }
933 gsi_next (&si);
934 }
935 }
936
937 free (bbs);
938
939 release_vec_loop_masks (&masks);
940 delete ivexpr_map;
941
942 loop->aux = NULL;
943 }
944
945 /* Return an invariant or register for EXPR and emit necessary
946 computations in the LOOP_VINFO loop preheader. */
947
948 tree
949 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
950 {
951 if (is_gimple_reg (expr)
952 || is_gimple_min_invariant (expr))
953 return expr;
954
955 if (! loop_vinfo->ivexpr_map)
956 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
957 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
958 if (! cached)
959 {
960 gimple_seq stmts = NULL;
961 cached = force_gimple_operand (unshare_expr (expr),
962 &stmts, true, NULL_TREE);
963 if (stmts)
964 {
965 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
966 gsi_insert_seq_on_edge_immediate (e, stmts);
967 }
968 }
969 return cached;
970 }
971
972 /* Return true if we can use CMP_TYPE as the comparison type to produce
973 all masks required to mask LOOP_VINFO. */
974
975 static bool
976 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
977 {
978 rgroup_masks *rgm;
979 unsigned int i;
980 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
981 if (rgm->mask_type != NULL_TREE
982 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
983 cmp_type, rgm->mask_type,
984 OPTIMIZE_FOR_SPEED))
985 return false;
986 return true;
987 }
988
989 /* Calculate the maximum number of scalars per iteration for every
990 rgroup in LOOP_VINFO. */
991
992 static unsigned int
993 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
994 {
995 unsigned int res = 1;
996 unsigned int i;
997 rgroup_masks *rgm;
998 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
999 res = MAX (res, rgm->max_nscalars_per_iter);
1000 return res;
1001 }
1002
1003 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1004 whether we can actually generate the masks required. Return true if so,
1005 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
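/* As an illustrative sketch (assumed values, not from a particular target):
   with a vectorization factor of 4 and an unsigned int comparison type, the
   mask for the vector iteration that starts at scalar index i of an
   n-iteration loop is conceptually

     mask = WHILE_ULT (i, n);   -> { i+0 < n, i+1 < n, i+2 < n, i+3 < n }

   i.e. every lane whose scalar index is still below n stays active.  */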
1006
1007 static bool
1008 vect_verify_full_masking (loop_vec_info loop_vinfo)
1009 {
1010 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1011 unsigned int min_ni_width;
1012
1013 /* Use a normal loop if there are no statements that need masking.
1014 This only happens in rare degenerate cases: it means that the loop
1015 has no loads, no stores, and no live-out values. */
1016 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1017 return false;
1018
1019 /* Get the maximum number of iterations that is representable
1020 in the counter type. */
1021 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1022 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1023
1024 /* Get a more refined estimate for the number of iterations. */
1025 widest_int max_back_edges;
1026 if (max_loop_iterations (loop, &max_back_edges))
1027 max_ni = wi::smin (max_ni, max_back_edges + 1);
1028
1029 /* Account for rgroup masks, in which each bit is replicated N times. */
1030 max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1031
1032 /* Work out how many bits we need to represent the limit. */
1033 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
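  /* For example (assumed numbers, purely illustrative): a loop running at
     most 1000 iterations whose rgroup masks replicate each bit twice gives
     max_ni = 2000 and therefore min_ni_width = 11 bits.  */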
1034
1035 /* Find a scalar mode for which WHILE_ULT is supported. */
1036 opt_scalar_int_mode cmp_mode_iter;
1037 tree cmp_type = NULL_TREE;
1038 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1039 {
1040 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1041 if (cmp_bits >= min_ni_width
1042 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1043 {
1044 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1045 if (this_type
1046 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1047 {
1048 /* Although we could stop as soon as we find a valid mode,
1049 it's often better to continue until we hit Pmode, since the
1050 operands to the WHILE are more likely to be reusable in
1051 address calculations. */
1052 cmp_type = this_type;
1053 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1054 break;
1055 }
1056 }
1057 }
1058
1059 if (!cmp_type)
1060 return false;
1061
1062 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1063 return true;
1064 }
1065
1066 /* Calculate the cost of one scalar iteration of the loop. */
1067 static void
1068 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1069 {
1070 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1071 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1072 int nbbs = loop->num_nodes, factor;
1073 int innerloop_iters, i;
1074
1075 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1076
1077 /* Gather costs for statements in the scalar loop. */
1078
1079 /* FORNOW. */
1080 innerloop_iters = 1;
1081 if (loop->inner)
1082 innerloop_iters = 50; /* FIXME */
1083
1084 for (i = 0; i < nbbs; i++)
1085 {
1086 gimple_stmt_iterator si;
1087 basic_block bb = bbs[i];
1088
1089 if (bb->loop_father == loop->inner)
1090 factor = innerloop_iters;
1091 else
1092 factor = 1;
1093
1094 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1095 {
1096 gimple *stmt = gsi_stmt (si);
1097 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1098
1099 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1100 continue;
1101
1102 /* Skip stmts that are not vectorized inside the loop. */
1103 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1104 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1105 && (!STMT_VINFO_LIVE_P (vstmt_info)
1106 || !VECTORIZABLE_CYCLE_DEF
1107 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1108 continue;
1109
1110 vect_cost_for_stmt kind;
1111 if (STMT_VINFO_DATA_REF (stmt_info))
1112 {
1113 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1114 kind = scalar_load;
1115 else
1116 kind = scalar_store;
1117 }
1118 else
1119 kind = scalar_stmt;
1120
1121 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1122 factor, kind, stmt_info, 0, vect_prologue);
1123 }
1124 }
1125
1126 /* Now accumulate cost. */
1127 void *target_cost_data = init_cost (loop);
1128 stmt_info_for_cost *si;
1129 int j;
1130 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1131 j, si)
1132 (void) add_stmt_cost (target_cost_data, si->count,
1133 si->kind, si->stmt_info, si->misalign,
1134 vect_body);
1135 unsigned dummy, body_cost = 0;
1136 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1137 destroy_cost_data (target_cost_data);
1138 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1139 }
1140
1141
1142 /* Function vect_analyze_loop_form_1.
1143
1144 Verify that certain CFG restrictions hold, including:
1145 - the loop has a pre-header
1146 - the loop has a single entry and exit
1147 - the loop exit condition is simple enough
1148 - the number of iterations can be analyzed, i.e., a countable loop. The
1149 niter could be analyzed under some assumptions (see the example below). */
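/* As an illustrative example (hypothetical source loop): the simple counted
   loop from the header comment of this file,

     for (i = 0; i < N; i++)
       a[i] = b[i] + c[i];

   meets these restrictions in the form the vectorizer sees it: a pre-header,
   a single entry and exit, a simple exit condition, and a number of
   iterations analyzable as N.  */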
1150
1151 opt_result
1152 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1153 tree *assumptions, tree *number_of_iterationsm1,
1154 tree *number_of_iterations, gcond **inner_loop_cond)
1155 {
1156 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1157
1158 /* Different restrictions apply when we are considering an inner-most loop,
1159 vs. an outer (nested) loop.
1160 (FORNOW. May want to relax some of these restrictions in the future). */
1161
1162 if (!loop->inner)
1163 {
1164 /* Inner-most loop. We currently require that the number of BBs is
1165 exactly 2 (the header and latch). Vectorizable inner-most loops
1166 look like this:
1167
1168 (pre-header)
1169 |
1170 header <--------+
1171 | | |
1172 | +--> latch --+
1173 |
1174 (exit-bb) */
1175
1176 if (loop->num_nodes != 2)
1177 return opt_result::failure_at (vect_location,
1178 "not vectorized:"
1179 " control flow in loop.\n");
1180
1181 if (empty_block_p (loop->header))
1182 return opt_result::failure_at (vect_location,
1183 "not vectorized: empty loop.\n");
1184 }
1185 else
1186 {
1187 struct loop *innerloop = loop->inner;
1188 edge entryedge;
1189
1190 /* Nested loop. We currently require that the loop is doubly-nested,
1191 contains a single inner loop, and the number of BBs is exactly 5.
1192 Vectorizable outer-loops look like this:
1193
1194 (pre-header)
1195 |
1196 header <---+
1197 | |
1198 inner-loop |
1199 | |
1200 tail ------+
1201 |
1202 (exit-bb)
1203
1204 The inner-loop has the properties expected of inner-most loops
1205 as described above. */
1206
1207 if ((loop->inner)->inner || (loop->inner)->next)
1208 return opt_result::failure_at (vect_location,
1209 "not vectorized:"
1210 " multiple nested loops.\n");
1211
1212 if (loop->num_nodes != 5)
1213 return opt_result::failure_at (vect_location,
1214 "not vectorized:"
1215 " control flow in loop.\n");
1216
1217 entryedge = loop_preheader_edge (innerloop);
1218 if (entryedge->src != loop->header
1219 || !single_exit (innerloop)
1220 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1221 return opt_result::failure_at (vect_location,
1222 "not vectorized:"
1223 " unsupported outerloop form.\n");
1224
1225 /* Analyze the inner-loop. */
1226 tree inner_niterm1, inner_niter, inner_assumptions;
1227 opt_result res
1228 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1229 &inner_assumptions, &inner_niterm1,
1230 &inner_niter, NULL);
1231 if (!res)
1232 {
1233 if (dump_enabled_p ())
1234 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1235 "not vectorized: Bad inner loop.\n");
1236 return res;
1237 }
1238
1239 /* Don't support analyzing niter under assumptions for inner
1240 loop. */
1241 if (!integer_onep (inner_assumptions))
1242 return opt_result::failure_at (vect_location,
1243 "not vectorized: Bad inner loop.\n");
1244
1245 if (!expr_invariant_in_loop_p (loop, inner_niter))
1246 return opt_result::failure_at (vect_location,
1247 "not vectorized: inner-loop count not"
1248 " invariant.\n");
1249
1250 if (dump_enabled_p ())
1251 dump_printf_loc (MSG_NOTE, vect_location,
1252 "Considering outer-loop vectorization.\n");
1253 }
1254
1255 if (!single_exit (loop))
1256 return opt_result::failure_at (vect_location,
1257 "not vectorized: multiple exits.\n");
1258 if (EDGE_COUNT (loop->header->preds) != 2)
1259 return opt_result::failure_at (vect_location,
1260 "not vectorized:"
1261 " too many incoming edges.\n");
1262
1263 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1264 that the loop is represented as a do-while (with a proper if-guard
1265 before the loop if needed), where the loop header contains all the
1266 executable statements, and the latch is empty. */
1267 if (!empty_block_p (loop->latch)
1268 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1269 return opt_result::failure_at (vect_location,
1270 "not vectorized: latch block not empty.\n");
1271
1272 /* Make sure the exit is not abnormal. */
1273 edge e = single_exit (loop);
1274 if (e->flags & EDGE_ABNORMAL)
1275 return opt_result::failure_at (vect_location,
1276 "not vectorized:"
1277 " abnormal loop exit edge.\n");
1278
1279 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1280 number_of_iterationsm1);
1281 if (!*loop_cond)
1282 return opt_result::failure_at
1283 (vect_location,
1284 "not vectorized: complicated exit condition.\n");
1285
1286 if (integer_zerop (*assumptions)
1287 || !*number_of_iterations
1288 || chrec_contains_undetermined (*number_of_iterations))
1289 return opt_result::failure_at
1290 (*loop_cond,
1291 "not vectorized: number of iterations cannot be computed.\n");
1292
1293 if (integer_zerop (*number_of_iterations))
1294 return opt_result::failure_at
1295 (*loop_cond,
1296 "not vectorized: number of iterations = 0.\n");
1297
1298 return opt_result::success ();
1299 }
1300
1301 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1302
1303 opt_loop_vec_info
1304 vect_analyze_loop_form (struct loop *loop, vec_info_shared *shared)
1305 {
1306 tree assumptions, number_of_iterations, number_of_iterationsm1;
1307 gcond *loop_cond, *inner_loop_cond = NULL;
1308
1309 opt_result res
1310 = vect_analyze_loop_form_1 (loop, &loop_cond,
1311 &assumptions, &number_of_iterationsm1,
1312 &number_of_iterations, &inner_loop_cond);
1313 if (!res)
1314 return opt_loop_vec_info::propagate_failure (res);
1315
1316 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1317 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1318 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1319 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1320 if (!integer_onep (assumptions))
1321 {
1322 /* We consider to vectorize this loop by versioning it under
1323 some assumptions. In order to do this, we need to clear
1324 existing information computed by scev and niter analyzer. */
1325 scev_reset_htab ();
1326 free_numbers_of_iterations_estimates (loop);
1327 /* Also set flag for this loop so that following scev and niter
1328 analysis are done under the assumptions. */
1329 loop_constraint_set (loop, LOOP_C_FINITE);
1330 /* Also record the assumptions for versioning. */
1331 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1332 }
1333
1334 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1335 {
1336 if (dump_enabled_p ())
1337 {
1338 dump_printf_loc (MSG_NOTE, vect_location,
1339 "Symbolic number of iterations is ");
1340 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1341 dump_printf (MSG_NOTE, "\n");
1342 }
1343 }
1344
1345 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1346 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1347 if (inner_loop_cond)
1348 {
1349 stmt_vec_info inner_loop_cond_info
1350 = loop_vinfo->lookup_stmt (inner_loop_cond);
1351 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1352 }
1353
1354 gcc_assert (!loop->aux);
1355 loop->aux = loop_vinfo;
1356 return opt_loop_vec_info::success (loop_vinfo);
1357 }
1358
1359
1360
1361 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1362 statements, update the vectorization factor. */
1363
1364 static void
1365 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1366 {
1367 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1368 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1369 int nbbs = loop->num_nodes;
1370 poly_uint64 vectorization_factor;
1371 int i;
1372
1373 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1374
1375 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1376 gcc_assert (known_ne (vectorization_factor, 0U));
1377
1378 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1379 vectorization factor of the loop is the unrolling factor required by
1380 the SLP instances. If that unrolling factor is 1, we say that we
1381 perform pure SLP on the loop - cross-iteration parallelism is not
1382 exploited. */
1383 bool only_slp_in_loop = true;
1384 for (i = 0; i < nbbs; i++)
1385 {
1386 basic_block bb = bbs[i];
1387 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1388 gsi_next (&si))
1389 {
1390 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1391 stmt_info = vect_stmt_to_vectorize (stmt_info);
1392 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1393 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1394 && !PURE_SLP_STMT (stmt_info))
1395 /* STMT needs both SLP and loop-based vectorization. */
1396 only_slp_in_loop = false;
1397 }
1398 }
1399
1400 if (only_slp_in_loop)
1401 {
1402 if (dump_enabled_p ())
1403 dump_printf_loc (MSG_NOTE, vect_location,
1404 "Loop contains only SLP stmts\n");
1405 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1406 }
1407 else
1408 {
1409 if (dump_enabled_p ())
1410 dump_printf_loc (MSG_NOTE, vect_location,
1411 "Loop contains SLP and non-SLP stmts\n");
1412 /* Both the vectorization factor and unroll factor have the form
1413 current_vector_size * X for some rational X, so they must have
1414 a common multiple. */
1415 vectorization_factor
1416 = force_common_multiple (vectorization_factor,
1417 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
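      /* For example (illustrative numbers only): a loop vectorization factor
         of 4 combined with an SLP unrolling factor of 6 yields a common
         multiple of 12.  */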
1418 }
1419
1420 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1421 if (dump_enabled_p ())
1422 {
1423 dump_printf_loc (MSG_NOTE, vect_location,
1424 "Updating vectorization factor to ");
1425 dump_dec (MSG_NOTE, vectorization_factor);
1426 dump_printf (MSG_NOTE, ".\n");
1427 }
1428 }
1429
1430 /* Return true if STMT_INFO describes a double reduction phi and if
1431 the other phi in the reduction is also relevant for vectorization.
1432 This rejects cases such as:
1433
1434 outer1:
1435 x_1 = PHI <x_3(outer2), ...>;
1436 ...
1437
1438 inner:
1439 x_2 = ...;
1440 ...
1441
1442 outer2:
1443 x_3 = PHI <x_2(inner)>;
1444
1445 if nothing in x_2 or elsewhere makes x_1 relevant. */
1446
1447 static bool
1448 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1449 {
1450 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1451 return false;
1452
1453 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1454 }
1455
1456 /* Function vect_analyze_loop_operations.
1457
1458 Scan the loop stmts and make sure they are all vectorizable. */
1459
1460 static opt_result
1461 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1462 {
1463 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1464 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1465 int nbbs = loop->num_nodes;
1466 int i;
1467 stmt_vec_info stmt_info;
1468 bool need_to_vectorize = false;
1469 bool ok;
1470
1471 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1472
1473 auto_vec<stmt_info_for_cost> cost_vec;
1474
1475 for (i = 0; i < nbbs; i++)
1476 {
1477 basic_block bb = bbs[i];
1478
1479 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1480 gsi_next (&si))
1481 {
1482 gphi *phi = si.phi ();
1483 ok = true;
1484
1485 stmt_info = loop_vinfo->lookup_stmt (phi);
1486 if (dump_enabled_p ())
1487 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1488 if (virtual_operand_p (gimple_phi_result (phi)))
1489 continue;
1490
1491 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1492 (i.e., a phi in the tail of the outer-loop). */
1493 if (! is_loop_header_bb_p (bb))
1494 {
1495 /* FORNOW: we currently don't support the case that these phis
1496 are not used in the outerloop (unless it is double reduction,
1497 i.e., this phi is vect_reduction_def), because this case
1498 would require actually doing something here. */
1499 if (STMT_VINFO_LIVE_P (stmt_info)
1500 && !vect_active_double_reduction_p (stmt_info))
1501 return opt_result::failure_at (phi,
1502 "Unsupported loop-closed phi"
1503 " in outer-loop.\n");
1504
1505 /* If PHI is used in the outer loop, we check that its operand
1506 is defined in the inner loop. */
1507 if (STMT_VINFO_RELEVANT_P (stmt_info))
1508 {
1509 tree phi_op;
1510
1511 if (gimple_phi_num_args (phi) != 1)
1512 return opt_result::failure_at (phi, "unsupported phi");
1513
1514 phi_op = PHI_ARG_DEF (phi, 0);
1515 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1516 if (!op_def_info)
1517 return opt_result::failure_at (phi, "unsupported phi");
1518
1519 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1520 && (STMT_VINFO_RELEVANT (op_def_info)
1521 != vect_used_in_outer_by_reduction))
1522 return opt_result::failure_at (phi, "unsupported phi");
1523 }
1524
1525 continue;
1526 }
1527
1528 gcc_assert (stmt_info);
1529
1530 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1531 || STMT_VINFO_LIVE_P (stmt_info))
1532 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1533 /* A scalar-dependence cycle that we don't support. */
1534 return opt_result::failure_at (phi,
1535 "not vectorized:"
1536 " scalar dependence cycle.\n");
1537
1538 if (STMT_VINFO_RELEVANT_P (stmt_info))
1539 {
1540 need_to_vectorize = true;
1541 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1542 && ! PURE_SLP_STMT (stmt_info))
1543 ok = vectorizable_induction (stmt_info, NULL, NULL, NULL,
1544 &cost_vec);
1545 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1546 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1547 && ! PURE_SLP_STMT (stmt_info))
1548 ok = vectorizable_reduction (stmt_info, NULL, NULL, NULL, NULL,
1549 &cost_vec);
1550 }
1551
1552 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1553 if (ok
1554 && STMT_VINFO_LIVE_P (stmt_info)
1555 && !PURE_SLP_STMT (stmt_info))
1556 ok = vectorizable_live_operation (stmt_info, NULL, NULL, -1, NULL,
1557 &cost_vec);
1558
1559 if (!ok)
1560 return opt_result::failure_at (phi,
1561 "not vectorized: relevant phi not "
1562 "supported: %G",
1563 static_cast <gimple *> (phi));
1564 }
1565
1566 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1567 gsi_next (&si))
1568 {
1569 gimple *stmt = gsi_stmt (si);
1570 if (!gimple_clobber_p (stmt))
1571 {
1572 opt_result res
1573 = vect_analyze_stmt (loop_vinfo->lookup_stmt (stmt),
1574 &need_to_vectorize,
1575 NULL, NULL, &cost_vec);
1576 if (!res)
1577 return res;
1578 }
1579 }
1580 } /* bbs */
1581
1582 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1583
1584 /* All operations in the loop are either irrelevant (deal with loop
1585 control, or dead), or only used outside the loop and can be moved
1586 out of the loop (e.g. invariants, inductions). The loop can be
1587 optimized away by scalar optimizations. We're better off not
1588 touching this loop. */
1589 if (!need_to_vectorize)
1590 {
1591 if (dump_enabled_p ())
1592 dump_printf_loc (MSG_NOTE, vect_location,
1593 "All the computation can be taken out of the loop.\n");
1594 return opt_result::failure_at
1595 (vect_location,
1596 "not vectorized: redundant loop. no profit to vectorize.\n");
1597 }
1598
1599 return opt_result::success ();
1600 }
1601
1602 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1603 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1604 definitely no, or -1 if it's worth retrying. */
1605
1606 static int
1607 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1608 {
1609 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1610 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1611
1612 /* Only fully-masked loops can have iteration counts less than the
1613 vectorization factor. */
1614 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1615 {
1616 HOST_WIDE_INT max_niter;
1617
1618 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1619 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1620 else
1621 max_niter = max_stmt_executions_int (loop);
1622
1623 if (max_niter != -1
1624 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1625 {
1626 if (dump_enabled_p ())
1627 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1628 "not vectorized: iteration count smaller than "
1629 "vectorization factor.\n");
1630 return 0;
1631 }
1632 }
1633
1634 int min_profitable_iters, min_profitable_estimate;
1635 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1636 &min_profitable_estimate);
1637
1638 if (min_profitable_iters < 0)
1639 {
1640 if (dump_enabled_p ())
1641 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1642 "not vectorized: vectorization not profitable.\n");
1643 if (dump_enabled_p ())
1644 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1645 "not vectorized: vector version will never be "
1646 "profitable.\n");
1647 return -1;
1648 }
1649
1650 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1651 * assumed_vf);
1652
1653 /* Use the cost model only if it is more conservative than user specified
1654 threshold. */
1655 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1656 min_profitable_iters);
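  /* For example (assumed numbers, purely illustrative): with assumed_vf of 4,
     --param min-vect-loop-bound=2 and min_profitable_iters of 12, the
     threshold becomes MAX (2 * 4, 12) = 12 iterations.  */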
1657
1658 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1659
1660 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1661 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1662 {
1663 if (dump_enabled_p ())
1664 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1665 "not vectorized: vectorization not profitable.\n");
1666 if (dump_enabled_p ())
1667 dump_printf_loc (MSG_NOTE, vect_location,
1668 "not vectorized: iteration count smaller than user "
1669 "specified loop bound parameter or minimum profitable "
1670 "iterations (whichever is more conservative).\n");
1671 return 0;
1672 }
1673
1674 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1675 if (estimated_niter == -1)
1676 estimated_niter = likely_max_stmt_executions_int (loop);
1677 if (estimated_niter != -1
1678 && ((unsigned HOST_WIDE_INT) estimated_niter
1679 < MAX (th, (unsigned) min_profitable_estimate)))
1680 {
1681 if (dump_enabled_p ())
1682 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1683 "not vectorized: estimated iteration count too "
1684 "small.\n");
1685 if (dump_enabled_p ())
1686 dump_printf_loc (MSG_NOTE, vect_location,
1687 "not vectorized: estimated iteration count smaller "
1688 "than specified loop bound parameter or minimum "
1689 "profitable iterations (whichever is more "
1690 "conservative).\n");
1691 return -1;
1692 }
1693
1694 return 1;
1695 }
1696
1697 static opt_result
1698 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1699 vec<data_reference_p> *datarefs,
1700 unsigned int *n_stmts)
1701 {
1702 *n_stmts = 0;
1703 for (unsigned i = 0; i < loop->num_nodes; i++)
1704 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1705 !gsi_end_p (gsi); gsi_next (&gsi))
1706 {
1707 gimple *stmt = gsi_stmt (gsi);
1708 if (is_gimple_debug (stmt))
1709 continue;
1710 ++(*n_stmts);
1711 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs);
1712 if (!res)
1713 {
1714 if (is_gimple_call (stmt) && loop->safelen)
1715 {
1716 tree fndecl = gimple_call_fndecl (stmt), op;
1717 if (fndecl != NULL_TREE)
1718 {
1719 cgraph_node *node = cgraph_node::get (fndecl);
1720 if (node != NULL && node->simd_clones != NULL)
1721 {
1722 unsigned int j, n = gimple_call_num_args (stmt);
1723 for (j = 0; j < n; j++)
1724 {
1725 op = gimple_call_arg (stmt, j);
1726 if (DECL_P (op)
1727 || (REFERENCE_CLASS_P (op)
1728 && get_base_address (op)))
1729 break;
1730 }
1731 op = gimple_call_lhs (stmt);
1732 /* Ignore #pragma omp declare simd functions
1733 if they don't have data references in the
1734 call stmt itself. */
1735 if (j == n
1736 && !(op
1737 && (DECL_P (op)
1738 || (REFERENCE_CLASS_P (op)
1739 && get_base_address (op)))))
1740 continue;
1741 }
1742 }
1743 }
1744 return res;
1745 }
1746 /* If dependence analysis will give up due to the limit on the
1747 number of datarefs stop here and fail fatally. */
1748 if (datarefs->length ()
1749 > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
1750 return opt_result::failure_at (stmt, "exceeded param "
1751 "loop-max-datarefs-for-datadeps\n");
1752 }
1753 return opt_result::success ();
1754 }
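
/* Editorial sketch (not part of this file): the kind of source loop the
   safelen/simd-clone special case above is meant to accept.  The call to
   scale() cannot be represented by a data reference (its operands are
   scalars and the callee may touch memory), so data-reference gathering
   fails on that statement; but because scale() has simd clones and the loop
   carries a safelen from the pragma, the statement is skipped instead of
   making the whole loop unanalyzable.  The function name is illustrative.  */

#pragma omp declare simd
extern float scale (float x, float factor);

static void
sketch_simd_clone_call (float *out, const float *in, float factor, int n)
{
  #pragma omp simd
  for (int i = 0; i < n; i++)
    out[i] = scale (in[i], factor);   /* call args and lhs are scalars */
}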
1755
1756 /* Function vect_analyze_loop_2.
1757
1758 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1759 for it. The different analyses will record information in the
1760 loop_vec_info struct. */
1761 static opt_result
1762 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1763 {
1764 opt_result ok = opt_result::success ();
1765 int res;
1766 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1767 poly_uint64 min_vf = 2;
1768
1769 /* The first group of checks is independent of the vector size. */
1770 fatal = true;
1771
1772 /* Find all data references in the loop (which correspond to vdefs/vuses)
1773 and analyze their evolution in the loop. */
1774
1775 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1776
1777 /* Gather the data references and count stmts in the loop. */
1778 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1779 {
1780 opt_result res
1781 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1782 &LOOP_VINFO_DATAREFS (loop_vinfo),
1783 n_stmts);
1784 if (!res)
1785 {
1786 if (dump_enabled_p ())
1787 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1788 "not vectorized: loop contains function "
1789 "calls or data references that cannot "
1790 "be analyzed\n");
1791 return res;
1792 }
1793 loop_vinfo->shared->save_datarefs ();
1794 }
1795 else
1796 loop_vinfo->shared->check_datarefs ();
1797
1798 /* Analyze the data references and also adjust the minimal
1799 vectorization factor according to the loads and stores. */
1800
1801 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1802 if (!ok)
1803 {
1804 if (dump_enabled_p ())
1805 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1806 "bad data references.\n");
1807 return ok;
1808 }
1809
1810 /* Classify all cross-iteration scalar data-flow cycles.
1811 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1812 vect_analyze_scalar_cycles (loop_vinfo);
1813
1814 vect_pattern_recog (loop_vinfo);
1815
1816 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1817
1818 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1819 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1820
1821 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1822 if (!ok)
1823 {
1824 if (dump_enabled_p ())
1825 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1826 "bad data access.\n");
1827 return ok;
1828 }
1829
1830 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1831
1832 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1833 if (!ok)
1834 {
1835 if (dump_enabled_p ())
1836 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1837 "unexpected pattern.\n");
1838 return ok;
1839 }
1840
1841   /* The rest of the analysis below depends on the vector size in some way, so failures from here on are not fatal.  */
1842 fatal = false;
1843
1844 /* Analyze data dependences between the data-refs in the loop
1845 and adjust the maximum vectorization factor according to
1846 the dependences.
1847 FORNOW: fail at the first data dependence that we encounter. */
1848
1849 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1850 if (!ok)
1851 {
1852 if (dump_enabled_p ())
1853 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1854 "bad data dependence.\n");
1855 return ok;
1856 }
1857 if (max_vf != MAX_VECTORIZATION_FACTOR
1858 && maybe_lt (max_vf, min_vf))
1859 return opt_result::failure_at (vect_location, "bad data dependence.\n");
1860 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1861
1862 ok = vect_determine_vectorization_factor (loop_vinfo);
1863 if (!ok)
1864 {
1865 if (dump_enabled_p ())
1866 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1867 "can't determine vectorization factor.\n");
1868 return ok;
1869 }
1870 if (max_vf != MAX_VECTORIZATION_FACTOR
1871 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1872 return opt_result::failure_at (vect_location, "bad data dependence.\n");
1873
1874 /* Compute the scalar iteration cost. */
1875 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1876
1877 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1878 unsigned th;
1879
1880 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1881 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
1882 if (!ok)
1883 return ok;
1884
1885 /* If there are any SLP instances mark them as pure_slp. */
1886 bool slp = vect_make_slp_decision (loop_vinfo);
1887 if (slp)
1888 {
1889 /* Find stmts that need to be both vectorized and SLPed. */
1890 vect_detect_hybrid_slp (loop_vinfo);
1891
1892 /* Update the vectorization factor based on the SLP decision. */
1893 vect_update_vf_for_slp (loop_vinfo);
1894 }
1895
1896 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
1897
1898 /* We don't expect to have to roll back to anything other than an empty
1899 set of rgroups. */
1900 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
1901
1902 /* This is the point where we can re-start analysis with SLP forced off. */
1903 start_over:
1904
1905 /* Now the vectorization factor is final. */
1906 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1907 gcc_assert (known_ne (vectorization_factor, 0U));
1908
1909 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1910 {
1911 dump_printf_loc (MSG_NOTE, vect_location,
1912 "vectorization_factor = ");
1913 dump_dec (MSG_NOTE, vectorization_factor);
1914 dump_printf (MSG_NOTE, ", niters = %wd\n",
1915 LOOP_VINFO_INT_NITERS (loop_vinfo));
1916 }
1917
1918 HOST_WIDE_INT max_niter
1919 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1920
1921 /* Analyze the alignment of the data-refs in the loop.
1922 Fail if a data reference is found that cannot be vectorized. */
1923
1924 ok = vect_analyze_data_refs_alignment (loop_vinfo);
1925 if (!ok)
1926 {
1927 if (dump_enabled_p ())
1928 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1929 "bad data alignment.\n");
1930 return ok;
1931 }
1932
1933 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1934 It is important to call pruning after vect_analyze_data_ref_accesses,
1935 since we use grouping information gathered by interleaving analysis. */
1936 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1937 if (!ok)
1938 return ok;
1939
1940 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
1941 vectorization, since we do not want to add extra peeling or
1942 add versioning for alignment. */
1943 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1944 /* This pass will decide on using loop versioning and/or loop peeling in
1945 order to enhance the alignment of data references in the loop. */
1946 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1947 else
1948 ok = vect_verify_datarefs_alignment (loop_vinfo);
1949 if (!ok)
1950 return ok;
1951
1952 if (slp)
1953 {
1954 /* Analyze operations in the SLP instances. Note this may
1955 remove unsupported SLP instances which makes the above
1956 SLP kind detection invalid. */
1957 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
1958 vect_slp_analyze_operations (loop_vinfo);
1959 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
1960 {
1961 ok = opt_result::failure_at (vect_location,
1962 "unsupported SLP instances\n");
1963 goto again;
1964 }
1965 }
1966
1967 /* Scan all the remaining operations in the loop that are not subject
1968 to SLP and make sure they are vectorizable. */
1969 ok = vect_analyze_loop_operations (loop_vinfo);
1970 if (!ok)
1971 {
1972 if (dump_enabled_p ())
1973 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1974 "bad operation or unsupported loop bound.\n");
1975 return ok;
1976 }
1977
1978 /* Decide whether to use a fully-masked loop for this vectorization
1979 factor. */
1980 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
1981 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
1982 && vect_verify_full_masking (loop_vinfo));
1983 if (dump_enabled_p ())
1984 {
1985 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1986 dump_printf_loc (MSG_NOTE, vect_location,
1987 "using a fully-masked loop.\n");
1988 else
1989 dump_printf_loc (MSG_NOTE, vect_location,
1990 "not using a fully-masked loop.\n");
1991 }
1992
1993 /* If epilog loop is required because of data accesses with gaps,
1994 one additional iteration needs to be peeled. Check if there is
1995 enough iterations for vectorization. */
1996 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1997 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1998 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1999 {
2000 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2001 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2002
2003 if (known_lt (wi::to_widest (scalar_niters), vf))
2004 return opt_result::failure_at (vect_location,
2005 "loop has no enough iterations to"
2006 " support peeling for gaps.\n");
2007 }
2008
2009 /* Check the costings of the loop make vectorizing worthwhile. */
2010 res = vect_analyze_loop_costing (loop_vinfo);
2011 if (res < 0)
2012 {
2013 ok = opt_result::failure_at (vect_location,
2014 "Loop costings may not be worthwhile.\n");
2015 goto again;
2016 }
2017 if (!res)
2018 return opt_result::failure_at (vect_location,
2019 "Loop costings not worthwhile.\n");
2020
2021 /* Decide whether we need to create an epilogue loop to handle
2022 remaining scalar iterations. */
2023 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2024
2025 unsigned HOST_WIDE_INT const_vf;
2026 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2027 /* The main loop handles all iterations. */
2028 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2029 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2030 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
2031 {
2032 /* Work out the (constant) number of iterations that need to be
2033 peeled for reasons other than niters. */
2034 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2035 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2036 peel_niter += 1;
2037 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
2038 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2039 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2040 }
2041 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2042 /* ??? When peeling for gaps but not alignment, we could
2043 try to check whether the (variable) niters is known to be
2044 VF * N + 1. That's something of a niche case though. */
2045 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2046 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2047 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2048 < (unsigned) exact_log2 (const_vf))
2049 /* In case of versioning, check if the maximum number of
2050 iterations is greater than th. If they are identical,
2051 the epilogue is unnecessary. */
2052 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2053 || ((unsigned HOST_WIDE_INT) max_niter
2054 > (th / const_vf) * const_vf))))
2055 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
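  /* Editorial worked example (not from the original sources): with a
     constant VF of 8, no alignment or gap peeling, a variable NITERS (so
     tree_ctz gives 0, which is less than log2 (8)), and versioning with
     TH = 20 and MAX_NITER = 23, the test above sees (20 / 8) * 8 = 16 < 23
     and therefore still requires an epilogue loop.  */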
2056
2057 /* If an epilogue loop is required make sure we can create one. */
2058 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2059 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2060 {
2061 if (dump_enabled_p ())
2062 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2063 if (!vect_can_advance_ivs_p (loop_vinfo)
2064 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2065 single_exit (LOOP_VINFO_LOOP
2066 (loop_vinfo))))
2067 {
2068 ok = opt_result::failure_at (vect_location,
2069 "not vectorized: can't create required "
2070 "epilog loop\n");
2071 goto again;
2072 }
2073 }
2074
2075   /* During peeling, we need to check if the number of loop iterations is
2076      enough for both the peeled prolog loop and the vector loop.  This check
2077 can be merged along with threshold check of loop versioning, so
2078 increase threshold for this case if necessary. */
2079 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2080 {
2081 poly_uint64 niters_th = 0;
2082
2083 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2084 {
2085 /* Niters for peeled prolog loop. */
2086 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2087 {
2088 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2089 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2090 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2091 }
2092 else
2093 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2094 }
2095
2096 /* Niters for at least one iteration of vectorized loop. */
2097 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2098 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2099       /* One additional iteration because of peeling for gaps.  */
2100 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2101 niters_th += 1;
2102 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2103 }
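  /* Editorial worked example (not from the original sources): with an
     unknown peeling-for-alignment amount and an 8-element vector type, the
     prologue contributes at most 8 - 1 = 7 iterations; a loop that is not
     fully masked adds VF = 8 for one vector iteration, and peeling for gaps
     adds one more, giving a versioning threshold of 16 scalar iterations.  */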
2104
2105 gcc_assert (known_eq (vectorization_factor,
2106 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2107
2108 /* Ok to vectorize! */
2109 return opt_result::success ();
2110
2111 again:
2112 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2113 gcc_assert (!ok);
2114
2115 /* Try again with SLP forced off but if we didn't do any SLP there is
2116 no point in re-trying. */
2117 if (!slp)
2118 return ok;
2119
2120 /* If there are reduction chains re-trying will fail anyway. */
2121 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2122 return ok;
2123
2124 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2125 via interleaving or lane instructions. */
2126 slp_instance instance;
2127 slp_tree node;
2128 unsigned i, j;
2129 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2130 {
2131 stmt_vec_info vinfo;
2132 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2133 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2134 continue;
2135 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2136 unsigned int size = DR_GROUP_SIZE (vinfo);
2137 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2138 if (! vect_store_lanes_supported (vectype, size, false)
2139 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2140 && ! vect_grouped_store_supported (vectype, size))
2141 return opt_result::failure_at (vinfo->stmt,
2142 "unsupported grouped store\n");
2143 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2144 {
2145 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2146 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2147 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2148 size = DR_GROUP_SIZE (vinfo);
2149 vectype = STMT_VINFO_VECTYPE (vinfo);
2150 if (! vect_load_lanes_supported (vectype, size, false)
2151 && ! vect_grouped_load_supported (vectype, single_element_p,
2152 size))
2153 return opt_result::failure_at (vinfo->stmt,
2154 "unsupported grouped load\n");
2155 }
2156 }
2157
2158 if (dump_enabled_p ())
2159 dump_printf_loc (MSG_NOTE, vect_location,
2160 "re-trying with SLP disabled\n");
2161
2162 /* Roll back state appropriately. No SLP this time. */
2163 slp = false;
2164   /* Restore the vectorization factor as it was without SLP.  */
2165 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2166 /* Free the SLP instances. */
2167 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2168 vect_free_slp_instance (instance, false);
2169 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2170 /* Reset SLP type to loop_vect on all stmts. */
2171 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2172 {
2173 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2174 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2175 !gsi_end_p (si); gsi_next (&si))
2176 {
2177 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2178 STMT_SLP_TYPE (stmt_info) = loop_vect;
2179 }
2180 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2181 !gsi_end_p (si); gsi_next (&si))
2182 {
2183 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2184 STMT_SLP_TYPE (stmt_info) = loop_vect;
2185 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2186 {
2187 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2188 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2189 STMT_SLP_TYPE (stmt_info) = loop_vect;
2190 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2191 !gsi_end_p (pi); gsi_next (&pi))
2192 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2193 = loop_vect;
2194 }
2195 }
2196 }
2197 /* Free optimized alias test DDRS. */
2198 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2199 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2200 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2201 /* Reset target cost data. */
2202 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2203 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2204 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2205 /* Reset accumulated rgroup information. */
2206 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2207 /* Reset assorted flags. */
2208 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2209 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2210 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2211 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2212 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2213
2214 goto start_over;
2215 }
2216
2217 /* Function vect_analyze_loop.
2218
2219 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2220 for it. The different analyses will record information in the
2221    loop_vec_info struct.  If ORIG_LOOP_VINFO is not NULL, the epilogue must
2222 be vectorized. */
2223 opt_loop_vec_info
2224 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
2225 vec_info_shared *shared)
2226 {
2227 auto_vector_sizes vector_sizes;
2228
2229 /* Autodetect first vector size we try. */
2230 current_vector_size = 0;
2231 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2232 unsigned int next_size = 0;
2233
2234 DUMP_VECT_SCOPE ("analyze_loop_nest");
2235
2236 if (loop_outer (loop)
2237 && loop_vec_info_for_loop (loop_outer (loop))
2238 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2239 return opt_loop_vec_info::failure_at (vect_location,
2240 "outer-loop already vectorized.\n");
2241
2242 if (!find_loop_nest (loop, &shared->loop_nest))
2243 return opt_loop_vec_info::failure_at
2244 (vect_location,
2245 "not vectorized: loop nest containing two or more consecutive inner"
2246 " loops cannot be vectorized\n");
2247
2248 unsigned n_stmts = 0;
2249 poly_uint64 autodetected_vector_size = 0;
2250 while (1)
2251 {
2252 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2253 opt_loop_vec_info loop_vinfo
2254 = vect_analyze_loop_form (loop, shared);
2255 if (!loop_vinfo)
2256 {
2257 if (dump_enabled_p ())
2258 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2259 "bad loop form.\n");
2260 return loop_vinfo;
2261 }
2262
2263 bool fatal = false;
2264
2265 if (orig_loop_vinfo)
2266 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2267
2268 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2269 if (res)
2270 {
2271 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2272
2273 return loop_vinfo;
2274 }
2275
2276 delete loop_vinfo;
2277
2278 if (next_size == 0)
2279 autodetected_vector_size = current_vector_size;
2280
2281 if (next_size < vector_sizes.length ()
2282 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2283 next_size += 1;
2284
2285 if (fatal
2286 || next_size == vector_sizes.length ()
2287 || known_eq (current_vector_size, 0U))
2288 return opt_loop_vec_info::propagate_failure (res);
2289
2290 /* Try the next biggest vector size. */
2291 current_vector_size = vector_sizes[next_size++];
2292 if (dump_enabled_p ())
2293 {
2294 dump_printf_loc (MSG_NOTE, vect_location,
2295 "***** Re-trying analysis with "
2296 "vector size ");
2297 dump_dec (MSG_NOTE, current_vector_size);
2298 dump_printf (MSG_NOTE, "\n");
2299 }
2300 }
2301 }
2302
2303 /* Return true if there is an in-order reduction function for CODE, storing
2304 it in *REDUC_FN if so. */
2305
2306 static bool
2307 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2308 {
2309 switch (code)
2310 {
2311 case PLUS_EXPR:
2312 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2313 return true;
2314
2315 default:
2316 return false;
2317 }
2318 }
2319
2320 /* Function reduction_fn_for_scalar_code
2321
2322 Input:
2323    CODE - tree_code of a reduction operation.
2324
2325 Output:
2326 REDUC_FN - the corresponding internal function to be used to reduce the
2327 vector of partial results into a single scalar result, or IFN_LAST
2328 if the operation is a supported reduction operation, but does not have
2329 such an internal function.
2330
2331    Return FALSE if CODE currently cannot be vectorized as a reduction.  */
2332
2333 static bool
2334 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2335 {
2336 switch (code)
2337 {
2338 case MAX_EXPR:
2339 *reduc_fn = IFN_REDUC_MAX;
2340 return true;
2341
2342 case MIN_EXPR:
2343 *reduc_fn = IFN_REDUC_MIN;
2344 return true;
2345
2346 case PLUS_EXPR:
2347 *reduc_fn = IFN_REDUC_PLUS;
2348 return true;
2349
2350 case BIT_AND_EXPR:
2351 *reduc_fn = IFN_REDUC_AND;
2352 return true;
2353
2354 case BIT_IOR_EXPR:
2355 *reduc_fn = IFN_REDUC_IOR;
2356 return true;
2357
2358 case BIT_XOR_EXPR:
2359 *reduc_fn = IFN_REDUC_XOR;
2360 return true;
2361
2362 case MULT_EXPR:
2363 case MINUS_EXPR:
2364 *reduc_fn = IFN_LAST;
2365 return true;
2366
2367 default:
2368 return false;
2369 }
2370 }
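
/* Editorial sketch (not part of this file): what "*reduc_fn = IFN_LAST"
   implies for the reduction epilogue.  With no single reduction instruction
   (as for MULT_EXPR above), the final scalar is obtained by log2(VF) halving
   steps, combining the two halves with the scalar operation each time.
   This is a plain-C stand-in for a 4-element vector of ints, showing one
   possible shape of such a generic fallback.  */

static int
sketch_reduce_by_halving (const int v[4])
{
  int tmp[4] = { v[0], v[1], v[2], v[3] };

  /* Step 1: combine the upper half into the lower half.  */
  tmp[0] *= tmp[2];
  tmp[1] *= tmp[3];
  /* Step 2: two elements left; combine them.  */
  tmp[0] *= tmp[1];

  return tmp[0];
}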
2371
2372 /* If there is a neutral value X such that SLP reduction NODE would not
2373 be affected by the introduction of additional X elements, return that X,
2374 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2375 is true if the SLP statements perform a single reduction, false if each
2376 statement performs an independent reduction. */
2377
2378 static tree
2379 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2380 bool reduc_chain)
2381 {
2382 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2383 stmt_vec_info stmt_vinfo = stmts[0];
2384 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2385 tree scalar_type = TREE_TYPE (vector_type);
2386 struct loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2387 gcc_assert (loop);
2388
2389 switch (code)
2390 {
2391 case WIDEN_SUM_EXPR:
2392 case DOT_PROD_EXPR:
2393 case SAD_EXPR:
2394 case PLUS_EXPR:
2395 case MINUS_EXPR:
2396 case BIT_IOR_EXPR:
2397 case BIT_XOR_EXPR:
2398 return build_zero_cst (scalar_type);
2399
2400 case MULT_EXPR:
2401 return build_one_cst (scalar_type);
2402
2403 case BIT_AND_EXPR:
2404 return build_all_ones_cst (scalar_type);
2405
2406 case MAX_EXPR:
2407 case MIN_EXPR:
2408 /* For MIN/MAX the initial values are neutral. A reduction chain
2409 has only a single initial value, so that value is neutral for
2410 all statements. */
2411 if (reduc_chain)
2412 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2413 loop_preheader_edge (loop));
2414 return NULL_TREE;
2415
2416 default:
2417 return NULL_TREE;
2418 }
2419 }
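
/* Editorial sketch (not part of this file): what "neutral" means above.
   Padding a reduction with extra neutral elements must not change the
   result, e.g. adding zeros to a sum, multiplying by ones, or AND-ing with
   all-ones.  Tiny scalar check of that property; the values are arbitrary.  */

static int
sketch_neutral_padding_ok (void)
{
  int vals[3] = { 5, 7, 9 };
  int sum = 0, prod = 1, all = -1;

  for (int i = 0; i < 3; i++)
    {
      sum += vals[i];
      prod *= vals[i];
      all &= vals[i];
    }

  /* Appending one neutral element per operation changes nothing.  */
  return (sum + 0 == sum) && (prod * 1 == prod) && ((all & -1) == all);
}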
2420
2421 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2422 STMT is printed with a message MSG. */
2423
2424 static void
2425 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2426 {
2427 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
2428 }
2429
2430 /* DEF_STMT_INFO occurs in a loop that contains a potential reduction
2431 operation. Return true if the results of DEF_STMT_INFO are something
2432 that can be accumulated by such a reduction. */
2433
2434 static bool
2435 vect_valid_reduction_input_p (stmt_vec_info def_stmt_info)
2436 {
2437 return (is_gimple_assign (def_stmt_info->stmt)
2438 || is_gimple_call (def_stmt_info->stmt)
2439 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_induction_def
2440 || (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI
2441 && STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_internal_def
2442 && !is_loop_header_bb_p (gimple_bb (def_stmt_info->stmt))));
2443 }
2444
2445 /* Detect SLP reduction of the form:
2446
2447 #a1 = phi <a5, a0>
2448 a2 = operation (a1)
2449 a3 = operation (a2)
2450 a4 = operation (a3)
2451 a5 = operation (a4)
2452
2453 #a = phi <a5>
2454
2455 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2456 FIRST_STMT is the first reduction stmt in the chain
2457 (a2 = operation (a1)).
2458
2459 Return TRUE if a reduction chain was detected. */
2460
2461 static bool
2462 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2463 gimple *first_stmt)
2464 {
2465 struct loop *loop = (gimple_bb (phi))->loop_father;
2466 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2467 enum tree_code code;
2468 gimple *loop_use_stmt = NULL;
2469 stmt_vec_info use_stmt_info;
2470 tree lhs;
2471 imm_use_iterator imm_iter;
2472 use_operand_p use_p;
2473 int nloop_uses, size = 0, n_out_of_loop_uses;
2474 bool found = false;
2475
2476 if (loop != vect_loop)
2477 return false;
2478
2479 auto_vec<stmt_vec_info, 8> reduc_chain;
2480 lhs = PHI_RESULT (phi);
2481 code = gimple_assign_rhs_code (first_stmt);
2482 while (1)
2483 {
2484 nloop_uses = 0;
2485 n_out_of_loop_uses = 0;
2486 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2487 {
2488 gimple *use_stmt = USE_STMT (use_p);
2489 if (is_gimple_debug (use_stmt))
2490 continue;
2491
2492 /* Check if we got back to the reduction phi. */
2493 if (use_stmt == phi)
2494 {
2495 loop_use_stmt = use_stmt;
2496 found = true;
2497 break;
2498 }
2499
2500 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2501 {
2502 loop_use_stmt = use_stmt;
2503 nloop_uses++;
2504 }
2505 else
2506 n_out_of_loop_uses++;
2507
2508	  /* There can be either a single use in the loop or two uses in
2509 phi nodes. */
2510 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2511 return false;
2512 }
2513
2514 if (found)
2515 break;
2516
2517 /* We reached a statement with no loop uses. */
2518 if (nloop_uses == 0)
2519 return false;
2520
2521 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2522 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2523 return false;
2524
2525 if (!is_gimple_assign (loop_use_stmt)
2526 || code != gimple_assign_rhs_code (loop_use_stmt)
2527 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2528 return false;
2529
2530 /* Insert USE_STMT into reduction chain. */
2531 use_stmt_info = loop_info->lookup_stmt (loop_use_stmt);
2532 reduc_chain.safe_push (use_stmt_info);
2533
2534 lhs = gimple_assign_lhs (loop_use_stmt);
2535 size++;
2536 }
2537
2538 if (!found || loop_use_stmt != phi || size < 2)
2539 return false;
2540
2541 /* Swap the operands, if needed, to make the reduction operand be the second
2542 operand. */
2543 lhs = PHI_RESULT (phi);
2544 for (unsigned i = 0; i < reduc_chain.length (); ++i)
2545 {
2546 gassign *next_stmt = as_a <gassign *> (reduc_chain[i]->stmt);
2547 if (gimple_assign_rhs2 (next_stmt) == lhs)
2548 {
2549 tree op = gimple_assign_rhs1 (next_stmt);
2550 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2551
2552 /* Check that the other def is either defined in the loop
2553 ("vect_internal_def"), or it's an induction (defined by a
2554 loop-header phi-node). */
2555 if (def_stmt_info
2556 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2557 && vect_valid_reduction_input_p (def_stmt_info))
2558 {
2559 lhs = gimple_assign_lhs (next_stmt);
2560 continue;
2561 }
2562
2563 return false;
2564 }
2565 else
2566 {
2567 tree op = gimple_assign_rhs2 (next_stmt);
2568 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2569
2570 /* Check that the other def is either defined in the loop
2571 ("vect_internal_def"), or it's an induction (defined by a
2572 loop-header phi-node). */
2573 if (def_stmt_info
2574 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2575 && vect_valid_reduction_input_p (def_stmt_info))
2576 {
2577 if (dump_enabled_p ())
2578 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: %G",
2579 next_stmt);
2580
2581 swap_ssa_operands (next_stmt,
2582 gimple_assign_rhs1_ptr (next_stmt),
2583 gimple_assign_rhs2_ptr (next_stmt));
2584 update_stmt (next_stmt);
2585
2586 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2587 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2588 }
2589 else
2590 return false;
2591 }
2592
2593 lhs = gimple_assign_lhs (next_stmt);
2594 }
2595
2596 /* Build up the actual chain. */
2597 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
2598 {
2599 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
2600 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
2601 }
2602 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
2603 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
2604
2605 /* Save the chain for further analysis in SLP detection. */
2606 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
2607 REDUC_GROUP_SIZE (reduc_chain[0]) = size;
2608
2609 return true;
2610 }
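
/* Editorial sketch (not part of this file): source code that produces the
   reduction chain shape documented above (#a1 = phi <a5, a0> with a2..a5
   each feeding the next statement).  A manually unrolled accumulation is
   the typical origin; N is assumed to be a multiple of 4 here.  */

static int
sketch_reduction_chain (const int *a, int n)
{
  int s = 0;
  for (int i = 0; i < n; i += 4)
    {
      s = s + a[i];      /* a2 = operation (a1) */
      s = s + a[i + 1];  /* a3 = operation (a2) */
      s = s + a[i + 2];  /* a4 = operation (a3) */
      s = s + a[i + 3];  /* a5 = operation (a4) */
    }
  return s;
}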
2611
2612 /* Return true if we need an in-order reduction for operation CODE
2613 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2614 overflow must wrap. */
2615
2616 static bool
2617 needs_fold_left_reduction_p (tree type, tree_code code,
2618 bool need_wrapping_integral_overflow)
2619 {
2620 /* CHECKME: check for !flag_finite_math_only too? */
2621 if (SCALAR_FLOAT_TYPE_P (type))
2622 switch (code)
2623 {
2624 case MIN_EXPR:
2625 case MAX_EXPR:
2626 return false;
2627
2628 default:
2629 return !flag_associative_math;
2630 }
2631
2632 if (INTEGRAL_TYPE_P (type))
2633 {
2634 if (!operation_no_trapping_overflow (type, code))
2635 return true;
2636 if (need_wrapping_integral_overflow
2637 && !TYPE_OVERFLOW_WRAPS (type)
2638 && operation_can_overflow (code))
2639 return true;
2640 return false;
2641 }
2642
2643 if (SAT_FIXED_POINT_TYPE_P (type))
2644 return true;
2645
2646 return false;
2647 }
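
/* Editorial sketch (not part of this file): a reduction for which the
   predicate above returns true.  Without -fassociative-math the order of the
   floating-point additions must be preserved, so the only vectorizable form
   is an in-order (fold-left) reduction, and per fold_left_reduction_fn above
   PLUS_EXPR is the single operation with such an internal function.  */

static double
sketch_in_order_sum (const double *a, int n)
{
  double s = 0.0;
  for (int i = 0; i < n; i++)
    s += a[i];          /* must be accumulated strictly left to right */
  return s;
}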
2648
2649 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2650 reduction operation CODE has a handled computation expression. */
2651
2652 bool
2653 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2654 tree loop_arg, enum tree_code code)
2655 {
2656 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2657 auto_bitmap visited;
2658 tree lookfor = PHI_RESULT (phi);
2659 ssa_op_iter curri;
2660 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2661 while (USE_FROM_PTR (curr) != loop_arg)
2662 curr = op_iter_next_use (&curri);
2663 curri.i = curri.numops;
2664 do
2665 {
2666 path.safe_push (std::make_pair (curri, curr));
2667 tree use = USE_FROM_PTR (curr);
2668 if (use == lookfor)
2669 break;
2670 gimple *def = SSA_NAME_DEF_STMT (use);
2671 if (gimple_nop_p (def)
2672 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2673 {
2674 pop:
2675 do
2676 {
2677 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2678 curri = x.first;
2679 curr = x.second;
2680 do
2681 curr = op_iter_next_use (&curri);
2682 /* Skip already visited or non-SSA operands (from iterating
2683 over PHI args). */
2684 while (curr != NULL_USE_OPERAND_P
2685 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2686 || ! bitmap_set_bit (visited,
2687 SSA_NAME_VERSION
2688 (USE_FROM_PTR (curr)))));
2689 }
2690 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2691 if (curr == NULL_USE_OPERAND_P)
2692 break;
2693 }
2694 else
2695 {
2696 if (gimple_code (def) == GIMPLE_PHI)
2697 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2698 else
2699 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2700 while (curr != NULL_USE_OPERAND_P
2701 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2702 || ! bitmap_set_bit (visited,
2703 SSA_NAME_VERSION
2704 (USE_FROM_PTR (curr)))))
2705 curr = op_iter_next_use (&curri);
2706 if (curr == NULL_USE_OPERAND_P)
2707 goto pop;
2708 }
2709 }
2710 while (1);
2711 if (dump_file && (dump_flags & TDF_DETAILS))
2712 {
2713 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2714 unsigned i;
2715 std::pair<ssa_op_iter, use_operand_p> *x;
2716 FOR_EACH_VEC_ELT (path, i, x)
2717 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
2718 dump_printf (MSG_NOTE, "\n");
2719 }
2720
2721 /* Check whether the reduction path detected is valid. */
2722 bool fail = path.length () == 0;
2723 bool neg = false;
2724 for (unsigned i = 1; i < path.length (); ++i)
2725 {
2726 gimple *use_stmt = USE_STMT (path[i].second);
2727 tree op = USE_FROM_PTR (path[i].second);
2728 if (! has_single_use (op)
2729 || ! is_gimple_assign (use_stmt))
2730 {
2731 fail = true;
2732 break;
2733 }
2734 if (gimple_assign_rhs_code (use_stmt) != code)
2735 {
2736 if (code == PLUS_EXPR
2737 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2738 {
2739 /* Track whether we negate the reduction value each iteration. */
2740 if (gimple_assign_rhs2 (use_stmt) == op)
2741 neg = ! neg;
2742 }
2743 else
2744 {
2745 fail = true;
2746 break;
2747 }
2748 }
2749 }
2750 return ! fail && ! neg;
2751 }
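
/* Editorial sketch (not part of this file): the "neg" tracking above.  In
   the first loop the reduction value only ever appears as the minuend, so
   the MINUS statement merely subtracts a non-reduction value and the path is
   accepted as a PLUS reduction.  In the second loop the reduction value is
   the subtrahend, so it is negated on every iteration and the path is
   rejected.  */

static int
sketch_accepted_mixed_path (const int *a, const int *b, int n)
{
  int s = 0;
  for (int i = 0; i < n; i++)
    {
      s = s + a[i];
      s = s - b[i];     /* s is rhs1 (minuend): neg stays false */
    }
  return s;
}

static int
sketch_rejected_negating_path (const int *a, int n)
{
  int s = 0;
  for (int i = 0; i < n; i++)
    s = a[i] - s;       /* s is rhs2 (subtrahend): neg toggles each time */
  return s;
}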
2752
2753
2754 /* Function vect_is_simple_reduction
2755
2756 (1) Detect a cross-iteration def-use cycle that represents a simple
2757 reduction computation. We look for the following pattern:
2758
2759 loop_header:
2760 a1 = phi < a0, a2 >
2761 a3 = ...
2762 a2 = operation (a3, a1)
2763
2764 or
2765
2766 a3 = ...
2767 loop_header:
2768 a1 = phi < a0, a2 >
2769 a2 = operation (a3, a1)
2770
2771 such that:
2772 1. operation is commutative and associative and it is safe to
2773 change the order of the computation
2774 2. no uses for a2 in the loop (a2 is used out of the loop)
2775 3. no uses of a1 in the loop besides the reduction operation
2776 4. no uses of a1 outside the loop.
2777
2778 Conditions 1,4 are tested here.
2779 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2780
2781 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2782 nested cycles.
2783
2784 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2785 reductions:
2786
2787 a1 = phi < a0, a2 >
2788 inner loop (def of a3)
2789 a2 = phi < a3 >
2790
2791    (4) Detect condition expressions, i.e.:
2792 for (int i = 0; i < N; i++)
2793 if (a[i] < val)
2794 ret_val = a[i];
2795
2796 */
2797
2798 static stmt_vec_info
2799 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
2800 bool *double_reduc,
2801 bool need_wrapping_integral_overflow,
2802 enum vect_reduction_type *v_reduc_type)
2803 {
2804 gphi *phi = as_a <gphi *> (phi_info->stmt);
2805 struct loop *loop = (gimple_bb (phi))->loop_father;
2806 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2807 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
2808 gimple *phi_use_stmt = NULL;
2809 enum tree_code orig_code, code;
2810 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2811 tree type;
2812 tree name;
2813 imm_use_iterator imm_iter;
2814 use_operand_p use_p;
2815 bool phi_def;
2816
2817 *double_reduc = false;
2818 *v_reduc_type = TREE_CODE_REDUCTION;
2819
2820 tree phi_name = PHI_RESULT (phi);
2821 /* ??? If there are no uses of the PHI result the inner loop reduction
2822 won't be detected as possibly double-reduction by vectorizable_reduction
2823 because that tries to walk the PHI arg from the preheader edge which
2824 can be constant. See PR60382. */
2825 if (has_zero_uses (phi_name))
2826 return NULL;
2827 unsigned nphi_def_loop_uses = 0;
2828 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2829 {
2830 gimple *use_stmt = USE_STMT (use_p);
2831 if (is_gimple_debug (use_stmt))
2832 continue;
2833
2834 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2835 {
2836 if (dump_enabled_p ())
2837 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2838 "intermediate value used outside loop.\n");
2839
2840 return NULL;
2841 }
2842
2843 nphi_def_loop_uses++;
2844 phi_use_stmt = use_stmt;
2845 }
2846
2847 edge latch_e = loop_latch_edge (loop);
2848 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2849 if (TREE_CODE (loop_arg) != SSA_NAME)
2850 {
2851 if (dump_enabled_p ())
2852 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2853 "reduction: not ssa_name: %T\n", loop_arg);
2854 return NULL;
2855 }
2856
2857 stmt_vec_info def_stmt_info = loop_info->lookup_def (loop_arg);
2858 if (!def_stmt_info
2859 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
2860 return NULL;
2861
2862 if (gassign *def_stmt = dyn_cast <gassign *> (def_stmt_info->stmt))
2863 {
2864 name = gimple_assign_lhs (def_stmt);
2865 phi_def = false;
2866 }
2867 else if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
2868 {
2869 name = PHI_RESULT (def_stmt);
2870 phi_def = true;
2871 }
2872 else
2873 {
2874 if (dump_enabled_p ())
2875 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2876 "reduction: unhandled reduction operation: %G",
2877 def_stmt_info->stmt);
2878 return NULL;
2879 }
2880
2881 unsigned nlatch_def_loop_uses = 0;
2882 auto_vec<gphi *, 3> lcphis;
2883 bool inner_loop_of_double_reduc = false;
2884 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2885 {
2886 gimple *use_stmt = USE_STMT (use_p);
2887 if (is_gimple_debug (use_stmt))
2888 continue;
2889 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2890 nlatch_def_loop_uses++;
2891 else
2892 {
2893 /* We can have more than one loop-closed PHI. */
2894 lcphis.safe_push (as_a <gphi *> (use_stmt));
2895 if (nested_in_vect_loop
2896 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
2897 == vect_double_reduction_def))
2898 inner_loop_of_double_reduc = true;
2899 }
2900 }
2901
2902 /* If this isn't a nested cycle or if the nested cycle reduction value
2903      is used outside of the inner loop, we cannot handle uses of the reduction
2904 value. */
2905 if ((!nested_in_vect_loop || inner_loop_of_double_reduc)
2906 && (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1))
2907 {
2908 if (dump_enabled_p ())
2909 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2910 "reduction used in loop.\n");
2911 return NULL;
2912 }
2913
2914 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2915 defined in the inner loop. */
2916 if (phi_def)
2917 {
2918 gphi *def_stmt = as_a <gphi *> (def_stmt_info->stmt);
2919 op1 = PHI_ARG_DEF (def_stmt, 0);
2920
2921 if (gimple_phi_num_args (def_stmt) != 1
2922 || TREE_CODE (op1) != SSA_NAME)
2923 {
2924 if (dump_enabled_p ())
2925 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2926 "unsupported phi node definition.\n");
2927
2928 return NULL;
2929 }
2930
2931 gimple *def1 = SSA_NAME_DEF_STMT (op1);
2932 if (gimple_bb (def1)
2933 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2934 && loop->inner
2935 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2936 && is_gimple_assign (def1)
2937 && is_a <gphi *> (phi_use_stmt)
2938 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2939 {
2940 if (dump_enabled_p ())
2941 report_vect_op (MSG_NOTE, def_stmt,
2942 "detected double reduction: ");
2943
2944 *double_reduc = true;
2945 return def_stmt_info;
2946 }
2947
2948 return NULL;
2949 }
2950
2951   /* If we are vectorizing an inner reduction, we execute it in the
2952      original order only when we are not dealing with a double
2953      reduction.  */
2954 bool check_reduction = true;
2955 if (flow_loop_nested_p (vect_loop, loop))
2956 {
2957 gphi *lcphi;
2958 unsigned i;
2959 check_reduction = false;
2960 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
2961 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
2962 {
2963 gimple *use_stmt = USE_STMT (use_p);
2964 if (is_gimple_debug (use_stmt))
2965 continue;
2966 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
2967 check_reduction = true;
2968 }
2969 }
2970
2971 gassign *def_stmt = as_a <gassign *> (def_stmt_info->stmt);
2972 code = orig_code = gimple_assign_rhs_code (def_stmt);
2973
2974 if (nested_in_vect_loop && !check_reduction)
2975 {
2976 /* FIXME: Even for non-reductions code generation is funneled
2977 through vectorizable_reduction for the stmt defining the
2978 PHI latch value. So we have to artificially restrict ourselves
2979 for the supported operations. */
2980 switch (get_gimple_rhs_class (code))
2981 {
2982 case GIMPLE_BINARY_RHS:
2983 case GIMPLE_TERNARY_RHS:
2984 break;
2985 default:
2986 /* Not supported by vectorizable_reduction. */
2987 if (dump_enabled_p ())
2988 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2989 "nested cycle: not handled operation: ");
2990 return NULL;
2991 }
2992 if (dump_enabled_p ())
2993 report_vect_op (MSG_NOTE, def_stmt, "detected nested cycle: ");
2994 return def_stmt_info;
2995 }
2996
2997   /* We can handle "res -= x[i]", which is non-associative, by simply
2998      rewriting it into "res += -x[i]".  Avoid changing the gimple
2999      instruction for the first simple tests and only do this
3000      if we're allowed to change code at all.  */
3001 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3002 code = PLUS_EXPR;
3003
3004 if (code == COND_EXPR)
3005 {
3006 if (! nested_in_vect_loop)
3007 *v_reduc_type = COND_REDUCTION;
3008
3009 op3 = gimple_assign_rhs1 (def_stmt);
3010 if (COMPARISON_CLASS_P (op3))
3011 {
3012 op4 = TREE_OPERAND (op3, 1);
3013 op3 = TREE_OPERAND (op3, 0);
3014 }
3015 if (op3 == phi_name || op4 == phi_name)
3016 {
3017 if (dump_enabled_p ())
3018 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3019 "reduction: condition depends on previous"
3020 " iteration: ");
3021 return NULL;
3022 }
3023
3024 op1 = gimple_assign_rhs2 (def_stmt);
3025 op2 = gimple_assign_rhs3 (def_stmt);
3026 }
3027 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3028 {
3029 if (dump_enabled_p ())
3030 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3031 "reduction: not commutative/associative: ");
3032 return NULL;
3033 }
3034 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3035 {
3036 op1 = gimple_assign_rhs1 (def_stmt);
3037 op2 = gimple_assign_rhs2 (def_stmt);
3038 }
3039 else
3040 {
3041 if (dump_enabled_p ())
3042 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3043 "reduction: not handled operation: ");
3044 return NULL;
3045 }
3046
3047 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3048 {
3049 if (dump_enabled_p ())
3050 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3051 "reduction: both uses not ssa_names: ");
3052
3053 return NULL;
3054 }
3055
3056 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3057 if ((TREE_CODE (op1) == SSA_NAME
3058 && !types_compatible_p (type,TREE_TYPE (op1)))
3059 || (TREE_CODE (op2) == SSA_NAME
3060 && !types_compatible_p (type, TREE_TYPE (op2)))
3061 || (op3 && TREE_CODE (op3) == SSA_NAME
3062 && !types_compatible_p (type, TREE_TYPE (op3)))
3063 || (op4 && TREE_CODE (op4) == SSA_NAME
3064 && !types_compatible_p (type, TREE_TYPE (op4))))
3065 {
3066 if (dump_enabled_p ())
3067 {
3068 dump_printf_loc (MSG_NOTE, vect_location,
3069 "reduction: multiple types: operation type: "
3070 "%T, operands types: %T,%T",
3071 type, TREE_TYPE (op1), TREE_TYPE (op2));
3072 if (op3)
3073 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op3));
3074
3075 if (op4)
3076 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op4));
3077 dump_printf (MSG_NOTE, "\n");
3078 }
3079
3080 return NULL;
3081 }
3082
3083 /* Check whether it's ok to change the order of the computation.
3084 Generally, when vectorizing a reduction we change the order of the
3085 computation. This may change the behavior of the program in some
3086 cases, so we need to check that this is ok. One exception is when
3087 vectorizing an outer-loop: the inner-loop is executed sequentially,
3088 and therefore vectorizing reductions in the inner-loop during
3089 outer-loop vectorization is safe. */
3090 if (check_reduction
3091 && *v_reduc_type == TREE_CODE_REDUCTION
3092 && needs_fold_left_reduction_p (type, code,
3093 need_wrapping_integral_overflow))
3094 *v_reduc_type = FOLD_LEFT_REDUCTION;
3095
3096 /* Reduction is safe. We're dealing with one of the following:
3097 1) integer arithmetic and no trapv
3098 2) floating point arithmetic, and special flags permit this optimization
3099 3) nested cycle (i.e., outer loop vectorization). */
3100 stmt_vec_info def1_info = loop_info->lookup_def (op1);
3101 stmt_vec_info def2_info = loop_info->lookup_def (op2);
3102 if (code != COND_EXPR && !def1_info && !def2_info)
3103 {
3104 if (dump_enabled_p ())
3105 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3106 return NULL;
3107 }
3108
3109 /* Check that one def is the reduction def, defined by PHI,
3110 the other def is either defined in the loop ("vect_internal_def"),
3111 or it's an induction (defined by a loop-header phi-node). */
3112
3113 if (def2_info
3114 && def2_info->stmt == phi
3115 && (code == COND_EXPR
3116 || !def1_info
3117 || !flow_bb_inside_loop_p (loop, gimple_bb (def1_info->stmt))
3118 || vect_valid_reduction_input_p (def1_info)))
3119 {
3120 if (dump_enabled_p ())
3121 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3122 return def_stmt_info;
3123 }
3124
3125 if (def1_info
3126 && def1_info->stmt == phi
3127 && (code == COND_EXPR
3128 || !def2_info
3129 || !flow_bb_inside_loop_p (loop, gimple_bb (def2_info->stmt))
3130 || vect_valid_reduction_input_p (def2_info)))
3131 {
3132 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3133 {
3134 /* Check if we can swap operands (just for simplicity - so that
3135 the rest of the code can assume that the reduction variable
3136 is always the last (second) argument). */
3137 if (code == COND_EXPR)
3138 {
3139 /* Swap cond_expr by inverting the condition. */
3140 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3141 enum tree_code invert_code = ERROR_MARK;
3142 enum tree_code cond_code = TREE_CODE (cond_expr);
3143
3144 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3145 {
3146 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3147 invert_code = invert_tree_comparison (cond_code, honor_nans);
3148 }
3149 if (invert_code != ERROR_MARK)
3150 {
3151 TREE_SET_CODE (cond_expr, invert_code);
3152 swap_ssa_operands (def_stmt,
3153 gimple_assign_rhs2_ptr (def_stmt),
3154 gimple_assign_rhs3_ptr (def_stmt));
3155 }
3156 else
3157 {
3158 if (dump_enabled_p ())
3159 report_vect_op (MSG_NOTE, def_stmt,
3160 "detected reduction: cannot swap operands "
3161 "for cond_expr");
3162 return NULL;
3163 }
3164 }
3165 else
3166 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3167 gimple_assign_rhs2_ptr (def_stmt));
3168
3169 if (dump_enabled_p ())
3170 report_vect_op (MSG_NOTE, def_stmt,
3171 "detected reduction: need to swap operands: ");
3172
3173 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3174 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3175 }
3176 else
3177 {
3178 if (dump_enabled_p ())
3179 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3180 }
3181
3182 return def_stmt_info;
3183 }
3184
3185 /* Try to find SLP reduction chain. */
3186 if (! nested_in_vect_loop
3187 && code != COND_EXPR
3188 && orig_code != MINUS_EXPR
3189 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3190 {
3191 if (dump_enabled_p ())
3192 report_vect_op (MSG_NOTE, def_stmt,
3193 "reduction: detected reduction chain: ");
3194
3195 return def_stmt_info;
3196 }
3197
3198 /* Look for the expression computing loop_arg from loop PHI result. */
3199 if (check_reduction_path (vect_location, loop, phi, loop_arg, code))
3200 return def_stmt_info;
3201
3202 if (dump_enabled_p ())
3203 {
3204 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3205 "reduction: unknown pattern: ");
3206 }
3207
3208 return NULL;
3209 }
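
/* Editorial sketch (not part of this file): one source-level shape that
   gives rise to case (3) of the function comment above (a double
   reduction).  The outer accumulator "s" is carried through the inner
   loop's reduction, producing the a1 = phi <a0, a2> / inner loop /
   a2 = phi <a3> cycle described there.  The bounds are arbitrary.  */

static int
sketch_double_reduction (int a[8][16])
{
  int s = 0;
  for (int j = 0; j < 8; j++)
    for (int i = 0; i < 16; i++)
      s += a[j][i];
  return s;
}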
3210
3211 /* Wrapper around vect_is_simple_reduction, which will modify code
3212 in-place if it enables detection of more reductions. Arguments
3213    are as for vect_is_simple_reduction.  */
3214
3215 stmt_vec_info
3216 vect_force_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3217 bool *double_reduc,
3218 bool need_wrapping_integral_overflow)
3219 {
3220 enum vect_reduction_type v_reduc_type;
3221 stmt_vec_info def_info
3222 = vect_is_simple_reduction (loop_info, phi_info, double_reduc,
3223 need_wrapping_integral_overflow,
3224 &v_reduc_type);
3225 if (def_info)
3226 {
3227 STMT_VINFO_REDUC_TYPE (phi_info) = v_reduc_type;
3228 STMT_VINFO_REDUC_DEF (phi_info) = def_info;
3229 STMT_VINFO_REDUC_TYPE (def_info) = v_reduc_type;
3230 STMT_VINFO_REDUC_DEF (def_info) = phi_info;
3231 }
3232 return def_info;
3233 }
3234
3235 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3236 int
3237 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3238 int *peel_iters_epilogue,
3239 stmt_vector_for_cost *scalar_cost_vec,
3240 stmt_vector_for_cost *prologue_cost_vec,
3241 stmt_vector_for_cost *epilogue_cost_vec)
3242 {
3243 int retval = 0;
3244 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3245
3246 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3247 {
3248 *peel_iters_epilogue = assumed_vf / 2;
3249 if (dump_enabled_p ())
3250 dump_printf_loc (MSG_NOTE, vect_location,
3251 "cost model: epilogue peel iters set to vf/2 "
3252 "because loop iterations are unknown .\n");
3253
3254       /* If peeled iterations are known but the number of scalar loop
3255	 iterations is unknown, count a taken branch per peeled loop.  */
3256 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3257 NULL, 0, vect_prologue);
3258 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3259 NULL, 0, vect_epilogue);
3260 }
3261 else
3262 {
3263 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3264 peel_iters_prologue = niters < peel_iters_prologue ?
3265 niters : peel_iters_prologue;
3266 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3267 /* If we need to peel for gaps, but no peeling is required, we have to
3268 peel VF iterations. */
3269 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3270 *peel_iters_epilogue = assumed_vf;
3271 }
3272
3273 stmt_info_for_cost *si;
3274 int j;
3275 if (peel_iters_prologue)
3276 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3277 retval += record_stmt_cost (prologue_cost_vec,
3278 si->count * peel_iters_prologue,
3279 si->kind, si->stmt_info, si->misalign,
3280 vect_prologue);
3281 if (*peel_iters_epilogue)
3282 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3283 retval += record_stmt_cost (epilogue_cost_vec,
3284 si->count * *peel_iters_epilogue,
3285 si->kind, si->stmt_info, si->misalign,
3286 vect_epilogue);
3287
3288 return retval;
3289 }
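
/* Editorial sketch (not part of this file): the known-NITERS arithmetic
   above with concrete numbers.  With niters = 100, a prologue peel of 3 and
   an assumed VF of 8, the epilogue gets (100 - 3) % 8 = 1 iteration; if the
   loop also peels for gaps and that remainder had been 0, a full VF of 8
   epilogue iterations would be charged instead.  */

static int
sketch_epilogue_iters (int niters, int peel_prologue, int vf,
                       int peeling_for_gaps)
{
  if (peel_prologue > niters)
    peel_prologue = niters;
  int peel_epilogue = (niters - peel_prologue) % vf;
  if (peeling_for_gaps && peel_epilogue == 0)
    peel_epilogue = vf;
  return peel_epilogue;
}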
3290
3291 /* Function vect_estimate_min_profitable_iters
3292
3293 Return the number of iterations required for the vector version of the
3294 loop to be profitable relative to the cost of the scalar version of the
3295 loop.
3296
3297 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3298    of iterations for vectorization.  A value of -1 means loop vectorization
3299    is not profitable.  This returned value may be used for a dynamic
3300    profitability check.
3301
3302 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3303    for a static check against the estimated number of iterations.  */
3304
3305 static void
3306 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3307 int *ret_min_profitable_niters,
3308 int *ret_min_profitable_estimate)
3309 {
3310 int min_profitable_iters;
3311 int min_profitable_estimate;
3312 int peel_iters_prologue;
3313 int peel_iters_epilogue;
3314 unsigned vec_inside_cost = 0;
3315 int vec_outside_cost = 0;
3316 unsigned vec_prologue_cost = 0;
3317 unsigned vec_epilogue_cost = 0;
3318 int scalar_single_iter_cost = 0;
3319 int scalar_outside_cost = 0;
3320 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3321 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3322 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3323
3324 /* Cost model disabled. */
3325 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3326 {
3327 if (dump_enabled_p ())
3328 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3329 *ret_min_profitable_niters = 0;
3330 *ret_min_profitable_estimate = 0;
3331 return;
3332 }
3333
3334 /* Requires loop versioning tests to handle misalignment. */
3335 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3336 {
3337 /* FIXME: Make cost depend on complexity of individual check. */
3338 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3339 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3340 vect_prologue);
3341 if (dump_enabled_p ())
3342 dump_printf (MSG_NOTE,
3343 "cost model: Adding cost of checks for loop "
3344 "versioning to treat misalignment.\n");
3345 }
3346
3347 /* Requires loop versioning with alias checks. */
3348 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3349 {
3350 /* FIXME: Make cost depend on complexity of individual check. */
3351 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3352 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3353 vect_prologue);
3354 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3355 if (len)
3356 /* Count LEN - 1 ANDs and LEN comparisons. */
3357 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3358 NULL, 0, vect_prologue);
3359 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3360 if (len)
3361 {
3362 /* Count LEN - 1 ANDs and LEN comparisons. */
3363 unsigned int nstmts = len * 2 - 1;
3364 /* +1 for each bias that needs adding. */
3365 for (unsigned int i = 0; i < len; ++i)
3366 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3367 nstmts += 1;
3368 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3369 NULL, 0, vect_prologue);
3370 }
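      /* Editorial worked example (not from the original sources): with two
	 lower bounds, one of them signed, the block above charges
	 2 * 2 - 1 + 1 = 4 scalar statements, on top of the LEN vector
	 statements for the compiled alias pairs and the LEN * 2 - 1 scalar
	 statements for the unequal-address checks.  */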
3371 if (dump_enabled_p ())
3372 dump_printf (MSG_NOTE,
3373 "cost model: Adding cost of checks for loop "
3374 "versioning aliasing.\n");
3375 }
3376
3377 /* Requires loop versioning with niter checks. */
3378 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3379 {
3380 /* FIXME: Make cost depend on complexity of individual check. */
3381 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3382 vect_prologue);
3383 if (dump_enabled_p ())
3384 dump_printf (MSG_NOTE,
3385 "cost model: Adding cost of checks for loop "
3386 "versioning niters.\n");
3387 }
3388
3389 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3390 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3391 vect_prologue);
3392
3393   /* Count statements in the scalar loop.  Use this as the scalar cost for a
3394      single iteration for now.
3395
3396 TODO: Add outer loop support.
3397
3398 TODO: Consider assigning different costs to different scalar
3399 statements. */
3400
3401 scalar_single_iter_cost
3402 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3403
3404 /* Add additional cost for the peeled instructions in prologue and epilogue
3405 loop. (For fully-masked loops there will be no peeling.)
3406
3407 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3408 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3409
3410 TODO: Build an expression that represents peel_iters for prologue and
3411 epilogue to be used in a run-time test. */
3412
3413 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3414 {
3415 peel_iters_prologue = 0;
3416 peel_iters_epilogue = 0;
3417
3418 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3419 {
3420 /* We need to peel exactly one iteration. */
3421 peel_iters_epilogue += 1;
3422 stmt_info_for_cost *si;
3423 int j;
3424 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3425 j, si)
3426 (void) add_stmt_cost (target_cost_data, si->count,
3427 si->kind, si->stmt_info, si->misalign,
3428 vect_epilogue);
3429 }
3430 }
3431 else if (npeel < 0)
3432 {
3433 peel_iters_prologue = assumed_vf / 2;
3434 if (dump_enabled_p ())
3435 dump_printf (MSG_NOTE, "cost model: "
3436 "prologue peel iters set to vf/2.\n");
3437
3438 /* If peeling for alignment is unknown, loop bound of main loop becomes
3439 unknown. */
3440 peel_iters_epilogue = assumed_vf / 2;
3441 if (dump_enabled_p ())
3442 dump_printf (MSG_NOTE, "cost model: "
3443 "epilogue peel iters set to vf/2 because "
3444 "peeling for alignment is unknown.\n");
3445
3446 /* If peeled iterations are unknown, count a taken branch and a not taken
3447 branch per peeled loop. Even if scalar loop iterations are known,
3448 vector iterations are not known since peeled prologue iterations are
3449 not known. Hence guards remain the same. */
3450 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3451 NULL, 0, vect_prologue);
3452 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3453 NULL, 0, vect_prologue);
3454 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3455 NULL, 0, vect_epilogue);
3456 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3457 NULL, 0, vect_epilogue);
3458 stmt_info_for_cost *si;
3459 int j;
3460 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3461 {
3462 (void) add_stmt_cost (target_cost_data,
3463 si->count * peel_iters_prologue,
3464 si->kind, si->stmt_info, si->misalign,
3465 vect_prologue);
3466 (void) add_stmt_cost (target_cost_data,
3467 si->count * peel_iters_epilogue,
3468 si->kind, si->stmt_info, si->misalign,
3469 vect_epilogue);
3470 }
3471 }
3472 else
3473 {
3474 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3475 stmt_info_for_cost *si;
3476 int j;
3477 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3478
3479 prologue_cost_vec.create (2);
3480 epilogue_cost_vec.create (2);
3481 peel_iters_prologue = npeel;
3482
3483 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3484 &peel_iters_epilogue,
3485 &LOOP_VINFO_SCALAR_ITERATION_COST
3486 (loop_vinfo),
3487 &prologue_cost_vec,
3488 &epilogue_cost_vec);
3489
3490 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3491 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3492 si->misalign, vect_prologue);
3493
3494 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3495 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3496 si->misalign, vect_epilogue);
3497
3498 prologue_cost_vec.release ();
3499 epilogue_cost_vec.release ();
3500 }
3501
3502 /* FORNOW: The scalar outside cost is incremented in one of the
3503 following ways:
3504
3505 1. The vectorizer checks for alignment and aliasing and generates
3506 a condition that allows dynamic vectorization. A cost model
3507        check is ANDed with the versioning condition.  Hence the scalar code
3508 path now has the added cost of the versioning check.
3509
3510 if (cost > th & versioning_check)
3511 jmp to vector code
3512
3513        Hence the run-time scalar cost is incremented by a not-taken branch cost.
3514
3515 2. The vectorizer then checks if a prologue is required. If the
3516 cost model check was not done before during versioning, it has to
3517 be done before the prologue check.
3518
3519 if (cost <= th)
3520 prologue = scalar_iters
3521 if (prologue == 0)
3522 jmp to vector code
3523 else
3524 execute prologue
3525 if (prologue == num_iters)
3526 go to exit
3527
3528 Hence the run-time scalar cost is incremented by a taken branch,
3529 plus a not-taken branch, plus a taken branch cost.
3530
3531 3. The vectorizer then checks if an epilogue is required. If the
3532 cost model check was not done before during prologue check, it
3533 has to be done with the epilogue check.
3534
3535 if (prologue == 0)
3536 jmp to vector code
3537 else
3538 execute prologue
3539 if (prologue == num_iters)
3540 go to exit
3541 vector code:
3542 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3543 jmp to epilogue
3544
3545 Hence the run-time scalar cost should be incremented by 2 taken
3546 branches.
3547
3548        TODO: The back end may reorder the BBs differently and reverse
3549 conditions/branch directions. Change the estimates below to
3550 something more reasonable. */
3551
3552 /* If the number of iterations is known and we do not do versioning, we can
3553 decide whether to vectorize at compile time. Hence the scalar version
3554      does not carry cost model guard costs.  */
3555 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3556 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3557 {
3558 /* Cost model check occurs at versioning. */
3559 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3560 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3561 else
3562 {
3563 /* Cost model check occurs at prologue generation. */
3564 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3565 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3566 + vect_get_stmt_cost (cond_branch_not_taken);
3567 /* Cost model check occurs at epilogue generation. */
3568 else
3569 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3570 }
3571 }
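   /* For example, with a target cost hook that returns 3 for a taken branch
      and 1 for a not-taken branch (the usual default), the three cases above
      add 1, 2 * 3 + 1 = 7 and 2 * 3 = 6 units respectively to
      SCALAR_OUTSIDE_COST.  */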
3572
3573 /* Complete the target-specific cost calculations. */
3574 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3575 &vec_inside_cost, &vec_epilogue_cost);
3576
3577 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3578
3579 if (dump_enabled_p ())
3580 {
3581 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3582 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3583 vec_inside_cost);
3584 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3585 vec_prologue_cost);
3586 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3587 vec_epilogue_cost);
3588 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3589 scalar_single_iter_cost);
3590 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3591 scalar_outside_cost);
3592 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3593 vec_outside_cost);
3594 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3595 peel_iters_prologue);
3596 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3597 peel_iters_epilogue);
3598 }
3599
3600 /* Calculate number of iterations required to make the vector version
3601 profitable, relative to the loop bodies only. The following condition
3602 must hold true:
3603 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
3604 where
3605 SIC = scalar iteration cost, VIC = vector iteration cost,
3606 VOC = vector outside cost, VF = vectorization factor,
3607 NPEEL = prologue iterations + epilogue iterations,
3608 SOC = scalar outside cost for run time cost model check. */
3609
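   /* SAVING_PER_VITER is the benefit of replacing ASSUMED_VF scalar
      iterations with one vector iteration.  For example (illustrative
      numbers), a scalar iteration cost of 4, a vector iteration cost of 6
      and an assumed VF of 4 give a saving of 4 * 4 - 6 = 10 per vector
      iteration.  */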
3610 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
3611 - vec_inside_cost);
3612 if (saving_per_viter <= 0)
3613 {
3614 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3615 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3616 "vectorization did not happen for a simd loop");
3617
3618 if (dump_enabled_p ())
3619 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3620 "cost model: the vector iteration cost = %d "
3621 "divided by the scalar iteration cost = %d "
3622                          "is greater than or equal to the vectorization factor = %d"
3623 ".\n",
3624 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3625 *ret_min_profitable_niters = -1;
3626 *ret_min_profitable_estimate = -1;
3627 return;
3628 }
3629
3630 /* ??? The "if" arm is written to handle all cases; see below for what
3631 we would do for !LOOP_VINFO_FULLY_MASKED_P. */
3632 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3633 {
3634 /* Rewriting the condition above in terms of the number of
3635 vector iterations (vniters) rather than the number of
3636 scalar iterations (niters) gives:
3637
3638 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
3639
3640 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
3641
3642 For integer N, X and Y when X > 0:
3643
3644 N * X > Y <==> N >= (Y /[floor] X) + 1. */
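      /* For example, with X = 3 and Y = 7: N * 3 > 7 <==> N >= 7/3 + 1 = 3.  */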
3645 int outside_overhead = (vec_outside_cost
3646 - scalar_single_iter_cost * peel_iters_prologue
3647 - scalar_single_iter_cost * peel_iters_epilogue
3648 - scalar_outside_cost);
3649 /* We're only interested in cases that require at least one
3650 vector iteration. */
3651 int min_vec_niters = 1;
3652 if (outside_overhead > 0)
3653 min_vec_niters = outside_overhead / saving_per_viter + 1;
3654
3655 if (dump_enabled_p ())
3656 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
3657 min_vec_niters);
3658
3659 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3660 {
3661 /* Now that we know the minimum number of vector iterations,
3662 find the minimum niters for which the scalar cost is larger:
3663
3664 SIC * niters > VIC * vniters + VOC - SOC
3665
3666 We know that the minimum niters is no more than
3667 vniters * VF + NPEEL, but it might be (and often is) less
3668 than that if a partial vector iteration is cheaper than the
3669 equivalent scalar code. */
3670 int threshold = (vec_inside_cost * min_vec_niters
3671 + vec_outside_cost
3672 - scalar_outside_cost);
3673 if (threshold <= 0)
3674 min_profitable_iters = 1;
3675 else
3676 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
3677 }
3678 else
3679 /* Convert the number of vector iterations into a number of
3680 scalar iterations. */
3681 min_profitable_iters = (min_vec_niters * assumed_vf
3682 + peel_iters_prologue
3683 + peel_iters_epilogue);
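        /* For example, two vector iterations at an assumed VF of 4 with one
           peeled iteration on each side correspond to 2 * 4 + 2 = 10 scalar
           iterations.  */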
3684 }
3685 else
3686 {
3687 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3688 * assumed_vf
3689 - vec_inside_cost * peel_iters_prologue
3690 - vec_inside_cost * peel_iters_epilogue);
3691 if (min_profitable_iters <= 0)
3692 min_profitable_iters = 0;
3693 else
3694 {
3695 min_profitable_iters /= saving_per_viter;
3696
3697 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3698 <= (((int) vec_inside_cost * min_profitable_iters)
3699 + (((int) vec_outside_cost - scalar_outside_cost)
3700 * assumed_vf)))
3701 min_profitable_iters++;
3702 }
3703 }
3704
3705 if (dump_enabled_p ())
3706 dump_printf (MSG_NOTE,
3707 " Calculated minimum iters for profitability: %d\n",
3708 min_profitable_iters);
3709
3710 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3711 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3712 /* We want the vectorized loop to execute at least once. */
3713 min_profitable_iters = assumed_vf + peel_iters_prologue;
3714
3715 if (dump_enabled_p ())
3716 dump_printf_loc (MSG_NOTE, vect_location,
3717 " Runtime profitability threshold = %d\n",
3718 min_profitable_iters);
3719
3720 *ret_min_profitable_niters = min_profitable_iters;
3721
3722 /* Calculate number of iterations required to make the vector version
3723 profitable, relative to the loop bodies only.
3724
3725 Non-vectorized variant is SIC * niters and it must win over vector
3726 variant on the expected loop trip count. The following condition must hold true:
3727 SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC */
3728
3729 if (vec_outside_cost <= 0)
3730 min_profitable_estimate = 0;
3731 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3732 {
3733 /* This is a repeat of the code above, but with + SOC rather
3734 than - SOC. */
3735 int outside_overhead = (vec_outside_cost
3736 - scalar_single_iter_cost * peel_iters_prologue
3737 - scalar_single_iter_cost * peel_iters_epilogue
3738 + scalar_outside_cost);
3739 int min_vec_niters = 1;
3740 if (outside_overhead > 0)
3741 min_vec_niters = outside_overhead / saving_per_viter + 1;
3742
3743 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3744 {
3745 int threshold = (vec_inside_cost * min_vec_niters
3746 + vec_outside_cost
3747 + scalar_outside_cost);
3748 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
3749 }
3750 else
3751 min_profitable_estimate = (min_vec_niters * assumed_vf
3752 + peel_iters_prologue
3753 + peel_iters_epilogue);
3754 }
3755 else
3756 {
3757 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3758 * assumed_vf
3759 - vec_inside_cost * peel_iters_prologue
3760 - vec_inside_cost * peel_iters_epilogue)
3761 / ((scalar_single_iter_cost * assumed_vf)
3762 - vec_inside_cost);
3763 }
3764 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3765 if (dump_enabled_p ())
3766 dump_printf_loc (MSG_NOTE, vect_location,
3767 " Static estimate profitability threshold = %d\n",
3768 min_profitable_estimate);
3769
3770 *ret_min_profitable_estimate = min_profitable_estimate;
3771 }
3772
3773 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3774 vector elements (not bits) for a vector with NELT elements. */
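/* For example, OFFSET = 2 and NELT = 8 yield the stepped selector
   { 2, 3, 4, ... }, i.e. indices 2..9, so the top two lanes of the result
   are taken from the second input vector of the permutation.  */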
3775 static void
3776 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3777 vec_perm_builder *sel)
3778 {
3779 /* The encoding is a single stepped pattern. Any wrap-around is handled
3780 by vec_perm_indices. */
3781 sel->new_vector (nelt, 1, 3);
3782 for (unsigned int i = 0; i < 3; i++)
3783 sel->quick_push (i + offset);
3784 }
3785
3786 /* Checks whether the target supports whole-vector shifts for vectors of mode
3787 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3788 it supports vec_perm_const with masks for all necessary shift amounts. */
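/* For example, for an eight-element vector mode with no vec_shr handler the
   loop below has to find permute masks for shifts of 4, 2 and 1 elements.  */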
3789 static bool
3790 have_whole_vector_shift (machine_mode mode)
3791 {
3792 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3793 return true;
3794
3795 /* Variable-length vectors should be handled via the optab. */
3796 unsigned int nelt;
3797 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3798 return false;
3799
3800 vec_perm_builder sel;
3801 vec_perm_indices indices;
3802 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3803 {
3804 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3805 indices.new_vector (sel, 2, nelt);
3806 if (!can_vec_perm_const_p (mode, indices, false))
3807 return false;
3808 }
3809 return true;
3810 }
3811
3812 /* TODO: vect_model_*_cost and the vectorizable_* functions are closely
3813    coupled; design this better to avoid maintenance issues.  */
3814
3815 /* Function vect_model_reduction_cost.
3816
3817 Models cost for a reduction operation, including the vector ops
3818 generated within the strip-mine loop, the initial definition before
3819 the loop, and the epilogue code that must be generated. */
3820
3821 static void
3822 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3823 int ncopies, stmt_vector_for_cost *cost_vec)
3824 {
3825 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3826 enum tree_code code;
3827 optab optab;
3828 tree vectype;
3829 machine_mode mode;
3830 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3831 struct loop *loop = NULL;
3832
3833 if (loop_vinfo)
3834 loop = LOOP_VINFO_LOOP (loop_vinfo);
3835
3836 /* Condition reductions generate two reductions in the loop. */
3837 vect_reduction_type reduction_type
3838 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3839 if (reduction_type == COND_REDUCTION)
3840 ncopies *= 2;
3841
3842 vectype = STMT_VINFO_VECTYPE (stmt_info);
3843 mode = TYPE_MODE (vectype);
3844 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3845
3846 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3847
3848 if (reduction_type == EXTRACT_LAST_REDUCTION
3849 || reduction_type == FOLD_LEFT_REDUCTION)
3850 {
3851 /* No extra instructions needed in the prologue. */
3852 prologue_cost = 0;
3853
3854 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3855 /* Count one reduction-like operation per vector. */
3856 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3857 stmt_info, 0, vect_body);
3858 else
3859 {
3860 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3861 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3862 inside_cost = record_stmt_cost (cost_vec, nelements,
3863 vec_to_scalar, stmt_info, 0,
3864 vect_body);
3865 inside_cost += record_stmt_cost (cost_vec, nelements,
3866 scalar_stmt, stmt_info, 0,
3867 vect_body);
3868 }
3869 }
3870 else
3871 {
3872 /* Add in cost for initial definition.
3873 For cond reduction we have four vectors: initial index, step,
3874 initial result of the data reduction, initial value of the index
3875 reduction. */
3876 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3877 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3878 scalar_to_vec, stmt_info, 0,
3879 vect_prologue);
3880
3881 /* Cost of reduction op inside loop. */
3882 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3883 stmt_info, 0, vect_body);
3884 }
3885
3886 /* Determine cost of epilogue code.
3887
3888 We have a reduction operator that will reduce the vector in one statement.
3889 Also requires scalar extract. */
3890
3891 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
3892 {
3893 if (reduc_fn != IFN_LAST)
3894 {
3895 if (reduction_type == COND_REDUCTION)
3896 {
3897              /* An EQ stmt and a COND_EXPR stmt.  */
3898 epilogue_cost += record_stmt_cost (cost_vec, 2,
3899 vector_stmt, stmt_info, 0,
3900 vect_epilogue);
3901 /* Reduction of the max index and a reduction of the found
3902 values. */
3903 epilogue_cost += record_stmt_cost (cost_vec, 2,
3904 vec_to_scalar, stmt_info, 0,
3905 vect_epilogue);
3906 /* A broadcast of the max value. */
3907 epilogue_cost += record_stmt_cost (cost_vec, 1,
3908 scalar_to_vec, stmt_info, 0,
3909 vect_epilogue);
3910 }
3911 else
3912 {
3913 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3914 stmt_info, 0, vect_epilogue);
3915 epilogue_cost += record_stmt_cost (cost_vec, 1,
3916 vec_to_scalar, stmt_info, 0,
3917 vect_epilogue);
3918 }
3919 }
3920 else if (reduction_type == COND_REDUCTION)
3921 {
3922 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3923 /* Extraction of scalar elements. */
3924 epilogue_cost += record_stmt_cost (cost_vec,
3925 2 * estimated_nunits,
3926 vec_to_scalar, stmt_info, 0,
3927 vect_epilogue);
3928 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
3929 epilogue_cost += record_stmt_cost (cost_vec,
3930 2 * estimated_nunits - 3,
3931 scalar_stmt, stmt_info, 0,
3932 vect_epilogue);
3933 }
3934 else if (reduction_type == EXTRACT_LAST_REDUCTION
3935 || reduction_type == FOLD_LEFT_REDUCTION)
3936        /* No extra instructions needed in the epilogue.  */
3937 ;
3938 else
3939 {
3940 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3941 tree bitsize =
3942 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
3943 int element_bitsize = tree_to_uhwi (bitsize);
3944 int nelements = vec_size_in_bits / element_bitsize;
3945
3946 if (code == COND_EXPR)
3947 code = MAX_EXPR;
3948
3949 optab = optab_for_tree_code (code, vectype, optab_default);
3950
3951 /* We have a whole vector shift available. */
3952 if (optab != unknown_optab
3953 && VECTOR_MODE_P (mode)
3954 && optab_handler (optab, mode) != CODE_FOR_nothing
3955 && have_whole_vector_shift (mode))
3956 {
3957 /* Final reduction via vector shifts and the reduction operator.
3958 Also requires scalar extract. */
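              /* For example, eight elements need exact_log2 (8) = 3
                 shift/operate pairs, i.e. 6 vector statements, plus the
                 single extract costed below.  */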
3959 epilogue_cost += record_stmt_cost (cost_vec,
3960 exact_log2 (nelements) * 2,
3961 vector_stmt, stmt_info, 0,
3962 vect_epilogue);
3963 epilogue_cost += record_stmt_cost (cost_vec, 1,
3964 vec_to_scalar, stmt_info, 0,
3965 vect_epilogue);
3966 }
3967 else
3968 /* Use extracts and reduction op for final reduction. For N
3969 elements, we have N extracts and N-1 reduction ops. */
3970 epilogue_cost += record_stmt_cost (cost_vec,
3971 nelements + nelements - 1,
3972 vector_stmt, stmt_info, 0,
3973 vect_epilogue);
3974 }
3975 }
3976
3977 if (dump_enabled_p ())
3978 dump_printf (MSG_NOTE,
3979 "vect_model_reduction_cost: inside_cost = %d, "
3980 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3981 prologue_cost, epilogue_cost);
3982 }
3983
3984
3985 /* Function vect_model_induction_cost.
3986
3987 Models cost for induction operations. */
3988
3989 static void
3990 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
3991 stmt_vector_for_cost *cost_vec)
3992 {
3993 unsigned inside_cost, prologue_cost;
3994
3995 if (PURE_SLP_STMT (stmt_info))
3996 return;
3997
3998 /* loop cost for vec_loop. */
3999 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4000 stmt_info, 0, vect_body);
4001
4002 /* prologue cost for vec_init and vec_step. */
4003 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4004 stmt_info, 0, vect_prologue);
4005
4006 if (dump_enabled_p ())
4007 dump_printf_loc (MSG_NOTE, vect_location,
4008 "vect_model_induction_cost: inside_cost = %d, "
4009 "prologue_cost = %d .\n", inside_cost, prologue_cost);
4010 }
4011
4012
4013
4014 /* Function get_initial_def_for_reduction
4015
4016 Input:
4017 STMT_VINFO - a stmt that performs a reduction operation in the loop.
4018 INIT_VAL - the initial value of the reduction variable
4019
4020 Output:
4021 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4022 of the reduction (used for adjusting the epilog - see below).
4023 Return a vector variable, initialized according to the operation that
4024 STMT_VINFO performs. This vector will be used as the initial value
4025 of the vector of partial results.
4026
4027 Option1 (adjust in epilog): Initialize the vector as follows:
4028 add/bit or/xor: [0,0,...,0,0]
4029 mult/bit and: [1,1,...,1,1]
4030 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4031 and when necessary (e.g. add/mult case) let the caller know
4032 that it needs to adjust the result by init_val.
4033
4034 Option2: Initialize the vector as follows:
4035 add/bit or/xor: [init_val,0,0,...,0]
4036 mult/bit and: [init_val,1,1,...,1]
4037 min/max/cond_expr: [init_val,init_val,...,init_val]
4038 and no adjustments are needed.
4039
4040 For example, for the following code:
4041
4042 s = init_val;
4043 for (i=0;i<n;i++)
4044 s = s + a[i];
4045
4046 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4047 For a vector of 4 units, we want to return either [0,0,0,init_val],
4048 or [0,0,0,0] and let the caller know that it needs to adjust
4049 the result at the end by 'init_val'.
4050
4051    FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
4052    is not NULL, because its initialization vector is simpler (the same element
4053    in all entries), and Option2 otherwise.
4054
4055 A cost model should help decide between these two schemes. */
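/* For example, for a PLUS_EXPR reduction of init_val = 5 with a four-element
   vector, Option1 returns {0,0,0,0} and reports an adjustment of 5, while
   Option2 returns {5,0,0,0} and needs no adjustment; for MULT_EXPR the
   corresponding vectors are {1,1,1,1} (adjust by 5) and {5,1,1,1}.  */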
4056
4057 tree
4058 get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, tree init_val,
4059 tree *adjustment_def)
4060 {
4061 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4062 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4063 tree scalar_type = TREE_TYPE (init_val);
4064 tree vectype = get_vectype_for_scalar_type (scalar_type);
4065 enum tree_code code = gimple_assign_rhs_code (stmt_vinfo->stmt);
4066 tree def_for_init;
4067 tree init_def;
4068 REAL_VALUE_TYPE real_init_val = dconst0;
4069 int int_init_val = 0;
4070 gimple_seq stmts = NULL;
4071
4072 gcc_assert (vectype);
4073
4074 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4075 || SCALAR_FLOAT_TYPE_P (scalar_type));
4076
4077 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4078 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4079
4080 vect_reduction_type reduction_type
4081 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4082
4083 switch (code)
4084 {
4085 case WIDEN_SUM_EXPR:
4086 case DOT_PROD_EXPR:
4087 case SAD_EXPR:
4088 case PLUS_EXPR:
4089 case MINUS_EXPR:
4090 case BIT_IOR_EXPR:
4091 case BIT_XOR_EXPR:
4092 case MULT_EXPR:
4093 case BIT_AND_EXPR:
4094 {
4095 /* ADJUSTMENT_DEF is NULL when called from
4096 vect_create_epilog_for_reduction to vectorize double reduction. */
4097 if (adjustment_def)
4098 *adjustment_def = init_val;
4099
4100 if (code == MULT_EXPR)
4101 {
4102 real_init_val = dconst1;
4103 int_init_val = 1;
4104 }
4105
4106 if (code == BIT_AND_EXPR)
4107 int_init_val = -1;
4108
4109 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4110 def_for_init = build_real (scalar_type, real_init_val);
4111 else
4112 def_for_init = build_int_cst (scalar_type, int_init_val);
4113
4114 if (adjustment_def)
4115 /* Option1: the first element is '0' or '1' as well. */
4116 init_def = gimple_build_vector_from_val (&stmts, vectype,
4117 def_for_init);
4118 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4119 {
4120 /* Option2 (variable length): the first element is INIT_VAL. */
4121 init_def = gimple_build_vector_from_val (&stmts, vectype,
4122 def_for_init);
4123 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4124 vectype, init_def, init_val);
4125 }
4126 else
4127 {
4128 /* Option2: the first element is INIT_VAL. */
4129 tree_vector_builder elts (vectype, 1, 2);
4130 elts.quick_push (init_val);
4131 elts.quick_push (def_for_init);
4132 init_def = gimple_build_vector (&stmts, &elts);
4133 }
4134 }
4135 break;
4136
4137 case MIN_EXPR:
4138 case MAX_EXPR:
4139 case COND_EXPR:
4140 {
4141 if (adjustment_def)
4142 {
4143 *adjustment_def = NULL_TREE;
4144 if (reduction_type != COND_REDUCTION
4145 && reduction_type != EXTRACT_LAST_REDUCTION)
4146 {
4147 init_def = vect_get_vec_def_for_operand (init_val, stmt_vinfo);
4148 break;
4149 }
4150 }
4151 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4152 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4153 }
4154 break;
4155
4156 default:
4157 gcc_unreachable ();
4158 }
4159
4160 if (stmts)
4161 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4162 return init_def;
4163 }
4164
4165 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4166 NUMBER_OF_VECTORS is the number of vector defs to create.
4167 If NEUTRAL_OP is nonnull, introducing extra elements of that
4168 value will not change the result. */
4169
4170 static void
4171 get_initial_defs_for_reduction (slp_tree slp_node,
4172 vec<tree> *vec_oprnds,
4173 unsigned int number_of_vectors,
4174 bool reduc_chain, tree neutral_op)
4175 {
4176 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4177 stmt_vec_info stmt_vinfo = stmts[0];
4178 unsigned HOST_WIDE_INT nunits;
4179 unsigned j, number_of_places_left_in_vector;
4180 tree vector_type;
4181 unsigned int group_size = stmts.length ();
4182 unsigned int i;
4183 struct loop *loop;
4184
4185 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4186
4187 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4188
4189 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4190 gcc_assert (loop);
4191 edge pe = loop_preheader_edge (loop);
4192
4193 gcc_assert (!reduc_chain || neutral_op);
4194
4195 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4196 created vectors. It is greater than 1 if unrolling is performed.
4197
4198 For example, we have two scalar operands, s1 and s2 (e.g., group of
4199 strided accesses of size two), while NUNITS is four (i.e., four scalars
4200 of this type can be packed in a vector). The output vector will contain
4201 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4202 will be 2).
4203
4204 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4205 vectors containing the operands.
4206
4207 For example, NUNITS is four as before, and the group size is 8
4208 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4209 {s5, s6, s7, s8}. */
4210
4211 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4212 nunits = group_size;
4213
4214 number_of_places_left_in_vector = nunits;
4215 bool constant_p = true;
4216 tree_vector_builder elts (vector_type, nunits, 1);
4217 elts.quick_grow (nunits);
4218 gimple_seq ctor_seq = NULL;
4219 for (j = 0; j < nunits * number_of_vectors; ++j)
4220 {
4221 tree op;
4222 i = j % group_size;
4223 stmt_vinfo = stmts[i];
4224
4225       /* Get the def before the loop.  In a reduction chain we have only one
4226          initial value; otherwise we have as many initial values as PHIs in the group.  */
4227 if (reduc_chain)
4228 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4229 else if (((vec_oprnds->length () + 1) * nunits
4230 - number_of_places_left_in_vector >= group_size)
4231 && neutral_op)
4232 op = neutral_op;
4233 else
4234 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4235
4236 /* Create 'vect_ = {op0,op1,...,opn}'. */
4237 number_of_places_left_in_vector--;
4238 elts[nunits - number_of_places_left_in_vector - 1] = op;
4239 if (!CONSTANT_CLASS_P (op))
4240 constant_p = false;
4241
4242 if (number_of_places_left_in_vector == 0)
4243 {
4244 tree init;
4245 if (constant_p && !neutral_op
4246 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4247 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4248 /* Build the vector directly from ELTS. */
4249 init = gimple_build_vector (&ctor_seq, &elts);
4250 else if (neutral_op)
4251 {
4252 /* Build a vector of the neutral value and shift the
4253 other elements into place. */
4254 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4255 neutral_op);
4256 int k = nunits;
4257 while (k > 0 && elts[k - 1] == neutral_op)
4258 k -= 1;
4259 while (k > 0)
4260 {
4261 k -= 1;
4262 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4263 vector_type, init, elts[k]);
4264 }
4265 }
4266 else
4267 {
4268 /* First time round, duplicate ELTS to fill the
4269 required number of vectors. */
4270 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4271 number_of_vectors, *vec_oprnds);
4272 break;
4273 }
4274 vec_oprnds->quick_push (init);
4275
4276 number_of_places_left_in_vector = nunits;
4277 elts.new_vector (vector_type, nunits, 1);
4278 elts.quick_grow (nunits);
4279 constant_p = true;
4280 }
4281 }
4282 if (ctor_seq != NULL)
4283 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4284 }
4285
4286
4287 /* Function vect_create_epilog_for_reduction
4288
4289 Create code at the loop-epilog to finalize the result of a reduction
4290 computation.
4291
4292 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4293 reduction statements.
4294 STMT_INFO is the scalar reduction stmt that is being vectorized.
4295 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4296 number of elements that we can fit in a vectype (nunits). In this case
4297 we have to generate more than one vector stmt - i.e - we need to "unroll"
4298 the vector stmt by a factor VF/nunits. For more details see documentation
4299 in vectorizable_operation.
4300 REDUC_FN is the internal function for the epilog reduction.
4301 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4302 computation.
4303 REDUC_INDEX is the index of the operand in the right hand side of the
4304 statement that is defined by REDUCTION_PHI.
4305 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4306 SLP_NODE is an SLP node containing a group of reduction statements. The
4307 first one in this group is STMT_INFO.
4308 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4309 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4310 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4311 any value of the IV in the loop.
4312 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4313 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4314 null if this is not an SLP reduction
4315
4316 This function:
4317 1. Creates the reduction def-use cycles: sets the arguments for
4318 REDUCTION_PHIS:
4319 The loop-entry argument is the vectorized initial-value of the reduction.
4320 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4321 sums.
4322 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4323 by calling the function specified by REDUC_FN if available, or by
4324 other means (whole-vector shifts or a scalar loop).
4325 The function also creates a new phi node at the loop exit to preserve
4326 loop-closed form, as illustrated below.
4327
4328 The flow at the entry to this function:
4329
4330 loop:
4331 vec_def = phi <null, null> # REDUCTION_PHI
4332 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4333 s_loop = scalar_stmt # (scalar) STMT_INFO
4334 loop_exit:
4335 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4336 use <s_out0>
4337 use <s_out0>
4338
4339 The above is transformed by this function into:
4340
4341 loop:
4342 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4343 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4344 s_loop = scalar_stmt # (scalar) STMT_INFO
4345 loop_exit:
4346 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4347 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4348 v_out2 = reduce <v_out1>
4349 s_out3 = extract_field <v_out2, 0>
4350 s_out4 = adjust_result <s_out3>
4351 use <s_out4>
4352 use <s_out4>
4353 */
4354
4355 static void
4356 vect_create_epilog_for_reduction (vec<tree> vect_defs,
4357 stmt_vec_info stmt_info,
4358 gimple *reduc_def_stmt,
4359 int ncopies, internal_fn reduc_fn,
4360 vec<stmt_vec_info> reduction_phis,
4361 bool double_reduc,
4362 slp_tree slp_node,
4363 slp_instance slp_node_instance,
4364 tree induc_val, enum tree_code induc_code,
4365 tree neutral_op)
4366 {
4367 stmt_vec_info prev_phi_info;
4368 tree vectype;
4369 machine_mode mode;
4370 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4371 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4372 basic_block exit_bb;
4373 tree scalar_dest;
4374 tree scalar_type;
4375 gimple *new_phi = NULL, *phi;
4376 stmt_vec_info phi_info;
4377 gimple_stmt_iterator exit_gsi;
4378 tree vec_dest;
4379 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4380 gimple *epilog_stmt = NULL;
4381 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
4382 gimple *exit_phi;
4383 tree bitsize;
4384 tree adjustment_def = NULL;
4385 tree vec_initial_def = NULL;
4386 tree expr, def, initial_def = NULL;
4387 tree orig_name, scalar_result;
4388 imm_use_iterator imm_iter, phi_imm_iter;
4389 use_operand_p use_p, phi_use_p;
4390 gimple *use_stmt;
4391 stmt_vec_info reduction_phi_info = NULL;
4392 bool nested_in_vect_loop = false;
4393 auto_vec<gimple *> new_phis;
4394 auto_vec<stmt_vec_info> inner_phis;
4395 int j, i;
4396 auto_vec<tree> scalar_results;
4397 unsigned int group_size = 1, k, ratio;
4398 auto_vec<tree> vec_initial_defs;
4399 auto_vec<gimple *> phis;
4400 bool slp_reduc = false;
4401 bool direct_slp_reduc;
4402 tree new_phi_result;
4403 stmt_vec_info inner_phi = NULL;
4404 tree induction_index = NULL_TREE;
4405
4406 if (slp_node)
4407 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4408
4409 if (nested_in_vect_loop_p (loop, stmt_info))
4410 {
4411 outer_loop = loop;
4412 loop = loop->inner;
4413 nested_in_vect_loop = true;
4414 gcc_assert (!slp_node);
4415 }
4416
4417 vectype = STMT_VINFO_VECTYPE (stmt_info);
4418 gcc_assert (vectype);
4419 mode = TYPE_MODE (vectype);
4420
4421 /* 1. Create the reduction def-use cycle:
4422 Set the arguments of REDUCTION_PHIS, i.e., transform
4423
4424 loop:
4425 vec_def = phi <null, null> # REDUCTION_PHI
4426 VECT_DEF = vector_stmt # vectorized form of STMT
4427 ...
4428
4429 into:
4430
4431 loop:
4432 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4433 VECT_DEF = vector_stmt # vectorized form of STMT
4434 ...
4435
4436 (in case of SLP, do it for all the phis). */
4437
4438 /* Get the loop-entry arguments. */
4439 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4440 if (slp_node)
4441 {
4442 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4443 vec_initial_defs.reserve (vec_num);
4444 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4445 &vec_initial_defs, vec_num,
4446 REDUC_GROUP_FIRST_ELEMENT (stmt_info),
4447 neutral_op);
4448 }
4449 else
4450 {
4451 /* Get at the scalar def before the loop, that defines the initial value
4452 of the reduction variable. */
4453 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4454 loop_preheader_edge (loop));
4455       /* Optimize: if, for REDUC_MAX, initial_def is smaller than the base
4456          and we can't use zero for induc_val, use initial_def; similarly
4457          for REDUC_MIN when initial_def is larger than the base.  */
4458 if (TREE_CODE (initial_def) == INTEGER_CST
4459 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4460 == INTEGER_INDUC_COND_REDUCTION)
4461 && !integer_zerop (induc_val)
4462 && ((induc_code == MAX_EXPR
4463 && tree_int_cst_lt (initial_def, induc_val))
4464 || (induc_code == MIN_EXPR
4465 && tree_int_cst_lt (induc_val, initial_def))))
4466 induc_val = initial_def;
4467
4468 if (double_reduc)
4469 /* In case of double reduction we only create a vector variable
4470 to be put in the reduction phi node. The actual statement
4471 creation is done later in this function. */
4472 vec_initial_def = vect_create_destination_var (initial_def, vectype);
4473 else if (nested_in_vect_loop)
4474 {
4475 /* Do not use an adjustment def as that case is not supported
4476 correctly if ncopies is not one. */
4477 vect_is_simple_use (initial_def, loop_vinfo, &initial_def_dt);
4478 vec_initial_def = vect_get_vec_def_for_operand (initial_def,
4479 stmt_info);
4480 }
4481 else
4482 vec_initial_def
4483 = get_initial_def_for_reduction (stmt_info, initial_def,
4484 &adjustment_def);
4485 vec_initial_defs.create (1);
4486 vec_initial_defs.quick_push (vec_initial_def);
4487 }
4488
4489 /* Set phi nodes arguments. */
4490 FOR_EACH_VEC_ELT (reduction_phis, i, phi_info)
4491 {
4492 tree vec_init_def = vec_initial_defs[i];
4493 tree def = vect_defs[i];
4494 for (j = 0; j < ncopies; j++)
4495 {
4496 if (j != 0)
4497 {
4498 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4499 if (nested_in_vect_loop)
4500 vec_init_def
4501 = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_init_def);
4502 }
4503
4504 /* Set the loop-entry arg of the reduction-phi. */
4505
4506 gphi *phi = as_a <gphi *> (phi_info->stmt);
4507 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4508 == INTEGER_INDUC_COND_REDUCTION)
4509 {
4510              /* Initialise the reduction phi to zero.  This prevents non-zero
4511                 initial values from interfering with the reduction op.  */
4512 gcc_assert (ncopies == 1);
4513 gcc_assert (i == 0);
4514
4515 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4516 tree induc_val_vec
4517 = build_vector_from_val (vec_init_def_type, induc_val);
4518
4519 add_phi_arg (phi, induc_val_vec, loop_preheader_edge (loop),
4520 UNKNOWN_LOCATION);
4521 }
4522 else
4523 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
4524 UNKNOWN_LOCATION);
4525
4526 /* Set the loop-latch arg for the reduction-phi. */
4527 if (j > 0)
4528 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4529
4530 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
4531
4532 if (dump_enabled_p ())
4533 dump_printf_loc (MSG_NOTE, vect_location,
4534 "transform reduction: created def-use cycle: %G%G",
4535 phi, SSA_NAME_DEF_STMT (def));
4536 }
4537 }
4538
4539 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4540 which is updated with the current index of the loop for every match of
4541 the original loop's cond_expr (VEC_STMT). This results in a vector
4542 containing the last time the condition passed for that vector lane.
4543 The first match will be a 1 to allow 0 to be used for non-matching
4544 indexes. If there are no matches at all then the vector will be all
4545 zeroes. */
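  /* For example, with a four-lane vector and two vector iterations the
     index IV takes the values {1,2,3,4} and then {5,6,7,8}; a lane whose
     condition last matched in the sixth scalar iteration therefore ends up
     holding 6, while a lane that never matched holds 0.  */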
4546 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4547 {
4548 tree indx_before_incr, indx_after_incr;
4549 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4550
4551 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info)->stmt;
4552 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4553
4554 int scalar_precision
4555 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4556 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4557 tree cr_index_vector_type = build_vector_type
4558 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4559
4560 /* First we create a simple vector induction variable which starts
4561 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4562 vector size (STEP). */
4563
4564 /* Create a {1,2,3,...} vector. */
4565 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4566
4567 /* Create a vector of the step value. */
4568 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4569 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4570
4571 /* Create an induction variable. */
4572 gimple_stmt_iterator incr_gsi;
4573 bool insert_after;
4574 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4575 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4576 insert_after, &indx_before_incr, &indx_after_incr);
4577
4578 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4579 filled with zeros (VEC_ZERO). */
4580
4581 /* Create a vector of 0s. */
4582 tree zero = build_zero_cst (cr_index_scalar_type);
4583 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4584
4585 /* Create a vector phi node. */
4586 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4587 new_phi = create_phi_node (new_phi_tree, loop->header);
4588 loop_vinfo->add_stmt (new_phi);
4589 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4590 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4591
4592      /* Now take the condition from the loop's original cond_expr
4593 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4594 every match uses values from the induction variable
4595 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4596 (NEW_PHI_TREE).
4597 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4598 the new cond_expr (INDEX_COND_EXPR). */
4599
4600 /* Duplicate the condition from vec_stmt. */
4601 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4602
4603 /* Create a conditional, where the condition is taken from vec_stmt
4604 (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4605 else is the phi (NEW_PHI_TREE). */
4606 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4607 ccompare, indx_before_incr,
4608 new_phi_tree);
4609 induction_index = make_ssa_name (cr_index_vector_type);
4610 gimple *index_condition = gimple_build_assign (induction_index,
4611 index_cond_expr);
4612 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4613 stmt_vec_info index_vec_info = loop_vinfo->add_stmt (index_condition);
4614 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4615
4616 /* Update the phi with the vec cond. */
4617 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4618 loop_latch_edge (loop), UNKNOWN_LOCATION);
4619 }
4620
4621 /* 2. Create epilog code.
4622 The reduction epilog code operates across the elements of the vector
4623 of partial results computed by the vectorized loop.
4624 The reduction epilog code consists of:
4625
4626 step 1: compute the scalar result in a vector (v_out2)
4627 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4628 step 3: adjust the scalar result (s_out3) if needed.
4629
4630      Step 1 can be accomplished using one of the following three schemes:
4631 (scheme 1) using reduc_fn, if available.
4632 (scheme 2) using whole-vector shifts, if available.
4633 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4634 combined.
4635
4636 The overall epilog code looks like this:
4637
4638 s_out0 = phi <s_loop> # original EXIT_PHI
4639 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4640 v_out2 = reduce <v_out1> # step 1
4641 s_out3 = extract_field <v_out2, 0> # step 2
4642 s_out4 = adjust_result <s_out3> # step 3
4643
4644 (step 3 is optional, and steps 1 and 2 may be combined).
4645 Lastly, the uses of s_out0 are replaced by s_out4. */
4646
4647
4648 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4649 v_out1 = phi <VECT_DEF>
4650 Store them in NEW_PHIS. */
4651
4652 exit_bb = single_exit (loop)->dest;
4653 prev_phi_info = NULL;
4654 new_phis.create (vect_defs.length ());
4655 FOR_EACH_VEC_ELT (vect_defs, i, def)
4656 {
4657 for (j = 0; j < ncopies; j++)
4658 {
4659 tree new_def = copy_ssa_name (def);
4660 phi = create_phi_node (new_def, exit_bb);
4661 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4662 if (j == 0)
4663 new_phis.quick_push (phi);
4664 else
4665 {
4666 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4667 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4668 }
4669
4670 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4671 prev_phi_info = phi_info;
4672 }
4673 }
4674
4675 /* The epilogue is created for the outer-loop, i.e., for the loop being
4676 vectorized. Create exit phis for the outer loop. */
4677 if (double_reduc)
4678 {
4679 loop = outer_loop;
4680 exit_bb = single_exit (loop)->dest;
4681 inner_phis.create (vect_defs.length ());
4682 FOR_EACH_VEC_ELT (new_phis, i, phi)
4683 {
4684 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
4685 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4686 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4687 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4688 PHI_RESULT (phi));
4689 prev_phi_info = loop_vinfo->add_stmt (outer_phi);
4690 inner_phis.quick_push (phi_info);
4691 new_phis[i] = outer_phi;
4692 while (STMT_VINFO_RELATED_STMT (phi_info))
4693 {
4694 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4695 new_result = copy_ssa_name (PHI_RESULT (phi_info->stmt));
4696 outer_phi = create_phi_node (new_result, exit_bb);
4697 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4698 PHI_RESULT (phi_info->stmt));
4699 stmt_vec_info outer_phi_info = loop_vinfo->add_stmt (outer_phi);
4700 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi_info;
4701 prev_phi_info = outer_phi_info;
4702 }
4703 }
4704 }
4705
4706 exit_gsi = gsi_after_labels (exit_bb);
4707
4708 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4709 (i.e. when reduc_fn is not available) and in the final adjustment
4710 code (if needed). Also get the original scalar reduction variable as
4711 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4712 represents a reduction pattern), the tree-code and scalar-def are
4713 taken from the original stmt that the pattern-stmt (STMT) replaces.
4714 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4715 are taken from STMT. */
4716
4717 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4718 if (orig_stmt_info != stmt_info)
4719 {
4720 /* Reduction pattern */
4721 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4722 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4723 }
4724
4725 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4726 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4727 partial results are added and not subtracted. */
4728 if (code == MINUS_EXPR)
4729 code = PLUS_EXPR;
4730
4731 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4732 scalar_type = TREE_TYPE (scalar_dest);
4733 scalar_results.create (group_size);
4734 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4735 bitsize = TYPE_SIZE (scalar_type);
4736
4737 /* In case this is a reduction in an inner-loop while vectorizing an outer
4738 loop - we don't need to extract a single scalar result at the end of the
4739 inner-loop (unless it is double reduction, i.e., the use of reduction is
4740 outside the outer-loop). The final vector of partial results will be used
4741 in the vectorized outer-loop, or reduced to a scalar result at the end of
4742 the outer-loop. */
4743 if (nested_in_vect_loop && !double_reduc)
4744 goto vect_finalize_reduction;
4745
4746 /* SLP reduction without reduction chain, e.g.,
4747 # a1 = phi <a2, a0>
4748 # b1 = phi <b2, b0>
4749 a2 = operation (a1)
4750 b2 = operation (b1) */
4751 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4752
4753 /* True if we should implement SLP_REDUC using native reduction operations
4754 instead of scalar operations. */
4755 direct_slp_reduc = (reduc_fn != IFN_LAST
4756 && slp_reduc
4757 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4758
4759 /* In case of reduction chain, e.g.,
4760 # a1 = phi <a3, a0>
4761 a2 = operation (a1)
4762 a3 = operation (a2),
4763
4764 we may end up with more than one vector result. Here we reduce them to
4765 one vector. */
4766 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
4767 {
4768 tree first_vect = PHI_RESULT (new_phis[0]);
4769 gassign *new_vec_stmt = NULL;
4770 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4771 for (k = 1; k < new_phis.length (); k++)
4772 {
4773 gimple *next_phi = new_phis[k];
4774 tree second_vect = PHI_RESULT (next_phi);
4775 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4776 new_vec_stmt = gimple_build_assign (tem, code,
4777 first_vect, second_vect);
4778 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4779 first_vect = tem;
4780 }
4781
4782 new_phi_result = first_vect;
4783 if (new_vec_stmt)
4784 {
4785 new_phis.truncate (0);
4786 new_phis.safe_push (new_vec_stmt);
4787 }
4788 }
4789   /* Likewise if we couldn't use a single def-use cycle.  */
4790 else if (ncopies > 1)
4791 {
4792 gcc_assert (new_phis.length () == 1);
4793 tree first_vect = PHI_RESULT (new_phis[0]);
4794 gassign *new_vec_stmt = NULL;
4795 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4796 stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
4797 for (int k = 1; k < ncopies; ++k)
4798 {
4799 next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
4800 tree second_vect = PHI_RESULT (next_phi_info->stmt);
4801 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4802 new_vec_stmt = gimple_build_assign (tem, code,
4803 first_vect, second_vect);
4804 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4805 first_vect = tem;
4806 }
4807 new_phi_result = first_vect;
4808 new_phis.truncate (0);
4809 new_phis.safe_push (new_vec_stmt);
4810 }
4811 else
4812 new_phi_result = PHI_RESULT (new_phis[0]);
4813
4814 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4815 && reduc_fn != IFN_LAST)
4816 {
4817 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4818 various data values where the condition matched and another vector
4819 (INDUCTION_INDEX) containing all the indexes of those matches. We
4820 need to extract the last matching index (which will be the index with
4821 highest value) and use this to index into the data vector.
4822 For the case where there were no matches, the data vector will contain
4823 all default values and the index vector will be all zeros. */
4824
4825 /* Get various versions of the type of the vector of indexes. */
4826 tree index_vec_type = TREE_TYPE (induction_index);
4827 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4828 tree index_scalar_type = TREE_TYPE (index_vec_type);
4829 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4830 (index_vec_type);
4831
4832 /* Get an unsigned integer version of the type of the data vector. */
4833 int scalar_precision
4834 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4835 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4836 tree vectype_unsigned = build_vector_type
4837 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4838
4839 /* First we need to create a vector (ZERO_VEC) of zeros and another
4840 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4841 can create using a MAX reduction and then expanding.
4842 In the case where the loop never made any matches, the max index will
4843 be zero. */
4844
4845 /* Vector of {0, 0, 0,...}. */
4846 tree zero_vec = make_ssa_name (vectype);
4847 tree zero_vec_rhs = build_zero_cst (vectype);
4848 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4849 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4850
4851 /* Find maximum value from the vector of found indexes. */
4852 tree max_index = make_ssa_name (index_scalar_type);
4853 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4854 1, induction_index);
4855 gimple_call_set_lhs (max_index_stmt, max_index);
4856 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4857
4858 /* Vector of {max_index, max_index, max_index,...}. */
4859 tree max_index_vec = make_ssa_name (index_vec_type);
4860 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4861 max_index);
4862 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4863 max_index_vec_rhs);
4864 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4865
4866 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4867 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4868 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4869 otherwise. Only one value should match, resulting in a vector
4870 (VEC_COND) with one data value and the rest zeros.
4871 In the case where the loop never made any matches, every index will
4872 match, resulting in a vector with all data values (which will all be
4873 the default value). */
4874
4875 /* Compare the max index vector to the vector of found indexes to find
4876 the position of the max value. */
4877 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4878 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4879 induction_index,
4880 max_index_vec);
4881 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4882
4883 /* Use the compare to choose either values from the data vector or
4884 zero. */
4885 tree vec_cond = make_ssa_name (vectype);
4886 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4887 vec_compare, new_phi_result,
4888 zero_vec);
4889 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4890
4891 /* Finally we need to extract the data value from the vector (VEC_COND)
4892         into a scalar (MATCHED_DATA_REDUC).  Logically we want to do an OR
4893 reduction, but because this doesn't exist, we can use a MAX reduction
4894 instead. The data value might be signed or a float so we need to cast
4895 it first.
4896 In the case where the loop never made any matches, the data values are
4897 all identical, and so will reduce down correctly. */
4898
4899 /* Make the matched data values unsigned. */
4900 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4901 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4902 vec_cond);
4903 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4904 VIEW_CONVERT_EXPR,
4905 vec_cond_cast_rhs);
4906 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4907
4908 /* Reduce down to a scalar value. */
4909 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4910 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4911 1, vec_cond_cast);
4912 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4913 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4914
4915 /* Convert the reduced value back to the result type and set as the
4916 result. */
4917 gimple_seq stmts = NULL;
4918 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4919 data_reduc);
4920 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4921 scalar_results.safe_push (new_temp);
4922 }
4923 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4924 && reduc_fn == IFN_LAST)
4925 {
4926 /* Condition reduction without supported IFN_REDUC_MAX. Generate
4927 idx = 0;
4928 idx_val = induction_index[0];
4929 val = data_reduc[0];
4930 for (idx = 0, val = init, i = 0; i < nelts; ++i)
4931 if (induction_index[i] > idx_val)
4932 val = data_reduc[i], idx_val = induction_index[i];
4933 return val; */
4934
4935 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4936 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4937 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4938 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4939 /* Enforced by vectorizable_reduction, which ensures we have target
4940 support before allowing a conditional reduction on variable-length
4941 vectors. */
4942 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
4943 tree idx_val = NULL_TREE, val = NULL_TREE;
4944 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4945 {
4946 tree old_idx_val = idx_val;
4947 tree old_val = val;
4948 idx_val = make_ssa_name (idx_eltype);
4949 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4950 build3 (BIT_FIELD_REF, idx_eltype,
4951 induction_index,
4952 bitsize_int (el_size),
4953 bitsize_int (off)));
4954 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4955 val = make_ssa_name (data_eltype);
4956 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4957 build3 (BIT_FIELD_REF,
4958 data_eltype,
4959 new_phi_result,
4960 bitsize_int (el_size),
4961 bitsize_int (off)));
4962 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4963 if (off != 0)
4964 {
4965 tree new_idx_val = idx_val;
4966 tree new_val = val;
4967 if (off != v_size - el_size)
4968 {
4969 new_idx_val = make_ssa_name (idx_eltype);
4970 epilog_stmt = gimple_build_assign (new_idx_val,
4971 MAX_EXPR, idx_val,
4972 old_idx_val);
4973 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4974 }
4975 new_val = make_ssa_name (data_eltype);
4976 epilog_stmt = gimple_build_assign (new_val,
4977 COND_EXPR,
4978 build2 (GT_EXPR,
4979 boolean_type_node,
4980 idx_val,
4981 old_idx_val),
4982 val, old_val);
4983 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4984 idx_val = new_idx_val;
4985 val = new_val;
4986 }
4987 }
4988 /* Convert the reduced value back to the result type and set as the
4989 result. */
4990 gimple_seq stmts = NULL;
4991 val = gimple_convert (&stmts, scalar_type, val);
4992 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4993 scalar_results.safe_push (val);
4994 }
4995
4996 /* 2.3 Create the reduction code, using one of the three schemes described
4997 above. In SLP we simply need to extract all the elements from the
4998 vector (without reducing them), so we use scalar shifts. */
4999 else if (reduc_fn != IFN_LAST && !slp_reduc)
5000 {
5001 tree tmp;
5002 tree vec_elem_type;
5003
5004 /* Case 1: Create:
5005 v_out2 = reduc_expr <v_out1> */
5006
5007 if (dump_enabled_p ())
5008 dump_printf_loc (MSG_NOTE, vect_location,
5009 "Reduce using direct vector reduction.\n");
5010
5011 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5012 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5013 {
5014 tree tmp_dest
5015 = vect_create_destination_var (scalar_dest, vec_elem_type);
5016 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5017 new_phi_result);
5018 gimple_set_lhs (epilog_stmt, tmp_dest);
5019 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5020 gimple_set_lhs (epilog_stmt, new_temp);
5021 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5022
5023 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5024 new_temp);
5025 }
5026 else
5027 {
5028 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5029 new_phi_result);
5030 gimple_set_lhs (epilog_stmt, new_scalar_dest);
5031 }
5032
5033 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5034 gimple_set_lhs (epilog_stmt, new_temp);
5035 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5036
5037 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5038 == INTEGER_INDUC_COND_REDUCTION)
5039 && !operand_equal_p (initial_def, induc_val, 0))
5040 {
5041 /* Earlier we set the initial value to be a vector of induc_val
5042 values. Check the result and if it is induc_val then replace
5043 it with the original initial value, unless induc_val is
5044 the same as initial_def already. */
5045 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5046 induc_val);
5047
5048 tmp = make_ssa_name (new_scalar_dest);
5049 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5050 initial_def, new_temp);
5051 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5052 new_temp = tmp;
5053 }
5054
5055 scalar_results.safe_push (new_temp);
5056 }
5057 else if (direct_slp_reduc)
5058 {
5059 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5060 with the elements for other SLP statements replaced with the
5061 neutral value. We can then do a normal reduction on each vector. */
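/* For example (an illustrative sketch, not lifted from a testcase):
   with a REDUC_GROUP_SIZE of 2 and an interleaved accumulator vector
   {a0, b0, a1, b1}, we build {a0, neutral, a1, neutral} for the first
   SLP statement and {neutral, b0, neutral, b1} for the second, and
   reduce each of those with a normal full-vector reduction.  */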
5062
5063 /* Enforced by vectorizable_reduction. */
5064 gcc_assert (new_phis.length () == 1);
5065 gcc_assert (pow2p_hwi (group_size));
5066
5067 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5068 vec<stmt_vec_info> orig_phis
5069 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5070 gimple_seq seq = NULL;
5071
5072 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5073 and the same element size as VECTYPE. */
5074 tree index = build_index_vector (vectype, 0, 1);
5075 tree index_type = TREE_TYPE (index);
5076 tree index_elt_type = TREE_TYPE (index_type);
5077 tree mask_type = build_same_sized_truth_vector_type (index_type);
5078
5079 /* Create a vector that, for each element, identifies which of
5080 the REDUC_GROUP_SIZE results should use it. */
5081 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5082 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5083 build_vector_from_val (index_type, index_mask));
5084
5085 /* Get a neutral vector value. This is simply a splat of the neutral
5086 scalar value if we have one, otherwise the initial scalar value
5087 is itself a neutral value. */
5088 tree vector_identity = NULL_TREE;
5089 if (neutral_op)
5090 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5091 neutral_op);
5092 for (unsigned int i = 0; i < group_size; ++i)
5093 {
5094 /* If there's no universal neutral value, we can use the
5095 initial scalar value from the original PHI. This is used
5096 for MIN and MAX reductions, for example. */
5097 if (!neutral_op)
5098 {
5099 tree scalar_value
5100 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5101 loop_preheader_edge (loop));
5102 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5103 scalar_value);
5104 }
5105
5106 /* Calculate the equivalent of:
5107
5108 sel[j] = (index[j] == i);
5109
5110 which selects the elements of NEW_PHI_RESULT that should
5111 be included in the result. */
5112 tree compare_val = build_int_cst (index_elt_type, i);
5113 compare_val = build_vector_from_val (index_type, compare_val);
5114 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5115 index, compare_val);
5116
5117 /* Calculate the equivalent of:
5118
5119 vec = sel ? new_phi_result : vector_identity;
5120
5121 VEC is now suitable for a full vector reduction. */
5122 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5123 sel, new_phi_result, vector_identity);
5124
5125 /* Do the reduction and convert it to the appropriate type. */
5126 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5127 TREE_TYPE (vectype), vec);
5128 scalar = gimple_convert (&seq, scalar_type, scalar);
5129 scalar_results.safe_push (scalar);
5130 }
5131 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5132 }
5133 else
5134 {
5135 bool reduce_with_shift;
5136 tree vec_temp;
5137
5138 /* COND reductions all do the final reduction with MAX_EXPR
5139 or MIN_EXPR. */
5140 if (code == COND_EXPR)
5141 {
5142 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5143 == INTEGER_INDUC_COND_REDUCTION)
5144 code = induc_code;
5145 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5146 == CONST_COND_REDUCTION)
5147 code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
5148 else
5149 code = MAX_EXPR;
5150 }
5151
5152 /* See if the target wants to do the final (shift) reduction
5153 in a vector mode of smaller size and first reduce upper/lower
5154 halves against each other. */
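/* For instance (a sketch of one possible target decision): the hook
   may ask us to split a 256-bit V8SI accumulator into two 128-bit
   V4SI halves, combine the halves with the reduction CODE, and only
   then perform the whole-vector-shift reduction on the narrower
   V4SI value.  */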
5155 enum machine_mode mode1 = mode;
5156 tree vectype1 = vectype;
5157 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5158 unsigned sz1 = sz;
5159 if (!slp_reduc
5160 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5161 sz1 = GET_MODE_SIZE (mode1).to_constant ();
5162
5163 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5164 reduce_with_shift = have_whole_vector_shift (mode1);
5165 if (!VECTOR_MODE_P (mode1))
5166 reduce_with_shift = false;
5167 else
5168 {
5169 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5170 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5171 reduce_with_shift = false;
5172 }
5173
5174 /* First reduce the vector to the desired vector size on which we
5175 should do the shift reduction, by combining upper and lower halves. */
5176 new_temp = new_phi_result;
5177 while (sz > sz1)
5178 {
5179 gcc_assert (!slp_reduc);
5180 sz /= 2;
5181 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5182
5183 /* The target has to make sure we support lowpart/highpart
5184 extraction, either via direct vector extract or through
5185 integer mode punning. */
5186 tree dst1, dst2;
5187 if (convert_optab_handler (vec_extract_optab,
5188 TYPE_MODE (TREE_TYPE (new_temp)),
5189 TYPE_MODE (vectype1))
5190 != CODE_FOR_nothing)
5191 {
5192 /* Extract sub-vectors directly once vec_extract becomes
5193 a conversion optab. */
5194 dst1 = make_ssa_name (vectype1);
5195 epilog_stmt
5196 = gimple_build_assign (dst1, BIT_FIELD_REF,
5197 build3 (BIT_FIELD_REF, vectype1,
5198 new_temp, TYPE_SIZE (vectype1),
5199 bitsize_int (0)));
5200 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5201 dst2 = make_ssa_name (vectype1);
5202 epilog_stmt
5203 = gimple_build_assign (dst2, BIT_FIELD_REF,
5204 build3 (BIT_FIELD_REF, vectype1,
5205 new_temp, TYPE_SIZE (vectype1),
5206 bitsize_int (sz * BITS_PER_UNIT)));
5207 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5208 }
5209 else
5210 {
5211 /* Extract via punning to appropriately sized integer mode
5212 vector. */
5213 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5214 1);
5215 tree etype = build_vector_type (eltype, 2);
5216 gcc_assert (convert_optab_handler (vec_extract_optab,
5217 TYPE_MODE (etype),
5218 TYPE_MODE (eltype))
5219 != CODE_FOR_nothing);
5220 tree tem = make_ssa_name (etype);
5221 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5222 build1 (VIEW_CONVERT_EXPR,
5223 etype, new_temp));
5224 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5225 new_temp = tem;
5226 tem = make_ssa_name (eltype);
5227 epilog_stmt
5228 = gimple_build_assign (tem, BIT_FIELD_REF,
5229 build3 (BIT_FIELD_REF, eltype,
5230 new_temp, TYPE_SIZE (eltype),
5231 bitsize_int (0)));
5232 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5233 dst1 = make_ssa_name (vectype1);
5234 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5235 build1 (VIEW_CONVERT_EXPR,
5236 vectype1, tem));
5237 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5238 tem = make_ssa_name (eltype);
5239 epilog_stmt
5240 = gimple_build_assign (tem, BIT_FIELD_REF,
5241 build3 (BIT_FIELD_REF, eltype,
5242 new_temp, TYPE_SIZE (eltype),
5243 bitsize_int (sz * BITS_PER_UNIT)));
5244 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5245 dst2 = make_ssa_name (vectype1);
5246 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5247 build1 (VIEW_CONVERT_EXPR,
5248 vectype1, tem));
5249 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5250 }
5251
5252 new_temp = make_ssa_name (vectype1);
5253 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5254 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5255 }
5256
5257 if (reduce_with_shift && !slp_reduc)
5258 {
5259 int element_bitsize = tree_to_uhwi (bitsize);
5260 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5261 for variable-length vectors and also requires direct target support
5262 for loop reductions. */
5263 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5264 int nelements = vec_size_in_bits / element_bitsize;
5265 vec_perm_builder sel;
5266 vec_perm_indices indices;
5267
5268 int elt_offset;
5269
5270 tree zero_vec = build_zero_cst (vectype1);
5271 /* Case 2: Create:
5272 for (offset = nelements/2; offset >= 1; offset/=2)
5273 {
5274 Create: va' = vec_shift <va, offset>
5275 Create: va = vop <va, va'>
5276 } */
5277
5278 tree rhs;
5279
5280 if (dump_enabled_p ())
5281 dump_printf_loc (MSG_NOTE, vect_location,
5282 "Reduce using vector shifts\n");
5283
5284 mode1 = TYPE_MODE (vectype1);
5285 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5286 for (elt_offset = nelements / 2;
5287 elt_offset >= 1;
5288 elt_offset /= 2)
5289 {
5290 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5291 indices.new_vector (sel, 2, nelements);
5292 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5293 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5294 new_temp, zero_vec, mask);
5295 new_name = make_ssa_name (vec_dest, epilog_stmt);
5296 gimple_assign_set_lhs (epilog_stmt, new_name);
5297 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5298
5299 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5300 new_temp);
5301 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5302 gimple_assign_set_lhs (epilog_stmt, new_temp);
5303 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5304 }
5305
5306 /* 2.4 Extract the final scalar result. Create:
5307 s_out3 = extract_field <v_out2, bitpos> */
5308
5309 if (dump_enabled_p ())
5310 dump_printf_loc (MSG_NOTE, vect_location,
5311 "extract scalar result\n");
5312
5313 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5314 bitsize, bitsize_zero_node);
5315 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5316 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5317 gimple_assign_set_lhs (epilog_stmt, new_temp);
5318 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5319 scalar_results.safe_push (new_temp);
5320 }
5321 else
5322 {
5323 /* Case 3: Create:
5324 s = extract_field <v_out2, 0>
5325 for (offset = element_size;
5326 offset < vector_size;
5327 offset += element_size;)
5328 {
5329 Create: s' = extract_field <v_out2, offset>
5330 Create: s = op <s, s'> // For non SLP cases
5331 } */
5332
5333 if (dump_enabled_p ())
5334 dump_printf_loc (MSG_NOTE, vect_location,
5335 "Reduce using scalar code.\n");
5336
5337 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5338 int element_bitsize = tree_to_uhwi (bitsize);
5339 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5340 {
5341 int bit_offset;
5342 if (gimple_code (new_phi) == GIMPLE_PHI)
5343 vec_temp = PHI_RESULT (new_phi);
5344 else
5345 vec_temp = gimple_assign_lhs (new_phi);
5346 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5347 bitsize_zero_node);
5348 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5349 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5350 gimple_assign_set_lhs (epilog_stmt, new_temp);
5351 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5352
5353 /* In SLP we don't need to apply the reduction operation, so we just
5354 collect the s' values in SCALAR_RESULTS. */
5355 if (slp_reduc)
5356 scalar_results.safe_push (new_temp);
5357
5358 for (bit_offset = element_bitsize;
5359 bit_offset < vec_size_in_bits;
5360 bit_offset += element_bitsize)
5361 {
5362 tree bitpos = bitsize_int (bit_offset);
5363 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5364 bitsize, bitpos);
5365
5366 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5367 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5368 gimple_assign_set_lhs (epilog_stmt, new_name);
5369 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5370
5371 if (slp_reduc)
5372 {
5373 /* In SLP we don't need to apply the reduction operation, so
5374 we just collect the s' values in SCALAR_RESULTS. */
5375 new_temp = new_name;
5376 scalar_results.safe_push (new_name);
5377 }
5378 else
5379 {
5380 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5381 new_name, new_temp);
5382 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5383 gimple_assign_set_lhs (epilog_stmt, new_temp);
5384 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5385 }
5386 }
5387 }
5388
5389 /* The only case where we need to reduce scalar results in SLP is
5390 unrolling. If the size of SCALAR_RESULTS is greater than
5391 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5392 REDUC_GROUP_SIZE. */
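/* E.g. (sketch): with REDUC_GROUP_SIZE 2 and four scalar results
   {r0, r1, r2, r3} produced by an unrolled SLP instance, the loop
   below folds r2 into r0 and r3 into r1, leaving one result per
   group member.  */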
5393 if (slp_reduc)
5394 {
5395 tree res, first_res, new_res;
5396 gimple *new_stmt;
5397
5398 /* Reduce multiple scalar results in case of SLP unrolling. */
5399 for (j = group_size; scalar_results.iterate (j, &res);
5400 j++)
5401 {
5402 first_res = scalar_results[j % group_size];
5403 new_stmt = gimple_build_assign (new_scalar_dest, code,
5404 first_res, res);
5405 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5406 gimple_assign_set_lhs (new_stmt, new_res);
5407 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5408 scalar_results[j % group_size] = new_res;
5409 }
5410 }
5411 else
5412 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5413 scalar_results.safe_push (new_temp);
5414 }
5415
5416 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5417 == INTEGER_INDUC_COND_REDUCTION)
5418 && !operand_equal_p (initial_def, induc_val, 0))
5419 {
5420 /* Earlier we set the initial value to be a vector of induc_val
5421 values. Check the result and if it is induc_val then replace
5422 it with the original initial value, unless induc_val is
5423 the same as initial_def already. */
5424 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5425 induc_val);
5426
5427 tree tmp = make_ssa_name (new_scalar_dest);
5428 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5429 initial_def, new_temp);
5430 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5431 scalar_results[0] = tmp;
5432 }
5433 }
5434
5435 vect_finalize_reduction:
5436
5437 if (double_reduc)
5438 loop = loop->inner;
5439
5440 /* 2.5 Adjust the final result by the initial value of the reduction
5441 variable. (When such adjustment is not needed, then
5442 'adjustment_def' is zero). For example, if code is PLUS we create:
5443 new_temp = loop_exit_def + adjustment_def */
5444
5445 if (adjustment_def)
5446 {
5447 gcc_assert (!slp_reduc);
5448 if (nested_in_vect_loop)
5449 {
5450 new_phi = new_phis[0];
5451 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5452 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5453 new_dest = vect_create_destination_var (scalar_dest, vectype);
5454 }
5455 else
5456 {
5457 new_temp = scalar_results[0];
5458 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5459 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5460 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5461 }
5462
5463 epilog_stmt = gimple_build_assign (new_dest, expr);
5464 new_temp = make_ssa_name (new_dest, epilog_stmt);
5465 gimple_assign_set_lhs (epilog_stmt, new_temp);
5466 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5467 if (nested_in_vect_loop)
5468 {
5469 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5470 STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5471 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5472
5473 if (!double_reduc)
5474 scalar_results.quick_push (new_temp);
5475 else
5476 scalar_results[0] = new_temp;
5477 }
5478 else
5479 scalar_results[0] = new_temp;
5480
5481 new_phis[0] = epilog_stmt;
5482 }
5483
5484 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5485 phis with new adjusted scalar results, i.e., replace use <s_out0>
5486 with use <s_out4>.
5487
5488 Transform:
5489 loop_exit:
5490 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5491 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5492 v_out2 = reduce <v_out1>
5493 s_out3 = extract_field <v_out2, 0>
5494 s_out4 = adjust_result <s_out3>
5495 use <s_out0>
5496 use <s_out0>
5497
5498 into:
5499
5500 loop_exit:
5501 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5502 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5503 v_out2 = reduce <v_out1>
5504 s_out3 = extract_field <v_out2, 0>
5505 s_out4 = adjust_result <s_out3>
5506 use <s_out4>
5507 use <s_out4> */
5508
5509
5510 /* In an SLP reduction chain we reduce the vector results into one vector
5511 if necessary, hence we set REDUC_GROUP_SIZE to 1 here. SCALAR_DEST is
5512 the LHS of the last stmt in the reduction chain, since we are looking
5513 for the loop exit phi node. */
5514 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5515 {
5516 stmt_vec_info dest_stmt_info
5517 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5518 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5519 group_size = 1;
5520 }
5521
5522 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5523 case REDUC_GROUP_SIZE is greater than the vectorization factor).
5524 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5525 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5526 correspond to the first vector stmt, etc.
5527 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
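/* E.g. (sketch): with REDUC_GROUP_SIZE 4 and two new vector stmts,
   RATIO is 2, so scalar results 0-1 are matched with the first vector
   stmt and scalar results 2-3 with the second.  */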
5528 if (group_size > new_phis.length ())
5529 {
5530 ratio = group_size / new_phis.length ();
5531 gcc_assert (!(group_size % new_phis.length ()));
5532 }
5533 else
5534 ratio = 1;
5535
5536 stmt_vec_info epilog_stmt_info = NULL;
5537 for (k = 0; k < group_size; k++)
5538 {
5539 if (k % ratio == 0)
5540 {
5541 epilog_stmt_info = loop_vinfo->lookup_stmt (new_phis[k / ratio]);
5542 reduction_phi_info = reduction_phis[k / ratio];
5543 if (double_reduc)
5544 inner_phi = inner_phis[k / ratio];
5545 }
5546
5547 if (slp_reduc)
5548 {
5549 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5550
5551 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5552 /* SLP statements can't participate in patterns. */
5553 gcc_assert (!orig_stmt_info);
5554 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5555 }
5556
5557 phis.create (3);
5558 /* Find the loop-closed-use at the loop exit of the original scalar
5559 result. (The reduction result is expected to have two immediate uses -
5560 one at the latch block, and one at the loop exit). */
5561 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5562 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5563 && !is_gimple_debug (USE_STMT (use_p)))
5564 phis.safe_push (USE_STMT (use_p));
5565
5566 /* While we expect to have found an exit_phi because of loop-closed-ssa
5567 form, we can end up without one if the scalar cycle is dead. */
5568
5569 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5570 {
5571 if (outer_loop)
5572 {
5573 stmt_vec_info exit_phi_vinfo
5574 = loop_vinfo->lookup_stmt (exit_phi);
5575 gphi *vect_phi;
5576
5577 if (double_reduc)
5578 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5579 else
5580 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt_info;
5581 if (!double_reduc
5582 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5583 != vect_double_reduction_def)
5584 continue;
5585
5586 /* Handle double reduction:
5587
5588 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5589 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5590 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5591 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5592
5593 At that point the regular reduction (stmt2 and stmt3) is
5594 already vectorized, as well as the exit phi node, stmt4.
5595 Here we vectorize the phi node of double reduction, stmt1, and
5596 update all relevant statements. */
5597
5598 /* Go through all the uses of s2 to find double reduction phi
5599 node, i.e., stmt1 above. */
5600 orig_name = PHI_RESULT (exit_phi);
5601 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5602 {
5603 stmt_vec_info use_stmt_vinfo;
5604 tree vect_phi_init, preheader_arg, vect_phi_res;
5605 basic_block bb = gimple_bb (use_stmt);
5606
5607 /* Check that USE_STMT is really a double reduction phi
5608 node. */
5609 if (gimple_code (use_stmt) != GIMPLE_PHI
5610 || gimple_phi_num_args (use_stmt) != 2
5611 || bb->loop_father != outer_loop)
5612 continue;
5613 use_stmt_vinfo = loop_vinfo->lookup_stmt (use_stmt);
5614 if (!use_stmt_vinfo
5615 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5616 != vect_double_reduction_def)
5617 continue;
5618
5619 /* Create vector phi node for double reduction:
5620 vs1 = phi <vs0, vs2>
5621 vs1 was created previously in this function by a call to
5622 vect_get_vec_def_for_operand and is stored in
5623 vec_initial_def;
5624 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5625 vs0 is created here. */
5626
5627 /* Create vector phi node. */
5628 vect_phi = create_phi_node (vec_initial_def, bb);
5629 loop_vec_info_for_loop (outer_loop)->add_stmt (vect_phi);
5630
5631 /* Create vs0 - initial def of the double reduction phi. */
5632 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5633 loop_preheader_edge (outer_loop));
5634 vect_phi_init = get_initial_def_for_reduction
5635 (stmt_info, preheader_arg, NULL);
5636
5637 /* Update phi node arguments with vs0 and vs2. */
5638 add_phi_arg (vect_phi, vect_phi_init,
5639 loop_preheader_edge (outer_loop),
5640 UNKNOWN_LOCATION);
5641 add_phi_arg (vect_phi, PHI_RESULT (inner_phi->stmt),
5642 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5643 if (dump_enabled_p ())
5644 dump_printf_loc (MSG_NOTE, vect_location,
5645 "created double reduction phi node: %G",
5646 vect_phi);
5647
5648 vect_phi_res = PHI_RESULT (vect_phi);
5649
5650 /* Replace the use, i.e., set the correct vs1 in the regular
5651 reduction phi node. FORNOW, NCOPIES is always 1, so the
5652 loop is redundant. */
5653 stmt_vec_info use_info = reduction_phi_info;
5654 for (j = 0; j < ncopies; j++)
5655 {
5656 edge pr_edge = loop_preheader_edge (loop);
5657 SET_PHI_ARG_DEF (as_a <gphi *> (use_info->stmt),
5658 pr_edge->dest_idx, vect_phi_res);
5659 use_info = STMT_VINFO_RELATED_STMT (use_info);
5660 }
5661 }
5662 }
5663 }
5664
5665 phis.release ();
5666 if (nested_in_vect_loop)
5667 {
5668 if (double_reduc)
5669 loop = outer_loop;
5670 else
5671 continue;
5672 }
5673
5674 phis.create (3);
5675 /* Find the loop-closed-use at the loop exit of the original scalar
5676 result. (The reduction result is expected to have two immediate uses,
5677 one at the latch block, and one at the loop exit). For double
5678 reductions we are looking for exit phis of the outer loop. */
5679 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5680 {
5681 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5682 {
5683 if (!is_gimple_debug (USE_STMT (use_p)))
5684 phis.safe_push (USE_STMT (use_p));
5685 }
5686 else
5687 {
5688 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5689 {
5690 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5691
5692 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5693 {
5694 if (!flow_bb_inside_loop_p (loop,
5695 gimple_bb (USE_STMT (phi_use_p)))
5696 && !is_gimple_debug (USE_STMT (phi_use_p)))
5697 phis.safe_push (USE_STMT (phi_use_p));
5698 }
5699 }
5700 }
5701 }
5702
5703 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5704 {
5705 /* Replace the uses: */
5706 orig_name = PHI_RESULT (exit_phi);
5707 scalar_result = scalar_results[k];
5708 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5709 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5710 SET_USE (use_p, scalar_result);
5711 }
5712
5713 phis.release ();
5714 }
5715 }
5716
5717 /* Return a vector of type VECTYPE that is equal to the vector select
5718 operation "MASK ? VEC : IDENTITY". Insert the select statements
5719 before GSI. */
5720
5721 static tree
5722 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5723 tree vec, tree identity)
5724 {
5725 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5726 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5727 mask, vec, identity);
5728 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5729 return cond;
5730 }
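/* As an illustration (a sketch only, not taken from an actual caller):
   with a V4SI IDENTITY of {0, 0, 0, 0} and loop mask M, the function
   emits

       cond_N = VEC_COND_EXPR <M, VEC, {0, 0, 0, 0}>;

   and returns cond_N, so that masked-off lanes contribute the identity
   value to a subsequent PLUS reduction.  */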
5731
5732 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5733 order, starting with LHS. Insert the extraction statements before GSI and
5734 associate the new scalar SSA names with variable SCALAR_DEST.
5735 Return the SSA name for the result. */
5736
5737 static tree
5738 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5739 tree_code code, tree lhs, tree vector_rhs)
5740 {
5741 tree vectype = TREE_TYPE (vector_rhs);
5742 tree scalar_type = TREE_TYPE (vectype);
5743 tree bitsize = TYPE_SIZE (scalar_type);
5744 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5745 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5746
5747 for (unsigned HOST_WIDE_INT bit_offset = 0;
5748 bit_offset < vec_size_in_bits;
5749 bit_offset += element_bitsize)
5750 {
5751 tree bitpos = bitsize_int (bit_offset);
5752 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5753 bitsize, bitpos);
5754
5755 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5756 rhs = make_ssa_name (scalar_dest, stmt);
5757 gimple_assign_set_lhs (stmt, rhs);
5758 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5759
5760 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5761 tree new_name = make_ssa_name (scalar_dest, stmt);
5762 gimple_assign_set_lhs (stmt, new_name);
5763 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5764 lhs = new_name;
5765 }
5766 return lhs;
5767 }
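/* As an illustration (assuming a V4SI VECTOR_RHS and PLUS_EXPR CODE),
   the sequence emitted above is equivalent to:

       s_0 = BIT_FIELD_REF <v, 32, 0>;    lhs_0 = lhs + s_0;
       s_1 = BIT_FIELD_REF <v, 32, 32>;   lhs_1 = lhs_0 + s_1;
       s_2 = BIT_FIELD_REF <v, 32, 64>;   lhs_2 = lhs_1 + s_2;
       s_3 = BIT_FIELD_REF <v, 32, 96>;   lhs_3 = lhs_2 + s_3;

   and lhs_3 is returned, preserving the left-to-right evaluation
   order of the scalar loop.  */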
5768
5769 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5770 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5771 statement. CODE is the operation performed by STMT_INFO and OPS are
5772 its scalar operands. REDUC_INDEX is the index of the operand in
5773 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5774 implements in-order reduction, or IFN_LAST if we should open-code it.
5775 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5776 that should be used to control the operation in a fully-masked loop. */
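/* A typical source pattern that reaches this function (shown only as a
   sketch) is an in-order floating-point summation such as:

       double res = init;
       for (int i = 0; i < n; i++)
         res += a[i];

   compiled without -ffast-math, where the partial sums may not be
   reassociated.  Each vector of elements is then folded into the
   running scalar via REDUC_FN (e.g. IFN_FOLD_LEFT_PLUS when the target
   provides it) or via the open-coded expansion above.  */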
5777
5778 static bool
5779 vectorize_fold_left_reduction (stmt_vec_info stmt_info,
5780 gimple_stmt_iterator *gsi,
5781 stmt_vec_info *vec_stmt, slp_tree slp_node,
5782 gimple *reduc_def_stmt,
5783 tree_code code, internal_fn reduc_fn,
5784 tree ops[3], tree vectype_in,
5785 int reduc_index, vec_loop_masks *masks)
5786 {
5787 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5788 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5789 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5790 stmt_vec_info new_stmt_info = NULL;
5791
5792 int ncopies;
5793 if (slp_node)
5794 ncopies = 1;
5795 else
5796 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5797
5798 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5799 gcc_assert (ncopies == 1);
5800 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5801 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5802 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5803 == FOLD_LEFT_REDUCTION);
5804
5805 if (slp_node)
5806 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5807 TYPE_VECTOR_SUBPARTS (vectype_in)));
5808
5809 tree op0 = ops[1 - reduc_index];
5810
5811 int group_size = 1;
5812 stmt_vec_info scalar_dest_def_info;
5813 auto_vec<tree> vec_oprnds0;
5814 if (slp_node)
5815 {
5816 auto_vec<vec<tree> > vec_defs (2);
5817 auto_vec<tree> sops(2);
5818 sops.quick_push (ops[0]);
5819 sops.quick_push (ops[1]);
5820 vect_get_slp_defs (sops, slp_node, &vec_defs);
5821 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
5822 vec_defs[0].release ();
5823 vec_defs[1].release ();
5824 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5825 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5826 }
5827 else
5828 {
5829 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt_info);
5830 vec_oprnds0.create (1);
5831 vec_oprnds0.quick_push (loop_vec_def0);
5832 scalar_dest_def_info = stmt_info;
5833 }
5834
5835 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5836 tree scalar_type = TREE_TYPE (scalar_dest);
5837 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5838
5839 int vec_num = vec_oprnds0.length ();
5840 gcc_assert (vec_num == 1 || slp_node);
5841 tree vec_elem_type = TREE_TYPE (vectype_out);
5842 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5843
5844 tree vector_identity = NULL_TREE;
5845 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5846 vector_identity = build_zero_cst (vectype_out);
5847
5848 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5849 int i;
5850 tree def0;
5851 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5852 {
5853 gimple *new_stmt;
5854 tree mask = NULL_TREE;
5855 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5856 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5857
5858 /* Handle MINUS by adding the negative. */
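/* I.e. (a note on the rationale): acc -= x is rewritten as
   acc += (-x) on the vector operand, which the fold-left reduction
   IFN can then consume directly.  */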
5859 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5860 {
5861 tree negated = make_ssa_name (vectype_out);
5862 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5863 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5864 def0 = negated;
5865 }
5866
5867 if (mask)
5868 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5869 vector_identity);
5870
5871 /* On the first iteration the input is simply the scalar phi
5872 result, and for subsequent iterations it is the output of
5873 the preceding operation. */
5874 if (reduc_fn != IFN_LAST)
5875 {
5876 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
5877 /* For chained SLP reductions the output of the previous reduction
5878 operation serves as the input of the next. For the final statement
5879 the output cannot be a temporary - we reuse the original
5880 scalar destination of the last statement. */
5881 if (i != vec_num - 1)
5882 {
5883 gimple_set_lhs (new_stmt, scalar_dest_var);
5884 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5885 gimple_set_lhs (new_stmt, reduc_var);
5886 }
5887 }
5888 else
5889 {
5890 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5891 reduc_var, def0);
5892 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5893 /* Remove the statement, so that we can use the same code paths
5894 as for statements that we've just created. */
5895 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5896 gsi_remove (&tmp_gsi, true);
5897 }
5898
5899 if (i == vec_num - 1)
5900 {
5901 gimple_set_lhs (new_stmt, scalar_dest);
5902 new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
5903 new_stmt);
5904 }
5905 else
5906 new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
5907 new_stmt, gsi);
5908
5909 if (slp_node)
5910 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
5911 }
5912
5913 if (!slp_node)
5914 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
5915
5916 return true;
5917 }
5918
5919 /* Function is_nonwrapping_integer_induction.
5920
5921 Check if STMT_VINFO (which is part of loop LOOP) both increments and
5922 does not cause overflow. */
5923
5924 static bool
5925 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, struct loop *loop)
5926 {
5927 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
5928 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5929 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5930 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
5931 widest_int ni, max_loop_value, lhs_max;
5932 wi::overflow_type overflow = wi::OVF_NONE;
5933
5934 /* Make sure the loop is integer based. */
5935 if (TREE_CODE (base) != INTEGER_CST
5936 || TREE_CODE (step) != INTEGER_CST)
5937 return false;
5938
5939 /* Check that the max size of the loop will not wrap. */
5940
5941 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5942 return true;
5943
5944 if (! max_stmt_executions (loop, &ni))
5945 return false;
5946
5947 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5948 &overflow);
5949 if (overflow)
5950 return false;
5951
5952 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5953 TYPE_SIGN (lhs_type), &overflow);
5954 if (overflow)
5955 return false;
5956
5957 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5958 <= TYPE_PRECISION (lhs_type));
5959 }
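/* For instance (a sketch): for an induction with base 0 and step 4 in
   a 32-bit unsigned type, the checks above require that
   0 + 4 * max_stmt_executions still fits in 32 bits; otherwise the
   induction could wrap and is rejected.  */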
5960
5961 /* Function vectorizable_reduction.
5962
5963 Check if STMT_INFO performs a reduction operation that can be vectorized.
5964 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
5965 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5966 Return true if STMT_INFO is vectorizable in this way.
5967
5968 This function also handles reduction idioms (patterns) that have been
5969 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
5970 may be of this form:
5971 X = pattern_expr (arg0, arg1, ..., X)
5972 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5973 sequence that had been detected and replaced by the pattern-stmt
5974 (STMT_INFO).
5975
5976 This function also handles reduction of condition expressions, for example:
5977 for (int i = 0; i < N; i++)
5978 if (a[i] < value)
5979 last = a[i];
5980 This is handled by vectorising the loop and creating an additional vector
5981 containing the loop indexes for which "a[i] < value" was true. In the
5982 function epilogue this is reduced to a single max value and then used to
5983 index into the vector of results.
5984
5985 In some cases of reduction patterns, the type of the reduction variable X is
5986 different than the type of the other arguments of STMT_INFO.
5987 In such cases, the vectype that is used when transforming STMT_INFO into
5988 a vector stmt is different than the vectype that is used to determine the
5989 vectorization factor, because it consists of a different number of elements
5990 than the actual number of elements that are being operated upon in parallel.
5991
5992 For example, consider an accumulation of shorts into an int accumulator.
5993 On some targets it's possible to vectorize this pattern operating on 8
5994 shorts at a time (hence, the vectype for purposes of determining the
5995 vectorization factor should be V8HI); on the other hand, the vectype that
5996 is used to create the vector form is actually V4SI (the type of the result).
5997
5998 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5999 indicates what is the actual level of parallelism (V8HI in the example), so
6000 that the right vectorization factor would be derived. This vectype
6001 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6002 be used to create the vectorized stmt. The right vectype for the vectorized
6003 stmt is obtained from the type of the result X:
6004 get_vectype_for_scalar_type (TREE_TYPE (X))
6005
6006 This means that, contrary to "regular" reductions (or "regular" stmts in
6007 general), the following equation:
6008 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6009 does *NOT* necessarily hold for reduction patterns. */
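/* As a concrete illustration of the paragraphs above (a sketch, not a
   specific testcase):

       short s[N];
       int sum = 0;
       for (int i = 0; i < N; i++)
         sum += s[i];

   may be recognized as a widening-sum pattern; the vectorization factor
   is then derived from V8HI (the type of the loop operands recorded in
   STMT_VINFO_VECTYPE), while the vector statement itself produces a
   V4SI result obtained from the type of SUM.  */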
6010
6011 bool
6012 vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
6013 stmt_vec_info *vec_stmt, slp_tree slp_node,
6014 slp_instance slp_node_instance,
6015 stmt_vector_for_cost *cost_vec)
6016 {
6017 tree vec_dest;
6018 tree scalar_dest;
6019 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6020 tree vectype_in = NULL_TREE;
6021 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6022 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6023 enum tree_code code, orig_code;
6024 internal_fn reduc_fn;
6025 machine_mode vec_mode;
6026 int op_type;
6027 optab optab;
6028 tree new_temp = NULL_TREE;
6029 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6030 stmt_vec_info cond_stmt_vinfo = NULL;
6031 enum tree_code cond_reduc_op_code = ERROR_MARK;
6032 tree scalar_type;
6033 bool is_simple_use;
6034 int i;
6035 int ncopies;
6036 int epilog_copies;
6037 stmt_vec_info prev_stmt_info, prev_phi_info;
6038 bool single_defuse_cycle = false;
6039 stmt_vec_info new_stmt_info = NULL;
6040 int j;
6041 tree ops[3];
6042 enum vect_def_type dts[3];
6043 bool nested_cycle = false, found_nested_cycle_def = false;
6044 bool double_reduc = false;
6045 basic_block def_bb;
6046 struct loop * def_stmt_loop;
6047 tree def_arg;
6048 auto_vec<tree> vec_oprnds0;
6049 auto_vec<tree> vec_oprnds1;
6050 auto_vec<tree> vec_oprnds2;
6051 auto_vec<tree> vect_defs;
6052 auto_vec<stmt_vec_info> phis;
6053 int vec_num;
6054 tree def0, tem;
6055 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6056 tree cond_reduc_val = NULL_TREE;
6057
6058 /* Make sure it was already recognized as a reduction computation. */
6059 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6060 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6061 return false;
6062
6063 if (nested_in_vect_loop_p (loop, stmt_info))
6064 {
6065 loop = loop->inner;
6066 nested_cycle = true;
6067 }
6068
6069 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6070 gcc_assert (slp_node
6071 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6072
6073 if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
6074 {
6075 tree phi_result = gimple_phi_result (phi);
6076 /* Analysis is fully done on the reduction stmt invocation. */
6077 if (! vec_stmt)
6078 {
6079 if (slp_node)
6080 slp_node_instance->reduc_phis = slp_node;
6081
6082 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6083 return true;
6084 }
6085
6086 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6087 /* Leave the scalar phi in place. Note that checking
6088 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6089 for reductions involving a single statement. */
6090 return true;
6091
6092 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
6093 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
6094
6095 if (STMT_VINFO_VEC_REDUCTION_TYPE (reduc_stmt_info)
6096 == EXTRACT_LAST_REDUCTION)
6097 /* Leave the scalar phi in place. */
6098 return true;
6099
6100 gassign *reduc_stmt = as_a <gassign *> (reduc_stmt_info->stmt);
6101 code = gimple_assign_rhs_code (reduc_stmt);
6102 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6103 {
6104 tree op = gimple_op (reduc_stmt, k);
6105 if (op == phi_result)
6106 continue;
6107 if (k == 1 && code == COND_EXPR)
6108 continue;
6109 bool is_simple_use = vect_is_simple_use (op, loop_vinfo, &dt);
6110 gcc_assert (is_simple_use);
6111 if (dt == vect_constant_def || dt == vect_external_def)
6112 continue;
6113 if (!vectype_in
6114 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6115 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6116 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6117 break;
6118 }
6119 /* For a nested cycle we might end up with an operation like
6120 phi_result * phi_result. */
6121 if (!vectype_in)
6122 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
6123 gcc_assert (vectype_in);
6124
6125 if (slp_node)
6126 ncopies = 1;
6127 else
6128 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6129
6130 stmt_vec_info use_stmt_info;
6131 if (ncopies > 1
6132 && STMT_VINFO_RELEVANT (reduc_stmt_info) <= vect_used_only_live
6133 && (use_stmt_info = loop_vinfo->lookup_single_use (phi_result))
6134 && vect_stmt_to_vectorize (use_stmt_info) == reduc_stmt_info)
6135 single_defuse_cycle = true;
6136
6137 /* Create the destination vector */
6138 scalar_dest = gimple_assign_lhs (reduc_stmt);
6139 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6140
6141 if (slp_node)
6142 /* The size vect_schedule_slp_instance computes is off for us. */
6143 vec_num = vect_get_num_vectors
6144 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6145 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6146 vectype_in);
6147 else
6148 vec_num = 1;
6149
6150 /* Generate the reduction PHIs upfront. */
6151 prev_phi_info = NULL;
6152 for (j = 0; j < ncopies; j++)
6153 {
6154 if (j == 0 || !single_defuse_cycle)
6155 {
6156 for (i = 0; i < vec_num; i++)
6157 {
6158 /* Create the reduction-phi that defines the reduction
6159 operand. */
6160 gimple *new_phi = create_phi_node (vec_dest, loop->header);
6161 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6162
6163 if (slp_node)
6164 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
6165 else
6166 {
6167 if (j == 0)
6168 STMT_VINFO_VEC_STMT (stmt_info)
6169 = *vec_stmt = new_phi_info;
6170 else
6171 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
6172 prev_phi_info = new_phi_info;
6173 }
6174 }
6175 }
6176 }
6177
6178 return true;
6179 }
6180
6181 /* 1. Is vectorizable reduction? */
6182 /* Not supportable if the reduction variable is used in the loop, unless
6183 it's a reduction chain. */
6184 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6185 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6186 return false;
6187
6188 /* Reductions that are not used even in an enclosing outer-loop,
6189 are expected to be "live" (used out of the loop). */
6190 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6191 && !STMT_VINFO_LIVE_P (stmt_info))
6192 return false;
6193
6194 /* 2. Has this been recognized as a reduction pattern?
6195
6196 Check if STMT represents a pattern that has been recognized
6197 in earlier analysis stages. For stmts that represent a pattern,
6198 the STMT_VINFO_RELATED_STMT field records the last stmt in
6199 the original sequence that constitutes the pattern. */
6200
6201 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6202 if (orig_stmt_info)
6203 {
6204 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6205 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6206 }
6207
6208 /* 3. Check the operands of the operation. The first operands are defined
6209 inside the loop body. The last operand is the reduction variable,
6210 which is defined by the loop-header-phi. */
6211
6212 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6213
6214 /* Flatten RHS. */
6215 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6216 {
6217 case GIMPLE_BINARY_RHS:
6218 code = gimple_assign_rhs_code (stmt);
6219 op_type = TREE_CODE_LENGTH (code);
6220 gcc_assert (op_type == binary_op);
6221 ops[0] = gimple_assign_rhs1 (stmt);
6222 ops[1] = gimple_assign_rhs2 (stmt);
6223 break;
6224
6225 case GIMPLE_TERNARY_RHS:
6226 code = gimple_assign_rhs_code (stmt);
6227 op_type = TREE_CODE_LENGTH (code);
6228 gcc_assert (op_type == ternary_op);
6229 ops[0] = gimple_assign_rhs1 (stmt);
6230 ops[1] = gimple_assign_rhs2 (stmt);
6231 ops[2] = gimple_assign_rhs3 (stmt);
6232 break;
6233
6234 case GIMPLE_UNARY_RHS:
6235 return false;
6236
6237 default:
6238 gcc_unreachable ();
6239 }
6240
6241 if (code == COND_EXPR && slp_node)
6242 return false;
6243
6244 scalar_dest = gimple_assign_lhs (stmt);
6245 scalar_type = TREE_TYPE (scalar_dest);
6246 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6247 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6248 return false;
6249
6250 /* Do not try to vectorize bit-precision reductions. */
6251 if (!type_has_mode_precision_p (scalar_type))
6252 return false;
6253
6254 /* All uses but the last are expected to be defined in the loop.
6255 The last use is the reduction variable. In case of nested cycle this
6256 assumption is not true: we use reduc_index to record the index of the
6257 reduction variable. */
6258 stmt_vec_info reduc_def_info;
6259 if (orig_stmt_info)
6260 reduc_def_info = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6261 else
6262 reduc_def_info = STMT_VINFO_REDUC_DEF (stmt_info);
6263 gcc_assert (reduc_def_info);
6264 gphi *reduc_def_phi = as_a <gphi *> (reduc_def_info->stmt);
6265 tree reduc_def = PHI_RESULT (reduc_def_phi);
6266 int reduc_index = -1;
6267 for (i = 0; i < op_type; i++)
6268 {
6269 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6270 if (i == 0 && code == COND_EXPR)
6271 continue;
6272
6273 stmt_vec_info def_stmt_info;
6274 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &dts[i], &tem,
6275 &def_stmt_info);
6276 dt = dts[i];
6277 gcc_assert (is_simple_use);
6278 if (dt == vect_reduction_def
6279 && ops[i] == reduc_def)
6280 {
6281 reduc_index = i;
6282 continue;
6283 }
6284 else if (tem)
6285 {
6286 /* To properly compute ncopies we are interested in the widest
6287 input type in case we're looking at a widening accumulation. */
6288 if (!vectype_in
6289 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6290 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6291 vectype_in = tem;
6292 }
6293
6294 if (dt != vect_internal_def
6295 && dt != vect_external_def
6296 && dt != vect_constant_def
6297 && dt != vect_induction_def
6298 && !(dt == vect_nested_cycle && nested_cycle))
6299 return false;
6300
6301 if (dt == vect_nested_cycle
6302 && ops[i] == reduc_def)
6303 {
6304 found_nested_cycle_def = true;
6305 reduc_index = i;
6306 }
6307
6308 if (i == 1 && code == COND_EXPR)
6309 {
6310 /* Record how value of COND_EXPR is defined. */
6311 if (dt == vect_constant_def)
6312 {
6313 cond_reduc_dt = dt;
6314 cond_reduc_val = ops[i];
6315 }
6316 if (dt == vect_induction_def
6317 && def_stmt_info
6318 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6319 {
6320 cond_reduc_dt = dt;
6321 cond_stmt_vinfo = def_stmt_info;
6322 }
6323 }
6324 }
6325
6326 if (!vectype_in)
6327 vectype_in = vectype_out;
6328
6329 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6330 directly used in the stmt. */
6331 if (reduc_index == -1)
6332 {
6333 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6334 {
6335 if (dump_enabled_p ())
6336 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6337 "in-order reduction chain without SLP.\n");
6338 return false;
6339 }
6340 }
6341
6342 if (!(reduc_index == -1
6343 || dts[reduc_index] == vect_reduction_def
6344 || dts[reduc_index] == vect_nested_cycle
6345 || ((dts[reduc_index] == vect_internal_def
6346 || dts[reduc_index] == vect_external_def
6347 || dts[reduc_index] == vect_constant_def
6348 || dts[reduc_index] == vect_induction_def)
6349 && nested_cycle && found_nested_cycle_def)))
6350 {
6351 /* For pattern recognized stmts, orig_stmt might be a reduction,
6352 but some helper statements for the pattern might not, or
6353 might be COND_EXPRs with reduction uses in the condition. */
6354 gcc_assert (orig_stmt_info);
6355 return false;
6356 }
6357
6358 /* PHIs should not participate in patterns. */
6359 gcc_assert (!STMT_VINFO_RELATED_STMT (reduc_def_info));
6360 enum vect_reduction_type v_reduc_type
6361 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6362 stmt_vec_info tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6363
6364 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6365 /* If we have a condition reduction, see if we can simplify it further. */
6366 if (v_reduc_type == COND_REDUCTION)
6367 {
6368 /* TODO: We can't yet handle reduction chains, since we need to treat
6369 each COND_EXPR in the chain specially, not just the last one.
6370 E.g. for:
6371
6372 x_1 = PHI <x_3, ...>
6373 x_2 = a_2 ? ... : x_1;
6374 x_3 = a_3 ? ... : x_2;
6375
6376 we're interested in the last element in x_3 for which a_2 || a_3
6377 is true, whereas the current reduction chain handling would
6378 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6379 as a reduction operation. */
6380 if (reduc_index == -1)
6381 {
6382 if (dump_enabled_p ())
6383 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6384 "conditional reduction chains not supported\n");
6385 return false;
6386 }
6387
6388 /* vect_is_simple_reduction ensured that operand 2 is the
6389 loop-carried operand. */
6390 gcc_assert (reduc_index == 2);
6391
6392 /* Loop peeling modifies the initial value of the reduction PHI, which
6393 makes the reduction stmt to be transformed differ from the original
6394 stmt that was analyzed. We therefore need to record the reduction
6395 code for a CONST_COND_REDUCTION type reduction at analysis time, so
6396 that it can be used directly at transform time. */
6397 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6398 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6399 {
6400 /* Also set the reduction type to CONST_COND_REDUCTION. */
6401 gcc_assert (cond_reduc_dt == vect_constant_def);
6402 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6403 }
6404 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6405 vectype_in, OPTIMIZE_FOR_SPEED))
6406 {
6407 if (dump_enabled_p ())
6408 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6409 "optimizing condition reduction with"
6410 " FOLD_EXTRACT_LAST.\n");
6411 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6412 }
6413 else if (cond_reduc_dt == vect_induction_def)
6414 {
6415 tree base
6416 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6417 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6418
6419 gcc_assert (TREE_CODE (base) == INTEGER_CST
6420 && TREE_CODE (step) == INTEGER_CST);
6421 cond_reduc_val = NULL_TREE;
6422 /* Find a suitable value: below base for MAX_EXPR, above base for
6423 MIN_EXPR; for now punt if base is the minimum value of the type
6424 for MAX_EXPR or the maximum value of the type for MIN_EXPR. */
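/* E.g. (sketch): for base 10 and step -1 we use MIN_EXPR with the
   value 11, which the decreasing induction can never produce, as the
   "no match found" marker.  */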
6425 if (tree_int_cst_sgn (step) == -1)
6426 {
6427 cond_reduc_op_code = MIN_EXPR;
6428 if (tree_int_cst_sgn (base) == -1)
6429 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6430 else if (tree_int_cst_lt (base,
6431 TYPE_MAX_VALUE (TREE_TYPE (base))))
6432 cond_reduc_val
6433 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6434 }
6435 else
6436 {
6437 cond_reduc_op_code = MAX_EXPR;
6438 if (tree_int_cst_sgn (base) == 1)
6439 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6440 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6441 base))
6442 cond_reduc_val
6443 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6444 }
6445 if (cond_reduc_val)
6446 {
6447 if (dump_enabled_p ())
6448 dump_printf_loc (MSG_NOTE, vect_location,
6449 "condition expression based on "
6450 "integer induction.\n");
6451 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6452 = INTEGER_INDUC_COND_REDUCTION;
6453 }
6454 }
6455 else if (cond_reduc_dt == vect_constant_def)
6456 {
6457 enum vect_def_type cond_initial_dt;
6458 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6459 tree cond_initial_val
6460 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6461
6462 gcc_assert (cond_reduc_val != NULL_TREE);
6463 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6464 if (cond_initial_dt == vect_constant_def
6465 && types_compatible_p (TREE_TYPE (cond_initial_val),
6466 TREE_TYPE (cond_reduc_val)))
6467 {
6468 tree e = fold_binary (LE_EXPR, boolean_type_node,
6469 cond_initial_val, cond_reduc_val);
6470 if (e && (integer_onep (e) || integer_zerop (e)))
6471 {
6472 if (dump_enabled_p ())
6473 dump_printf_loc (MSG_NOTE, vect_location,
6474 "condition expression based on "
6475 "compile time constant.\n");
6476 /* Record reduction code at analysis stage. */
6477 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6478 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6479 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6480 = CONST_COND_REDUCTION;
6481 }
6482 }
6483 }
6484 }
6485
6486 if (orig_stmt_info)
6487 gcc_assert (tmp == orig_stmt_info
6488 || REDUC_GROUP_FIRST_ELEMENT (tmp) == orig_stmt_info);
6489 else
6490 /* We changed STMT to be the first stmt in reduction chain, hence we
6491 check that in this case the first element in the chain is STMT. */
6492 gcc_assert (tmp == stmt_info
6493 || REDUC_GROUP_FIRST_ELEMENT (tmp) == stmt_info);
6494
6495 if (STMT_VINFO_LIVE_P (reduc_def_info))
6496 return false;
6497
6498 if (slp_node)
6499 ncopies = 1;
6500 else
6501 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6502
6503 gcc_assert (ncopies >= 1);
6504
6505 vec_mode = TYPE_MODE (vectype_in);
6506 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6507
6508 if (nested_cycle)
6509 {
6510 def_bb = gimple_bb (reduc_def_phi);
6511 def_stmt_loop = def_bb->loop_father;
6512 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6513 loop_preheader_edge (def_stmt_loop));
6514 stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg);
6515 if (def_arg_stmt_info
6516 && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info)
6517 == vect_double_reduction_def))
6518 double_reduc = true;
6519 }
6520
6521 vect_reduction_type reduction_type
6522 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6523 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6524 && ncopies > 1)
6525 {
6526 if (dump_enabled_p ())
6527 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6528 "multiple types in double reduction or condition "
6529 "reduction.\n");
6530 return false;
6531 }
6532
6533 if (code == COND_EXPR)
6534 {
6535 /* Only call during the analysis stage, otherwise we'll lose
6536 STMT_VINFO_TYPE. */
6537 if (!vec_stmt && !vectorizable_condition (stmt_info, gsi, NULL,
6538 true, NULL, cost_vec))
6539 {
6540 if (dump_enabled_p ())
6541 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6542 "unsupported condition in reduction\n");
6543 return false;
6544 }
6545 }
6546 else if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6547 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6548 {
6549 /* Only call during the analysis stage, otherwise we'll lose
6550 STMT_VINFO_TYPE. We only support this for nested cycles
6551 without double reductions at the moment. */
6552 if (!nested_cycle
6553 || double_reduc
6554 || (!vec_stmt && !vectorizable_shift (stmt_info, gsi, NULL,
6555 NULL, cost_vec)))
6556 {
6557 if (dump_enabled_p ())
6558 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6559 "unsupported shift or rotation in reduction\n");
6560 return false;
6561 }
6562 }
6563 else
6564 {
6565 /* 4. Supportable by target? */
6566
6567 /* 4.1. check support for the operation in the loop */
6568 optab = optab_for_tree_code (code, vectype_in, optab_default);
6569 if (!optab)
6570 {
6571 if (dump_enabled_p ())
6572 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6573 "no optab.\n");
6574
6575 return false;
6576 }
6577
6578 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6579 {
6580 if (dump_enabled_p ())
6581 dump_printf (MSG_NOTE, "op not supported by target.\n");
6582
6583 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6584 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6585 return false;
6586
6587 if (dump_enabled_p ())
6588 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6589 }
6590
6591 /* Worthwhile without SIMD support? */
6592 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6593 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6594 {
6595 if (dump_enabled_p ())
6596 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6597 "not worthwhile without SIMD support.\n");
6598
6599 return false;
6600 }
6601 }
6602
6603 /* 4.2. Check support for the epilog operation.
6604
6605 If STMT represents a reduction pattern, then the type of the
6606 reduction variable may be different than the type of the rest
6607 of the arguments. For example, consider the case of accumulation
6608 of shorts into an int accumulator; The original code:
6609 S1: int_a = (int) short_a;
6610 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6611
6612 was replaced with:
6613 STMT: int_acc = widen_sum <short_a, int_acc>
6614
6615 This means that:
6616 1. The tree-code that is used to create the vector operation in the
6617 epilog code (that reduces the partial results) is not the
6618 tree-code of STMT, but is rather the tree-code of the original
6619 stmt from the pattern that STMT is replacing. I.e., in the example
6620 above we want to use 'widen_sum' in the loop, but 'plus' in the
6621 epilog.
6622 2. The type (mode) we use to check available target support
6623 for the vector operation to be created in the *epilog*, is
6624 determined by the type of the reduction variable (in the example
6625 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6626 However the type (mode) we use to check available target support
6627 for the vector operation to be created *inside the loop*, is
6628 determined by the type of the other arguments to STMT (in the
6629 example we'd check this: optab_handler (widen_sum_optab,
6630 vect_short_mode)).
6631
6632 This is contrary to "regular" reductions, in which the types of all
6633 the arguments are the same as the type of the reduction variable.
6634 For "regular" reductions we can therefore use the same vector type
6635 (and also the same tree-code) when generating the epilog code and
6636 when generating the code inside the loop. */
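 /* Illustrative source form of the accumulation pattern discussed above
 (a sketch, not a quote from any testcase):

 short short_a[N];
 int int_acc = 0;
 for (int i = 0; i < N; i++)
 int_acc += short_a[i]; // S1+S2, recognized as widen_sum

 Inside the loop the vector statement operates on vectors of shorts,
 while the epilog reduces a vector of ints with a plain PLUS. */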
6637
6638 if (orig_stmt_info
6639 && (reduction_type == TREE_CODE_REDUCTION
6640 || reduction_type == FOLD_LEFT_REDUCTION))
6641 {
6642 /* This is a reduction pattern: get the vectype from the type of the
6643 reduction variable, and get the tree-code from orig_stmt. */
6644 orig_code = gimple_assign_rhs_code (orig_stmt_info->stmt);
6645 gcc_assert (vectype_out);
6646 vec_mode = TYPE_MODE (vectype_out);
6647 }
6648 else
6649 {
6650 /* Regular reduction: the same vectype and tree-code as used for
6651 the vector code inside the loop can be used for the epilog code. */
6652 orig_code = code;
6653
6654 if (code == MINUS_EXPR)
6655 orig_code = PLUS_EXPR;
6656
6657 /* For simple condition reductions, replace with the actual expression
6658 we want to base our reduction around. */
6659 if (reduction_type == CONST_COND_REDUCTION)
6660 {
6661 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6662 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6663 }
6664 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6665 orig_code = cond_reduc_op_code;
6666 }
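 /* For INTEGER_INDUC_COND_REDUCTION the conditionally-assigned value is
 itself an induction, e.g. (illustrative sketch):

 int last = -1;
 for (int i = 0; i < n; i++)
 if (a[i] < b[i])
 last = i;

 Since i only increases, the scalar COND_EXPR can be reduced with the
 MAX_EXPR (or MIN_EXPR for a decreasing induction) recorded above in
 cond_reduc_op_code. */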
6667
6668 reduc_fn = IFN_LAST;
6669
6670 if (reduction_type == TREE_CODE_REDUCTION
6671 || reduction_type == FOLD_LEFT_REDUCTION
6672 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6673 || reduction_type == CONST_COND_REDUCTION)
6674 {
6675 if (reduction_type == FOLD_LEFT_REDUCTION
6676 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6677 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6678 {
6679 if (reduc_fn != IFN_LAST
6680 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6681 OPTIMIZE_FOR_SPEED))
6682 {
6683 if (dump_enabled_p ())
6684 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6685 "reduc op not supported by target.\n");
6686
6687 reduc_fn = IFN_LAST;
6688 }
6689 }
6690 else
6691 {
6692 if (!nested_cycle || double_reduc)
6693 {
6694 if (dump_enabled_p ())
6695 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6696 "no reduc code for scalar code.\n");
6697
6698 return false;
6699 }
6700 }
6701 }
6702 else if (reduction_type == COND_REDUCTION)
6703 {
6704 int scalar_precision
6705 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6706 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6707 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6708 nunits_out);
6709
6710 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6711 OPTIMIZE_FOR_SPEED))
6712 reduc_fn = IFN_REDUC_MAX;
6713 }
6714
6715 if (reduction_type != EXTRACT_LAST_REDUCTION
6716 && (!nested_cycle || double_reduc)
6717 && reduc_fn == IFN_LAST
6718 && !nunits_out.is_constant ())
6719 {
6720 if (dump_enabled_p ())
6721 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6722 "missing target support for reduction on"
6723 " variable-length vectors.\n");
6724 return false;
6725 }
6726
6727 /* For SLP reductions, see if there is a neutral value we can use. */
6728 tree neutral_op = NULL_TREE;
6729 if (slp_node)
6730 neutral_op = neutral_op_for_slp_reduction
6731 (slp_node_instance->reduc_phis, code,
6732 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
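 /* A neutral value is one that leaves the reduction result unchanged:
 typically 0 for PLUS_EXPR, 1 for MULT_EXPR, all-ones for BIT_AND_EXPR.
 Illustrative sketch of an SLP reduction with two independent accumulators:

 for (int i = 0; i < n; i++)
 {
 sum0 += a[2 * i];
 sum1 += a[2 * i + 1];
 }

 Here the neutral value 0 of PLUS_EXPR can be used to fill the lanes of
 the initial vector that do not carry a start value. */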
6733
6734 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6735 {
6736 /* We can't support in-order reductions of code such as this:
6737
6738 for (int i = 0; i < n1; ++i)
6739 for (int j = 0; j < n2; ++j)
6740 l += a[j];
6741
6742 since GCC effectively transforms the loop when vectorizing:
6743
6744 for (int i = 0; i < n1 / VF; ++i)
6745 for (int j = 0; j < n2; ++j)
6746 for (int k = 0; k < VF; ++k)
6747 l += a[j];
6748
6749 which is a reassociation of the original operation. */
6750 if (dump_enabled_p ())
6751 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6752 "in-order double reduction not supported.\n");
6753
6754 return false;
6755 }
6756
6757 if (reduction_type == FOLD_LEFT_REDUCTION
6758 && slp_node
6759 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6760 {
6761 /* We cannot use in-order reductions in this case because there is
6762 an implicit reassociation of the operations involved. */
6763 if (dump_enabled_p ())
6764 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6765 "in-order unchained SLP reductions not supported.\n");
6766 return false;
6767 }
6768
6769 /* For double reductions, and for SLP reductions with a neutral value,
6770 we construct a variable-length initial vector by loading a vector
6771 full of the neutral value and then shift-and-inserting the start
6772 values into the low-numbered elements. */
6773 if ((double_reduc || neutral_op)
6774 && !nunits_out.is_constant ()
6775 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6776 vectype_out, OPTIMIZE_FOR_SPEED))
6777 {
6778 if (dump_enabled_p ())
6779 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6780 "reduction on variable-length vectors requires"
6781 " target support for a vector-shift-and-insert"
6782 " operation.\n");
6783 return false;
6784 }
6785
6786 /* Check extra constraints for variable-length unchained SLP reductions. */
6787 if (STMT_SLP_TYPE (stmt_info)
6788 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6789 && !nunits_out.is_constant ())
6790 {
6791 /* We checked above that we could build the initial vector when
6792 there's a neutral element value. Check here for the case in
6793 which each SLP statement has its own initial value and in which
6794 that value needs to be repeated for every instance of the
6795 statement within the initial vector. */
6796 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6797 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6798 if (!neutral_op
6799 && !can_duplicate_and_interleave_p (group_size, elt_mode))
6800 {
6801 if (dump_enabled_p ())
6802 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6803 "unsupported form of SLP reduction for"
6804 " variable-length vectors: cannot build"
6805 " initial vector.\n");
6806 return false;
6807 }
6808 /* The epilogue code relies on the number of elements being a multiple
6809 of the group size. The duplicate-and-interleave approach to setting
6810 up the initial vector does too. */
6811 if (!multiple_p (nunits_out, group_size))
6812 {
6813 if (dump_enabled_p ())
6814 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6815 "unsupported form of SLP reduction for"
6816 " variable-length vectors: the vector size"
6817 " is not a multiple of the number of results.\n");
6818 return false;
6819 }
6820 }
6821
6822 /* In case of widening multiplication by a constant, we update the type
6823 of the constant to be the type of the other operand. We check that the
6824 constant fits the type in the pattern recognition pass. */
6825 if (code == DOT_PROD_EXPR
6826 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6827 {
6828 if (TREE_CODE (ops[0]) == INTEGER_CST)
6829 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6830 else if (TREE_CODE (ops[1]) == INTEGER_CST)
6831 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6832 else
6833 {
6834 if (dump_enabled_p ())
6835 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6836 "invalid types in dot-prod\n");
6837
6838 return false;
6839 }
6840 }
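 /* Illustrative sketch of the case handled above: a dot-product pattern in
 which one multiplication operand is a constant, e.g.

 short x[N];
 int sum = 0;
 for (int i = 0; i < N; i++)
 sum += x[i] * 3; // recognized as a DOT_PROD_EXPR

 The constant 3 is folded to the type of the narrow operand so that both
 DOT_PROD_EXPR inputs have compatible types. */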
6841
6842 if (reduction_type == COND_REDUCTION)
6843 {
6844 widest_int ni;
6845
6846 if (! max_loop_iterations (loop, &ni))
6847 {
6848 if (dump_enabled_p ())
6849 dump_printf_loc (MSG_NOTE, vect_location,
6850 "loop count not known, cannot create cond "
6851 "reduction.\n");
6852 return false;
6853 }
6854 /* Convert backedges to iterations. */
6855 ni += 1;
6856
6857 /* The additional index will have the same type as the condition. Check
6858 that the loop iteration count fits into this type less one (we use up
6859 the zero slot for the case in which there are no matches). */
6860 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6861 if (wi::geu_p (ni, wi::to_widest (max_index)))
6862 {
6863 if (dump_enabled_p ())
6864 dump_printf_loc (MSG_NOTE, vect_location,
6865 "loop size is greater than data size.\n");
6866 return false;
6867 }
6868 }
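 /* Worked example of the check above (illustrative): if the scalar type is
 a 16-bit "short", CR_INDEX_SCALAR_TYPE is a 16-bit unsigned type whose
 maximum value is 65535. Index 0 is reserved for the "no match" case, so
 at most 65534 iterations can be numbered; a loop whose iteration count
 may reach 65535 is rejected here. */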
6869
6870 /* In case the vectorization factor (VF) is bigger than the number
6871 of elements that we can fit in a vectype (nunits), we have to generate
6872 more than one vector stmt - i.e. we need to "unroll" the
6873 vector stmt by a factor VF/nunits. For more details see documentation
6874 in vectorizable_operation. */
6875
6876 /* If the reduction is used in an outer loop we need to generate
6877 VF intermediate results, like so (e.g. for ncopies=2):
6878 r0 = phi (init, r0)
6879 r1 = phi (init, r1)
6880 r0 = x0 + r0;
6881 r1 = x1 + r1;
6882 (i.e. we generate VF results in 2 registers).
6883 In this case we have a separate def-use cycle for each copy, and therefore
6884 for each copy we get the vector def for the reduction variable from the
6885 respective phi node created for this copy.
6886
6887 Otherwise (the reduction is unused in the loop nest), we can combine
6888 together intermediate results, like so (e.g. for ncopies=2):
6889 r = phi (init, r)
6890 r = x0 + r;
6891 r = x1 + r;
6892 (i.e. we generate VF/2 results in a single register).
6893 In this case for each copy we get the vector def for the reduction variable
6894 from the vectorized reduction operation generated in the previous iteration.
6895
6896 This only works when we see both the reduction PHI and its only consumer
6897 in vectorizable_reduction and there are no intermediate stmts
6898 participating. */
6899 stmt_vec_info use_stmt_info;
6900 tree reduc_phi_result = gimple_phi_result (reduc_def_phi);
6901 if (ncopies > 1
6902 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6903 && (use_stmt_info = loop_vinfo->lookup_single_use (reduc_phi_result))
6904 && vect_stmt_to_vectorize (use_stmt_info) == stmt_info)
6905 {
6906 single_defuse_cycle = true;
6907 epilog_copies = 1;
6908 }
6909 else
6910 epilog_copies = ncopies;
6911
6912 /* If the reduction stmt is one of the patterns that have a lane-reducing
6913 operation embedded, we cannot handle the case of ! single_defuse_cycle. */
6914 if ((ncopies > 1
6915 && ! single_defuse_cycle)
6916 && (code == DOT_PROD_EXPR
6917 || code == WIDEN_SUM_EXPR
6918 || code == SAD_EXPR))
6919 {
6920 if (dump_enabled_p ())
6921 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6922 "multi def-use cycle not possible for lane-reducing "
6923 "reduction operation\n");
6924 return false;
6925 }
6926
6927 if (slp_node)
6928 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6929 else
6930 vec_num = 1;
6931
6932 internal_fn cond_fn = get_conditional_internal_fn (code);
6933 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6934
6935 if (!vec_stmt) /* transformation not required. */
6936 {
6937 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
6938 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
6939 {
6940 if (reduction_type != FOLD_LEFT_REDUCTION
6941 && (cond_fn == IFN_LAST
6942 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6943 OPTIMIZE_FOR_SPEED)))
6944 {
6945 if (dump_enabled_p ())
6946 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6947 "can't use a fully-masked loop because no"
6948 " conditional operation is available.\n");
6949 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6950 }
6951 else if (reduc_index == -1)
6952 {
6953 if (dump_enabled_p ())
6954 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6955 "can't use a fully-masked loop for chained"
6956 " reductions.\n");
6957 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6958 }
6959 else
6960 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
6961 vectype_in);
6962 }
6963 if (dump_enabled_p ()
6964 && reduction_type == FOLD_LEFT_REDUCTION)
6965 dump_printf_loc (MSG_NOTE, vect_location,
6966 "using an in-order (fold-left) reduction.\n");
6967 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6968 return true;
6969 }
6970
6971 /* Transform. */
6972
6973 if (dump_enabled_p ())
6974 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6975
6976 /* FORNOW: Multiple types are not supported for condition. */
6977 if (code == COND_EXPR)
6978 gcc_assert (ncopies == 1);
6979
6980 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
6981
6982 if (reduction_type == FOLD_LEFT_REDUCTION)
6983 return vectorize_fold_left_reduction
6984 (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
6985 reduc_fn, ops, vectype_in, reduc_index, masks);
6986
6987 if (reduction_type == EXTRACT_LAST_REDUCTION)
6988 {
6989 gcc_assert (!slp_node);
6990 return vectorizable_condition (stmt_info, gsi, vec_stmt,
6991 true, NULL, NULL);
6992 }
6993
6994 /* Create the destination vector */
6995 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6996
6997 prev_stmt_info = NULL;
6998 prev_phi_info = NULL;
6999 if (!slp_node)
7000 {
7001 vec_oprnds0.create (1);
7002 vec_oprnds1.create (1);
7003 if (op_type == ternary_op)
7004 vec_oprnds2.create (1);
7005 }
7006
7007 phis.create (vec_num);
7008 vect_defs.create (vec_num);
7009 if (!slp_node)
7010 vect_defs.quick_push (NULL_TREE);
7011
7012 if (slp_node)
7013 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7014 else
7015 phis.quick_push (STMT_VINFO_VEC_STMT (reduc_def_info));
7016
7017 for (j = 0; j < ncopies; j++)
7018 {
7019 if (code == COND_EXPR)
7020 {
7021 gcc_assert (!slp_node);
7022 vectorizable_condition (stmt_info, gsi, vec_stmt,
7023 true, NULL, NULL);
7024 break;
7025 }
7026 if (code == LSHIFT_EXPR
7027 || code == RSHIFT_EXPR)
7028 {
7029 vectorizable_shift (stmt_info, gsi, vec_stmt, slp_node, NULL);
7030 break;
7031 }
7032
7033 /* Handle uses. */
7034 if (j == 0)
7035 {
7036 if (slp_node)
7037 {
7038 /* Get vec defs for all the operands except the reduction index,
7039 ensuring the ordering of the ops in the vector is kept. */
7040 auto_vec<tree, 3> slp_ops;
7041 auto_vec<vec<tree>, 3> vec_defs;
7042
7043 slp_ops.quick_push (ops[0]);
7044 slp_ops.quick_push (ops[1]);
7045 if (op_type == ternary_op)
7046 slp_ops.quick_push (ops[2]);
7047
7048 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7049
7050 vec_oprnds0.safe_splice (vec_defs[0]);
7051 vec_defs[0].release ();
7052 vec_oprnds1.safe_splice (vec_defs[1]);
7053 vec_defs[1].release ();
7054 if (op_type == ternary_op)
7055 {
7056 vec_oprnds2.safe_splice (vec_defs[2]);
7057 vec_defs[2].release ();
7058 }
7059 }
7060 else
7061 {
7062 vec_oprnds0.quick_push
7063 (vect_get_vec_def_for_operand (ops[0], stmt_info));
7064 vec_oprnds1.quick_push
7065 (vect_get_vec_def_for_operand (ops[1], stmt_info));
7066 if (op_type == ternary_op)
7067 vec_oprnds2.quick_push
7068 (vect_get_vec_def_for_operand (ops[2], stmt_info));
7069 }
7070 }
7071 else
7072 {
7073 if (!slp_node)
7074 {
7075 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7076
7077 if (single_defuse_cycle && reduc_index == 0)
7078 vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
7079 else
7080 vec_oprnds0[0]
7081 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7082 vec_oprnds0[0]);
7083 if (single_defuse_cycle && reduc_index == 1)
7084 vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
7085 else
7086 vec_oprnds1[0]
7087 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7088 vec_oprnds1[0]);
7089 if (op_type == ternary_op)
7090 {
7091 if (single_defuse_cycle && reduc_index == 2)
7092 vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
7093 else
7094 vec_oprnds2[0]
7095 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7096 vec_oprnds2[0]);
7097 }
7098 }
7099 }
7100
7101 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7102 {
7103 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7104 if (masked_loop_p)
7105 {
7106 /* Make sure that the reduction accumulator is vop[0]. */
7107 if (reduc_index == 1)
7108 {
7109 gcc_assert (commutative_tree_code (code));
7110 std::swap (vop[0], vop[1]);
7111 }
7112 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7113 vectype_in, i * ncopies + j);
7114 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7115 vop[0], vop[1],
7116 vop[0]);
7117 new_temp = make_ssa_name (vec_dest, call);
7118 gimple_call_set_lhs (call, new_temp);
7119 gimple_call_set_nothrow (call, true);
7120 new_stmt_info
7121 = vect_finish_stmt_generation (stmt_info, call, gsi);
7122 }
7123 else
7124 {
7125 if (op_type == ternary_op)
7126 vop[2] = vec_oprnds2[i];
7127
7128 gassign *new_stmt = gimple_build_assign (vec_dest, code,
7129 vop[0], vop[1], vop[2]);
7130 new_temp = make_ssa_name (vec_dest, new_stmt);
7131 gimple_assign_set_lhs (new_stmt, new_temp);
7132 new_stmt_info
7133 = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
7134 }
7135
7136 if (slp_node)
7137 {
7138 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
7139 vect_defs.quick_push (new_temp);
7140 }
7141 else
7142 vect_defs[0] = new_temp;
7143 }
7144
7145 if (slp_node)
7146 continue;
7147
7148 if (j == 0)
7149 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7150 else
7151 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
7152
7153 prev_stmt_info = new_stmt_info;
7154 }
7155
7156 /* Finalize the reduction-phi (set its arguments) and create the
7157 epilog reduction code. */
7158 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7159 vect_defs[0] = gimple_get_lhs ((*vec_stmt)->stmt);
7160
7161 vect_create_epilog_for_reduction (vect_defs, stmt_info, reduc_def_phi,
7162 epilog_copies, reduc_fn, phis,
7163 double_reduc, slp_node, slp_node_instance,
7164 cond_reduc_val, cond_reduc_op_code,
7165 neutral_op);
7166
7167 return true;
7168 }
7169
7170 /* Function vect_min_worthwhile_factor.
7171
7172 For a loop where we could vectorize the operation indicated by CODE,
7173 return the minimum vectorization factor that makes it worthwhile
7174 to use generic vectors. */
7175 static unsigned int
7176 vect_min_worthwhile_factor (enum tree_code code)
7177 {
7178 switch (code)
7179 {
7180 case PLUS_EXPR:
7181 case MINUS_EXPR:
7182 case NEGATE_EXPR:
7183 return 4;
7184
7185 case BIT_AND_EXPR:
7186 case BIT_IOR_EXPR:
7187 case BIT_XOR_EXPR:
7188 case BIT_NOT_EXPR:
7189 return 2;
7190
7191 default:
7192 return INT_MAX;
7193 }
7194 }
7195
7196 /* Return true if VINFO indicates we are doing loop vectorization and if
7197 it is worth decomposing CODE operations into scalar operations for
7198 that loop's vectorization factor. */
7199
7200 bool
7201 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7202 {
7203 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7204 unsigned HOST_WIDE_INT value;
7205 return (loop_vinfo
7206 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7207 && value >= vect_min_worthwhile_factor (code));
7208 }
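 /* Worked example (illustrative): for PLUS_EXPR the minimum worthwhile
 factor above is 4, so with a constant vectorization factor of 4 or more
 this returns true and the addition may be open-coded on word-size
 integers even without real SIMD support; with a factor of 2 it returns
 false, while BIT_AND_EXPR would still qualify at 2. */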
7209
7210 /* Function vectorizable_induction
7211
7212 Check if STMT_INFO performs an induction computation that can be vectorized.
7213 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7214 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7215 Return true if STMT_INFO is vectorizable in this way. */
7216
7217 bool
7218 vectorizable_induction (stmt_vec_info stmt_info,
7219 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7220 stmt_vec_info *vec_stmt, slp_tree slp_node,
7221 stmt_vector_for_cost *cost_vec)
7222 {
7223 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7224 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7225 unsigned ncopies;
7226 bool nested_in_vect_loop = false;
7227 struct loop *iv_loop;
7228 tree vec_def;
7229 edge pe = loop_preheader_edge (loop);
7230 basic_block new_bb;
7231 tree new_vec, vec_init, vec_step, t;
7232 tree new_name;
7233 gimple *new_stmt;
7234 gphi *induction_phi;
7235 tree induc_def, vec_dest;
7236 tree init_expr, step_expr;
7237 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7238 unsigned i;
7239 tree expr;
7240 gimple_seq stmts;
7241 imm_use_iterator imm_iter;
7242 use_operand_p use_p;
7243 gimple *exit_phi;
7244 edge latch_e;
7245 tree loop_arg;
7246 gimple_stmt_iterator si;
7247
7248 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7249 if (!phi)
7250 return false;
7251
7252 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7253 return false;
7254
7255 /* Make sure it was recognized as induction computation. */
7256 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7257 return false;
7258
7259 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7260 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7261
7262 if (slp_node)
7263 ncopies = 1;
7264 else
7265 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7266 gcc_assert (ncopies >= 1);
7267
7268 /* FORNOW. These restrictions should be relaxed. */
7269 if (nested_in_vect_loop_p (loop, stmt_info))
7270 {
7271 imm_use_iterator imm_iter;
7272 use_operand_p use_p;
7273 gimple *exit_phi;
7274 edge latch_e;
7275 tree loop_arg;
7276
7277 if (ncopies > 1)
7278 {
7279 if (dump_enabled_p ())
7280 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7281 "multiple types in nested loop.\n");
7282 return false;
7283 }
7284
7285 /* FORNOW: outer loop induction with SLP not supported. */
7286 if (STMT_SLP_TYPE (stmt_info))
7287 return false;
7288
7289 exit_phi = NULL;
7290 latch_e = loop_latch_edge (loop->inner);
7291 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7292 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7293 {
7294 gimple *use_stmt = USE_STMT (use_p);
7295 if (is_gimple_debug (use_stmt))
7296 continue;
7297
7298 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7299 {
7300 exit_phi = use_stmt;
7301 break;
7302 }
7303 }
7304 if (exit_phi)
7305 {
7306 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7307 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7308 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7309 {
7310 if (dump_enabled_p ())
7311 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7312 "inner-loop induction only used outside "
7313 "of the outer vectorized loop.\n");
7314 return false;
7315 }
7316 }
7317
7318 nested_in_vect_loop = true;
7319 iv_loop = loop->inner;
7320 }
7321 else
7322 iv_loop = loop;
7323 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7324
7325 if (slp_node && !nunits.is_constant ())
7326 {
7327 /* The current SLP code creates the initial value element-by-element. */
7328 if (dump_enabled_p ())
7329 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7330 "SLP induction not supported for variable-length"
7331 " vectors.\n");
7332 return false;
7333 }
7334
7335 if (!vec_stmt) /* transformation not required. */
7336 {
7337 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7338 DUMP_VECT_SCOPE ("vectorizable_induction");
7339 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7340 return true;
7341 }
7342
7343 /* Transform. */
7344
7345 /* Compute a vector variable, initialized with the first VF values of
7346 the induction variable. E.g., for an iv with IV_PHI='X' and
7347 evolution S, for a vector of 4 units, we want to compute:
7348 [X, X + S, X + 2*S, X + 3*S]. */
7349
7350 if (dump_enabled_p ())
7351 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7352
7353 latch_e = loop_latch_edge (iv_loop);
7354 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7355
7356 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7357 gcc_assert (step_expr != NULL_TREE);
7358
7359 pe = loop_preheader_edge (iv_loop);
7360 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7361 loop_preheader_edge (iv_loop));
7362
7363 stmts = NULL;
7364 if (!nested_in_vect_loop)
7365 {
7366 /* Convert the initial value to the desired type. */
7367 tree new_type = TREE_TYPE (vectype);
7368 init_expr = gimple_convert (&stmts, new_type, init_expr);
7369
7370 /* If we are using the loop mask to "peel" for alignment then we need
7371 to adjust the start value here. */
7372 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7373 if (skip_niters != NULL_TREE)
7374 {
7375 if (FLOAT_TYPE_P (vectype))
7376 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7377 skip_niters);
7378 else
7379 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7380 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7381 skip_niters, step_expr);
7382 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7383 init_expr, skip_step);
7384 }
7385 }
7386
7387 /* Convert the step to the desired type. */
7388 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7389
7390 if (stmts)
7391 {
7392 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7393 gcc_assert (!new_bb);
7394 }
7395
7396 /* Find the first insertion point in the BB. */
7397 basic_block bb = gimple_bb (phi);
7398 si = gsi_after_labels (bb);
7399
7400 /* For SLP induction we have to generate several IVs as for example
7401 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7402 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7403 [VF*S, VF*S, VF*S, VF*S] for all. */
7404 if (slp_node)
7405 {
7406 /* Enforced above. */
7407 unsigned int const_nunits = nunits.to_constant ();
7408
7409 /* Generate [VF*S, VF*S, ... ]. */
7410 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7411 {
7412 expr = build_int_cst (integer_type_node, vf);
7413 expr = fold_convert (TREE_TYPE (step_expr), expr);
7414 }
7415 else
7416 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7417 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7418 expr, step_expr);
7419 if (! CONSTANT_CLASS_P (new_name))
7420 new_name = vect_init_vector (stmt_info, new_name,
7421 TREE_TYPE (step_expr), NULL);
7422 new_vec = build_vector_from_val (vectype, new_name);
7423 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7424
7425 /* Now generate the IVs. */
7426 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7427 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7428 unsigned elts = const_nunits * nvects;
7429 unsigned nivs = least_common_multiple (group_size,
7430 const_nunits) / const_nunits;
7431 gcc_assert (elts % group_size == 0);
7432 tree elt = init_expr;
7433 unsigned ivn;
7434 for (ivn = 0; ivn < nivs; ++ivn)
7435 {
7436 tree_vector_builder elts (vectype, const_nunits, 1);
7437 stmts = NULL;
7438 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7439 {
7440 if (ivn*const_nunits + eltn >= group_size
7441 && (ivn * const_nunits + eltn) % group_size == 0)
7442 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7443 elt, step_expr);
7444 elts.quick_push (elt);
7445 }
7446 vec_init = gimple_build_vector (&stmts, &elts);
7447 if (stmts)
7448 {
7449 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7450 gcc_assert (!new_bb);
7451 }
7452
7453 /* Create the induction-phi that defines the induction-operand. */
7454 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7455 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7456 stmt_vec_info induction_phi_info
7457 = loop_vinfo->add_stmt (induction_phi);
7458 induc_def = PHI_RESULT (induction_phi);
7459
7460 /* Create the iv update inside the loop */
7461 vec_def = make_ssa_name (vec_dest);
7462 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7463 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7464 loop_vinfo->add_stmt (new_stmt);
7465
7466 /* Set the arguments of the phi node: */
7467 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7468 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7469 UNKNOWN_LOCATION);
7470
7471 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7472 }
7473
7474 /* Re-use IVs when we can. */
7475 if (ivn < nvects)
7476 {
7477 unsigned vfp
7478 = least_common_multiple (group_size, const_nunits) / group_size;
7479 /* Generate [VF'*S, VF'*S, ... ]. */
7480 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7481 {
7482 expr = build_int_cst (integer_type_node, vfp);
7483 expr = fold_convert (TREE_TYPE (step_expr), expr);
7484 }
7485 else
7486 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7487 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7488 expr, step_expr);
7489 if (! CONSTANT_CLASS_P (new_name))
7490 new_name = vect_init_vector (stmt_info, new_name,
7491 TREE_TYPE (step_expr), NULL);
7492 new_vec = build_vector_from_val (vectype, new_name);
7493 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7494 for (; ivn < nvects; ++ivn)
7495 {
7496 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7497 tree def;
7498 if (gimple_code (iv) == GIMPLE_PHI)
7499 def = gimple_phi_result (iv);
7500 else
7501 def = gimple_assign_lhs (iv);
7502 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7503 PLUS_EXPR,
7504 def, vec_step);
7505 if (gimple_code (iv) == GIMPLE_PHI)
7506 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7507 else
7508 {
7509 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7510 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7511 }
7512 SLP_TREE_VEC_STMTS (slp_node).quick_push
7513 (loop_vinfo->add_stmt (new_stmt));
7514 }
7515 }
7516
7517 return true;
7518 }
7519
7520 /* Create the vector that holds the initial_value of the induction. */
7521 if (nested_in_vect_loop)
7522 {
7523 /* iv_loop is nested in the loop to be vectorized. init_expr has already
7524 been created during vectorization of previous stmts. We obtain it
7525 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7526 vec_init = vect_get_vec_def_for_operand (init_expr, stmt_info);
7527 /* If the initial value is not of proper type, convert it. */
7528 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7529 {
7530 new_stmt
7531 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7532 vect_simple_var,
7533 "vec_iv_"),
7534 VIEW_CONVERT_EXPR,
7535 build1 (VIEW_CONVERT_EXPR, vectype,
7536 vec_init));
7537 vec_init = gimple_assign_lhs (new_stmt);
7538 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7539 new_stmt);
7540 gcc_assert (!new_bb);
7541 loop_vinfo->add_stmt (new_stmt);
7542 }
7543 }
7544 else
7545 {
7546 /* iv_loop is the loop to be vectorized. Create:
7547 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7548 stmts = NULL;
7549 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7550
7551 unsigned HOST_WIDE_INT const_nunits;
7552 if (nunits.is_constant (&const_nunits))
7553 {
7554 tree_vector_builder elts (vectype, const_nunits, 1);
7555 elts.quick_push (new_name);
7556 for (i = 1; i < const_nunits; i++)
7557 {
7558 /* Create: new_name_i = new_name + step_expr */
7559 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7560 new_name, step_expr);
7561 elts.quick_push (new_name);
7562 }
7563 /* Create a vector from [new_name_0, new_name_1, ...,
7564 new_name_nunits-1] */
7565 vec_init = gimple_build_vector (&stmts, &elts);
7566 }
7567 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7568 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7569 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7570 new_name, step_expr);
7571 else
7572 {
7573 /* Build:
7574 [base, base, base, ...]
7575 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7576 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7577 gcc_assert (flag_associative_math);
7578 tree index = build_index_vector (vectype, 0, 1);
7579 tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7580 new_name);
7581 tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7582 step_expr);
7583 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7584 vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7585 vec_init, step_vec);
7586 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7587 vec_init, base_vec);
7588 }
7589
7590 if (stmts)
7591 {
7592 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7593 gcc_assert (!new_bb);
7594 }
7595 }
7596
7597
7598 /* Create the vector that holds the step of the induction. */
7599 if (nested_in_vect_loop)
7600 /* iv_loop is nested in the loop to be vectorized. Generate:
7601 vec_step = [S, S, S, S] */
7602 new_name = step_expr;
7603 else
7604 {
7605 /* iv_loop is the loop to be vectorized. Generate:
7606 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7607 gimple_seq seq = NULL;
7608 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7609 {
7610 expr = build_int_cst (integer_type_node, vf);
7611 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7612 }
7613 else
7614 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7615 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7616 expr, step_expr);
7617 if (seq)
7618 {
7619 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7620 gcc_assert (!new_bb);
7621 }
7622 }
7623
7624 t = unshare_expr (new_name);
7625 gcc_assert (CONSTANT_CLASS_P (new_name)
7626 || TREE_CODE (new_name) == SSA_NAME);
7627 new_vec = build_vector_from_val (vectype, t);
7628 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7629
7630
7631 /* Create the following def-use cycle:
7632 loop prolog:
7633 vec_init = ...
7634 vec_step = ...
7635 loop:
7636 vec_iv = PHI <vec_init, vec_loop>
7637 ...
7638 STMT
7639 ...
7640 vec_loop = vec_iv + vec_step; */
7641
7642 /* Create the induction-phi that defines the induction-operand. */
7643 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7644 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7645 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7646 induc_def = PHI_RESULT (induction_phi);
7647
7648 /* Create the iv update inside the loop */
7649 vec_def = make_ssa_name (vec_dest);
7650 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7651 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7652 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7653
7654 /* Set the arguments of the phi node: */
7655 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7656 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7657 UNKNOWN_LOCATION);
7658
7659 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7660
7661 /* In case the vectorization factor (VF) is bigger than the number
7662 of elements that we can fit in a vectype (nunits), we have to generate
7663 more than one vector stmt - i.e. we need to "unroll" the
7664 vector stmt by a factor VF/nunits. For more details see documentation
7665 in vectorizable_operation. */
7666
7667 if (ncopies > 1)
7668 {
7669 gimple_seq seq = NULL;
7670 stmt_vec_info prev_stmt_vinfo;
7671 /* FORNOW. This restriction should be relaxed. */
7672 gcc_assert (!nested_in_vect_loop);
7673
7674 /* Create the vector that holds the step of the induction. */
7675 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7676 {
7677 expr = build_int_cst (integer_type_node, nunits);
7678 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7679 }
7680 else
7681 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7682 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7683 expr, step_expr);
7684 if (seq)
7685 {
7686 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7687 gcc_assert (!new_bb);
7688 }
7689
7690 t = unshare_expr (new_name);
7691 gcc_assert (CONSTANT_CLASS_P (new_name)
7692 || TREE_CODE (new_name) == SSA_NAME);
7693 new_vec = build_vector_from_val (vectype, t);
7694 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7695
7696 vec_def = induc_def;
7697 prev_stmt_vinfo = induction_phi_info;
7698 for (i = 1; i < ncopies; i++)
7699 {
7700 /* vec_i = vec_prev + vec_step */
7701 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7702 vec_def, vec_step);
7703 vec_def = make_ssa_name (vec_dest, new_stmt);
7704 gimple_assign_set_lhs (new_stmt, vec_def);
7705
7706 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7707 new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7708 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7709 prev_stmt_vinfo = new_stmt_info;
7710 }
7711 }
7712
7713 if (nested_in_vect_loop)
7714 {
7715 /* Find the loop-closed exit-phi of the induction, and record
7716 the final vector of induction results: */
7717 exit_phi = NULL;
7718 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7719 {
7720 gimple *use_stmt = USE_STMT (use_p);
7721 if (is_gimple_debug (use_stmt))
7722 continue;
7723
7724 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7725 {
7726 exit_phi = use_stmt;
7727 break;
7728 }
7729 }
7730 if (exit_phi)
7731 {
7732 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7733 /* FORNOW. We do not yet support the case in which an inner-loop induction
7734 is used only outside the outer loop (i.e. not in the outer loop itself). */
7735 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7736 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7737
7738 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7739 if (dump_enabled_p ())
7740 dump_printf_loc (MSG_NOTE, vect_location,
7741 "vector of inductions after inner-loop:%G",
7742 new_stmt);
7743 }
7744 }
7745
7746
7747 if (dump_enabled_p ())
7748 dump_printf_loc (MSG_NOTE, vect_location,
7749 "transform induction: created def-use cycle: %G%G",
7750 induction_phi, SSA_NAME_DEF_STMT (vec_def));
7751
7752 return true;
7753 }
7754
7755 /* Function vectorizable_live_operation.
7756
7757 STMT_INFO computes a value that is used outside the loop. Check if
7758 it can be supported. */
7759
7760 bool
7761 vectorizable_live_operation (stmt_vec_info stmt_info,
7762 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7763 slp_tree slp_node, int slp_index,
7764 stmt_vec_info *vec_stmt,
7765 stmt_vector_for_cost *)
7766 {
7767 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7768 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7769 imm_use_iterator imm_iter;
7770 tree lhs, lhs_type, bitsize, vec_bitsize;
7771 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7772 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7773 int ncopies;
7774 gimple *use_stmt;
7775 auto_vec<tree> vec_oprnds;
7776 int vec_entry = 0;
7777 poly_uint64 vec_index = 0;
7778
7779 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7780
7781 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7782 return false;
7783
7784 /* FORNOW. CHECKME. */
7785 if (nested_in_vect_loop_p (loop, stmt_info))
7786 return false;
7787
7788 /* If STMT is not relevant and it is a simple assignment and its inputs are
7789 invariant then it can remain in place, unvectorized. The original last
7790 scalar value that it computes will be used. */
7791 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7792 {
7793 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
7794 if (dump_enabled_p ())
7795 dump_printf_loc (MSG_NOTE, vect_location,
7796 "statement is simple and uses invariant. Leaving in "
7797 "place.\n");
7798 return true;
7799 }
7800
7801 if (slp_node)
7802 ncopies = 1;
7803 else
7804 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7805
7806 if (slp_node)
7807 {
7808 gcc_assert (slp_index >= 0);
7809
7810 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7811 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7812
7813 /* Get the last occurrence of the scalar index from the concatenation of
7814 all the slp vectors. Calculate which slp vector it is and the index
7815 within. */
7816 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
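 /* Worked example (illustrative): with num_vec = 2, nunits = 4,
 num_scalar = 4 and slp_index = 1, pos = 2*4 - 4 + 1 = 5, so the
 division below yields vec_entry = 1 and vec_index = 1: the result
 lives in lane 1 of the second SLP vector statement. */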
7817
7818 /* Calculate which vector contains the result, and which lane of
7819 that vector we need. */
7820 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7821 {
7822 if (dump_enabled_p ())
7823 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7824 "Cannot determine which vector holds the"
7825 " final result.\n");
7826 return false;
7827 }
7828 }
7829
7830 if (!vec_stmt)
7831 {
7832 /* No transformation required. */
7833 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7834 {
7835 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7836 OPTIMIZE_FOR_SPEED))
7837 {
7838 if (dump_enabled_p ())
7839 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7840 "can't use a fully-masked loop because "
7841 "the target doesn't support extract last "
7842 "reduction.\n");
7843 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7844 }
7845 else if (slp_node)
7846 {
7847 if (dump_enabled_p ())
7848 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7849 "can't use a fully-masked loop because an "
7850 "SLP statement is live after the loop.\n");
7851 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7852 }
7853 else if (ncopies > 1)
7854 {
7855 if (dump_enabled_p ())
7856 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7857 "can't use a fully-masked loop because"
7858 " ncopies is greater than 1.\n");
7859 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7860 }
7861 else
7862 {
7863 gcc_assert (ncopies == 1 && !slp_node);
7864 vect_record_loop_mask (loop_vinfo,
7865 &LOOP_VINFO_MASKS (loop_vinfo),
7866 1, vectype);
7867 }
7868 }
7869 return true;
7870 }
7871
7872 /* Use the lhs of the original scalar statement. */
7873 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
7874
7875 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7876 : gimple_get_lhs (stmt);
7877 lhs_type = TREE_TYPE (lhs);
7878
7879 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7880 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7881 : TYPE_SIZE (TREE_TYPE (vectype)));
7882 vec_bitsize = TYPE_SIZE (vectype);
7883
7884 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
7885 tree vec_lhs, bitstart;
7886 if (slp_node)
7887 {
7888 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7889
7890 /* Get the correct slp vectorized stmt. */
7891 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
7892 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
7893 vec_lhs = gimple_phi_result (phi);
7894 else
7895 vec_lhs = gimple_get_lhs (vec_stmt);
7896
7897 /* Get entry to use. */
7898 bitstart = bitsize_int (vec_index);
7899 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7900 }
7901 else
7902 {
7903 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7904 vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
7905 gcc_checking_assert (ncopies == 1
7906 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7907
7908 /* For multiple copies, get the last copy. */
7909 for (int i = 1; i < ncopies; ++i)
7910 vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
7911
7912 /* Get the last lane in the vector. */
7913 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7914 }
7915
7916 gimple_seq stmts = NULL;
7917 tree new_tree;
7918 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7919 {
7920 /* Emit:
7921
7922 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
7923
7924 where VEC_LHS is the vectorized live-out result and MASK is
7925 the loop mask for the final iteration. */
7926 gcc_assert (ncopies == 1 && !slp_node);
7927 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
7928 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
7929 1, vectype, 0);
7930 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
7931 scalar_type, mask, vec_lhs);
7932
7933 /* Convert the extracted vector element to the required scalar type. */
7934 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
7935 }
7936 else
7937 {
7938 tree bftype = TREE_TYPE (vectype);
7939 if (VECTOR_BOOLEAN_TYPE_P (vectype))
7940 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7941 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
7942 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
7943 &stmts, true, NULL_TREE);
7944 }
7945
7946 if (stmts)
7947 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
7948
7949 /* Replace uses of lhs with the newly computed result. If the use stmt is a
7950 single-argument PHI, just replace all uses of the PHI result. This is
7951 necessary because the lcssa PHI defining lhs may appear before the newly inserted stmt. */
7952 use_operand_p use_p;
7953 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
7954 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
7955 && !is_gimple_debug (use_stmt))
7956 {
7957 if (gimple_code (use_stmt) == GIMPLE_PHI
7958 && gimple_phi_num_args (use_stmt) == 1)
7959 {
7960 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
7961 }
7962 else
7963 {
7964 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
7965 SET_USE (use_p, new_tree);
7966 }
7967 update_stmt (use_stmt);
7968 }
7969
7970 return true;
7971 }
7972
7973 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
7974
7975 static void
7976 vect_loop_kill_debug_uses (struct loop *loop, stmt_vec_info stmt_info)
7977 {
7978 ssa_op_iter op_iter;
7979 imm_use_iterator imm_iter;
7980 def_operand_p def_p;
7981 gimple *ustmt;
7982
7983 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
7984 {
7985 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
7986 {
7987 basic_block bb;
7988
7989 if (!is_gimple_debug (ustmt))
7990 continue;
7991
7992 bb = gimple_bb (ustmt);
7993
7994 if (!flow_bb_inside_loop_p (loop, bb))
7995 {
7996 if (gimple_debug_bind_p (ustmt))
7997 {
7998 if (dump_enabled_p ())
7999 dump_printf_loc (MSG_NOTE, vect_location,
8000 "killing debug use\n");
8001
8002 gimple_debug_bind_reset_value (ustmt);
8003 update_stmt (ustmt);
8004 }
8005 else
8006 gcc_unreachable ();
8007 }
8008 }
8009 }
8010 }
8011
8012 /* Given loop represented by LOOP_VINFO, return true if computation of
8013 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8014 otherwise. */
8015
8016 static bool
8017 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8018 {
8019 /* Constant case. */
8020 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8021 {
8022 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8023 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8024
8025 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8026 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8027 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8028 return true;
8029 }
8030
8031 widest_int max;
8032 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8033 /* Check the upper bound of loop niters. */
8034 if (get_max_loop_iterations (loop, &max))
8035 {
8036 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8037 signop sgn = TYPE_SIGN (type);
8038 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8039 if (max < type_max)
8040 return true;
8041 }
8042 return false;
8043 }
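 /* Worked example (illustrative): for a loop whose niters type is a 32-bit
 unsigned int, the check above accepts the loop when the upper bound on
 the latch execution count is at most 0xfffffffe, since then
 NITERSM1 + 1 <= 0xffffffff and the addition cannot wrap to zero. */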
8044
8045 /* Return a mask type with half the number of elements as TYPE. */
8046
8047 tree
8048 vect_halve_mask_nunits (tree type)
8049 {
8050 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8051 return build_truth_vector_type (nunits, current_vector_size);
8052 }
8053
8054 /* Return a mask type with twice as many elements as TYPE. */
8055
8056 tree
8057 vect_double_mask_nunits (tree type)
8058 {
8059 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8060 return build_truth_vector_type (nunits, current_vector_size);
8061 }
8062
8063 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8064 contain a sequence of NVECTORS masks that each control a vector of type
8065 VECTYPE. */
8066
8067 void
8068 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8069 unsigned int nvectors, tree vectype)
8070 {
8071 gcc_assert (nvectors != 0);
8072 if (masks->length () < nvectors)
8073 masks->safe_grow_cleared (nvectors);
8074 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8075 /* The number of scalars per iteration and the number of vectors are
8076 both compile-time constants. */
8077 unsigned int nscalars_per_iter
8078 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8079 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8080 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8081 {
8082 rgm->max_nscalars_per_iter = nscalars_per_iter;
8083 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8084 }
8085 }
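 /* Worked example for the computation above (illustrative): with a
 vectorization factor of 16 and VECTYPE V8HI (8 elements), a call with
 NVECTORS == 2 describes an rgroup whose masks control 2 * 8 / 16 = 1
 scalar per iteration, so max_nscalars_per_iter for that rgroup becomes
 at least 1. */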
8086
8087 /* Given a complete set of masks MASKS, extract mask number INDEX
8088 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8089 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8090
8091 See the comment above vec_loop_masks for more details about the mask
8092 arrangement. */
8093
8094 tree
8095 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8096 unsigned int nvectors, tree vectype, unsigned int index)
8097 {
8098 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8099 tree mask_type = rgm->mask_type;
8100
8101 /* Populate the rgroup's mask array, if this is the first time we've
8102 used it. */
8103 if (rgm->masks.is_empty ())
8104 {
8105 rgm->masks.safe_grow_cleared (nvectors);
8106 for (unsigned int i = 0; i < nvectors; ++i)
8107 {
8108 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8109 /* Provide a dummy definition until the real one is available. */
8110 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8111 rgm->masks[i] = mask;
8112 }
8113 }
8114
8115 tree mask = rgm->masks[index];
8116 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8117 TYPE_VECTOR_SUBPARTS (vectype)))
8118 {
8119 /* A loop mask for data type X can be reused for data type Y
8120 if X has N times more elements than Y and if Y's elements
8121 are N times bigger than X's. In this case each sequence
8122 of N elements in the loop mask will be all-zero or all-one.
8123 We can then view-convert the mask so that each sequence of
8124 N elements is replaced by a single element. */
8125 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8126 TYPE_VECTOR_SUBPARTS (vectype)));
8127 gimple_seq seq = NULL;
8128 mask_type = build_same_sized_truth_vector_type (vectype);
8129 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8130 if (seq)
8131 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8132 }
8133 return mask;
8134 }
8135
8136 /* Scale profiling counters by estimation for LOOP which is vectorized
8137 by factor VF. */
8138
8139 static void
8140 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8141 {
8142 edge preheader = loop_preheader_edge (loop);
8143 /* Reduce loop iterations by the vectorization factor. */
8144 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8145 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8146
8147 if (freq_h.nonzero_p ())
8148 {
8149 profile_probability p;
8150
8151 /* Avoid dropping loop body profile counter to 0 because of zero count
8152 in loop's preheader. */
8153 if (!(freq_e == profile_count::zero ()))
8154 freq_e = freq_e.force_nonzero ();
8155 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8156 scale_loop_frequencies (loop, p);
8157 }
8158
8159 edge exit_e = single_exit (loop);
8160 exit_e->probability = profile_probability::always ()
8161 .apply_scale (1, new_est_niter + 1);
8162
8163 edge exit_l = single_pred_edge (loop->latch);
8164 profile_probability prob = exit_l->probability;
8165 exit_l->probability = exit_e->probability.invert ();
8166 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8167 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8168 }
8169
8170 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8171 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8172 stmt_vec_info. */
8173
8174 static void
8175 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8176 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8177 {
8178 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8179 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8180
8181 if (dump_enabled_p ())
8182 dump_printf_loc (MSG_NOTE, vect_location,
8183 "------>vectorizing statement: %G", stmt_info->stmt);
8184
8185 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8186 vect_loop_kill_debug_uses (loop, stmt_info);
8187
8188 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8189 && !STMT_VINFO_LIVE_P (stmt_info))
8190 return;
8191
8192 if (STMT_VINFO_VECTYPE (stmt_info))
8193 {
8194 poly_uint64 nunits
8195 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8196 if (!STMT_SLP_TYPE (stmt_info)
8197 && maybe_ne (nunits, vf)
8198 && dump_enabled_p ())
8199 /* For SLP, VF is set according to the unrolling factor, and not
8200 to the vector size; hence for SLP this diagnostic is not valid. */
8201 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8202 }
8203
8204 /* Pure SLP statements have already been vectorized. We still need
8205 to apply loop vectorization to hybrid SLP statements. */
8206 if (PURE_SLP_STMT (stmt_info))
8207 return;
8208
8209 if (dump_enabled_p ())
8210 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8211
8212 if (vect_transform_stmt (stmt_info, gsi, NULL, NULL))
8213 *seen_store = stmt_info;
8214 }
8215
8216 /* Function vect_transform_loop.
8217
8218 The analysis phase has determined that the loop is vectorizable.
8219 Vectorize the loop - create vectorized stmts to replace the scalar
8220 stmts in the loop, and update the loop exit condition.
8221 Returns the scalar epilogue loop, if any. */
8222
8223 struct loop *
8224 vect_transform_loop (loop_vec_info loop_vinfo)
8225 {
8226 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8227 struct loop *epilogue = NULL;
8228 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8229 int nbbs = loop->num_nodes;
8230 int i;
8231 tree niters_vector = NULL_TREE;
8232 tree step_vector = NULL_TREE;
8233 tree niters_vector_mult_vf = NULL_TREE;
8234 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8235 unsigned int lowest_vf = constant_lower_bound (vf);
8236 gimple *stmt;
8237 bool check_profitability = false;
8238 unsigned int th;
8239
8240 DUMP_VECT_SCOPE ("vec_transform_loop");
8241
8242 loop_vinfo->shared->check_datarefs ();
8243
8244 /* Use the more conservative vectorization threshold. If the number
8245 of iterations is constant assume the cost check has been performed
8246 by our caller. If the threshold makes all loops profitable that
8247 run at least the (estimated) vectorization factor number of times,
8248 checking is pointless, too. */
8249 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8250 if (th >= vect_vf_for_cost (loop_vinfo)
8251 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8252 {
8253 if (dump_enabled_p ())
8254 dump_printf_loc (MSG_NOTE, vect_location,
8255 "Profitability threshold is %d loop iterations.\n",
8256 th);
8257 check_profitability = true;
8258 }
8259
8260 /* Make sure there exists a single-predecessor exit bb. Do this before
8261 versioning. */
8262 edge e = single_exit (loop);
8263 if (! single_pred_p (e->dest))
8264 {
8265 split_loop_exit_edge (e, true);
8266 if (dump_enabled_p ())
8267 dump_printf (MSG_NOTE, "split exit edge\n");
8268 }
8269
8270 /* Version the loop first, if required, so the profitability check
8271 comes first. */
8272
8273 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8274 {
8275 poly_uint64 versioning_threshold
8276 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8277 if (check_profitability
8278 && ordered_p (poly_uint64 (th), versioning_threshold))
8279 {
8280 versioning_threshold = ordered_max (poly_uint64 (th),
8281 versioning_threshold);
8282 check_profitability = false;
8283 }
8284 struct loop *sloop
8285 = vect_loop_versioning (loop_vinfo, th, check_profitability,
8286 versioning_threshold);
8287 sloop->force_vectorize = false;
8288 check_profitability = false;
8289 }
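/* For illustration: if a runtime alias check and the cost threshold are
   both needed, folding TH into VERSIONING_THRESHOLD above lets a single
   versioning guard of roughly this shape be emitted (a sketch, the
   condition names are made up):

     if (niters >= MAX (th, versioning_threshold) && !ranges_overlap)
       ... loop to be vectorized ...
     else
       ... scalar copy created by vect_loop_versioning ...              */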
8290
8291 /* Make sure there exists a single-predecessor exit bb also on the
8292 scalar loop copy. Do this after versioning but before peeling
8293 so the CFG structure is fine for both the scalar and the if-converted
8294 loop, and slpeel_duplicate_current_defs_from_edges sees matched
8295 loop-closed PHI nodes on the exit. */
8296 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8297 {
8298 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8299 if (! single_pred_p (e->dest))
8300 {
8301 split_loop_exit_edge (e, true);
8302 if (dump_enabled_p ())
8303 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8304 }
8305 }
8306
8307 tree niters = vect_build_loop_niters (loop_vinfo);
8308 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8309 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8310 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8311 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8312 &step_vector, &niters_vector_mult_vf, th,
8313 check_profitability, niters_no_overflow);
8314
8315 if (niters_vector == NULL_TREE)
8316 {
8317 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8318 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8319 && known_eq (lowest_vf, vf))
8320 {
8321 niters_vector
8322 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8323 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8324 step_vector = build_one_cst (TREE_TYPE (niters));
8325 }
8326 else
8327 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8328 &step_vector, niters_no_overflow);
8329 }
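/* Worked example with illustrative numbers: for a known NITERS of 103
   and a constant VF of 8, the block above sets NITERS_VECTOR to
   103 / 8 = 12 and STEP_VECTOR to 1, so the vector loop runs 12 times
   and the remaining 103 - 96 = 7 iterations are left to the epilogue
   created by vect_do_peeling above.  */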
8330
8331 /* 1) Make sure the loop header has exactly two entries
8332 2) Make sure we have a preheader basic block. */
8333
8334 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8335
8336 split_edge (loop_preheader_edge (loop));
8337
8338 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8339 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8340 /* This will deal with any possible peeling. */
8341 vect_prepare_for_masked_peels (loop_vinfo);
8342
8343 /* Schedule the SLP instances first, then handle loop vectorization
8344 below. */
8345 if (!loop_vinfo->slp_instances.is_empty ())
8346 {
8347 DUMP_VECT_SCOPE ("scheduling SLP instances");
8348 vect_schedule_slp (loop_vinfo);
8349 }
8350
8351 /* FORNOW: the vectorizer supports only loops whose body consists
8352 of one basic block (header + empty latch). When the vectorizer
8353 supports more involved loop forms, the order in which the BBs are
8354 traversed will need to be reconsidered. */
8355
8356 for (i = 0; i < nbbs; i++)
8357 {
8358 basic_block bb = bbs[i];
8359 stmt_vec_info stmt_info;
8360
8361 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8362 gsi_next (&si))
8363 {
8364 gphi *phi = si.phi ();
8365 if (dump_enabled_p ())
8366 dump_printf_loc (MSG_NOTE, vect_location,
8367 "------>vectorizing phi: %G", phi);
8368 stmt_info = loop_vinfo->lookup_stmt (phi);
8369 if (!stmt_info)
8370 continue;
8371
8372 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8373 vect_loop_kill_debug_uses (loop, stmt_info);
8374
8375 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8376 && !STMT_VINFO_LIVE_P (stmt_info))
8377 continue;
8378
8379 if (STMT_VINFO_VECTYPE (stmt_info)
8380 && (maybe_ne
8381 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8382 && dump_enabled_p ())
8383 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8384
8385 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8386 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8387 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8388 && ! PURE_SLP_STMT (stmt_info))
8389 {
8390 if (dump_enabled_p ())
8391 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8392 vect_transform_stmt (stmt_info, NULL, NULL, NULL);
8393 }
8394 }
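/* As an illustration of the phi transforms above (hypothetical GIMPLE,
   assuming a constant VF of 4), a simple induction

     i_1 = PHI <0(preheader), i_2(latch)>

   is replaced by a vector induction whose initial value is
   { 0, 1, 2, 3 } and whose per-iteration step is { 4, 4, 4, 4 }.  */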
8395
8396 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8397 !gsi_end_p (si);)
8398 {
8399 stmt = gsi_stmt (si);
8400 /* During vectorization remove existing clobber stmts. */
8401 if (gimple_clobber_p (stmt))
8402 {
8403 unlink_stmt_vdef (stmt);
8404 gsi_remove (&si, true);
8405 release_defs (stmt);
8406 }
8407 else
8408 {
8409 stmt_info = loop_vinfo->lookup_stmt (stmt);
8410
8411 /* Vector stmts created in the outer loop during vectorization of
8412 stmts in an inner loop may not have a stmt_info and do not
8413 need to be vectorized. */
8414 stmt_vec_info seen_store = NULL;
8415 if (stmt_info)
8416 {
8417 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8418 {
8419 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8420 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8421 !gsi_end_p (subsi); gsi_next (&subsi))
8422 {
8423 stmt_vec_info pat_stmt_info
8424 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
8425 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
8426 &si, &seen_store);
8427 }
8428 stmt_vec_info pat_stmt_info
8429 = STMT_VINFO_RELATED_STMT (stmt_info);
8430 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
8431 &seen_store);
8432 }
8433 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
8434 &seen_store);
8435 }
8436 gsi_next (&si);
8437 if (seen_store)
8438 {
8439 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8440 /* Interleaving. The vectorization of the
8441 interleaving chain was completed; free all
8442 the stores in the chain. */
8443 vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8444 else
8445 /* Free the attached stmt_vec_info and remove the stmt. */
8446 loop_vinfo->remove_stmt (stmt_info);
8447 }
8448 }
8449 }
8450
8451 /* Stub out scalar statements that must not survive vectorization.
8452 Doing this here helps with grouped statements, or statements that
8453 are involved in patterns. */
8454 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8455 !gsi_end_p (gsi); gsi_next (&gsi))
8456 {
8457 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8458 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8459 {
8460 tree lhs = gimple_get_lhs (call);
8461 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8462 {
8463 tree zero = build_zero_cst (TREE_TYPE (lhs));
8464 gimple *new_stmt = gimple_build_assign (lhs, zero);
8465 gsi_replace (&gsi, new_stmt, true);
8466 }
8467 }
8468 }
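/* For example (illustrative GIMPLE), a scalar call left over from
   pattern or group handling such as

     _1 = MASK_LOAD (ptr_2, 0B, mask_3);

   whose lhs is not a vector is replaced by the loop above with

     _1 = 0;

   so that no scalar masked load survives into the vectorized body.  */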
8469 } /* BBs in loop */
8470
8471 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8472 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8473 if (integer_onep (step_vector))
8474 niters_no_overflow = true;
8475 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8476 niters_vector_mult_vf, !niters_no_overflow);
8477
8478 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8479 scale_profile_for_vect_loop (loop, assumed_vf);
8480
8481 /* True if the final iteration might not handle a full vector's
8482 worth of scalar iterations. */
8483 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8484 /* The minimum number of iterations performed by the epilogue. This
8485 is 1 when peeling for gaps because we always need a final scalar
8486 iteration. */
8487 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8488 /* +1 to convert latch counts to loop iteration counts,
8489 -min_epilogue_iters to remove iterations that cannot be performed
8490 by the vector code. */
8491 int bias_for_lowest = 1 - min_epilogue_iters;
8492 int bias_for_assumed = bias_for_lowest;
8493 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8494 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8495 {
8496 /* When the amount of peeling is known at compile time, the first
8497 iteration will have exactly alignment_npeels active elements.
8498 In the worst case it will have at least one. */
8499 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8500 bias_for_lowest += lowest_vf - min_first_active;
8501 bias_for_assumed += assumed_vf - min_first_active;
8502 }
8503 /* In these calculations the "- 1" converts loop iteration counts
8504 back to latch counts. */
8505 if (loop->any_upper_bound)
8506 loop->nb_iterations_upper_bound
8507 = (final_iter_may_be_partial
8508 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8509 lowest_vf) - 1
8510 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8511 lowest_vf) - 1);
8512 if (loop->any_likely_upper_bound)
8513 loop->nb_iterations_likely_upper_bound
8514 = (final_iter_may_be_partial
8515 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8516 + bias_for_lowest, lowest_vf) - 1
8517 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8518 + bias_for_lowest, lowest_vf) - 1);
8519 if (loop->any_estimate)
8520 loop->nb_iterations_estimate
8521 = (final_iter_may_be_partial
8522 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8523 assumed_vf) - 1
8524 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8525 assumed_vf) - 1);
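/* Worked example of the bound adjustment with illustrative numbers:
   with no peeling for gaps and no full masking, BIAS_FOR_LOWEST is 1;
   a scalar latch bound of 99 (at most 100 iterations) and a LOWEST_VF
   of 4 give floor ((99 + 1) / 4) - 1 = 24, i.e. at most 25 iterations
   of the vector loop.  With full masking the udiv_ceil variant is used
   instead so that a partial final vector iteration is still counted.  */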
8526
8527 if (dump_enabled_p ())
8528 {
8529 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8530 {
8531 dump_printf_loc (MSG_NOTE, vect_location,
8532 "LOOP VECTORIZED\n");
8533 if (loop->inner)
8534 dump_printf_loc (MSG_NOTE, vect_location,
8535 "OUTER LOOP VECTORIZED\n");
8536 dump_printf (MSG_NOTE, "\n");
8537 }
8538 else
8539 {
8540 dump_printf_loc (MSG_NOTE, vect_location,
8541 "LOOP EPILOGUE VECTORIZED (VS=");
8542 dump_dec (MSG_NOTE, current_vector_size);
8543 dump_printf (MSG_NOTE, ")\n");
8544 }
8545 }
8546
8547 /* Loops vectorized with a variable factor won't benefit from
8548 unrolling/peeling. */
8549 if (!vf.is_constant ())
8550 {
8551 loop->unroll = 1;
8552 if (dump_enabled_p ())
8553 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
8554 " variable-length vectorization factor\n");
8555 }
8556 /* Free SLP instances here because otherwise stmt reference counting
8557 won't work. */
8558 slp_instance instance;
8559 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8560 vect_free_slp_instance (instance, true);
8561 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8562 /* Clear the safelen field since its value is invalid after vectorization:
8563 the vectorized loop can have loop-carried dependencies. */
8564 loop->safelen = 0;
8565
8566 /* Don't vectorize epilogue for epilogue. */
8567 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8568 epilogue = NULL;
8569
8570 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8571 epilogue = NULL;
8572
8573 if (epilogue)
8574 {
8575 auto_vector_sizes vector_sizes;
8576 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
8577 unsigned int next_size = 0;
8578
8579 /* Note LOOP_VINFO_NITERS_KNOWN_P and LOOP_VINFO_INT_NITERS work
8580 on niters already adjusted for the iterations of the prologue. */
8581 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8582 && known_eq (vf, lowest_vf))
8583 {
8584 unsigned HOST_WIDE_INT eiters
8585 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8586 - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
8587 eiters
8588 = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
8589 epilogue->nb_iterations_upper_bound = eiters - 1;
8590 epilogue->any_upper_bound = true;
8591
8592 unsigned int ratio;
8593 while (next_size < vector_sizes.length ()
8594 && !(constant_multiple_p (current_vector_size,
8595 vector_sizes[next_size], &ratio)
8596 && eiters >= lowest_vf / ratio))
8597 next_size += 1;
8598 }
8599 else
8600 while (next_size < vector_sizes.length ()
8601 && maybe_lt (current_vector_size, vector_sizes[next_size]))
8602 next_size += 1;
8603
8604 if (next_size == vector_sizes.length ())
8605 epilogue = NULL;
8606 }
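/* Illustrative example of the size selection above: on a hypothetical
   target offering 512-, 256- and 128-bit vectors, with a current
   vector size of 512 bits, a LOOP_VINFO_INT_NITERS of 41 and a
   LOWEST_VF of 16, EITERS is 41 % 16 = 9; the 256-bit size qualifies
   (ratio 2, and 9 >= 16 / 2), so the epilogue is kept for further
   vectorization.  With EITERS of 3 no smaller size qualifies and
   EPILOGUE is reset to NULL.  */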
8607
8608 if (epilogue)
8609 {
8610 epilogue->force_vectorize = loop->force_vectorize;
8611 epilogue->safelen = loop->safelen;
8612 epilogue->dont_vectorize = false;
8613
8614 /* We may need to if-convert the epilogue to vectorize it. */
8615 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8616 tree_if_conversion (epilogue);
8617 }
8618
8619 return epilogue;
8620 }
8621
8622 /* The code below tries to perform a simple optimization: revert
8623 if-conversion for masked stores, i.e. if the mask of a store is zero,
8624 do not perform the store, and if possible skip the stored-value
8625 producers as well. For example,
8626 for (i=0; i<n; i++)
8627 if (c[i])
8628 {
8629 p1[i] += 1;
8630 p2[i] = p3[i] +2;
8631 }
8632 this transformation will produce the following semi-hammock:
8633
8634 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
8635 {
8636 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8637 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8638 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8639 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8640 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8641 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8642 }
8643 */
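/* For reference, before this optimization the same vectorized body is a
   straight-line sequence of the form (sketched with the names from the
   example above):

     vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
     ...
     MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);

   i.e. the masked loads, adds and stores execute unconditionally; the
   guard on the mask being all-zero is what the code below inserts.  */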
8644
8645 void
8646 optimize_mask_stores (struct loop *loop)
8647 {
8648 basic_block *bbs = get_loop_body (loop);
8649 unsigned nbbs = loop->num_nodes;
8650 unsigned i;
8651 basic_block bb;
8652 struct loop *bb_loop;
8653 gimple_stmt_iterator gsi;
8654 gimple *stmt;
8655 auto_vec<gimple *> worklist;
8656 auto_purge_vect_location sentinel;
8657
8658 vect_location = find_loop_location (loop);
8659 /* Pick up all masked stores in loop if any. */
8660 for (i = 0; i < nbbs; i++)
8661 {
8662 bb = bbs[i];
8663 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8664 gsi_next (&gsi))
8665 {
8666 stmt = gsi_stmt (gsi);
8667 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8668 worklist.safe_push (stmt);
8669 }
8670 }
8671
8672 free (bbs);
8673 if (worklist.is_empty ())
8674 return;
8675
8676 /* Loop has masked stores. */
8677 while (!worklist.is_empty ())
8678 {
8679 gimple *last, *last_store;
8680 edge e, efalse;
8681 tree mask;
8682 basic_block store_bb, join_bb;
8683 gimple_stmt_iterator gsi_to;
8684 tree vdef, new_vdef;
8685 gphi *phi;
8686 tree vectype;
8687 tree zero;
8688
8689 last = worklist.pop ();
8690 mask = gimple_call_arg (last, 2);
8691 bb = gimple_bb (last);
8692 /* Create then_bb and an if-then structure in the CFG; then_bb belongs
8693 to the same loop as if_bb. That loop can differ from LOOP when a
8694 two-level loop nest is vectorized and the mask_store belongs to the
8695 inner loop. */
8696 e = split_block (bb, last);
8697 bb_loop = bb->loop_father;
8698 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8699 join_bb = e->dest;
8700 store_bb = create_empty_bb (bb);
8701 add_bb_to_loop (store_bb, bb_loop);
8702 e->flags = EDGE_TRUE_VALUE;
8703 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8704 /* Put STORE_BB on the unlikely path. */
8705 efalse->probability = profile_probability::unlikely ();
8706 store_bb->count = efalse->count ();
8707 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8708 if (dom_info_available_p (CDI_DOMINATORS))
8709 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8710 if (dump_enabled_p ())
8711 dump_printf_loc (MSG_NOTE, vect_location,
8712 "Create new block %d to sink mask stores.",
8713 store_bb->index);
8714 /* Create vector comparison with boolean result. */
8715 vectype = TREE_TYPE (mask);
8716 zero = build_zero_cst (vectype);
8717 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8718 gsi = gsi_last_bb (bb);
8719 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
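/* At this point the CFG around the store is, schematically (a sketch
   only):

     bb:        ...
                if (mask == { 0, ... })  // true: all lanes inactive
                  goto join_bb;
                else
                  goto store_bb;         // marked unlikely above
     store_bb:  <masked stores and their producers are sunk here>
                goto join_bb;
     join_bb:   ...                                                     */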
8720 /* Create new PHI node for vdef of the last masked store:
8721 .MEM_2 = VDEF <.MEM_1>
8722 will be converted to
8723 .MEM.3 = VDEF <.MEM_1>
8724 and new PHI node will be created in join bb
8725 .MEM_2 = PHI <.MEM_1, .MEM_3>
8726 */
8727 vdef = gimple_vdef (last);
8728 new_vdef = make_ssa_name (gimple_vop (cfun), last);
8729 gimple_set_vdef (last, new_vdef);
8730 phi = create_phi_node (vdef, join_bb);
8731 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8732
8733 /* Put all masked stores with the same mask into STORE_BB if possible. */
8734 while (true)
8735 {
8736 gimple_stmt_iterator gsi_from;
8737 gimple *stmt1 = NULL;
8738
8739 /* Move masked store to STORE_BB. */
8740 last_store = last;
8741 gsi = gsi_for_stmt (last);
8742 gsi_from = gsi;
8743 /* Shift GSI to the previous stmt for further traversal. */
8744 gsi_prev (&gsi);
8745 gsi_to = gsi_start_bb (store_bb);
8746 gsi_move_before (&gsi_from, &gsi_to);
8747 /* Set GSI_TO to the start of the now non-empty block. */
8748 gsi_to = gsi_start_bb (store_bb);
8749 if (dump_enabled_p ())
8750 dump_printf_loc (MSG_NOTE, vect_location,
8751 "Move stmt to created bb\n%G", last);
8752 /* Move all stored value producers if possible. */
8753 while (!gsi_end_p (gsi))
8754 {
8755 tree lhs;
8756 imm_use_iterator imm_iter;
8757 use_operand_p use_p;
8758 bool res;
8759
8760 /* Skip debug statements. */
8761 if (is_gimple_debug (gsi_stmt (gsi)))
8762 {
8763 gsi_prev (&gsi);
8764 continue;
8765 }
8766 stmt1 = gsi_stmt (gsi);
8767 /* Do not consider statements writing to memory or having
8768 a volatile operand. */
8769 if (gimple_vdef (stmt1)
8770 || gimple_has_volatile_ops (stmt1))
8771 break;
8772 gsi_from = gsi;
8773 gsi_prev (&gsi);
8774 lhs = gimple_get_lhs (stmt1);
8775 if (!lhs)
8776 break;
8777
8778 /* LHS of vectorized stmt must be SSA_NAME. */
8779 if (TREE_CODE (lhs) != SSA_NAME)
8780 break;
8781
8782 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8783 {
8784 /* Remove dead scalar statement. */
8785 if (has_zero_uses (lhs))
8786 {
8787 gsi_remove (&gsi_from, true);
8788 continue;
8789 }
8790 }
8791
8792 /* Check that LHS does not have uses outside of STORE_BB. */
8793 res = true;
8794 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
8795 {
8796 gimple *use_stmt;
8797 use_stmt = USE_STMT (use_p);
8798 if (is_gimple_debug (use_stmt))
8799 continue;
8800 if (gimple_bb (use_stmt) != store_bb)
8801 {
8802 res = false;
8803 break;
8804 }
8805 }
8806 if (!res)
8807 break;
8808
8809 if (gimple_vuse (stmt1)
8810 && gimple_vuse (stmt1) != gimple_vuse (last_store))
8811 break;
8812
8813 /* Can move STMT1 to STORE_BB. */
8814 if (dump_enabled_p ())
8815 dump_printf_loc (MSG_NOTE, vect_location,
8816 "Move stmt to created bb\n%G", stmt1);
8817 gsi_move_before (&gsi_from, &gsi_to);
8818 /* Shift GSI_TO for further insertion. */
8819 gsi_prev (&gsi_to);
8820 }
8821 /* Put other masked stores with the same mask into STORE_BB. */
8822 if (worklist.is_empty ()
8823 || gimple_call_arg (worklist.last (), 2) != mask
8824 || worklist.last () != stmt1)
8825 break;
8826 last = worklist.pop ();
8827 }
8828 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
8829 }
8830 }