gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2019 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57
58 /* Loop Vectorization Pass.
59
60 This pass tries to vectorize loops.
61
62 For example, the vectorizer transforms the following simple loop:
63
64 short a[N]; short b[N]; short c[N]; int i;
65
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
68 }
69
70 as if it were manually vectorized by rewriting the source code into:
71
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
76
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
82 }
83
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
95
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
101
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
106
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
117
118 For example, say stmt S1 was vectorized into stmt VS1:
119
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
123
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
128
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
133
134 Operands that are not SSA_NAMEs are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
136
137 Target modeling:
138 =================
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that can support different vector sizes will, for now, need to
142 specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
144
145 Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
151
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
154 */
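
/* Illustrative sketch only (not taken from the sources): the target support
   check described above boils down to an optab query on the statement's
   vector mode, roughly

     machine_mode vmode = TYPE_MODE (vectype);   ... e.g. V8HImode
     if (optab_handler (add_optab, vmode) == CODE_FOR_nothing)
       ... no target support, the stmt cannot be vectorized ...

   The actual checks live in the vectorizable_* analysis routines.  */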
155
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
157
158 /* Subroutine of vect_determine_vf_for_stmt that handles only one
159 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
160 may already be set for general statements (not just data refs). */
161
162 static opt_result
163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
164 bool vectype_maybe_set_p,
165 poly_uint64 *vf,
166 vec<stmt_vec_info > *mask_producers)
167 {
168 gimple *stmt = stmt_info->stmt;
169
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
173 {
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return opt_result::success ();
177 }
178
179 tree stmt_vectype, nunits_vectype;
180 opt_result res = vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
181 &nunits_vectype);
182 if (!res)
183 return res;
184
185 if (stmt_vectype)
186 {
187 if (STMT_VINFO_VECTYPE (stmt_info))
188 /* The only case when a vectype has already been set is for stmts
189 that contain a data ref, or for "pattern-stmts" (stmts generated
190 by the vectorizer to represent/replace a certain idiom). */
191 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
192 || vectype_maybe_set_p)
193 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
194 else if (stmt_vectype == boolean_type_node)
195 mask_producers->safe_push (stmt_info);
196 else
197 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
198 }
199
200 if (nunits_vectype)
201 vect_update_max_nunits (vf, nunits_vectype);
202
203 return opt_result::success ();
204 }
205
206 /* Subroutine of vect_determine_vectorization_factor. Set the vector
207 types of STMT_INFO and all attached pattern statements and update
208 the vectorization factor VF accordingly. If some of the statements
209 produce a mask result whose vector type can only be calculated later,
210 add them to MASK_PRODUCERS. Return true on success or false if
211 something prevented vectorization. */
212
213 static opt_result
214 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
215 vec<stmt_vec_info > *mask_producers)
216 {
217 vec_info *vinfo = stmt_info->vinfo;
218 if (dump_enabled_p ())
219 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
220 stmt_info->stmt);
221 opt_result res
222 = vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers);
223 if (!res)
224 return res;
225
226 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
227 && STMT_VINFO_RELATED_STMT (stmt_info))
228 {
229 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
230 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
231
232 /* If a pattern statement has def stmts, analyze them too. */
233 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
234 !gsi_end_p (si); gsi_next (&si))
235 {
236 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
237 if (dump_enabled_p ())
238 dump_printf_loc (MSG_NOTE, vect_location,
239 "==> examining pattern def stmt: %G",
240 def_stmt_info->stmt);
241 res = vect_determine_vf_for_stmt_1 (def_stmt_info, true,
242 vf, mask_producers);
243 if (!res)
244 return res;
247 }
248
249 if (dump_enabled_p ())
250 dump_printf_loc (MSG_NOTE, vect_location,
251 "==> examining pattern statement: %G",
252 stmt_info->stmt);
253 res = vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers);
254 if (!res)
255 return res;
256 }
257
258 return opt_result::success ();
259 }
260
261 /* Function vect_determine_vectorization_factor
262
263 Determine the vectorization factor (VF). VF is the number of data elements
264 that are operated upon in parallel in a single iteration of the vectorized
265 loop. For example, when vectorizing a loop that operates on 4-byte elements,
266 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
267 elements can fit in a single vector register.
268
269 We currently support vectorization of loops in which all types operated upon
270 are of the same size. Therefore this function currently sets VF according to
271 the size of the types operated upon, and fails if there are multiple sizes
272 in the loop.
273
274 VF is also the factor by which the loop iterations are strip-mined, e.g.:
275 original loop:
276 for (i=0; i<N; i++){
277 a[i] = b[i] + c[i];
278 }
279
280 vectorized loop:
281 for (i=0; i<N; i+=VF){
282 a[i:VF] = b[i:VF] + c[i:VF];
283 }
284 */
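
/* Worked example (illustrative, not from the sources): with 16-byte vectors,
   a stmt operating on 4-byte ints gets a vectype with nunits = 4, while one
   operating on 2-byte shorts gets nunits = 8.  vect_update_max_nunits keeps
   a common multiple of all the nunits seen, so a loop containing only the
   int stmts ends up with VF = 4, and one that also contains the short stmts
   ends up with VF = 8.  */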
285
286 static opt_result
287 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
288 {
289 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
290 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
291 unsigned nbbs = loop->num_nodes;
292 poly_uint64 vectorization_factor = 1;
293 tree scalar_type = NULL_TREE;
294 gphi *phi;
295 tree vectype;
296 stmt_vec_info stmt_info;
297 unsigned i;
298 auto_vec<stmt_vec_info> mask_producers;
299
300 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
301
302 for (i = 0; i < nbbs; i++)
303 {
304 basic_block bb = bbs[i];
305
306 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
307 gsi_next (&si))
308 {
309 phi = si.phi ();
310 stmt_info = loop_vinfo->lookup_stmt (phi);
311 if (dump_enabled_p ())
312 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
313 phi);
314
315 gcc_assert (stmt_info);
316
317 if (STMT_VINFO_RELEVANT_P (stmt_info)
318 || STMT_VINFO_LIVE_P (stmt_info))
319 {
320 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
321 scalar_type = TREE_TYPE (PHI_RESULT (phi));
322
323 if (dump_enabled_p ())
324 dump_printf_loc (MSG_NOTE, vect_location,
325 "get vectype for scalar type: %T\n",
326 scalar_type);
327
328 vectype = get_vectype_for_scalar_type (scalar_type);
329 if (!vectype)
330 return opt_result::failure_at (phi,
331 "not vectorized: unsupported "
332 "data-type %T\n",
333 scalar_type);
334 STMT_VINFO_VECTYPE (stmt_info) = vectype;
335
336 if (dump_enabled_p ())
337 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
338 vectype);
339
340 if (dump_enabled_p ())
341 {
342 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
343 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
344 dump_printf (MSG_NOTE, "\n");
345 }
346
347 vect_update_max_nunits (&vectorization_factor, vectype);
348 }
349 }
350
351 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
352 gsi_next (&si))
353 {
354 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
355 opt_result res
356 = vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
357 &mask_producers);
358 if (!res)
359 return res;
360 }
361 }
362
363 /* TODO: Analyze cost. Decide if worth while to vectorize. */
364 if (dump_enabled_p ())
365 {
366 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
367 dump_dec (MSG_NOTE, vectorization_factor);
368 dump_printf (MSG_NOTE, "\n");
369 }
370
371 if (known_le (vectorization_factor, 1U))
372 return opt_result::failure_at (vect_location,
373 "not vectorized: unsupported data-type\n");
374 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
375
376 for (i = 0; i < mask_producers.length (); i++)
377 {
378 stmt_info = mask_producers[i];
379 opt_tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
380 if (!mask_type)
381 return opt_result::propagate_failure (mask_type);
382 STMT_VINFO_VECTYPE (stmt_info) = mask_type;
383 }
384
385 return opt_result::success ();
386 }
387
388
389 /* Function vect_is_simple_iv_evolution.
390
391 FORNOW: A simple evolution of an induction variable in the loop is
392 considered a polynomial evolution. */
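
/* Example (a sketch, not part of the sources): for an induction like

     i_1 = PHI <0 (preheader), i_2 (latch)>
     i_2 = i_1 + 4;

   scev describes the access function as the chrec {0, +, 4}_n for loop
   number n, so initial_condition_in_loop_num yields 0 and
   evolution_part_in_loop_num yields 4.  A constant step like this passes
   the checks below; a step that is itself a chrec (degree >= 2) does not.  */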
393
394 static bool
395 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
396 tree * step)
397 {
398 tree init_expr;
399 tree step_expr;
400 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
401 basic_block bb;
402
403 /* When there is no evolution in this loop, the evolution function
404 is not "simple". */
405 if (evolution_part == NULL_TREE)
406 return false;
407
408 /* When the evolution is a polynomial of degree >= 2
409 the evolution function is not "simple". */
410 if (tree_is_chrec (evolution_part))
411 return false;
412
413 step_expr = evolution_part;
414 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
415
416 if (dump_enabled_p ())
417 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
418 step_expr, init_expr);
419
420 *init = init_expr;
421 *step = step_expr;
422
423 if (TREE_CODE (step_expr) != INTEGER_CST
424 && (TREE_CODE (step_expr) != SSA_NAME
425 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
426 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
427 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
428 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
429 || !flag_associative_math)))
430 && (TREE_CODE (step_expr) != REAL_CST
431 || !flag_associative_math))
432 {
433 if (dump_enabled_p ())
434 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
435 "step unknown.\n");
436 return false;
437 }
438
439 return true;
440 }
441
442 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
443 what we are assuming is a double reduction. For example, given
444 a structure like this:
445
446 outer1:
447 x_1 = PHI <x_4(outer2), ...>;
448 ...
449
450 inner:
451 x_2 = PHI <x_1(outer1), ...>;
452 ...
453 x_3 = ...;
454 ...
455
456 outer2:
457 x_4 = PHI <x_3(inner)>;
458 ...
459
460 outer loop analysis would treat x_1 as a double reduction phi and
461 this function would then return true for x_2. */
462
463 static bool
464 vect_inner_phi_in_double_reduction_p (stmt_vec_info stmt_info, gphi *phi)
465 {
466 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
467 use_operand_p use_p;
468 ssa_op_iter op_iter;
469 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
470 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
471 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
472 return true;
473 return false;
474 }
475
476 /* Function vect_analyze_scalar_cycles_1.
477
478 Examine the cross iteration def-use cycles of scalar variables
479 in LOOP. LOOP_VINFO represents the loop that is now being
480 considered for vectorization (can be LOOP, or an outer-loop
481 enclosing LOOP). */
482
483 static void
484 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
485 {
486 basic_block bb = loop->header;
487 tree init, step;
488 auto_vec<stmt_vec_info, 64> worklist;
489 gphi_iterator gsi;
490 bool double_reduc;
491
492 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
493
494 /* First - identify all inductions. Reduction detection assumes that all the
495 inductions have been identified, therefore, this order must not be
496 changed. */
497 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
498 {
499 gphi *phi = gsi.phi ();
500 tree access_fn = NULL;
501 tree def = PHI_RESULT (phi);
502 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
503
504 if (dump_enabled_p ())
505 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
506
507 /* Skip virtual phi's. The data dependences that are associated with
508 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
509 if (virtual_operand_p (def))
510 continue;
511
512 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
513
514 /* Analyze the evolution function. */
515 access_fn = analyze_scalar_evolution (loop, def);
516 if (access_fn)
517 {
518 STRIP_NOPS (access_fn);
519 if (dump_enabled_p ())
520 dump_printf_loc (MSG_NOTE, vect_location,
521 "Access function of PHI: %T\n", access_fn);
522 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
523 = initial_condition_in_loop_num (access_fn, loop->num);
524 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
525 = evolution_part_in_loop_num (access_fn, loop->num);
526 }
527
528 if (!access_fn
529 || vect_inner_phi_in_double_reduction_p (stmt_vinfo, phi)
530 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
531 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
532 && TREE_CODE (step) != INTEGER_CST))
533 {
534 worklist.safe_push (stmt_vinfo);
535 continue;
536 }
537
538 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
539 != NULL_TREE);
540 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
541
542 if (dump_enabled_p ())
543 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
544 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
545 }
546
547
548 /* Second - identify all reductions and nested cycles. */
549 while (worklist.length () > 0)
550 {
551 stmt_vec_info stmt_vinfo = worklist.pop ();
552 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
553 tree def = PHI_RESULT (phi);
554
555 if (dump_enabled_p ())
556 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
557
558 gcc_assert (!virtual_operand_p (def)
559 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
560
561 stmt_vec_info reduc_stmt_info
562 = vect_force_simple_reduction (loop_vinfo, stmt_vinfo,
563 &double_reduc, false);
564 if (reduc_stmt_info)
565 {
566 if (double_reduc)
567 {
568 if (dump_enabled_p ())
569 dump_printf_loc (MSG_NOTE, vect_location,
570 "Detected double reduction.\n");
571
572 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
573 STMT_VINFO_DEF_TYPE (reduc_stmt_info)
574 = vect_double_reduction_def;
575 }
576 else
577 {
578 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
579 {
580 if (dump_enabled_p ())
581 dump_printf_loc (MSG_NOTE, vect_location,
582 "Detected vectorizable nested cycle.\n");
583
584 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
585 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_nested_cycle;
586 }
587 else
588 {
589 if (dump_enabled_p ())
590 dump_printf_loc (MSG_NOTE, vect_location,
591 "Detected reduction.\n");
592
593 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
594 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
595 /* Store the reduction cycles for possible vectorization in
596 loop-aware SLP if it was not detected as reduction
597 chain. */
598 if (! REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
599 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
600 (reduc_stmt_info);
601 }
602 }
603 }
604 else
605 if (dump_enabled_p ())
606 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
607 "Unknown def-use cycle pattern.\n");
608 }
609 }
610
611
612 /* Function vect_analyze_scalar_cycles.
613
614 Examine the cross iteration def-use cycles of scalar variables, by
615 analyzing the loop-header PHIs of scalar variables. Classify each
616 cycle as one of the following: invariant, induction, reduction, unknown.
617 We do that for the loop represented by LOOP_VINFO, and also for its
618 inner-loop, if it exists.
619 Examples for scalar cycles:
620
621 Example1: reduction:
622
623 loop1:
624 for (i=0; i<N; i++)
625 sum += a[i];
626
627 Example2: induction:
628
629 loop2:
630 for (i=0; i<N; i++)
631 a[i] = i; */
632
633 static void
634 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
635 {
636 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
637
638 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
639
640 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
641 Reductions in such an inner-loop therefore have different properties than
642 the reductions in the nest that gets vectorized:
643 1. When vectorized, they are executed in the same order as in the original
644 scalar loop, so we can't change the order of computation when
645 vectorizing them.
646 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
647 current checks are too strict. */
648
649 if (loop->inner)
650 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
651 }
652
653 /* Transfer group and reduction information from STMT_INFO to its
654 pattern stmt. */
655
656 static void
657 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
658 {
659 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
660 stmt_vec_info stmtp;
661 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
662 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
663 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
664 do
665 {
666 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
667 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
668 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
669 if (stmt_info)
670 REDUC_GROUP_NEXT_ELEMENT (stmtp)
671 = STMT_VINFO_RELATED_STMT (stmt_info);
672 }
673 while (stmt_info);
674 STMT_VINFO_DEF_TYPE (stmtp) = vect_reduction_def;
675 }
676
677 /* Fixup scalar cycles that now have their stmts detected as patterns. */
678
679 static void
680 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
681 {
682 stmt_vec_info first;
683 unsigned i;
684
685 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
686 if (STMT_VINFO_IN_PATTERN_P (first))
687 {
688 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
689 while (next)
690 {
691 if (! STMT_VINFO_IN_PATTERN_P (next))
692 break;
693 next = REDUC_GROUP_NEXT_ELEMENT (next);
694 }
695 /* If not all stmts in the chain are patterns, try to handle
696 the chain without patterns. */
697 if (! next)
698 {
699 vect_fixup_reduc_chain (first);
700 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
701 = STMT_VINFO_RELATED_STMT (first);
702 }
703 }
704 }
705
706 /* Function vect_get_loop_niters.
707
708 Determine how many times the loop executes and place the count
709 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
710 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
711 niter information holds in ASSUMPTIONS.
712
713 Return the loop exit condition. */
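
/* Example (illustrative): for a loop whose body runs 16 times the latch
   executes 15 times, so NUMBER_OF_ITERATIONSM1 is 15 and
   NUMBER_OF_ITERATIONS, the number of header executions, is 16.  Any
   condition the niter analysis needs for these expressions to be valid is
   returned in ASSUMPTIONS (boolean_true_node when none is needed).  */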
714
715
716 static gcond *
717 vect_get_loop_niters (struct loop *loop, tree *assumptions,
718 tree *number_of_iterations, tree *number_of_iterationsm1)
719 {
720 edge exit = single_exit (loop);
721 struct tree_niter_desc niter_desc;
722 tree niter_assumptions, niter, may_be_zero;
723 gcond *cond = get_loop_exit_condition (loop);
724
725 *assumptions = boolean_true_node;
726 *number_of_iterationsm1 = chrec_dont_know;
727 *number_of_iterations = chrec_dont_know;
728 DUMP_VECT_SCOPE ("get_loop_niters");
729
730 if (!exit)
731 return cond;
732
733 niter = chrec_dont_know;
734 may_be_zero = NULL_TREE;
735 niter_assumptions = boolean_true_node;
736 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
737 || chrec_contains_undetermined (niter_desc.niter))
738 return cond;
739
740 niter_assumptions = niter_desc.assumptions;
741 may_be_zero = niter_desc.may_be_zero;
742 niter = niter_desc.niter;
743
744 if (may_be_zero && integer_zerop (may_be_zero))
745 may_be_zero = NULL_TREE;
746
747 if (may_be_zero)
748 {
749 if (COMPARISON_CLASS_P (may_be_zero))
750 {
751 /* Try to combine may_be_zero with assumptions, this can simplify
752 computation of niter expression. */
753 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
754 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
755 niter_assumptions,
756 fold_build1 (TRUTH_NOT_EXPR,
757 boolean_type_node,
758 may_be_zero));
759 else
760 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
761 build_int_cst (TREE_TYPE (niter), 0),
762 rewrite_to_non_trapping_overflow (niter));
763
764 may_be_zero = NULL_TREE;
765 }
766 else if (integer_nonzerop (may_be_zero))
767 {
768 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
769 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
770 return cond;
771 }
772 else
773 return cond;
774 }
775
776 *assumptions = niter_assumptions;
777 *number_of_iterationsm1 = niter;
778
779 /* We want the number of loop header executions which is the number
780 of latch executions plus one.
781 ??? For UINT_MAX latch executions this number overflows to zero
782 for loops like do { n++; } while (n != 0); */
783 if (niter && !chrec_contains_undetermined (niter))
784 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
785 build_int_cst (TREE_TYPE (niter), 1));
786 *number_of_iterations = niter;
787
788 return cond;
789 }
790
791 /* Function bb_in_loop_p
792
793 Used as predicate for dfs order traversal of the loop bbs. */
794
795 static bool
796 bb_in_loop_p (const_basic_block bb, const void *data)
797 {
798 const struct loop *const loop = (const struct loop *)data;
799 if (flow_bb_inside_loop_p (loop, bb))
800 return true;
801 return false;
802 }
803
804
805 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
806 stmt_vec_info structs for all the stmts in LOOP_IN. */
807
808 _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared)
809 : vec_info (vec_info::loop, init_cost (loop_in), shared),
810 loop (loop_in),
811 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
812 num_itersm1 (NULL_TREE),
813 num_iters (NULL_TREE),
814 num_iters_unchanged (NULL_TREE),
815 num_iters_assumptions (NULL_TREE),
816 th (0),
817 versioning_threshold (0),
818 vectorization_factor (0),
819 max_vectorization_factor (0),
820 mask_skip_niters (NULL_TREE),
821 mask_compare_type (NULL_TREE),
822 unaligned_dr (NULL),
823 peeling_for_alignment (0),
824 ptr_mask (0),
825 ivexpr_map (NULL),
826 slp_unrolling_factor (1),
827 single_scalar_iteration_cost (0),
828 vectorizable (false),
829 can_fully_mask_p (true),
830 fully_masked_p (false),
831 peeling_for_gaps (false),
832 peeling_for_niter (false),
833 operands_swapped (false),
834 no_data_dependencies (false),
835 has_mask_store (false),
836 scalar_loop (NULL),
837 orig_loop_info (NULL)
838 {
839 /* CHECKME: We want to visit all BBs before their successors (except for
840 latch blocks, for which this assertion wouldn't hold). In the simple
841 case of the loop forms we allow, a dfs order of the BBs would be the same
842 as reversed postorder traversal, so we are safe. */
843
844 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
845 bbs, loop->num_nodes, loop);
846 gcc_assert (nbbs == loop->num_nodes);
847
848 for (unsigned int i = 0; i < nbbs; i++)
849 {
850 basic_block bb = bbs[i];
851 gimple_stmt_iterator si;
852
853 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
854 {
855 gimple *phi = gsi_stmt (si);
856 gimple_set_uid (phi, 0);
857 add_stmt (phi);
858 }
859
860 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
861 {
862 gimple *stmt = gsi_stmt (si);
863 gimple_set_uid (stmt, 0);
864 add_stmt (stmt);
865 }
866 }
867 }
868
869 /* Free all levels of MASKS. */
870
871 void
872 release_vec_loop_masks (vec_loop_masks *masks)
873 {
874 rgroup_masks *rgm;
875 unsigned int i;
876 FOR_EACH_VEC_ELT (*masks, i, rgm)
877 rgm->masks.release ();
878 masks->release ();
879 }
880
881 /* Free all memory used by the _loop_vec_info, as well as all the
882 stmt_vec_info structs of all the stmts in the loop. */
883
884 _loop_vec_info::~_loop_vec_info ()
885 {
886 int nbbs;
887 gimple_stmt_iterator si;
888 int j;
889
890 nbbs = loop->num_nodes;
891 for (j = 0; j < nbbs; j++)
892 {
893 basic_block bb = bbs[j];
894 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
895 {
896 gimple *stmt = gsi_stmt (si);
897
898 /* We may have broken canonical form by moving a constant
899 into RHS1 of a commutative op. Fix such occurrences. */
900 if (operands_swapped && is_gimple_assign (stmt))
901 {
902 enum tree_code code = gimple_assign_rhs_code (stmt);
903
904 if ((code == PLUS_EXPR
905 || code == POINTER_PLUS_EXPR
906 || code == MULT_EXPR)
907 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
908 swap_ssa_operands (stmt,
909 gimple_assign_rhs1_ptr (stmt),
910 gimple_assign_rhs2_ptr (stmt));
911 else if (code == COND_EXPR
912 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
913 {
914 tree cond_expr = gimple_assign_rhs1 (stmt);
915 enum tree_code cond_code = TREE_CODE (cond_expr);
916
917 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
918 {
919 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
920 0));
921 cond_code = invert_tree_comparison (cond_code,
922 honor_nans);
923 if (cond_code != ERROR_MARK)
924 {
925 TREE_SET_CODE (cond_expr, cond_code);
926 swap_ssa_operands (stmt,
927 gimple_assign_rhs2_ptr (stmt),
928 gimple_assign_rhs3_ptr (stmt));
929 }
930 }
931 }
932 }
933 gsi_next (&si);
934 }
935 }
936
937 free (bbs);
938
939 release_vec_loop_masks (&masks);
940 delete ivexpr_map;
941
942 loop->aux = NULL;
943 }
944
945 /* Return an invariant or register for EXPR and emit necessary
946 computations in the LOOP_VINFO loop preheader. */
947
948 tree
949 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
950 {
951 if (is_gimple_reg (expr)
952 || is_gimple_min_invariant (expr))
953 return expr;
954
955 if (! loop_vinfo->ivexpr_map)
956 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
957 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
958 if (! cached)
959 {
960 gimple_seq stmts = NULL;
961 cached = force_gimple_operand (unshare_expr (expr),
962 &stmts, true, NULL_TREE);
963 if (stmts)
964 {
965 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
966 gsi_insert_seq_on_edge_immediate (e, stmts);
967 }
968 }
969 return cached;
970 }
971
972 /* Return true if we can use CMP_TYPE as the comparison type to produce
973 all masks required to mask LOOP_VINFO. */
974
975 static bool
976 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
977 {
978 rgroup_masks *rgm;
979 unsigned int i;
980 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
981 if (rgm->mask_type != NULL_TREE
982 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
983 cmp_type, rgm->mask_type,
984 OPTIMIZE_FOR_SPEED))
985 return false;
986 return true;
987 }
988
989 /* Return the maximum number of scalars per iteration over all the
990 rgroups in LOOP_VINFO. */
991
992 static unsigned int
993 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
994 {
995 unsigned int res = 1;
996 unsigned int i;
997 rgroup_masks *rgm;
998 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
999 res = MAX (res, rgm->max_nscalars_per_iter);
1000 return res;
1001 }
1002
1003 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1004 whether we can actually generate the masks required. Return true if so,
1005 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
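
/* Worked example (a sketch with made-up numbers): if the niter type is a
   32-bit unsigned int but the loop is known to take at most 999 back edges,
   max_ni below is refined to 1000.  With rgroups needing at most two scalars
   per iteration the largest value to be represented is 2000, which needs 11
   bits, so any integer mode of at least 11 bits whose WHILE_ULT pattern can
   produce all the required mask types is an acceptable comparison type.  */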
1006
1007 static bool
1008 vect_verify_full_masking (loop_vec_info loop_vinfo)
1009 {
1010 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1011 unsigned int min_ni_width;
1012
1013 /* Use a normal loop if there are no statements that need masking.
1014 This only happens in rare degenerate cases: it means that the loop
1015 has no loads, no stores, and no live-out values. */
1016 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1017 return false;
1018
1019 /* Get the maximum number of iterations that is representable
1020 in the counter type. */
1021 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1022 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1023
1024 /* Get a more refined estimate for the number of iterations. */
1025 widest_int max_back_edges;
1026 if (max_loop_iterations (loop, &max_back_edges))
1027 max_ni = wi::smin (max_ni, max_back_edges + 1);
1028
1029 /* Account for rgroup masks, in which each bit is replicated N times. */
1030 max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1031
1032 /* Work out how many bits we need to represent the limit. */
1033 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1034
1035 /* Find a scalar mode for which WHILE_ULT is supported. */
1036 opt_scalar_int_mode cmp_mode_iter;
1037 tree cmp_type = NULL_TREE;
1038 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1039 {
1040 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1041 if (cmp_bits >= min_ni_width
1042 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1043 {
1044 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1045 if (this_type
1046 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1047 {
1048 /* Although we could stop as soon as we find a valid mode,
1049 it's often better to continue until we hit Pmode, since the
1050 operands to the WHILE are more likely to be reusable in
1051 address calculations. */
1052 cmp_type = this_type;
1053 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1054 break;
1055 }
1056 }
1057 }
1058
1059 if (!cmp_type)
1060 return false;
1061
1062 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1063 return true;
1064 }
1065
1066 /* Calculate the cost of one scalar iteration of the loop. */
1067 static void
1068 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1069 {
1070 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1071 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1072 int nbbs = loop->num_nodes, factor;
1073 int innerloop_iters, i;
1074
1075 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1076
1077 /* Gather costs for statements in the scalar loop. */
1078
1079 /* FORNOW. */
1080 innerloop_iters = 1;
1081 if (loop->inner)
1082 innerloop_iters = 50; /* FIXME */
1083
1084 for (i = 0; i < nbbs; i++)
1085 {
1086 gimple_stmt_iterator si;
1087 basic_block bb = bbs[i];
1088
1089 if (bb->loop_father == loop->inner)
1090 factor = innerloop_iters;
1091 else
1092 factor = 1;
1093
1094 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1095 {
1096 gimple *stmt = gsi_stmt (si);
1097 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1098
1099 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1100 continue;
1101
1102 /* Skip stmts that are not vectorized inside the loop. */
1103 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1104 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1105 && (!STMT_VINFO_LIVE_P (vstmt_info)
1106 || !VECTORIZABLE_CYCLE_DEF
1107 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1108 continue;
1109
1110 vect_cost_for_stmt kind;
1111 if (STMT_VINFO_DATA_REF (stmt_info))
1112 {
1113 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1114 kind = scalar_load;
1115 else
1116 kind = scalar_store;
1117 }
1118 else
1119 kind = scalar_stmt;
1120
1121 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1122 factor, kind, stmt_info, 0, vect_prologue);
1123 }
1124 }
1125
1126 /* Now accumulate cost. */
1127 void *target_cost_data = init_cost (loop);
1128 stmt_info_for_cost *si;
1129 int j;
1130 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1131 j, si)
1132 (void) add_stmt_cost (target_cost_data, si->count,
1133 si->kind, si->stmt_info, si->misalign,
1134 vect_body);
1135 unsigned dummy, body_cost = 0;
1136 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1137 destroy_cost_data (target_cost_data);
1138 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1139 }
1140
1141
1142 /* Function vect_analyze_loop_form_1.
1143
1144 Verify that certain CFG restrictions hold, including:
1145 - the loop has a pre-header
1146 - the loop has a single entry and exit
1147 - the loop exit condition is simple enough
1148 - the number of iterations can be analyzed, i.e., a countable loop. The
1149 niter could be analyzed under some assumptions. */
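
/* For instance (illustrative only): a counted loop with a single latch and a
   single exit, such as

     for (i = 0; i < n; i++)
       a[i] = b[i] + c[i];

   satisfies these restrictions, whereas a loop with an early exit

     for (i = 0; i < n; i++)
       if (a[i] == key)
         break;

   is rejected below because it has more than one exit edge.  */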
1150
1151 opt_result
1152 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1153 tree *assumptions, tree *number_of_iterationsm1,
1154 tree *number_of_iterations, gcond **inner_loop_cond)
1155 {
1156 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1157
1158 /* Different restrictions apply when we are considering an inner-most loop,
1159 vs. an outer (nested) loop.
1160 (FORNOW. May want to relax some of these restrictions in the future). */
1161
1162 if (!loop->inner)
1163 {
1164 /* Inner-most loop. We currently require that the number of BBs is
1165 exactly 2 (the header and latch). Vectorizable inner-most loops
1166 look like this:
1167
1168 (pre-header)
1169 |
1170 header <--------+
1171 | | |
1172 | +--> latch --+
1173 |
1174 (exit-bb) */
1175
1176 if (loop->num_nodes != 2)
1177 return opt_result::failure_at (vect_location,
1178 "not vectorized:"
1179 " control flow in loop.\n");
1180
1181 if (empty_block_p (loop->header))
1182 return opt_result::failure_at (vect_location,
1183 "not vectorized: empty loop.\n");
1184 }
1185 else
1186 {
1187 struct loop *innerloop = loop->inner;
1188 edge entryedge;
1189
1190 /* Nested loop. We currently require that the loop is doubly-nested,
1191 contains a single inner loop, and the number of BBs is exactly 5.
1192 Vectorizable outer-loops look like this:
1193
1194 (pre-header)
1195 |
1196 header <---+
1197 | |
1198 inner-loop |
1199 | |
1200 tail ------+
1201 |
1202 (exit-bb)
1203
1204 The inner-loop has the properties expected of inner-most loops
1205 as described above. */
1206
1207 if ((loop->inner)->inner || (loop->inner)->next)
1208 return opt_result::failure_at (vect_location,
1209 "not vectorized:"
1210 " multiple nested loops.\n");
1211
1212 if (loop->num_nodes != 5)
1213 return opt_result::failure_at (vect_location,
1214 "not vectorized:"
1215 " control flow in loop.\n");
1216
1217 entryedge = loop_preheader_edge (innerloop);
1218 if (entryedge->src != loop->header
1219 || !single_exit (innerloop)
1220 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1221 return opt_result::failure_at (vect_location,
1222 "not vectorized:"
1223 " unsupported outerloop form.\n");
1224
1225 /* Analyze the inner-loop. */
1226 tree inner_niterm1, inner_niter, inner_assumptions;
1227 opt_result res
1228 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1229 &inner_assumptions, &inner_niterm1,
1230 &inner_niter, NULL);
1231 if (!res)
1232 {
1233 if (dump_enabled_p ())
1234 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1235 "not vectorized: Bad inner loop.\n");
1236 return res;
1237 }
1238
1239 /* Don't support analyzing niter under assumptions for inner
1240 loop. */
1241 if (!integer_onep (inner_assumptions))
1242 return opt_result::failure_at (vect_location,
1243 "not vectorized: Bad inner loop.\n");
1244
1245 if (!expr_invariant_in_loop_p (loop, inner_niter))
1246 return opt_result::failure_at (vect_location,
1247 "not vectorized: inner-loop count not"
1248 " invariant.\n");
1249
1250 if (dump_enabled_p ())
1251 dump_printf_loc (MSG_NOTE, vect_location,
1252 "Considering outer-loop vectorization.\n");
1253 }
1254
1255 if (!single_exit (loop))
1256 return opt_result::failure_at (vect_location,
1257 "not vectorized: multiple exits.\n");
1258 if (EDGE_COUNT (loop->header->preds) != 2)
1259 return opt_result::failure_at (vect_location,
1260 "not vectorized:"
1261 " too many incoming edges.\n");
1262
1263 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1264 that the loop is represented as a do-while (with a proper if-guard
1265 before the loop if needed), where the loop header contains all the
1266 executable statements, and the latch is empty. */
1267 if (!empty_block_p (loop->latch)
1268 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1269 return opt_result::failure_at (vect_location,
1270 "not vectorized: latch block not empty.\n");
1271
1272 /* Make sure the exit is not abnormal. */
1273 edge e = single_exit (loop);
1274 if (e->flags & EDGE_ABNORMAL)
1275 return opt_result::failure_at (vect_location,
1276 "not vectorized:"
1277 " abnormal loop exit edge.\n");
1278
1279 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1280 number_of_iterationsm1);
1281 if (!*loop_cond)
1282 return opt_result::failure_at
1283 (vect_location,
1284 "not vectorized: complicated exit condition.\n");
1285
1286 if (integer_zerop (*assumptions)
1287 || !*number_of_iterations
1288 || chrec_contains_undetermined (*number_of_iterations))
1289 return opt_result::failure_at
1290 (*loop_cond,
1291 "not vectorized: number of iterations cannot be computed.\n");
1292
1293 if (integer_zerop (*number_of_iterations))
1294 return opt_result::failure_at
1295 (*loop_cond,
1296 "not vectorized: number of iterations = 0.\n");
1297
1298 return opt_result::success ();
1299 }
1300
1301 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1302
1303 opt_loop_vec_info
1304 vect_analyze_loop_form (struct loop *loop, vec_info_shared *shared)
1305 {
1306 tree assumptions, number_of_iterations, number_of_iterationsm1;
1307 gcond *loop_cond, *inner_loop_cond = NULL;
1308
1309 opt_result res
1310 = vect_analyze_loop_form_1 (loop, &loop_cond,
1311 &assumptions, &number_of_iterationsm1,
1312 &number_of_iterations, &inner_loop_cond);
1313 if (!res)
1314 return opt_loop_vec_info::propagate_failure (res);
1315
1316 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1317 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1318 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1319 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1320 if (!integer_onep (assumptions))
1321 {
1322 /* We consider vectorizing this loop by versioning it under
1323 some assumptions. In order to do this, we need to clear
1324 existing information computed by scev and niter analyzer. */
1325 scev_reset_htab ();
1326 free_numbers_of_iterations_estimates (loop);
1327 /* Also set flag for this loop so that following scev and niter
1328 analysis are done under the assumptions. */
1329 loop_constraint_set (loop, LOOP_C_FINITE);
1330 /* Also record the assumptions for versioning. */
1331 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1332 }
1333
1334 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1335 {
1336 if (dump_enabled_p ())
1337 {
1338 dump_printf_loc (MSG_NOTE, vect_location,
1339 "Symbolic number of iterations is ");
1340 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1341 dump_printf (MSG_NOTE, "\n");
1342 }
1343 }
1344
1345 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1346 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1347 if (inner_loop_cond)
1348 {
1349 stmt_vec_info inner_loop_cond_info
1350 = loop_vinfo->lookup_stmt (inner_loop_cond);
1351 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1352 }
1353
1354 gcc_assert (!loop->aux);
1355 loop->aux = loop_vinfo;
1356 return opt_loop_vec_info::success (loop_vinfo);
1357 }
1358
1359
1360
1361 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1362 statements, update the vectorization factor. */
1363
1364 static void
1365 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1366 {
1367 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1368 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1369 int nbbs = loop->num_nodes;
1370 poly_uint64 vectorization_factor;
1371 int i;
1372
1373 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1374
1375 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1376 gcc_assert (known_ne (vectorization_factor, 0U));
1377
1378 /* If all the stmts in the loop can be SLPed, we perform only SLP, and the
1379 vectorization factor of the loop is the unrolling factor required by
1380 the SLP instances. If that unrolling factor is 1, we say that we
1381 perform pure SLP on the loop; cross-iteration parallelism is not
1382 exploited. */
1383 bool only_slp_in_loop = true;
1384 for (i = 0; i < nbbs; i++)
1385 {
1386 basic_block bb = bbs[i];
1387 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1388 gsi_next (&si))
1389 {
1390 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1391 stmt_info = vect_stmt_to_vectorize (stmt_info);
1392 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1393 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1394 && !PURE_SLP_STMT (stmt_info))
1395 /* STMT needs both SLP and loop-based vectorization. */
1396 only_slp_in_loop = false;
1397 }
1398 }
1399
1400 if (only_slp_in_loop)
1401 {
1402 if (dump_enabled_p ())
1403 dump_printf_loc (MSG_NOTE, vect_location,
1404 "Loop contains only SLP stmts\n");
1405 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1406 }
1407 else
1408 {
1409 if (dump_enabled_p ())
1410 dump_printf_loc (MSG_NOTE, vect_location,
1411 "Loop contains SLP and non-SLP stmts\n");
1412 /* Both the vectorization factor and unroll factor have the form
1413 current_vector_size * X for some rational X, so they must have
1414 a common multiple. */
1415 vectorization_factor
1416 = force_common_multiple (vectorization_factor,
1417 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1418 }
1419
1420 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1421 if (dump_enabled_p ())
1422 {
1423 dump_printf_loc (MSG_NOTE, vect_location,
1424 "Updating vectorization factor to ");
1425 dump_dec (MSG_NOTE, vectorization_factor);
1426 dump_printf (MSG_NOTE, ".\n");
1427 }
1428 }
1429
1430 /* Return true if STMT_INFO describes a double reduction phi and if
1431 the other phi in the reduction is also relevant for vectorization.
1432 This rejects cases such as:
1433
1434 outer1:
1435 x_1 = PHI <x_3(outer2), ...>;
1436 ...
1437
1438 inner:
1439 x_2 = ...;
1440 ...
1441
1442 outer2:
1443 x_3 = PHI <x_2(inner)>;
1444
1445 if nothing in x_2 or elsewhere makes x_1 relevant. */
1446
1447 static bool
1448 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1449 {
1450 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1451 return false;
1452
1453 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1454 }
1455
1456 /* Function vect_analyze_loop_operations.
1457
1458 Scan the loop stmts and make sure they are all vectorizable. */
1459
1460 static opt_result
1461 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1462 {
1463 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1464 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1465 int nbbs = loop->num_nodes;
1466 int i;
1467 stmt_vec_info stmt_info;
1468 bool need_to_vectorize = false;
1469 bool ok;
1470
1471 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1472
1473 auto_vec<stmt_info_for_cost> cost_vec;
1474
1475 for (i = 0; i < nbbs; i++)
1476 {
1477 basic_block bb = bbs[i];
1478
1479 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1480 gsi_next (&si))
1481 {
1482 gphi *phi = si.phi ();
1483 ok = true;
1484
1485 stmt_info = loop_vinfo->lookup_stmt (phi);
1486 if (dump_enabled_p ())
1487 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1488 if (virtual_operand_p (gimple_phi_result (phi)))
1489 continue;
1490
1491 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1492 (i.e., a phi in the tail of the outer-loop). */
1493 if (! is_loop_header_bb_p (bb))
1494 {
1495 /* FORNOW: we currently don't support the case that these phis
1496 are not used in the outer loop (unless it is a double reduction,
1497 i.e., this phi is vect_reduction_def), because this case
1498 would require us to actually do something here. */
1499 if (STMT_VINFO_LIVE_P (stmt_info)
1500 && !vect_active_double_reduction_p (stmt_info))
1501 return opt_result::failure_at (phi,
1502 "Unsupported loop-closed phi"
1503 " in outer-loop.\n");
1504
1505 /* If PHI is used in the outer loop, we check that its operand
1506 is defined in the inner loop. */
1507 if (STMT_VINFO_RELEVANT_P (stmt_info))
1508 {
1509 tree phi_op;
1510
1511 if (gimple_phi_num_args (phi) != 1)
1512 return opt_result::failure_at (phi, "unsupported phi");
1513
1514 phi_op = PHI_ARG_DEF (phi, 0);
1515 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1516 if (!op_def_info)
1517 return opt_result::failure_at (phi, "unsupported phi");
1518
1519 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1520 && (STMT_VINFO_RELEVANT (op_def_info)
1521 != vect_used_in_outer_by_reduction))
1522 return opt_result::failure_at (phi, "unsupported phi");
1523 }
1524
1525 continue;
1526 }
1527
1528 gcc_assert (stmt_info);
1529
1530 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1531 || STMT_VINFO_LIVE_P (stmt_info))
1532 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1533 /* A scalar-dependence cycle that we don't support. */
1534 return opt_result::failure_at (phi,
1535 "not vectorized:"
1536 " scalar dependence cycle.\n");
1537
1538 if (STMT_VINFO_RELEVANT_P (stmt_info))
1539 {
1540 need_to_vectorize = true;
1541 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1542 && ! PURE_SLP_STMT (stmt_info))
1543 ok = vectorizable_induction (stmt_info, NULL, NULL, NULL,
1544 &cost_vec);
1545 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1546 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1547 && ! PURE_SLP_STMT (stmt_info))
1548 ok = vectorizable_reduction (stmt_info, NULL, NULL, NULL, NULL,
1549 &cost_vec);
1550 }
1551
1552 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1553 if (ok
1554 && STMT_VINFO_LIVE_P (stmt_info)
1555 && !PURE_SLP_STMT (stmt_info))
1556 ok = vectorizable_live_operation (stmt_info, NULL, NULL, -1, NULL,
1557 &cost_vec);
1558
1559 if (!ok)
1560 return opt_result::failure_at (phi,
1561 "not vectorized: relevant phi not "
1562 "supported: %G",
1563 static_cast <gimple *> (phi));
1564 }
1565
1566 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1567 gsi_next (&si))
1568 {
1569 gimple *stmt = gsi_stmt (si);
1570 if (!gimple_clobber_p (stmt))
1571 {
1572 opt_result res
1573 = vect_analyze_stmt (loop_vinfo->lookup_stmt (stmt),
1574 &need_to_vectorize,
1575 NULL, NULL, &cost_vec);
1576 if (!res)
1577 return res;
1578 }
1579 }
1580 } /* bbs */
1581
1582 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1583
1584 /* All operations in the loop are either irrelevant (they deal with loop
1585 control, or are dead), or are only used outside the loop and can be moved
1586 out of the loop (e.g. invariants, inductions). The loop can be
1587 optimized away by scalar optimizations. We're better off not
1588 touching this loop. */
1589 if (!need_to_vectorize)
1590 {
1591 if (dump_enabled_p ())
1592 dump_printf_loc (MSG_NOTE, vect_location,
1593 "All the computation can be taken out of the loop.\n");
1594 return opt_result::failure_at
1595 (vect_location,
1596 "not vectorized: redundant loop. no profit to vectorize.\n");
1597 }
1598
1599 return opt_result::success ();
1600 }
1601
1602 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1603 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1604 definitely no, or -1 if it's worth retrying. */
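
/* Worked example (illustrative, with made-up numbers): with an assumed VF of
   4 and --param min-vect-loop-bound=2, min_scalar_loop_bound is 8; if the
   cost model computes min_profitable_iters = 12, the threshold th becomes
   12, and a loop known to run fewer than 12 iterations is rejected here even
   though vectorizing it would be possible.  */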
1605
1606 static int
1607 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1608 {
1609 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1610 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1611
1612 /* Only fully-masked loops can have iteration counts less than the
1613 vectorization factor. */
1614 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1615 {
1616 HOST_WIDE_INT max_niter;
1617
1618 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1619 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1620 else
1621 max_niter = max_stmt_executions_int (loop);
1622
1623 if (max_niter != -1
1624 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1625 {
1626 if (dump_enabled_p ())
1627 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1628 "not vectorized: iteration count smaller than "
1629 "vectorization factor.\n");
1630 return 0;
1631 }
1632 }
1633
1634 int min_profitable_iters, min_profitable_estimate;
1635 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1636 &min_profitable_estimate);
1637
1638 if (min_profitable_iters < 0)
1639 {
1640 if (dump_enabled_p ())
1641 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1642 "not vectorized: vectorization not profitable.\n");
1643 if (dump_enabled_p ())
1644 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1645 "not vectorized: vector version will never be "
1646 "profitable.\n");
1647 return -1;
1648 }
1649
1650 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1651 * assumed_vf);
1652
1653 /* Use the cost model only if it is more conservative than the user-specified
1654 threshold. */
1655 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1656 min_profitable_iters);
1657
1658 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1659
1660 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1661 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1662 {
1663 if (dump_enabled_p ())
1664 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1665 "not vectorized: vectorization not profitable.\n");
1666 if (dump_enabled_p ())
1667 dump_printf_loc (MSG_NOTE, vect_location,
1668 "not vectorized: iteration count smaller than user "
1669 "specified loop bound parameter or minimum profitable "
1670 "iterations (whichever is more conservative).\n");
1671 return 0;
1672 }
1673
1674 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1675 if (estimated_niter == -1)
1676 estimated_niter = likely_max_stmt_executions_int (loop);
1677 if (estimated_niter != -1
1678 && ((unsigned HOST_WIDE_INT) estimated_niter
1679 < MAX (th, (unsigned) min_profitable_estimate)))
1680 {
1681 if (dump_enabled_p ())
1682 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1683 "not vectorized: estimated iteration count too "
1684 "small.\n");
1685 if (dump_enabled_p ())
1686 dump_printf_loc (MSG_NOTE, vect_location,
1687 "not vectorized: estimated iteration count smaller "
1688 "than specified loop bound parameter or minimum "
1689 "profitable iterations (whichever is more "
1690 "conservative).\n");
1691 return -1;
1692 }
1693
1694 return 1;
1695 }
1696
1697 static opt_result
1698 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1699 vec<data_reference_p> *datarefs,
1700 unsigned int *n_stmts)
1701 {
1702 *n_stmts = 0;
1703 for (unsigned i = 0; i < loop->num_nodes; i++)
1704 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1705 !gsi_end_p (gsi); gsi_next (&gsi))
1706 {
1707 gimple *stmt = gsi_stmt (gsi);
1708 if (is_gimple_debug (stmt))
1709 continue;
1710 ++(*n_stmts);
1711 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs);
1712 if (!res)
1713 {
1714 if (is_gimple_call (stmt) && loop->safelen)
1715 {
1716 tree fndecl = gimple_call_fndecl (stmt), op;
1717 if (fndecl != NULL_TREE)
1718 {
1719 cgraph_node *node = cgraph_node::get (fndecl);
1720 if (node != NULL && node->simd_clones != NULL)
1721 {
1722 unsigned int j, n = gimple_call_num_args (stmt);
1723 for (j = 0; j < n; j++)
1724 {
1725 op = gimple_call_arg (stmt, j);
1726 if (DECL_P (op)
1727 || (REFERENCE_CLASS_P (op)
1728 && get_base_address (op)))
1729 break;
1730 }
1731 op = gimple_call_lhs (stmt);
1732 /* Ignore #pragma omp declare simd functions
1733 if they don't have data references in the
1734 call stmt itself. */
1735 if (j == n
1736 && !(op
1737 && (DECL_P (op)
1738 || (REFERENCE_CLASS_P (op)
1739 && get_base_address (op)))))
1740 continue;
1741 }
1742 }
1743 }
1744 return res;
1745 }
1746 /* If dependence analysis will give up due to the limit on the
1747 number of datarefs, stop here and fail fatally. */
1748 if (datarefs->length ()
1749 > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
1750 return opt_result::failure_at (stmt, "exceeded param "
1751 "loop-max-datarefs-for-datadeps\n");
1752 }
1753 return opt_result::success ();
1754 }
1755
1756 /* Function vect_analyze_loop_2.
1757
1758 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1759 for it. The different analyses will record information in the
1760 loop_vec_info struct. */
1761 static opt_result
1762 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1763 {
1764 opt_result ok = opt_result::success ();
1765 int res;
1766 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1767 poly_uint64 min_vf = 2;
1768
1769 /* The first group of checks is independent of the vector size. */
1770 fatal = true;
1771
1772 /* Find all data references in the loop (which correspond to vdefs/vuses)
1773 and analyze their evolution in the loop. */
1774
1775 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1776
1777 /* Gather the data references and count stmts in the loop. */
1778 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1779 {
1780 opt_result res
1781 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1782 &LOOP_VINFO_DATAREFS (loop_vinfo),
1783 n_stmts);
1784 if (!res)
1785 {
1786 if (dump_enabled_p ())
1787 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1788 "not vectorized: loop contains function "
1789 "calls or data references that cannot "
1790 "be analyzed\n");
1791 return res;
1792 }
1793 loop_vinfo->shared->save_datarefs ();
1794 }
1795 else
1796 loop_vinfo->shared->check_datarefs ();
1797
1798 /* Analyze the data references and also adjust the minimal
1799 vectorization factor according to the loads and stores. */
1800
1801 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1802 if (!ok)
1803 {
1804 if (dump_enabled_p ())
1805 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1806 "bad data references.\n");
1807 return ok;
1808 }
1809
1810 /* Classify all cross-iteration scalar data-flow cycles.
1811 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1812 vect_analyze_scalar_cycles (loop_vinfo);
1813
1814 vect_pattern_recog (loop_vinfo);
1815
1816 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1817
1818 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1819 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1820
1821 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1822 if (!ok)
1823 {
1824 if (dump_enabled_p ())
1825 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1826 "bad data access.\n");
1827 return ok;
1828 }
1829
1830 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1831
1832 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1833 if (!ok)
1834 {
1835 if (dump_enabled_p ())
1836 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1837 "unexpected pattern.\n");
1838 return ok;
1839 }
1840
1841 /* In contrast, the rest of the analysis below depends on the chosen vector size in some way. */
1842 fatal = false;
1843
1844 /* Analyze data dependences between the data-refs in the loop
1845 and adjust the maximum vectorization factor according to
1846 the dependences.
1847 FORNOW: fail at the first data dependence that we encounter. */
1848
1849 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1850 if (!ok)
1851 {
1852 if (dump_enabled_p ())
1853 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1854 "bad data dependence.\n");
1855 return ok;
1856 }
1857 if (max_vf != MAX_VECTORIZATION_FACTOR
1858 && maybe_lt (max_vf, min_vf))
1859 return opt_result::failure_at (vect_location, "bad data dependence.\n");
1860 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1861
1862 ok = vect_determine_vectorization_factor (loop_vinfo);
1863 if (!ok)
1864 {
1865 if (dump_enabled_p ())
1866 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1867 "can't determine vectorization factor.\n");
1868 return ok;
1869 }
1870 if (max_vf != MAX_VECTORIZATION_FACTOR
1871 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1872 return opt_result::failure_at (vect_location, "bad data dependence.\n");
1873
1874 /* Compute the scalar iteration cost. */
1875 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1876
1877 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1878 unsigned th;
1879
1880 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1881 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
1882 if (!ok)
1883 return ok;
1884
1885 /* If there are any SLP instances mark them as pure_slp. */
1886 bool slp = vect_make_slp_decision (loop_vinfo);
1887 if (slp)
1888 {
1889 /* Find stmts that need to be both vectorized and SLPed. */
1890 vect_detect_hybrid_slp (loop_vinfo);
1891
1892 /* Update the vectorization factor based on the SLP decision. */
1893 vect_update_vf_for_slp (loop_vinfo);
1894 }
1895
1896 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
1897
1898 /* We don't expect to have to roll back to anything other than an empty
1899 set of rgroups. */
1900 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
1901
1902 /* This is the point where we can re-start analysis with SLP forced off. */
1903 start_over:
1904
1905 /* Now the vectorization factor is final. */
1906 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1907 gcc_assert (known_ne (vectorization_factor, 0U));
1908
1909 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1910 {
1911 dump_printf_loc (MSG_NOTE, vect_location,
1912 "vectorization_factor = ");
1913 dump_dec (MSG_NOTE, vectorization_factor);
1914 dump_printf (MSG_NOTE, ", niters = %wd\n",
1915 LOOP_VINFO_INT_NITERS (loop_vinfo));
1916 }
1917
1918 HOST_WIDE_INT max_niter
1919 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1920
1921 /* Analyze the alignment of the data-refs in the loop.
1922 Fail if a data reference is found that cannot be vectorized. */
1923
1924 ok = vect_analyze_data_refs_alignment (loop_vinfo);
1925 if (!ok)
1926 {
1927 if (dump_enabled_p ())
1928 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1929 "bad data alignment.\n");
1930 return ok;
1931 }
1932
1933 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1934 It is important to call pruning after vect_analyze_data_ref_accesses,
1935 since we use grouping information gathered by interleaving analysis. */
1936 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1937 if (!ok)
1938 return ok;
1939
1940 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
1941 vectorization, since we do not want to add extra peeling or
1942 add versioning for alignment. */
1943 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1944 /* This pass will decide on using loop versioning and/or loop peeling in
1945 order to enhance the alignment of data references in the loop. */
1946 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1947 else
1948 ok = vect_verify_datarefs_alignment (loop_vinfo);
1949 if (!ok)
1950 return ok;
1951
1952 if (slp)
1953 {
1954 /* Analyze operations in the SLP instances. Note this may
1955 remove unsupported SLP instances which makes the above
1956 SLP kind detection invalid. */
1957 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
1958 vect_slp_analyze_operations (loop_vinfo);
1959 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
1960 {
1961 ok = opt_result::failure_at (vect_location,
1962 "unsupported SLP instances\n");
1963 goto again;
1964 }
1965 }
1966
1967 /* Scan all the remaining operations in the loop that are not subject
1968 to SLP and make sure they are vectorizable. */
1969 ok = vect_analyze_loop_operations (loop_vinfo);
1970 if (!ok)
1971 {
1972 if (dump_enabled_p ())
1973 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1974 "bad operation or unsupported loop bound.\n");
1975 return ok;
1976 }
1977
1978 /* Decide whether to use a fully-masked loop for this vectorization
1979 factor. */
1980 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
1981 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
1982 && vect_verify_full_masking (loop_vinfo));
1983 if (dump_enabled_p ())
1984 {
1985 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1986 dump_printf_loc (MSG_NOTE, vect_location,
1987 "using a fully-masked loop.\n");
1988 else
1989 dump_printf_loc (MSG_NOTE, vect_location,
1990 "not using a fully-masked loop.\n");
1991 }
1992
1993 /* If epilog loop is required because of data accesses with gaps,
1994 one additional iteration needs to be peeled. Check if there are
1995 enough iterations for vectorization. */
1996 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1997 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1998 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1999 {
2000 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2001 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2002
2003 if (known_lt (wi::to_widest (scalar_niters), vf))
2004 return opt_result::failure_at (vect_location,
2005 "loop has no enough iterations to"
2006 " support peeling for gaps.\n");
2007 }
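/* For example, with VF = 4 and a known trip count of 4 (so NITERSM1 = 3),
   the iteration peeled for the gap would leave fewer than one full vector
   iteration, and the check above rejects vectorization.  */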
2008
2009 /* Check that the costings of the loop make vectorizing worthwhile. */
2010 res = vect_analyze_loop_costing (loop_vinfo);
2011 if (res < 0)
2012 {
2013 ok = opt_result::failure_at (vect_location,
2014 "Loop costings may not be worthwhile.\n");
2015 goto again;
2016 }
2017 if (!res)
2018 return opt_result::failure_at (vect_location,
2019 "Loop costings not worthwhile.\n");
2020
2021 /* Decide whether we need to create an epilogue loop to handle
2022 remaining scalar iterations. */
2023 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2024
2025 unsigned HOST_WIDE_INT const_vf;
2026 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2027 /* The main loop handles all iterations. */
2028 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2029 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2030 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
2031 {
2032 /* Work out the (constant) number of iterations that need to be
2033 peeled for reasons other than niters. */
2034 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2035 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2036 peel_niter += 1;
2037 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
2038 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2039 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2040 }
2041 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2042 /* ??? When peeling for gaps but not alignment, we could
2043 try to check whether the (variable) niters is known to be
2044 VF * N + 1. That's something of a niche case though. */
2045 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2046 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2047 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2048 < (unsigned) exact_log2 (const_vf))
2049 /* In case of versioning, check if the maximum number of
2050 iterations is greater than th. If they are identical,
2051 the epilogue is unnecessary. */
2052 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2053 || ((unsigned HOST_WIDE_INT) max_niter
2054 > (th / const_vf) * const_vf))))
2055 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
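/* Note that for a (power-of-two) constant VF, NITERS is known to be a
   multiple of VF exactly when it has at least log2 (VF) known trailing
   zero bits, which is what the tree_ctz test above checks.  */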
2056
2057 /* If an epilogue loop is required make sure we can create one. */
2058 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2059 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2060 {
2061 if (dump_enabled_p ())
2062 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2063 if (!vect_can_advance_ivs_p (loop_vinfo)
2064 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2065 single_exit (LOOP_VINFO_LOOP
2066 (loop_vinfo))))
2067 {
2068 ok = opt_result::failure_at (vect_location,
2069 "not vectorized: can't create required "
2070 "epilog loop\n");
2071 goto again;
2072 }
2073 }
2074
2075 /* During peeling, we need to check if the number of loop iterations is
2076 enough for both the peeled prolog loop and the vector loop. This check
2077 can be merged with the threshold check of loop versioning, so
2078 increase the threshold for this case if necessary. */
2079 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2080 {
2081 poly_uint64 niters_th = 0;
2082
2083 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2084 {
2085 /* Niters for peeled prolog loop. */
2086 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2087 {
2088 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2089 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2090 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2091 }
2092 else
2093 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2094 }
2095
2096 /* Niters for at least one iteration of vectorized loop. */
2097 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2098 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2099 /* One additional iteration because of peeling for gap. */
2100 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2101 niters_th += 1;
2102 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2103 }
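/* As a rough illustration, with VF = 4, an unknown prologue peel amount
   (so VF - 1 = 3 prologue iterations are assumed, taking the unaligned
   access's vector type to also have VF elements) and peeling for gaps,
   the versioning threshold computed above is 3 + 4 + 1 = 8.  */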
2104
2105 gcc_assert (known_eq (vectorization_factor,
2106 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2107
2108 /* Ok to vectorize! */
2109 return opt_result::success ();
2110
2111 again:
2112 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2113 gcc_assert (!ok);
2114
2115 /* Try again with SLP forced off, but if we didn't do any SLP there is
2116 no point in re-trying. */
2117 if (!slp)
2118 return ok;
2119
2120 /* If there are reduction chains re-trying will fail anyway. */
2121 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2122 return ok;
2123
2124 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2125 via interleaving or lane instructions. */
2126 slp_instance instance;
2127 slp_tree node;
2128 unsigned i, j;
2129 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2130 {
2131 stmt_vec_info vinfo;
2132 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2133 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2134 continue;
2135 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2136 unsigned int size = DR_GROUP_SIZE (vinfo);
2137 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2138 if (! vect_store_lanes_supported (vectype, size, false)
2139 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2140 && ! vect_grouped_store_supported (vectype, size))
2141 return opt_result::failure_at (vinfo->stmt,
2142 "unsupported grouped store\n");
2143 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2144 {
2145 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2146 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2147 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2148 size = DR_GROUP_SIZE (vinfo);
2149 vectype = STMT_VINFO_VECTYPE (vinfo);
2150 if (! vect_load_lanes_supported (vectype, size, false)
2151 && ! vect_grouped_load_supported (vectype, single_element_p,
2152 size))
2153 return opt_result::failure_at (vinfo->stmt,
2154 "unsupported grouped load\n");
2155 }
2156 }
2157
2158 if (dump_enabled_p ())
2159 dump_printf_loc (MSG_NOTE, vect_location,
2160 "re-trying with SLP disabled\n");
2161
2162 /* Roll back state appropriately. No SLP this time. */
2163 slp = false;
2164 /* Restore the vectorization factor as it was without SLP. */
2165 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2166 /* Free the SLP instances. */
2167 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2168 vect_free_slp_instance (instance, false);
2169 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2170 /* Reset SLP type to loop_vect on all stmts. */
2171 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2172 {
2173 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2174 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2175 !gsi_end_p (si); gsi_next (&si))
2176 {
2177 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2178 STMT_SLP_TYPE (stmt_info) = loop_vect;
2179 }
2180 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2181 !gsi_end_p (si); gsi_next (&si))
2182 {
2183 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2184 STMT_SLP_TYPE (stmt_info) = loop_vect;
2185 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2186 {
2187 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2188 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2189 STMT_SLP_TYPE (stmt_info) = loop_vect;
2190 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2191 !gsi_end_p (pi); gsi_next (&pi))
2192 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2193 = loop_vect;
2194 }
2195 }
2196 }
2197 /* Free optimized alias test DDRS. */
2198 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2199 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2200 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2201 /* Reset target cost data. */
2202 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2203 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2204 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2205 /* Reset accumulated rgroup information. */
2206 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2207 /* Reset assorted flags. */
2208 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2209 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2210 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2211 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2212 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2213
2214 goto start_over;
2215 }
2216
2217 /* Function vect_analyze_loop.
2218
2219 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2220 for it. The different analyses will record information in the
2221 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL, then LOOP is an
2222 epilogue loop that must be vectorized. */
2223 opt_loop_vec_info
2224 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
2225 vec_info_shared *shared)
2226 {
2227 auto_vector_sizes vector_sizes;
2228
2229 /* Autodetect first vector size we try. */
2230 current_vector_size = 0;
2231 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2232 unsigned int next_size = 0;
2233
2234 DUMP_VECT_SCOPE ("analyze_loop_nest");
2235
2236 if (loop_outer (loop)
2237 && loop_vec_info_for_loop (loop_outer (loop))
2238 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2239 return opt_loop_vec_info::failure_at (vect_location,
2240 "outer-loop already vectorized.\n");
2241
2242 if (!find_loop_nest (loop, &shared->loop_nest))
2243 return opt_loop_vec_info::failure_at
2244 (vect_location,
2245 "not vectorized: loop nest containing two or more consecutive inner"
2246 " loops cannot be vectorized\n");
2247
2248 unsigned n_stmts = 0;
2249 poly_uint64 autodetected_vector_size = 0;
2250 while (1)
2251 {
2252 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2253 opt_loop_vec_info loop_vinfo
2254 = vect_analyze_loop_form (loop, shared);
2255 if (!loop_vinfo)
2256 {
2257 if (dump_enabled_p ())
2258 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2259 "bad loop form.\n");
2260 return loop_vinfo;
2261 }
2262
2263 bool fatal = false;
2264
2265 if (orig_loop_vinfo)
2266 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2267
2268 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2269 if (res)
2270 {
2271 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2272
2273 return loop_vinfo;
2274 }
2275
2276 delete loop_vinfo;
2277
2278 if (next_size == 0)
2279 autodetected_vector_size = current_vector_size;
2280
2281 if (next_size < vector_sizes.length ()
2282 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2283 next_size += 1;
2284
2285 if (fatal
2286 || next_size == vector_sizes.length ()
2287 || known_eq (current_vector_size, 0U))
2288 return opt_loop_vec_info::propagate_failure (res);
2289
2290 /* Try the next biggest vector size. */
2291 current_vector_size = vector_sizes[next_size++];
2292 if (dump_enabled_p ())
2293 {
2294 dump_printf_loc (MSG_NOTE, vect_location,
2295 "***** Re-trying analysis with "
2296 "vector size ");
2297 dump_dec (MSG_NOTE, current_vector_size);
2298 dump_printf (MSG_NOTE, "\n");
2299 }
2300 }
2301 }
2302
2303 /* Return true if there is an in-order reduction function for CODE, storing
2304 it in *REDUC_FN if so. */
2305
2306 static bool
2307 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2308 {
2309 switch (code)
2310 {
2311 case PLUS_EXPR:
2312 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2313 return true;
2314
2315 default:
2316 return false;
2317 }
2318 }
2319
2320 /* Function reduction_fn_for_scalar_code
2321
2322 Input:
2323 CODE - the tree_code of a reduction operation.
2324
2325 Output:
2326 REDUC_FN - the corresponding internal function to be used to reduce the
2327 vector of partial results into a single scalar result, or IFN_LAST
2328 if the operation is a supported reduction operation, but does not have
2329 such an internal function.
2330
2331 Return FALSE if CODE currently cannot be vectorized as reduction. */
2332
2333 static bool
2334 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2335 {
2336 switch (code)
2337 {
2338 case MAX_EXPR:
2339 *reduc_fn = IFN_REDUC_MAX;
2340 return true;
2341
2342 case MIN_EXPR:
2343 *reduc_fn = IFN_REDUC_MIN;
2344 return true;
2345
2346 case PLUS_EXPR:
2347 *reduc_fn = IFN_REDUC_PLUS;
2348 return true;
2349
2350 case BIT_AND_EXPR:
2351 *reduc_fn = IFN_REDUC_AND;
2352 return true;
2353
2354 case BIT_IOR_EXPR:
2355 *reduc_fn = IFN_REDUC_IOR;
2356 return true;
2357
2358 case BIT_XOR_EXPR:
2359 *reduc_fn = IFN_REDUC_XOR;
2360 return true;
2361
2362 case MULT_EXPR:
2363 case MINUS_EXPR:
2364 *reduc_fn = IFN_LAST;
2365 return true;
2366
2367 default:
2368 return false;
2369 }
2370 }
2371
2372 /* If there is a neutral value X such that SLP reduction NODE would not
2373 be affected by the introduction of additional X elements, return that X,
2374 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2375 is true if the SLP statements perform a single reduction, false if each
2376 statement performs an independent reduction. */
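/* For example, padding a PLUS_EXPR reduction with zero elements, or a
   BIT_AND_EXPR reduction with all-ones elements, leaves the final result
   unchanged, so those are the neutral values returned below.  */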
2377
2378 static tree
2379 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2380 bool reduc_chain)
2381 {
2382 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2383 stmt_vec_info stmt_vinfo = stmts[0];
2384 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2385 tree scalar_type = TREE_TYPE (vector_type);
2386 struct loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2387 gcc_assert (loop);
2388
2389 switch (code)
2390 {
2391 case WIDEN_SUM_EXPR:
2392 case DOT_PROD_EXPR:
2393 case SAD_EXPR:
2394 case PLUS_EXPR:
2395 case MINUS_EXPR:
2396 case BIT_IOR_EXPR:
2397 case BIT_XOR_EXPR:
2398 return build_zero_cst (scalar_type);
2399
2400 case MULT_EXPR:
2401 return build_one_cst (scalar_type);
2402
2403 case BIT_AND_EXPR:
2404 return build_all_ones_cst (scalar_type);
2405
2406 case MAX_EXPR:
2407 case MIN_EXPR:
2408 /* For MIN/MAX the initial values are neutral. A reduction chain
2409 has only a single initial value, so that value is neutral for
2410 all statements. */
2411 if (reduc_chain)
2412 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2413 loop_preheader_edge (loop));
2414 return NULL_TREE;
2415
2416 default:
2417 return NULL_TREE;
2418 }
2419 }
2420
2421 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2422 STMT is printed with a message MSG. */
2423
2424 static void
2425 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2426 {
2427 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
2428 }
2429
2430 /* DEF_STMT_INFO occurs in a loop that contains a potential reduction
2431 operation. Return true if the results of DEF_STMT_INFO are something
2432 that can be accumulated by such a reduction. */
2433
2434 static bool
2435 vect_valid_reduction_input_p (stmt_vec_info def_stmt_info)
2436 {
2437 return (is_gimple_assign (def_stmt_info->stmt)
2438 || is_gimple_call (def_stmt_info->stmt)
2439 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_induction_def
2440 || (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI
2441 && STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_internal_def
2442 && !is_loop_header_bb_p (gimple_bb (def_stmt_info->stmt))));
2443 }
2444
2445 /* Detect SLP reduction of the form:
2446
2447 #a1 = phi <a5, a0>
2448 a2 = operation (a1)
2449 a3 = operation (a2)
2450 a4 = operation (a3)
2451 a5 = operation (a4)
2452
2453 #a = phi <a5>
2454
2455 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2456 FIRST_STMT is the first reduction stmt in the chain
2457 (a2 = operation (a1)).
2458
2459 Return TRUE if a reduction chain was detected. */
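/* In source form such a chain typically comes from manually unrolled
   accumulation, e.g. (illustrative only):

     for (i = 0; i < n; i++)
       {
         sum += a[4*i];
         sum += a[4*i + 1];
         sum += a[4*i + 2];
         sum += a[4*i + 3];
       }  */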
2460
2461 static bool
2462 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2463 gimple *first_stmt)
2464 {
2465 struct loop *loop = (gimple_bb (phi))->loop_father;
2466 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2467 enum tree_code code;
2468 gimple *loop_use_stmt = NULL;
2469 stmt_vec_info use_stmt_info;
2470 tree lhs;
2471 imm_use_iterator imm_iter;
2472 use_operand_p use_p;
2473 int nloop_uses, size = 0, n_out_of_loop_uses;
2474 bool found = false;
2475
2476 if (loop != vect_loop)
2477 return false;
2478
2479 auto_vec<stmt_vec_info, 8> reduc_chain;
2480 lhs = PHI_RESULT (phi);
2481 code = gimple_assign_rhs_code (first_stmt);
2482 while (1)
2483 {
2484 nloop_uses = 0;
2485 n_out_of_loop_uses = 0;
2486 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2487 {
2488 gimple *use_stmt = USE_STMT (use_p);
2489 if (is_gimple_debug (use_stmt))
2490 continue;
2491
2492 /* Check if we got back to the reduction phi. */
2493 if (use_stmt == phi)
2494 {
2495 loop_use_stmt = use_stmt;
2496 found = true;
2497 break;
2498 }
2499
2500 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2501 {
2502 loop_use_stmt = use_stmt;
2503 nloop_uses++;
2504 }
2505 else
2506 n_out_of_loop_uses++;
2507
2508 /* There can be either a single use in the loop or two uses in
2509 phi nodes. */
2510 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2511 return false;
2512 }
2513
2514 if (found)
2515 break;
2516
2517 /* We reached a statement with no loop uses. */
2518 if (nloop_uses == 0)
2519 return false;
2520
2521 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2522 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2523 return false;
2524
2525 if (!is_gimple_assign (loop_use_stmt)
2526 || code != gimple_assign_rhs_code (loop_use_stmt)
2527 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2528 return false;
2529
2530 /* Insert USE_STMT into reduction chain. */
2531 use_stmt_info = loop_info->lookup_stmt (loop_use_stmt);
2532 reduc_chain.safe_push (use_stmt_info);
2533
2534 lhs = gimple_assign_lhs (loop_use_stmt);
2535 size++;
2536 }
2537
2538 if (!found || loop_use_stmt != phi || size < 2)
2539 return false;
2540
2541 /* Swap the operands, if needed, so that the reduction operand is the
2542 second operand. */
2543 lhs = PHI_RESULT (phi);
2544 for (unsigned i = 0; i < reduc_chain.length (); ++i)
2545 {
2546 gassign *next_stmt = as_a <gassign *> (reduc_chain[i]->stmt);
2547 if (gimple_assign_rhs2 (next_stmt) == lhs)
2548 {
2549 tree op = gimple_assign_rhs1 (next_stmt);
2550 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2551
2552 /* Check that the other def is either defined in the loop
2553 ("vect_internal_def"), or it's an induction (defined by a
2554 loop-header phi-node). */
2555 if (def_stmt_info
2556 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2557 && vect_valid_reduction_input_p (def_stmt_info))
2558 {
2559 lhs = gimple_assign_lhs (next_stmt);
2560 continue;
2561 }
2562
2563 return false;
2564 }
2565 else
2566 {
2567 tree op = gimple_assign_rhs2 (next_stmt);
2568 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2569
2570 /* Check that the other def is either defined in the loop
2571 ("vect_internal_def"), or it's an induction (defined by a
2572 loop-header phi-node). */
2573 if (def_stmt_info
2574 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2575 && vect_valid_reduction_input_p (def_stmt_info))
2576 {
2577 if (dump_enabled_p ())
2578 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: %G",
2579 next_stmt);
2580
2581 swap_ssa_operands (next_stmt,
2582 gimple_assign_rhs1_ptr (next_stmt),
2583 gimple_assign_rhs2_ptr (next_stmt));
2584 update_stmt (next_stmt);
2585
2586 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2587 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2588 }
2589 else
2590 return false;
2591 }
2592
2593 lhs = gimple_assign_lhs (next_stmt);
2594 }
2595
2596 /* Build up the actual chain. */
2597 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
2598 {
2599 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
2600 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
2601 }
2602 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
2603 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
2604
2605 /* Save the chain for further analysis in SLP detection. */
2606 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
2607 REDUC_GROUP_SIZE (reduc_chain[0]) = size;
2608
2609 return true;
2610 }
2611
2612 /* Return true if we need an in-order reduction for operation CODE
2613 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2614 overflow must wrap. */
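/* For example, a float accumulation "s += a[i]" must be evaluated as
   ((s + a[0]) + a[1]) + ... unless -fassociative-math allows
   reassociation, so without that flag an in-order (fold-left) reduction
   is required.  */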
2615
2616 static bool
2617 needs_fold_left_reduction_p (tree type, tree_code code,
2618 bool need_wrapping_integral_overflow)
2619 {
2620 /* CHECKME: check for !flag_finite_math_only too? */
2621 if (SCALAR_FLOAT_TYPE_P (type))
2622 switch (code)
2623 {
2624 case MIN_EXPR:
2625 case MAX_EXPR:
2626 return false;
2627
2628 default:
2629 return !flag_associative_math;
2630 }
2631
2632 if (INTEGRAL_TYPE_P (type))
2633 {
2634 if (!operation_no_trapping_overflow (type, code))
2635 return true;
2636 if (need_wrapping_integral_overflow
2637 && !TYPE_OVERFLOW_WRAPS (type)
2638 && operation_can_overflow (code))
2639 return true;
2640 return false;
2641 }
2642
2643 if (SAT_FIXED_POINT_TYPE_P (type))
2644 return true;
2645
2646 return false;
2647 }
2648
2649 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2650 reduction operation CODE has a handled computation expression. */
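/* This walks the definitions feeding LOOP_ARG depth-first, exploring the
   operands of each defining statement with backtracking (the PATH stack)
   and a visited bitmap, until it either reaches the PHI result again or
   runs out of in-loop definitions.  */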
2651
2652 bool
2653 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2654 tree loop_arg, enum tree_code code)
2655 {
2656 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2657 auto_bitmap visited;
2658 tree lookfor = PHI_RESULT (phi);
2659 ssa_op_iter curri;
2660 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2661 while (USE_FROM_PTR (curr) != loop_arg)
2662 curr = op_iter_next_use (&curri);
2663 curri.i = curri.numops;
2664 do
2665 {
2666 path.safe_push (std::make_pair (curri, curr));
2667 tree use = USE_FROM_PTR (curr);
2668 if (use == lookfor)
2669 break;
2670 gimple *def = SSA_NAME_DEF_STMT (use);
2671 if (gimple_nop_p (def)
2672 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2673 {
2674 pop:
2675 do
2676 {
2677 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2678 curri = x.first;
2679 curr = x.second;
2680 do
2681 curr = op_iter_next_use (&curri);
2682 /* Skip already visited or non-SSA operands (from iterating
2683 over PHI args). */
2684 while (curr != NULL_USE_OPERAND_P
2685 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2686 || ! bitmap_set_bit (visited,
2687 SSA_NAME_VERSION
2688 (USE_FROM_PTR (curr)))));
2689 }
2690 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2691 if (curr == NULL_USE_OPERAND_P)
2692 break;
2693 }
2694 else
2695 {
2696 if (gimple_code (def) == GIMPLE_PHI)
2697 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2698 else
2699 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2700 while (curr != NULL_USE_OPERAND_P
2701 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2702 || ! bitmap_set_bit (visited,
2703 SSA_NAME_VERSION
2704 (USE_FROM_PTR (curr)))))
2705 curr = op_iter_next_use (&curri);
2706 if (curr == NULL_USE_OPERAND_P)
2707 goto pop;
2708 }
2709 }
2710 while (1);
2711 if (dump_file && (dump_flags & TDF_DETAILS))
2712 {
2713 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2714 unsigned i;
2715 std::pair<ssa_op_iter, use_operand_p> *x;
2716 FOR_EACH_VEC_ELT (path, i, x)
2717 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
2718 dump_printf (MSG_NOTE, "\n");
2719 }
2720
2721 /* Check whether the reduction path detected is valid. */
2722 bool fail = path.length () == 0;
2723 bool neg = false;
2724 for (unsigned i = 1; i < path.length (); ++i)
2725 {
2726 gimple *use_stmt = USE_STMT (path[i].second);
2727 tree op = USE_FROM_PTR (path[i].second);
2728 if (! has_single_use (op)
2729 || ! is_gimple_assign (use_stmt))
2730 {
2731 fail = true;
2732 break;
2733 }
2734 if (gimple_assign_rhs_code (use_stmt) != code)
2735 {
2736 if (code == PLUS_EXPR
2737 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2738 {
2739 /* Track whether we negate the reduction value each iteration. */
2740 if (gimple_assign_rhs2 (use_stmt) == op)
2741 neg = ! neg;
2742 }
2743 else
2744 {
2745 fail = true;
2746 break;
2747 }
2748 }
2749 }
2750 return ! fail && ! neg;
2751 }
2752
2753
2754 /* Function vect_is_simple_reduction
2755
2756 (1) Detect a cross-iteration def-use cycle that represents a simple
2757 reduction computation. We look for the following pattern:
2758
2759 loop_header:
2760 a1 = phi < a0, a2 >
2761 a3 = ...
2762 a2 = operation (a3, a1)
2763
2764 or
2765
2766 a3 = ...
2767 loop_header:
2768 a1 = phi < a0, a2 >
2769 a2 = operation (a3, a1)
2770
2771 such that:
2772 1. operation is commutative and associative and it is safe to
2773 change the order of the computation
2774 2. no uses for a2 in the loop (a2 is used out of the loop)
2775 3. no uses of a1 in the loop besides the reduction operation
2776 4. no uses of a1 outside the loop.
2777
2778 Conditions 1,4 are tested here.
2779 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2780
2781 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2782 nested cycles.
2783
2784 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2785 reductions:
2786
2787 a1 = phi < a0, a2 >
2788 inner loop (def of a3)
2789 a2 = phi < a3 >
2790
2791 (4) Detect condition expressions, i.e.:
2792 for (int i = 0; i < N; i++)
2793 if (a[i] < val)
2794 ret_val = a[i];
2795
2796 */
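/* A typical source form of case (1) is, e.g.:

     for (i = 0; i < N; i++)
       sum += a[i];  */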
2797
2798 static stmt_vec_info
2799 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
2800 bool *double_reduc,
2801 bool need_wrapping_integral_overflow,
2802 enum vect_reduction_type *v_reduc_type)
2803 {
2804 gphi *phi = as_a <gphi *> (phi_info->stmt);
2805 struct loop *loop = (gimple_bb (phi))->loop_father;
2806 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2807 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
2808 gimple *phi_use_stmt = NULL;
2809 enum tree_code orig_code, code;
2810 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2811 tree type;
2812 tree name;
2813 imm_use_iterator imm_iter;
2814 use_operand_p use_p;
2815 bool phi_def;
2816
2817 *double_reduc = false;
2818 *v_reduc_type = TREE_CODE_REDUCTION;
2819
2820 tree phi_name = PHI_RESULT (phi);
2821 /* ??? If there are no uses of the PHI result the inner loop reduction
2822 won't be detected as possibly double-reduction by vectorizable_reduction
2823 because that tries to walk the PHI arg from the preheader edge which
2824 can be constant. See PR60382. */
2825 if (has_zero_uses (phi_name))
2826 return NULL;
2827 unsigned nphi_def_loop_uses = 0;
2828 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2829 {
2830 gimple *use_stmt = USE_STMT (use_p);
2831 if (is_gimple_debug (use_stmt))
2832 continue;
2833
2834 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2835 {
2836 if (dump_enabled_p ())
2837 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2838 "intermediate value used outside loop.\n");
2839
2840 return NULL;
2841 }
2842
2843 nphi_def_loop_uses++;
2844 phi_use_stmt = use_stmt;
2845 }
2846
2847 edge latch_e = loop_latch_edge (loop);
2848 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2849 if (TREE_CODE (loop_arg) != SSA_NAME)
2850 {
2851 if (dump_enabled_p ())
2852 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2853 "reduction: not ssa_name: %T\n", loop_arg);
2854 return NULL;
2855 }
2856
2857 stmt_vec_info def_stmt_info = loop_info->lookup_def (loop_arg);
2858 if (!def_stmt_info
2859 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
2860 return NULL;
2861
2862 if (gassign *def_stmt = dyn_cast <gassign *> (def_stmt_info->stmt))
2863 {
2864 name = gimple_assign_lhs (def_stmt);
2865 phi_def = false;
2866 }
2867 else if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
2868 {
2869 name = PHI_RESULT (def_stmt);
2870 phi_def = true;
2871 }
2872 else
2873 {
2874 if (dump_enabled_p ())
2875 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2876 "reduction: unhandled reduction operation: %G",
2877 def_stmt_info->stmt);
2878 return NULL;
2879 }
2880
2881 unsigned nlatch_def_loop_uses = 0;
2882 auto_vec<gphi *, 3> lcphis;
2883 bool inner_loop_of_double_reduc = false;
2884 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2885 {
2886 gimple *use_stmt = USE_STMT (use_p);
2887 if (is_gimple_debug (use_stmt))
2888 continue;
2889 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2890 nlatch_def_loop_uses++;
2891 else
2892 {
2893 /* We can have more than one loop-closed PHI. */
2894 lcphis.safe_push (as_a <gphi *> (use_stmt));
2895 if (nested_in_vect_loop
2896 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
2897 == vect_double_reduction_def))
2898 inner_loop_of_double_reduc = true;
2899 }
2900 }
2901
2902 /* If this isn't a nested cycle or if the nested cycle reduction value
2903 is used outside of the inner loop, we cannot handle uses of the reduction
2904 value. */
2905 if ((!nested_in_vect_loop || inner_loop_of_double_reduc)
2906 && (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1))
2907 {
2908 if (dump_enabled_p ())
2909 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2910 "reduction used in loop.\n");
2911 return NULL;
2912 }
2913
2914 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2915 defined in the inner loop. */
2916 if (phi_def)
2917 {
2918 gphi *def_stmt = as_a <gphi *> (def_stmt_info->stmt);
2919 op1 = PHI_ARG_DEF (def_stmt, 0);
2920
2921 if (gimple_phi_num_args (def_stmt) != 1
2922 || TREE_CODE (op1) != SSA_NAME)
2923 {
2924 if (dump_enabled_p ())
2925 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2926 "unsupported phi node definition.\n");
2927
2928 return NULL;
2929 }
2930
2931 gimple *def1 = SSA_NAME_DEF_STMT (op1);
2932 if (gimple_bb (def1)
2933 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2934 && loop->inner
2935 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2936 && is_gimple_assign (def1)
2937 && is_a <gphi *> (phi_use_stmt)
2938 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2939 {
2940 if (dump_enabled_p ())
2941 report_vect_op (MSG_NOTE, def_stmt,
2942 "detected double reduction: ");
2943
2944 *double_reduc = true;
2945 return def_stmt_info;
2946 }
2947
2948 return NULL;
2949 }
2950
2951 /* If we are vectorizing an inner reduction, we execute it
2952 in the original order only if we are not dealing with a
2953 double reduction. */
2954 bool check_reduction = true;
2955 if (flow_loop_nested_p (vect_loop, loop))
2956 {
2957 gphi *lcphi;
2958 unsigned i;
2959 check_reduction = false;
2960 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
2961 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
2962 {
2963 gimple *use_stmt = USE_STMT (use_p);
2964 if (is_gimple_debug (use_stmt))
2965 continue;
2966 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
2967 check_reduction = true;
2968 }
2969 }
2970
2971 gassign *def_stmt = as_a <gassign *> (def_stmt_info->stmt);
2972 code = orig_code = gimple_assign_rhs_code (def_stmt);
2973
2974 if (nested_in_vect_loop && !check_reduction)
2975 {
2976 /* FIXME: Even for non-reductions code generation is funneled
2977 through vectorizable_reduction for the stmt defining the
2978 PHI latch value. So we have to artificially restrict ourselves
2979 for the supported operations. */
2980 switch (get_gimple_rhs_class (code))
2981 {
2982 case GIMPLE_BINARY_RHS:
2983 case GIMPLE_TERNARY_RHS:
2984 break;
2985 default:
2986 /* Not supported by vectorizable_reduction. */
2987 if (dump_enabled_p ())
2988 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2989 "nested cycle: not handled operation: ");
2990 return NULL;
2991 }
2992 if (dump_enabled_p ())
2993 report_vect_op (MSG_NOTE, def_stmt, "detected nested cycle: ");
2994 return def_stmt_info;
2995 }
2996
2997 /* We can handle "res -= x[i]", which is non-associative by
2998 simply rewriting this into "res += -x[i]". Avoid changing
2999 gimple instruction for the first simple tests and only do this
3000 if we're allowed to change code at all. */
3001 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3002 code = PLUS_EXPR;
3003
3004 if (code == COND_EXPR)
3005 {
3006 if (! nested_in_vect_loop)
3007 *v_reduc_type = COND_REDUCTION;
3008
3009 op3 = gimple_assign_rhs1 (def_stmt);
3010 if (COMPARISON_CLASS_P (op3))
3011 {
3012 op4 = TREE_OPERAND (op3, 1);
3013 op3 = TREE_OPERAND (op3, 0);
3014 }
3015 if (op3 == phi_name || op4 == phi_name)
3016 {
3017 if (dump_enabled_p ())
3018 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3019 "reduction: condition depends on previous"
3020 " iteration: ");
3021 return NULL;
3022 }
3023
3024 op1 = gimple_assign_rhs2 (def_stmt);
3025 op2 = gimple_assign_rhs3 (def_stmt);
3026 }
3027 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3028 {
3029 if (dump_enabled_p ())
3030 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3031 "reduction: not commutative/associative: ");
3032 return NULL;
3033 }
3034 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3035 {
3036 op1 = gimple_assign_rhs1 (def_stmt);
3037 op2 = gimple_assign_rhs2 (def_stmt);
3038 }
3039 else
3040 {
3041 if (dump_enabled_p ())
3042 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3043 "reduction: not handled operation: ");
3044 return NULL;
3045 }
3046
3047 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3048 {
3049 if (dump_enabled_p ())
3050 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3051 "reduction: both uses not ssa_names: ");
3052
3053 return NULL;
3054 }
3055
3056 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3057 if ((TREE_CODE (op1) == SSA_NAME
3058 && !types_compatible_p (type,TREE_TYPE (op1)))
3059 || (TREE_CODE (op2) == SSA_NAME
3060 && !types_compatible_p (type, TREE_TYPE (op2)))
3061 || (op3 && TREE_CODE (op3) == SSA_NAME
3062 && !types_compatible_p (type, TREE_TYPE (op3)))
3063 || (op4 && TREE_CODE (op4) == SSA_NAME
3064 && !types_compatible_p (type, TREE_TYPE (op4))))
3065 {
3066 if (dump_enabled_p ())
3067 {
3068 dump_printf_loc (MSG_NOTE, vect_location,
3069 "reduction: multiple types: operation type: "
3070 "%T, operands types: %T,%T",
3071 type, TREE_TYPE (op1), TREE_TYPE (op2));
3072 if (op3)
3073 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op3));
3074
3075 if (op4)
3076 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op4));
3077 dump_printf (MSG_NOTE, "\n");
3078 }
3079
3080 return NULL;
3081 }
3082
3083 /* Check whether it's ok to change the order of the computation.
3084 Generally, when vectorizing a reduction we change the order of the
3085 computation. This may change the behavior of the program in some
3086 cases, so we need to check that this is ok. One exception is when
3087 vectorizing an outer-loop: the inner-loop is executed sequentially,
3088 and therefore vectorizing reductions in the inner-loop during
3089 outer-loop vectorization is safe. */
3090 if (check_reduction
3091 && *v_reduc_type == TREE_CODE_REDUCTION
3092 && needs_fold_left_reduction_p (type, code,
3093 need_wrapping_integral_overflow))
3094 *v_reduc_type = FOLD_LEFT_REDUCTION;
3095
3096 /* Reduction is safe. We're dealing with one of the following:
3097 1) integer arithmetic and no trapv
3098 2) floating point arithmetic, and special flags permit this optimization
3099 3) nested cycle (i.e., outer loop vectorization). */
3100 stmt_vec_info def1_info = loop_info->lookup_def (op1);
3101 stmt_vec_info def2_info = loop_info->lookup_def (op2);
3102 if (code != COND_EXPR && !def1_info && !def2_info)
3103 {
3104 if (dump_enabled_p ())
3105 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3106 return NULL;
3107 }
3108
3109 /* Check that one def is the reduction def, defined by PHI,
3110 the other def is either defined in the loop ("vect_internal_def"),
3111 or it's an induction (defined by a loop-header phi-node). */
3112
3113 if (def2_info
3114 && def2_info->stmt == phi
3115 && (code == COND_EXPR
3116 || !def1_info
3117 || !flow_bb_inside_loop_p (loop, gimple_bb (def1_info->stmt))
3118 || vect_valid_reduction_input_p (def1_info)))
3119 {
3120 if (dump_enabled_p ())
3121 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3122 return def_stmt_info;
3123 }
3124
3125 if (def1_info
3126 && def1_info->stmt == phi
3127 && (code == COND_EXPR
3128 || !def2_info
3129 || !flow_bb_inside_loop_p (loop, gimple_bb (def2_info->stmt))
3130 || vect_valid_reduction_input_p (def2_info)))
3131 {
3132 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3133 {
3134 /* Check if we can swap operands (just for simplicity - so that
3135 the rest of the code can assume that the reduction variable
3136 is always the last (second) argument). */
3137 if (code == COND_EXPR)
3138 {
3139 /* Swap cond_expr by inverting the condition. */
3140 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3141 enum tree_code invert_code = ERROR_MARK;
3142 enum tree_code cond_code = TREE_CODE (cond_expr);
3143
3144 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3145 {
3146 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3147 invert_code = invert_tree_comparison (cond_code, honor_nans);
3148 }
3149 if (invert_code != ERROR_MARK)
3150 {
3151 TREE_SET_CODE (cond_expr, invert_code);
3152 swap_ssa_operands (def_stmt,
3153 gimple_assign_rhs2_ptr (def_stmt),
3154 gimple_assign_rhs3_ptr (def_stmt));
3155 }
3156 else
3157 {
3158 if (dump_enabled_p ())
3159 report_vect_op (MSG_NOTE, def_stmt,
3160 "detected reduction: cannot swap operands "
3161 "for cond_expr");
3162 return NULL;
3163 }
3164 }
3165 else
3166 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3167 gimple_assign_rhs2_ptr (def_stmt));
3168
3169 if (dump_enabled_p ())
3170 report_vect_op (MSG_NOTE, def_stmt,
3171 "detected reduction: need to swap operands: ");
3172
3173 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3174 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3175 }
3176 else
3177 {
3178 if (dump_enabled_p ())
3179 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3180 }
3181
3182 return def_stmt_info;
3183 }
3184
3185 /* Try to find SLP reduction chain. */
3186 if (! nested_in_vect_loop
3187 && code != COND_EXPR
3188 && orig_code != MINUS_EXPR
3189 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3190 {
3191 if (dump_enabled_p ())
3192 report_vect_op (MSG_NOTE, def_stmt,
3193 "reduction: detected reduction chain: ");
3194
3195 return def_stmt_info;
3196 }
3197
3198 /* Look for the expression computing loop_arg from loop PHI result. */
3199 if (check_reduction_path (vect_location, loop, phi, loop_arg, code))
3200 return def_stmt_info;
3201
3202 if (dump_enabled_p ())
3203 {
3204 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3205 "reduction: unknown pattern: ");
3206 }
3207
3208 return NULL;
3209 }
3210
3211 /* Wrapper around vect_is_simple_reduction, which will modify code
3212 in-place if it enables detection of more reductions. The arguments
3213 are the same as for vect_is_simple_reduction. */
3214
3215 stmt_vec_info
3216 vect_force_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3217 bool *double_reduc,
3218 bool need_wrapping_integral_overflow)
3219 {
3220 enum vect_reduction_type v_reduc_type;
3221 stmt_vec_info def_info
3222 = vect_is_simple_reduction (loop_info, phi_info, double_reduc,
3223 need_wrapping_integral_overflow,
3224 &v_reduc_type);
3225 if (def_info)
3226 {
3227 STMT_VINFO_REDUC_TYPE (phi_info) = v_reduc_type;
3228 STMT_VINFO_REDUC_DEF (phi_info) = def_info;
3229 STMT_VINFO_REDUC_TYPE (def_info) = v_reduc_type;
3230 STMT_VINFO_REDUC_DEF (def_info) = phi_info;
3231 }
3232 return def_info;
3233 }
3234
3235 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3236 int
3237 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3238 int *peel_iters_epilogue,
3239 stmt_vector_for_cost *scalar_cost_vec,
3240 stmt_vector_for_cost *prologue_cost_vec,
3241 stmt_vector_for_cost *epilogue_cost_vec)
3242 {
3243 int retval = 0;
3244 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3245
3246 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3247 {
3248 *peel_iters_epilogue = assumed_vf / 2;
3249 if (dump_enabled_p ())
3250 dump_printf_loc (MSG_NOTE, vect_location,
3251 "cost model: epilogue peel iters set to vf/2 "
3252 "because loop iterations are unknown .\n");
3253
3254 /* If peeled iterations are known but the number of scalar loop
3255 iterations is unknown, count a taken branch per peeled loop. */
3256 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3257 NULL, 0, vect_prologue);
3258 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3259 NULL, 0, vect_epilogue);
3260 }
3261 else
3262 {
3263 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3264 peel_iters_prologue = niters < peel_iters_prologue ?
3265 niters : peel_iters_prologue;
3266 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3267 /* If we need to peel for gaps, but no peeling is required, we have to
3268 peel VF iterations. */
3269 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3270 *peel_iters_epilogue = assumed_vf;
3271 }
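/* For example, with NITERS = 100, PEEL_ITERS_PROLOGUE = 3 and an assumed
   VF of 8, the code above sets *PEEL_ITERS_EPILOGUE to (100 - 3) % 8 = 1.  */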
3272
3273 stmt_info_for_cost *si;
3274 int j;
3275 if (peel_iters_prologue)
3276 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3277 retval += record_stmt_cost (prologue_cost_vec,
3278 si->count * peel_iters_prologue,
3279 si->kind, si->stmt_info, si->misalign,
3280 vect_prologue);
3281 if (*peel_iters_epilogue)
3282 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3283 retval += record_stmt_cost (epilogue_cost_vec,
3284 si->count * *peel_iters_epilogue,
3285 si->kind, si->stmt_info, si->misalign,
3286 vect_epilogue);
3287
3288 return retval;
3289 }
3290
3291 /* Function vect_estimate_min_profitable_iters
3292
3293 Return the number of iterations required for the vector version of the
3294 loop to be profitable relative to the cost of the scalar version of the
3295 loop.
3296
3297 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3298 of iterations for vectorization. A value of -1 means loop vectorization
3299 is not profitable. This returned value may be used for a dynamic
3300 profitability check.
3301
3302 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3303 for static check against estimated number of iterations. */
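/* Roughly, the returned threshold is the smallest NITERS for which

     SIC * NITERS + SOC  >  VIC * ((NITERS - NPEEL) / VF) + VOC

   where SIC is the scalar iteration cost, VIC the vector iteration cost,
   VOC the vector outside (prologue/epilogue) cost, SOC the scalar cost of
   the run-time guards, and NPEEL the number of peeled iterations.  */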
3304
3305 static void
3306 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3307 int *ret_min_profitable_niters,
3308 int *ret_min_profitable_estimate)
3309 {
3310 int min_profitable_iters;
3311 int min_profitable_estimate;
3312 int peel_iters_prologue;
3313 int peel_iters_epilogue;
3314 unsigned vec_inside_cost = 0;
3315 int vec_outside_cost = 0;
3316 unsigned vec_prologue_cost = 0;
3317 unsigned vec_epilogue_cost = 0;
3318 int scalar_single_iter_cost = 0;
3319 int scalar_outside_cost = 0;
3320 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3321 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3322 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3323
3324 /* Cost model disabled. */
3325 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3326 {
3327 if (dump_enabled_p ())
3328 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3329 *ret_min_profitable_niters = 0;
3330 *ret_min_profitable_estimate = 0;
3331 return;
3332 }
3333
3334 /* Requires loop versioning tests to handle misalignment. */
3335 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3336 {
3337 /* FIXME: Make cost depend on complexity of individual check. */
3338 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3339 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3340 vect_prologue);
3341 if (dump_enabled_p ())
3342 dump_printf (MSG_NOTE,
3343 "cost model: Adding cost of checks for loop "
3344 "versioning to treat misalignment.\n");
3345 }
3346
3347 /* Requires loop versioning with alias checks. */
3348 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3349 {
3350 /* FIXME: Make cost depend on complexity of individual check. */
3351 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3352 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3353 vect_prologue);
3354 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3355 if (len)
3356 /* Count LEN - 1 ANDs and LEN comparisons. */
3357 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3358 NULL, 0, vect_prologue);
3359 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3360 if (len)
3361 {
3362 /* Count LEN - 1 ANDs and LEN comparisons. */
3363 unsigned int nstmts = len * 2 - 1;
3364 /* +1 for each bias that needs adding. */
3365 for (unsigned int i = 0; i < len; ++i)
3366 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3367 nstmts += 1;
3368 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3369 NULL, 0, vect_prologue);
3370 }
3371 if (dump_enabled_p ())
3372 dump_printf (MSG_NOTE,
3373 "cost model: Adding cost of checks for loop "
3374 "versioning aliasing.\n");
3375 }
3376
3377 /* Requires loop versioning with niter checks. */
3378 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3379 {
3380 /* FIXME: Make cost depend on complexity of individual check. */
3381 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3382 vect_prologue);
3383 if (dump_enabled_p ())
3384 dump_printf (MSG_NOTE,
3385 "cost model: Adding cost of checks for loop "
3386 "versioning niters.\n");
3387 }
3388
3389 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3390 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3391 vect_prologue);
3392
3393 /* Count statements in scalar loop. Using this as scalar cost for a single
3394 iteration for now.
3395
3396 TODO: Add outer loop support.
3397
3398 TODO: Consider assigning different costs to different scalar
3399 statements. */
3400
3401 scalar_single_iter_cost
3402 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3403
3404 /* Add additional cost for the peeled instructions in prologue and epilogue
3405 loop. (For fully-masked loops there will be no peeling.)
3406
3407 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3408 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3409
3410 TODO: Build an expression that represents peel_iters for prologue and
3411 epilogue to be used in a run-time test. */
3412
3413 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3414 {
3415 peel_iters_prologue = 0;
3416 peel_iters_epilogue = 0;
3417
3418 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3419 {
3420 /* We need to peel exactly one iteration. */
3421 peel_iters_epilogue += 1;
3422 stmt_info_for_cost *si;
3423 int j;
3424 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3425 j, si)
3426 (void) add_stmt_cost (target_cost_data, si->count,
3427 si->kind, si->stmt_info, si->misalign,
3428 vect_epilogue);
3429 }
3430 }
3431 else if (npeel < 0)
3432 {
3433 peel_iters_prologue = assumed_vf / 2;
3434 if (dump_enabled_p ())
3435 dump_printf (MSG_NOTE, "cost model: "
3436 "prologue peel iters set to vf/2.\n");
3437
3438 /* If peeling for alignment is unknown, the loop bound of the main loop
3439 becomes unknown. */
3440 peel_iters_epilogue = assumed_vf / 2;
3441 if (dump_enabled_p ())
3442 dump_printf (MSG_NOTE, "cost model: "
3443 "epilogue peel iters set to vf/2 because "
3444 "peeling for alignment is unknown.\n");
3445
3446 /* If peeled iterations are unknown, count a taken branch and a not taken
3447 branch per peeled loop. Even if scalar loop iterations are known,
3448 vector iterations are not known since peeled prologue iterations are
3449 not known. Hence guards remain the same. */
3450 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3451 NULL, 0, vect_prologue);
3452 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3453 NULL, 0, vect_prologue);
3454 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3455 NULL, 0, vect_epilogue);
3456 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3457 NULL, 0, vect_epilogue);
3458 stmt_info_for_cost *si;
3459 int j;
3460 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3461 {
3462 (void) add_stmt_cost (target_cost_data,
3463 si->count * peel_iters_prologue,
3464 si->kind, si->stmt_info, si->misalign,
3465 vect_prologue);
3466 (void) add_stmt_cost (target_cost_data,
3467 si->count * peel_iters_epilogue,
3468 si->kind, si->stmt_info, si->misalign,
3469 vect_epilogue);
3470 }
3471 }
3472 else
3473 {
3474 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3475 stmt_info_for_cost *si;
3476 int j;
3477 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3478
3479 prologue_cost_vec.create (2);
3480 epilogue_cost_vec.create (2);
3481 peel_iters_prologue = npeel;
3482
3483 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3484 &peel_iters_epilogue,
3485 &LOOP_VINFO_SCALAR_ITERATION_COST
3486 (loop_vinfo),
3487 &prologue_cost_vec,
3488 &epilogue_cost_vec);
3489
3490 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3491 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3492 si->misalign, vect_prologue);
3493
3494 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3495 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3496 si->misalign, vect_epilogue);
3497
3498 prologue_cost_vec.release ();
3499 epilogue_cost_vec.release ();
3500 }
3501
3502 /* FORNOW: The scalar outside cost is incremented in one of the
3503 following ways:
3504
3505 1. The vectorizer checks for alignment and aliasing and generates
3506 a condition that allows dynamic vectorization. A cost model
3507 check is ANDED with the versioning condition. Hence scalar code
3508 path now has the added cost of the versioning check.
3509
3510 if (cost > th & versioning_check)
3511 jmp to vector code
3512
3513 Hence run-time scalar is incremented by not-taken branch cost.
3514
3515 2. The vectorizer then checks if a prologue is required. If the
3516 cost model check was not done before during versioning, it has to
3517 be done before the prologue check.
3518
3519 if (cost <= th)
3520 prologue = scalar_iters
3521 if (prologue == 0)
3522 jmp to vector code
3523 else
3524 execute prologue
3525 if (prologue == num_iters)
3526 go to exit
3527
3528 Hence the run-time scalar cost is incremented by a taken branch,
3529 plus a not-taken branch, plus a taken branch cost.
3530
3531 3. The vectorizer then checks if an epilogue is required. If the
3532 cost model check was not done before during prologue check, it
3533 has to be done with the epilogue check.
3534
3535 if (prologue == 0)
3536 jmp to vector code
3537 else
3538 execute prologue
3539 if (prologue == num_iters)
3540 go to exit
3541 vector code:
3542 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3543 jmp to epilogue
3544
3545 Hence the run-time scalar cost should be incremented by 2 taken
3546 branches.
3547
3548 TODO: The back end may reorder the BBs differently and reverse
3549 conditions/branch directions. Change the estimates below to
3550 something more reasonable. */
3551
3552 /* If the number of iterations is known and we do not do versioning, we can
3553 decide whether to vectorize at compile time. Hence the scalar version
3554 does not carry cost model guard costs. */
3555 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3556 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3557 {
3558 /* Cost model check occurs at versioning. */
3559 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3560 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3561 else
3562 {
3563 /* Cost model check occurs at prologue generation. */
3564 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3565 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3566 + vect_get_stmt_cost (cond_branch_not_taken);
3567 /* Cost model check occurs at epilogue generation. */
3568 else
3569 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3570 }
3571 }
3572
3573 /* Complete the target-specific cost calculations. */
3574 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3575 &vec_inside_cost, &vec_epilogue_cost);
3576
3577 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3578
3579 if (dump_enabled_p ())
3580 {
3581 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3582 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3583 vec_inside_cost);
3584 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3585 vec_prologue_cost);
3586 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3587 vec_epilogue_cost);
3588 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3589 scalar_single_iter_cost);
3590 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3591 scalar_outside_cost);
3592 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3593 vec_outside_cost);
3594 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3595 peel_iters_prologue);
3596 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3597 peel_iters_epilogue);
3598 }
3599
3600 /* Calculate number of iterations required to make the vector version
3601 profitable, relative to the loop bodies only. The following condition
3602 must hold true:
3603 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3604 where
3605 SIC = scalar iteration cost, VIC = vector iteration cost,
3606 VOC = vector outside cost, VF = vectorization factor,
3607 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3608 SOC = scalar outside cost for run time cost model check. */
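/* Worked example with assumed costs (not from any particular target):
   SIC = 4, VIC = 6, VF = 4, VOC = 20, SOC = 6, no peeling.  The code
   below computes ((20 - 6) * 4) / (4 * 4 - 6) = 56 / 10 = 5, and the
   subsequent <= check rounds this up to 6, so the loop must run for at
   least 6 scalar iterations before the vector version is cheaper.  */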
3609
3610 if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
3611 {
3612 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3613 * assumed_vf
3614 - vec_inside_cost * peel_iters_prologue
3615 - vec_inside_cost * peel_iters_epilogue);
3616 if (min_profitable_iters <= 0)
3617 min_profitable_iters = 0;
3618 else
3619 {
3620 min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
3621 - vec_inside_cost);
3622
3623 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3624 <= (((int) vec_inside_cost * min_profitable_iters)
3625 + (((int) vec_outside_cost - scalar_outside_cost)
3626 * assumed_vf)))
3627 min_profitable_iters++;
3628 }
3629 }
3630 /* vector version will never be profitable. */
3631 else
3632 {
3633 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3634 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3635 "vectorization did not happen for a simd loop");
3636
3637 if (dump_enabled_p ())
3638 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3639 "cost model: the vector iteration cost = %d "
3640 "divided by the scalar iteration cost = %d "
3641 "is greater or equal to the vectorization factor = %d"
3642 ".\n",
3643 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3644 *ret_min_profitable_niters = -1;
3645 *ret_min_profitable_estimate = -1;
3646 return;
3647 }
3648
3649 if (dump_enabled_p ())
3650 dump_printf (MSG_NOTE,
3651 " Calculated minimum iters for profitability: %d\n",
3652 min_profitable_iters);
3653
3654 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3655 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3656 /* We want the vectorized loop to execute at least once. */
3657 min_profitable_iters = assumed_vf + peel_iters_prologue;
3658
3659 if (dump_enabled_p ())
3660 dump_printf_loc (MSG_NOTE, vect_location,
3661 " Runtime profitability threshold = %d\n",
3662 min_profitable_iters);
3663
3664 *ret_min_profitable_niters = min_profitable_iters;
3665
3666 /* Calculate number of iterations required to make the vector version
3667 profitable, relative to the loop bodies only.
3668
3669 Non-vectorized variant is SIC * niters and it must win over vector
3670 variant on the expected loop trip count. The following condition must hold true:
3671 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
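/* Continuing the assumed figures from the example above, the static
   estimate below is ((20 + 6) * 4) / (4 * 4 - 6) = 104 / 10 = 10,
   which then survives the MAX against the runtime threshold of 6.  */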
3672
3673 if (vec_outside_cost <= 0)
3674 min_profitable_estimate = 0;
3675 else
3676 {
3677 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3678 * assumed_vf
3679 - vec_inside_cost * peel_iters_prologue
3680 - vec_inside_cost * peel_iters_epilogue)
3681 / ((scalar_single_iter_cost * assumed_vf)
3682 - vec_inside_cost);
3683 }
3684 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3685 if (dump_enabled_p ())
3686 dump_printf_loc (MSG_NOTE, vect_location,
3687 " Static estimate profitability threshold = %d\n",
3688 min_profitable_estimate);
3689
3690 *ret_min_profitable_estimate = min_profitable_estimate;
3691 }
3692
3693 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3694 vector elements (not bits) for a vector with NELT elements. */
3695 static void
3696 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3697 vec_perm_builder *sel)
3698 {
3699 /* The encoding is a single stepped pattern. Any wrap-around is handled
3700 by vec_perm_indices. */
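/* For example (illustrative), with OFFSET == 2 and NELT == 8 the three
   encoded elements are {2, 3, 4}; vec_perm_indices extends this stepped
   pattern to {2, 3, ..., 9}, i.e. the last six lanes of the first input
   followed by the first two lanes of the second input -- a whole-vector
   shift down by two elements.  */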
3701 sel->new_vector (nelt, 1, 3);
3702 for (unsigned int i = 0; i < 3; i++)
3703 sel->quick_push (i + offset);
3704 }
3705
3706 /* Checks whether the target supports whole-vector shifts for vectors of mode
3707 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3708 it supports vec_perm_const with masks for all necessary shift amounts. */
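/* For instance (illustrative), for a constant-length 8-element vector
   the loop below checks permutation masks for shifts by 4, 2 and 1
   elements, halving each time.  */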
3709 static bool
3710 have_whole_vector_shift (machine_mode mode)
3711 {
3712 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3713 return true;
3714
3715 /* Variable-length vectors should be handled via the optab. */
3716 unsigned int nelt;
3717 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3718 return false;
3719
3720 vec_perm_builder sel;
3721 vec_perm_indices indices;
3722 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3723 {
3724 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3725 indices.new_vector (sel, 2, nelt);
3726 if (!can_vec_perm_const_p (mode, indices, false))
3727 return false;
3728 }
3729 return true;
3730 }
3731
3732 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3733 functions. Design better to avoid maintenance issues. */
3734
3735 /* Function vect_model_reduction_cost.
3736
3737 Models cost for a reduction operation, including the vector ops
3738 generated within the strip-mine loop, the initial definition before
3739 the loop, and the epilogue code that must be generated. */
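/* As a hedged illustration (assuming the target supports the reduction
   internal function): a plain sum reduction with ncopies == 1 is costed
   as one scalar_to_vec in the prologue (building the initial vector),
   one vector_stmt in the body (the vector add), and one vector_stmt
   plus one vec_to_scalar in the epilogue (the reduction and the extract
   of the scalar result).  */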
3740
3741 static void
3742 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3743 int ncopies, stmt_vector_for_cost *cost_vec)
3744 {
3745 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3746 enum tree_code code;
3747 optab optab;
3748 tree vectype;
3749 machine_mode mode;
3750 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3751 struct loop *loop = NULL;
3752
3753 if (loop_vinfo)
3754 loop = LOOP_VINFO_LOOP (loop_vinfo);
3755
3756 /* Condition reductions generate two reductions in the loop. */
3757 vect_reduction_type reduction_type
3758 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3759 if (reduction_type == COND_REDUCTION)
3760 ncopies *= 2;
3761
3762 vectype = STMT_VINFO_VECTYPE (stmt_info);
3763 mode = TYPE_MODE (vectype);
3764 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3765
3766 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3767
3768 if (reduction_type == EXTRACT_LAST_REDUCTION
3769 || reduction_type == FOLD_LEFT_REDUCTION)
3770 {
3771 /* No extra instructions needed in the prologue. */
3772 prologue_cost = 0;
3773
3774 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3775 /* Count one reduction-like operation per vector. */
3776 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3777 stmt_info, 0, vect_body);
3778 else
3779 {
3780 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3781 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3782 inside_cost = record_stmt_cost (cost_vec, nelements,
3783 vec_to_scalar, stmt_info, 0,
3784 vect_body);
3785 inside_cost += record_stmt_cost (cost_vec, nelements,
3786 scalar_stmt, stmt_info, 0,
3787 vect_body);
3788 }
3789 }
3790 else
3791 {
3792 /* Add in cost for initial definition.
3793 For cond reduction we have four vectors: initial index, step,
3794 initial result of the data reduction, initial value of the index
3795 reduction. */
3796 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3797 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3798 scalar_to_vec, stmt_info, 0,
3799 vect_prologue);
3800
3801 /* Cost of reduction op inside loop. */
3802 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3803 stmt_info, 0, vect_body);
3804 }
3805
3806 /* Determine cost of epilogue code.
3807
3808 We have a reduction operator that will reduce the vector in one statement.
3809 Also requires scalar extract. */
3810
3811 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
3812 {
3813 if (reduc_fn != IFN_LAST)
3814 {
3815 if (reduction_type == COND_REDUCTION)
3816 {
3817 /* An EQ stmt and a COND_EXPR stmt. */
3818 epilogue_cost += record_stmt_cost (cost_vec, 2,
3819 vector_stmt, stmt_info, 0,
3820 vect_epilogue);
3821 /* Reduction of the max index and a reduction of the found
3822 values. */
3823 epilogue_cost += record_stmt_cost (cost_vec, 2,
3824 vec_to_scalar, stmt_info, 0,
3825 vect_epilogue);
3826 /* A broadcast of the max value. */
3827 epilogue_cost += record_stmt_cost (cost_vec, 1,
3828 scalar_to_vec, stmt_info, 0,
3829 vect_epilogue);
3830 }
3831 else
3832 {
3833 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3834 stmt_info, 0, vect_epilogue);
3835 epilogue_cost += record_stmt_cost (cost_vec, 1,
3836 vec_to_scalar, stmt_info, 0,
3837 vect_epilogue);
3838 }
3839 }
3840 else if (reduction_type == COND_REDUCTION)
3841 {
3842 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3843 /* Extraction of scalar elements. */
3844 epilogue_cost += record_stmt_cost (cost_vec,
3845 2 * estimated_nunits,
3846 vec_to_scalar, stmt_info, 0,
3847 vect_epilogue);
3848 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
3849 epilogue_cost += record_stmt_cost (cost_vec,
3850 2 * estimated_nunits - 3,
3851 scalar_stmt, stmt_info, 0,
3852 vect_epilogue);
3853 }
3854 else if (reduction_type == EXTRACT_LAST_REDUCTION
3855 || reduction_type == FOLD_LEFT_REDUCTION)
3856 /* No extra instructions are needed in the epilogue. */
3857 ;
3858 else
3859 {
3860 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3861 tree bitsize =
3862 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
3863 int element_bitsize = tree_to_uhwi (bitsize);
3864 int nelements = vec_size_in_bits / element_bitsize;
3865
3866 if (code == COND_EXPR)
3867 code = MAX_EXPR;
3868
3869 optab = optab_for_tree_code (code, vectype, optab_default);
3870
3871 /* We have a whole vector shift available. */
3872 if (optab != unknown_optab
3873 && VECTOR_MODE_P (mode)
3874 && optab_handler (optab, mode) != CODE_FOR_nothing
3875 && have_whole_vector_shift (mode))
3876 {
3877 /* Final reduction via vector shifts and the reduction operator.
3878 Also requires scalar extract. */
3879 epilogue_cost += record_stmt_cost (cost_vec,
3880 exact_log2 (nelements) * 2,
3881 vector_stmt, stmt_info, 0,
3882 vect_epilogue);
3883 epilogue_cost += record_stmt_cost (cost_vec, 1,
3884 vec_to_scalar, stmt_info, 0,
3885 vect_epilogue);
3886 }
3887 else
3888 /* Use extracts and reduction op for final reduction. For N
3889 elements, we have N extracts and N-1 reduction ops. */
3890 epilogue_cost += record_stmt_cost (cost_vec,
3891 nelements + nelements - 1,
3892 vector_stmt, stmt_info, 0,
3893 vect_epilogue);
3894 }
3895 }
3896
3897 if (dump_enabled_p ())
3898 dump_printf (MSG_NOTE,
3899 "vect_model_reduction_cost: inside_cost = %d, "
3900 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3901 prologue_cost, epilogue_cost);
3902 }
3903
3904
3905 /* Function vect_model_induction_cost.
3906
3907 Models cost for induction operations. */
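/* For example (illustrative): a simple counted IV vectorized with
   ncopies == 2 records two vector_stmt body costs (the vector adds)
   and two scalar_to_vec prologue costs (building the initial vector
   and the step vector).  */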
3908
3909 static void
3910 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
3911 stmt_vector_for_cost *cost_vec)
3912 {
3913 unsigned inside_cost, prologue_cost;
3914
3915 if (PURE_SLP_STMT (stmt_info))
3916 return;
3917
3918 /* loop cost for vec_loop. */
3919 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3920 stmt_info, 0, vect_body);
3921
3922 /* prologue cost for vec_init and vec_step. */
3923 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
3924 stmt_info, 0, vect_prologue);
3925
3926 if (dump_enabled_p ())
3927 dump_printf_loc (MSG_NOTE, vect_location,
3928 "vect_model_induction_cost: inside_cost = %d, "
3929 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3930 }
3931
3932
3933
3934 /* Function get_initial_def_for_reduction
3935
3936 Input:
3937 STMT_VINFO - a stmt that performs a reduction operation in the loop.
3938 INIT_VAL - the initial value of the reduction variable
3939
3940 Output:
3941 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3942 of the reduction (used for adjusting the epilog - see below).
3943 Return a vector variable, initialized according to the operation that
3944 STMT_VINFO performs. This vector will be used as the initial value
3945 of the vector of partial results.
3946
3947 Option1 (adjust in epilog): Initialize the vector as follows:
3948 add/bit or/xor: [0,0,...,0,0]
3949 mult/bit and: [1,1,...,1,1]
3950 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3951 and when necessary (e.g. add/mult case) let the caller know
3952 that it needs to adjust the result by init_val.
3953
3954 Option2: Initialize the vector as follows:
3955 add/bit or/xor: [init_val,0,0,...,0]
3956 mult/bit and: [init_val,1,1,...,1]
3957 min/max/cond_expr: [init_val,init_val,...,init_val]
3958 and no adjustments are needed.
3959
3960 For example, for the following code:
3961
3962 s = init_val;
3963 for (i=0;i<n;i++)
3964 s = s + a[i];
3965
3966 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
3967 For a vector of 4 units, we want to return either [0,0,0,init_val],
3968 or [0,0,0,0] and let the caller know that it needs to adjust
3969 the result at the end by 'init_val'.
3970
3971 FORNOW, we use Option1 (adjust in epilog) when ADJUSTMENT_DEF is not
3972 NULL, because its initialization vector is simpler (same element in
3973 all entries), and Option2 otherwise.
3974
3975 A cost model should help decide between these two schemes. */
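/* A small illustration (values assumed): for a MULT_EXPR reduction with
   init_val == 5 and a 4-element vector, Option1 builds {1,1,1,1} and
   asks the caller to adjust the final result by 5, while Option2
   builds {5,1,1,1} and needs no adjustment.  */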
3976
3977 tree
3978 get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, tree init_val,
3979 tree *adjustment_def)
3980 {
3981 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3982 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3983 tree scalar_type = TREE_TYPE (init_val);
3984 tree vectype = get_vectype_for_scalar_type (scalar_type);
3985 enum tree_code code = gimple_assign_rhs_code (stmt_vinfo->stmt);
3986 tree def_for_init;
3987 tree init_def;
3988 REAL_VALUE_TYPE real_init_val = dconst0;
3989 int int_init_val = 0;
3990 gimple_seq stmts = NULL;
3991
3992 gcc_assert (vectype);
3993
3994 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3995 || SCALAR_FLOAT_TYPE_P (scalar_type));
3996
3997 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
3998 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
3999
4000 vect_reduction_type reduction_type
4001 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4002
4003 switch (code)
4004 {
4005 case WIDEN_SUM_EXPR:
4006 case DOT_PROD_EXPR:
4007 case SAD_EXPR:
4008 case PLUS_EXPR:
4009 case MINUS_EXPR:
4010 case BIT_IOR_EXPR:
4011 case BIT_XOR_EXPR:
4012 case MULT_EXPR:
4013 case BIT_AND_EXPR:
4014 {
4015 /* ADJUSTMENT_DEF is NULL when called from
4016 vect_create_epilog_for_reduction to vectorize double reduction. */
4017 if (adjustment_def)
4018 *adjustment_def = init_val;
4019
4020 if (code == MULT_EXPR)
4021 {
4022 real_init_val = dconst1;
4023 int_init_val = 1;
4024 }
4025
4026 if (code == BIT_AND_EXPR)
4027 int_init_val = -1;
4028
4029 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4030 def_for_init = build_real (scalar_type, real_init_val);
4031 else
4032 def_for_init = build_int_cst (scalar_type, int_init_val);
4033
4034 if (adjustment_def)
4035 /* Option1: the first element is '0' or '1' as well. */
4036 init_def = gimple_build_vector_from_val (&stmts, vectype,
4037 def_for_init);
4038 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4039 {
4040 /* Option2 (variable length): the first element is INIT_VAL. */
4041 init_def = gimple_build_vector_from_val (&stmts, vectype,
4042 def_for_init);
4043 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4044 vectype, init_def, init_val);
4045 }
4046 else
4047 {
4048 /* Option2: the first element is INIT_VAL. */
4049 tree_vector_builder elts (vectype, 1, 2);
4050 elts.quick_push (init_val);
4051 elts.quick_push (def_for_init);
4052 init_def = gimple_build_vector (&stmts, &elts);
4053 }
4054 }
4055 break;
4056
4057 case MIN_EXPR:
4058 case MAX_EXPR:
4059 case COND_EXPR:
4060 {
4061 if (adjustment_def)
4062 {
4063 *adjustment_def = NULL_TREE;
4064 if (reduction_type != COND_REDUCTION
4065 && reduction_type != EXTRACT_LAST_REDUCTION)
4066 {
4067 init_def = vect_get_vec_def_for_operand (init_val, stmt_vinfo);
4068 break;
4069 }
4070 }
4071 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4072 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4073 }
4074 break;
4075
4076 default:
4077 gcc_unreachable ();
4078 }
4079
4080 if (stmts)
4081 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4082 return init_def;
4083 }
4084
4085 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4086 NUMBER_OF_VECTORS is the number of vector defs to create.
4087 If NEUTRAL_OP is nonnull, introducing extra elements of that
4088 value will not change the result. */
4089
4090 static void
4091 get_initial_defs_for_reduction (slp_tree slp_node,
4092 vec<tree> *vec_oprnds,
4093 unsigned int number_of_vectors,
4094 bool reduc_chain, tree neutral_op)
4095 {
4096 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4097 stmt_vec_info stmt_vinfo = stmts[0];
4098 unsigned HOST_WIDE_INT nunits;
4099 unsigned j, number_of_places_left_in_vector;
4100 tree vector_type;
4101 unsigned int group_size = stmts.length ();
4102 unsigned int i;
4103 struct loop *loop;
4104
4105 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4106
4107 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4108
4109 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4110 gcc_assert (loop);
4111 edge pe = loop_preheader_edge (loop);
4112
4113 gcc_assert (!reduc_chain || neutral_op);
4114
4115 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4116 created vectors. It is greater than 1 if unrolling is performed.
4117
4118 For example, we have two scalar operands, s1 and s2 (e.g., group of
4119 strided accesses of size two), while NUNITS is four (i.e., four scalars
4120 of this type can be packed in a vector). The output vector will contain
4121 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4122 will be 2).
4123
4124 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4125 vectors containing the operands.
4126
4127 For example, NUNITS is four as before, and the group size is 8
4128 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4129 {s5, s6, s7, s8}. */
4130
4131 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4132 nunits = group_size;
4133
4134 number_of_places_left_in_vector = nunits;
4135 bool constant_p = true;
4136 tree_vector_builder elts (vector_type, nunits, 1);
4137 elts.quick_grow (nunits);
4138 gimple_seq ctor_seq = NULL;
4139 for (j = 0; j < nunits * number_of_vectors; ++j)
4140 {
4141 tree op;
4142 i = j % group_size;
4143 stmt_vinfo = stmts[i];
4144
4145 /* Get the def before the loop. In a reduction chain we have only
4146 one initial value; otherwise we have as many as there are PHIs in the group. */
4147 if (reduc_chain)
4148 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4149 else if (((vec_oprnds->length () + 1) * nunits
4150 - number_of_places_left_in_vector >= group_size)
4151 && neutral_op)
4152 op = neutral_op;
4153 else
4154 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4155
4156 /* Create 'vect_ = {op0,op1,...,opn}'. */
4157 number_of_places_left_in_vector--;
4158 elts[nunits - number_of_places_left_in_vector - 1] = op;
4159 if (!CONSTANT_CLASS_P (op))
4160 constant_p = false;
4161
4162 if (number_of_places_left_in_vector == 0)
4163 {
4164 tree init;
4165 if (constant_p && !neutral_op
4166 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4167 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4168 /* Build the vector directly from ELTS. */
4169 init = gimple_build_vector (&ctor_seq, &elts);
4170 else if (neutral_op)
4171 {
4172 /* Build a vector of the neutral value and shift the
4173 other elements into place. */
4174 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4175 neutral_op);
4176 int k = nunits;
4177 while (k > 0 && elts[k - 1] == neutral_op)
4178 k -= 1;
4179 while (k > 0)
4180 {
4181 k -= 1;
4182 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4183 vector_type, init, elts[k]);
4184 }
4185 }
4186 else
4187 {
4188 /* First time round, duplicate ELTS to fill the
4189 required number of vectors. */
4190 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4191 number_of_vectors, *vec_oprnds);
4192 break;
4193 }
4194 vec_oprnds->quick_push (init);
4195
4196 number_of_places_left_in_vector = nunits;
4197 elts.new_vector (vector_type, nunits, 1);
4198 elts.quick_grow (nunits);
4199 constant_p = true;
4200 }
4201 }
4202 if (ctor_seq != NULL)
4203 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4204 }
4205
4206
4207 /* Function vect_create_epilog_for_reduction
4208
4209 Create code at the loop-epilog to finalize the result of a reduction
4210 computation.
4211
4212 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4213 reduction statements.
4214 STMT_INFO is the scalar reduction stmt that is being vectorized.
4215 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4216 number of elements that we can fit in a vectype (nunits). In this case
4217 we have to generate more than one vector stmt - i.e - we need to "unroll"
4218 the vector stmt by a factor VF/nunits. For more details see documentation
4219 in vectorizable_operation.
4220 REDUC_FN is the internal function for the epilog reduction.
4221 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4222 computation.
4223 REDUC_INDEX is the index of the operand in the right hand side of the
4224 statement that is defined by REDUCTION_PHI.
4225 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4226 SLP_NODE is an SLP node containing a group of reduction statements. The
4227 first one in this group is STMT_INFO.
4228 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4229 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4230 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4231 any value of the IV in the loop.
4232 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4233 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4234 null if this is not an SLP reduction.
4235
4236 This function:
4237 1. Creates the reduction def-use cycles: sets the arguments for
4238 REDUCTION_PHIS:
4239 The loop-entry argument is the vectorized initial-value of the reduction.
4240 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4241 sums.
4242 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4243 by calling the function specified by REDUC_FN if available, or by
4244 other means (whole-vector shifts or a scalar loop).
4245 The function also creates a new phi node at the loop exit to preserve
4246 loop-closed form, as illustrated below.
4247
4248 The flow at the entry to this function:
4249
4250 loop:
4251 vec_def = phi <null, null> # REDUCTION_PHI
4252 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4253 s_loop = scalar_stmt # (scalar) STMT_INFO
4254 loop_exit:
4255 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4256 use <s_out0>
4257 use <s_out0>
4258
4259 The above is transformed by this function into:
4260
4261 loop:
4262 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4263 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4264 s_loop = scalar_stmt # (scalar) STMT_INFO
4265 loop_exit:
4266 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4267 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4268 v_out2 = reduce <v_out1>
4269 s_out3 = extract_field <v_out2, 0>
4270 s_out4 = adjust_result <s_out3>
4271 use <s_out4>
4272 use <s_out4>
4273 */
4274
4275 static void
4276 vect_create_epilog_for_reduction (vec<tree> vect_defs,
4277 stmt_vec_info stmt_info,
4278 gimple *reduc_def_stmt,
4279 int ncopies, internal_fn reduc_fn,
4280 vec<stmt_vec_info> reduction_phis,
4281 bool double_reduc,
4282 slp_tree slp_node,
4283 slp_instance slp_node_instance,
4284 tree induc_val, enum tree_code induc_code,
4285 tree neutral_op)
4286 {
4287 stmt_vec_info prev_phi_info;
4288 tree vectype;
4289 machine_mode mode;
4290 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4291 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4292 basic_block exit_bb;
4293 tree scalar_dest;
4294 tree scalar_type;
4295 gimple *new_phi = NULL, *phi;
4296 stmt_vec_info phi_info;
4297 gimple_stmt_iterator exit_gsi;
4298 tree vec_dest;
4299 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4300 gimple *epilog_stmt = NULL;
4301 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
4302 gimple *exit_phi;
4303 tree bitsize;
4304 tree adjustment_def = NULL;
4305 tree vec_initial_def = NULL;
4306 tree expr, def, initial_def = NULL;
4307 tree orig_name, scalar_result;
4308 imm_use_iterator imm_iter, phi_imm_iter;
4309 use_operand_p use_p, phi_use_p;
4310 gimple *use_stmt;
4311 stmt_vec_info reduction_phi_info = NULL;
4312 bool nested_in_vect_loop = false;
4313 auto_vec<gimple *> new_phis;
4314 auto_vec<stmt_vec_info> inner_phis;
4315 int j, i;
4316 auto_vec<tree> scalar_results;
4317 unsigned int group_size = 1, k, ratio;
4318 auto_vec<tree> vec_initial_defs;
4319 auto_vec<gimple *> phis;
4320 bool slp_reduc = false;
4321 bool direct_slp_reduc;
4322 tree new_phi_result;
4323 stmt_vec_info inner_phi = NULL;
4324 tree induction_index = NULL_TREE;
4325
4326 if (slp_node)
4327 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4328
4329 if (nested_in_vect_loop_p (loop, stmt_info))
4330 {
4331 outer_loop = loop;
4332 loop = loop->inner;
4333 nested_in_vect_loop = true;
4334 gcc_assert (!slp_node);
4335 }
4336
4337 vectype = STMT_VINFO_VECTYPE (stmt_info);
4338 gcc_assert (vectype);
4339 mode = TYPE_MODE (vectype);
4340
4341 /* 1. Create the reduction def-use cycle:
4342 Set the arguments of REDUCTION_PHIS, i.e., transform
4343
4344 loop:
4345 vec_def = phi <null, null> # REDUCTION_PHI
4346 VECT_DEF = vector_stmt # vectorized form of STMT
4347 ...
4348
4349 into:
4350
4351 loop:
4352 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4353 VECT_DEF = vector_stmt # vectorized form of STMT
4354 ...
4355
4356 (in case of SLP, do it for all the phis). */
4357
4358 /* Get the loop-entry arguments. */
4359 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4360 if (slp_node)
4361 {
4362 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4363 vec_initial_defs.reserve (vec_num);
4364 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4365 &vec_initial_defs, vec_num,
4366 REDUC_GROUP_FIRST_ELEMENT (stmt_info),
4367 neutral_op);
4368 }
4369 else
4370 {
4371 /* Get at the scalar def before the loop, that defines the initial value
4372 of the reduction variable. */
4373 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4374 loop_preheader_edge (loop));
4375 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4376 and we can't use zero for induc_val, use initial_def. Similarly
4377 for REDUC_MIN and initial_def larger than the base. */
4378 if (TREE_CODE (initial_def) == INTEGER_CST
4379 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4380 == INTEGER_INDUC_COND_REDUCTION)
4381 && !integer_zerop (induc_val)
4382 && ((induc_code == MAX_EXPR
4383 && tree_int_cst_lt (initial_def, induc_val))
4384 || (induc_code == MIN_EXPR
4385 && tree_int_cst_lt (induc_val, initial_def))))
4386 induc_val = initial_def;
4387
4388 if (double_reduc)
4389 /* In case of double reduction we only create a vector variable
4390 to be put in the reduction phi node. The actual statement
4391 creation is done later in this function. */
4392 vec_initial_def = vect_create_destination_var (initial_def, vectype);
4393 else if (nested_in_vect_loop)
4394 {
4395 /* Do not use an adjustment def as that case is not supported
4396 correctly if ncopies is not one. */
4397 vect_is_simple_use (initial_def, loop_vinfo, &initial_def_dt);
4398 vec_initial_def = vect_get_vec_def_for_operand (initial_def,
4399 stmt_info);
4400 }
4401 else
4402 vec_initial_def
4403 = get_initial_def_for_reduction (stmt_info, initial_def,
4404 &adjustment_def);
4405 vec_initial_defs.create (1);
4406 vec_initial_defs.quick_push (vec_initial_def);
4407 }
4408
4409 /* Set phi nodes arguments. */
4410 FOR_EACH_VEC_ELT (reduction_phis, i, phi_info)
4411 {
4412 tree vec_init_def = vec_initial_defs[i];
4413 tree def = vect_defs[i];
4414 for (j = 0; j < ncopies; j++)
4415 {
4416 if (j != 0)
4417 {
4418 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4419 if (nested_in_vect_loop)
4420 vec_init_def
4421 = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_init_def);
4422 }
4423
4424 /* Set the loop-entry arg of the reduction-phi. */
4425
4426 gphi *phi = as_a <gphi *> (phi_info->stmt);
4427 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4428 == INTEGER_INDUC_COND_REDUCTION)
4429 {
4430 /* Initialise the reduction phi to zero. This prevents non-zero
4431 initial values from interfering with the reduction op. */
4432 gcc_assert (ncopies == 1);
4433 gcc_assert (i == 0);
4434
4435 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4436 tree induc_val_vec
4437 = build_vector_from_val (vec_init_def_type, induc_val);
4438
4439 add_phi_arg (phi, induc_val_vec, loop_preheader_edge (loop),
4440 UNKNOWN_LOCATION);
4441 }
4442 else
4443 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
4444 UNKNOWN_LOCATION);
4445
4446 /* Set the loop-latch arg for the reduction-phi. */
4447 if (j > 0)
4448 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4449
4450 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
4451
4452 if (dump_enabled_p ())
4453 dump_printf_loc (MSG_NOTE, vect_location,
4454 "transform reduction: created def-use cycle: %G%G",
4455 phi, SSA_NAME_DEF_STMT (def));
4456 }
4457 }
4458
4459 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4460 which is updated with the current index of the loop for every match of
4461 the original loop's cond_expr (VEC_STMT). This results in a vector
4462 containing the last time the condition passed for that vector lane.
4463 The first match will be a 1 to allow 0 to be used for non-matching
4464 indexes. If there are no matches at all then the vector will be all
4465 zeroes. */
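/* Illustrative example (values assumed): with four lanes and three
   vector iterations, the induction variable below takes the values
   {1,2,3,4}, {5,6,7,8} and {9,10,11,12}.  If only lane 2's condition
   holds, in the first and third iterations, the final INDEX_COND_EXPR
   vector is {0, 0, 11, 0}.  */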
4466 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4467 {
4468 tree indx_before_incr, indx_after_incr;
4469 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4470
4471 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info)->stmt;
4472 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4473
4474 int scalar_precision
4475 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4476 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4477 tree cr_index_vector_type = build_vector_type
4478 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4479
4480 /* First we create a simple vector induction variable which starts
4481 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4482 vector size (STEP). */
4483
4484 /* Create a {1,2,3,...} vector. */
4485 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4486
4487 /* Create a vector of the step value. */
4488 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4489 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4490
4491 /* Create an induction variable. */
4492 gimple_stmt_iterator incr_gsi;
4493 bool insert_after;
4494 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4495 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4496 insert_after, &indx_before_incr, &indx_after_incr);
4497
4498 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4499 filled with zeros (VEC_ZERO). */
4500
4501 /* Create a vector of 0s. */
4502 tree zero = build_zero_cst (cr_index_scalar_type);
4503 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4504
4505 /* Create a vector phi node. */
4506 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4507 new_phi = create_phi_node (new_phi_tree, loop->header);
4508 loop_vinfo->add_stmt (new_phi);
4509 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4510 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4511
4512 /* Now take the condition from the loop's original cond_expr
4513 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4514 every match uses values from the induction variable
4515 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4516 (NEW_PHI_TREE).
4517 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4518 the new cond_expr (INDEX_COND_EXPR). */
4519
4520 /* Duplicate the condition from vec_stmt. */
4521 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4522
4523 /* Create a conditional, where the condition is taken from vec_stmt
4524 (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4525 else is the phi (NEW_PHI_TREE). */
4526 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4527 ccompare, indx_before_incr,
4528 new_phi_tree);
4529 induction_index = make_ssa_name (cr_index_vector_type);
4530 gimple *index_condition = gimple_build_assign (induction_index,
4531 index_cond_expr);
4532 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4533 stmt_vec_info index_vec_info = loop_vinfo->add_stmt (index_condition);
4534 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4535
4536 /* Update the phi with the vec cond. */
4537 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4538 loop_latch_edge (loop), UNKNOWN_LOCATION);
4539 }
4540
4541 /* 2. Create epilog code.
4542 The reduction epilog code operates across the elements of the vector
4543 of partial results computed by the vectorized loop.
4544 The reduction epilog code consists of:
4545
4546 step 1: compute the scalar result in a vector (v_out2)
4547 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4548 step 3: adjust the scalar result (s_out3) if needed.
4549
4550 Step 1 can be accomplished using one of the following three schemes:
4551 (scheme 1) using reduc_fn, if available.
4552 (scheme 2) using whole-vector shifts, if available.
4553 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4554 combined.
4555
4556 The overall epilog code looks like this:
4557
4558 s_out0 = phi <s_loop> # original EXIT_PHI
4559 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4560 v_out2 = reduce <v_out1> # step 1
4561 s_out3 = extract_field <v_out2, 0> # step 2
4562 s_out4 = adjust_result <s_out3> # step 3
4563
4564 (step 3 is optional, and steps 1 and 2 may be combined).
4565 Lastly, the uses of s_out0 are replaced by s_out4. */
4566
4567
4568 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4569 v_out1 = phi <VECT_DEF>
4570 Store them in NEW_PHIS. */
4571
4572 exit_bb = single_exit (loop)->dest;
4573 prev_phi_info = NULL;
4574 new_phis.create (vect_defs.length ());
4575 FOR_EACH_VEC_ELT (vect_defs, i, def)
4576 {
4577 for (j = 0; j < ncopies; j++)
4578 {
4579 tree new_def = copy_ssa_name (def);
4580 phi = create_phi_node (new_def, exit_bb);
4581 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4582 if (j == 0)
4583 new_phis.quick_push (phi);
4584 else
4585 {
4586 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4587 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4588 }
4589
4590 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4591 prev_phi_info = phi_info;
4592 }
4593 }
4594
4595 /* The epilogue is created for the outer-loop, i.e., for the loop being
4596 vectorized. Create exit phis for the outer loop. */
4597 if (double_reduc)
4598 {
4599 loop = outer_loop;
4600 exit_bb = single_exit (loop)->dest;
4601 inner_phis.create (vect_defs.length ());
4602 FOR_EACH_VEC_ELT (new_phis, i, phi)
4603 {
4604 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
4605 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4606 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4607 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4608 PHI_RESULT (phi));
4609 prev_phi_info = loop_vinfo->add_stmt (outer_phi);
4610 inner_phis.quick_push (phi_info);
4611 new_phis[i] = outer_phi;
4612 while (STMT_VINFO_RELATED_STMT (phi_info))
4613 {
4614 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4615 new_result = copy_ssa_name (PHI_RESULT (phi_info->stmt));
4616 outer_phi = create_phi_node (new_result, exit_bb);
4617 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4618 PHI_RESULT (phi_info->stmt));
4619 stmt_vec_info outer_phi_info = loop_vinfo->add_stmt (outer_phi);
4620 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi_info;
4621 prev_phi_info = outer_phi_info;
4622 }
4623 }
4624 }
4625
4626 exit_gsi = gsi_after_labels (exit_bb);
4627
4628 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4629 (i.e. when reduc_fn is not available) and in the final adjustment
4630 code (if needed). Also get the original scalar reduction variable as
4631 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4632 represents a reduction pattern), the tree-code and scalar-def are
4633 taken from the original stmt that the pattern-stmt (STMT) replaces.
4634 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4635 are taken from STMT. */
4636
4637 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4638 if (orig_stmt_info != stmt_info)
4639 {
4640 /* Reduction pattern */
4641 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4642 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4643 }
4644
4645 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4646 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4647 partial results are added and not subtracted. */
4648 if (code == MINUS_EXPR)
4649 code = PLUS_EXPR;
4650
4651 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4652 scalar_type = TREE_TYPE (scalar_dest);
4653 scalar_results.create (group_size);
4654 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4655 bitsize = TYPE_SIZE (scalar_type);
4656
4657 /* In case this is a reduction in an inner-loop while vectorizing an outer
4658 loop - we don't need to extract a single scalar result at the end of the
4659 inner-loop (unless it is double reduction, i.e., the use of reduction is
4660 outside the outer-loop). The final vector of partial results will be used
4661 in the vectorized outer-loop, or reduced to a scalar result at the end of
4662 the outer-loop. */
4663 if (nested_in_vect_loop && !double_reduc)
4664 goto vect_finalize_reduction;
4665
4666 /* SLP reduction without reduction chain, e.g.,
4667 # a1 = phi <a2, a0>
4668 # b1 = phi <b2, b0>
4669 a2 = operation (a1)
4670 b2 = operation (b1) */
4671 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4672
4673 /* True if we should implement SLP_REDUC using native reduction operations
4674 instead of scalar operations. */
4675 direct_slp_reduc = (reduc_fn != IFN_LAST
4676 && slp_reduc
4677 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4678
4679 /* In case of reduction chain, e.g.,
4680 # a1 = phi <a3, a0>
4681 a2 = operation (a1)
4682 a3 = operation (a2),
4683
4684 we may end up with more than one vector result. Here we reduce them to
4685 one vector. */
4686 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
4687 {
4688 tree first_vect = PHI_RESULT (new_phis[0]);
4689 gassign *new_vec_stmt = NULL;
4690 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4691 for (k = 1; k < new_phis.length (); k++)
4692 {
4693 gimple *next_phi = new_phis[k];
4694 tree second_vect = PHI_RESULT (next_phi);
4695 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4696 new_vec_stmt = gimple_build_assign (tem, code,
4697 first_vect, second_vect);
4698 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4699 first_vect = tem;
4700 }
4701
4702 new_phi_result = first_vect;
4703 if (new_vec_stmt)
4704 {
4705 new_phis.truncate (0);
4706 new_phis.safe_push (new_vec_stmt);
4707 }
4708 }
4709 /* Likewise if we couldn't use a single defuse cycle. */
4710 else if (ncopies > 1)
4711 {
4712 gcc_assert (new_phis.length () == 1);
4713 tree first_vect = PHI_RESULT (new_phis[0]);
4714 gassign *new_vec_stmt = NULL;
4715 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4716 stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
4717 for (int k = 1; k < ncopies; ++k)
4718 {
4719 next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
4720 tree second_vect = PHI_RESULT (next_phi_info->stmt);
4721 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4722 new_vec_stmt = gimple_build_assign (tem, code,
4723 first_vect, second_vect);
4724 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4725 first_vect = tem;
4726 }
4727 new_phi_result = first_vect;
4728 new_phis.truncate (0);
4729 new_phis.safe_push (new_vec_stmt);
4730 }
4731 else
4732 new_phi_result = PHI_RESULT (new_phis[0]);
4733
4734 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4735 && reduc_fn != IFN_LAST)
4736 {
4737 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4738 various data values where the condition matched and another vector
4739 (INDUCTION_INDEX) containing all the indexes of those matches. We
4740 need to extract the last matching index (which will be the index with
4741 highest value) and use this to index into the data vector.
4742 For the case where there were no matches, the data vector will contain
4743 all default values and the index vector will be all zeros. */
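/* Illustrative example (values assumed): NEW_PHI_RESULT = {d0, d1, d2, d3}
   and INDUCTION_INDEX = {0, 7, 4, 0}.  The max index is 7, the compare
   selects {0, d1, 0, 0}, and the final unsigned MAX reduction returns
   d1.  */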
4744
4745 /* Get various versions of the type of the vector of indexes. */
4746 tree index_vec_type = TREE_TYPE (induction_index);
4747 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4748 tree index_scalar_type = TREE_TYPE (index_vec_type);
4749 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4750 (index_vec_type);
4751
4752 /* Get an unsigned integer version of the type of the data vector. */
4753 int scalar_precision
4754 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4755 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4756 tree vectype_unsigned = build_vector_type
4757 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4758
4759 /* First we need to create a vector (ZERO_VEC) of zeros and another
4760 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4761 can create using a MAX reduction and then expanding.
4762 In the case where the loop never made any matches, the max index will
4763 be zero. */
4764
4765 /* Vector of {0, 0, 0,...}. */
4766 tree zero_vec = make_ssa_name (vectype);
4767 tree zero_vec_rhs = build_zero_cst (vectype);
4768 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4769 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4770
4771 /* Find maximum value from the vector of found indexes. */
4772 tree max_index = make_ssa_name (index_scalar_type);
4773 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4774 1, induction_index);
4775 gimple_call_set_lhs (max_index_stmt, max_index);
4776 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4777
4778 /* Vector of {max_index, max_index, max_index,...}. */
4779 tree max_index_vec = make_ssa_name (index_vec_type);
4780 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4781 max_index);
4782 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4783 max_index_vec_rhs);
4784 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4785
4786 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4787 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4788 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4789 otherwise. Only one value should match, resulting in a vector
4790 (VEC_COND) with one data value and the rest zeros.
4791 In the case where the loop never made any matches, every index will
4792 match, resulting in a vector with all data values (which will all be
4793 the default value). */
4794
4795 /* Compare the max index vector to the vector of found indexes to find
4796 the position of the max value. */
4797 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4798 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4799 induction_index,
4800 max_index_vec);
4801 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4802
4803 /* Use the compare to choose either values from the data vector or
4804 zero. */
4805 tree vec_cond = make_ssa_name (vectype);
4806 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4807 vec_compare, new_phi_result,
4808 zero_vec);
4809 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4810
4811 /* Finally we need to extract the data value from the vector (VEC_COND)
4812 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
4813 reduction, but because this doesn't exist, we can use a MAX reduction
4814 instead. The data value might be signed or a float so we need to cast
4815 it first.
4816 In the case where the loop never made any matches, the data values are
4817 all identical, and so will reduce down correctly. */
4818
4819 /* Make the matched data values unsigned. */
4820 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4821 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4822 vec_cond);
4823 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4824 VIEW_CONVERT_EXPR,
4825 vec_cond_cast_rhs);
4826 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4827
4828 /* Reduce down to a scalar value. */
4829 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4830 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4831 1, vec_cond_cast);
4832 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4833 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4834
4835 /* Convert the reduced value back to the result type and set as the
4836 result. */
4837 gimple_seq stmts = NULL;
4838 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4839 data_reduc);
4840 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4841 scalar_results.safe_push (new_temp);
4842 }
4843 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4844 && reduc_fn == IFN_LAST)
4845 {
4846 /* Condition reduction without supported IFN_REDUC_MAX. Generate
4847 idx = 0;
4848 idx_val = induction_index[0];
4849 val = data_reduc[0];
4850 for (idx = 0, val = init, i = 0; i < nelts; ++i)
4851 if (induction_index[i] > idx_val)
4852 val = data_reduc[i], idx_val = induction_index[i];
4853 return val; */
4854
4855 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4856 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4857 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4858 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4859 /* Enforced by vectorizable_reduction, which ensures we have target
4860 support before allowing a conditional reduction on variable-length
4861 vectors. */
4862 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
4863 tree idx_val = NULL_TREE, val = NULL_TREE;
4864 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4865 {
4866 tree old_idx_val = idx_val;
4867 tree old_val = val;
4868 idx_val = make_ssa_name (idx_eltype);
4869 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4870 build3 (BIT_FIELD_REF, idx_eltype,
4871 induction_index,
4872 bitsize_int (el_size),
4873 bitsize_int (off)));
4874 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4875 val = make_ssa_name (data_eltype);
4876 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4877 build3 (BIT_FIELD_REF,
4878 data_eltype,
4879 new_phi_result,
4880 bitsize_int (el_size),
4881 bitsize_int (off)));
4882 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4883 if (off != 0)
4884 {
4885 tree new_idx_val = idx_val;
4886 tree new_val = val;
4887 if (off != v_size - el_size)
4888 {
4889 new_idx_val = make_ssa_name (idx_eltype);
4890 epilog_stmt = gimple_build_assign (new_idx_val,
4891 MAX_EXPR, idx_val,
4892 old_idx_val);
4893 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4894 }
4895 new_val = make_ssa_name (data_eltype);
4896 epilog_stmt = gimple_build_assign (new_val,
4897 COND_EXPR,
4898 build2 (GT_EXPR,
4899 boolean_type_node,
4900 idx_val,
4901 old_idx_val),
4902 val, old_val);
4903 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4904 idx_val = new_idx_val;
4905 val = new_val;
4906 }
4907 }
4908 /* Convert the reduced value back to the result type and set as the
4909 result. */
4910 gimple_seq stmts = NULL;
4911 val = gimple_convert (&stmts, scalar_type, val);
4912 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4913 scalar_results.safe_push (val);
4914 }
4915
4916 /* 2.3 Create the reduction code, using one of the three schemes described
4917 above. In SLP we simply need to extract all the elements from the
4918 vector (without reducing them), so we use scalar shifts. */
4919 else if (reduc_fn != IFN_LAST && !slp_reduc)
4920 {
4921 tree tmp;
4922 tree vec_elem_type;
4923
4924 /* Case 1: Create:
4925 v_out2 = reduc_expr <v_out1> */
4926
4927 if (dump_enabled_p ())
4928 dump_printf_loc (MSG_NOTE, vect_location,
4929 "Reduce using direct vector reduction.\n");
4930
4931 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4932 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4933 {
4934 tree tmp_dest
4935 = vect_create_destination_var (scalar_dest, vec_elem_type);
4936 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
4937 new_phi_result);
4938 gimple_set_lhs (epilog_stmt, tmp_dest);
4939 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4940 gimple_set_lhs (epilog_stmt, new_temp);
4941 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4942
4943 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
4944 new_temp);
4945 }
4946 else
4947 {
4948 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
4949 new_phi_result);
4950 gimple_set_lhs (epilog_stmt, new_scalar_dest);
4951 }
4952
4953 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4954 gimple_set_lhs (epilog_stmt, new_temp);
4955 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4956
4957 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4958 == INTEGER_INDUC_COND_REDUCTION)
4959 && !operand_equal_p (initial_def, induc_val, 0))
4960 {
4961 /* Earlier we set the initial value to be a vector of induc_val
4962 values. Check the result and if it is induc_val then replace
4963 with the original initial value, unless induc_val is
4964 the same as initial_def already. */
4965 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
4966 induc_val);
4967
4968 tmp = make_ssa_name (new_scalar_dest);
4969 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
4970 initial_def, new_temp);
4971 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4972 new_temp = tmp;
4973 }
4974
4975 scalar_results.safe_push (new_temp);
4976 }
4977 else if (direct_slp_reduc)
4978 {
4979 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
4980 with the elements for other SLP statements replaced with the
4981 neutral value. We can then do a normal reduction on each vector. */
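/* Illustrative example (values assumed): with group_size == 2 and
   NEW_PHI_RESULT = {a0, b0, a1, b1}, the masked index vector below is
   {0, 1, 0, 1}, so result 0 reduces {a0, N, a1, N} and result 1
   reduces {N, b0, N, b1}, where N is the neutral value (or the
   original initial value when there is no neutral value).  */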
4982
4983 /* Enforced by vectorizable_reduction. */
4984 gcc_assert (new_phis.length () == 1);
4985 gcc_assert (pow2p_hwi (group_size));
4986
4987 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
4988 vec<stmt_vec_info> orig_phis
4989 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
4990 gimple_seq seq = NULL;
4991
4992 /* Build a vector {0, 1, 2, ...}, with the same number of elements
4993 and the same element size as VECTYPE. */
4994 tree index = build_index_vector (vectype, 0, 1);
4995 tree index_type = TREE_TYPE (index);
4996 tree index_elt_type = TREE_TYPE (index_type);
4997 tree mask_type = build_same_sized_truth_vector_type (index_type);
4998
4999 /* Create a vector that, for each element, identifies which of
5000 the REDUC_GROUP_SIZE results should use it. */
5001 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5002 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5003 build_vector_from_val (index_type, index_mask));
5004
5005 /* Get a neutral vector value. This is simply a splat of the neutral
5006 scalar value if we have one, otherwise the initial scalar value
5007 is itself a neutral value. */
5008 tree vector_identity = NULL_TREE;
5009 if (neutral_op)
5010 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5011 neutral_op);
5012 for (unsigned int i = 0; i < group_size; ++i)
5013 {
5014 /* If there's no universal neutral value, we can use the
5015 initial scalar value from the original PHI. This is used
5016 for MIN and MAX reductions, for example. */
5017 if (!neutral_op)
5018 {
5019 tree scalar_value
5020 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5021 loop_preheader_edge (loop));
5022 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5023 scalar_value);
5024 }
5025
5026 /* Calculate the equivalent of:
5027
5028 sel[j] = (index[j] == i);
5029
5030 which selects the elements of NEW_PHI_RESULT that should
5031 be included in the result. */
5032 tree compare_val = build_int_cst (index_elt_type, i);
5033 compare_val = build_vector_from_val (index_type, compare_val);
5034 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5035 index, compare_val);
5036
5037 /* Calculate the equivalent of:
5038
5039 vec = sel ? new_phi_result : vector_identity;
5040
5041 VEC is now suitable for a full vector reduction. */
5042 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5043 sel, new_phi_result, vector_identity);
5044
5045 /* Do the reduction and convert it to the appropriate type. */
5046 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5047 TREE_TYPE (vectype), vec);
5048 scalar = gimple_convert (&seq, scalar_type, scalar);
5049 scalar_results.safe_push (scalar);
5050 }
5051 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5052 }
5053 else
5054 {
5055 bool reduce_with_shift;
5056 tree vec_temp;
5057
5058 /* COND reductions all do the final reduction with MAX_EXPR
5059 or MIN_EXPR. */
5060 if (code == COND_EXPR)
5061 {
5062 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5063 == INTEGER_INDUC_COND_REDUCTION)
5064 code = induc_code;
5065 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5066 == CONST_COND_REDUCTION)
5067 code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
5068 else
5069 code = MAX_EXPR;
5070 }
5071
5072 /* See if the target wants to do the final (shift) reduction
5073 in a vector mode of smaller size and first reduce upper/lower
5074 halves against each other. */
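/* Illustrative sketch (target-dependent, not a literal dump): with a
   V8SI accumulator and a target whose split_reduction hook prefers
   V4SI, the loop below first emits

     lo = BIT_FIELD_REF <v_out1, 128, 0>;
     hi = BIT_FIELD_REF <v_out1, 128, 128>;
     v_out1' = lo + hi;   // combined with the reduction CODE, here PLUS

   and only then performs the shift-based or scalar reduction on the
   narrower V4SI value.  */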
5075 enum machine_mode mode1 = mode;
5076 tree vectype1 = vectype;
5077 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5078 unsigned sz1 = sz;
5079 if (!slp_reduc
5080 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5081 sz1 = GET_MODE_SIZE (mode1).to_constant ();
5082
5083 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5084 reduce_with_shift = have_whole_vector_shift (mode1);
5085 if (!VECTOR_MODE_P (mode1))
5086 reduce_with_shift = false;
5087 else
5088 {
5089 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5090 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5091 reduce_with_shift = false;
5092 }
5093
5094 /* First reduce the vector to the desired size we should do the
5095 shift reduction on, by combining upper and lower halves. */
5096 new_temp = new_phi_result;
5097 while (sz > sz1)
5098 {
5099 gcc_assert (!slp_reduc);
5100 sz /= 2;
5101 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5102
5103 /* The target has to make sure we support lowpart/highpart
5104 extraction, either via direct vector extract or through
5105 integer mode punning. */
5106 tree dst1, dst2;
5107 if (convert_optab_handler (vec_extract_optab,
5108 TYPE_MODE (TREE_TYPE (new_temp)),
5109 TYPE_MODE (vectype1))
5110 != CODE_FOR_nothing)
5111 {
5112 /* Extract sub-vectors directly once vec_extract becomes
5113 a conversion optab. */
5114 dst1 = make_ssa_name (vectype1);
5115 epilog_stmt
5116 = gimple_build_assign (dst1, BIT_FIELD_REF,
5117 build3 (BIT_FIELD_REF, vectype1,
5118 new_temp, TYPE_SIZE (vectype1),
5119 bitsize_int (0)));
5120 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5121 dst2 = make_ssa_name (vectype1);
5122 epilog_stmt
5123 = gimple_build_assign (dst2, BIT_FIELD_REF,
5124 build3 (BIT_FIELD_REF, vectype1,
5125 new_temp, TYPE_SIZE (vectype1),
5126 bitsize_int (sz * BITS_PER_UNIT)));
5127 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5128 }
5129 else
5130 {
5131 /* Extract via punning to appropriately sized integer mode
5132 vector. */
5133 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5134 1);
5135 tree etype = build_vector_type (eltype, 2);
5136 gcc_assert (convert_optab_handler (vec_extract_optab,
5137 TYPE_MODE (etype),
5138 TYPE_MODE (eltype))
5139 != CODE_FOR_nothing);
5140 tree tem = make_ssa_name (etype);
5141 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5142 build1 (VIEW_CONVERT_EXPR,
5143 etype, new_temp));
5144 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5145 new_temp = tem;
5146 tem = make_ssa_name (eltype);
5147 epilog_stmt
5148 = gimple_build_assign (tem, BIT_FIELD_REF,
5149 build3 (BIT_FIELD_REF, eltype,
5150 new_temp, TYPE_SIZE (eltype),
5151 bitsize_int (0)));
5152 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5153 dst1 = make_ssa_name (vectype1);
5154 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5155 build1 (VIEW_CONVERT_EXPR,
5156 vectype1, tem));
5157 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5158 tem = make_ssa_name (eltype);
5159 epilog_stmt
5160 = gimple_build_assign (tem, BIT_FIELD_REF,
5161 build3 (BIT_FIELD_REF, eltype,
5162 new_temp, TYPE_SIZE (eltype),
5163 bitsize_int (sz * BITS_PER_UNIT)));
5164 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5165 dst2 = make_ssa_name (vectype1);
5166 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5167 build1 (VIEW_CONVERT_EXPR,
5168 vectype1, tem));
5169 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5170 }
5171
5172 new_temp = make_ssa_name (vectype1);
5173 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5174 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5175 }
5176
5177 if (reduce_with_shift && !slp_reduc)
5178 {
5179 int element_bitsize = tree_to_uhwi (bitsize);
5180 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5181 for variable-length vectors and also requires direct target support
5182 for loop reductions. */
5183 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5184 int nelements = vec_size_in_bits / element_bitsize;
5185 vec_perm_builder sel;
5186 vec_perm_indices indices;
5187
5188 int elt_offset;
5189
5190 tree zero_vec = build_zero_cst (vectype1);
5191 /* Case 2: Create:
5192 for (offset = nelements/2; offset >= 1; offset/=2)
5193 {
5194 Create: va' = vec_shift <va, offset>
5195 Create: va = vop <va, va'>
5196 } */
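/* Concrete sketch, assuming four 32-bit elements, PLUS_EXPR and the
   zero vector supplying the shifted-in lanes (the permute masks come
   from calc_vec_perm_mask_for_shift):

     va' = VEC_PERM_EXPR <va, {0,0,0,0}, {2,3,4,5}>;   // shift by 2
     va  = va + va';
     va' = VEC_PERM_EXPR <va, {0,0,0,0}, {1,2,3,4}>;   // shift by 1
     va  = va + va';

   after which element 0 of VA holds the full sum and is extracted in
   step 2.4 below.  */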
5197
5198 tree rhs;
5199
5200 if (dump_enabled_p ())
5201 dump_printf_loc (MSG_NOTE, vect_location,
5202 "Reduce using vector shifts\n");
5203
5204 mode1 = TYPE_MODE (vectype1);
5205 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5206 for (elt_offset = nelements / 2;
5207 elt_offset >= 1;
5208 elt_offset /= 2)
5209 {
5210 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5211 indices.new_vector (sel, 2, nelements);
5212 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5213 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5214 new_temp, zero_vec, mask);
5215 new_name = make_ssa_name (vec_dest, epilog_stmt);
5216 gimple_assign_set_lhs (epilog_stmt, new_name);
5217 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5218
5219 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5220 new_temp);
5221 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5222 gimple_assign_set_lhs (epilog_stmt, new_temp);
5223 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5224 }
5225
5226 /* 2.4 Extract the final scalar result. Create:
5227 s_out3 = extract_field <v_out2, bitpos> */
5228
5229 if (dump_enabled_p ())
5230 dump_printf_loc (MSG_NOTE, vect_location,
5231 "extract scalar result\n");
5232
5233 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5234 bitsize, bitsize_zero_node);
5235 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5236 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5237 gimple_assign_set_lhs (epilog_stmt, new_temp);
5238 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5239 scalar_results.safe_push (new_temp);
5240 }
5241 else
5242 {
5243 /* Case 3: Create:
5244 s = extract_field <v_out2, 0>
5245 for (offset = element_size;
5246 offset < vector_size;
5247 offset += element_size;)
5248 {
5249 Create: s' = extract_field <v_out2, offset>
5250 Create: s = op <s, s'> // For non SLP cases
5251 } */
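/* Sketch for a single V4SI vector and PLUS_EXPR in the non-SLP case
   (offsets are in bits):

     s  = BIT_FIELD_REF <v_out2, 32, 0>;
     s' = BIT_FIELD_REF <v_out2, 32, 32>;  s = s + s';
     s' = BIT_FIELD_REF <v_out2, 32, 64>;  s = s + s';
     s' = BIT_FIELD_REF <v_out2, 32, 96>;  s = s + s';

   For SLP the combining assignments are omitted and each extracted s'
   is pushed onto SCALAR_RESULTS instead.  */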
5252
5253 if (dump_enabled_p ())
5254 dump_printf_loc (MSG_NOTE, vect_location,
5255 "Reduce using scalar code.\n");
5256
5257 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5258 int element_bitsize = tree_to_uhwi (bitsize);
5259 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5260 {
5261 int bit_offset;
5262 if (gimple_code (new_phi) == GIMPLE_PHI)
5263 vec_temp = PHI_RESULT (new_phi);
5264 else
5265 vec_temp = gimple_assign_lhs (new_phi);
5266 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5267 bitsize_zero_node);
5268 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5269 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5270 gimple_assign_set_lhs (epilog_stmt, new_temp);
5271 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5272
5273 /* In SLP we don't need to apply reduction operation, so we just
5274 collect s' values in SCALAR_RESULTS. */
5275 if (slp_reduc)
5276 scalar_results.safe_push (new_temp);
5277
5278 for (bit_offset = element_bitsize;
5279 bit_offset < vec_size_in_bits;
5280 bit_offset += element_bitsize)
5281 {
5282 tree bitpos = bitsize_int (bit_offset);
5283 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5284 bitsize, bitpos);
5285
5286 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5287 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5288 gimple_assign_set_lhs (epilog_stmt, new_name);
5289 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5290
5291 if (slp_reduc)
5292 {
5293 /* In SLP we don't need to apply reduction operation, so
5294 we just collect s' values in SCALAR_RESULTS. */
5295 new_temp = new_name;
5296 scalar_results.safe_push (new_name);
5297 }
5298 else
5299 {
5300 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5301 new_name, new_temp);
5302 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5303 gimple_assign_set_lhs (epilog_stmt, new_temp);
5304 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5305 }
5306 }
5307 }
5308
5309 /* The only case where we need to reduce scalar results in SLP is
5310 unrolling. If the size of SCALAR_RESULTS is greater than
5311 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5312 REDUC_GROUP_SIZE. */
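/* For instance (a sketch), with REDUC_GROUP_SIZE == 2 and four scalar
   results s0..s3 coming from an unrolled SLP instance, the loop below
   leaves

     scalar_results[0] = s0 op s2;
     scalar_results[1] = s1 op s3;

   i.e. the extra results are folded back modulo the group size.  */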
5313 if (slp_reduc)
5314 {
5315 tree res, first_res, new_res;
5316 gimple *new_stmt;
5317
5318 /* Reduce multiple scalar results in case of SLP unrolling. */
5319 for (j = group_size; scalar_results.iterate (j, &res);
5320 j++)
5321 {
5322 first_res = scalar_results[j % group_size];
5323 new_stmt = gimple_build_assign (new_scalar_dest, code,
5324 first_res, res);
5325 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5326 gimple_assign_set_lhs (new_stmt, new_res);
5327 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5328 scalar_results[j % group_size] = new_res;
5329 }
5330 }
5331 else
5332 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5333 scalar_results.safe_push (new_temp);
5334 }
5335
5336 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5337 == INTEGER_INDUC_COND_REDUCTION)
5338 && !operand_equal_p (initial_def, induc_val, 0))
5339 {
5340 /* Earlier we set the initial value to be a vector of induc_val
5341 values. Check the result and if it is induc_val then replace
5342 it with the original initial value, unless induc_val is
5343 the same as initial_def already. */
5344 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5345 induc_val);
5346
5347 tree tmp = make_ssa_name (new_scalar_dest);
5348 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5349 initial_def, new_temp);
5350 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5351 scalar_results[0] = tmp;
5352 }
5353 }
5354
5355 vect_finalize_reduction:
5356
5357 if (double_reduc)
5358 loop = loop->inner;
5359
5360 /* 2.5 Adjust the final result by the initial value of the reduction
5361 variable. (When such adjustment is not needed, then
5362 'adjustment_def' is zero). For example, if code is PLUS we create:
5363 new_temp = loop_exit_def + adjustment_def */
5364
5365 if (adjustment_def)
5366 {
5367 gcc_assert (!slp_reduc);
5368 if (nested_in_vect_loop)
5369 {
5370 new_phi = new_phis[0];
5371 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5372 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5373 new_dest = vect_create_destination_var (scalar_dest, vectype);
5374 }
5375 else
5376 {
5377 new_temp = scalar_results[0];
5378 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5379 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5380 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5381 }
5382
5383 epilog_stmt = gimple_build_assign (new_dest, expr);
5384 new_temp = make_ssa_name (new_dest, epilog_stmt);
5385 gimple_assign_set_lhs (epilog_stmt, new_temp);
5386 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5387 if (nested_in_vect_loop)
5388 {
5389 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5390 STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5391 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5392
5393 if (!double_reduc)
5394 scalar_results.quick_push (new_temp);
5395 else
5396 scalar_results[0] = new_temp;
5397 }
5398 else
5399 scalar_results[0] = new_temp;
5400
5401 new_phis[0] = epilog_stmt;
5402 }
5403
5404 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5405 phis with new adjusted scalar results, i.e., replace use <s_out0>
5406 with use <s_out4>.
5407
5408 Transform:
5409 loop_exit:
5410 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5411 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5412 v_out2 = reduce <v_out1>
5413 s_out3 = extract_field <v_out2, 0>
5414 s_out4 = adjust_result <s_out3>
5415 use <s_out0>
5416 use <s_out0>
5417
5418 into:
5419
5420 loop_exit:
5421 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5422 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5423 v_out2 = reduce <v_out1>
5424 s_out3 = extract_field <v_out2, 0>
5425 s_out4 = adjust_result <s_out3>
5426 use <s_out4>
5427 use <s_out4> */
5428
5429
5430 /* In an SLP reduction chain we reduce vector results into one vector if
5431 necessary, hence we set REDUC_GROUP_SIZE to 1 here. SCALAR_DEST is the
5432 LHS of the last stmt in the reduction chain, since we are looking for
5433 the loop exit phi node. */
5434 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5435 {
5436 stmt_vec_info dest_stmt_info
5437 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5438 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5439 group_size = 1;
5440 }
5441
5442 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5443 case REDUC_GROUP_SIZE is greater than the vectorization factor).
5444 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5445 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5446 correspond to the first vector stmt, etc.
5447 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
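/* E.g. (a sketch) with REDUC_GROUP_SIZE == 8 and two vector statements
   in NEW_PHIS, RATIO is 4: scalar results 0-3 are matched with the
   first vector statement and scalar results 4-7 with the second.  */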
5448 if (group_size > new_phis.length ())
5449 {
5450 ratio = group_size / new_phis.length ();
5451 gcc_assert (!(group_size % new_phis.length ()));
5452 }
5453 else
5454 ratio = 1;
5455
5456 stmt_vec_info epilog_stmt_info = NULL;
5457 for (k = 0; k < group_size; k++)
5458 {
5459 if (k % ratio == 0)
5460 {
5461 epilog_stmt_info = loop_vinfo->lookup_stmt (new_phis[k / ratio]);
5462 reduction_phi_info = reduction_phis[k / ratio];
5463 if (double_reduc)
5464 inner_phi = inner_phis[k / ratio];
5465 }
5466
5467 if (slp_reduc)
5468 {
5469 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5470
5471 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5472 /* SLP statements can't participate in patterns. */
5473 gcc_assert (!orig_stmt_info);
5474 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5475 }
5476
5477 phis.create (3);
5478 /* Find the loop-closed-use at the loop exit of the original scalar
5479 result. (The reduction result is expected to have two immediate uses -
5480 one at the latch block, and one at the loop exit). */
5481 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5482 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5483 && !is_gimple_debug (USE_STMT (use_p)))
5484 phis.safe_push (USE_STMT (use_p));
5485
5486 /* While we expect to have found an exit_phi because of loop-closed-ssa
5487 form, we can end up without one if the scalar cycle is dead. */
5488
5489 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5490 {
5491 if (outer_loop)
5492 {
5493 stmt_vec_info exit_phi_vinfo
5494 = loop_vinfo->lookup_stmt (exit_phi);
5495 gphi *vect_phi;
5496
5497 if (double_reduc)
5498 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5499 else
5500 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt_info;
5501 if (!double_reduc
5502 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5503 != vect_double_reduction_def)
5504 continue;
5505
5506 /* Handle double reduction:
5507
5508 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5509 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5510 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5511 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5512
5513 At that point the regular reduction (stmt2 and stmt3) is
5514 already vectorized, as well as the exit phi node, stmt4.
5515 Here we vectorize the phi node of double reduction, stmt1, and
5516 update all relevant statements. */
5517
5518 /* Go through all the uses of s2 to find double reduction phi
5519 node, i.e., stmt1 above. */
5520 orig_name = PHI_RESULT (exit_phi);
5521 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5522 {
5523 stmt_vec_info use_stmt_vinfo;
5524 tree vect_phi_init, preheader_arg, vect_phi_res;
5525 basic_block bb = gimple_bb (use_stmt);
5526
5527 /* Check that USE_STMT is really a double reduction phi
5528 node. */
5529 if (gimple_code (use_stmt) != GIMPLE_PHI
5530 || gimple_phi_num_args (use_stmt) != 2
5531 || bb->loop_father != outer_loop)
5532 continue;
5533 use_stmt_vinfo = loop_vinfo->lookup_stmt (use_stmt);
5534 if (!use_stmt_vinfo
5535 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5536 != vect_double_reduction_def)
5537 continue;
5538
5539 /* Create vector phi node for double reduction:
5540 vs1 = phi <vs0, vs2>
5541 vs1 was created previously in this function by a call to
5542 vect_get_vec_def_for_operand and is stored in
5543 vec_initial_def;
5544 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5545 vs0 is created here. */
5546
5547 /* Create vector phi node. */
5548 vect_phi = create_phi_node (vec_initial_def, bb);
5549 loop_vec_info_for_loop (outer_loop)->add_stmt (vect_phi);
5550
5551 /* Create vs0 - initial def of the double reduction phi. */
5552 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5553 loop_preheader_edge (outer_loop));
5554 vect_phi_init = get_initial_def_for_reduction
5555 (stmt_info, preheader_arg, NULL);
5556
5557 /* Update phi node arguments with vs0 and vs2. */
5558 add_phi_arg (vect_phi, vect_phi_init,
5559 loop_preheader_edge (outer_loop),
5560 UNKNOWN_LOCATION);
5561 add_phi_arg (vect_phi, PHI_RESULT (inner_phi->stmt),
5562 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5563 if (dump_enabled_p ())
5564 dump_printf_loc (MSG_NOTE, vect_location,
5565 "created double reduction phi node: %G",
5566 vect_phi);
5567
5568 vect_phi_res = PHI_RESULT (vect_phi);
5569
5570 /* Replace the use, i.e., set the correct vs1 in the regular
5571 reduction phi node. FORNOW, NCOPIES is always 1, so the
5572 loop is redundant. */
5573 stmt_vec_info use_info = reduction_phi_info;
5574 for (j = 0; j < ncopies; j++)
5575 {
5576 edge pr_edge = loop_preheader_edge (loop);
5577 SET_PHI_ARG_DEF (as_a <gphi *> (use_info->stmt),
5578 pr_edge->dest_idx, vect_phi_res);
5579 use_info = STMT_VINFO_RELATED_STMT (use_info);
5580 }
5581 }
5582 }
5583 }
5584
5585 phis.release ();
5586 if (nested_in_vect_loop)
5587 {
5588 if (double_reduc)
5589 loop = outer_loop;
5590 else
5591 continue;
5592 }
5593
5594 phis.create (3);
5595 /* Find the loop-closed-use at the loop exit of the original scalar
5596 result. (The reduction result is expected to have two immediate uses,
5597 one at the latch block, and one at the loop exit). For double
5598 reductions we are looking for exit phis of the outer loop. */
5599 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5600 {
5601 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5602 {
5603 if (!is_gimple_debug (USE_STMT (use_p)))
5604 phis.safe_push (USE_STMT (use_p));
5605 }
5606 else
5607 {
5608 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5609 {
5610 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5611
5612 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5613 {
5614 if (!flow_bb_inside_loop_p (loop,
5615 gimple_bb (USE_STMT (phi_use_p)))
5616 && !is_gimple_debug (USE_STMT (phi_use_p)))
5617 phis.safe_push (USE_STMT (phi_use_p));
5618 }
5619 }
5620 }
5621 }
5622
5623 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5624 {
5625 /* Replace the uses: */
5626 orig_name = PHI_RESULT (exit_phi);
5627 scalar_result = scalar_results[k];
5628 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5629 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5630 SET_USE (use_p, scalar_result);
5631 }
5632
5633 phis.release ();
5634 }
5635 }
5636
5637 /* Return a vector of type VECTYPE that is equal to the vector select
5638 operation "MASK ? VEC : IDENTITY". Insert the select statements
5639 before GSI. */
5640
5641 static tree
5642 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5643 tree vec, tree identity)
5644 {
5645 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5646 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5647 mask, vec, identity);
5648 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5649 return cond;
5650 }
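/* For example (illustrative only), given a loop mask M, a V4SF input V
   and a zero identity, a single statement is emitted:

     cond_1 = VEC_COND_EXPR <M, V, { 0.0, 0.0, 0.0, 0.0 }>;

   so that masked-off lanes contribute the identity value to an
   in-order reduction.  */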
5651
5652 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5653 order, starting with LHS. Insert the extraction statements before GSI and
5654 associate the new scalar SSA names with variable SCALAR_DEST.
5655 Return the SSA name for the result. */
5656
5657 static tree
5658 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5659 tree_code code, tree lhs, tree vector_rhs)
5660 {
5661 tree vectype = TREE_TYPE (vector_rhs);
5662 tree scalar_type = TREE_TYPE (vectype);
5663 tree bitsize = TYPE_SIZE (scalar_type);
5664 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5665 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5666
5667 for (unsigned HOST_WIDE_INT bit_offset = 0;
5668 bit_offset < vec_size_in_bits;
5669 bit_offset += element_bitsize)
5670 {
5671 tree bitpos = bitsize_int (bit_offset);
5672 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5673 bitsize, bitpos);
5674
5675 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5676 rhs = make_ssa_name (scalar_dest, stmt);
5677 gimple_assign_set_lhs (stmt, rhs);
5678 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5679
5680 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5681 tree new_name = make_ssa_name (scalar_dest, stmt);
5682 gimple_assign_set_lhs (stmt, new_name);
5683 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5684 lhs = new_name;
5685 }
5686 return lhs;
5687 }
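/* As an illustration (a sketch assuming a vector of four ints,
   CODE == PLUS_EXPR and incoming LHS l0), the function emits

     e0 = BIT_FIELD_REF <v, 32, 0>;    l1 = l0 + e0;
     e1 = BIT_FIELD_REF <v, 32, 32>;   l2 = l1 + e1;
     e2 = BIT_FIELD_REF <v, 32, 64>;   l3 = l2 + e2;
     e3 = BIT_FIELD_REF <v, 32, 96>;   l4 = l3 + e3;

   and returns l4, preserving the strict left-to-right evaluation
   order required for in-order reductions.  */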
5688
5689 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5690 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5691 statement. CODE is the operation performed by STMT_INFO and OPS are
5692 its scalar operands. REDUC_INDEX is the index of the operand in
5693 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5694 implements in-order reduction, or IFN_LAST if we should open-code it.
5695 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5696 that should be used to control the operation in a fully-masked loop. */
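/* A sketch of the fully-masked in-order case with
   REDUC_FN == IFN_FOLD_LEFT_PLUS (names illustrative):

     mask_1 = <loop mask obtained via vect_get_loop_mask>;
     vec_1  = VEC_COND_EXPR <mask_1, vec_def, { 0.0, ... }>;
     sum_2  = .FOLD_LEFT_PLUS (sum_1, vec_1);

   When REDUC_FN is IFN_LAST the call is replaced by the scalar
   sequence built by vect_expand_fold_left above; for MINUS_EXPR with
   a direct REDUC_FN the vector operand is negated first.  */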
5697
5698 static bool
5699 vectorize_fold_left_reduction (stmt_vec_info stmt_info,
5700 gimple_stmt_iterator *gsi,
5701 stmt_vec_info *vec_stmt, slp_tree slp_node,
5702 gimple *reduc_def_stmt,
5703 tree_code code, internal_fn reduc_fn,
5704 tree ops[3], tree vectype_in,
5705 int reduc_index, vec_loop_masks *masks)
5706 {
5707 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5708 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5709 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5710 stmt_vec_info new_stmt_info = NULL;
5711
5712 int ncopies;
5713 if (slp_node)
5714 ncopies = 1;
5715 else
5716 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5717
5718 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5719 gcc_assert (ncopies == 1);
5720 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5721 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5722 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5723 == FOLD_LEFT_REDUCTION);
5724
5725 if (slp_node)
5726 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5727 TYPE_VECTOR_SUBPARTS (vectype_in)));
5728
5729 tree op0 = ops[1 - reduc_index];
5730
5731 int group_size = 1;
5732 stmt_vec_info scalar_dest_def_info;
5733 auto_vec<tree> vec_oprnds0;
5734 if (slp_node)
5735 {
5736 auto_vec<vec<tree> > vec_defs (2);
5737 auto_vec<tree> sops(2);
5738 sops.quick_push (ops[0]);
5739 sops.quick_push (ops[1]);
5740 vect_get_slp_defs (sops, slp_node, &vec_defs);
5741 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
5742 vec_defs[0].release ();
5743 vec_defs[1].release ();
5744 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5745 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5746 }
5747 else
5748 {
5749 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt_info);
5750 vec_oprnds0.create (1);
5751 vec_oprnds0.quick_push (loop_vec_def0);
5752 scalar_dest_def_info = stmt_info;
5753 }
5754
5755 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5756 tree scalar_type = TREE_TYPE (scalar_dest);
5757 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5758
5759 int vec_num = vec_oprnds0.length ();
5760 gcc_assert (vec_num == 1 || slp_node);
5761 tree vec_elem_type = TREE_TYPE (vectype_out);
5762 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5763
5764 tree vector_identity = NULL_TREE;
5765 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5766 vector_identity = build_zero_cst (vectype_out);
5767
5768 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5769 int i;
5770 tree def0;
5771 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5772 {
5773 gimple *new_stmt;
5774 tree mask = NULL_TREE;
5775 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5776 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5777
5778 /* Handle MINUS by adding the negative. */
5779 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5780 {
5781 tree negated = make_ssa_name (vectype_out);
5782 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5783 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5784 def0 = negated;
5785 }
5786
5787 if (mask)
5788 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5789 vector_identity);
5790
5791 /* On the first iteration the input is simply the scalar phi
5792 result, and for subsequent iterations it is the output of
5793 the preceding operation. */
5794 if (reduc_fn != IFN_LAST)
5795 {
5796 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
5797 /* For chained SLP reductions the output of the previous reduction
5798 operation serves as the input of the next. For the final statement
5799 the output cannot be a temporary - we reuse the original
5800 scalar destination of the last statement. */
5801 if (i != vec_num - 1)
5802 {
5803 gimple_set_lhs (new_stmt, scalar_dest_var);
5804 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5805 gimple_set_lhs (new_stmt, reduc_var);
5806 }
5807 }
5808 else
5809 {
5810 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5811 reduc_var, def0);
5812 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5813 /* Remove the statement, so that we can use the same code paths
5814 as for statements that we've just created. */
5815 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5816 gsi_remove (&tmp_gsi, true);
5817 }
5818
5819 if (i == vec_num - 1)
5820 {
5821 gimple_set_lhs (new_stmt, scalar_dest);
5822 new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
5823 new_stmt);
5824 }
5825 else
5826 new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
5827 new_stmt, gsi);
5828
5829 if (slp_node)
5830 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
5831 }
5832
5833 if (!slp_node)
5834 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
5835
5836 return true;
5837 }
5838
5839 /* Function is_nonwrapping_integer_induction.
5840
5841 Check if STMT_VINFO (which is part of loop LOOP) both increments and
5842 does not cause overflow. */
5843
5844 static bool
5845 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, struct loop *loop)
5846 {
5847 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
5848 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5849 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5850 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
5851 widest_int ni, max_loop_value, lhs_max;
5852 wi::overflow_type overflow = wi::OVF_NONE;
5853
5854 /* Make sure the loop is integer based. */
5855 if (TREE_CODE (base) != INTEGER_CST
5856 || TREE_CODE (step) != INTEGER_CST)
5857 return false;
5858
5859 /* Check that the max size of the loop will not wrap. */
5860
5861 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5862 return true;
5863
5864 if (! max_stmt_executions (loop, &ni))
5865 return false;
5866
5867 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5868 &overflow);
5869 if (overflow)
5870 return false;
5871
5872 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5873 TYPE_SIGN (lhs_type), &overflow);
5874 if (overflow)
5875 return false;
5876
5877 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5878 <= TYPE_PRECISION (lhs_type));
5879 }
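/* Worked example (a sketch): for a 32-bit unsigned IV with base 0,
   step 4 and at most 1000 iterations, max_loop_value is
   0 + 4 * 1000 = 4000, which needs only 12 bits of precision, well
   within the 32 bits of the type, so the induction cannot wrap.  */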
5880
5881 /* Function vectorizable_reduction.
5882
5883 Check if STMT_INFO performs a reduction operation that can be vectorized.
5884 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
5885 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5886 Return true if STMT_INFO is vectorizable in this way.
5887
5888 This function also handles reduction idioms (patterns) that have been
5889 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
5890 may be of this form:
5891 X = pattern_expr (arg0, arg1, ..., X)
5892 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5893 sequence that had been detected and replaced by the pattern-stmt
5894 (STMT_INFO).
5895
5896 This function also handles reduction of condition expressions, for example:
5897 for (int i = 0; i < N; i++)
5898 if (a[i] < value)
5899 last = a[i];
5900 This is handled by vectorising the loop and creating an additional vector
5901 containing the loop indexes for which "a[i] < value" was true. In the
5902 function epilogue this is reduced to a single max value and then used to
5903 index into the vector of results.
5904
5905 In some cases of reduction patterns, the type of the reduction variable X is
5906 different than the type of the other arguments of STMT_INFO.
5907 In such cases, the vectype that is used when transforming STMT_INFO into
5908 a vector stmt is different than the vectype that is used to determine the
5909 vectorization factor, because it consists of a different number of elements
5910 than the actual number of elements that are being operated upon in parallel.
5911
5912 For example, consider an accumulation of shorts into an int accumulator.
5913 On some targets it's possible to vectorize this pattern operating on 8
5914 shorts at a time (hence, the vectype for purposes of determining the
5915 vectorization factor should be V8HI); on the other hand, the vectype that
5916 is used to create the vector form is actually V4SI (the type of the result).
5917
5918 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5919 indicates what is the actual level of parallelism (V8HI in the example), so
5920 that the right vectorization factor would be derived. This vectype
5921 corresponds to the type of arguments to the reduction stmt, and should *NOT*
5922 be used to create the vectorized stmt. The right vectype for the vectorized
5923 stmt is obtained from the type of the result X:
5924 get_vectype_for_scalar_type (TREE_TYPE (X))
5925
5926 This means that, contrary to "regular" reductions (or "regular" stmts in
5927 general), the following equation:
5928 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5929 does *NOT* necessarily hold for reduction patterns. */
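/* For reference, a scalar source loop matching the example above
   (a sketch, not taken from any particular testcase):

     short b[N];
     int sum = 0;
     for (int i = 0; i < N; i++)
       sum += b[i];    // recognized as a widen_sum pattern

   Here STMT_VINFO_VECTYPE is V8HI (it drives the vectorization
   factor), while the vectorized statement itself produces a V4SI
   accumulator derived from TREE_TYPE (sum).  */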
5930
5931 bool
5932 vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
5933 stmt_vec_info *vec_stmt, slp_tree slp_node,
5934 slp_instance slp_node_instance,
5935 stmt_vector_for_cost *cost_vec)
5936 {
5937 tree vec_dest;
5938 tree scalar_dest;
5939 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5940 tree vectype_in = NULL_TREE;
5941 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5942 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5943 enum tree_code code, orig_code;
5944 internal_fn reduc_fn;
5945 machine_mode vec_mode;
5946 int op_type;
5947 optab optab;
5948 tree new_temp = NULL_TREE;
5949 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
5950 stmt_vec_info cond_stmt_vinfo = NULL;
5951 enum tree_code cond_reduc_op_code = ERROR_MARK;
5952 tree scalar_type;
5953 bool is_simple_use;
5954 int i;
5955 int ncopies;
5956 int epilog_copies;
5957 stmt_vec_info prev_stmt_info, prev_phi_info;
5958 bool single_defuse_cycle = false;
5959 stmt_vec_info new_stmt_info = NULL;
5960 int j;
5961 tree ops[3];
5962 enum vect_def_type dts[3];
5963 bool nested_cycle = false, found_nested_cycle_def = false;
5964 bool double_reduc = false;
5965 basic_block def_bb;
5966 struct loop * def_stmt_loop;
5967 tree def_arg;
5968 auto_vec<tree> vec_oprnds0;
5969 auto_vec<tree> vec_oprnds1;
5970 auto_vec<tree> vec_oprnds2;
5971 auto_vec<tree> vect_defs;
5972 auto_vec<stmt_vec_info> phis;
5973 int vec_num;
5974 tree def0, tem;
5975 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5976 tree cond_reduc_val = NULL_TREE;
5977
5978 /* Make sure it was already recognized as a reduction computation. */
5979 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
5980 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
5981 return false;
5982
5983 if (nested_in_vect_loop_p (loop, stmt_info))
5984 {
5985 loop = loop->inner;
5986 nested_cycle = true;
5987 }
5988
5989 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5990 gcc_assert (slp_node
5991 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
5992
5993 if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
5994 {
5995 tree phi_result = gimple_phi_result (phi);
5996 /* Analysis is fully done on the reduction stmt invocation. */
5997 if (! vec_stmt)
5998 {
5999 if (slp_node)
6000 slp_node_instance->reduc_phis = slp_node;
6001
6002 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6003 return true;
6004 }
6005
6006 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6007 /* Leave the scalar phi in place. Note that checking
6008 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6009 for reductions involving a single statement. */
6010 return true;
6011
6012 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
6013 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
6014
6015 if (STMT_VINFO_VEC_REDUCTION_TYPE (reduc_stmt_info)
6016 == EXTRACT_LAST_REDUCTION)
6017 /* Leave the scalar phi in place. */
6018 return true;
6019
6020 gassign *reduc_stmt = as_a <gassign *> (reduc_stmt_info->stmt);
6021 code = gimple_assign_rhs_code (reduc_stmt);
6022 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6023 {
6024 tree op = gimple_op (reduc_stmt, k);
6025 if (op == phi_result)
6026 continue;
6027 if (k == 1 && code == COND_EXPR)
6028 continue;
6029 bool is_simple_use = vect_is_simple_use (op, loop_vinfo, &dt);
6030 gcc_assert (is_simple_use);
6031 if (dt == vect_constant_def || dt == vect_external_def)
6032 continue;
6033 if (!vectype_in
6034 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6035 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6036 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6037 break;
6038 }
6039 /* For a nested cycle we might end up with an operation like
6040 phi_result * phi_result. */
6041 if (!vectype_in)
6042 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
6043 gcc_assert (vectype_in);
6044
6045 if (slp_node)
6046 ncopies = 1;
6047 else
6048 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6049
6050 stmt_vec_info use_stmt_info;
6051 if (ncopies > 1
6052 && STMT_VINFO_RELEVANT (reduc_stmt_info) <= vect_used_only_live
6053 && (use_stmt_info = loop_vinfo->lookup_single_use (phi_result))
6054 && vect_stmt_to_vectorize (use_stmt_info) == reduc_stmt_info)
6055 single_defuse_cycle = true;
6056
6057 /* Create the destination vector */
6058 scalar_dest = gimple_assign_lhs (reduc_stmt);
6059 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6060
6061 if (slp_node)
6062 /* The size vect_schedule_slp_instance computes is off for us. */
6063 vec_num = vect_get_num_vectors
6064 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6065 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6066 vectype_in);
6067 else
6068 vec_num = 1;
6069
6070 /* Generate the reduction PHIs upfront. */
6071 prev_phi_info = NULL;
6072 for (j = 0; j < ncopies; j++)
6073 {
6074 if (j == 0 || !single_defuse_cycle)
6075 {
6076 for (i = 0; i < vec_num; i++)
6077 {
6078 /* Create the reduction-phi that defines the reduction
6079 operand. */
6080 gimple *new_phi = create_phi_node (vec_dest, loop->header);
6081 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6082
6083 if (slp_node)
6084 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
6085 else
6086 {
6087 if (j == 0)
6088 STMT_VINFO_VEC_STMT (stmt_info)
6089 = *vec_stmt = new_phi_info;
6090 else
6091 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
6092 prev_phi_info = new_phi_info;
6093 }
6094 }
6095 }
6096 }
6097
6098 return true;
6099 }
6100
6101 /* 1. Is vectorizable reduction? */
6102 /* Not supportable if the reduction variable is used in the loop, unless
6103 it's a reduction chain. */
6104 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6105 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6106 return false;
6107
6108 /* Reductions that are not used even in an enclosing outer-loop
6109 are expected to be "live" (used out of the loop). */
6110 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6111 && !STMT_VINFO_LIVE_P (stmt_info))
6112 return false;
6113
6114 /* 2. Has this been recognized as a reduction pattern?
6115
6116 Check if STMT represents a pattern that has been recognized
6117 in earlier analysis stages. For stmts that represent a pattern,
6118 the STMT_VINFO_RELATED_STMT field records the last stmt in
6119 the original sequence that constitutes the pattern. */
6120
6121 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6122 if (orig_stmt_info)
6123 {
6124 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6125 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6126 }
6127
6128 /* 3. Check the operands of the operation. The first operands are defined
6129 inside the loop body. The last operand is the reduction variable,
6130 which is defined by the loop-header-phi. */
6131
6132 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6133
6134 /* Flatten RHS. */
6135 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6136 {
6137 case GIMPLE_BINARY_RHS:
6138 code = gimple_assign_rhs_code (stmt);
6139 op_type = TREE_CODE_LENGTH (code);
6140 gcc_assert (op_type == binary_op);
6141 ops[0] = gimple_assign_rhs1 (stmt);
6142 ops[1] = gimple_assign_rhs2 (stmt);
6143 break;
6144
6145 case GIMPLE_TERNARY_RHS:
6146 code = gimple_assign_rhs_code (stmt);
6147 op_type = TREE_CODE_LENGTH (code);
6148 gcc_assert (op_type == ternary_op);
6149 ops[0] = gimple_assign_rhs1 (stmt);
6150 ops[1] = gimple_assign_rhs2 (stmt);
6151 ops[2] = gimple_assign_rhs3 (stmt);
6152 break;
6153
6154 case GIMPLE_UNARY_RHS:
6155 return false;
6156
6157 default:
6158 gcc_unreachable ();
6159 }
6160
6161 if (code == COND_EXPR && slp_node)
6162 return false;
6163
6164 scalar_dest = gimple_assign_lhs (stmt);
6165 scalar_type = TREE_TYPE (scalar_dest);
6166 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6167 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6168 return false;
6169
6170 /* Do not try to vectorize bit-precision reductions. */
6171 if (!type_has_mode_precision_p (scalar_type))
6172 return false;
6173
6174 /* All uses but the last are expected to be defined in the loop.
6175 The last use is the reduction variable. In case of a nested cycle this
6176 assumption is not true: we use reduc_index to record the index of the
6177 reduction variable. */
6178 stmt_vec_info reduc_def_info;
6179 if (orig_stmt_info)
6180 reduc_def_info = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6181 else
6182 reduc_def_info = STMT_VINFO_REDUC_DEF (stmt_info);
6183 gcc_assert (reduc_def_info);
6184 gphi *reduc_def_phi = as_a <gphi *> (reduc_def_info->stmt);
6185 tree reduc_def = PHI_RESULT (reduc_def_phi);
6186 int reduc_index = -1;
6187 for (i = 0; i < op_type; i++)
6188 {
6189 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6190 if (i == 0 && code == COND_EXPR)
6191 continue;
6192
6193 stmt_vec_info def_stmt_info;
6194 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &dts[i], &tem,
6195 &def_stmt_info);
6196 dt = dts[i];
6197 gcc_assert (is_simple_use);
6198 if (dt == vect_reduction_def
6199 && ops[i] == reduc_def)
6200 {
6201 reduc_index = i;
6202 continue;
6203 }
6204 else if (tem)
6205 {
6206 /* To properly compute ncopies we are interested in the widest
6207 input type in case we're looking at a widening accumulation. */
6208 if (!vectype_in
6209 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6210 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6211 vectype_in = tem;
6212 }
6213
6214 if (dt != vect_internal_def
6215 && dt != vect_external_def
6216 && dt != vect_constant_def
6217 && dt != vect_induction_def
6218 && !(dt == vect_nested_cycle && nested_cycle))
6219 return false;
6220
6221 if (dt == vect_nested_cycle
6222 && ops[i] == reduc_def)
6223 {
6224 found_nested_cycle_def = true;
6225 reduc_index = i;
6226 }
6227
6228 if (i == 1 && code == COND_EXPR)
6229 {
6230 /* Record how value of COND_EXPR is defined. */
6231 if (dt == vect_constant_def)
6232 {
6233 cond_reduc_dt = dt;
6234 cond_reduc_val = ops[i];
6235 }
6236 if (dt == vect_induction_def
6237 && def_stmt_info
6238 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6239 {
6240 cond_reduc_dt = dt;
6241 cond_stmt_vinfo = def_stmt_info;
6242 }
6243 }
6244 }
6245
6246 if (!vectype_in)
6247 vectype_in = vectype_out;
6248
6249 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6250 directly used in stmt. */
6251 if (reduc_index == -1)
6252 {
6253 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6254 {
6255 if (dump_enabled_p ())
6256 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6257 "in-order reduction chain without SLP.\n");
6258 return false;
6259 }
6260 }
6261
6262 if (!(reduc_index == -1
6263 || dts[reduc_index] == vect_reduction_def
6264 || dts[reduc_index] == vect_nested_cycle
6265 || ((dts[reduc_index] == vect_internal_def
6266 || dts[reduc_index] == vect_external_def
6267 || dts[reduc_index] == vect_constant_def
6268 || dts[reduc_index] == vect_induction_def)
6269 && nested_cycle && found_nested_cycle_def)))
6270 {
6271 /* For pattern recognized stmts, orig_stmt might be a reduction,
6272 but some helper statements for the pattern might not, or
6273 might be COND_EXPRs with reduction uses in the condition. */
6274 gcc_assert (orig_stmt_info);
6275 return false;
6276 }
6277
6278 /* PHIs should not participate in patterns. */
6279 gcc_assert (!STMT_VINFO_RELATED_STMT (reduc_def_info));
6280 enum vect_reduction_type v_reduc_type
6281 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6282 stmt_vec_info tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6283
6284 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6285 /* If we have a condition reduction, see if we can simplify it further. */
6286 if (v_reduc_type == COND_REDUCTION)
6287 {
6288 /* TODO: We can't yet handle reduction chains, since we need to treat
6289 each COND_EXPR in the chain specially, not just the last one.
6290 E.g. for:
6291
6292 x_1 = PHI <x_3, ...>
6293 x_2 = a_2 ? ... : x_1;
6294 x_3 = a_3 ? ... : x_2;
6295
6296 we're interested in the last element in x_3 for which a_2 || a_3
6297 is true, whereas the current reduction chain handling would
6298 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6299 as a reduction operation. */
6300 if (reduc_index == -1)
6301 {
6302 if (dump_enabled_p ())
6303 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6304 "conditional reduction chains not supported\n");
6305 return false;
6306 }
6307
6308 /* vect_is_simple_reduction ensured that operand 2 is the
6309 loop-carried operand. */
6310 gcc_assert (reduc_index == 2);
6311
6312 /* Loop peeling modifies the initial value of the reduction PHI, which
6313 makes the reduction stmt being transformed different from the
6314 original stmt analyzed. We need to record the reduction code for a
6315 CONST_COND_REDUCTION type reduction at the analysis stage, so that
6316 it can be used directly at the transform stage. */
6317 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6318 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6319 {
6320 /* Also set the reduction type to CONST_COND_REDUCTION. */
6321 gcc_assert (cond_reduc_dt == vect_constant_def);
6322 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6323 }
6324 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6325 vectype_in, OPTIMIZE_FOR_SPEED))
6326 {
6327 if (dump_enabled_p ())
6328 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6329 "optimizing condition reduction with"
6330 " FOLD_EXTRACT_LAST.\n");
6331 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6332 }
6333 else if (cond_reduc_dt == vect_induction_def)
6334 {
6335 tree base
6336 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6337 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6338
6339 gcc_assert (TREE_CODE (base) == INTEGER_CST
6340 && TREE_CODE (step) == INTEGER_CST);
6341 cond_reduc_val = NULL_TREE;
6342 /* Find a suitable value: for MAX_EXPR one below base, for MIN_EXPR
6343 one above base; for now punt if base is the minimum value of the
6344 type for MAX_EXPR or the maximum value of the type for MIN_EXPR. */
6345 if (tree_int_cst_sgn (step) == -1)
6346 {
6347 cond_reduc_op_code = MIN_EXPR;
6348 if (tree_int_cst_sgn (base) == -1)
6349 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6350 else if (tree_int_cst_lt (base,
6351 TYPE_MAX_VALUE (TREE_TYPE (base))))
6352 cond_reduc_val
6353 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6354 }
6355 else
6356 {
6357 cond_reduc_op_code = MAX_EXPR;
6358 if (tree_int_cst_sgn (base) == 1)
6359 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6360 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6361 base))
6362 cond_reduc_val
6363 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6364 }
6365 if (cond_reduc_val)
6366 {
6367 if (dump_enabled_p ())
6368 dump_printf_loc (MSG_NOTE, vect_location,
6369 "condition expression based on "
6370 "integer induction.\n");
6371 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6372 = INTEGER_INDUC_COND_REDUCTION;
6373 }
6374 }
6375 else if (cond_reduc_dt == vect_constant_def)
6376 {
6377 enum vect_def_type cond_initial_dt;
6378 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6379 tree cond_initial_val
6380 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6381
6382 gcc_assert (cond_reduc_val != NULL_TREE);
6383 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6384 if (cond_initial_dt == vect_constant_def
6385 && types_compatible_p (TREE_TYPE (cond_initial_val),
6386 TREE_TYPE (cond_reduc_val)))
6387 {
6388 tree e = fold_binary (LE_EXPR, boolean_type_node,
6389 cond_initial_val, cond_reduc_val);
6390 if (e && (integer_onep (e) || integer_zerop (e)))
6391 {
6392 if (dump_enabled_p ())
6393 dump_printf_loc (MSG_NOTE, vect_location,
6394 "condition expression based on "
6395 "compile time constant.\n");
6396 /* Record reduction code at analysis stage. */
6397 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6398 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6399 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6400 = CONST_COND_REDUCTION;
6401 }
6402 }
6403 }
6404 }
6405
6406 if (orig_stmt_info)
6407 gcc_assert (tmp == orig_stmt_info
6408 || REDUC_GROUP_FIRST_ELEMENT (tmp) == orig_stmt_info);
6409 else
6410 /* We changed STMT to be the first stmt in the reduction chain, hence we
6411 check that in this case the first element in the chain is STMT. */
6412 gcc_assert (tmp == stmt_info
6413 || REDUC_GROUP_FIRST_ELEMENT (tmp) == stmt_info);
6414
6415 if (STMT_VINFO_LIVE_P (reduc_def_info))
6416 return false;
6417
6418 if (slp_node)
6419 ncopies = 1;
6420 else
6421 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6422
6423 gcc_assert (ncopies >= 1);
6424
6425 vec_mode = TYPE_MODE (vectype_in);
6426 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6427
6428 if (nested_cycle)
6429 {
6430 def_bb = gimple_bb (reduc_def_phi);
6431 def_stmt_loop = def_bb->loop_father;
6432 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6433 loop_preheader_edge (def_stmt_loop));
6434 stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg);
6435 if (def_arg_stmt_info
6436 && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info)
6437 == vect_double_reduction_def))
6438 double_reduc = true;
6439 }
6440
6441 vect_reduction_type reduction_type
6442 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6443 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6444 && ncopies > 1)
6445 {
6446 if (dump_enabled_p ())
6447 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6448 "multiple types in double reduction or condition "
6449 "reduction.\n");
6450 return false;
6451 }
6452
6453 if (code == COND_EXPR)
6454 {
6455 /* Only call during the analysis stage, otherwise we'll lose
6456 STMT_VINFO_TYPE. */
6457 if (!vec_stmt && !vectorizable_condition (stmt_info, gsi, NULL,
6458 true, NULL, cost_vec))
6459 {
6460 if (dump_enabled_p ())
6461 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6462 "unsupported condition in reduction\n");
6463 return false;
6464 }
6465 }
6466 else if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6467 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6468 {
6469 /* Only call during the analysis stage, otherwise we'll lose
6470 STMT_VINFO_TYPE. We only support this for nested cycles
6471 without double reductions at the moment. */
6472 if (!nested_cycle
6473 || double_reduc
6474 || (!vec_stmt && !vectorizable_shift (stmt_info, gsi, NULL,
6475 NULL, cost_vec)))
6476 {
6477 if (dump_enabled_p ())
6478 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6479 "unsupported shift or rotation in reduction\n");
6480 return false;
6481 }
6482 }
6483 else
6484 {
6485 /* 4. Supportable by target? */
6486
6487 /* 4.1. check support for the operation in the loop */
6488 optab = optab_for_tree_code (code, vectype_in, optab_default);
6489 if (!optab)
6490 {
6491 if (dump_enabled_p ())
6492 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6493 "no optab.\n");
6494
6495 return false;
6496 }
6497
6498 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6499 {
6500 if (dump_enabled_p ())
6501 dump_printf (MSG_NOTE, "op not supported by target.\n");
6502
6503 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6504 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6505 return false;
6506
6507 if (dump_enabled_p ())
6508 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6509 }
6510
6511 /* Worthwhile without SIMD support? */
6512 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6513 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6514 {
6515 if (dump_enabled_p ())
6516 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6517 "not worthwhile without SIMD support.\n");
6518
6519 return false;
6520 }
6521 }
6522
6523 /* 4.2. Check support for the epilog operation.
6524
6525 If STMT represents a reduction pattern, then the type of the
6526 reduction variable may be different than the type of the rest
6527 of the arguments. For example, consider the case of accumulation
6528 of shorts into an int accumulator. The original code:
6529 S1: int_a = (int) short_a;
6530 orig_stmt-> S2: int_acc = plus <int_a, int_acc>;
6531
6532 was replaced with:
6533 STMT: int_acc = widen_sum <short_a, int_acc>
6534
6535 This means that:
6536 1. The tree-code that is used to create the vector operation in the
6537 epilog code (that reduces the partial results) is not the
6538 tree-code of STMT, but is rather the tree-code of the original
6539 stmt from the pattern that STMT is replacing. I.e, in the example
6540 above we want to use 'widen_sum' in the loop, but 'plus' in the
6541 epilog.
6542 2. The type (mode) we use to check available target support
6543 for the vector operation to be created in the *epilog*, is
6544 determined by the type of the reduction variable (in the example
6545 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6546 However the type (mode) we use to check available target support
6547 for the vector operation to be created *inside the loop*, is
6548 determined by the type of the other arguments to STMT (in the
6549 example we'd check this: optab_handler (widen_sum_optab,
6550 vect_short_mode)).
6551
6552 This is contrary to "regular" reductions, in which the types of all
6553 the arguments are the same as the type of the reduction variable.
6554 For "regular" reductions we can therefore use the same vector type
6555 (and also the same tree-code) when generating the epilog code and
6556 when generating the code inside the loop. */
6557
6558 if (orig_stmt_info
6559 && (reduction_type == TREE_CODE_REDUCTION
6560 || reduction_type == FOLD_LEFT_REDUCTION))
6561 {
6562 /* This is a reduction pattern: get the vectype from the type of the
6563 reduction variable, and get the tree-code from orig_stmt. */
6564 orig_code = gimple_assign_rhs_code (orig_stmt_info->stmt);
6565 gcc_assert (vectype_out);
6566 vec_mode = TYPE_MODE (vectype_out);
6567 }
6568 else
6569 {
6570 /* Regular reduction: the same vectype and tree-code as used for
6571 the vector code inside the loop can also be used for the epilog code. */
6572 orig_code = code;
6573
6574 if (code == MINUS_EXPR)
6575 orig_code = PLUS_EXPR;
6576
6577 /* For simple condition reductions, replace with the actual expression
6578 we want to base our reduction around. */
6579 if (reduction_type == CONST_COND_REDUCTION)
6580 {
6581 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6582 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6583 }
6584 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6585 orig_code = cond_reduc_op_code;
6586 }
6587
6588 reduc_fn = IFN_LAST;
6589
6590 if (reduction_type == TREE_CODE_REDUCTION
6591 || reduction_type == FOLD_LEFT_REDUCTION
6592 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6593 || reduction_type == CONST_COND_REDUCTION)
6594 {
6595 if (reduction_type == FOLD_LEFT_REDUCTION
6596 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6597 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6598 {
6599 if (reduc_fn != IFN_LAST
6600 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6601 OPTIMIZE_FOR_SPEED))
6602 {
6603 if (dump_enabled_p ())
6604 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6605 "reduc op not supported by target.\n");
6606
6607 reduc_fn = IFN_LAST;
6608 }
6609 }
6610 else
6611 {
6612 if (!nested_cycle || double_reduc)
6613 {
6614 if (dump_enabled_p ())
6615 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6616 "no reduc code for scalar code.\n");
6617
6618 return false;
6619 }
6620 }
6621 }
6622 else if (reduction_type == COND_REDUCTION)
6623 {
6624 int scalar_precision
6625 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6626 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6627 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6628 nunits_out);
6629
6630 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6631 OPTIMIZE_FOR_SPEED))
6632 reduc_fn = IFN_REDUC_MAX;
6633 }
6634
6635 if (reduction_type != EXTRACT_LAST_REDUCTION
6636 && (!nested_cycle || double_reduc)
6637 && reduc_fn == IFN_LAST
6638 && !nunits_out.is_constant ())
6639 {
6640 if (dump_enabled_p ())
6641 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6642 "missing target support for reduction on"
6643 " variable-length vectors.\n");
6644 return false;
6645 }
6646
6647 /* For SLP reductions, see if there is a neutral value we can use. */
6648 tree neutral_op = NULL_TREE;
6649 if (slp_node)
6650 neutral_op = neutral_op_for_slp_reduction
6651 (slp_node_instance->reduc_phis, code,
6652 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6653
6654 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6655 {
6656 /* We can't support in-order reductions of code such as this:
6657
6658 for (int i = 0; i < n1; ++i)
6659 for (int j = 0; j < n2; ++j)
6660 l += a[j];
6661
6662 since GCC effectively transforms the loop when vectorizing:
6663
6664 for (int i = 0; i < n1 / VF; ++i)
6665 for (int j = 0; j < n2; ++j)
6666 for (int k = 0; k < VF; ++k)
6667 l += a[j];
6668
6669 which is a reassociation of the original operation. */
6670 if (dump_enabled_p ())
6671 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6672 "in-order double reduction not supported.\n");
6673
6674 return false;
6675 }
6676
6677 if (reduction_type == FOLD_LEFT_REDUCTION
6678 && slp_node
6679 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6680 {
6681 /* We cannot use in-order reductions in this case because there is
6682 an implicit reassociation of the operations involved. */
6683 if (dump_enabled_p ())
6684 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6685 "in-order unchained SLP reductions not supported.\n");
6686 return false;
6687 }
6688
6689 /* For double reductions, and for SLP reductions with a neutral value,
6690 we construct a variable-length initial vector by loading a vector
6691 full of the neutral value and then shift-and-inserting the start
6692 values into the low-numbered elements. */
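     /* Illustratively, the initial vector starts out as [N, N, ..., N]
	(N being the neutral value) and each IFN_VEC_SHL_INSERT then shifts
	the elements one lane towards the high end and writes a start value
	into lane 0.  */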
6693 if ((double_reduc || neutral_op)
6694 && !nunits_out.is_constant ()
6695 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6696 vectype_out, OPTIMIZE_FOR_SPEED))
6697 {
6698 if (dump_enabled_p ())
6699 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6700 "reduction on variable-length vectors requires"
6701 " target support for a vector-shift-and-insert"
6702 " operation.\n");
6703 return false;
6704 }
6705
6706 /* Check extra constraints for variable-length unchained SLP reductions. */
6707 if (STMT_SLP_TYPE (stmt_info)
6708 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6709 && !nunits_out.is_constant ())
6710 {
6711 /* We checked above that we could build the initial vector when
6712 there's a neutral element value. Check here for the case in
6713 which each SLP statement has its own initial value and in which
6714 that value needs to be repeated for every instance of the
6715 statement within the initial vector. */
6716 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6717 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6718 if (!neutral_op
6719 && !can_duplicate_and_interleave_p (group_size, elt_mode))
6720 {
6721 if (dump_enabled_p ())
6722 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6723 "unsupported form of SLP reduction for"
6724 " variable-length vectors: cannot build"
6725 " initial vector.\n");
6726 return false;
6727 }
6728 /* The epilogue code relies on the number of elements being a multiple
6729 of the group size. The duplicate-and-interleave approach to setting
6730 up the initial vector does too. */
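	 /* An illustrative case: with an SLP group of 3 reduction
	    statements, the number of vector elements must be a multiple
	    of 3 so that every vector holds complete groups.  */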
6731 if (!multiple_p (nunits_out, group_size))
6732 {
6733 if (dump_enabled_p ())
6734 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6735 "unsupported form of SLP reduction for"
6736 " variable-length vectors: the vector size"
6737 " is not a multiple of the number of results.\n");
6738 return false;
6739 }
6740 }
6741
6742 /* In case of widening multiplication by a constant, we update the type
6743 of the constant to be the type of the other operand. We check that the
6744 constant fits the type in the pattern recognition pass. */
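  /* For illustration (hypothetical operands): in a dot-product whose
     operands are a short and the integer constant 2, the constant is
     narrowed here to the short type of the other operand.  */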
6745 if (code == DOT_PROD_EXPR
6746 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6747 {
6748 if (TREE_CODE (ops[0]) == INTEGER_CST)
6749 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6750 else if (TREE_CODE (ops[1]) == INTEGER_CST)
6751 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6752 else
6753 {
6754 if (dump_enabled_p ())
6755 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6756 "invalid types in dot-prod\n");
6757
6758 return false;
6759 }
6760 }
6761
6762 if (reduction_type == COND_REDUCTION)
6763 {
6764 widest_int ni;
6765
6766 if (! max_loop_iterations (loop, &ni))
6767 {
6768 if (dump_enabled_p ())
6769 dump_printf_loc (MSG_NOTE, vect_location,
6770 "loop count not known, cannot create cond "
6771 "reduction.\n");
6772 return false;
6773 }
6774 /* Convert backedges to iterations. */
6775 ni += 1;
6776
6777 /* The additional index will be the same type as the condition. Check
6778 that the loop can fit into this less one (because we'll use up the
6779 zero slot for when there are no matches). */
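     /* Illustrative bound: with an 8-bit index type MAX_INDEX is 255, so
	the loop may run at most 254 iterations - index 0 is reserved for
	the "no match" case.  */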
6780 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6781 if (wi::geu_p (ni, wi::to_widest (max_index)))
6782 {
6783 if (dump_enabled_p ())
6784 dump_printf_loc (MSG_NOTE, vect_location,
6785 "loop size is greater than data size.\n");
6786 return false;
6787 }
6788 }
6789
6790 /* In case the vectorization factor (VF) is bigger than the number
6791 of elements that we can fit in a vectype (nunits), we have to generate
6792 more than one vector stmt - i.e. - we need to "unroll" the
6793 vector stmt by a factor VF/nunits. For more details see documentation
6794 in vectorizable_operation. */
6795
6796 /* If the reduction is used in an outer loop we need to generate
6797 VF intermediate results, like so (e.g. for ncopies=2):
6798 r0 = phi (init, r0)
6799 r1 = phi (init, r1)
6800 r0 = x0 + r0;
6801 r1 = x1 + r1;
6802 (i.e. we generate VF results in 2 registers).
6803 In this case we have a separate def-use cycle for each copy, and therefore
6804 for each copy we get the vector def for the reduction variable from the
6805 respective phi node created for this copy.
6806
6807 Otherwise (the reduction is unused in the loop nest), we can combine
6808 together intermediate results, like so (e.g. for ncopies=2):
6809 r = phi (init, r)
6810 r = x0 + r;
6811 r = x1 + r;
6812 (i.e. we generate VF/2 results in a single register).
6813 In this case for each copy we get the vector def for the reduction variable
6814 from the vectorized reduction operation generated in the previous iteration.
6815
6816 This only works when we see both the reduction PHI and its only consumer
6817 in vectorizable_reduction and there are no intermediate stmts
6818 participating. */
6819 stmt_vec_info use_stmt_info;
6820 tree reduc_phi_result = gimple_phi_result (reduc_def_phi);
6821 if (ncopies > 1
6822 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6823 && (use_stmt_info = loop_vinfo->lookup_single_use (reduc_phi_result))
6824 && vect_stmt_to_vectorize (use_stmt_info) == stmt_info)
6825 {
6826 single_defuse_cycle = true;
6827 epilog_copies = 1;
6828 }
6829 else
6830 epilog_copies = ncopies;
6831
6832 /* If the reduction stmt is one of the patterns that have a lane-reducing
6833 operation embedded, we cannot handle the case of ! single_defuse_cycle. */
6834 if ((ncopies > 1
6835 && ! single_defuse_cycle)
6836 && (code == DOT_PROD_EXPR
6837 || code == WIDEN_SUM_EXPR
6838 || code == SAD_EXPR))
6839 {
6840 if (dump_enabled_p ())
6841 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6842 "multi def-use cycle not possible for lane-reducing "
6843 "reduction operation\n");
6844 return false;
6845 }
6846
6847 if (slp_node)
6848 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6849 else
6850 vec_num = 1;
6851
6852 internal_fn cond_fn = get_conditional_internal_fn (code);
6853 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
6854
6855 if (!vec_stmt) /* transformation not required. */
6856 {
6857 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
6858 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
6859 {
6860 if (reduction_type != FOLD_LEFT_REDUCTION
6861 && (cond_fn == IFN_LAST
6862 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
6863 OPTIMIZE_FOR_SPEED)))
6864 {
6865 if (dump_enabled_p ())
6866 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6867 "can't use a fully-masked loop because no"
6868 " conditional operation is available.\n");
6869 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6870 }
6871 else if (reduc_index == -1)
6872 {
6873 if (dump_enabled_p ())
6874 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6875 "can't use a fully-masked loop for chained"
6876 " reductions.\n");
6877 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
6878 }
6879 else
6880 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
6881 vectype_in);
6882 }
6883 if (dump_enabled_p ()
6884 && reduction_type == FOLD_LEFT_REDUCTION)
6885 dump_printf_loc (MSG_NOTE, vect_location,
6886 "using an in-order (fold-left) reduction.\n");
6887 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6888 return true;
6889 }
6890
6891 /* Transform. */
6892
6893 if (dump_enabled_p ())
6894 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6895
6896 /* FORNOW: Multiple types are not supported for condition. */
6897 if (code == COND_EXPR)
6898 gcc_assert (ncopies == 1);
6899
6900 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
6901
6902 if (reduction_type == FOLD_LEFT_REDUCTION)
6903 return vectorize_fold_left_reduction
6904 (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
6905 reduc_fn, ops, vectype_in, reduc_index, masks);
6906
6907 if (reduction_type == EXTRACT_LAST_REDUCTION)
6908 {
6909 gcc_assert (!slp_node);
6910 return vectorizable_condition (stmt_info, gsi, vec_stmt,
6911 true, NULL, NULL);
6912 }
6913
6914 /* Create the destination vector */
6915 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6916
6917 prev_stmt_info = NULL;
6918 prev_phi_info = NULL;
6919 if (!slp_node)
6920 {
6921 vec_oprnds0.create (1);
6922 vec_oprnds1.create (1);
6923 if (op_type == ternary_op)
6924 vec_oprnds2.create (1);
6925 }
6926
6927 phis.create (vec_num);
6928 vect_defs.create (vec_num);
6929 if (!slp_node)
6930 vect_defs.quick_push (NULL_TREE);
6931
6932 if (slp_node)
6933 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
6934 else
6935 phis.quick_push (STMT_VINFO_VEC_STMT (reduc_def_info));
6936
6937 for (j = 0; j < ncopies; j++)
6938 {
6939 if (code == COND_EXPR)
6940 {
6941 gcc_assert (!slp_node);
6942 vectorizable_condition (stmt_info, gsi, vec_stmt,
6943 true, NULL, NULL);
6944 break;
6945 }
6946 if (code == LSHIFT_EXPR
6947 || code == RSHIFT_EXPR)
6948 {
6949 vectorizable_shift (stmt_info, gsi, vec_stmt, slp_node, NULL);
6950 break;
6951 }
6952
6953 /* Handle uses. */
6954 if (j == 0)
6955 {
6956 if (slp_node)
6957 {
6958 /* Get vec defs for all the operands except the reduction index,
6959 ensuring the ordering of the ops in the vector is kept. */
6960 auto_vec<tree, 3> slp_ops;
6961 auto_vec<vec<tree>, 3> vec_defs;
6962
6963 slp_ops.quick_push (ops[0]);
6964 slp_ops.quick_push (ops[1]);
6965 if (op_type == ternary_op)
6966 slp_ops.quick_push (ops[2]);
6967
6968 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
6969
6970 vec_oprnds0.safe_splice (vec_defs[0]);
6971 vec_defs[0].release ();
6972 vec_oprnds1.safe_splice (vec_defs[1]);
6973 vec_defs[1].release ();
6974 if (op_type == ternary_op)
6975 {
6976 vec_oprnds2.safe_splice (vec_defs[2]);
6977 vec_defs[2].release ();
6978 }
6979 }
6980 else
6981 {
6982 vec_oprnds0.quick_push
6983 (vect_get_vec_def_for_operand (ops[0], stmt_info));
6984 vec_oprnds1.quick_push
6985 (vect_get_vec_def_for_operand (ops[1], stmt_info));
6986 if (op_type == ternary_op)
6987 vec_oprnds2.quick_push
6988 (vect_get_vec_def_for_operand (ops[2], stmt_info));
6989 }
6990 }
6991 else
6992 {
6993 if (!slp_node)
6994 {
6995 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
6996
6997 if (single_defuse_cycle && reduc_index == 0)
6998 vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
6999 else
7000 vec_oprnds0[0]
7001 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7002 vec_oprnds0[0]);
7003 if (single_defuse_cycle && reduc_index == 1)
7004 vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
7005 else
7006 vec_oprnds1[0]
7007 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7008 vec_oprnds1[0]);
7009 if (op_type == ternary_op)
7010 {
7011 if (single_defuse_cycle && reduc_index == 2)
7012 vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
7013 else
7014 vec_oprnds2[0]
7015 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7016 vec_oprnds2[0]);
7017 }
7018 }
7019 }
7020
7021 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7022 {
7023 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7024 if (masked_loop_p)
7025 {
7026 /* Make sure that the reduction accumulator is vop[0]. */
7027 if (reduc_index == 1)
7028 {
7029 gcc_assert (commutative_tree_code (code));
7030 std::swap (vop[0], vop[1]);
7031 }
7032 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7033 vectype_in, i * ncopies + j);
7034 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7035 vop[0], vop[1],
7036 vop[0]);
7037 new_temp = make_ssa_name (vec_dest, call);
7038 gimple_call_set_lhs (call, new_temp);
7039 gimple_call_set_nothrow (call, true);
7040 new_stmt_info
7041 = vect_finish_stmt_generation (stmt_info, call, gsi);
7042 }
7043 else
7044 {
7045 if (op_type == ternary_op)
7046 vop[2] = vec_oprnds2[i];
7047
7048 gassign *new_stmt = gimple_build_assign (vec_dest, code,
7049 vop[0], vop[1], vop[2]);
7050 new_temp = make_ssa_name (vec_dest, new_stmt);
7051 gimple_assign_set_lhs (new_stmt, new_temp);
7052 new_stmt_info
7053 = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
7054 }
7055
7056 if (slp_node)
7057 {
7058 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
7059 vect_defs.quick_push (new_temp);
7060 }
7061 else
7062 vect_defs[0] = new_temp;
7063 }
7064
7065 if (slp_node)
7066 continue;
7067
7068 if (j == 0)
7069 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7070 else
7071 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
7072
7073 prev_stmt_info = new_stmt_info;
7074 }
7075
7076 /* Finalize the reduction-phi (set its arguments) and create the
7077 epilog reduction code. */
7078 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7079 vect_defs[0] = gimple_get_lhs ((*vec_stmt)->stmt);
7080
7081 vect_create_epilog_for_reduction (vect_defs, stmt_info, reduc_def_phi,
7082 epilog_copies, reduc_fn, phis,
7083 double_reduc, slp_node, slp_node_instance,
7084 cond_reduc_val, cond_reduc_op_code,
7085 neutral_op);
7086
7087 return true;
7088 }
7089
7090 /* Function vect_min_worthwhile_factor.
7091
7092 For a loop where we could vectorize the operation indicated by CODE,
7093 return the minimum vectorization factor that makes it worthwhile
7094 to use generic vectors. */
7095 static unsigned int
7096 vect_min_worthwhile_factor (enum tree_code code)
7097 {
7098 switch (code)
7099 {
7100 case PLUS_EXPR:
7101 case MINUS_EXPR:
7102 case NEGATE_EXPR:
7103 return 4;
7104
7105 case BIT_AND_EXPR:
7106 case BIT_IOR_EXPR:
7107 case BIT_XOR_EXPR:
7108 case BIT_NOT_EXPR:
7109 return 2;
7110
7111 default:
7112 return INT_MAX;
7113 }
7114 }
7115
7116 /* Return true if VINFO indicates we are doing loop vectorization and if
7117 it is worth decomposing CODE operations into scalar operations for
7118 that loop's vectorization factor. */
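/* For illustration: vect_min_worthwhile_factor returns 4 for PLUS_EXPR,
   so with a compile-time vectorization factor of 2 this returns false,
   while with a factor of 4 or more it returns true (assuming loop
   vectorization).  */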
7119
7120 bool
7121 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7122 {
7123 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7124 unsigned HOST_WIDE_INT value;
7125 return (loop_vinfo
7126 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7127 && value >= vect_min_worthwhile_factor (code));
7128 }
7129
7130 /* Function vectorizable_induction
7131
7132 Check if STMT_INFO performs an induction computation that can be vectorized.
7133 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7134 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7135 Return true if STMT_INFO is vectorizable in this way. */
7136
7137 bool
7138 vectorizable_induction (stmt_vec_info stmt_info,
7139 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7140 stmt_vec_info *vec_stmt, slp_tree slp_node,
7141 stmt_vector_for_cost *cost_vec)
7142 {
7143 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7144 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7145 unsigned ncopies;
7146 bool nested_in_vect_loop = false;
7147 struct loop *iv_loop;
7148 tree vec_def;
7149 edge pe = loop_preheader_edge (loop);
7150 basic_block new_bb;
7151 tree new_vec, vec_init, vec_step, t;
7152 tree new_name;
7153 gimple *new_stmt;
7154 gphi *induction_phi;
7155 tree induc_def, vec_dest;
7156 tree init_expr, step_expr;
7157 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7158 unsigned i;
7159 tree expr;
7160 gimple_seq stmts;
7161 imm_use_iterator imm_iter;
7162 use_operand_p use_p;
7163 gimple *exit_phi;
7164 edge latch_e;
7165 tree loop_arg;
7166 gimple_stmt_iterator si;
7167
7168 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7169 if (!phi)
7170 return false;
7171
7172 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7173 return false;
7174
7175 /* Make sure it was recognized as induction computation. */
7176 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7177 return false;
7178
7179 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7180 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7181
7182 if (slp_node)
7183 ncopies = 1;
7184 else
7185 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7186 gcc_assert (ncopies >= 1);
7187
7188 /* FORNOW. These restrictions should be relaxed. */
7189 if (nested_in_vect_loop_p (loop, stmt_info))
7190 {
7191 imm_use_iterator imm_iter;
7192 use_operand_p use_p;
7193 gimple *exit_phi;
7194 edge latch_e;
7195 tree loop_arg;
7196
7197 if (ncopies > 1)
7198 {
7199 if (dump_enabled_p ())
7200 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7201 "multiple types in nested loop.\n");
7202 return false;
7203 }
7204
7205 /* FORNOW: outer loop induction with SLP not supported. */
7206 if (STMT_SLP_TYPE (stmt_info))
7207 return false;
7208
7209 exit_phi = NULL;
7210 latch_e = loop_latch_edge (loop->inner);
7211 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7212 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7213 {
7214 gimple *use_stmt = USE_STMT (use_p);
7215 if (is_gimple_debug (use_stmt))
7216 continue;
7217
7218 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7219 {
7220 exit_phi = use_stmt;
7221 break;
7222 }
7223 }
7224 if (exit_phi)
7225 {
7226 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7227 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7228 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7229 {
7230 if (dump_enabled_p ())
7231 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7232 "inner-loop induction only used outside "
7233 "of the outer vectorized loop.\n");
7234 return false;
7235 }
7236 }
7237
7238 nested_in_vect_loop = true;
7239 iv_loop = loop->inner;
7240 }
7241 else
7242 iv_loop = loop;
7243 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7244
7245 if (slp_node && !nunits.is_constant ())
7246 {
7247 /* The current SLP code creates the initial value element-by-element. */
7248 if (dump_enabled_p ())
7249 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7250 "SLP induction not supported for variable-length"
7251 " vectors.\n");
7252 return false;
7253 }
7254
7255 if (!vec_stmt) /* transformation not required. */
7256 {
7257 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7258 DUMP_VECT_SCOPE ("vectorizable_induction");
7259 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7260 return true;
7261 }
7262
7263 /* Transform. */
7264
7265 /* Compute a vector variable, initialized with the first VF values of
7266 the induction variable. E.g., for an iv with IV_PHI='X' and
7267 evolution S, for a vector of 4 units, we want to compute:
7268 [X, X + S, X + 2*S, X + 3*S]. */
7269
7270 if (dump_enabled_p ())
7271 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7272
7273 latch_e = loop_latch_edge (iv_loop);
7274 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7275
7276 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7277 gcc_assert (step_expr != NULL_TREE);
7278
7279 pe = loop_preheader_edge (iv_loop);
7280 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7281 loop_preheader_edge (iv_loop));
7282
7283 stmts = NULL;
7284 if (!nested_in_vect_loop)
7285 {
7286 /* Convert the initial value to the desired type. */
7287 tree new_type = TREE_TYPE (vectype);
7288 init_expr = gimple_convert (&stmts, new_type, init_expr);
7289
7290 /* If we are using the loop mask to "peel" for alignment then we need
7291 to adjust the start value here. */
7292 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7293 if (skip_niters != NULL_TREE)
7294 {
7295 if (FLOAT_TYPE_P (vectype))
7296 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7297 skip_niters);
7298 else
7299 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7300 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7301 skip_niters, step_expr);
7302 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7303 init_expr, skip_step);
7304 }
7305 }
7306
7307 /* Convert the step to the desired type. */
7308 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7309
7310 if (stmts)
7311 {
7312 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7313 gcc_assert (!new_bb);
7314 }
7315
7316 /* Find the first insertion point in the BB. */
7317 basic_block bb = gimple_bb (phi);
7318 si = gsi_after_labels (bb);
7319
7320 /* For SLP induction we have to generate several IVs as for example
7321 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7322 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7323 [VF*S, VF*S, VF*S, VF*S] for all. */
7324 if (slp_node)
7325 {
7326 /* Enforced above. */
7327 unsigned int const_nunits = nunits.to_constant ();
7328
7329 /* Generate [VF*S, VF*S, ... ]. */
7330 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7331 {
7332 expr = build_int_cst (integer_type_node, vf);
7333 expr = fold_convert (TREE_TYPE (step_expr), expr);
7334 }
7335 else
7336 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7337 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7338 expr, step_expr);
7339 if (! CONSTANT_CLASS_P (new_name))
7340 new_name = vect_init_vector (stmt_info, new_name,
7341 TREE_TYPE (step_expr), NULL);
7342 new_vec = build_vector_from_val (vectype, new_name);
7343 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7344
7345 /* Now generate the IVs. */
7346 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7347 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7348 unsigned elts = const_nunits * nvects;
7349 unsigned nivs = least_common_multiple (group_size,
7350 const_nunits) / const_nunits;
7351 gcc_assert (elts % group_size == 0);
7352 tree elt = init_expr;
7353 unsigned ivn;
7354 for (ivn = 0; ivn < nivs; ++ivn)
7355 {
7356 tree_vector_builder elts (vectype, const_nunits, 1);
7357 stmts = NULL;
7358 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7359 {
7360 if (ivn*const_nunits + eltn >= group_size
7361 && (ivn * const_nunits + eltn) % group_size == 0)
7362 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7363 elt, step_expr);
7364 elts.quick_push (elt);
7365 }
7366 vec_init = gimple_build_vector (&stmts, &elts);
7367 if (stmts)
7368 {
7369 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7370 gcc_assert (!new_bb);
7371 }
7372
7373 /* Create the induction-phi that defines the induction-operand. */
7374 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7375 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7376 stmt_vec_info induction_phi_info
7377 = loop_vinfo->add_stmt (induction_phi);
7378 induc_def = PHI_RESULT (induction_phi);
7379
7380 /* Create the iv update inside the loop */
7381 vec_def = make_ssa_name (vec_dest);
7382 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7383 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7384 loop_vinfo->add_stmt (new_stmt);
7385
7386 /* Set the arguments of the phi node: */
7387 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7388 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7389 UNKNOWN_LOCATION);
7390
7391 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7392 }
7393
7394 /* Re-use IVs when we can. */
7395 if (ivn < nvects)
7396 {
7397 unsigned vfp
7398 = least_common_multiple (group_size, const_nunits) / group_size;
7399 /* Generate [VF'*S, VF'*S, ... ]. */
7400 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7401 {
7402 expr = build_int_cst (integer_type_node, vfp);
7403 expr = fold_convert (TREE_TYPE (step_expr), expr);
7404 }
7405 else
7406 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7407 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7408 expr, step_expr);
7409 if (! CONSTANT_CLASS_P (new_name))
7410 new_name = vect_init_vector (stmt_info, new_name,
7411 TREE_TYPE (step_expr), NULL);
7412 new_vec = build_vector_from_val (vectype, new_name);
7413 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7414 for (; ivn < nvects; ++ivn)
7415 {
7416 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7417 tree def;
7418 if (gimple_code (iv) == GIMPLE_PHI)
7419 def = gimple_phi_result (iv);
7420 else
7421 def = gimple_assign_lhs (iv);
7422 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7423 PLUS_EXPR,
7424 def, vec_step);
7425 if (gimple_code (iv) == GIMPLE_PHI)
7426 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7427 else
7428 {
7429 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7430 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7431 }
7432 SLP_TREE_VEC_STMTS (slp_node).quick_push
7433 (loop_vinfo->add_stmt (new_stmt));
7434 }
7435 }
7436
7437 return true;
7438 }
7439
7440 /* Create the vector that holds the initial_value of the induction. */
7441 if (nested_in_vect_loop)
7442 {
7443 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7444 been created during vectorization of previous stmts. We obtain it
7445 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7446 vec_init = vect_get_vec_def_for_operand (init_expr, stmt_info);
7447 /* If the initial value is not of proper type, convert it. */
7448 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7449 {
7450 new_stmt
7451 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7452 vect_simple_var,
7453 "vec_iv_"),
7454 VIEW_CONVERT_EXPR,
7455 build1 (VIEW_CONVERT_EXPR, vectype,
7456 vec_init));
7457 vec_init = gimple_assign_lhs (new_stmt);
7458 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7459 new_stmt);
7460 gcc_assert (!new_bb);
7461 loop_vinfo->add_stmt (new_stmt);
7462 }
7463 }
7464 else
7465 {
7466 /* iv_loop is the loop to be vectorized. Create:
7467 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7468 stmts = NULL;
7469 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7470
7471 unsigned HOST_WIDE_INT const_nunits;
7472 if (nunits.is_constant (&const_nunits))
7473 {
7474 tree_vector_builder elts (vectype, const_nunits, 1);
7475 elts.quick_push (new_name);
7476 for (i = 1; i < const_nunits; i++)
7477 {
7478 /* Create: new_name_i = new_name + step_expr */
7479 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7480 new_name, step_expr);
7481 elts.quick_push (new_name);
7482 }
7483 /* Create a vector from [new_name_0, new_name_1, ...,
7484 new_name_nunits-1] */
7485 vec_init = gimple_build_vector (&stmts, &elts);
7486 }
7487 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7488 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7489 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7490 new_name, step_expr);
7491 else
7492 {
7493 /* Build:
7494 [base, base, base, ...]
7495 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7496 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7497 gcc_assert (flag_associative_math);
7498 tree index = build_index_vector (vectype, 0, 1);
7499 tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7500 new_name);
7501 tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7502 step_expr);
7503 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7504 vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7505 vec_init, step_vec);
7506 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7507 vec_init, base_vec);
7508 }
7509
7510 if (stmts)
7511 {
7512 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7513 gcc_assert (!new_bb);
7514 }
7515 }
7516
7517
7518 /* Create the vector that holds the step of the induction. */
7519 if (nested_in_vect_loop)
7520 /* iv_loop is nested in the loop to be vectorized. Generate:
7521 vec_step = [S, S, S, S] */
7522 new_name = step_expr;
7523 else
7524 {
7525 /* iv_loop is the loop to be vectorized. Generate:
7526 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7527 gimple_seq seq = NULL;
7528 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7529 {
7530 expr = build_int_cst (integer_type_node, vf);
7531 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7532 }
7533 else
7534 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7535 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7536 expr, step_expr);
7537 if (seq)
7538 {
7539 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7540 gcc_assert (!new_bb);
7541 }
7542 }
7543
7544 t = unshare_expr (new_name);
7545 gcc_assert (CONSTANT_CLASS_P (new_name)
7546 || TREE_CODE (new_name) == SSA_NAME);
7547 new_vec = build_vector_from_val (vectype, t);
7548 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7549
7550
7551 /* Create the following def-use cycle:
7552 loop prolog:
7553 vec_init = ...
7554 vec_step = ...
7555 loop:
7556 vec_iv = PHI <vec_init, vec_loop>
7557 ...
7558 STMT
7559 ...
7560 vec_loop = vec_iv + vec_step; */
7561
7562 /* Create the induction-phi that defines the induction-operand. */
7563 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7564 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7565 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7566 induc_def = PHI_RESULT (induction_phi);
7567
7568 /* Create the iv update inside the loop */
7569 vec_def = make_ssa_name (vec_dest);
7570 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7571 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7572 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7573
7574 /* Set the arguments of the phi node: */
7575 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7576 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7577 UNKNOWN_LOCATION);
7578
7579 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7580
7581 /* In case the vectorization factor (VF) is bigger than the number
7582 of elements that we can fit in a vectype (nunits), we have to generate
7583 more than one vector stmt - i.e. - we need to "unroll" the
7584 vector stmt by a factor VF/nunits. For more details see documentation
7585 in vectorizable_operation. */
7586
7587 if (ncopies > 1)
7588 {
7589 gimple_seq seq = NULL;
7590 stmt_vec_info prev_stmt_vinfo;
7591 /* FORNOW. This restriction should be relaxed. */
7592 gcc_assert (!nested_in_vect_loop);
7593
7594 /* Create the vector that holds the step of the induction. */
7595 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7596 {
7597 expr = build_int_cst (integer_type_node, nunits);
7598 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7599 }
7600 else
7601 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7602 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7603 expr, step_expr);
7604 if (seq)
7605 {
7606 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7607 gcc_assert (!new_bb);
7608 }
7609
7610 t = unshare_expr (new_name);
7611 gcc_assert (CONSTANT_CLASS_P (new_name)
7612 || TREE_CODE (new_name) == SSA_NAME);
7613 new_vec = build_vector_from_val (vectype, t);
7614 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7615
7616 vec_def = induc_def;
7617 prev_stmt_vinfo = induction_phi_info;
7618 for (i = 1; i < ncopies; i++)
7619 {
7620 /* vec_i = vec_prev + vec_step */
7621 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7622 vec_def, vec_step);
7623 vec_def = make_ssa_name (vec_dest, new_stmt);
7624 gimple_assign_set_lhs (new_stmt, vec_def);
7625
7626 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7627 new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7628 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7629 prev_stmt_vinfo = new_stmt_info;
7630 }
7631 }
7632
7633 if (nested_in_vect_loop)
7634 {
7635 /* Find the loop-closed exit-phi of the induction, and record
7636 the final vector of induction results: */
7637 exit_phi = NULL;
7638 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7639 {
7640 gimple *use_stmt = USE_STMT (use_p);
7641 if (is_gimple_debug (use_stmt))
7642 continue;
7643
7644 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7645 {
7646 exit_phi = use_stmt;
7647 break;
7648 }
7649 }
7650 if (exit_phi)
7651 {
7652 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7653 /* FORNOW. Currently not supporting the case that an inner-loop induction
7654 is not used in the outer-loop (i.e. only outside the outer-loop). */
7655 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7656 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7657
7658 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7659 if (dump_enabled_p ())
7660 dump_printf_loc (MSG_NOTE, vect_location,
7661 "vector of inductions after inner-loop:%G",
7662 new_stmt);
7663 }
7664 }
7665
7666
7667 if (dump_enabled_p ())
7668 dump_printf_loc (MSG_NOTE, vect_location,
7669 "transform induction: created def-use cycle: %G%G",
7670 induction_phi, SSA_NAME_DEF_STMT (vec_def));
7671
7672 return true;
7673 }
7674
7675 /* Function vectorizable_live_operation.
7676
7677 STMT_INFO computes a value that is used outside the loop. Check if
7678 it can be supported. */
7679
7680 bool
7681 vectorizable_live_operation (stmt_vec_info stmt_info,
7682 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7683 slp_tree slp_node, int slp_index,
7684 stmt_vec_info *vec_stmt,
7685 stmt_vector_for_cost *)
7686 {
7687 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7688 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7689 imm_use_iterator imm_iter;
7690 tree lhs, lhs_type, bitsize, vec_bitsize;
7691 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7692 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7693 int ncopies;
7694 gimple *use_stmt;
7695 auto_vec<tree> vec_oprnds;
7696 int vec_entry = 0;
7697 poly_uint64 vec_index = 0;
7698
7699 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7700
7701 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7702 return false;
7703
7704 /* FORNOW. CHECKME. */
7705 if (nested_in_vect_loop_p (loop, stmt_info))
7706 return false;
7707
7708 /* If STMT is not relevant and it is a simple assignment and its inputs are
7709 invariant then it can remain in place, unvectorized. The original last
7710 scalar value that it computes will be used. */
7711 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7712 {
7713 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
7714 if (dump_enabled_p ())
7715 dump_printf_loc (MSG_NOTE, vect_location,
7716 "statement is simple and uses invariant. Leaving in "
7717 "place.\n");
7718 return true;
7719 }
7720
7721 if (slp_node)
7722 ncopies = 1;
7723 else
7724 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7725
7726 if (slp_node)
7727 {
7728 gcc_assert (slp_index >= 0);
7729
7730 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7731 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7732
7733 /* Get the last occurrence of the scalar index from the concatenation of
7734 all the slp vectors. Calculate which slp vector it is and the index
7735 within. */
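	 /* A worked illustration (hypothetical sizes): with 2 vectors of
	    4 lanes and 3 scalar stmts, SLP_INDEX 1 gives
	    POS = 2*4 - 3 + 1 = 6, i.e. VEC_ENTRY 1 and lane VEC_INDEX 2.  */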
7736 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
7737
7738 /* Calculate which vector contains the result, and which lane of
7739 that vector we need. */
7740 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7741 {
7742 if (dump_enabled_p ())
7743 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7744 "Cannot determine which vector holds the"
7745 " final result.\n");
7746 return false;
7747 }
7748 }
7749
7750 if (!vec_stmt)
7751 {
7752 /* No transformation required. */
7753 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7754 {
7755 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7756 OPTIMIZE_FOR_SPEED))
7757 {
7758 if (dump_enabled_p ())
7759 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7760 "can't use a fully-masked loop because "
7761 "the target doesn't support extract last "
7762 "reduction.\n");
7763 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7764 }
7765 else if (slp_node)
7766 {
7767 if (dump_enabled_p ())
7768 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7769 "can't use a fully-masked loop because an "
7770 "SLP statement is live after the loop.\n");
7771 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7772 }
7773 else if (ncopies > 1)
7774 {
7775 if (dump_enabled_p ())
7776 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7777 "can't use a fully-masked loop because"
7778 " ncopies is greater than 1.\n");
7779 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7780 }
7781 else
7782 {
7783 gcc_assert (ncopies == 1 && !slp_node);
7784 vect_record_loop_mask (loop_vinfo,
7785 &LOOP_VINFO_MASKS (loop_vinfo),
7786 1, vectype);
7787 }
7788 }
7789 return true;
7790 }
7791
7792 /* Use the lhs of the original scalar statement. */
7793 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
7794
7795 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7796 : gimple_get_lhs (stmt);
7797 lhs_type = TREE_TYPE (lhs);
7798
7799 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7800 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7801 : TYPE_SIZE (TREE_TYPE (vectype)));
7802 vec_bitsize = TYPE_SIZE (vectype);
7803
7804 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
7805 tree vec_lhs, bitstart;
7806 if (slp_node)
7807 {
7808 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7809
7810 /* Get the correct slp vectorized stmt. */
7811 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
7812 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
7813 vec_lhs = gimple_phi_result (phi);
7814 else
7815 vec_lhs = gimple_get_lhs (vec_stmt);
7816
7817 /* Get entry to use. */
7818 bitstart = bitsize_int (vec_index);
7819 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7820 }
7821 else
7822 {
7823 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7824 vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
7825 gcc_checking_assert (ncopies == 1
7826 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7827
7828 /* For multiple copies, get the last copy. */
7829 for (int i = 1; i < ncopies; ++i)
7830 vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
7831
7832 /* Get the last lane in the vector. */
7833 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7834 }
7835
7836 gimple_seq stmts = NULL;
7837 tree new_tree;
7838 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
7839 {
7840 /* Emit:
7841
7842 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
7843
7844 where VEC_LHS is the vectorized live-out result and MASK is
7845 the loop mask for the final iteration. */
7846 gcc_assert (ncopies == 1 && !slp_node);
7847 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
7848 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
7849 1, vectype, 0);
7850 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
7851 scalar_type, mask, vec_lhs);
7852
7853 /* Convert the extracted vector element to the required scalar type. */
7854 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
7855 }
7856 else
7857 {
7858 tree bftype = TREE_TYPE (vectype);
7859 if (VECTOR_BOOLEAN_TYPE_P (vectype))
7860 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7861 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
7862 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
7863 &stmts, true, NULL_TREE);
7864 }
7865
7866 if (stmts)
7867 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
7868
7869 /* Replace use of lhs with newly computed result. If the use stmt is a
7870 single arg PHI, just replace all uses of the PHI result, since the
7871 lcssa PHI defining LHS may appear before the newly inserted stmt. */
7872 use_operand_p use_p;
7873 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
7874 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
7875 && !is_gimple_debug (use_stmt))
7876 {
7877 if (gimple_code (use_stmt) == GIMPLE_PHI
7878 && gimple_phi_num_args (use_stmt) == 1)
7879 {
7880 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
7881 }
7882 else
7883 {
7884 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
7885 SET_USE (use_p, new_tree);
7886 }
7887 update_stmt (use_stmt);
7888 }
7889
7890 return true;
7891 }
7892
7893 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
7894
7895 static void
7896 vect_loop_kill_debug_uses (struct loop *loop, stmt_vec_info stmt_info)
7897 {
7898 ssa_op_iter op_iter;
7899 imm_use_iterator imm_iter;
7900 def_operand_p def_p;
7901 gimple *ustmt;
7902
7903 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
7904 {
7905 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
7906 {
7907 basic_block bb;
7908
7909 if (!is_gimple_debug (ustmt))
7910 continue;
7911
7912 bb = gimple_bb (ustmt);
7913
7914 if (!flow_bb_inside_loop_p (loop, bb))
7915 {
7916 if (gimple_debug_bind_p (ustmt))
7917 {
7918 if (dump_enabled_p ())
7919 dump_printf_loc (MSG_NOTE, vect_location,
7920 "killing debug use\n");
7921
7922 gimple_debug_bind_reset_value (ustmt);
7923 update_stmt (ustmt);
7924 }
7925 else
7926 gcc_unreachable ();
7927 }
7928 }
7929 }
7930 }
7931
7932 /* Given loop represented by LOOP_VINFO, return true if computation of
7933 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
7934 otherwise. */
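/* For example (illustrative): if NITERSM1 equals the maximum value of its
   type, then NITERSM1 + 1 wraps around to zero, so the computation of
   NITERS would overflow and we must return false.  */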
7935
7936 static bool
7937 loop_niters_no_overflow (loop_vec_info loop_vinfo)
7938 {
7939 /* Constant case. */
7940 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7941 {
7942 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
7943 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
7944
7945 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
7946 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
7947 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
7948 return true;
7949 }
7950
7951 widest_int max;
7952 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7953 /* Check the upper bound of loop niters. */
7954 if (get_max_loop_iterations (loop, &max))
7955 {
7956 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
7957 signop sgn = TYPE_SIGN (type);
7958 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
7959 if (max < type_max)
7960 return true;
7961 }
7962 return false;
7963 }
7964
7965 /* Return a mask type with half the number of elements as TYPE. */
7966
7967 tree
7968 vect_halve_mask_nunits (tree type)
7969 {
7970 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
7971 return build_truth_vector_type (nunits, current_vector_size);
7972 }
7973
7974 /* Return a mask type with twice as many elements as TYPE. */
7975
7976 tree
7977 vect_double_mask_nunits (tree type)
7978 {
7979 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
7980 return build_truth_vector_type (nunits, current_vector_size);
7981 }
7982
7983 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
7984 contain a sequence of NVECTORS masks that each control a vector of type
7985 VECTYPE. */
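/* Illustratively (hypothetical numbers): with a vectorization factor of 16,
   recording 2 mask vectors of 8 elements each gives
   nscalars_per_iter = 2 * 8 / 16 = 1 for that rgroup.  */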
7986
7987 void
7988 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
7989 unsigned int nvectors, tree vectype)
7990 {
7991 gcc_assert (nvectors != 0);
7992 if (masks->length () < nvectors)
7993 masks->safe_grow_cleared (nvectors);
7994 rgroup_masks *rgm = &(*masks)[nvectors - 1];
7995 /* The number of scalars per iteration and the number of vectors are
7996 both compile-time constants. */
7997 unsigned int nscalars_per_iter
7998 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
7999 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8000 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8001 {
8002 rgm->max_nscalars_per_iter = nscalars_per_iter;
8003 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8004 }
8005 }
8006
8007 /* Given a complete set of masks MASKS, extract mask number INDEX
8008 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8009 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8010
8011 See the comment above vec_loop_masks for more details about the mask
8012 arrangement. */
8013
8014 tree
8015 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8016 unsigned int nvectors, tree vectype, unsigned int index)
8017 {
8018 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8019 tree mask_type = rgm->mask_type;
8020
8021 /* Populate the rgroup's mask array, if this is the first time we've
8022 used it. */
8023 if (rgm->masks.is_empty ())
8024 {
8025 rgm->masks.safe_grow_cleared (nvectors);
8026 for (unsigned int i = 0; i < nvectors; ++i)
8027 {
8028 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8029 /* Provide a dummy definition until the real one is available. */
8030 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8031 rgm->masks[i] = mask;
8032 }
8033 }
8034
8035 tree mask = rgm->masks[index];
8036 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8037 TYPE_VECTOR_SUBPARTS (vectype)))
8038 {
8039 /* A loop mask for data type X can be reused for data type Y
8040 if X has N times more elements than Y and if Y's elements
8041 are N times bigger than X's. In this case each sequence
8042 of N elements in the loop mask will be all-zero or all-one.
8043 We can then view-convert the mask so that each sequence of
8044 N elements is replaced by a single element. */
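      /* E.g. (illustrative): a mask created for 16 byte elements can be
	 reused for a vector of 8 halfwords; each adjacent pair of mask
	 elements is all-zero or all-one, and the VIEW_CONVERT_EXPR below
	 folds each pair into a single wider mask element.  */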
8045 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8046 TYPE_VECTOR_SUBPARTS (vectype)));
8047 gimple_seq seq = NULL;
8048 mask_type = build_same_sized_truth_vector_type (vectype);
8049 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8050 if (seq)
8051 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8052 }
8053 return mask;
8054 }
8055
8056 /* Scale profiling counters by estimation for LOOP which is vectorized
8057 by factor VF. */
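/* E.g. (illustrative): after vectorizing by VF the loop is expected to run
   roughly 1/VF as many iterations, so the single exit edge below is given
   probability 1 / (new_est_niter + 1) and the latch edge the complement.  */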
8058
8059 static void
8060 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8061 {
8062 edge preheader = loop_preheader_edge (loop);
8063 /* Reduce loop iterations by the vectorization factor. */
8064 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8065 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8066
8067 if (freq_h.nonzero_p ())
8068 {
8069 profile_probability p;
8070
8071 /* Avoid dropping loop body profile counter to 0 because of zero count
8072 in loop's preheader. */
8073 if (!(freq_e == profile_count::zero ()))
8074 freq_e = freq_e.force_nonzero ();
8075 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8076 scale_loop_frequencies (loop, p);
8077 }
8078
8079 edge exit_e = single_exit (loop);
8080 exit_e->probability = profile_probability::always ()
8081 .apply_scale (1, new_est_niter + 1);
8082
8083 edge exit_l = single_pred_edge (loop->latch);
8084 profile_probability prob = exit_l->probability;
8085 exit_l->probability = exit_e->probability.invert ();
8086 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8087 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8088 }
8089
8090 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8091 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8092 stmt_vec_info. */
8093
8094 static void
8095 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8096 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8097 {
8098 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8099 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8100
8101 if (dump_enabled_p ())
8102 dump_printf_loc (MSG_NOTE, vect_location,
8103 "------>vectorizing statement: %G", stmt_info->stmt);
8104
8105 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8106 vect_loop_kill_debug_uses (loop, stmt_info);
8107
8108 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8109 && !STMT_VINFO_LIVE_P (stmt_info))
8110 return;
8111
8112 if (STMT_VINFO_VECTYPE (stmt_info))
8113 {
8114 poly_uint64 nunits
8115 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8116 if (!STMT_SLP_TYPE (stmt_info)
8117 && maybe_ne (nunits, vf)
8118 && dump_enabled_p ())
8119 /* For SLP the VF is set according to the unrolling factor, and not
8120 to the vector size, hence for SLP this print is not valid. */
8121 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8122 }
8123
8124 /* Pure SLP statements have already been vectorized. We still need
8125 to apply loop vectorization to hybrid SLP statements. */
8126 if (PURE_SLP_STMT (stmt_info))
8127 return;
8128
8129 if (dump_enabled_p ())
8130 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8131
8132 if (vect_transform_stmt (stmt_info, gsi, NULL, NULL))
8133 *seen_store = stmt_info;
8134 }
8135
8136 /* Function vect_transform_loop.
8137
8138 The analysis phase has determined that the loop is vectorizable.
8139 Vectorize the loop - create vectorized stmts to replace the scalar
8140 stmts in the loop, and update the loop exit condition.
8141 Returns the scalar epilogue loop if any. */
8142
8143 struct loop *
8144 vect_transform_loop (loop_vec_info loop_vinfo)
8145 {
8146 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8147 struct loop *epilogue = NULL;
8148 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8149 int nbbs = loop->num_nodes;
8150 int i;
8151 tree niters_vector = NULL_TREE;
8152 tree step_vector = NULL_TREE;
8153 tree niters_vector_mult_vf = NULL_TREE;
8154 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8155 unsigned int lowest_vf = constant_lower_bound (vf);
8156 gimple *stmt;
8157 bool check_profitability = false;
8158 unsigned int th;
8159
8160 DUMP_VECT_SCOPE ("vec_transform_loop");
8161
8162 loop_vinfo->shared->check_datarefs ();
8163
8164 /* Use the more conservative vectorization threshold. If the number
8165 of iterations is constant assume the cost check has been performed
8166 by our caller. If the threshold makes all loops profitable that
8167 run at least the (estimated) vectorization factor number of times
8168 checking is pointless, too. */
8169 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8170 if (th >= vect_vf_for_cost (loop_vinfo)
8171 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8172 {
8173 if (dump_enabled_p ())
8174 dump_printf_loc (MSG_NOTE, vect_location,
8175 "Profitability threshold is %d loop iterations.\n",
8176 th);
8177 check_profitability = true;
8178 }
8179
8180 /* Make sure there exists a single-predecessor exit bb. Do this before
8181 versioning. */
8182 edge e = single_exit (loop);
8183 if (! single_pred_p (e->dest))
8184 {
8185 split_loop_exit_edge (e, true);
8186 if (dump_enabled_p ())
8187 dump_printf (MSG_NOTE, "split exit edge\n");
8188 }
8189
8190 /* Version the loop first, if required, so the profitability check
8191 comes first. */
8192
8193 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8194 {
8195 poly_uint64 versioning_threshold
8196 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8197 if (check_profitability
8198 && ordered_p (poly_uint64 (th), versioning_threshold))
8199 {
8200 versioning_threshold = ordered_max (poly_uint64 (th),
8201 versioning_threshold);
8202 check_profitability = false;
8203 }
8204 vect_loop_versioning (loop_vinfo, th, check_profitability,
8205 versioning_threshold);
8206 check_profitability = false;
8207 }
8208
8209 /* Make sure there exists a single-predecessor exit bb also on the
8210 scalar loop copy. Do this after versioning but before peeling
8211 so CFG structure is fine for both scalar and if-converted loop
8212 to make slpeel_duplicate_current_defs_from_edges face matched
8213 loop closed PHI nodes on the exit. */
8214 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8215 {
8216 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8217 if (! single_pred_p (e->dest))
8218 {
8219 split_loop_exit_edge (e, true);
8220 if (dump_enabled_p ())
8221 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8222 }
8223 }
8224
8225 tree niters = vect_build_loop_niters (loop_vinfo);
8226 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8227 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8228 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8229 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8230 &step_vector, &niters_vector_mult_vf, th,
8231 check_profitability, niters_no_overflow);
8232
8233 if (niters_vector == NULL_TREE)
8234 {
8235 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8236 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8237 && known_eq (lowest_vf, vf))
8238 {
8239 niters_vector
8240 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8241 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8242 step_vector = build_one_cst (TREE_TYPE (niters));
8243 }
8244 else
8245 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8246 &step_vector, niters_no_overflow);
8247 }
8248
8249 /* 1) Make sure the loop header has exactly two entries
8250 2) Make sure we have a preheader basic block. */
8251
8252 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8253
8254 split_edge (loop_preheader_edge (loop));
8255
8256 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8257 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8258 /* This will deal with any possible peeling. */
8259 vect_prepare_for_masked_peels (loop_vinfo);
8260
8261 /* Schedule the SLP instances first, then handle loop vectorization
8262 below. */
8263 if (!loop_vinfo->slp_instances.is_empty ())
8264 {
8265 DUMP_VECT_SCOPE ("scheduling SLP instances");
8266 vect_schedule_slp (loop_vinfo);
8267 }
8268
8269 /* FORNOW: the vectorizer supports only loops whose body consists
8270 of one basic block (header + empty latch). When the vectorizer
8271 supports more involved loop forms, the order in which the BBs are
8272 traversed will need to be reconsidered. */
8273
8274 for (i = 0; i < nbbs; i++)
8275 {
8276 basic_block bb = bbs[i];
8277 stmt_vec_info stmt_info;
8278
8279 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8280 gsi_next (&si))
8281 {
8282 gphi *phi = si.phi ();
8283 if (dump_enabled_p ())
8284 dump_printf_loc (MSG_NOTE, vect_location,
8285 "------>vectorizing phi: %G", phi);
8286 stmt_info = loop_vinfo->lookup_stmt (phi);
8287 if (!stmt_info)
8288 continue;
8289
8290 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8291 vect_loop_kill_debug_uses (loop, stmt_info);
8292
8293 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8294 && !STMT_VINFO_LIVE_P (stmt_info))
8295 continue;
8296
8297 if (STMT_VINFO_VECTYPE (stmt_info)
8298 && (maybe_ne
8299 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8300 && dump_enabled_p ())
8301 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8302
8303 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8304 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8305 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8306 && ! PURE_SLP_STMT (stmt_info))
8307 {
8308 if (dump_enabled_p ())
8309 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8310 vect_transform_stmt (stmt_info, NULL, NULL, NULL);
8311 }
8312 }
8313
8314 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8315 !gsi_end_p (si);)
8316 {
8317 stmt = gsi_stmt (si);
8318 /* During vectorization remove existing clobber stmts. */
8319 if (gimple_clobber_p (stmt))
8320 {
8321 unlink_stmt_vdef (stmt);
8322 gsi_remove (&si, true);
8323 release_defs (stmt);
8324 }
8325 else
8326 {
8327 stmt_info = loop_vinfo->lookup_stmt (stmt);
8328
8329 /* vector stmts created in the outer-loop during vectorization of
8330 stmts in an inner-loop may not have a stmt_info, and do not
8331 need to be vectorized. */
8332 stmt_vec_info seen_store = NULL;
8333 if (stmt_info)
8334 {
8335 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8336 {
8337 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8338 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8339 !gsi_end_p (subsi); gsi_next (&subsi))
8340 {
8341 stmt_vec_info pat_stmt_info
8342 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
8343 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
8344 &si, &seen_store);
8345 }
8346 stmt_vec_info pat_stmt_info
8347 = STMT_VINFO_RELATED_STMT (stmt_info);
8348 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
8349 &seen_store);
8350 }
8351 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
8352 &seen_store);
8353 }
8354 gsi_next (&si);
8355 if (seen_store)
8356 {
8357 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8358 /* Interleaving. The vectorization of the
8359 interleaving chain was completed - free all
8360 the stores in the chain. */
8361 vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8362 else
8363 /* Free the attached stmt_vec_info and remove the stmt. */
8364 loop_vinfo->remove_stmt (stmt_info);
8365 }
8366 }
8367 }
8368
8369 /* Stub out scalar statements that must not survive vectorization.
8370 Doing this here helps with grouped statements, or statements that
8371 are involved in patterns. */
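/* For illustration (SSA names made up): a scalar statement like
_5 = MASK_LOAD (ptr_3, 0B, mask_7);
that must not survive vectorization is replaced by
_5 = 0;
so that no scalar masked load remains in the loop body. */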
8372 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8373 !gsi_end_p (gsi); gsi_next (&gsi))
8374 {
8375 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8376 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8377 {
8378 tree lhs = gimple_get_lhs (call);
8379 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8380 {
8381 tree zero = build_zero_cst (TREE_TYPE (lhs));
8382 gimple *new_stmt = gimple_build_assign (lhs, zero);
8383 gsi_replace (&gsi, new_stmt, true);
8384 }
8385 }
8386 }
8387 } /* BBs in loop */
8388
8389 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8390 a zero NITERS becomes a nonzero NITERS_VECTOR. */
8391 if (integer_onep (step_vector))
8392 niters_no_overflow = true;
8393 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8394 niters_vector_mult_vf, !niters_no_overflow);
8395
8396 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8397 scale_profile_for_vect_loop (loop, assumed_vf);
8398
8399 /* True if the final iteration might not handle a full vector's
8400 worth of scalar iterations. */
8401 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8402 /* The minimum number of iterations performed by the epilogue. This
8403 is 1 when peeling for gaps because we always need a final scalar
8404 iteration. */
8405 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8406 /* +1 to convert latch counts to loop iteration counts,
8407 -min_epilogue_iters to remove iterations that cannot be performed
8408 by the vector code. */
8409 int bias_for_lowest = 1 - min_epilogue_iters;
8410 int bias_for_assumed = bias_for_lowest;
8411 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8412 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8413 {
8414 /* When the amount of peeling is known at compile time, the first
8415 iteration will have exactly alignment_npeels active elements.
8416 In the worst case it will have at least one. */
8417 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8418 bias_for_lowest += lowest_vf - min_first_active;
8419 bias_for_assumed += assumed_vf - min_first_active;
8420 }
8421 /* In these calculations the "- 1" converts loop iteration counts
8422 back to latch counts. */
8423 if (loop->any_upper_bound)
8424 loop->nb_iterations_upper_bound
8425 = (final_iter_may_be_partial
8426 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8427 lowest_vf) - 1
8428 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8429 lowest_vf) - 1);
8430 if (loop->any_likely_upper_bound)
8431 loop->nb_iterations_likely_upper_bound
8432 = (final_iter_may_be_partial
8433 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8434 + bias_for_lowest, lowest_vf) - 1
8435 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8436 + bias_for_lowest, lowest_vf) - 1);
8437 if (loop->any_estimate)
8438 loop->nb_iterations_estimate
8439 = (final_iter_may_be_partial
8440 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8441 assumed_vf) - 1
8442 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8443 assumed_vf) - 1);
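/* Worked example (illustrative numbers only): with a latch-count upper
bound of 99 (100 iterations), lowest_vf == 4, no peeling for gaps and
no full masking, bias_for_lowest is 1 and the new bound becomes
floor ((99 + 1) / 4) - 1 == 24, i.e. at most 25 vector iterations. */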
8444
8445 if (dump_enabled_p ())
8446 {
8447 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8448 {
8449 dump_printf_loc (MSG_NOTE, vect_location,
8450 "LOOP VECTORIZED\n");
8451 if (loop->inner)
8452 dump_printf_loc (MSG_NOTE, vect_location,
8453 "OUTER LOOP VECTORIZED\n");
8454 dump_printf (MSG_NOTE, "\n");
8455 }
8456 else
8457 {
8458 dump_printf_loc (MSG_NOTE, vect_location,
8459 "LOOP EPILOGUE VECTORIZED (VS=");
8460 dump_dec (MSG_NOTE, current_vector_size);
8461 dump_printf (MSG_NOTE, ")\n");
8462 }
8463 }
8464
8465 /* Loops vectorized with a variable factor won't benefit from
8466 unrolling/peeling. */
8467 if (!vf.is_constant ())
8468 {
8469 loop->unroll = 1;
8470 if (dump_enabled_p ())
8471 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
8472 " variable-length vectorization factor\n");
8473 }
8474 /* Free SLP instances here because otherwise stmt reference counting
8475 won't work. */
8476 slp_instance instance;
8477 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8478 vect_free_slp_instance (instance, true);
8479 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8480 /* Clear the safelen field since its value is invalid after vectorization:
8481 the vectorized loop can have loop-carried dependencies. */
8482 loop->safelen = 0;
8483
8484 /* Don't vectorize the epilogue of an epilogue loop. */
8485 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8486 epilogue = NULL;
8487
8488 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8489 epilogue = NULL;
8490
8491 if (epilogue)
8492 {
8493 auto_vector_sizes vector_sizes;
8494 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
8495 unsigned int next_size = 0;
8496
8497 /* Note LOOP_VINFO_NITERS_KNOWN_P and LOOP_VINFO_INT_NITERS work
8498 on niters already adjusted for the iterations of the prologue. */
8499 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8500 && known_eq (vf, lowest_vf))
8501 {
8502 unsigned HOST_WIDE_INT eiters
8503 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8504 - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
8505 eiters
8506 = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
8507 epilogue->nb_iterations_upper_bound = eiters - 1;
8508 epilogue->any_upper_bound = true;
8509
8510 unsigned int ratio;
8511 while (next_size < vector_sizes.length ()
8512 && !(constant_multiple_p (current_vector_size,
8513 vector_sizes[next_size], &ratio)
8514 && eiters >= lowest_vf / ratio))
8515 next_size += 1;
8516 }
8517 else
8518 while (next_size < vector_sizes.length ()
8519 && maybe_lt (current_vector_size, vector_sizes[next_size]))
8520 next_size += 1;
8521
8522 if (next_size == vector_sizes.length ())
8523 epilogue = NULL;
8524 }
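/* Worked example (hypothetical target and numbers): with NITERS == 100,
lowest_vf == 8 and no peeling for gaps, eiters is 100 % 8 == 4, so the
epilogue executes at most 4 iterations. A vector size that is half of
current_vector_size gives ratio == 2 and 8 / 2 == 4 <= eiters, so that
size would be chosen; if no suitable smaller size exists, EPILOGUE is
set to NULL and the epilogue is left scalar. */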
8525
8526 if (epilogue)
8527 {
8528 epilogue->force_vectorize = loop->force_vectorize;
8529 epilogue->safelen = loop->safelen;
8530 epilogue->dont_vectorize = false;
8531
8532 /* We may need to if-convert the epilogue to vectorize it. */
8533 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8534 tree_if_conversion (epilogue);
8535 }
8536
8537 return epilogue;
8538 }
8539
8540 /* The code below performs a simple optimization - it reverts
8541 if-conversion for masked stores: if the mask of a store is all zeros,
8542 skip the store and, if possible, the producers of the stored values too.
8543 For example,
8544 for (i=0; i<n; i++)
8545 if (c[i])
8546 {
8547 p1[i] += 1;
8548 p2[i] = p3[i] +2;
8549 }
8550 this transformation will produce the following semi-hammock:
8551
8552 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
8553 {
8554 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8555 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8556 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8557 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8558 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8559 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8560 }
8561 */
8562
8563 void
8564 optimize_mask_stores (struct loop *loop)
8565 {
8566 basic_block *bbs = get_loop_body (loop);
8567 unsigned nbbs = loop->num_nodes;
8568 unsigned i;
8569 basic_block bb;
8570 struct loop *bb_loop;
8571 gimple_stmt_iterator gsi;
8572 gimple *stmt;
8573 auto_vec<gimple *> worklist;
8574 auto_purge_vect_location sentinel;
8575
8576 vect_location = find_loop_location (loop);
8577 /* Collect all masked stores in the loop, if any. */
8578 for (i = 0; i < nbbs; i++)
8579 {
8580 bb = bbs[i];
8581 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8582 gsi_next (&gsi))
8583 {
8584 stmt = gsi_stmt (gsi);
8585 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8586 worklist.safe_push (stmt);
8587 }
8588 }
8589
8590 free (bbs);
8591 if (worklist.is_empty ())
8592 return;
8593
8594 /* Loop has masked stores. */
8595 while (!worklist.is_empty ())
8596 {
8597 gimple *last, *last_store;
8598 edge e, efalse;
8599 tree mask;
8600 basic_block store_bb, join_bb;
8601 gimple_stmt_iterator gsi_to;
8602 tree vdef, new_vdef;
8603 gphi *phi;
8604 tree vectype;
8605 tree zero;
8606
8607 last = worklist.pop ();
8608 mask = gimple_call_arg (last, 2);
8609 bb = gimple_bb (last);
8610 /* Create the then-block and if-then structure in the CFG; the
8611 then-block belongs to the same loop as the if-block. That loop can
8612 be different from LOOP when a two-level loop nest is vectorized and
8613 the mask store belongs to the inner loop. */
8614 e = split_block (bb, last);
8615 bb_loop = bb->loop_father;
8616 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8617 join_bb = e->dest;
8618 store_bb = create_empty_bb (bb);
8619 add_bb_to_loop (store_bb, bb_loop);
8620 e->flags = EDGE_TRUE_VALUE;
8621 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8622 /* Give the edge to STORE_BB an unlikely probability. */
8623 efalse->probability = profile_probability::unlikely ();
8624 store_bb->count = efalse->count ();
8625 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8626 if (dom_info_available_p (CDI_DOMINATORS))
8627 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8628 if (dump_enabled_p ())
8629 dump_printf_loc (MSG_NOTE, vect_location,
8630 "Create new block %d to sink mask stores.",
8631 store_bb->index);
8632 /* Create vector comparison with boolean result. */
8633 vectype = TREE_TYPE (mask);
8634 zero = build_zero_cst (vectype);
8635 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8636 gsi = gsi_last_bb (bb);
8637 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
8638 /* Create a new PHI node for the vdef of the last masked store:
8639 .MEM_2 = VDEF <.MEM_1>
8640 will be converted to
8641 .MEM_3 = VDEF <.MEM_1>
8642 and a new PHI node will be created in the join bb
8643 .MEM_2 = PHI <.MEM_1, .MEM_3>
8644 */
8645 vdef = gimple_vdef (last);
8646 new_vdef = make_ssa_name (gimple_vop (cfun), last);
8647 gimple_set_vdef (last, new_vdef);
8648 phi = create_phi_node (vdef, join_bb);
8649 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8650
8651 /* Move all masked stores with the same mask into STORE_BB if possible. */
8652 while (true)
8653 {
8654 gimple_stmt_iterator gsi_from;
8655 gimple *stmt1 = NULL;
8656
8657 /* Move masked store to STORE_BB. */
8658 last_store = last;
8659 gsi = gsi_for_stmt (last);
8660 gsi_from = gsi;
8661 /* Shift GSI to the previous stmt for further traversal. */
8662 gsi_prev (&gsi);
8663 gsi_to = gsi_start_bb (store_bb);
8664 gsi_move_before (&gsi_from, &gsi_to);
8665 /* Set GSI_TO to the start of the now non-empty STORE_BB. */
8666 gsi_to = gsi_start_bb (store_bb);
8667 if (dump_enabled_p ())
8668 dump_printf_loc (MSG_NOTE, vect_location,
8669 "Move stmt to created bb\n%G", last);
8670 /* Move all stored value producers if possible. */
8671 while (!gsi_end_p (gsi))
8672 {
8673 tree lhs;
8674 imm_use_iterator imm_iter;
8675 use_operand_p use_p;
8676 bool res;
8677
8678 /* Skip debug statements. */
8679 if (is_gimple_debug (gsi_stmt (gsi)))
8680 {
8681 gsi_prev (&gsi);
8682 continue;
8683 }
8684 stmt1 = gsi_stmt (gsi);
8685 /* Do not consider statements writing to memory or having
8686 a volatile operand. */
8687 if (gimple_vdef (stmt1)
8688 || gimple_has_volatile_ops (stmt1))
8689 break;
8690 gsi_from = gsi;
8691 gsi_prev (&gsi);
8692 lhs = gimple_get_lhs (stmt1);
8693 if (!lhs)
8694 break;
8695
8696 /* LHS of vectorized stmt must be SSA_NAME. */
8697 if (TREE_CODE (lhs) != SSA_NAME)
8698 break;
8699
8700 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8701 {
8702 /* Remove dead scalar statement. */
8703 if (has_zero_uses (lhs))
8704 {
8705 gsi_remove (&gsi_from, true);
8706 continue;
8707 }
8708 }
8709
8710 /* Check that LHS does not have uses outside of STORE_BB. */
8711 res = true;
8712 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
8713 {
8714 gimple *use_stmt;
8715 use_stmt = USE_STMT (use_p);
8716 if (is_gimple_debug (use_stmt))
8717 continue;
8718 if (gimple_bb (use_stmt) != store_bb)
8719 {
8720 res = false;
8721 break;
8722 }
8723 }
8724 if (!res)
8725 break;
8726
8727 if (gimple_vuse (stmt1)
8728 && gimple_vuse (stmt1) != gimple_vuse (last_store))
8729 break;
8730
8731 /* Can move STMT1 to STORE_BB. */
8732 if (dump_enabled_p ())
8733 dump_printf_loc (MSG_NOTE, vect_location,
8734 "Move stmt to created bb\n%G", stmt1);
8735 gsi_move_before (&gsi_from, &gsi_to);
8736 /* Shift GSI_TO for further insertion. */
8737 gsi_prev (&gsi_to);
8738 }
8739 /* Move other masked stores with the same mask into STORE_BB. */
8740 if (worklist.is_empty ()
8741 || gimple_call_arg (worklist.last (), 2) != mask
8742 || worklist.last () != stmt1)
8743 break;
8744 last = worklist.pop ();
8745 }
8746 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
8747 }
8748 }