1 /* Loop Vectorization
2 Copyright (C) 2003-2019 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57
58 /* Loop Vectorization Pass.
59
60 This pass tries to vectorize loops.
61
62 For example, the vectorizer transforms the following simple loop:
63
64 short a[N]; short b[N]; short c[N]; int i;
65
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
68 }
69
70 as if it had been manually vectorized by rewriting the source code into:
71
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
76
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
82 }
83
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
95
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
101
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
106
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
117
118 For example, say stmt S1 was vectorized into stmt VS1:
119
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
123
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
128
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
133
134 Operands that are not SSA_NAMEs are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
136
137 Target modeling:
138 =================
139 Currently the only target-specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that can support different vector sizes will, for now, need
142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
144
145 Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
151
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
154 */
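/* For illustration only: the optab check described above boils down to a test
   of roughly this shape, where add_optab and V8HImode stand for whichever
   operation and vector mode the statement actually needs:

     if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
       the statement has no target support and cannot be vectorized.  */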
155
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
157
158 /* Subroutine of vect_determine_vf_for_stmt that handles only one
159 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
160 may already be set for general statements (not just data refs). */
161
162 static opt_result
163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
164 bool vectype_maybe_set_p,
165 poly_uint64 *vf,
166 vec<stmt_vec_info > *mask_producers)
167 {
168 gimple *stmt = stmt_info->stmt;
169
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
173 {
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return opt_result::success ();
177 }
178
179 tree stmt_vectype, nunits_vectype;
180 opt_result res = vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
181 &nunits_vectype);
182 if (!res)
183 return res;
184
185 if (stmt_vectype)
186 {
187 if (STMT_VINFO_VECTYPE (stmt_info))
188 /* The only case when a vectype has already been set is for stmts
189 that contain a data ref, or for "pattern-stmts" (stmts generated
190 by the vectorizer to represent/replace a certain idiom). */
191 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
192 || vectype_maybe_set_p)
193 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
194 else if (stmt_vectype == boolean_type_node)
195 mask_producers->safe_push (stmt_info);
196 else
197 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
198 }
199
200 if (nunits_vectype)
201 vect_update_max_nunits (vf, nunits_vectype);
202
203 return opt_result::success ();
204 }
205
206 /* Subroutine of vect_determine_vectorization_factor. Set the vector
207 types of STMT_INFO and all attached pattern statements and update
208 the vectorization factor VF accordingly. If some of the statements
209 produce a mask result whose vector type can only be calculated later,
210 add them to MASK_PRODUCERS. Return true on success or false if
211 something prevented vectorization. */
212
213 static opt_result
214 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
215 vec<stmt_vec_info > *mask_producers)
216 {
217 vec_info *vinfo = stmt_info->vinfo;
218 if (dump_enabled_p ())
219 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: %G",
220 stmt_info->stmt);
221 opt_result res
222 = vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers);
223 if (!res)
224 return res;
225
226 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
227 && STMT_VINFO_RELATED_STMT (stmt_info))
228 {
229 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
230 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
231
232 /* If a pattern statement has def stmts, analyze them too. */
233 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
234 !gsi_end_p (si); gsi_next (&si))
235 {
236 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
237 if (dump_enabled_p ())
238 dump_printf_loc (MSG_NOTE, vect_location,
239 "==> examining pattern def stmt: %G",
240 def_stmt_info->stmt);
241 res = vect_determine_vf_for_stmt_1 (def_stmt_info,
242 true,
243 vf,
244 mask_producers);
245 if (!res)
246 return res;
247 }
248
249 if (dump_enabled_p ())
250 dump_printf_loc (MSG_NOTE, vect_location,
251 "==> examining pattern statement: %G",
252 stmt_info->stmt);
253 res = vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers);
254 if (!res)
255 return res;
256 }
257
258 return opt_result::success ();
259 }
260
261 /* Function vect_determine_vectorization_factor
262
263 Determine the vectorization factor (VF). VF is the number of data elements
264 that are operated upon in parallel in a single iteration of the vectorized
265 loop. For example, when vectorizing a loop that operates on 4-byte elements,
266 on a target with vector size (VS) 16 bytes, the VF is set to 4, since 4
267 elements can fit in a single vector register.
268
269 We currently support vectorization of loops in which all types operated upon
270 are of the same size. Therefore this function currently sets VF according to
271 the size of the types operated upon, and fails if there are multiple sizes
272 in the loop.
273
274 VF is also the factor by which the loop iterations are strip-mined, e.g.:
275 original loop:
276 for (i=0; i<N; i++){
277 a[i] = b[i] + c[i];
278 }
279
280 vectorized loop:
281 for (i=0; i<N; i+=VF){
282 a[i:VF] = b[i:VF] + c[i:VF];
283 }
284 */
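/* Worked example for the comment above (illustrative): for 4-byte ints on a
   target with 16-byte vectors each vectype has TYPE_VECTOR_SUBPARTS == 4, and
   vect_update_max_nunits keeps VF at a common multiple of the subpart counts
   seen so far (for the usual power-of-two counts this is simply the maximum),
   so such a loop ends up with VF == 4.  */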
285
286 static opt_result
287 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
288 {
289 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
290 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
291 unsigned nbbs = loop->num_nodes;
292 poly_uint64 vectorization_factor = 1;
293 tree scalar_type = NULL_TREE;
294 gphi *phi;
295 tree vectype;
296 stmt_vec_info stmt_info;
297 unsigned i;
298 auto_vec<stmt_vec_info> mask_producers;
299
300 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
301
302 for (i = 0; i < nbbs; i++)
303 {
304 basic_block bb = bbs[i];
305
306 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
307 gsi_next (&si))
308 {
309 phi = si.phi ();
310 stmt_info = loop_vinfo->lookup_stmt (phi);
311 if (dump_enabled_p ())
312 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: %G",
313 phi);
314
315 gcc_assert (stmt_info);
316
317 if (STMT_VINFO_RELEVANT_P (stmt_info)
318 || STMT_VINFO_LIVE_P (stmt_info))
319 {
320 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
321 scalar_type = TREE_TYPE (PHI_RESULT (phi));
322
323 if (dump_enabled_p ())
324 dump_printf_loc (MSG_NOTE, vect_location,
325 "get vectype for scalar type: %T\n",
326 scalar_type);
327
328 vectype = get_vectype_for_scalar_type (scalar_type);
329 if (!vectype)
330 return opt_result::failure_at (phi,
331 "not vectorized: unsupported "
332 "data-type %T\n",
333 scalar_type);
334 STMT_VINFO_VECTYPE (stmt_info) = vectype;
335
336 if (dump_enabled_p ())
337 dump_printf_loc (MSG_NOTE, vect_location, "vectype: %T\n",
338 vectype);
339
340 if (dump_enabled_p ())
341 {
342 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
343 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
344 dump_printf (MSG_NOTE, "\n");
345 }
346
347 vect_update_max_nunits (&vectorization_factor, vectype);
348 }
349 }
350
351 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
352 gsi_next (&si))
353 {
354 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
355 opt_result res
356 = vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
357 &mask_producers);
358 if (!res)
359 return res;
360 }
361 }
362
363 /* TODO: Analyze cost. Decide if worth while to vectorize. */
364 if (dump_enabled_p ())
365 {
366 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
367 dump_dec (MSG_NOTE, vectorization_factor);
368 dump_printf (MSG_NOTE, "\n");
369 }
370
371 if (known_le (vectorization_factor, 1U))
372 return opt_result::failure_at (vect_location,
373 "not vectorized: unsupported data-type\n");
374 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
375
376 for (i = 0; i < mask_producers.length (); i++)
377 {
378 stmt_info = mask_producers[i];
379 opt_tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
380 if (!mask_type)
381 return opt_result::propagate_failure (mask_type);
382 STMT_VINFO_VECTYPE (stmt_info) = mask_type;
383 }
384
385 return opt_result::success ();
386 }
387
388
389 /* Function vect_is_simple_iv_evolution.
390
391 FORNOW: A simple evolution of an induction variable in the loop is
392 considered a polynomial evolution. */
393
394 static bool
395 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
396 tree * step)
397 {
398 tree init_expr;
399 tree step_expr;
400 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
401 basic_block bb;
402
403 /* When there is no evolution in this loop, the evolution function
404 is not "simple". */
405 if (evolution_part == NULL_TREE)
406 return false;
407
408 /* When the evolution is a polynomial of degree >= 2
409 the evolution function is not "simple". */
410 if (tree_is_chrec (evolution_part))
411 return false;
412
413 step_expr = evolution_part;
414 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
415
416 if (dump_enabled_p ())
417 dump_printf_loc (MSG_NOTE, vect_location, "step: %T, init: %T\n",
418 step_expr, init_expr);
419
420 *init = init_expr;
421 *step = step_expr;
422
423 if (TREE_CODE (step_expr) != INTEGER_CST
424 && (TREE_CODE (step_expr) != SSA_NAME
425 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
426 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
427 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
428 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
429 || !flag_associative_math)))
430 && (TREE_CODE (step_expr) != REAL_CST
431 || !flag_associative_math))
432 {
433 if (dump_enabled_p ())
434 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
435 "step unknown.\n");
436 return false;
437 }
438
439 return true;
440 }
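/* Example (illustrative): for "for (i = 0; i < n; i++)" the scalar evolution
   of i is the chrec {0, +, 1}_loop, so init == 0, step == 1 and the evolution
   is "simple".  For an accumulated index such as "j += i" the evolution of j
   is {0, +, {0, +, 1}_loop}_loop; its evolution part is itself a chrec, so it
   is rejected above as a polynomial of degree >= 2.  */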
441
442 /* Return true if PHI, described by STMT_INFO, is the inner PHI in
443 what we are assuming is a double reduction. For example, given
444 a structure like this:
445
446 outer1:
447 x_1 = PHI <x_4(outer2), ...>;
448 ...
449
450 inner:
451 x_2 = PHI <x_1(outer1), ...>;
452 ...
453 x_3 = ...;
454 ...
455
456 outer2:
457 x_4 = PHI <x_3(inner)>;
458 ...
459
460 outer loop analysis would treat x_1 as a double reduction phi and
461 this function would then return true for x_2. */
462
463 static bool
464 vect_inner_phi_in_double_reduction_p (stmt_vec_info stmt_info, gphi *phi)
465 {
466 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
467 use_operand_p use_p;
468 ssa_op_iter op_iter;
469 FOR_EACH_PHI_ARG (use_p, phi, op_iter, SSA_OP_USE)
470 if (stmt_vec_info def_info = loop_vinfo->lookup_def (USE_FROM_PTR (use_p)))
471 if (STMT_VINFO_DEF_TYPE (def_info) == vect_double_reduction_def)
472 return true;
473 return false;
474 }
475
476 /* Function vect_analyze_scalar_cycles_1.
477
478 Examine the cross iteration def-use cycles of scalar variables
479 in LOOP. LOOP_VINFO represents the loop that is now being
480 considered for vectorization (can be LOOP, or an outer-loop
481 enclosing LOOP). */
482
483 static void
484 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
485 {
486 basic_block bb = loop->header;
487 tree init, step;
488 auto_vec<stmt_vec_info, 64> worklist;
489 gphi_iterator gsi;
490 bool double_reduc;
491
492 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
493
494 /* First - identify all inductions. Reduction detection assumes that all the
495 inductions have been identified; therefore, this order must not be
496 changed. */
497 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
498 {
499 gphi *phi = gsi.phi ();
500 tree access_fn = NULL;
501 tree def = PHI_RESULT (phi);
502 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
503
504 if (dump_enabled_p ())
505 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
506
507 /* Skip virtual phi's. The data dependences that are associated with
508 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
509 if (virtual_operand_p (def))
510 continue;
511
512 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
513
514 /* Analyze the evolution function. */
515 access_fn = analyze_scalar_evolution (loop, def);
516 if (access_fn)
517 {
518 STRIP_NOPS (access_fn);
519 if (dump_enabled_p ())
520 dump_printf_loc (MSG_NOTE, vect_location,
521 "Access function of PHI: %T\n", access_fn);
522 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
523 = initial_condition_in_loop_num (access_fn, loop->num);
524 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
525 = evolution_part_in_loop_num (access_fn, loop->num);
526 }
527
528 if (!access_fn
529 || vect_inner_phi_in_double_reduction_p (stmt_vinfo, phi)
530 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
531 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
532 && TREE_CODE (step) != INTEGER_CST))
533 {
534 worklist.safe_push (stmt_vinfo);
535 continue;
536 }
537
538 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
539 != NULL_TREE);
540 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
541
542 if (dump_enabled_p ())
543 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
544 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
545 }
546
547
548 /* Second - identify all reductions and nested cycles. */
549 while (worklist.length () > 0)
550 {
551 stmt_vec_info stmt_vinfo = worklist.pop ();
552 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
553 tree def = PHI_RESULT (phi);
554
555 if (dump_enabled_p ())
556 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: %G", phi);
557
558 gcc_assert (!virtual_operand_p (def)
559 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
560
561 stmt_vec_info reduc_stmt_info
562 = vect_force_simple_reduction (loop_vinfo, stmt_vinfo,
563 &double_reduc, false);
564 if (reduc_stmt_info)
565 {
566 if (double_reduc)
567 {
568 if (dump_enabled_p ())
569 dump_printf_loc (MSG_NOTE, vect_location,
570 "Detected double reduction.\n");
571
572 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
573 STMT_VINFO_DEF_TYPE (reduc_stmt_info)
574 = vect_double_reduction_def;
575 }
576 else
577 {
578 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
579 {
580 if (dump_enabled_p ())
581 dump_printf_loc (MSG_NOTE, vect_location,
582 "Detected vectorizable nested cycle.\n");
583
584 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
585 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_nested_cycle;
586 }
587 else
588 {
589 if (dump_enabled_p ())
590 dump_printf_loc (MSG_NOTE, vect_location,
591 "Detected reduction.\n");
592
593 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
594 STMT_VINFO_DEF_TYPE (reduc_stmt_info) = vect_reduction_def;
595 /* Store the reduction cycles for possible vectorization in
596 loop-aware SLP if it was not detected as reduction
597 chain. */
598 if (! REDUC_GROUP_FIRST_ELEMENT (reduc_stmt_info))
599 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push
600 (reduc_stmt_info);
601 }
602 }
603 }
604 else
605 if (dump_enabled_p ())
606 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
607 "Unknown def-use cycle pattern.\n");
608 }
609 }
610
611
612 /* Function vect_analyze_scalar_cycles.
613
614 Examine the cross iteration def-use cycles of scalar variables, by
615 analyzing the loop-header PHIs of scalar variables. Classify each
616 cycle as one of the following: invariant, induction, reduction, unknown.
617 We do that for the loop represented by LOOP_VINFO, and also for its
618 inner-loop, if it exists.
619 Examples for scalar cycles:
620
621 Example1: reduction:
622
623 loop1:
624 for (i=0; i<N; i++)
625 sum += a[i];
626
627 Example2: induction:
628
629 loop2:
630 for (i=0; i<N; i++)
631 a[i] = i; */
632
633 static void
634 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
635 {
636 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
637
638 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
639
640 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
641 Reductions in such inner-loop therefore have different properties than
642 the reductions in the nest that gets vectorized:
643 1. When vectorized, they are executed in the same order as in the original
644 scalar loop, so we can't change the order of computation when
645 vectorizing them.
646 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
647 current checks are too strict. */
648
649 if (loop->inner)
650 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
651 }
652
653 /* Transfer group and reduction information from STMT_INFO to its
654 pattern stmt. */
655
656 static void
657 vect_fixup_reduc_chain (stmt_vec_info stmt_info)
658 {
659 stmt_vec_info firstp = STMT_VINFO_RELATED_STMT (stmt_info);
660 stmt_vec_info stmtp;
661 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (firstp)
662 && REDUC_GROUP_FIRST_ELEMENT (stmt_info));
663 REDUC_GROUP_SIZE (firstp) = REDUC_GROUP_SIZE (stmt_info);
664 do
665 {
666 stmtp = STMT_VINFO_RELATED_STMT (stmt_info);
667 REDUC_GROUP_FIRST_ELEMENT (stmtp) = firstp;
668 stmt_info = REDUC_GROUP_NEXT_ELEMENT (stmt_info);
669 if (stmt_info)
670 REDUC_GROUP_NEXT_ELEMENT (stmtp)
671 = STMT_VINFO_RELATED_STMT (stmt_info);
672 }
673 while (stmt_info);
674 STMT_VINFO_DEF_TYPE (stmtp) = vect_reduction_def;
675 }
676
677 /* Fixup scalar cycles that now have their stmts detected as patterns. */
678
679 static void
680 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
681 {
682 stmt_vec_info first;
683 unsigned i;
684
685 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
686 if (STMT_VINFO_IN_PATTERN_P (first))
687 {
688 stmt_vec_info next = REDUC_GROUP_NEXT_ELEMENT (first);
689 while (next)
690 {
691 if (! STMT_VINFO_IN_PATTERN_P (next))
692 break;
693 next = REDUC_GROUP_NEXT_ELEMENT (next);
694 }
695 /* If not all stmts in the chain are patterns, try to handle
696 the chain without patterns. */
697 if (! next)
698 {
699 vect_fixup_reduc_chain (first);
700 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
701 = STMT_VINFO_RELATED_STMT (first);
702 }
703 }
704 }
705
706 /* Function vect_get_loop_niters.
707
708 Determine how many iterations the loop executes and place the count
709 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
710 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
711 niter information holds in ASSUMPTIONS.
712
713 Return the loop exit condition. */
714
715
716 static gcond *
717 vect_get_loop_niters (struct loop *loop, tree *assumptions,
718 tree *number_of_iterations, tree *number_of_iterationsm1)
719 {
720 edge exit = single_exit (loop);
721 struct tree_niter_desc niter_desc;
722 tree niter_assumptions, niter, may_be_zero;
723 gcond *cond = get_loop_exit_condition (loop);
724
725 *assumptions = boolean_true_node;
726 *number_of_iterationsm1 = chrec_dont_know;
727 *number_of_iterations = chrec_dont_know;
728 DUMP_VECT_SCOPE ("get_loop_niters");
729
730 if (!exit)
731 return cond;
732
733 niter = chrec_dont_know;
734 may_be_zero = NULL_TREE;
735 niter_assumptions = boolean_true_node;
736 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
737 || chrec_contains_undetermined (niter_desc.niter))
738 return cond;
739
740 niter_assumptions = niter_desc.assumptions;
741 may_be_zero = niter_desc.may_be_zero;
742 niter = niter_desc.niter;
743
744 if (may_be_zero && integer_zerop (may_be_zero))
745 may_be_zero = NULL_TREE;
746
747 if (may_be_zero)
748 {
749 if (COMPARISON_CLASS_P (may_be_zero))
750 {
751 /* Try to combine may_be_zero with assumptions, this can simplify
752 computation of niter expression. */
753 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
754 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
755 niter_assumptions,
756 fold_build1 (TRUTH_NOT_EXPR,
757 boolean_type_node,
758 may_be_zero));
759 else
760 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
761 build_int_cst (TREE_TYPE (niter), 0),
762 rewrite_to_non_trapping_overflow (niter));
763
764 may_be_zero = NULL_TREE;
765 }
766 else if (integer_nonzerop (may_be_zero))
767 {
768 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
769 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
770 return cond;
771 }
772 else
773 return cond;
774 }
775
776 *assumptions = niter_assumptions;
777 *number_of_iterationsm1 = niter;
778
779 /* We want the number of loop header executions which is the number
780 of latch executions plus one.
781 ??? For UINT_MAX latch executions this number overflows to zero
782 for loops like do { n++; } while (n != 0); */
783 if (niter && !chrec_contains_undetermined (niter))
784 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
785 build_int_cst (TREE_TYPE (niter), 1));
786 *number_of_iterations = niter;
787
788 return cond;
789 }
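/* Example (illustrative): if the loop body runs N times, niter_desc.niter is
   the latch count N - 1, so *NUMBER_OF_ITERATIONSM1 == N - 1 and
   *NUMBER_OF_ITERATIONS == N.  The ??? above is the degenerate case where the
   latch runs UINT_MAX times and the + 1 wraps the header count to zero.  */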
790
791 /* Function bb_in_loop_p
792
793 Used as predicate for dfs order traversal of the loop bbs. */
794
795 static bool
796 bb_in_loop_p (const_basic_block bb, const void *data)
797 {
798 const struct loop *const loop = (const struct loop *)data;
799 if (flow_bb_inside_loop_p (loop, bb))
800 return true;
801 return false;
802 }
803
804
805 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
806 stmt_vec_info structs for all the stmts in LOOP_IN. */
807
808 _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared)
809 : vec_info (vec_info::loop, init_cost (loop_in), shared),
810 loop (loop_in),
811 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
812 num_itersm1 (NULL_TREE),
813 num_iters (NULL_TREE),
814 num_iters_unchanged (NULL_TREE),
815 num_iters_assumptions (NULL_TREE),
816 th (0),
817 versioning_threshold (0),
818 vectorization_factor (0),
819 max_vectorization_factor (0),
820 mask_skip_niters (NULL_TREE),
821 mask_compare_type (NULL_TREE),
822 simd_if_cond (NULL_TREE),
823 unaligned_dr (NULL),
824 peeling_for_alignment (0),
825 ptr_mask (0),
826 ivexpr_map (NULL),
827 slp_unrolling_factor (1),
828 single_scalar_iteration_cost (0),
829 vectorizable (false),
830 can_fully_mask_p (true),
831 fully_masked_p (false),
832 peeling_for_gaps (false),
833 peeling_for_niter (false),
834 operands_swapped (false),
835 no_data_dependencies (false),
836 has_mask_store (false),
837 scalar_loop (NULL),
838 orig_loop_info (NULL)
839 {
840 /* CHECKME: We want to visit all BBs before their successors (except for
841 latch blocks, for which this assertion wouldn't hold). In the simple
842 case of the loop forms we allow, a dfs order of the BBs would be the same
843 as reversed postorder traversal, so we are safe. */
844
845 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
846 bbs, loop->num_nodes, loop);
847 gcc_assert (nbbs == loop->num_nodes);
848
849 for (unsigned int i = 0; i < nbbs; i++)
850 {
851 basic_block bb = bbs[i];
852 gimple_stmt_iterator si;
853
854 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
855 {
856 gimple *phi = gsi_stmt (si);
857 gimple_set_uid (phi, 0);
858 add_stmt (phi);
859 }
860
861 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
862 {
863 gimple *stmt = gsi_stmt (si);
864 gimple_set_uid (stmt, 0);
865 add_stmt (stmt);
866 /* If the .GOMP_SIMD_LANE call for the current loop has 2 arguments, the
867 second argument is the #pragma omp simd if (x) condition: when it is 0
868 the loop shouldn't be vectorized, when it is a non-zero constant it
869 should be vectorized normally, and otherwise the loop is versioned,
870 with the vectorized copy used if the condition is non-zero at runtime. */
871 if (loop_in->simduid
872 && is_gimple_call (stmt)
873 && gimple_call_internal_p (stmt)
874 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
875 && gimple_call_num_args (stmt) >= 2
876 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
877 && (loop_in->simduid
878 == SSA_NAME_VAR (gimple_call_arg (stmt, 0))))
879 {
880 tree arg = gimple_call_arg (stmt, 1);
881 if (integer_zerop (arg) || TREE_CODE (arg) == SSA_NAME)
882 simd_if_cond = arg;
883 else
884 gcc_assert (integer_nonzerop (arg));
885 }
886 }
887 }
888 }
889
890 /* Free all levels of MASKS. */
891
892 void
893 release_vec_loop_masks (vec_loop_masks *masks)
894 {
895 rgroup_masks *rgm;
896 unsigned int i;
897 FOR_EACH_VEC_ELT (*masks, i, rgm)
898 rgm->masks.release ();
899 masks->release ();
900 }
901
902 /* Free all memory used by the _loop_vec_info, as well as all the
903 stmt_vec_info structs of all the stmts in the loop. */
904
905 _loop_vec_info::~_loop_vec_info ()
906 {
907 int nbbs;
908 gimple_stmt_iterator si;
909 int j;
910
911 nbbs = loop->num_nodes;
912 for (j = 0; j < nbbs; j++)
913 {
914 basic_block bb = bbs[j];
915 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
916 {
917 gimple *stmt = gsi_stmt (si);
918
919 /* We may have broken canonical form by moving a constant
920 into RHS1 of a commutative op. Fix such occurrences. */
921 if (operands_swapped && is_gimple_assign (stmt))
922 {
923 enum tree_code code = gimple_assign_rhs_code (stmt);
924
925 if ((code == PLUS_EXPR
926 || code == POINTER_PLUS_EXPR
927 || code == MULT_EXPR)
928 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
929 swap_ssa_operands (stmt,
930 gimple_assign_rhs1_ptr (stmt),
931 gimple_assign_rhs2_ptr (stmt));
932 else if (code == COND_EXPR
933 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
934 {
935 tree cond_expr = gimple_assign_rhs1 (stmt);
936 enum tree_code cond_code = TREE_CODE (cond_expr);
937
938 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
939 {
940 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
941 0));
942 cond_code = invert_tree_comparison (cond_code,
943 honor_nans);
944 if (cond_code != ERROR_MARK)
945 {
946 TREE_SET_CODE (cond_expr, cond_code);
947 swap_ssa_operands (stmt,
948 gimple_assign_rhs2_ptr (stmt),
949 gimple_assign_rhs3_ptr (stmt));
950 }
951 }
952 }
953 }
954 gsi_next (&si);
955 }
956 }
957
958 free (bbs);
959
960 release_vec_loop_masks (&masks);
961 delete ivexpr_map;
962
963 loop->aux = NULL;
964 }
965
966 /* Return an invariant or register for EXPR and emit necessary
967 computations in the LOOP_VINFO loop preheader. */
968
969 tree
970 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
971 {
972 if (is_gimple_reg (expr)
973 || is_gimple_min_invariant (expr))
974 return expr;
975
976 if (! loop_vinfo->ivexpr_map)
977 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
978 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
979 if (! cached)
980 {
981 gimple_seq stmts = NULL;
982 cached = force_gimple_operand (unshare_expr (expr),
983 &stmts, true, NULL_TREE);
984 if (stmts)
985 {
986 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
987 gsi_insert_seq_on_edge_immediate (e, stmts);
988 }
989 }
990 return cached;
991 }
992
993 /* Return true if we can use CMP_TYPE as the comparison type to produce
994 all masks required to mask LOOP_VINFO. */
995
996 static bool
997 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
998 {
999 rgroup_masks *rgm;
1000 unsigned int i;
1001 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1002 if (rgm->mask_type != NULL_TREE
1003 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1004 cmp_type, rgm->mask_type,
1005 OPTIMIZE_FOR_SPEED))
1006 return false;
1007 return true;
1008 }
1009
1010 /* Calculate the maximum number of scalars per iteration for every
1011 rgroup in LOOP_VINFO. */
1012
1013 static unsigned int
1014 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1015 {
1016 unsigned int res = 1;
1017 unsigned int i;
1018 rgroup_masks *rgm;
1019 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1020 res = MAX (res, rgm->max_nscalars_per_iter);
1021 return res;
1022 }
1023
1024 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1025 whether we can actually generate the masks required. Return true if so,
1026 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
1027
1028 static bool
1029 vect_verify_full_masking (loop_vec_info loop_vinfo)
1030 {
1031 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1032 unsigned int min_ni_width;
1033
1034 /* Use a normal loop if there are no statements that need masking.
1035 This only happens in rare degenerate cases: it means that the loop
1036 has no loads, no stores, and no live-out values. */
1037 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1038 return false;
1039
1040 /* Get the maximum number of iterations that is representable
1041 in the counter type. */
1042 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1043 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1044
1045 /* Get a more refined estimate for the number of iterations. */
1046 widest_int max_back_edges;
1047 if (max_loop_iterations (loop, &max_back_edges))
1048 max_ni = wi::smin (max_ni, max_back_edges + 1);
1049
1050 /* Account for rgroup masks, in which each bit is replicated N times. */
1051 max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1052
1053 /* Work out how many bits we need to represent the limit. */
1054 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1055
1056 /* Find a scalar mode for which WHILE_ULT is supported. */
1057 opt_scalar_int_mode cmp_mode_iter;
1058 tree cmp_type = NULL_TREE;
1059 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1060 {
1061 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1062 if (cmp_bits >= min_ni_width
1063 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1064 {
1065 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1066 if (this_type
1067 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1068 {
1069 /* Although we could stop as soon as we find a valid mode,
1070 it's often better to continue until we hit Pmode, since the
1071 operands to the WHILE are more likely to be reusable in
1072 address calculations. */
1073 cmp_type = this_type;
1074 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1075 break;
1076 }
1077 }
1078 }
1079
1080 if (!cmp_type)
1081 return false;
1082
1083 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1084 return true;
1085 }
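/* Example (illustrative): if the loop runs at most 1000 iterations and the
   widest rgroup handles 4 scalars per iteration, max_ni is 4000 and
   min_ni_width is 12 bits, so an integer mode of at least 12 bits for which
   the target supports WHILE_ULT can serve as the comparison type (subject to
   the preference for Pmode noted above).  */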
1086
1087 /* Calculate the cost of one scalar iteration of the loop. */
1088 static void
1089 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1090 {
1091 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1092 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1093 int nbbs = loop->num_nodes, factor;
1094 int innerloop_iters, i;
1095
1096 DUMP_VECT_SCOPE ("vect_compute_single_scalar_iteration_cost");
1097
1098 /* Gather costs for statements in the scalar loop. */
1099
1100 /* FORNOW. */
1101 innerloop_iters = 1;
1102 if (loop->inner)
1103 innerloop_iters = 50; /* FIXME */
1104
1105 for (i = 0; i < nbbs; i++)
1106 {
1107 gimple_stmt_iterator si;
1108 basic_block bb = bbs[i];
1109
1110 if (bb->loop_father == loop->inner)
1111 factor = innerloop_iters;
1112 else
1113 factor = 1;
1114
1115 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1116 {
1117 gimple *stmt = gsi_stmt (si);
1118 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1119
1120 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1121 continue;
1122
1123 /* Skip stmts that are not vectorized inside the loop. */
1124 stmt_vec_info vstmt_info = vect_stmt_to_vectorize (stmt_info);
1125 if (!STMT_VINFO_RELEVANT_P (vstmt_info)
1126 && (!STMT_VINFO_LIVE_P (vstmt_info)
1127 || !VECTORIZABLE_CYCLE_DEF
1128 (STMT_VINFO_DEF_TYPE (vstmt_info))))
1129 continue;
1130
1131 vect_cost_for_stmt kind;
1132 if (STMT_VINFO_DATA_REF (stmt_info))
1133 {
1134 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1135 kind = scalar_load;
1136 else
1137 kind = scalar_store;
1138 }
1139 else
1140 kind = scalar_stmt;
1141
1142 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1143 factor, kind, stmt_info, 0, vect_prologue);
1144 }
1145 }
1146
1147 /* Now accumulate cost. */
1148 void *target_cost_data = init_cost (loop);
1149 stmt_info_for_cost *si;
1150 int j;
1151 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1152 j, si)
1153 (void) add_stmt_cost (target_cost_data, si->count,
1154 si->kind, si->stmt_info, si->misalign,
1155 vect_body);
1156 unsigned dummy, body_cost = 0;
1157 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1158 destroy_cost_data (target_cost_data);
1159 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1160 }
1161
1162
1163 /* Function vect_analyze_loop_form_1.
1164
1165 Verify that certain CFG restrictions hold, including:
1166 - the loop has a pre-header
1167 - the loop has a single entry and exit
1168 - the loop exit condition is simple enough
1169 - the number of iterations can be analyzed, i.e., a countable loop. The
1170 niter could be analyzed under some assumptions. */
1171
1172 opt_result
1173 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1174 tree *assumptions, tree *number_of_iterationsm1,
1175 tree *number_of_iterations, gcond **inner_loop_cond)
1176 {
1177 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1178
1179 /* Different restrictions apply when we are considering an inner-most loop,
1180 vs. an outer (nested) loop.
1181 (FORNOW. May want to relax some of these restrictions in the future). */
1182
1183 if (!loop->inner)
1184 {
1185 /* Inner-most loop. We currently require that the number of BBs is
1186 exactly 2 (the header and latch). Vectorizable inner-most loops
1187 look like this:
1188
1189 (pre-header)
1190 |
1191 header <--------+
1192 | | |
1193 | +--> latch --+
1194 |
1195 (exit-bb) */
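/* Illustration: a loop such as "for (i = 0; i < n; i++) a[i] = b[i];" has
   exactly the header + latch shape shown above, whereas a loop whose body
   contains an if that was not flattened by if-conversion has additional
   basic blocks and is rejected by the check below.  */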
1196
1197 if (loop->num_nodes != 2)
1198 return opt_result::failure_at (vect_location,
1199 "not vectorized:"
1200 " control flow in loop.\n");
1201
1202 if (empty_block_p (loop->header))
1203 return opt_result::failure_at (vect_location,
1204 "not vectorized: empty loop.\n");
1205 }
1206 else
1207 {
1208 struct loop *innerloop = loop->inner;
1209 edge entryedge;
1210
1211 /* Nested loop. We currently require that the loop is doubly-nested,
1212 contains a single inner loop, and the number of BBs is exactly 5.
1213 Vectorizable outer-loops look like this:
1214
1215 (pre-header)
1216 |
1217 header <---+
1218 | |
1219 inner-loop |
1220 | |
1221 tail ------+
1222 |
1223 (exit-bb)
1224
1225 The inner-loop has the properties expected of inner-most loops
1226 as described above. */
1227
1228 if ((loop->inner)->inner || (loop->inner)->next)
1229 return opt_result::failure_at (vect_location,
1230 "not vectorized:"
1231 " multiple nested loops.\n");
1232
1233 if (loop->num_nodes != 5)
1234 return opt_result::failure_at (vect_location,
1235 "not vectorized:"
1236 " control flow in loop.\n");
1237
1238 entryedge = loop_preheader_edge (innerloop);
1239 if (entryedge->src != loop->header
1240 || !single_exit (innerloop)
1241 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1242 return opt_result::failure_at (vect_location,
1243 "not vectorized:"
1244 " unsupported outerloop form.\n");
1245
1246 /* Analyze the inner-loop. */
1247 tree inner_niterm1, inner_niter, inner_assumptions;
1248 opt_result res
1249 = vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1250 &inner_assumptions, &inner_niterm1,
1251 &inner_niter, NULL);
1252 if (!res)
1253 {
1254 if (dump_enabled_p ())
1255 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1256 "not vectorized: Bad inner loop.\n");
1257 return res;
1258 }
1259
1260 /* Don't support analyzing niter under assumptions for inner
1261 loop. */
1262 if (!integer_onep (inner_assumptions))
1263 return opt_result::failure_at (vect_location,
1264 "not vectorized: Bad inner loop.\n");
1265
1266 if (!expr_invariant_in_loop_p (loop, inner_niter))
1267 return opt_result::failure_at (vect_location,
1268 "not vectorized: inner-loop count not"
1269 " invariant.\n");
1270
1271 if (dump_enabled_p ())
1272 dump_printf_loc (MSG_NOTE, vect_location,
1273 "Considering outer-loop vectorization.\n");
1274 }
1275
1276 if (!single_exit (loop))
1277 return opt_result::failure_at (vect_location,
1278 "not vectorized: multiple exits.\n");
1279 if (EDGE_COUNT (loop->header->preds) != 2)
1280 return opt_result::failure_at (vect_location,
1281 "not vectorized:"
1282 " too many incoming edges.\n");
1283
1284 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1285 that the loop is represented as a do-while (with a proper if-guard
1286 before the loop if needed), where the loop header contains all the
1287 executable statements, and the latch is empty. */
1288 if (!empty_block_p (loop->latch)
1289 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1290 return opt_result::failure_at (vect_location,
1291 "not vectorized: latch block not empty.\n");
1292
1293 /* Make sure the exit is not abnormal. */
1294 edge e = single_exit (loop);
1295 if (e->flags & EDGE_ABNORMAL)
1296 return opt_result::failure_at (vect_location,
1297 "not vectorized:"
1298 " abnormal loop exit edge.\n");
1299
1300 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1301 number_of_iterationsm1);
1302 if (!*loop_cond)
1303 return opt_result::failure_at
1304 (vect_location,
1305 "not vectorized: complicated exit condition.\n");
1306
1307 if (integer_zerop (*assumptions)
1308 || !*number_of_iterations
1309 || chrec_contains_undetermined (*number_of_iterations))
1310 return opt_result::failure_at
1311 (*loop_cond,
1312 "not vectorized: number of iterations cannot be computed.\n");
1313
1314 if (integer_zerop (*number_of_iterations))
1315 return opt_result::failure_at
1316 (*loop_cond,
1317 "not vectorized: number of iterations = 0.\n");
1318
1319 return opt_result::success ();
1320 }
1321
1322 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1323
1324 opt_loop_vec_info
1325 vect_analyze_loop_form (struct loop *loop, vec_info_shared *shared)
1326 {
1327 tree assumptions, number_of_iterations, number_of_iterationsm1;
1328 gcond *loop_cond, *inner_loop_cond = NULL;
1329
1330 opt_result res
1331 = vect_analyze_loop_form_1 (loop, &loop_cond,
1332 &assumptions, &number_of_iterationsm1,
1333 &number_of_iterations, &inner_loop_cond);
1334 if (!res)
1335 return opt_loop_vec_info::propagate_failure (res);
1336
1337 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1338 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1339 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1340 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1341 if (!integer_onep (assumptions))
1342 {
1343 /* We consider to vectorize this loop by versioning it under
1344 some assumptions. In order to do this, we need to clear
1345 existing information computed by scev and niter analyzer. */
1346 scev_reset_htab ();
1347 free_numbers_of_iterations_estimates (loop);
1348 /* Also set flag for this loop so that following scev and niter
1349 analysis are done under the assumptions. */
1350 loop_constraint_set (loop, LOOP_C_FINITE);
1351 /* Also record the assumptions for versioning. */
1352 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1353 }
1354
1355 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1356 {
1357 if (dump_enabled_p ())
1358 {
1359 dump_printf_loc (MSG_NOTE, vect_location,
1360 "Symbolic number of iterations is ");
1361 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1362 dump_printf (MSG_NOTE, "\n");
1363 }
1364 }
1365
1366 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1367 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1368 if (inner_loop_cond)
1369 {
1370 stmt_vec_info inner_loop_cond_info
1371 = loop_vinfo->lookup_stmt (inner_loop_cond);
1372 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1373 }
1374
1375 gcc_assert (!loop->aux);
1376 loop->aux = loop_vinfo;
1377 return opt_loop_vec_info::success (loop_vinfo);
1378 }
1379
1380
1381
1382 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1383 statements, update the vectorization factor. */
1384
1385 static void
1386 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1387 {
1388 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1389 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1390 int nbbs = loop->num_nodes;
1391 poly_uint64 vectorization_factor;
1392 int i;
1393
1394 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1395
1396 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1397 gcc_assert (known_ne (vectorization_factor, 0U));
1398
1399 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1400 vectorization factor of the loop is the unrolling factor required by
1401 the SLP instances. If that unrolling factor is 1, we say that we
1402 perform pure SLP on the loop - cross-iteration parallelism is not
1403 exploited. */
1404 bool only_slp_in_loop = true;
1405 for (i = 0; i < nbbs; i++)
1406 {
1407 basic_block bb = bbs[i];
1408 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1409 gsi_next (&si))
1410 {
1411 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1412 stmt_info = vect_stmt_to_vectorize (stmt_info);
1413 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1414 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1415 && !PURE_SLP_STMT (stmt_info))
1416 /* STMT needs both SLP and loop-based vectorization. */
1417 only_slp_in_loop = false;
1418 }
1419 }
1420
1421 if (only_slp_in_loop)
1422 {
1423 if (dump_enabled_p ())
1424 dump_printf_loc (MSG_NOTE, vect_location,
1425 "Loop contains only SLP stmts\n");
1426 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1427 }
1428 else
1429 {
1430 if (dump_enabled_p ())
1431 dump_printf_loc (MSG_NOTE, vect_location,
1432 "Loop contains SLP and non-SLP stmts\n");
1433 /* Both the vectorization factor and unroll factor have the form
1434 current_vector_size * X for some rational X, so they must have
1435 a common multiple. */
1436 vectorization_factor
1437 = force_common_multiple (vectorization_factor,
1438 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1439 }
1440
1441 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1442 if (dump_enabled_p ())
1443 {
1444 dump_printf_loc (MSG_NOTE, vect_location,
1445 "Updating vectorization factor to ");
1446 dump_dec (MSG_NOTE, vectorization_factor);
1447 dump_printf (MSG_NOTE, ".\n");
1448 }
1449 }
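/* Example (illustrative): with a loop VF of 4 and an SLP unrolling factor of
   2, force_common_multiple keeps the VF at 4; with a loop VF of 2 and an SLP
   unrolling factor of 8, the updated VF becomes 8.  */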
1450
1451 /* Return true if STMT_INFO describes a double reduction phi and if
1452 the other phi in the reduction is also relevant for vectorization.
1453 This rejects cases such as:
1454
1455 outer1:
1456 x_1 = PHI <x_3(outer2), ...>;
1457 ...
1458
1459 inner:
1460 x_2 = ...;
1461 ...
1462
1463 outer2:
1464 x_3 = PHI <x_2(inner)>;
1465
1466 if nothing in x_2 or elsewhere makes x_1 relevant. */
1467
1468 static bool
1469 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1470 {
1471 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1472 return false;
1473
1474 return STMT_VINFO_RELEVANT_P (STMT_VINFO_REDUC_DEF (stmt_info));
1475 }
1476
1477 /* Function vect_analyze_loop_operations.
1478
1479 Scan the loop stmts and make sure they are all vectorizable. */
1480
1481 static opt_result
1482 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1483 {
1484 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1485 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1486 int nbbs = loop->num_nodes;
1487 int i;
1488 stmt_vec_info stmt_info;
1489 bool need_to_vectorize = false;
1490 bool ok;
1491
1492 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1493
1494 auto_vec<stmt_info_for_cost> cost_vec;
1495
1496 for (i = 0; i < nbbs; i++)
1497 {
1498 basic_block bb = bbs[i];
1499
1500 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1501 gsi_next (&si))
1502 {
1503 gphi *phi = si.phi ();
1504 ok = true;
1505
1506 stmt_info = loop_vinfo->lookup_stmt (phi);
1507 if (dump_enabled_p ())
1508 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: %G", phi);
1509 if (virtual_operand_p (gimple_phi_result (phi)))
1510 continue;
1511
1512 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1513 (i.e., a phi in the tail of the outer-loop). */
1514 if (! is_loop_header_bb_p (bb))
1515 {
1516 /* FORNOW: we currently don't support the case that these phis
1517 are not used in the outerloop (unless it is double reduction,
1518 i.e., this phi is vect_reduction_def), because this case
1519 requires us to actually do something here. */
1520 if (STMT_VINFO_LIVE_P (stmt_info)
1521 && !vect_active_double_reduction_p (stmt_info))
1522 return opt_result::failure_at (phi,
1523 "Unsupported loop-closed phi"
1524 " in outer-loop.\n");
1525
1526 /* If PHI is used in the outer loop, we check that its operand
1527 is defined in the inner loop. */
1528 if (STMT_VINFO_RELEVANT_P (stmt_info))
1529 {
1530 tree phi_op;
1531
1532 if (gimple_phi_num_args (phi) != 1)
1533 return opt_result::failure_at (phi, "unsupported phi");
1534
1535 phi_op = PHI_ARG_DEF (phi, 0);
1536 stmt_vec_info op_def_info = loop_vinfo->lookup_def (phi_op);
1537 if (!op_def_info)
1538 return opt_result::failure_at (phi, "unsupported phi");
1539
1540 if (STMT_VINFO_RELEVANT (op_def_info) != vect_used_in_outer
1541 && (STMT_VINFO_RELEVANT (op_def_info)
1542 != vect_used_in_outer_by_reduction))
1543 return opt_result::failure_at (phi, "unsupported phi");
1544 }
1545
1546 continue;
1547 }
1548
1549 gcc_assert (stmt_info);
1550
1551 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1552 || STMT_VINFO_LIVE_P (stmt_info))
1553 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1554 /* A scalar-dependence cycle that we don't support. */
1555 return opt_result::failure_at (phi,
1556 "not vectorized:"
1557 " scalar dependence cycle.\n");
1558
1559 if (STMT_VINFO_RELEVANT_P (stmt_info))
1560 {
1561 need_to_vectorize = true;
1562 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1563 && ! PURE_SLP_STMT (stmt_info))
1564 ok = vectorizable_induction (stmt_info, NULL, NULL, NULL,
1565 &cost_vec);
1566 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1567 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1568 && ! PURE_SLP_STMT (stmt_info))
1569 ok = vectorizable_reduction (stmt_info, NULL, NULL, NULL, NULL,
1570 &cost_vec);
1571 }
1572
1573 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1574 if (ok
1575 && STMT_VINFO_LIVE_P (stmt_info)
1576 && !PURE_SLP_STMT (stmt_info))
1577 ok = vectorizable_live_operation (stmt_info, NULL, NULL, -1, NULL,
1578 &cost_vec);
1579
1580 if (!ok)
1581 return opt_result::failure_at (phi,
1582 "not vectorized: relevant phi not "
1583 "supported: %G",
1584 static_cast <gimple *> (phi));
1585 }
1586
1587 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1588 gsi_next (&si))
1589 {
1590 gimple *stmt = gsi_stmt (si);
1591 if (!gimple_clobber_p (stmt))
1592 {
1593 opt_result res
1594 = vect_analyze_stmt (loop_vinfo->lookup_stmt (stmt),
1595 &need_to_vectorize,
1596 NULL, NULL, &cost_vec);
1597 if (!res)
1598 return res;
1599 }
1600 }
1601 } /* bbs */
1602
1603 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1604
1605 /* All operations in the loop are either irrelevant (deal with loop
1606 control, or dead), or only used outside the loop and can be moved
1607 out of the loop (e.g. invariants, inductions). The loop can be
1608 optimized away by scalar optimizations. We're better off not
1609 touching this loop. */
1610 if (!need_to_vectorize)
1611 {
1612 if (dump_enabled_p ())
1613 dump_printf_loc (MSG_NOTE, vect_location,
1614 "All the computation can be taken out of the loop.\n");
1615 return opt_result::failure_at
1616 (vect_location,
1617 "not vectorized: redundant loop. no profit to vectorize.\n");
1618 }
1619
1620 return opt_result::success ();
1621 }
1622
1623 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1624 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1625 definitely no, or -1 if it's worth retrying. */
1626
1627 static int
1628 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1629 {
1630 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1631 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1632
1633 /* Only fully-masked loops can have iteration counts less than the
1634 vectorization factor. */
1635 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1636 {
1637 HOST_WIDE_INT max_niter;
1638
1639 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1640 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1641 else
1642 max_niter = max_stmt_executions_int (loop);
1643
1644 if (max_niter != -1
1645 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1646 {
1647 if (dump_enabled_p ())
1648 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1649 "not vectorized: iteration count smaller than "
1650 "vectorization factor.\n");
1651 return 0;
1652 }
1653 }
1654
1655 int min_profitable_iters, min_profitable_estimate;
1656 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1657 &min_profitable_estimate);
1658
1659 if (min_profitable_iters < 0)
1660 {
1661 if (dump_enabled_p ())
1662 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1663 "not vectorized: vectorization not profitable.\n");
1664 if (dump_enabled_p ())
1665 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1666 "not vectorized: vector version will never be "
1667 "profitable.\n");
1668 return -1;
1669 }
1670
1671 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1672 * assumed_vf);
1673
1674 /* Use the cost model only if it is more conservative than user specified
1675 threshold. */
1676 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1677 min_profitable_iters);
1678
1679 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1680
1681 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1682 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1683 {
1684 if (dump_enabled_p ())
1685 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1686 "not vectorized: vectorization not profitable.\n");
1687 if (dump_enabled_p ())
1688 dump_printf_loc (MSG_NOTE, vect_location,
1689 "not vectorized: iteration count smaller than user "
1690 "specified loop bound parameter or minimum profitable "
1691 "iterations (whichever is more conservative).\n");
1692 return 0;
1693 }
1694
1695 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1696 if (estimated_niter == -1)
1697 estimated_niter = likely_max_stmt_executions_int (loop);
1698 if (estimated_niter != -1
1699 && ((unsigned HOST_WIDE_INT) estimated_niter
1700 < MAX (th, (unsigned) min_profitable_estimate)))
1701 {
1702 if (dump_enabled_p ())
1703 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1704 "not vectorized: estimated iteration count too "
1705 "small.\n");
1706 if (dump_enabled_p ())
1707 dump_printf_loc (MSG_NOTE, vect_location,
1708 "not vectorized: estimated iteration count smaller "
1709 "than specified loop bound parameter or minimum "
1710 "profitable iterations (whichever is more "
1711 "conservative).\n");
1712 return -1;
1713 }
1714
1715 return 1;
1716 }
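/* Worked example (illustrative; the parameter values are hypothetical): with
   --param min-vect-loop-bound=2 and an assumed VF of 4, min_scalar_loop_bound
   is 8; if min_profitable_iters is 10, the threshold th becomes
   MAX (8, 10) == 10, so a loop known to run fewer than 10 iterations is
   rejected as unprofitable.  */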
1717
1718 static opt_result
1719 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1720 vec<data_reference_p> *datarefs,
1721 unsigned int *n_stmts)
1722 {
1723 *n_stmts = 0;
1724 for (unsigned i = 0; i < loop->num_nodes; i++)
1725 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1726 !gsi_end_p (gsi); gsi_next (&gsi))
1727 {
1728 gimple *stmt = gsi_stmt (gsi);
1729 if (is_gimple_debug (stmt))
1730 continue;
1731 ++(*n_stmts);
1732 opt_result res = vect_find_stmt_data_reference (loop, stmt, datarefs);
1733 if (!res)
1734 {
1735 if (is_gimple_call (stmt) && loop->safelen)
1736 {
1737 tree fndecl = gimple_call_fndecl (stmt), op;
1738 if (fndecl != NULL_TREE)
1739 {
1740 cgraph_node *node = cgraph_node::get (fndecl);
1741 if (node != NULL && node->simd_clones != NULL)
1742 {
1743 unsigned int j, n = gimple_call_num_args (stmt);
1744 for (j = 0; j < n; j++)
1745 {
1746 op = gimple_call_arg (stmt, j);
1747 if (DECL_P (op)
1748 || (REFERENCE_CLASS_P (op)
1749 && get_base_address (op)))
1750 break;
1751 }
1752 op = gimple_call_lhs (stmt);
1753 /* Ignore #pragma omp declare simd functions
1754 if they don't have data references in the
1755 call stmt itself. */
1756 if (j == n
1757 && !(op
1758 && (DECL_P (op)
1759 || (REFERENCE_CLASS_P (op)
1760 && get_base_address (op)))))
1761 continue;
1762 }
1763 }
1764 }
1765 return res;
1766 }
1767 /* If dependence analysis will give up due to the limit on the
1768 number of datarefs, stop here and fail fatally. */
1769 if (datarefs->length ()
1770 > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
1771 return opt_result::failure_at (stmt, "exceeded param "
1772 "loop-max-datarefs-for-datadeps\n");
1773 }
1774 return opt_result::success ();
1775 }
1776
1777 /* Function vect_analyze_loop_2.
1778
1779 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1780 for it. The different analyses will record information in the
1781 loop_vec_info struct. */
1782 static opt_result
1783 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1784 {
1785 opt_result ok = opt_result::success ();
1786 int res;
1787 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1788 poly_uint64 min_vf = 2;
1789
1790 /* The first group of checks is independent of the vector size. */
1791 fatal = true;
1792
1793 if (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)
1794 && integer_zerop (LOOP_VINFO_SIMD_IF_COND (loop_vinfo)))
1795 return opt_result::failure_at (vect_location,
1796 "not vectorized: simd if(0)\n");
1797
1798 /* Find all data references in the loop (which correspond to vdefs/vuses)
1799 and analyze their evolution in the loop. */
1800
1801 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1802
1803 /* Gather the data references and count stmts in the loop. */
1804 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1805 {
1806 opt_result res
1807 = vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1808 &LOOP_VINFO_DATAREFS (loop_vinfo),
1809 n_stmts);
1810 if (!res)
1811 {
1812 if (dump_enabled_p ())
1813 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1814 "not vectorized: loop contains function "
1815 "calls or data references that cannot "
1816 "be analyzed\n");
1817 return res;
1818 }
1819 loop_vinfo->shared->save_datarefs ();
1820 }
1821 else
1822 loop_vinfo->shared->check_datarefs ();
1823
1824 /* Analyze the data references and also adjust the minimal
1825 vectorization factor according to the loads and stores. */
1826
1827 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1828 if (!ok)
1829 {
1830 if (dump_enabled_p ())
1831 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1832 "bad data references.\n");
1833 return ok;
1834 }
1835
1836 /* Classify all cross-iteration scalar data-flow cycles.
1837 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1838 vect_analyze_scalar_cycles (loop_vinfo);
1839
1840 vect_pattern_recog (loop_vinfo);
1841
1842 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1843
1844 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1845 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1846
1847 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1848 if (!ok)
1849 {
1850 if (dump_enabled_p ())
1851 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1852 "bad data access.\n");
1853 return ok;
1854 }
1855
1856 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1857
1858 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1859 if (!ok)
1860 {
1861 if (dump_enabled_p ())
1862 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1863 "unexpected pattern.\n");
1864 return ok;
1865 }
1866
1867 /* The rest of the analysis below depends on the vector size in some way, so failures from here on are not fatal. */
1868 fatal = false;
1869
1870 /* Analyze data dependences between the data-refs in the loop
1871 and adjust the maximum vectorization factor according to
1872 the dependences.
1873 FORNOW: fail at the first data dependence that we encounter. */
1874
1875 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1876 if (!ok)
1877 {
1878 if (dump_enabled_p ())
1879 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1880 "bad data dependence.\n");
1881 return ok;
1882 }
1883 if (max_vf != MAX_VECTORIZATION_FACTOR
1884 && maybe_lt (max_vf, min_vf))
1885 return opt_result::failure_at (vect_location, "bad data dependence.\n");
1886 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1887
1888 ok = vect_determine_vectorization_factor (loop_vinfo);
1889 if (!ok)
1890 {
1891 if (dump_enabled_p ())
1892 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1893 "can't determine vectorization factor.\n");
1894 return ok;
1895 }
1896 if (max_vf != MAX_VECTORIZATION_FACTOR
1897 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1898 return opt_result::failure_at (vect_location, "bad data dependence.\n");
1899
1900 /* Compute the scalar iteration cost. */
1901 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1902
1903 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1904 unsigned th;
1905
1906 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1907 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
1908 if (!ok)
1909 return ok;
1910
1911 /* If there are any SLP instances mark them as pure_slp. */
1912 bool slp = vect_make_slp_decision (loop_vinfo);
1913 if (slp)
1914 {
1915 /* Find stmts that need to be both vectorized and SLPed. */
1916 vect_detect_hybrid_slp (loop_vinfo);
1917
1918 /* Update the vectorization factor based on the SLP decision. */
1919 vect_update_vf_for_slp (loop_vinfo);
1920 }
1921
1922 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
1923
1924 /* We don't expect to have to roll back to anything other than an empty
1925 set of rgroups. */
1926 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
1927
1928 /* This is the point where we can re-start analysis with SLP forced off. */
1929 start_over:
1930
1931 /* Now the vectorization factor is final. */
1932 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1933 gcc_assert (known_ne (vectorization_factor, 0U));
1934
1935 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1936 {
1937 dump_printf_loc (MSG_NOTE, vect_location,
1938 "vectorization_factor = ");
1939 dump_dec (MSG_NOTE, vectorization_factor);
1940 dump_printf (MSG_NOTE, ", niters = %wd\n",
1941 LOOP_VINFO_INT_NITERS (loop_vinfo));
1942 }
1943
1944 HOST_WIDE_INT max_niter
1945 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1946
1947 /* Analyze the alignment of the data-refs in the loop.
1948 Fail if a data reference is found that cannot be vectorized. */
1949
1950 ok = vect_analyze_data_refs_alignment (loop_vinfo);
1951 if (!ok)
1952 {
1953 if (dump_enabled_p ())
1954 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1955 "bad data alignment.\n");
1956 return ok;
1957 }
1958
1959 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1960 It is important to call pruning after vect_analyze_data_ref_accesses,
1961 since we use grouping information gathered by interleaving analysis. */
1962 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1963 if (!ok)
1964 return ok;
1965
1966 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
1967 vectorization, since we do not want to add extra peeling or
1968 add versioning for alignment. */
1969 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
1970 /* This pass will decide on using loop versioning and/or loop peeling in
1971 order to enhance the alignment of data references in the loop. */
1972 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1973 else
1974 ok = vect_verify_datarefs_alignment (loop_vinfo);
1975 if (!ok)
1976 return ok;
1977
1978 if (slp)
1979 {
1980 /* Analyze operations in the SLP instances. Note this may
1981 remove unsupported SLP instances, which makes the above
1982 SLP kind detection invalid. */
1983 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
1984 vect_slp_analyze_operations (loop_vinfo);
1985 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
1986 {
1987 ok = opt_result::failure_at (vect_location,
1988 "unsupported SLP instances\n");
1989 goto again;
1990 }
1991 }
1992
1993 /* Scan all the remaining operations in the loop that are not subject
1994 to SLP and make sure they are vectorizable. */
1995 ok = vect_analyze_loop_operations (loop_vinfo);
1996 if (!ok)
1997 {
1998 if (dump_enabled_p ())
1999 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2000 "bad operation or unsupported loop bound.\n");
2001 return ok;
2002 }
2003
2004 /* Decide whether to use a fully-masked loop for this vectorization
2005 factor. */
2006 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2007 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2008 && vect_verify_full_masking (loop_vinfo));
2009 if (dump_enabled_p ())
2010 {
2011 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2012 dump_printf_loc (MSG_NOTE, vect_location,
2013 "using a fully-masked loop.\n");
2014 else
2015 dump_printf_loc (MSG_NOTE, vect_location,
2016 "not using a fully-masked loop.\n");
2017 }
2018
2019 /* If an epilogue loop is required because of data accesses with gaps,
2020 one additional iteration needs to be peeled. Check whether there are
2021 enough iterations for vectorization. */
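/* (Illustrative numbers: with a VF of 4, a loop whose scalar trip count
   is known to be 3 cannot peel one iteration for the gap and still run
   a full vector iteration, so it is rejected here.)  */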
2022 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2023 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2024 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2025 {
2026 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2027 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2028
2029 if (known_lt (wi::to_widest (scalar_niters), vf))
2030 return opt_result::failure_at (vect_location,
2031 "loop has no enough iterations to"
2032 " support peeling for gaps.\n");
2033 }
2034
2035 /* Check that the costings of the loop make vectorizing worthwhile. */
2036 res = vect_analyze_loop_costing (loop_vinfo);
2037 if (res < 0)
2038 {
2039 ok = opt_result::failure_at (vect_location,
2040 "Loop costings may not be worthwhile.\n");
2041 goto again;
2042 }
2043 if (!res)
2044 return opt_result::failure_at (vect_location,
2045 "Loop costings not worthwhile.\n");
2046
2047 /* Decide whether we need to create an epilogue loop to handle
2048 remaining scalar iterations. */
2049 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2050
2051 unsigned HOST_WIDE_INT const_vf;
2052 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2053 /* The main loop handles all iterations. */
2054 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2055 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2056 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
2057 {
2058 /* Work out the (constant) number of iterations that need to be
2059 peeled for reasons other than niters. */
2060 unsigned int peel_niter = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2061 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2062 peel_niter += 1;
2063 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo) - peel_niter,
2064 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2065 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2066 }
2067 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2068 /* ??? When peeling for gaps but not alignment, we could
2069 try to check whether the (variable) niters is known to be
2070 VF * N + 1. That's something of a niche case though. */
2071 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2072 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2073 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2074 < (unsigned) exact_log2 (const_vf))
2075 /* In case of versioning, check if the maximum number of
2076 iterations is greater than th. If they are identical,
2077 the epilogue is unnecessary. */
2078 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2079 || ((unsigned HOST_WIDE_INT) max_niter
2080 > (th / const_vf) * const_vf))))
2081 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2082
2083 /* If an epilogue loop is required make sure we can create one. */
2084 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2085 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2086 {
2087 if (dump_enabled_p ())
2088 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2089 if (!vect_can_advance_ivs_p (loop_vinfo)
2090 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2091 single_exit (LOOP_VINFO_LOOP
2092 (loop_vinfo))))
2093 {
2094 ok = opt_result::failure_at (vect_location,
2095 "not vectorized: can't create required "
2096 "epilog loop\n");
2097 goto again;
2098 }
2099 }
2100
2101 /* During peeling, we need to check whether the number of loop iterations
2102 is enough for both the peeled prolog loop and the vector loop. This
2103 check can be merged with the threshold check of loop versioning, so
2104 increase the threshold for this case if necessary. */
2105 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2106 {
2107 poly_uint64 niters_th = 0;
2108
2109 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2110 {
2111 /* Niters for peeled prolog loop. */
2112 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2113 {
2114 dr_vec_info *dr_info = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2115 tree vectype = STMT_VINFO_VECTYPE (dr_info->stmt);
2116 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2117 }
2118 else
2119 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2120 }
2121
2122 /* Niters for at least one iteration of vectorized loop. */
2123 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2124 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2125 /* One additional iteration because of peeling for gap. */
2126 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2127 niters_th += 1;
2128 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2129 }
2130
2131 gcc_assert (known_eq (vectorization_factor,
2132 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2133
2134 /* Ok to vectorize! */
2135 return opt_result::success ();
2136
2137 again:
2138 /* Ensure that "ok" is false (with an opt_problem if dumping is enabled). */
2139 gcc_assert (!ok);
2140
2141 /* Try again with SLP forced off, but if we didn't do any SLP there is
2142 no point in re-trying. */
2143 if (!slp)
2144 return ok;
2145
2146 /* If there are reduction chains re-trying will fail anyway. */
2147 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2148 return ok;
2149
2150 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2151 via interleaving or lane instructions. */
2152 slp_instance instance;
2153 slp_tree node;
2154 unsigned i, j;
2155 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2156 {
2157 stmt_vec_info vinfo;
2158 vinfo = SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0];
2159 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2160 continue;
2161 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2162 unsigned int size = DR_GROUP_SIZE (vinfo);
2163 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2164 if (! vect_store_lanes_supported (vectype, size, false)
2165 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2166 && ! vect_grouped_store_supported (vectype, size))
2167 return opt_result::failure_at (vinfo->stmt,
2168 "unsupported grouped store\n");
2169 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2170 {
2171 vinfo = SLP_TREE_SCALAR_STMTS (node)[0];
2172 vinfo = DR_GROUP_FIRST_ELEMENT (vinfo);
2173 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2174 size = DR_GROUP_SIZE (vinfo);
2175 vectype = STMT_VINFO_VECTYPE (vinfo);
2176 if (! vect_load_lanes_supported (vectype, size, false)
2177 && ! vect_grouped_load_supported (vectype, single_element_p,
2178 size))
2179 return opt_result::failure_at (vinfo->stmt,
2180 "unsupported grouped load\n");
2181 }
2182 }
2183
2184 if (dump_enabled_p ())
2185 dump_printf_loc (MSG_NOTE, vect_location,
2186 "re-trying with SLP disabled\n");
2187
2188 /* Roll back state appropriately. No SLP this time. */
2189 slp = false;
2190 /* Restore the vectorization factor to what it was without SLP. */
2191 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2192 /* Free the SLP instances. */
2193 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2194 vect_free_slp_instance (instance, false);
2195 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2196 /* Reset SLP type to loop_vect on all stmts. */
2197 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2198 {
2199 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2200 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2201 !gsi_end_p (si); gsi_next (&si))
2202 {
2203 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2204 STMT_SLP_TYPE (stmt_info) = loop_vect;
2205 }
2206 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2207 !gsi_end_p (si); gsi_next (&si))
2208 {
2209 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2210 STMT_SLP_TYPE (stmt_info) = loop_vect;
2211 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2212 {
2213 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2214 stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
2215 STMT_SLP_TYPE (stmt_info) = loop_vect;
2216 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2217 !gsi_end_p (pi); gsi_next (&pi))
2218 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2219 = loop_vect;
2220 }
2221 }
2222 }
2223 /* Free optimized alias test DDRS. */
2224 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2225 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2226 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2227 /* Reset target cost data. */
2228 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2229 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2230 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2231 /* Reset accumulated rgroup information. */
2232 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2233 /* Reset assorted flags. */
2234 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2235 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2236 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2237 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2238 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2239
2240 goto start_over;
2241 }
2242
2243 /* Function vect_analyze_loop.
2244
2245 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2246 for it. The different analyses will record information in the
2247 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL, the epilogue must
2248 be vectorized. */
2249 opt_loop_vec_info
2250 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
2251 vec_info_shared *shared)
2252 {
2253 auto_vector_sizes vector_sizes;
2254
2255 /* Autodetect first vector size we try. */
2256 current_vector_size = 0;
2257 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2258 unsigned int next_size = 0;
2259
2260 DUMP_VECT_SCOPE ("analyze_loop_nest");
2261
2262 if (loop_outer (loop)
2263 && loop_vec_info_for_loop (loop_outer (loop))
2264 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2265 return opt_loop_vec_info::failure_at (vect_location,
2266 "outer-loop already vectorized.\n");
2267
2268 if (!find_loop_nest (loop, &shared->loop_nest))
2269 return opt_loop_vec_info::failure_at
2270 (vect_location,
2271 "not vectorized: loop nest containing two or more consecutive inner"
2272 " loops cannot be vectorized\n");
2273
2274 unsigned n_stmts = 0;
2275 poly_uint64 autodetected_vector_size = 0;
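/* Sketch of the retry strategy implemented by the loop below (the sizes
   are target-dependent; these numbers are only an illustration): on a
   target offering 32-byte and 16-byte vectors, analysis first runs with
   the autodetected size, and if it fails non-fatally it is repeated with
   each remaining size until one succeeds or the list is exhausted.  */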
2276 while (1)
2277 {
2278 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2279 opt_loop_vec_info loop_vinfo
2280 = vect_analyze_loop_form (loop, shared);
2281 if (!loop_vinfo)
2282 {
2283 if (dump_enabled_p ())
2284 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2285 "bad loop form.\n");
2286 return loop_vinfo;
2287 }
2288
2289 bool fatal = false;
2290
2291 if (orig_loop_vinfo)
2292 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2293
2294 opt_result res = vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts);
2295 if (res)
2296 {
2297 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2298
2299 return loop_vinfo;
2300 }
2301
2302 delete loop_vinfo;
2303
2304 if (next_size == 0)
2305 autodetected_vector_size = current_vector_size;
2306
2307 if (next_size < vector_sizes.length ()
2308 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2309 next_size += 1;
2310
2311 if (fatal
2312 || next_size == vector_sizes.length ()
2313 || known_eq (current_vector_size, 0U))
2314 return opt_loop_vec_info::propagate_failure (res);
2315
2316 /* Try the next biggest vector size. */
2317 current_vector_size = vector_sizes[next_size++];
2318 if (dump_enabled_p ())
2319 {
2320 dump_printf_loc (MSG_NOTE, vect_location,
2321 "***** Re-trying analysis with "
2322 "vector size ");
2323 dump_dec (MSG_NOTE, current_vector_size);
2324 dump_printf (MSG_NOTE, "\n");
2325 }
2326 }
2327 }
2328
2329 /* Return true if there is an in-order reduction function for CODE, storing
2330 it in *REDUC_FN if so. */
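/* For example (an illustrative snippet, not from any testcase):

     double s = 0.0;
     for (int i = 0; i < n; i++)
       s += a[i];

   compiled without -fassociative-math must keep the additions in their
   original left-to-right order, which is what the fold-left internal
   functions provide.  */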
2331
2332 static bool
2333 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2334 {
2335 switch (code)
2336 {
2337 case PLUS_EXPR:
2338 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2339 return true;
2340
2341 default:
2342 return false;
2343 }
2344 }
2345
2346 /* Function reduction_fn_for_scalar_code
2347
2348 Input:
2349 CODE - tree_code of a reduction operation.
2350
2351 Output:
2352 REDUC_FN - the corresponding internal function to be used to reduce the
2353 vector of partial results into a single scalar result, or IFN_LAST
2354 if the operation is a supported reduction operation, but does not have
2355 such an internal function.
2356
2357 Return FALSE if CODE currently cannot be vectorized as reduction. */
2358
2359 static bool
2360 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2361 {
2362 switch (code)
2363 {
2364 case MAX_EXPR:
2365 *reduc_fn = IFN_REDUC_MAX;
2366 return true;
2367
2368 case MIN_EXPR:
2369 *reduc_fn = IFN_REDUC_MIN;
2370 return true;
2371
2372 case PLUS_EXPR:
2373 *reduc_fn = IFN_REDUC_PLUS;
2374 return true;
2375
2376 case BIT_AND_EXPR:
2377 *reduc_fn = IFN_REDUC_AND;
2378 return true;
2379
2380 case BIT_IOR_EXPR:
2381 *reduc_fn = IFN_REDUC_IOR;
2382 return true;
2383
2384 case BIT_XOR_EXPR:
2385 *reduc_fn = IFN_REDUC_XOR;
2386 return true;
2387
2388 case MULT_EXPR:
2389 case MINUS_EXPR:
2390 *reduc_fn = IFN_LAST;
2391 return true;
2392
2393 default:
2394 return false;
2395 }
2396 }
2397
2398 /* If there is a neutral value X such that SLP reduction SLP_NODE would not
2399 be affected by the introduction of additional X elements, return that X,
2400 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2401 is true if the SLP statements perform a single reduction, false if each
2402 statement performs an independent reduction. */
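/* As a small illustration (hypothetical group): padding a three-element
   PLUS_EXPR SLP group out to four vector lanes with the neutral value 0
   leaves the reduction result unchanged, whereas a MULT_EXPR group would
   be padded with 1 and a BIT_AND_EXPR group with all-ones.  */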
2403
2404 static tree
2405 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2406 bool reduc_chain)
2407 {
2408 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2409 stmt_vec_info stmt_vinfo = stmts[0];
2410 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2411 tree scalar_type = TREE_TYPE (vector_type);
2412 struct loop *loop = gimple_bb (stmt_vinfo->stmt)->loop_father;
2413 gcc_assert (loop);
2414
2415 switch (code)
2416 {
2417 case WIDEN_SUM_EXPR:
2418 case DOT_PROD_EXPR:
2419 case SAD_EXPR:
2420 case PLUS_EXPR:
2421 case MINUS_EXPR:
2422 case BIT_IOR_EXPR:
2423 case BIT_XOR_EXPR:
2424 return build_zero_cst (scalar_type);
2425
2426 case MULT_EXPR:
2427 return build_one_cst (scalar_type);
2428
2429 case BIT_AND_EXPR:
2430 return build_all_ones_cst (scalar_type);
2431
2432 case MAX_EXPR:
2433 case MIN_EXPR:
2434 /* For MIN/MAX the initial values are neutral. A reduction chain
2435 has only a single initial value, so that value is neutral for
2436 all statements. */
2437 if (reduc_chain)
2438 return PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt,
2439 loop_preheader_edge (loop));
2440 return NULL_TREE;
2441
2442 default:
2443 return NULL_TREE;
2444 }
2445 }
2446
2447 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2448 STMT is printed with a message MSG. */
2449
2450 static void
2451 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2452 {
2453 dump_printf_loc (msg_type, vect_location, "%s%G", msg, stmt);
2454 }
2455
2456 /* DEF_STMT_INFO occurs in a loop that contains a potential reduction
2457 operation. Return true if the results of DEF_STMT_INFO are something
2458 that can be accumulated by such a reduction. */
2459
2460 static bool
2461 vect_valid_reduction_input_p (stmt_vec_info def_stmt_info)
2462 {
2463 return (is_gimple_assign (def_stmt_info->stmt)
2464 || is_gimple_call (def_stmt_info->stmt)
2465 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_induction_def
2466 || (gimple_code (def_stmt_info->stmt) == GIMPLE_PHI
2467 && STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_internal_def
2468 && !is_loop_header_bb_p (gimple_bb (def_stmt_info->stmt))));
2469 }
2470
2471 /* Detect SLP reduction of the form:
2472
2473 #a1 = phi <a5, a0>
2474 a2 = operation (a1)
2475 a3 = operation (a2)
2476 a4 = operation (a3)
2477 a5 = operation (a4)
2478
2479 #a = phi <a5>
2480
2481 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2482 FIRST_STMT is the first reduction stmt in the chain
2483 (a2 = operation (a1)).
2484
2485 Return TRUE if a reduction chain was detected. */
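/* A typical source-level form of such a chain (purely illustrative):

     int s = 0;
     for (i = 0; i < n; i++)
       s = s + a[2*i] + a[2*i+1];

   which gimplifies into two chained additions per iteration, each
   feeding the next through its result.  */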
2486
2487 static bool
2488 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2489 gimple *first_stmt)
2490 {
2491 struct loop *loop = (gimple_bb (phi))->loop_father;
2492 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2493 enum tree_code code;
2494 gimple *loop_use_stmt = NULL;
2495 stmt_vec_info use_stmt_info;
2496 tree lhs;
2497 imm_use_iterator imm_iter;
2498 use_operand_p use_p;
2499 int nloop_uses, size = 0, n_out_of_loop_uses;
2500 bool found = false;
2501
2502 if (loop != vect_loop)
2503 return false;
2504
2505 auto_vec<stmt_vec_info, 8> reduc_chain;
2506 lhs = PHI_RESULT (phi);
2507 code = gimple_assign_rhs_code (first_stmt);
2508 while (1)
2509 {
2510 nloop_uses = 0;
2511 n_out_of_loop_uses = 0;
2512 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2513 {
2514 gimple *use_stmt = USE_STMT (use_p);
2515 if (is_gimple_debug (use_stmt))
2516 continue;
2517
2518 /* Check if we got back to the reduction phi. */
2519 if (use_stmt == phi)
2520 {
2521 loop_use_stmt = use_stmt;
2522 found = true;
2523 break;
2524 }
2525
2526 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2527 {
2528 loop_use_stmt = use_stmt;
2529 nloop_uses++;
2530 }
2531 else
2532 n_out_of_loop_uses++;
2533
2534 /* There can be either a single use in the loop or two uses in
2535 phi nodes. */
2536 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2537 return false;
2538 }
2539
2540 if (found)
2541 break;
2542
2543 /* We reached a statement with no loop uses. */
2544 if (nloop_uses == 0)
2545 return false;
2546
2547 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2548 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2549 return false;
2550
2551 if (!is_gimple_assign (loop_use_stmt)
2552 || code != gimple_assign_rhs_code (loop_use_stmt)
2553 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2554 return false;
2555
2556 /* Insert USE_STMT into reduction chain. */
2557 use_stmt_info = loop_info->lookup_stmt (loop_use_stmt);
2558 reduc_chain.safe_push (use_stmt_info);
2559
2560 lhs = gimple_assign_lhs (loop_use_stmt);
2561 size++;
2562 }
2563
2564 if (!found || loop_use_stmt != phi || size < 2)
2565 return false;
2566
2567 /* Swap the operands, if needed, to make the reduction operand be the second
2568 operand. */
2569 lhs = PHI_RESULT (phi);
2570 for (unsigned i = 0; i < reduc_chain.length (); ++i)
2571 {
2572 gassign *next_stmt = as_a <gassign *> (reduc_chain[i]->stmt);
2573 if (gimple_assign_rhs2 (next_stmt) == lhs)
2574 {
2575 tree op = gimple_assign_rhs1 (next_stmt);
2576 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2577
2578 /* Check that the other def is either defined in the loop
2579 ("vect_internal_def"), or it's an induction (defined by a
2580 loop-header phi-node). */
2581 if (def_stmt_info
2582 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2583 && vect_valid_reduction_input_p (def_stmt_info))
2584 {
2585 lhs = gimple_assign_lhs (next_stmt);
2586 continue;
2587 }
2588
2589 return false;
2590 }
2591 else
2592 {
2593 tree op = gimple_assign_rhs2 (next_stmt);
2594 stmt_vec_info def_stmt_info = loop_info->lookup_def (op);
2595
2596 /* Check that the other def is either defined in the loop
2597 ("vect_internal_def"), or it's an induction (defined by a
2598 loop-header phi-node). */
2599 if (def_stmt_info
2600 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt))
2601 && vect_valid_reduction_input_p (def_stmt_info))
2602 {
2603 if (dump_enabled_p ())
2604 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: %G",
2605 next_stmt);
2606
2607 swap_ssa_operands (next_stmt,
2608 gimple_assign_rhs1_ptr (next_stmt),
2609 gimple_assign_rhs2_ptr (next_stmt));
2610 update_stmt (next_stmt);
2611
2612 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2613 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2614 }
2615 else
2616 return false;
2617 }
2618
2619 lhs = gimple_assign_lhs (next_stmt);
2620 }
2621
2622 /* Build up the actual chain. */
2623 for (unsigned i = 0; i < reduc_chain.length () - 1; ++i)
2624 {
2625 REDUC_GROUP_FIRST_ELEMENT (reduc_chain[i]) = reduc_chain[0];
2626 REDUC_GROUP_NEXT_ELEMENT (reduc_chain[i]) = reduc_chain[i+1];
2627 }
2628 REDUC_GROUP_FIRST_ELEMENT (reduc_chain.last ()) = reduc_chain[0];
2629 REDUC_GROUP_NEXT_ELEMENT (reduc_chain.last ()) = NULL;
2630
2631 /* Save the chain for further analysis in SLP detection. */
2632 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (reduc_chain[0]);
2633 REDUC_GROUP_SIZE (reduc_chain[0]) = size;
2634
2635 return true;
2636 }
2637
2638 /* Return true if we need an in-order reduction for operation CODE
2639 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2640 overflow must wrap. */
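/* For instance, a signed integer plus-reduction compiled with -ftrapv
   cannot be reassociated: a different association might overflow (and
   trap) where the original evaluation order does not, so such a
   reduction has to be done in order.  (Illustrative reasoning for the
   checks below.)  */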
2641
2642 static bool
2643 needs_fold_left_reduction_p (tree type, tree_code code,
2644 bool need_wrapping_integral_overflow)
2645 {
2646 /* CHECKME: check for !flag_finite_math_only too? */
2647 if (SCALAR_FLOAT_TYPE_P (type))
2648 switch (code)
2649 {
2650 case MIN_EXPR:
2651 case MAX_EXPR:
2652 return false;
2653
2654 default:
2655 return !flag_associative_math;
2656 }
2657
2658 if (INTEGRAL_TYPE_P (type))
2659 {
2660 if (!operation_no_trapping_overflow (type, code))
2661 return true;
2662 if (need_wrapping_integral_overflow
2663 && !TYPE_OVERFLOW_WRAPS (type)
2664 && operation_can_overflow (code))
2665 return true;
2666 return false;
2667 }
2668
2669 if (SAT_FIXED_POINT_TYPE_P (type))
2670 return true;
2671
2672 return false;
2673 }
2674
2675 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2676 reduction operation CODE has a handled computation expression. */
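/* Illustrative example (SSA names invented): for

     s_1 = PHI <s_0, s_3>
     t_2 = s_1 + x_4;
     s_3 = t_2 + y_5;

   the path walked from the latch value s_3 back to the PHI result s_1
   is s_3 -> t_2 -> s_1, and every statement on that path must use the
   same reduction CODE.  */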
2677
2678 bool
2679 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2680 tree loop_arg, enum tree_code code)
2681 {
2682 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2683 auto_bitmap visited;
2684 tree lookfor = PHI_RESULT (phi);
2685 ssa_op_iter curri;
2686 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2687 while (USE_FROM_PTR (curr) != loop_arg)
2688 curr = op_iter_next_use (&curri);
2689 curri.i = curri.numops;
2690 do
2691 {
2692 path.safe_push (std::make_pair (curri, curr));
2693 tree use = USE_FROM_PTR (curr);
2694 if (use == lookfor)
2695 break;
2696 gimple *def = SSA_NAME_DEF_STMT (use);
2697 if (gimple_nop_p (def)
2698 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2699 {
2700 pop:
2701 do
2702 {
2703 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2704 curri = x.first;
2705 curr = x.second;
2706 do
2707 curr = op_iter_next_use (&curri);
2708 /* Skip already visited or non-SSA operands (from iterating
2709 over PHI args). */
2710 while (curr != NULL_USE_OPERAND_P
2711 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2712 || ! bitmap_set_bit (visited,
2713 SSA_NAME_VERSION
2714 (USE_FROM_PTR (curr)))));
2715 }
2716 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2717 if (curr == NULL_USE_OPERAND_P)
2718 break;
2719 }
2720 else
2721 {
2722 if (gimple_code (def) == GIMPLE_PHI)
2723 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2724 else
2725 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2726 while (curr != NULL_USE_OPERAND_P
2727 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2728 || ! bitmap_set_bit (visited,
2729 SSA_NAME_VERSION
2730 (USE_FROM_PTR (curr)))))
2731 curr = op_iter_next_use (&curri);
2732 if (curr == NULL_USE_OPERAND_P)
2733 goto pop;
2734 }
2735 }
2736 while (1);
2737 if (dump_file && (dump_flags & TDF_DETAILS))
2738 {
2739 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2740 unsigned i;
2741 std::pair<ssa_op_iter, use_operand_p> *x;
2742 FOR_EACH_VEC_ELT (path, i, x)
2743 dump_printf (MSG_NOTE, "%T ", USE_FROM_PTR (x->second));
2744 dump_printf (MSG_NOTE, "\n");
2745 }
2746
2747 /* Check whether the reduction path detected is valid. */
2748 bool fail = path.length () == 0;
2749 bool neg = false;
2750 for (unsigned i = 1; i < path.length (); ++i)
2751 {
2752 gimple *use_stmt = USE_STMT (path[i].second);
2753 tree op = USE_FROM_PTR (path[i].second);
2754 if (! has_single_use (op)
2755 || ! is_gimple_assign (use_stmt))
2756 {
2757 fail = true;
2758 break;
2759 }
2760 if (gimple_assign_rhs_code (use_stmt) != code)
2761 {
2762 if (code == PLUS_EXPR
2763 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2764 {
2765 /* Track whether we negate the reduction value each iteration. */
2766 if (gimple_assign_rhs2 (use_stmt) == op)
2767 neg = ! neg;
2768 }
2769 else
2770 {
2771 fail = true;
2772 break;
2773 }
2774 }
2775 }
2776 return ! fail && ! neg;
2777 }
2778
2779
2780 /* Function vect_is_simple_reduction
2781
2782 (1) Detect a cross-iteration def-use cycle that represents a simple
2783 reduction computation. We look for the following pattern:
2784
2785 loop_header:
2786 a1 = phi < a0, a2 >
2787 a3 = ...
2788 a2 = operation (a3, a1)
2789
2790 or
2791
2792 a3 = ...
2793 loop_header:
2794 a1 = phi < a0, a2 >
2795 a2 = operation (a3, a1)
2796
2797 such that:
2798 1. operation is commutative and associative and it is safe to
2799 change the order of the computation
2800 2. no uses for a2 in the loop (a2 is used out of the loop)
2801 3. no uses of a1 in the loop besides the reduction operation
2802 4. no uses of a1 outside the loop.
2803
2804 Conditions 1,4 are tested here.
2805 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2806
2807 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2808 nested cycles.
2809
2810 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2811 reductions:
2812
2813 a1 = phi < a0, a2 >
2814 inner loop (def of a3)
2815 a2 = phi < a3 >
2816
2817 (4) Detect condition expressions, i.e.:
2818 for (int i = 0; i < N; i++)
2819 if (a[i] < val)
2820 ret_val = a[i];
2821
2822 */
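/* For case (4) the loop body above roughly gimplifies, after
   if-conversion, to (illustrative SSA names):

     _1 = a[i_2];
     ret_val_3 = _1 < val_4 ? _1 : ret_val_5;

   i.e. a COND_EXPR whose "else" value carries the reduction PHI.  */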
2823
2824 static stmt_vec_info
2825 vect_is_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
2826 bool *double_reduc,
2827 bool need_wrapping_integral_overflow,
2828 enum vect_reduction_type *v_reduc_type)
2829 {
2830 gphi *phi = as_a <gphi *> (phi_info->stmt);
2831 struct loop *loop = (gimple_bb (phi))->loop_father;
2832 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2833 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
2834 gimple *phi_use_stmt = NULL;
2835 enum tree_code orig_code, code;
2836 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2837 tree type;
2838 tree name;
2839 imm_use_iterator imm_iter;
2840 use_operand_p use_p;
2841 bool phi_def;
2842
2843 *double_reduc = false;
2844 *v_reduc_type = TREE_CODE_REDUCTION;
2845
2846 tree phi_name = PHI_RESULT (phi);
2847 /* ??? If there are no uses of the PHI result the inner loop reduction
2848 won't be detected as possibly double-reduction by vectorizable_reduction
2849 because that tries to walk the PHI arg from the preheader edge which
2850 can be constant. See PR60382. */
2851 if (has_zero_uses (phi_name))
2852 return NULL;
2853 unsigned nphi_def_loop_uses = 0;
2854 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2855 {
2856 gimple *use_stmt = USE_STMT (use_p);
2857 if (is_gimple_debug (use_stmt))
2858 continue;
2859
2860 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2861 {
2862 if (dump_enabled_p ())
2863 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2864 "intermediate value used outside loop.\n");
2865
2866 return NULL;
2867 }
2868
2869 nphi_def_loop_uses++;
2870 phi_use_stmt = use_stmt;
2871 }
2872
2873 edge latch_e = loop_latch_edge (loop);
2874 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2875 if (TREE_CODE (loop_arg) != SSA_NAME)
2876 {
2877 if (dump_enabled_p ())
2878 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2879 "reduction: not ssa_name: %T\n", loop_arg);
2880 return NULL;
2881 }
2882
2883 stmt_vec_info def_stmt_info = loop_info->lookup_def (loop_arg);
2884 if (!def_stmt_info
2885 || !flow_bb_inside_loop_p (loop, gimple_bb (def_stmt_info->stmt)))
2886 return NULL;
2887
2888 if (gassign *def_stmt = dyn_cast <gassign *> (def_stmt_info->stmt))
2889 {
2890 name = gimple_assign_lhs (def_stmt);
2891 phi_def = false;
2892 }
2893 else if (gphi *def_stmt = dyn_cast <gphi *> (def_stmt_info->stmt))
2894 {
2895 name = PHI_RESULT (def_stmt);
2896 phi_def = true;
2897 }
2898 else
2899 {
2900 if (dump_enabled_p ())
2901 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2902 "reduction: unhandled reduction operation: %G",
2903 def_stmt_info->stmt);
2904 return NULL;
2905 }
2906
2907 unsigned nlatch_def_loop_uses = 0;
2908 auto_vec<gphi *, 3> lcphis;
2909 bool inner_loop_of_double_reduc = false;
2910 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2911 {
2912 gimple *use_stmt = USE_STMT (use_p);
2913 if (is_gimple_debug (use_stmt))
2914 continue;
2915 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2916 nlatch_def_loop_uses++;
2917 else
2918 {
2919 /* We can have more than one loop-closed PHI. */
2920 lcphis.safe_push (as_a <gphi *> (use_stmt));
2921 if (nested_in_vect_loop
2922 && (STMT_VINFO_DEF_TYPE (loop_info->lookup_stmt (use_stmt))
2923 == vect_double_reduction_def))
2924 inner_loop_of_double_reduc = true;
2925 }
2926 }
2927
2928 /* If this isn't a nested cycle, or if the nested cycle reduction value
2929 is used outside of the inner loop, we cannot handle uses of the reduction
2930 value. */
2931 if ((!nested_in_vect_loop || inner_loop_of_double_reduc)
2932 && (nlatch_def_loop_uses > 1 || nphi_def_loop_uses > 1))
2933 {
2934 if (dump_enabled_p ())
2935 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2936 "reduction used in loop.\n");
2937 return NULL;
2938 }
2939
2940 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2941 defined in the inner loop. */
2942 if (phi_def)
2943 {
2944 gphi *def_stmt = as_a <gphi *> (def_stmt_info->stmt);
2945 op1 = PHI_ARG_DEF (def_stmt, 0);
2946
2947 if (gimple_phi_num_args (def_stmt) != 1
2948 || TREE_CODE (op1) != SSA_NAME)
2949 {
2950 if (dump_enabled_p ())
2951 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2952 "unsupported phi node definition.\n");
2953
2954 return NULL;
2955 }
2956
2957 gimple *def1 = SSA_NAME_DEF_STMT (op1);
2958 if (gimple_bb (def1)
2959 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2960 && loop->inner
2961 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2962 && is_gimple_assign (def1)
2963 && is_a <gphi *> (phi_use_stmt)
2964 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2965 {
2966 if (dump_enabled_p ())
2967 report_vect_op (MSG_NOTE, def_stmt,
2968 "detected double reduction: ");
2969
2970 *double_reduc = true;
2971 return def_stmt_info;
2972 }
2973
2974 return NULL;
2975 }
2976
2977 /* If we are vectorizing an inner reduction, we execute it in the
2978 original order only when we are not dealing with a double
2979 reduction. */
2980 bool check_reduction = true;
2981 if (flow_loop_nested_p (vect_loop, loop))
2982 {
2983 gphi *lcphi;
2984 unsigned i;
2985 check_reduction = false;
2986 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
2987 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
2988 {
2989 gimple *use_stmt = USE_STMT (use_p);
2990 if (is_gimple_debug (use_stmt))
2991 continue;
2992 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
2993 check_reduction = true;
2994 }
2995 }
2996
2997 gassign *def_stmt = as_a <gassign *> (def_stmt_info->stmt);
2998 code = orig_code = gimple_assign_rhs_code (def_stmt);
2999
3000 if (nested_in_vect_loop && !check_reduction)
3001 {
3002 /* FIXME: Even for non-reductions, code generation is funneled
3003 through vectorizable_reduction for the stmt defining the
3004 PHI latch value. So we have to artificially restrict ourselves
3005 to the supported operations. */
3006 switch (get_gimple_rhs_class (code))
3007 {
3008 case GIMPLE_BINARY_RHS:
3009 case GIMPLE_TERNARY_RHS:
3010 break;
3011 default:
3012 /* Not supported by vectorizable_reduction. */
3013 if (dump_enabled_p ())
3014 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3015 "nested cycle: not handled operation: ");
3016 return NULL;
3017 }
3018 if (dump_enabled_p ())
3019 report_vect_op (MSG_NOTE, def_stmt, "detected nested cycle: ");
3020 return def_stmt_info;
3021 }
3022
3023 /* We can handle "res -= x[i]", which is non-associative, by
3024 simply rewriting it as "res += -x[i]". Avoid changing the
3025 gimple instruction for the first simple tests and only do this
3026 if we're allowed to change the code at all. */
3027 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3028 code = PLUS_EXPR;
3029
3030 if (code == COND_EXPR)
3031 {
3032 if (! nested_in_vect_loop)
3033 *v_reduc_type = COND_REDUCTION;
3034
3035 op3 = gimple_assign_rhs1 (def_stmt);
3036 if (COMPARISON_CLASS_P (op3))
3037 {
3038 op4 = TREE_OPERAND (op3, 1);
3039 op3 = TREE_OPERAND (op3, 0);
3040 }
3041 if (op3 == phi_name || op4 == phi_name)
3042 {
3043 if (dump_enabled_p ())
3044 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3045 "reduction: condition depends on previous"
3046 " iteration: ");
3047 return NULL;
3048 }
3049
3050 op1 = gimple_assign_rhs2 (def_stmt);
3051 op2 = gimple_assign_rhs3 (def_stmt);
3052 }
3053 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3054 {
3055 if (dump_enabled_p ())
3056 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3057 "reduction: not commutative/associative: ");
3058 return NULL;
3059 }
3060 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3061 {
3062 op1 = gimple_assign_rhs1 (def_stmt);
3063 op2 = gimple_assign_rhs2 (def_stmt);
3064 }
3065 else
3066 {
3067 if (dump_enabled_p ())
3068 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3069 "reduction: not handled operation: ");
3070 return NULL;
3071 }
3072
3073 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3074 {
3075 if (dump_enabled_p ())
3076 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3077 "reduction: both uses not ssa_names: ");
3078
3079 return NULL;
3080 }
3081
3082 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3083 if ((TREE_CODE (op1) == SSA_NAME
3084 && !types_compatible_p (type,TREE_TYPE (op1)))
3085 || (TREE_CODE (op2) == SSA_NAME
3086 && !types_compatible_p (type, TREE_TYPE (op2)))
3087 || (op3 && TREE_CODE (op3) == SSA_NAME
3088 && !types_compatible_p (type, TREE_TYPE (op3)))
3089 || (op4 && TREE_CODE (op4) == SSA_NAME
3090 && !types_compatible_p (type, TREE_TYPE (op4))))
3091 {
3092 if (dump_enabled_p ())
3093 {
3094 dump_printf_loc (MSG_NOTE, vect_location,
3095 "reduction: multiple types: operation type: "
3096 "%T, operands types: %T,%T",
3097 type, TREE_TYPE (op1), TREE_TYPE (op2));
3098 if (op3)
3099 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op3));
3100
3101 if (op4)
3102 dump_printf (MSG_NOTE, ",%T", TREE_TYPE (op4));
3103 dump_printf (MSG_NOTE, "\n");
3104 }
3105
3106 return NULL;
3107 }
3108
3109 /* Check whether it's ok to change the order of the computation.
3110 Generally, when vectorizing a reduction we change the order of the
3111 computation. This may change the behavior of the program in some
3112 cases, so we need to check that this is ok. One exception is when
3113 vectorizing an outer-loop: the inner-loop is executed sequentially,
3114 and therefore vectorizing reductions in the inner-loop during
3115 outer-loop vectorization is safe. */
3116 if (check_reduction
3117 && *v_reduc_type == TREE_CODE_REDUCTION
3118 && needs_fold_left_reduction_p (type, code,
3119 need_wrapping_integral_overflow))
3120 *v_reduc_type = FOLD_LEFT_REDUCTION;
3121
3122 /* Reduction is safe. We're dealing with one of the following:
3123 1) integer arithmetic and no trapv
3124 2) floating point arithmetic, and special flags permit this optimization
3125 3) nested cycle (i.e., outer loop vectorization). */
3126 stmt_vec_info def1_info = loop_info->lookup_def (op1);
3127 stmt_vec_info def2_info = loop_info->lookup_def (op2);
3128 if (code != COND_EXPR && !def1_info && !def2_info)
3129 {
3130 if (dump_enabled_p ())
3131 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3132 return NULL;
3133 }
3134
3135 /* Check that one def is the reduction def, defined by PHI,
3136 the other def is either defined in the loop ("vect_internal_def"),
3137 or it's an induction (defined by a loop-header phi-node). */
3138
3139 if (def2_info
3140 && def2_info->stmt == phi
3141 && (code == COND_EXPR
3142 || !def1_info
3143 || !flow_bb_inside_loop_p (loop, gimple_bb (def1_info->stmt))
3144 || vect_valid_reduction_input_p (def1_info)))
3145 {
3146 if (dump_enabled_p ())
3147 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3148 return def_stmt_info;
3149 }
3150
3151 if (def1_info
3152 && def1_info->stmt == phi
3153 && (code == COND_EXPR
3154 || !def2_info
3155 || !flow_bb_inside_loop_p (loop, gimple_bb (def2_info->stmt))
3156 || vect_valid_reduction_input_p (def2_info)))
3157 {
3158 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3159 {
3160 /* Check if we can swap operands (just for simplicity - so that
3161 the rest of the code can assume that the reduction variable
3162 is always the last (second) argument). */
3163 if (code == COND_EXPR)
3164 {
3165 /* Swap cond_expr by inverting the condition. */
3166 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3167 enum tree_code invert_code = ERROR_MARK;
3168 enum tree_code cond_code = TREE_CODE (cond_expr);
3169
3170 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3171 {
3172 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3173 invert_code = invert_tree_comparison (cond_code, honor_nans);
3174 }
3175 if (invert_code != ERROR_MARK)
3176 {
3177 TREE_SET_CODE (cond_expr, invert_code);
3178 swap_ssa_operands (def_stmt,
3179 gimple_assign_rhs2_ptr (def_stmt),
3180 gimple_assign_rhs3_ptr (def_stmt));
3181 }
3182 else
3183 {
3184 if (dump_enabled_p ())
3185 report_vect_op (MSG_NOTE, def_stmt,
3186 "detected reduction: cannot swap operands "
3187 "for cond_expr");
3188 return NULL;
3189 }
3190 }
3191 else
3192 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3193 gimple_assign_rhs2_ptr (def_stmt));
3194
3195 if (dump_enabled_p ())
3196 report_vect_op (MSG_NOTE, def_stmt,
3197 "detected reduction: need to swap operands: ");
3198
3199 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3200 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3201 }
3202 else
3203 {
3204 if (dump_enabled_p ())
3205 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3206 }
3207
3208 return def_stmt_info;
3209 }
3210
3211 /* Try to find SLP reduction chain. */
3212 if (! nested_in_vect_loop
3213 && code != COND_EXPR
3214 && orig_code != MINUS_EXPR
3215 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3216 {
3217 if (dump_enabled_p ())
3218 report_vect_op (MSG_NOTE, def_stmt,
3219 "reduction: detected reduction chain: ");
3220
3221 return def_stmt_info;
3222 }
3223
3224 /* Look for the expression computing loop_arg from loop PHI result. */
3225 if (check_reduction_path (vect_location, loop, phi, loop_arg, code))
3226 return def_stmt_info;
3227
3228 if (dump_enabled_p ())
3229 {
3230 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3231 "reduction: unknown pattern: ");
3232 }
3233
3234 return NULL;
3235 }
3236
3237 /* Wrapper around vect_is_simple_reduction, which will modify code
3238 in-place if it enables detection of more reductions. Arguments
3239 as there. */
3240
3241 stmt_vec_info
3242 vect_force_simple_reduction (loop_vec_info loop_info, stmt_vec_info phi_info,
3243 bool *double_reduc,
3244 bool need_wrapping_integral_overflow)
3245 {
3246 enum vect_reduction_type v_reduc_type;
3247 stmt_vec_info def_info
3248 = vect_is_simple_reduction (loop_info, phi_info, double_reduc,
3249 need_wrapping_integral_overflow,
3250 &v_reduc_type);
3251 if (def_info)
3252 {
3253 STMT_VINFO_REDUC_TYPE (phi_info) = v_reduc_type;
3254 STMT_VINFO_REDUC_DEF (phi_info) = def_info;
3255 STMT_VINFO_REDUC_TYPE (def_info) = v_reduc_type;
3256 STMT_VINFO_REDUC_DEF (def_info) = phi_info;
3257 }
3258 return def_info;
3259 }
3260
3261 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3262 int
3263 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3264 int *peel_iters_epilogue,
3265 stmt_vector_for_cost *scalar_cost_vec,
3266 stmt_vector_for_cost *prologue_cost_vec,
3267 stmt_vector_for_cost *epilogue_cost_vec)
3268 {
3269 int retval = 0;
3270 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3271
3272 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3273 {
3274 *peel_iters_epilogue = assumed_vf / 2;
3275 if (dump_enabled_p ())
3276 dump_printf_loc (MSG_NOTE, vect_location,
3277 "cost model: epilogue peel iters set to vf/2 "
3278 "because loop iterations are unknown .\n");
3279
3280 /* If peeled iterations are known but the number of scalar loop
3281 iterations is unknown, count a taken branch per peeled loop. */
3282 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3283 NULL, 0, vect_prologue);
3284 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3285 NULL, 0, vect_epilogue);
3286 }
3287 else
3288 {
3289 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3290 peel_iters_prologue = niters < peel_iters_prologue ?
3291 niters : peel_iters_prologue;
3292 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3293 /* If we need to peel for gaps, but no epilogue peeling would otherwise
3294 be required, we have to peel VF iterations. */
3295 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3296 *peel_iters_epilogue = assumed_vf;
3297 }
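/* Worked example with made-up numbers: niters = 100, a prologue peel of 3
   and an assumed VF of 8 give an epilogue peel count of (100 - 3) % 8 = 1;
   if peeling for gaps were required and the remainder had been 0, a full
   VF of 8 epilogue iterations would be assumed instead.  */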
3298
3299 stmt_info_for_cost *si;
3300 int j;
3301 if (peel_iters_prologue)
3302 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3303 retval += record_stmt_cost (prologue_cost_vec,
3304 si->count * peel_iters_prologue,
3305 si->kind, si->stmt_info, si->misalign,
3306 vect_prologue);
3307 if (*peel_iters_epilogue)
3308 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3309 retval += record_stmt_cost (epilogue_cost_vec,
3310 si->count * *peel_iters_epilogue,
3311 si->kind, si->stmt_info, si->misalign,
3312 vect_epilogue);
3313
3314 return retval;
3315 }
3316
3317 /* Function vect_estimate_min_profitable_iters
3318
3319 Return the number of iterations required for the vector version of the
3320 loop to be profitable relative to the cost of the scalar version of the
3321 loop.
3322
3323 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3324 of iterations for vectorization. -1 value means loop vectorization
3325 is not profitable. This returned value may be used for dynamic
3326 profitability check.
3327
3328 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3329 for static check against estimated number of iterations. */
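/* Informally (a simplification of the computation below): the vector loop
   becomes profitable once the saving of the vector body over assumed_vf
   scalar iterations outweighs the one-off outside costs such as versioning
   checks and prologue/epilogue peeling.  */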
3330
3331 static void
3332 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3333 int *ret_min_profitable_niters,
3334 int *ret_min_profitable_estimate)
3335 {
3336 int min_profitable_iters;
3337 int min_profitable_estimate;
3338 int peel_iters_prologue;
3339 int peel_iters_epilogue;
3340 unsigned vec_inside_cost = 0;
3341 int vec_outside_cost = 0;
3342 unsigned vec_prologue_cost = 0;
3343 unsigned vec_epilogue_cost = 0;
3344 int scalar_single_iter_cost = 0;
3345 int scalar_outside_cost = 0;
3346 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3347 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3348 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3349
3350 /* Cost model disabled. */
3351 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3352 {
3353 if (dump_enabled_p ())
3354 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3355 *ret_min_profitable_niters = 0;
3356 *ret_min_profitable_estimate = 0;
3357 return;
3358 }
3359
3360 /* Requires loop versioning tests to handle misalignment. */
3361 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3362 {
3363 /* FIXME: Make cost depend on complexity of individual check. */
3364 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3365 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3366 vect_prologue);
3367 if (dump_enabled_p ())
3368 dump_printf (MSG_NOTE,
3369 "cost model: Adding cost of checks for loop "
3370 "versioning to treat misalignment.\n");
3371 }
3372
3373 /* Requires loop versioning with alias checks. */
3374 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3375 {
3376 /* FIXME: Make cost depend on complexity of individual check. */
3377 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3378 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3379 vect_prologue);
3380 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3381 if (len)
3382 /* Count LEN - 1 ANDs and LEN comparisons. */
3383 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3384 NULL, 0, vect_prologue);
3385 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3386 if (len)
3387 {
3388 /* Count LEN - 1 ANDs and LEN comparisons. */
3389 unsigned int nstmts = len * 2 - 1;
3390 /* +1 for each bias that needs adding. */
3391 for (unsigned int i = 0; i < len; ++i)
3392 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3393 nstmts += 1;
3394 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3395 NULL, 0, vect_prologue);
3396 }
3397 if (dump_enabled_p ())
3398 dump_printf (MSG_NOTE,
3399 "cost model: Adding cost of checks for loop "
3400 "versioning aliasing.\n");
3401 }
3402
3403 /* Requires loop versioning with niter checks. */
3404 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3405 {
3406 /* FIXME: Make cost depend on complexity of individual check. */
3407 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3408 vect_prologue);
3409 if (dump_enabled_p ())
3410 dump_printf (MSG_NOTE,
3411 "cost model: Adding cost of checks for loop "
3412 "versioning niters.\n");
3413 }
3414
3415 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3416 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3417 vect_prologue);
3418
3419 /* Count statements in scalar loop. Using this as scalar cost for a single
3420 iteration for now.
3421
3422 TODO: Add outer loop support.
3423
3424 TODO: Consider assigning different costs to different scalar
3425 statements. */
3426
3427 scalar_single_iter_cost
3428 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3429
3430 /* Add additional cost for the peeled instructions in prologue and epilogue
3431 loop. (For fully-masked loops there will be no peeling.)
3432
3433 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3434 at compile time, we assume it's vf/2 (the worst would be vf-1).
3435
3436 TODO: Build an expression that represents peel_iters for prologue and
3437 epilogue to be used in a run-time test. */
3438
3439 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3440 {
3441 peel_iters_prologue = 0;
3442 peel_iters_epilogue = 0;
3443
3444 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3445 {
3446 /* We need to peel exactly one iteration. */
3447 peel_iters_epilogue += 1;
3448 stmt_info_for_cost *si;
3449 int j;
3450 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3451 j, si)
3452 (void) add_stmt_cost (target_cost_data, si->count,
3453 si->kind, si->stmt_info, si->misalign,
3454 vect_epilogue);
3455 }
3456 }
3457 else if (npeel < 0)
3458 {
3459 peel_iters_prologue = assumed_vf / 2;
3460 if (dump_enabled_p ())
3461 dump_printf (MSG_NOTE, "cost model: "
3462 "prologue peel iters set to vf/2.\n");
3463
3464       /* If peeling for alignment is unknown, the loop bound of the main loop
3465          becomes unknown.  */
3466 peel_iters_epilogue = assumed_vf / 2;
3467 if (dump_enabled_p ())
3468 dump_printf (MSG_NOTE, "cost model: "
3469 "epilogue peel iters set to vf/2 because "
3470 "peeling for alignment is unknown.\n");
3471
3472 /* If peeled iterations are unknown, count a taken branch and a not taken
3473 branch per peeled loop. Even if scalar loop iterations are known,
3474 vector iterations are not known since peeled prologue iterations are
3475 not known. Hence guards remain the same. */
3476 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3477 NULL, 0, vect_prologue);
3478 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3479 NULL, 0, vect_prologue);
3480 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3481 NULL, 0, vect_epilogue);
3482 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3483 NULL, 0, vect_epilogue);
3484 stmt_info_for_cost *si;
3485 int j;
3486 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3487 {
3488 (void) add_stmt_cost (target_cost_data,
3489 si->count * peel_iters_prologue,
3490 si->kind, si->stmt_info, si->misalign,
3491 vect_prologue);
3492 (void) add_stmt_cost (target_cost_data,
3493 si->count * peel_iters_epilogue,
3494 si->kind, si->stmt_info, si->misalign,
3495 vect_epilogue);
3496 }
3497 }
3498 else
3499 {
3500 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3501 stmt_info_for_cost *si;
3502 int j;
3503 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3504
3505 prologue_cost_vec.create (2);
3506 epilogue_cost_vec.create (2);
3507 peel_iters_prologue = npeel;
3508
3509 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3510 &peel_iters_epilogue,
3511 &LOOP_VINFO_SCALAR_ITERATION_COST
3512 (loop_vinfo),
3513 &prologue_cost_vec,
3514 &epilogue_cost_vec);
3515
3516 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3517 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3518 si->misalign, vect_prologue);
3519
3520 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3521 (void) add_stmt_cost (data, si->count, si->kind, si->stmt_info,
3522 si->misalign, vect_epilogue);
3523
3524 prologue_cost_vec.release ();
3525 epilogue_cost_vec.release ();
3526 }
3527
3528 /* FORNOW: The scalar outside cost is incremented in one of the
3529 following ways:
3530
3531 1. The vectorizer checks for alignment and aliasing and generates
3532 a condition that allows dynamic vectorization. A cost model
3533      check is ANDed with the versioning condition.  Hence the scalar
3534      code path now has the added cost of the versioning check.
3535
3536 if (cost > th & versioning_check)
3537 jmp to vector code
3538
3539 Hence run-time scalar is incremented by not-taken branch cost.
3540
3541 2. The vectorizer then checks if a prologue is required. If the
3542 cost model check was not done before during versioning, it has to
3543 be done before the prologue check.
3544
3545 if (cost <= th)
3546 prologue = scalar_iters
3547 if (prologue == 0)
3548 jmp to vector code
3549 else
3550 execute prologue
3551 if (prologue == num_iters)
3552 go to exit
3553
3554 Hence the run-time scalar cost is incremented by a taken branch,
3555 plus a not-taken branch, plus a taken branch cost.
3556
3557 3. The vectorizer then checks if an epilogue is required. If the
3558 cost model check was not done before during prologue check, it
3559 has to be done with the epilogue check.
3560
3561 if (prologue == 0)
3562 jmp to vector code
3563 else
3564 execute prologue
3565 if (prologue == num_iters)
3566 go to exit
3567 vector code:
3568 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3569 jmp to epilogue
3570
3571 Hence the run-time scalar cost should be incremented by 2 taken
3572 branches.
3573
3574      TODO: The back end may reorder the BBs differently and reverse
3575 conditions/branch directions. Change the estimates below to
3576 something more reasonable. */
3577
3578 /* If the number of iterations is known and we do not do versioning, we can
3579 decide whether to vectorize at compile time. Hence the scalar version
3580      does not carry cost model guard costs.  */
3581 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3582 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3583 {
3584 /* Cost model check occurs at versioning. */
3585 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3586 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3587 else
3588 {
3589 /* Cost model check occurs at prologue generation. */
3590 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3591 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3592 + vect_get_stmt_cost (cond_branch_not_taken);
3593 /* Cost model check occurs at epilogue generation. */
3594 else
3595 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3596 }
3597 }
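
  /* For illustration only: with typical default branch costs of 3 for a
     taken branch and 1 for a not-taken branch (targets can override these),
     a versioned loop adds 1 to SCALAR_OUTSIDE_COST, unknown prologue peeling
     adds 2 * 3 + 1 = 7, and known peeling adds 2 * 3 = 6.  */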
3598
3599 /* Complete the target-specific cost calculations. */
3600 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3601 &vec_inside_cost, &vec_epilogue_cost);
3602
3603 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3604
3605 if (dump_enabled_p ())
3606 {
3607 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3608 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3609 vec_inside_cost);
3610 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3611 vec_prologue_cost);
3612 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3613 vec_epilogue_cost);
3614 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3615 scalar_single_iter_cost);
3616 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3617 scalar_outside_cost);
3618 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3619 vec_outside_cost);
3620 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3621 peel_iters_prologue);
3622 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3623 peel_iters_epilogue);
3624 }
3625
3626 /* Calculate number of iterations required to make the vector version
3627 profitable, relative to the loop bodies only. The following condition
3628 must hold true:
3629 SIC * niters + SOC > VIC * ((niters - NPEEL) / VF) + VOC
3630 where
3631 SIC = scalar iteration cost, VIC = vector iteration cost,
3632 VOC = vector outside cost, VF = vectorization factor,
3633 NPEEL = prologue iterations + epilogue iterations,
3634 SOC = scalar outside cost for run time cost model check. */
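  /* A made-up instance of the condition above: with SIC = 4, VIC = 6,
     VF = 4, NPEEL = 0, VOC = 32 and SOC = 0 it reads
       4 * niters > 6 * (niters / 4) + 32,
     i.e. 2.5 * niters > 32, so the vector loop starts to win for
     niters >= 13.  */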
3635
3636 int saving_per_viter = (scalar_single_iter_cost * assumed_vf
3637 - vec_inside_cost);
3638 if (saving_per_viter <= 0)
3639 {
3640 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3641 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3642 "vectorization did not happen for a simd loop");
3643
3644 if (dump_enabled_p ())
3645 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3646 "cost model: the vector iteration cost = %d "
3647 "divided by the scalar iteration cost = %d "
3648 "is greater or equal to the vectorization factor = %d"
3649 ".\n",
3650 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3651 *ret_min_profitable_niters = -1;
3652 *ret_min_profitable_estimate = -1;
3653 return;
3654 }
3655
3656 /* ??? The "if" arm is written to handle all cases; see below for what
3657 we would do for !LOOP_VINFO_FULLY_MASKED_P. */
3658 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3659 {
3660 /* Rewriting the condition above in terms of the number of
3661 vector iterations (vniters) rather than the number of
3662 scalar iterations (niters) gives:
3663
3664 SIC * (vniters * VF + NPEEL) + SOC > VIC * vniters + VOC
3665
3666 <==> vniters * (SIC * VF - VIC) > VOC - SIC * NPEEL - SOC
3667
3668 For integer N, X and Y when X > 0:
3669
3670 N * X > Y <==> N >= (Y /[floor] X) + 1. */
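      /* For example, Y = 17 and X = 5: N * 5 > 17 first holds at N = 4,
	 and (17 /[floor] 5) + 1 = 3 + 1 = 4, which is how MIN_VEC_NITERS
	 is computed from OUTSIDE_OVERHEAD and SAVING_PER_VITER below.  */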
3671 int outside_overhead = (vec_outside_cost
3672 - scalar_single_iter_cost * peel_iters_prologue
3673 - scalar_single_iter_cost * peel_iters_epilogue
3674 - scalar_outside_cost);
3675 /* We're only interested in cases that require at least one
3676 vector iteration. */
3677 int min_vec_niters = 1;
3678 if (outside_overhead > 0)
3679 min_vec_niters = outside_overhead / saving_per_viter + 1;
3680
3681 if (dump_enabled_p ())
3682 dump_printf (MSG_NOTE, " Minimum number of vector iterations: %d\n",
3683 min_vec_niters);
3684
3685 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3686 {
3687 /* Now that we know the minimum number of vector iterations,
3688 find the minimum niters for which the scalar cost is larger:
3689
3690 SIC * niters > VIC * vniters + VOC - SOC
3691
3692 We know that the minimum niters is no more than
3693 vniters * VF + NPEEL, but it might be (and often is) less
3694 than that if a partial vector iteration is cheaper than the
3695 equivalent scalar code. */
3696 int threshold = (vec_inside_cost * min_vec_niters
3697 + vec_outside_cost
3698 - scalar_outside_cost);
3699 if (threshold <= 0)
3700 min_profitable_iters = 1;
3701 else
3702 min_profitable_iters = threshold / scalar_single_iter_cost + 1;
3703 }
3704 else
3705 /* Convert the number of vector iterations into a number of
3706 scalar iterations. */
3707 min_profitable_iters = (min_vec_niters * assumed_vf
3708 + peel_iters_prologue
3709 + peel_iters_epilogue);
3710 }
3711 else
3712 {
3713 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3714 * assumed_vf
3715 - vec_inside_cost * peel_iters_prologue
3716 - vec_inside_cost * peel_iters_epilogue);
3717 if (min_profitable_iters <= 0)
3718 min_profitable_iters = 0;
3719 else
3720 {
3721 min_profitable_iters /= saving_per_viter;
3722
3723 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3724 <= (((int) vec_inside_cost * min_profitable_iters)
3725 + (((int) vec_outside_cost - scalar_outside_cost)
3726 * assumed_vf)))
3727 min_profitable_iters++;
3728 }
3729 }
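
  /* Tracing the made-up numbers from the comment above through the
     non-fully-masked branch: SAVING_PER_VITER = 4 * 4 - 6 = 10, so
     MIN_PROFITABLE_ITERS = (32 * 4) / 10 = 12, bumped to 13 by the check
     above because 4 * 4 * 12 <= 6 * 12 + 32 * 4.  */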
3730
3731 if (dump_enabled_p ())
3732 dump_printf (MSG_NOTE,
3733 " Calculated minimum iters for profitability: %d\n",
3734 min_profitable_iters);
3735
3736 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3737 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3738 /* We want the vectorized loop to execute at least once. */
3739 min_profitable_iters = assumed_vf + peel_iters_prologue;
3740
3741 if (dump_enabled_p ())
3742 dump_printf_loc (MSG_NOTE, vect_location,
3743 " Runtime profitability threshold = %d\n",
3744 min_profitable_iters);
3745
3746 *ret_min_profitable_niters = min_profitable_iters;
3747
3748 /* Calculate number of iterations required to make the vector version
3749 profitable, relative to the loop bodies only.
3750
3751      The non-vectorized variant costs SIC * niters, and the vectorized
3752      variant must beat it at the expected loop trip count, i.e. the
3753      following must hold: SIC * niters > VIC * ((niters - NPEEL) / VF) + VOC + SOC  */
3754
3755 if (vec_outside_cost <= 0)
3756 min_profitable_estimate = 0;
3757 else if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3758 {
3759 /* This is a repeat of the code above, but with + SOC rather
3760 than - SOC. */
3761 int outside_overhead = (vec_outside_cost
3762 - scalar_single_iter_cost * peel_iters_prologue
3763 - scalar_single_iter_cost * peel_iters_epilogue
3764 + scalar_outside_cost);
3765 int min_vec_niters = 1;
3766 if (outside_overhead > 0)
3767 min_vec_niters = outside_overhead / saving_per_viter + 1;
3768
3769 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3770 {
3771 int threshold = (vec_inside_cost * min_vec_niters
3772 + vec_outside_cost
3773 + scalar_outside_cost);
3774 min_profitable_estimate = threshold / scalar_single_iter_cost + 1;
3775 }
3776 else
3777 min_profitable_estimate = (min_vec_niters * assumed_vf
3778 + peel_iters_prologue
3779 + peel_iters_epilogue);
3780 }
3781 else
3782 {
3783 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3784 * assumed_vf
3785 - vec_inside_cost * peel_iters_prologue
3786 - vec_inside_cost * peel_iters_epilogue)
3787 / ((scalar_single_iter_cost * assumed_vf)
3788 - vec_inside_cost);
3789 }
3790 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3791 if (dump_enabled_p ())
3792 dump_printf_loc (MSG_NOTE, vect_location,
3793 " Static estimate profitability threshold = %d\n",
3794 min_profitable_estimate);
3795
3796 *ret_min_profitable_estimate = min_profitable_estimate;
3797 }
3798
3799 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3800 vector elements (not bits) for a vector with NELT elements. */
3801 static void
3802 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3803 vec_perm_builder *sel)
3804 {
3805 /* The encoding is a single stepped pattern. Any wrap-around is handled
3806 by vec_perm_indices. */
3807 sel->new_vector (nelt, 1, 3);
3808 for (unsigned int i = 0; i < 3; i++)
3809 sel->quick_push (i + offset);
3810 }
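
/* For example, with OFFSET = 2 and NELT = 8 the stepped encoding above
   expands to the selector {2, 3, 4, 5, 6, 7, 8, 9}; elements 8 and 9 select
   from the second input of the two-input permutation, so the result is the
   first input shifted down by two elements with the top two lanes taken
   from the second input.  */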
3811
3812 /* Checks whether the target supports whole-vector shifts for vectors of mode
3813 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3814 it supports vec_perm_const with masks for all necessary shift amounts. */
3815 static bool
3816 have_whole_vector_shift (machine_mode mode)
3817 {
3818 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3819 return true;
3820
3821 /* Variable-length vectors should be handled via the optab. */
3822 unsigned int nelt;
3823 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3824 return false;
3825
3826 vec_perm_builder sel;
3827 vec_perm_indices indices;
3828 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3829 {
3830 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3831 indices.new_vector (sel, 2, nelt);
3832 if (!can_vec_perm_const_p (mode, indices, false))
3833 return false;
3834 }
3835 return true;
3836 }
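
/* For an 8-element vector mode, for instance, the loop above checks shifts
   by 4, 2 and 1 elements -- the amounts used by the log2-step shift-based
   reduction epilogue in vect_create_epilog_for_reduction.  */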
3837
3838 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3839 functions. Design better to avoid maintenance issues. */
3840
3841 /* Function vect_model_reduction_cost.
3842
3843 Models cost for a reduction operation, including the vector ops
3844 generated within the strip-mine loop, the initial definition before
3845 the loop, and the epilogue code that must be generated. */
3846
3847 static void
3848 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3849 int ncopies, stmt_vector_for_cost *cost_vec)
3850 {
3851 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3852 enum tree_code code;
3853 optab optab;
3854 tree vectype;
3855 machine_mode mode;
3856 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3857 struct loop *loop = NULL;
3858
3859 if (loop_vinfo)
3860 loop = LOOP_VINFO_LOOP (loop_vinfo);
3861
3862 /* Condition reductions generate two reductions in the loop. */
3863 vect_reduction_type reduction_type
3864 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3865 if (reduction_type == COND_REDUCTION)
3866 ncopies *= 2;
3867
3868 vectype = STMT_VINFO_VECTYPE (stmt_info);
3869 mode = TYPE_MODE (vectype);
3870 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
3871
3872 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
3873
3874 if (reduction_type == EXTRACT_LAST_REDUCTION
3875 || reduction_type == FOLD_LEFT_REDUCTION)
3876 {
3877 /* No extra instructions needed in the prologue. */
3878 prologue_cost = 0;
3879
3880 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3881 /* Count one reduction-like operation per vector. */
3882 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3883 stmt_info, 0, vect_body);
3884 else
3885 {
3886 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3887 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3888 inside_cost = record_stmt_cost (cost_vec, nelements,
3889 vec_to_scalar, stmt_info, 0,
3890 vect_body);
3891 inside_cost += record_stmt_cost (cost_vec, nelements,
3892 scalar_stmt, stmt_info, 0,
3893 vect_body);
3894 }
3895 }
3896 else
3897 {
3898 /* Add in cost for initial definition.
3899 For cond reduction we have four vectors: initial index, step,
3900 initial result of the data reduction, initial value of the index
3901 reduction. */
3902 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3903 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3904 scalar_to_vec, stmt_info, 0,
3905 vect_prologue);
3906
3907 /* Cost of reduction op inside loop. */
3908 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3909 stmt_info, 0, vect_body);
3910 }
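
  /* An illustrative tally of the above: a plain reduction with ncopies == 2
     records one scalar_to_vec in the prologue and two vector_stmts in the
     body, while a COND_REDUCTION doubles ncopies and uses four prologue
     vectors (initial index, step, data init, index init).  */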
3911
3912 /* Determine cost of epilogue code.
3913
3914 We have a reduction operator that will reduce the vector in one statement.
3915    It also requires a scalar extract.  */
3916
3917 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt_info))
3918 {
3919 if (reduc_fn != IFN_LAST)
3920 {
3921 if (reduction_type == COND_REDUCTION)
3922 {
3923 	      /* An EQ stmt and a COND_EXPR stmt.  */
3924 epilogue_cost += record_stmt_cost (cost_vec, 2,
3925 vector_stmt, stmt_info, 0,
3926 vect_epilogue);
3927 /* Reduction of the max index and a reduction of the found
3928 values. */
3929 epilogue_cost += record_stmt_cost (cost_vec, 2,
3930 vec_to_scalar, stmt_info, 0,
3931 vect_epilogue);
3932 /* A broadcast of the max value. */
3933 epilogue_cost += record_stmt_cost (cost_vec, 1,
3934 scalar_to_vec, stmt_info, 0,
3935 vect_epilogue);
3936 }
3937 else
3938 {
3939 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3940 stmt_info, 0, vect_epilogue);
3941 epilogue_cost += record_stmt_cost (cost_vec, 1,
3942 vec_to_scalar, stmt_info, 0,
3943 vect_epilogue);
3944 }
3945 }
3946 else if (reduction_type == COND_REDUCTION)
3947 {
3948 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3949 /* Extraction of scalar elements. */
3950 epilogue_cost += record_stmt_cost (cost_vec,
3951 2 * estimated_nunits,
3952 vec_to_scalar, stmt_info, 0,
3953 vect_epilogue);
3954 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
3955 epilogue_cost += record_stmt_cost (cost_vec,
3956 2 * estimated_nunits - 3,
3957 scalar_stmt, stmt_info, 0,
3958 vect_epilogue);
3959 }
3960 else if (reduction_type == EXTRACT_LAST_REDUCTION
3961 || reduction_type == FOLD_LEFT_REDUCTION)
3962 	/* No extra instructions needed in the epilogue.  */
3963 ;
3964 else
3965 {
3966 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3967 tree bitsize =
3968 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt_info->stmt)));
3969 int element_bitsize = tree_to_uhwi (bitsize);
3970 int nelements = vec_size_in_bits / element_bitsize;
3971
3972 if (code == COND_EXPR)
3973 code = MAX_EXPR;
3974
3975 optab = optab_for_tree_code (code, vectype, optab_default);
3976
3977 /* We have a whole vector shift available. */
3978 if (optab != unknown_optab
3979 && VECTOR_MODE_P (mode)
3980 && optab_handler (optab, mode) != CODE_FOR_nothing
3981 && have_whole_vector_shift (mode))
3982 {
3983 /* Final reduction via vector shifts and the reduction operator.
3984 Also requires scalar extract. */
3985 epilogue_cost += record_stmt_cost (cost_vec,
3986 exact_log2 (nelements) * 2,
3987 vector_stmt, stmt_info, 0,
3988 vect_epilogue);
3989 epilogue_cost += record_stmt_cost (cost_vec, 1,
3990 vec_to_scalar, stmt_info, 0,
3991 vect_epilogue);
3992 }
3993 else
3994 /* Use extracts and reduction op for final reduction. For N
3995 elements, we have N extracts and N-1 reduction ops. */
3996 epilogue_cost += record_stmt_cost (cost_vec,
3997 nelements + nelements - 1,
3998 vector_stmt, stmt_info, 0,
3999 vect_epilogue);
4000 }
4001 }
4002
4003 if (dump_enabled_p ())
4004 dump_printf (MSG_NOTE,
4005 "vect_model_reduction_cost: inside_cost = %d, "
4006 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4007 prologue_cost, epilogue_cost);
4008 }
4009
4010
4011 /* Function vect_model_induction_cost.
4012
4013 Models cost for induction operations. */
4014
4015 static void
4016 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4017 stmt_vector_for_cost *cost_vec)
4018 {
4019 unsigned inside_cost, prologue_cost;
4020
4021 if (PURE_SLP_STMT (stmt_info))
4022 return;
4023
4024 /* loop cost for vec_loop. */
4025 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4026 stmt_info, 0, vect_body);
4027
4028 /* prologue cost for vec_init and vec_step. */
4029 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4030 stmt_info, 0, vect_prologue);
4031
4032 if (dump_enabled_p ())
4033 dump_printf_loc (MSG_NOTE, vect_location,
4034 "vect_model_induction_cost: inside_cost = %d, "
4035 "prologue_cost = %d .\n", inside_cost, prologue_cost);
4036 }
4037
4038
4039
4040 /* Function get_initial_def_for_reduction
4041
4042 Input:
4043 STMT_VINFO - a stmt that performs a reduction operation in the loop.
4044 INIT_VAL - the initial value of the reduction variable
4045
4046 Output:
4047 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4048 of the reduction (used for adjusting the epilog - see below).
4049 Return a vector variable, initialized according to the operation that
4050 STMT_VINFO performs. This vector will be used as the initial value
4051 of the vector of partial results.
4052
4053 Option1 (adjust in epilog): Initialize the vector as follows:
4054 add/bit or/xor: [0,0,...,0,0]
4055 mult/bit and: [1,1,...,1,1]
4056 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4057 and when necessary (e.g. add/mult case) let the caller know
4058 that it needs to adjust the result by init_val.
4059
4060 Option2: Initialize the vector as follows:
4061 add/bit or/xor: [init_val,0,0,...,0]
4062 mult/bit and: [init_val,1,1,...,1]
4063 min/max/cond_expr: [init_val,init_val,...,init_val]
4064 and no adjustments are needed.
4065
4066 For example, for the following code:
4067
4068 s = init_val;
4069 for (i=0;i<n;i++)
4070 s = s + a[i];
4071
4072 STMT_VINFO is 's = s + a[i]', and the reduction variable is 's'.
4073 For a vector of 4 units, we want to return either [0,0,0,init_val],
4074 or [0,0,0,0] and let the caller know that it needs to adjust
4075 the result at the end by 'init_val'.
4076
4077    FORNOW, we use Option1 (the 'adjust in epilog' scheme) when ADJUSTMENT_DEF
4078    is not NULL, because this way the initialization vector is simpler (same
4079    element in all entries), and Option2 otherwise.
4080
4081 A cost model should help decide between these two schemes. */
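/* Another purely illustrative case: for  s = init_val; s *= a[i];  with a
   4-unit vector, Option1 returns {1,1,1,1} and sets ADJUSTMENT_DEF to
   init_val, which the caller later folds back in with the reduction
   operation (here a multiply), while Option2 returns {init_val,1,1,1} and
   needs no adjustment.  */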
4082
4083 tree
4084 get_initial_def_for_reduction (stmt_vec_info stmt_vinfo, tree init_val,
4085 tree *adjustment_def)
4086 {
4087 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4088 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4089 tree scalar_type = TREE_TYPE (init_val);
4090 tree vectype = get_vectype_for_scalar_type (scalar_type);
4091 enum tree_code code = gimple_assign_rhs_code (stmt_vinfo->stmt);
4092 tree def_for_init;
4093 tree init_def;
4094 REAL_VALUE_TYPE real_init_val = dconst0;
4095 int int_init_val = 0;
4096 gimple_seq stmts = NULL;
4097
4098 gcc_assert (vectype);
4099
4100 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4101 || SCALAR_FLOAT_TYPE_P (scalar_type));
4102
4103 gcc_assert (nested_in_vect_loop_p (loop, stmt_vinfo)
4104 || loop == (gimple_bb (stmt_vinfo->stmt))->loop_father);
4105
4106 vect_reduction_type reduction_type
4107 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4108
4109 switch (code)
4110 {
4111 case WIDEN_SUM_EXPR:
4112 case DOT_PROD_EXPR:
4113 case SAD_EXPR:
4114 case PLUS_EXPR:
4115 case MINUS_EXPR:
4116 case BIT_IOR_EXPR:
4117 case BIT_XOR_EXPR:
4118 case MULT_EXPR:
4119 case BIT_AND_EXPR:
4120 {
4121 /* ADJUSTMENT_DEF is NULL when called from
4122 vect_create_epilog_for_reduction to vectorize double reduction. */
4123 if (adjustment_def)
4124 *adjustment_def = init_val;
4125
4126 if (code == MULT_EXPR)
4127 {
4128 real_init_val = dconst1;
4129 int_init_val = 1;
4130 }
4131
4132 if (code == BIT_AND_EXPR)
4133 int_init_val = -1;
4134
4135 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4136 def_for_init = build_real (scalar_type, real_init_val);
4137 else
4138 def_for_init = build_int_cst (scalar_type, int_init_val);
4139
4140 if (adjustment_def)
4141 /* Option1: the first element is '0' or '1' as well. */
4142 init_def = gimple_build_vector_from_val (&stmts, vectype,
4143 def_for_init);
4144 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4145 {
4146 /* Option2 (variable length): the first element is INIT_VAL. */
4147 init_def = gimple_build_vector_from_val (&stmts, vectype,
4148 def_for_init);
4149 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4150 vectype, init_def, init_val);
4151 }
4152 else
4153 {
4154 /* Option2: the first element is INIT_VAL. */
4155 tree_vector_builder elts (vectype, 1, 2);
4156 elts.quick_push (init_val);
4157 elts.quick_push (def_for_init);
4158 init_def = gimple_build_vector (&stmts, &elts);
4159 }
4160 }
4161 break;
4162
4163 case MIN_EXPR:
4164 case MAX_EXPR:
4165 case COND_EXPR:
4166 {
4167 if (adjustment_def)
4168 {
4169 *adjustment_def = NULL_TREE;
4170 if (reduction_type != COND_REDUCTION
4171 && reduction_type != EXTRACT_LAST_REDUCTION)
4172 {
4173 init_def = vect_get_vec_def_for_operand (init_val, stmt_vinfo);
4174 break;
4175 }
4176 }
4177 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4178 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4179 }
4180 break;
4181
4182 default:
4183 gcc_unreachable ();
4184 }
4185
4186 if (stmts)
4187 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4188 return init_def;
4189 }
4190
4191 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4192 NUMBER_OF_VECTORS is the number of vector defs to create.
4193 If NEUTRAL_OP is nonnull, introducing extra elements of that
4194 value will not change the result. */
4195
4196 static void
4197 get_initial_defs_for_reduction (slp_tree slp_node,
4198 vec<tree> *vec_oprnds,
4199 unsigned int number_of_vectors,
4200 bool reduc_chain, tree neutral_op)
4201 {
4202 vec<stmt_vec_info> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4203 stmt_vec_info stmt_vinfo = stmts[0];
4204 unsigned HOST_WIDE_INT nunits;
4205 unsigned j, number_of_places_left_in_vector;
4206 tree vector_type;
4207 unsigned int group_size = stmts.length ();
4208 unsigned int i;
4209 struct loop *loop;
4210
4211 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4212
4213 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4214
4215 loop = (gimple_bb (stmt_vinfo->stmt))->loop_father;
4216 gcc_assert (loop);
4217 edge pe = loop_preheader_edge (loop);
4218
4219 gcc_assert (!reduc_chain || neutral_op);
4220
4221 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4222 created vectors. It is greater than 1 if unrolling is performed.
4223
4224 For example, we have two scalar operands, s1 and s2 (e.g., group of
4225 strided accesses of size two), while NUNITS is four (i.e., four scalars
4226 of this type can be packed in a vector). The output vector will contain
4227 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4228 will be 2).
4229
4230 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4231 vectors containing the operands.
4232
4233 For example, NUNITS is four as before, and the group size is 8
4234 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4235 {s5, s6, s7, s8}. */
4236
4237 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4238 nunits = group_size;
4239
4240 number_of_places_left_in_vector = nunits;
4241 bool constant_p = true;
4242 tree_vector_builder elts (vector_type, nunits, 1);
4243 elts.quick_grow (nunits);
4244 gimple_seq ctor_seq = NULL;
4245 for (j = 0; j < nunits * number_of_vectors; ++j)
4246 {
4247 tree op;
4248 i = j % group_size;
4249 stmt_vinfo = stmts[i];
4250
4251 	/* Get the def before the loop.  In a reduction chain we have only
4252 	   one initial value; otherwise we have one per PHI in the group.  */
4253 if (reduc_chain)
4254 op = j != 0 ? neutral_op : PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4255 else if (((vec_oprnds->length () + 1) * nunits
4256 - number_of_places_left_in_vector >= group_size)
4257 && neutral_op)
4258 op = neutral_op;
4259 else
4260 op = PHI_ARG_DEF_FROM_EDGE (stmt_vinfo->stmt, pe);
4261
4262 /* Create 'vect_ = {op0,op1,...,opn}'. */
4263 number_of_places_left_in_vector--;
4264 elts[nunits - number_of_places_left_in_vector - 1] = op;
4265 if (!CONSTANT_CLASS_P (op))
4266 constant_p = false;
4267
4268 if (number_of_places_left_in_vector == 0)
4269 {
4270 tree init;
4271 if (constant_p && !neutral_op
4272 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4273 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4274 /* Build the vector directly from ELTS. */
4275 init = gimple_build_vector (&ctor_seq, &elts);
4276 else if (neutral_op)
4277 {
4278 /* Build a vector of the neutral value and shift the
4279 other elements into place. */
4280 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4281 neutral_op);
4282 int k = nunits;
4283 while (k > 0 && elts[k - 1] == neutral_op)
4284 k -= 1;
4285 while (k > 0)
4286 {
4287 k -= 1;
4288 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4289 vector_type, init, elts[k]);
4290 }
4291 }
4292 else
4293 {
4294 /* First time round, duplicate ELTS to fill the
4295 required number of vectors. */
4296 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4297 number_of_vectors, *vec_oprnds);
4298 break;
4299 }
4300 vec_oprnds->quick_push (init);
4301
4302 number_of_places_left_in_vector = nunits;
4303 elts.new_vector (vector_type, nunits, 1);
4304 elts.quick_grow (nunits);
4305 constant_p = true;
4306 }
4307 }
4308 if (ctor_seq != NULL)
4309 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4310 }
4311
4312
4313 /* Function vect_create_epilog_for_reduction
4314
4315 Create code at the loop-epilog to finalize the result of a reduction
4316 computation.
4317
4318    VECT_DEFS is a list of vectors of partial results, i.e., the lhs's of vector
4319 reduction statements.
4320 STMT_INFO is the scalar reduction stmt that is being vectorized.
4321 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4322 number of elements that we can fit in a vectype (nunits). In this case
4323    we have to generate more than one vector stmt, i.e., we need to "unroll"
4324 the vector stmt by a factor VF/nunits. For more details see documentation
4325 in vectorizable_operation.
4326 REDUC_FN is the internal function for the epilog reduction.
4327 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4328 computation.
4329 REDUC_INDEX is the index of the operand in the right hand side of the
4330 statement that is defined by REDUCTION_PHI.
4331 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4332 SLP_NODE is an SLP node containing a group of reduction statements. The
4333 first one in this group is STMT_INFO.
4334 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4335 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4336 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4337 any value of the IV in the loop.
4338 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4339 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4340 null if this is not an SLP reduction
4341
4342 This function:
4343 1. Creates the reduction def-use cycles: sets the arguments for
4344 REDUCTION_PHIS:
4345 The loop-entry argument is the vectorized initial-value of the reduction.
4346 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4347 sums.
4348 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4349 by calling the function specified by REDUC_FN if available, or by
4350 other means (whole-vector shifts or a scalar loop).
4351 The function also creates a new phi node at the loop exit to preserve
4352 loop-closed form, as illustrated below.
4353
4354 The flow at the entry to this function:
4355
4356 loop:
4357 vec_def = phi <null, null> # REDUCTION_PHI
4358 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4359 s_loop = scalar_stmt # (scalar) STMT_INFO
4360 loop_exit:
4361 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4362 use <s_out0>
4363 use <s_out0>
4364
4365 The above is transformed by this function into:
4366
4367 loop:
4368 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4369 VECT_DEF = vector_stmt # vectorized form of STMT_INFO
4370 s_loop = scalar_stmt # (scalar) STMT_INFO
4371 loop_exit:
4372 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4373 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4374 v_out2 = reduce <v_out1>
4375 s_out3 = extract_field <v_out2, 0>
4376 s_out4 = adjust_result <s_out3>
4377 use <s_out4>
4378 use <s_out4>
4379 */
4380
4381 static void
4382 vect_create_epilog_for_reduction (vec<tree> vect_defs,
4383 stmt_vec_info stmt_info,
4384 gimple *reduc_def_stmt,
4385 int ncopies, internal_fn reduc_fn,
4386 vec<stmt_vec_info> reduction_phis,
4387 bool double_reduc,
4388 slp_tree slp_node,
4389 slp_instance slp_node_instance,
4390 tree induc_val, enum tree_code induc_code,
4391 tree neutral_op)
4392 {
4393 stmt_vec_info prev_phi_info;
4394 tree vectype;
4395 machine_mode mode;
4396 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4397 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4398 basic_block exit_bb;
4399 tree scalar_dest;
4400 tree scalar_type;
4401 gimple *new_phi = NULL, *phi;
4402 stmt_vec_info phi_info;
4403 gimple_stmt_iterator exit_gsi;
4404 tree vec_dest;
4405 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4406 gimple *epilog_stmt = NULL;
4407 enum tree_code code = gimple_assign_rhs_code (stmt_info->stmt);
4408 gimple *exit_phi;
4409 tree bitsize;
4410 tree adjustment_def = NULL;
4411 tree vec_initial_def = NULL;
4412 tree expr, def, initial_def = NULL;
4413 tree orig_name, scalar_result;
4414 imm_use_iterator imm_iter, phi_imm_iter;
4415 use_operand_p use_p, phi_use_p;
4416 gimple *use_stmt;
4417 stmt_vec_info reduction_phi_info = NULL;
4418 bool nested_in_vect_loop = false;
4419 auto_vec<gimple *> new_phis;
4420 auto_vec<stmt_vec_info> inner_phis;
4421 int j, i;
4422 auto_vec<tree> scalar_results;
4423 unsigned int group_size = 1, k, ratio;
4424 auto_vec<tree> vec_initial_defs;
4425 auto_vec<gimple *> phis;
4426 bool slp_reduc = false;
4427 bool direct_slp_reduc;
4428 tree new_phi_result;
4429 stmt_vec_info inner_phi = NULL;
4430 tree induction_index = NULL_TREE;
4431
4432 if (slp_node)
4433 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4434
4435 if (nested_in_vect_loop_p (loop, stmt_info))
4436 {
4437 outer_loop = loop;
4438 loop = loop->inner;
4439 nested_in_vect_loop = true;
4440 gcc_assert (!slp_node);
4441 }
4442
4443 vectype = STMT_VINFO_VECTYPE (stmt_info);
4444 gcc_assert (vectype);
4445 mode = TYPE_MODE (vectype);
4446
4447 /* 1. Create the reduction def-use cycle:
4448 Set the arguments of REDUCTION_PHIS, i.e., transform
4449
4450 loop:
4451 vec_def = phi <null, null> # REDUCTION_PHI
4452 VECT_DEF = vector_stmt # vectorized form of STMT
4453 ...
4454
4455 into:
4456
4457 loop:
4458 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4459 VECT_DEF = vector_stmt # vectorized form of STMT
4460 ...
4461
4462 (in case of SLP, do it for all the phis). */
4463
4464 /* Get the loop-entry arguments. */
4465 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4466 if (slp_node)
4467 {
4468 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4469 vec_initial_defs.reserve (vec_num);
4470 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4471 &vec_initial_defs, vec_num,
4472 REDUC_GROUP_FIRST_ELEMENT (stmt_info),
4473 neutral_op);
4474 }
4475 else
4476 {
4477 /* Get at the scalar def before the loop, that defines the initial value
4478 of the reduction variable. */
4479 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4480 loop_preheader_edge (loop));
4481       /* Optimize: for REDUC_MAX, if initial_def is smaller than the base
4482 	 and we can't use zero for induc_val, use initial_def.  Similarly
4483 	 for REDUC_MIN, if initial_def is larger than the base.  */
4484 if (TREE_CODE (initial_def) == INTEGER_CST
4485 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4486 == INTEGER_INDUC_COND_REDUCTION)
4487 && !integer_zerop (induc_val)
4488 && ((induc_code == MAX_EXPR
4489 && tree_int_cst_lt (initial_def, induc_val))
4490 || (induc_code == MIN_EXPR
4491 && tree_int_cst_lt (induc_val, initial_def))))
4492 induc_val = initial_def;
4493
4494 if (double_reduc)
4495 /* In case of double reduction we only create a vector variable
4496 to be put in the reduction phi node. The actual statement
4497 creation is done later in this function. */
4498 vec_initial_def = vect_create_destination_var (initial_def, vectype);
4499 else if (nested_in_vect_loop)
4500 {
4501 /* Do not use an adjustment def as that case is not supported
4502 correctly if ncopies is not one. */
4503 vect_is_simple_use (initial_def, loop_vinfo, &initial_def_dt);
4504 vec_initial_def = vect_get_vec_def_for_operand (initial_def,
4505 stmt_info);
4506 }
4507 else
4508 vec_initial_def
4509 = get_initial_def_for_reduction (stmt_info, initial_def,
4510 &adjustment_def);
4511 vec_initial_defs.create (1);
4512 vec_initial_defs.quick_push (vec_initial_def);
4513 }
4514
4515 /* Set phi nodes arguments. */
4516 FOR_EACH_VEC_ELT (reduction_phis, i, phi_info)
4517 {
4518 tree vec_init_def = vec_initial_defs[i];
4519 tree def = vect_defs[i];
4520 for (j = 0; j < ncopies; j++)
4521 {
4522 if (j != 0)
4523 {
4524 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4525 if (nested_in_vect_loop)
4526 vec_init_def
4527 = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_init_def);
4528 }
4529
4530 /* Set the loop-entry arg of the reduction-phi. */
4531
4532 gphi *phi = as_a <gphi *> (phi_info->stmt);
4533 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4534 == INTEGER_INDUC_COND_REDUCTION)
4535 {
4536 	      /* Initialise the reduction phi to zero.  This prevents non-zero
4537 		 initial values from interfering with the reduction op.  */
4538 gcc_assert (ncopies == 1);
4539 gcc_assert (i == 0);
4540
4541 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4542 tree induc_val_vec
4543 = build_vector_from_val (vec_init_def_type, induc_val);
4544
4545 add_phi_arg (phi, induc_val_vec, loop_preheader_edge (loop),
4546 UNKNOWN_LOCATION);
4547 }
4548 else
4549 add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
4550 UNKNOWN_LOCATION);
4551
4552 /* Set the loop-latch arg for the reduction-phi. */
4553 if (j > 0)
4554 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4555
4556 add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
4557
4558 if (dump_enabled_p ())
4559 dump_printf_loc (MSG_NOTE, vect_location,
4560 "transform reduction: created def-use cycle: %G%G",
4561 phi, SSA_NAME_DEF_STMT (def));
4562 }
4563 }
4564
4565 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4566 which is updated with the current index of the loop for every match of
4567 the original loop's cond_expr (VEC_STMT). This results in a vector
4568      recording, for each lane, the last index at which the condition passed.
4569 The first match will be a 1 to allow 0 to be used for non-matching
4570 indexes. If there are no matches at all then the vector will be all
4571 zeroes. */
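  /* A made-up example with VF = 4: if the condition holds only in scalar
     iterations 1 and 6 (counting from 0), lane 1 records index 2 and lane 2
     records index 7, so the final index vector is {0, 2, 7, 0}; the epilogue
     then picks the data value from the lane holding the maximum index.  */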
4572 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4573 {
4574 tree indx_before_incr, indx_after_incr;
4575 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4576
4577 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info)->stmt;
4578 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4579
4580 int scalar_precision
4581 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4582 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4583 tree cr_index_vector_type = build_vector_type
4584 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4585
4586 /* First we create a simple vector induction variable which starts
4587 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4588 vector size (STEP). */
4589
4590 /* Create a {1,2,3,...} vector. */
4591 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4592
4593 /* Create a vector of the step value. */
4594 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4595 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4596
4597 /* Create an induction variable. */
4598 gimple_stmt_iterator incr_gsi;
4599 bool insert_after;
4600 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4601 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4602 insert_after, &indx_before_incr, &indx_after_incr);
4603
4604 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4605 filled with zeros (VEC_ZERO). */
4606
4607 /* Create a vector of 0s. */
4608 tree zero = build_zero_cst (cr_index_scalar_type);
4609 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4610
4611 /* Create a vector phi node. */
4612 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4613 new_phi = create_phi_node (new_phi_tree, loop->header);
4614 loop_vinfo->add_stmt (new_phi);
4615 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4616 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4617
4618       /* Now take the condition from the loop's original cond_expr
4619 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4620 every match uses values from the induction variable
4621 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4622 (NEW_PHI_TREE).
4623 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4624 the new cond_expr (INDEX_COND_EXPR). */
4625
4626 /* Duplicate the condition from vec_stmt. */
4627 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4628
4629 /* Create a conditional, where the condition is taken from vec_stmt
4630 (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4631 else is the phi (NEW_PHI_TREE). */
4632 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4633 ccompare, indx_before_incr,
4634 new_phi_tree);
4635 induction_index = make_ssa_name (cr_index_vector_type);
4636 gimple *index_condition = gimple_build_assign (induction_index,
4637 index_cond_expr);
4638 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4639 stmt_vec_info index_vec_info = loop_vinfo->add_stmt (index_condition);
4640 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4641
4642 /* Update the phi with the vec cond. */
4643 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4644 loop_latch_edge (loop), UNKNOWN_LOCATION);
4645 }
4646
4647 /* 2. Create epilog code.
4648 The reduction epilog code operates across the elements of the vector
4649 of partial results computed by the vectorized loop.
4650 The reduction epilog code consists of:
4651
4652 step 1: compute the scalar result in a vector (v_out2)
4653 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4654 step 3: adjust the scalar result (s_out3) if needed.
4655
4656     Step 1 can be accomplished using one of the following three schemes:
4657 (scheme 1) using reduc_fn, if available.
4658 (scheme 2) using whole-vector shifts, if available.
4659 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4660 combined.
4661
4662 The overall epilog code looks like this:
4663
4664 s_out0 = phi <s_loop> # original EXIT_PHI
4665 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4666 v_out2 = reduce <v_out1> # step 1
4667 s_out3 = extract_field <v_out2, 0> # step 2
4668 s_out4 = adjust_result <s_out3> # step 3
4669
4670 (step 3 is optional, and steps 1 and 2 may be combined).
4671 Lastly, the uses of s_out0 are replaced by s_out4. */
4672
4673
4674 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4675 v_out1 = phi <VECT_DEF>
4676 Store them in NEW_PHIS. */
4677
4678 exit_bb = single_exit (loop)->dest;
4679 prev_phi_info = NULL;
4680 new_phis.create (vect_defs.length ());
4681 FOR_EACH_VEC_ELT (vect_defs, i, def)
4682 {
4683 for (j = 0; j < ncopies; j++)
4684 {
4685 tree new_def = copy_ssa_name (def);
4686 phi = create_phi_node (new_def, exit_bb);
4687 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4688 if (j == 0)
4689 new_phis.quick_push (phi);
4690 else
4691 {
4692 def = vect_get_vec_def_for_stmt_copy (loop_vinfo, def);
4693 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi_info;
4694 }
4695
4696 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4697 prev_phi_info = phi_info;
4698 }
4699 }
4700
4701 /* The epilogue is created for the outer-loop, i.e., for the loop being
4702 vectorized. Create exit phis for the outer loop. */
4703 if (double_reduc)
4704 {
4705 loop = outer_loop;
4706 exit_bb = single_exit (loop)->dest;
4707 inner_phis.create (vect_defs.length ());
4708 FOR_EACH_VEC_ELT (new_phis, i, phi)
4709 {
4710 stmt_vec_info phi_info = loop_vinfo->lookup_stmt (phi);
4711 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4712 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4713 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4714 PHI_RESULT (phi));
4715 prev_phi_info = loop_vinfo->add_stmt (outer_phi);
4716 inner_phis.quick_push (phi_info);
4717 new_phis[i] = outer_phi;
4718 while (STMT_VINFO_RELATED_STMT (phi_info))
4719 {
4720 phi_info = STMT_VINFO_RELATED_STMT (phi_info);
4721 new_result = copy_ssa_name (PHI_RESULT (phi_info->stmt));
4722 outer_phi = create_phi_node (new_result, exit_bb);
4723 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4724 PHI_RESULT (phi_info->stmt));
4725 stmt_vec_info outer_phi_info = loop_vinfo->add_stmt (outer_phi);
4726 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi_info;
4727 prev_phi_info = outer_phi_info;
4728 }
4729 }
4730 }
4731
4732 exit_gsi = gsi_after_labels (exit_bb);
4733
4734 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4735 (i.e. when reduc_fn is not available) and in the final adjustment
4736 code (if needed). Also get the original scalar reduction variable as
4737 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4738 represents a reduction pattern), the tree-code and scalar-def are
4739 taken from the original stmt that the pattern-stmt (STMT) replaces.
4740 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4741 are taken from STMT. */
4742
4743 stmt_vec_info orig_stmt_info = vect_orig_stmt (stmt_info);
4744 if (orig_stmt_info != stmt_info)
4745 {
4746 /* Reduction pattern */
4747 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4748 gcc_assert (STMT_VINFO_RELATED_STMT (orig_stmt_info) == stmt_info);
4749 }
4750
4751 code = gimple_assign_rhs_code (orig_stmt_info->stmt);
4752 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4753 partial results are added and not subtracted. */
4754 if (code == MINUS_EXPR)
4755 code = PLUS_EXPR;
4756
4757 scalar_dest = gimple_assign_lhs (orig_stmt_info->stmt);
4758 scalar_type = TREE_TYPE (scalar_dest);
4759 scalar_results.create (group_size);
4760 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4761 bitsize = TYPE_SIZE (scalar_type);
4762
4763 /* In case this is a reduction in an inner-loop while vectorizing an outer
4764 loop - we don't need to extract a single scalar result at the end of the
4765 inner-loop (unless it is double reduction, i.e., the use of reduction is
4766 outside the outer-loop). The final vector of partial results will be used
4767 in the vectorized outer-loop, or reduced to a scalar result at the end of
4768 the outer-loop. */
4769 if (nested_in_vect_loop && !double_reduc)
4770 goto vect_finalize_reduction;
4771
4772 /* SLP reduction without reduction chain, e.g.,
4773 # a1 = phi <a2, a0>
4774 # b1 = phi <b2, b0>
4775 a2 = operation (a1)
4776 b2 = operation (b1) */
4777 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (stmt_info));
4778
4779 /* True if we should implement SLP_REDUC using native reduction operations
4780 instead of scalar operations. */
4781 direct_slp_reduc = (reduc_fn != IFN_LAST
4782 && slp_reduc
4783 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4784
4785 /* In case of reduction chain, e.g.,
4786 # a1 = phi <a3, a0>
4787 a2 = operation (a1)
4788 a3 = operation (a2),
4789
4790 we may end up with more than one vector result. Here we reduce them to
4791 one vector. */
4792 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info) || direct_slp_reduc)
4793 {
4794 tree first_vect = PHI_RESULT (new_phis[0]);
4795 gassign *new_vec_stmt = NULL;
4796 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4797 for (k = 1; k < new_phis.length (); k++)
4798 {
4799 gimple *next_phi = new_phis[k];
4800 tree second_vect = PHI_RESULT (next_phi);
4801 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4802 new_vec_stmt = gimple_build_assign (tem, code,
4803 first_vect, second_vect);
4804 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4805 first_vect = tem;
4806 }
4807
4808 new_phi_result = first_vect;
4809 if (new_vec_stmt)
4810 {
4811 new_phis.truncate (0);
4812 new_phis.safe_push (new_vec_stmt);
4813 }
4814 }
4815   /* Likewise if we couldn't use a single def-use cycle.  */
4816 else if (ncopies > 1)
4817 {
4818 gcc_assert (new_phis.length () == 1);
4819 tree first_vect = PHI_RESULT (new_phis[0]);
4820 gassign *new_vec_stmt = NULL;
4821 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4822 stmt_vec_info next_phi_info = loop_vinfo->lookup_stmt (new_phis[0]);
4823 for (int k = 1; k < ncopies; ++k)
4824 {
4825 next_phi_info = STMT_VINFO_RELATED_STMT (next_phi_info);
4826 tree second_vect = PHI_RESULT (next_phi_info->stmt);
4827 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4828 new_vec_stmt = gimple_build_assign (tem, code,
4829 first_vect, second_vect);
4830 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4831 first_vect = tem;
4832 }
4833 new_phi_result = first_vect;
4834 new_phis.truncate (0);
4835 new_phis.safe_push (new_vec_stmt);
4836 }
4837 else
4838 new_phi_result = PHI_RESULT (new_phis[0]);
4839
4840 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4841 && reduc_fn != IFN_LAST)
4842 {
4843 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4844 various data values where the condition matched and another vector
4845 (INDUCTION_INDEX) containing all the indexes of those matches. We
4846 need to extract the last matching index (which will be the index with
4847 highest value) and use this to index into the data vector.
4848 For the case where there were no matches, the data vector will contain
4849 all default values and the index vector will be all zeros. */
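      /* Continuing the made-up {0, 2, 7, 0} example from above: the
	 IFN_REDUC_MAX below yields 7, the EQ comparison selects lane 2, the
	 VEC_COND zeroes every other lane of the data vector, and a second
	 IFN_REDUC_MAX over its unsigned view extracts that lane's value as
	 the scalar result.  */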
4850
4851 /* Get various versions of the type of the vector of indexes. */
4852 tree index_vec_type = TREE_TYPE (induction_index);
4853 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4854 tree index_scalar_type = TREE_TYPE (index_vec_type);
4855 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4856 (index_vec_type);
4857
4858 /* Get an unsigned integer version of the type of the data vector. */
4859 int scalar_precision
4860 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4861 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4862 tree vectype_unsigned = build_vector_type
4863 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4864
4865 /* First we need to create a vector (ZERO_VEC) of zeros and another
4866 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4867 can create using a MAX reduction and then expanding.
4868 In the case where the loop never made any matches, the max index will
4869 be zero. */
4870
4871 /* Vector of {0, 0, 0,...}. */
4872 tree zero_vec = make_ssa_name (vectype);
4873 tree zero_vec_rhs = build_zero_cst (vectype);
4874 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4875 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4876
4877 /* Find maximum value from the vector of found indexes. */
4878 tree max_index = make_ssa_name (index_scalar_type);
4879 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4880 1, induction_index);
4881 gimple_call_set_lhs (max_index_stmt, max_index);
4882 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4883
4884 /* Vector of {max_index, max_index, max_index,...}. */
4885 tree max_index_vec = make_ssa_name (index_vec_type);
4886 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4887 max_index);
4888 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4889 max_index_vec_rhs);
4890 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4891
4892 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4893 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4894 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4895 otherwise. Only one value should match, resulting in a vector
4896 (VEC_COND) with one data value and the rest zeros.
4897 In the case where the loop never made any matches, every index will
4898 match, resulting in a vector with all data values (which will all be
4899 the default value). */
4900
4901 /* Compare the max index vector to the vector of found indexes to find
4902 the position of the max value. */
4903 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4904 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4905 induction_index,
4906 max_index_vec);
4907 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4908
4909 /* Use the compare to choose either values from the data vector or
4910 zero. */
4911 tree vec_cond = make_ssa_name (vectype);
4912 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4913 vec_compare, new_phi_result,
4914 zero_vec);
4915 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4916
4917 /* Finally we need to extract the data value from the vector (VEC_COND)
4918 	 into a scalar (MATCHED_DATA_REDUC).  Logically we want to do an OR
4919 reduction, but because this doesn't exist, we can use a MAX reduction
4920 instead. The data value might be signed or a float so we need to cast
4921 it first.
4922 In the case where the loop never made any matches, the data values are
4923 all identical, and so will reduce down correctly. */
4924
4925 /* Make the matched data values unsigned. */
4926 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4927 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4928 vec_cond);
4929 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4930 VIEW_CONVERT_EXPR,
4931 vec_cond_cast_rhs);
4932 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4933
4934 /* Reduce down to a scalar value. */
4935 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4936 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4937 1, vec_cond_cast);
4938 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4939 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4940
4941 /* Convert the reduced value back to the result type and set as the
4942 result. */
4943 gimple_seq stmts = NULL;
4944 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4945 data_reduc);
4946 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4947 scalar_results.safe_push (new_temp);
4948 }
4949 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4950 && reduc_fn == IFN_LAST)
4951 {
4952 /* Condition reduction without supported IFN_REDUC_MAX. Generate
4953 idx = 0;
4954 idx_val = induction_index[0];
4955 val = data_reduc[0];
4956 for (idx = 0, val = init, i = 0; i < nelts; ++i)
4957 if (induction_index[i] > idx_val)
4958 val = data_reduc[i], idx_val = induction_index[i];
4959 return val; */
4960
4961 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4962 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4963 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4964 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4965 /* Enforced by vectorizable_reduction, which ensures we have target
4966 support before allowing a conditional reduction on variable-length
4967 vectors. */
4968 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
4969 tree idx_val = NULL_TREE, val = NULL_TREE;
4970 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4971 {
4972 tree old_idx_val = idx_val;
4973 tree old_val = val;
4974 idx_val = make_ssa_name (idx_eltype);
4975 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4976 build3 (BIT_FIELD_REF, idx_eltype,
4977 induction_index,
4978 bitsize_int (el_size),
4979 bitsize_int (off)));
4980 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4981 val = make_ssa_name (data_eltype);
4982 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4983 build3 (BIT_FIELD_REF,
4984 data_eltype,
4985 new_phi_result,
4986 bitsize_int (el_size),
4987 bitsize_int (off)));
4988 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4989 if (off != 0)
4990 {
4991 tree new_idx_val = idx_val;
4992 tree new_val = val;
4993 if (off != v_size - el_size)
4994 {
4995 new_idx_val = make_ssa_name (idx_eltype);
4996 epilog_stmt = gimple_build_assign (new_idx_val,
4997 MAX_EXPR, idx_val,
4998 old_idx_val);
4999 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5000 }
5001 new_val = make_ssa_name (data_eltype);
5002 epilog_stmt = gimple_build_assign (new_val,
5003 COND_EXPR,
5004 build2 (GT_EXPR,
5005 boolean_type_node,
5006 idx_val,
5007 old_idx_val),
5008 val, old_val);
5009 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5010 idx_val = new_idx_val;
5011 val = new_val;
5012 }
5013 }
5014 /* Convert the reduced value back to the result type and set as the
5015 result. */
5016 gimple_seq stmts = NULL;
5017 val = gimple_convert (&stmts, scalar_type, val);
5018 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5019 scalar_results.safe_push (val);
5020 }
5021
5022 /* 2.3 Create the reduction code, using one of the three schemes described
5023 above. In SLP we simply need to extract all the elements from the
5024 vector (without reducing them), so we use scalar shifts. */
5025 else if (reduc_fn != IFN_LAST && !slp_reduc)
5026 {
5027 tree tmp;
5028 tree vec_elem_type;
5029
5030 /* Case 1: Create:
5031 v_out2 = reduc_expr <v_out1> */
5032
5033 if (dump_enabled_p ())
5034 dump_printf_loc (MSG_NOTE, vect_location,
5035 "Reduce using direct vector reduction.\n");
5036
5037 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5038 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5039 {
5040 tree tmp_dest
5041 = vect_create_destination_var (scalar_dest, vec_elem_type);
5042 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5043 new_phi_result);
5044 gimple_set_lhs (epilog_stmt, tmp_dest);
5045 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5046 gimple_set_lhs (epilog_stmt, new_temp);
5047 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5048
5049 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5050 new_temp);
5051 }
5052 else
5053 {
5054 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5055 new_phi_result);
5056 gimple_set_lhs (epilog_stmt, new_scalar_dest);
5057 }
5058
5059 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5060 gimple_set_lhs (epilog_stmt, new_temp);
5061 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5062
5063 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5064 == INTEGER_INDUC_COND_REDUCTION)
5065 && !operand_equal_p (initial_def, induc_val, 0))
5066 {
5067 /* Earlier we set the initial value to be a vector of induc_val
5068 values. Check the result and, if it is induc_val, replace it
5069 with the original initial value, unless induc_val is
5070 the same as initial_def already. */
5071 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5072 induc_val);
5073
5074 tmp = make_ssa_name (new_scalar_dest);
5075 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5076 initial_def, new_temp);
5077 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5078 new_temp = tmp;
5079 }
5080
5081 scalar_results.safe_push (new_temp);
5082 }
5083 else if (direct_slp_reduc)
5084 {
5085 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5086 with the elements for other SLP statements replaced with the
5087 neutral value. We can then do a normal reduction on each vector. */
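/* For illustration only (a sketch, not the exact IR): with
   REDUC_GROUP_SIZE == 2, a PLUS reduction and a single accumulated
   vector <a0, b0, a1, b1> interleaving the two SLP statements, the
   loop below builds

     stmt A: <a0, 0, a1, 0> --> REDUC_PLUS --> a0 + a1
     stmt B: <0, b0, 0, b1> --> REDUC_PLUS --> b0 + b1

   using index & (REDUC_GROUP_SIZE - 1) to decide which lanes belong to
   which statement and the neutral value (0 for PLUS) elsewhere. */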
5088
5089 /* Enforced by vectorizable_reduction. */
5090 gcc_assert (new_phis.length () == 1);
5091 gcc_assert (pow2p_hwi (group_size));
5092
5093 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5094 vec<stmt_vec_info> orig_phis
5095 = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5096 gimple_seq seq = NULL;
5097
5098 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5099 and the same element size as VECTYPE. */
5100 tree index = build_index_vector (vectype, 0, 1);
5101 tree index_type = TREE_TYPE (index);
5102 tree index_elt_type = TREE_TYPE (index_type);
5103 tree mask_type = build_same_sized_truth_vector_type (index_type);
5104
5105 /* Create a vector that, for each element, identifies which of
5106 the REDUC_GROUP_SIZE results should use it. */
5107 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5108 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5109 build_vector_from_val (index_type, index_mask));
5110
5111 /* Get a neutral vector value. This is simply a splat of the neutral
5112 scalar value if we have one, otherwise the initial scalar value
5113 is itself a neutral value. */
5114 tree vector_identity = NULL_TREE;
5115 if (neutral_op)
5116 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5117 neutral_op);
5118 for (unsigned int i = 0; i < group_size; ++i)
5119 {
5120 /* If there's no universal neutral value, we can use the
5121 initial scalar value from the original PHI. This is used
5122 for MIN and MAX reduction, for example. */
5123 if (!neutral_op)
5124 {
5125 tree scalar_value
5126 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i]->stmt,
5127 loop_preheader_edge (loop));
5128 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5129 scalar_value);
5130 }
5131
5132 /* Calculate the equivalent of:
5133
5134 sel[j] = (index[j] == i);
5135
5136 which selects the elements of NEW_PHI_RESULT that should
5137 be included in the result. */
5138 tree compare_val = build_int_cst (index_elt_type, i);
5139 compare_val = build_vector_from_val (index_type, compare_val);
5140 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5141 index, compare_val);
5142
5143 /* Calculate the equivalent of:
5144
5145 vec = sel ? new_phi_result : vector_identity;
5146
5147 VEC is now suitable for a full vector reduction. */
5148 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5149 sel, new_phi_result, vector_identity);
5150
5151 /* Do the reduction and convert it to the appropriate type. */
5152 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5153 TREE_TYPE (vectype), vec);
5154 scalar = gimple_convert (&seq, scalar_type, scalar);
5155 scalar_results.safe_push (scalar);
5156 }
5157 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5158 }
5159 else
5160 {
5161 bool reduce_with_shift;
5162 tree vec_temp;
5163
5164 /* COND reductions all do the final reduction with MAX_EXPR
5165 or MIN_EXPR. */
5166 if (code == COND_EXPR)
5167 {
5168 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5169 == INTEGER_INDUC_COND_REDUCTION)
5170 code = induc_code;
5171 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5172 == CONST_COND_REDUCTION)
5173 code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
5174 else
5175 code = MAX_EXPR;
5176 }
5177
5178 /* See if the target wants to do the final (shift) reduction
5179 in a vector mode of smaller size and first reduce upper/lower
5180 halves against each other. */
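/* For example (sketch, assuming the target splits a 256-bit reduction
   into 128-bit halves): the loop below would emit roughly

     dst1 = BIT_FIELD_REF <new_temp, 128, 0>;
     dst2 = BIT_FIELD_REF <new_temp, 128, 128>;
     new_temp = dst1 CODE dst2;

   and the shift or scalar reduction then continues on the halved
   vector. */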
5181 enum machine_mode mode1 = mode;
5182 tree vectype1 = vectype;
5183 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5184 unsigned sz1 = sz;
5185 if (!slp_reduc
5186 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5187 sz1 = GET_MODE_SIZE (mode1).to_constant ();
5188
5189 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5190 reduce_with_shift = have_whole_vector_shift (mode1);
5191 if (!VECTOR_MODE_P (mode1))
5192 reduce_with_shift = false;
5193 else
5194 {
5195 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5196 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5197 reduce_with_shift = false;
5198 }
5199
5200 /* First reduce the vector to the vector size we should do the shift
5201 reduction on, by combining upper and lower halves. */
5202 new_temp = new_phi_result;
5203 while (sz > sz1)
5204 {
5205 gcc_assert (!slp_reduc);
5206 sz /= 2;
5207 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5208
5209 /* The target has to make sure we support lowpart/highpart
5210 extraction, either via direct vector extract or through
5211 integer mode punning. */
5212 tree dst1, dst2;
5213 if (convert_optab_handler (vec_extract_optab,
5214 TYPE_MODE (TREE_TYPE (new_temp)),
5215 TYPE_MODE (vectype1))
5216 != CODE_FOR_nothing)
5217 {
5218 /* Extract sub-vectors directly once vec_extract becomes
5219 a conversion optab. */
5220 dst1 = make_ssa_name (vectype1);
5221 epilog_stmt
5222 = gimple_build_assign (dst1, BIT_FIELD_REF,
5223 build3 (BIT_FIELD_REF, vectype1,
5224 new_temp, TYPE_SIZE (vectype1),
5225 bitsize_int (0)));
5226 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5227 dst2 = make_ssa_name (vectype1);
5228 epilog_stmt
5229 = gimple_build_assign (dst2, BIT_FIELD_REF,
5230 build3 (BIT_FIELD_REF, vectype1,
5231 new_temp, TYPE_SIZE (vectype1),
5232 bitsize_int (sz * BITS_PER_UNIT)));
5233 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5234 }
5235 else
5236 {
5237 /* Extract via punning to appropriately sized integer mode
5238 vector. */
5239 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5240 1);
5241 tree etype = build_vector_type (eltype, 2);
5242 gcc_assert (convert_optab_handler (vec_extract_optab,
5243 TYPE_MODE (etype),
5244 TYPE_MODE (eltype))
5245 != CODE_FOR_nothing);
5246 tree tem = make_ssa_name (etype);
5247 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5248 build1 (VIEW_CONVERT_EXPR,
5249 etype, new_temp));
5250 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5251 new_temp = tem;
5252 tem = make_ssa_name (eltype);
5253 epilog_stmt
5254 = gimple_build_assign (tem, BIT_FIELD_REF,
5255 build3 (BIT_FIELD_REF, eltype,
5256 new_temp, TYPE_SIZE (eltype),
5257 bitsize_int (0)));
5258 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5259 dst1 = make_ssa_name (vectype1);
5260 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5261 build1 (VIEW_CONVERT_EXPR,
5262 vectype1, tem));
5263 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5264 tem = make_ssa_name (eltype);
5265 epilog_stmt
5266 = gimple_build_assign (tem, BIT_FIELD_REF,
5267 build3 (BIT_FIELD_REF, eltype,
5268 new_temp, TYPE_SIZE (eltype),
5269 bitsize_int (sz * BITS_PER_UNIT)));
5270 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5271 dst2 = make_ssa_name (vectype1);
5272 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5273 build1 (VIEW_CONVERT_EXPR,
5274 vectype1, tem));
5275 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5276 }
5277
5278 new_temp = make_ssa_name (vectype1);
5279 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5280 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5281 }
5282
5283 if (reduce_with_shift && !slp_reduc)
5284 {
5285 int element_bitsize = tree_to_uhwi (bitsize);
5286 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5287 for variable-length vectors and also requires direct target support
5288 for loop reductions. */
5289 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5290 int nelements = vec_size_in_bits / element_bitsize;
5291 vec_perm_builder sel;
5292 vec_perm_indices indices;
5293
5294 int elt_offset;
5295
5296 tree zero_vec = build_zero_cst (vectype1);
5297 /* Case 2: Create:
5298 for (offset = nelements/2; offset >= 1; offset/=2)
5299 {
5300 Create: va' = vec_shift <va, offset>
5301 Create: va = vop <va, va'>
5302 } */
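/* Worked instance (sketch) for a 4-element vector <a, b, c, d> and a
   PLUS reduction:

     t = vec_shift <v, 2>;  v = v + t;  gives <a+c, b+d, c, d>
     t = vec_shift <v, 1>;  v = v + t;  gives <a+b+c+d, ...>

   so element 0 of the vector ends up holding the full sum. */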
5303
5304 tree rhs;
5305
5306 if (dump_enabled_p ())
5307 dump_printf_loc (MSG_NOTE, vect_location,
5308 "Reduce using vector shifts\n");
5309
5310 mode1 = TYPE_MODE (vectype1);
5311 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5312 for (elt_offset = nelements / 2;
5313 elt_offset >= 1;
5314 elt_offset /= 2)
5315 {
5316 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5317 indices.new_vector (sel, 2, nelements);
5318 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5319 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5320 new_temp, zero_vec, mask);
5321 new_name = make_ssa_name (vec_dest, epilog_stmt);
5322 gimple_assign_set_lhs (epilog_stmt, new_name);
5323 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5324
5325 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5326 new_temp);
5327 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5328 gimple_assign_set_lhs (epilog_stmt, new_temp);
5329 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5330 }
5331
5332 /* 2.4 Extract the final scalar result. Create:
5333 s_out3 = extract_field <v_out2, bitpos> */
5334
5335 if (dump_enabled_p ())
5336 dump_printf_loc (MSG_NOTE, vect_location,
5337 "extract scalar result\n");
5338
5339 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5340 bitsize, bitsize_zero_node);
5341 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5342 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5343 gimple_assign_set_lhs (epilog_stmt, new_temp);
5344 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5345 scalar_results.safe_push (new_temp);
5346 }
5347 else
5348 {
5349 /* Case 3: Create:
5350 s = extract_field <v_out2, 0>
5351 for (offset = element_size;
5352 offset < vector_size;
5353 offset += element_size;)
5354 {
5355 Create: s' = extract_field <v_out2, offset>
5356 Create: s = op <s, s'> // For non SLP cases
5357 } */
5358
5359 if (dump_enabled_p ())
5360 dump_printf_loc (MSG_NOTE, vect_location,
5361 "Reduce using scalar code.\n");
5362
5363 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5364 int element_bitsize = tree_to_uhwi (bitsize);
5365 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5366 {
5367 int bit_offset;
5368 if (gimple_code (new_phi) == GIMPLE_PHI)
5369 vec_temp = PHI_RESULT (new_phi);
5370 else
5371 vec_temp = gimple_assign_lhs (new_phi);
5372 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5373 bitsize_zero_node);
5374 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5375 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5376 gimple_assign_set_lhs (epilog_stmt, new_temp);
5377 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5378
5379 /* In SLP we don't need to apply the reduction operation, so we just
5380 collect s' values in SCALAR_RESULTS. */
5381 if (slp_reduc)
5382 scalar_results.safe_push (new_temp);
5383
5384 for (bit_offset = element_bitsize;
5385 bit_offset < vec_size_in_bits;
5386 bit_offset += element_bitsize)
5387 {
5388 tree bitpos = bitsize_int (bit_offset);
5389 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5390 bitsize, bitpos);
5391
5392 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5393 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5394 gimple_assign_set_lhs (epilog_stmt, new_name);
5395 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5396
5397 if (slp_reduc)
5398 {
5399 /* In SLP we don't need to apply the reduction operation,
5400 so we just collect s' values in SCALAR_RESULTS. */
5401 new_temp = new_name;
5402 scalar_results.safe_push (new_name);
5403 }
5404 else
5405 {
5406 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5407 new_name, new_temp);
5408 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5409 gimple_assign_set_lhs (epilog_stmt, new_temp);
5410 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5411 }
5412 }
5413 }
5414
5415 /* The only case in which we need to reduce scalar results in SLP is
5416 unrolling. If the size of SCALAR_RESULTS is greater than
5417 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5418 REDUC_GROUP_SIZE. */
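/* For instance (sketch), with REDUC_GROUP_SIZE == 2 and four collected
   scalar results {r0, r1, r2, r3} from unrolling, the loop below leaves

     scalar_results[0] = r0 CODE r2;
     scalar_results[1] = r1 CODE r3;

   as the final per-statement results. */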
5419 if (slp_reduc)
5420 {
5421 tree res, first_res, new_res;
5422 gimple *new_stmt;
5423
5424 /* Reduce multiple scalar results in case of SLP unrolling. */
5425 for (j = group_size; scalar_results.iterate (j, &res);
5426 j++)
5427 {
5428 first_res = scalar_results[j % group_size];
5429 new_stmt = gimple_build_assign (new_scalar_dest, code,
5430 first_res, res);
5431 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5432 gimple_assign_set_lhs (new_stmt, new_res);
5433 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5434 scalar_results[j % group_size] = new_res;
5435 }
5436 }
5437 else
5438 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5439 scalar_results.safe_push (new_temp);
5440 }
5441
5442 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5443 == INTEGER_INDUC_COND_REDUCTION)
5444 && !operand_equal_p (initial_def, induc_val, 0))
5445 {
5446 /* Earlier we set the initial value to be a vector of induc_val
5447 values. Check the result and, if it is induc_val, replace it
5448 with the original initial value, unless induc_val is
5449 the same as initial_def already. */
5450 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5451 induc_val);
5452
5453 tree tmp = make_ssa_name (new_scalar_dest);
5454 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5455 initial_def, new_temp);
5456 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5457 scalar_results[0] = tmp;
5458 }
5459 }
5460
5461 vect_finalize_reduction:
5462
5463 if (double_reduc)
5464 loop = loop->inner;
5465
5466 /* 2.5 Adjust the final result by the initial value of the reduction
5467 variable. (When such adjustment is not needed, then
5468 'adjustment_def' is zero). For example, if code is PLUS we create:
5469 new_temp = loop_exit_def + adjustment_def */
5470
5471 if (adjustment_def)
5472 {
5473 gcc_assert (!slp_reduc);
5474 if (nested_in_vect_loop)
5475 {
5476 new_phi = new_phis[0];
5477 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5478 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5479 new_dest = vect_create_destination_var (scalar_dest, vectype);
5480 }
5481 else
5482 {
5483 new_temp = scalar_results[0];
5484 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5485 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5486 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5487 }
5488
5489 epilog_stmt = gimple_build_assign (new_dest, expr);
5490 new_temp = make_ssa_name (new_dest, epilog_stmt);
5491 gimple_assign_set_lhs (epilog_stmt, new_temp);
5492 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5493 if (nested_in_vect_loop)
5494 {
5495 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5496 STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5497 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5498
5499 if (!double_reduc)
5500 scalar_results.quick_push (new_temp);
5501 else
5502 scalar_results[0] = new_temp;
5503 }
5504 else
5505 scalar_results[0] = new_temp;
5506
5507 new_phis[0] = epilog_stmt;
5508 }
5509
5510 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5511 phis with new adjusted scalar results, i.e., replace use <s_out0>
5512 with use <s_out4>.
5513
5514 Transform:
5515 loop_exit:
5516 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5517 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5518 v_out2 = reduce <v_out1>
5519 s_out3 = extract_field <v_out2, 0>
5520 s_out4 = adjust_result <s_out3>
5521 use <s_out0>
5522 use <s_out0>
5523
5524 into:
5525
5526 loop_exit:
5527 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5528 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5529 v_out2 = reduce <v_out1>
5530 s_out3 = extract_field <v_out2, 0>
5531 s_out4 = adjust_result <s_out3>
5532 use <s_out4>
5533 use <s_out4> */
5534
5535
5536 /* In an SLP reduction chain we reduce the vector results into one vector
5537 if necessary; hence we set REDUC_GROUP_SIZE to 1 here. SCALAR_DEST is
5538 the LHS of the last stmt in the reduction chain, since we are looking
5539 for the loop exit phi node. */
5540 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
5541 {
5542 stmt_vec_info dest_stmt_info
5543 = vect_orig_stmt (SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
5544 scalar_dest = gimple_assign_lhs (dest_stmt_info->stmt);
5545 group_size = 1;
5546 }
5547
5548 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5549 case REDUC_GROUP_SIZE is greater than the vectorization factor).
5550 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5551 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5552 correspond to the first vector stmt, etc.
5553 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
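/* E.g. (sketch) with REDUC_GROUP_SIZE == 4 and two new vector stmts,
   RATIO is 2: scalar results 0 and 1 belong to the first vector stmt
   and scalar results 2 and 3 to the second. */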
5554 if (group_size > new_phis.length ())
5555 {
5556 ratio = group_size / new_phis.length ();
5557 gcc_assert (!(group_size % new_phis.length ()));
5558 }
5559 else
5560 ratio = 1;
5561
5562 stmt_vec_info epilog_stmt_info = NULL;
5563 for (k = 0; k < group_size; k++)
5564 {
5565 if (k % ratio == 0)
5566 {
5567 epilog_stmt_info = loop_vinfo->lookup_stmt (new_phis[k / ratio]);
5568 reduction_phi_info = reduction_phis[k / ratio];
5569 if (double_reduc)
5570 inner_phi = inner_phis[k / ratio];
5571 }
5572
5573 if (slp_reduc)
5574 {
5575 stmt_vec_info scalar_stmt_info = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5576
5577 orig_stmt_info = STMT_VINFO_RELATED_STMT (scalar_stmt_info);
5578 /* SLP statements can't participate in patterns. */
5579 gcc_assert (!orig_stmt_info);
5580 scalar_dest = gimple_assign_lhs (scalar_stmt_info->stmt);
5581 }
5582
5583 phis.create (3);
5584 /* Find the loop-closed-use at the loop exit of the original scalar
5585 result. (The reduction result is expected to have two immediate uses -
5586 one at the latch block, and one at the loop exit). */
5587 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5588 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5589 && !is_gimple_debug (USE_STMT (use_p)))
5590 phis.safe_push (USE_STMT (use_p));
5591
5592 /* While we expect to have found an exit_phi because of loop-closed-ssa
5593 form, we can end up without one if the scalar cycle is dead. */
5594
5595 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5596 {
5597 if (outer_loop)
5598 {
5599 stmt_vec_info exit_phi_vinfo
5600 = loop_vinfo->lookup_stmt (exit_phi);
5601 gphi *vect_phi;
5602
5603 if (double_reduc)
5604 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5605 else
5606 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt_info;
5607 if (!double_reduc
5608 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5609 != vect_double_reduction_def)
5610 continue;
5611
5612 /* Handle double reduction:
5613
5614 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5615 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5616 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5617 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5618
5619 At that point the regular reduction (stmt2 and stmt3) is
5620 already vectorized, as well as the exit phi node, stmt4.
5621 Here we vectorize the phi node of double reduction, stmt1, and
5622 update all relevant statements. */
5623
5624 /* Go through all the uses of s2 to find double reduction phi
5625 node, i.e., stmt1 above. */
5626 orig_name = PHI_RESULT (exit_phi);
5627 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5628 {
5629 stmt_vec_info use_stmt_vinfo;
5630 tree vect_phi_init, preheader_arg, vect_phi_res;
5631 basic_block bb = gimple_bb (use_stmt);
5632
5633 /* Check that USE_STMT is really a double reduction phi
5634 node. */
5635 if (gimple_code (use_stmt) != GIMPLE_PHI
5636 || gimple_phi_num_args (use_stmt) != 2
5637 || bb->loop_father != outer_loop)
5638 continue;
5639 use_stmt_vinfo = loop_vinfo->lookup_stmt (use_stmt);
5640 if (!use_stmt_vinfo
5641 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5642 != vect_double_reduction_def)
5643 continue;
5644
5645 /* Create vector phi node for double reduction:
5646 vs1 = phi <vs0, vs2>
5647 vs1 was created previously in this function by a call to
5648 vect_get_vec_def_for_operand and is stored in
5649 vec_initial_def;
5650 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5651 vs0 is created here. */
5652
5653 /* Create vector phi node. */
5654 vect_phi = create_phi_node (vec_initial_def, bb);
5655 loop_vec_info_for_loop (outer_loop)->add_stmt (vect_phi);
5656
5657 /* Create vs0 - initial def of the double reduction phi. */
5658 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5659 loop_preheader_edge (outer_loop));
5660 vect_phi_init = get_initial_def_for_reduction
5661 (stmt_info, preheader_arg, NULL);
5662
5663 /* Update phi node arguments with vs0 and vs2. */
5664 add_phi_arg (vect_phi, vect_phi_init,
5665 loop_preheader_edge (outer_loop),
5666 UNKNOWN_LOCATION);
5667 add_phi_arg (vect_phi, PHI_RESULT (inner_phi->stmt),
5668 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5669 if (dump_enabled_p ())
5670 dump_printf_loc (MSG_NOTE, vect_location,
5671 "created double reduction phi node: %G",
5672 vect_phi);
5673
5674 vect_phi_res = PHI_RESULT (vect_phi);
5675
5676 /* Replace the use, i.e., set the correct vs1 in the regular
5677 reduction phi node. FORNOW, NCOPIES is always 1, so the
5678 loop is redundant. */
5679 stmt_vec_info use_info = reduction_phi_info;
5680 for (j = 0; j < ncopies; j++)
5681 {
5682 edge pr_edge = loop_preheader_edge (loop);
5683 SET_PHI_ARG_DEF (as_a <gphi *> (use_info->stmt),
5684 pr_edge->dest_idx, vect_phi_res);
5685 use_info = STMT_VINFO_RELATED_STMT (use_info);
5686 }
5687 }
5688 }
5689 }
5690
5691 phis.release ();
5692 if (nested_in_vect_loop)
5693 {
5694 if (double_reduc)
5695 loop = outer_loop;
5696 else
5697 continue;
5698 }
5699
5700 phis.create (3);
5701 /* Find the loop-closed-use at the loop exit of the original scalar
5702 result. (The reduction result is expected to have two immediate uses,
5703 one at the latch block, and one at the loop exit). For double
5704 reductions we are looking for exit phis of the outer loop. */
5705 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5706 {
5707 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5708 {
5709 if (!is_gimple_debug (USE_STMT (use_p)))
5710 phis.safe_push (USE_STMT (use_p));
5711 }
5712 else
5713 {
5714 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5715 {
5716 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5717
5718 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5719 {
5720 if (!flow_bb_inside_loop_p (loop,
5721 gimple_bb (USE_STMT (phi_use_p)))
5722 && !is_gimple_debug (USE_STMT (phi_use_p)))
5723 phis.safe_push (USE_STMT (phi_use_p));
5724 }
5725 }
5726 }
5727 }
5728
5729 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5730 {
5731 /* Replace the uses: */
5732 orig_name = PHI_RESULT (exit_phi);
5733 scalar_result = scalar_results[k];
5734 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5735 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5736 SET_USE (use_p, scalar_result);
5737 }
5738
5739 phis.release ();
5740 }
5741 }
5742
5743 /* Return a vector of type VECTYPE that is equal to the vector select
5744 operation "MASK ? VEC : IDENTITY". Insert the select statements
5745 before GSI. */
5746
5747 static tree
5748 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5749 tree vec, tree identity)
5750 {
5751 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5752 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5753 mask, vec, identity);
5754 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5755 return cond;
5756 }
5757
5758 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5759 order, starting with LHS. Insert the extraction statements before GSI and
5760 associate the new scalar SSA names with variable SCALAR_DEST.
5761 Return the SSA name for the result. */
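/* As an illustration (sketch), for a 4-element VECTOR_RHS this emits,
   in order:

     s = BIT_FIELD_REF <vector_rhs, sz, 0>;    lhs = lhs CODE s;
     s = BIT_FIELD_REF <vector_rhs, sz, sz>;   lhs = lhs CODE s;
     ... and so on for the remaining elements ...

   preserving the strict left-to-right association needed for in-order
   reductions. */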
5762
5763 static tree
5764 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5765 tree_code code, tree lhs, tree vector_rhs)
5766 {
5767 tree vectype = TREE_TYPE (vector_rhs);
5768 tree scalar_type = TREE_TYPE (vectype);
5769 tree bitsize = TYPE_SIZE (scalar_type);
5770 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5771 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5772
5773 for (unsigned HOST_WIDE_INT bit_offset = 0;
5774 bit_offset < vec_size_in_bits;
5775 bit_offset += element_bitsize)
5776 {
5777 tree bitpos = bitsize_int (bit_offset);
5778 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5779 bitsize, bitpos);
5780
5781 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5782 rhs = make_ssa_name (scalar_dest, stmt);
5783 gimple_assign_set_lhs (stmt, rhs);
5784 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5785
5786 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5787 tree new_name = make_ssa_name (scalar_dest, stmt);
5788 gimple_assign_set_lhs (stmt, new_name);
5789 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5790 lhs = new_name;
5791 }
5792 return lhs;
5793 }
5794
5795 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT_INFO is the
5796 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5797 statement. CODE is the operation performed by STMT_INFO and OPS are
5798 its scalar operands. REDUC_INDEX is the index of the operand in
5799 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5800 implements in-order reduction, or IFN_LAST if we should open-code it.
5801 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5802 that should be used to control the operation in a fully-masked loop. */
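/* Rough sketch of what is emitted per vector operand DEF0 (not the
   literal IR):

     if fully masked:          def0 = mask ? def0 : {0, ...};
     if REDUC_FN is available: reduc_var = REDUC_FN (reduc_var, def0);
     otherwise:                open-coded element-by-element chain via
                               vect_expand_fold_left above.  */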
5803
5804 static bool
5805 vectorize_fold_left_reduction (stmt_vec_info stmt_info,
5806 gimple_stmt_iterator *gsi,
5807 stmt_vec_info *vec_stmt, slp_tree slp_node,
5808 gimple *reduc_def_stmt,
5809 tree_code code, internal_fn reduc_fn,
5810 tree ops[3], tree vectype_in,
5811 int reduc_index, vec_loop_masks *masks)
5812 {
5813 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5814 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5815 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5816 stmt_vec_info new_stmt_info = NULL;
5817
5818 int ncopies;
5819 if (slp_node)
5820 ncopies = 1;
5821 else
5822 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5823
5824 gcc_assert (!nested_in_vect_loop_p (loop, stmt_info));
5825 gcc_assert (ncopies == 1);
5826 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5827 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5828 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5829 == FOLD_LEFT_REDUCTION);
5830
5831 if (slp_node)
5832 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5833 TYPE_VECTOR_SUBPARTS (vectype_in)));
5834
5835 tree op0 = ops[1 - reduc_index];
5836
5837 int group_size = 1;
5838 stmt_vec_info scalar_dest_def_info;
5839 auto_vec<tree> vec_oprnds0;
5840 if (slp_node)
5841 {
5842 auto_vec<vec<tree> > vec_defs (2);
5843 auto_vec<tree> sops (2);
5844 sops.quick_push (ops[0]);
5845 sops.quick_push (ops[1]);
5846 vect_get_slp_defs (sops, slp_node, &vec_defs);
5847 vec_oprnds0.safe_splice (vec_defs[1 - reduc_index]);
5848 vec_defs[0].release ();
5849 vec_defs[1].release ();
5850 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5851 scalar_dest_def_info = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5852 }
5853 else
5854 {
5855 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt_info);
5856 vec_oprnds0.create (1);
5857 vec_oprnds0.quick_push (loop_vec_def0);
5858 scalar_dest_def_info = stmt_info;
5859 }
5860
5861 tree scalar_dest = gimple_assign_lhs (scalar_dest_def_info->stmt);
5862 tree scalar_type = TREE_TYPE (scalar_dest);
5863 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5864
5865 int vec_num = vec_oprnds0.length ();
5866 gcc_assert (vec_num == 1 || slp_node);
5867 tree vec_elem_type = TREE_TYPE (vectype_out);
5868 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5869
5870 tree vector_identity = NULL_TREE;
5871 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5872 vector_identity = build_zero_cst (vectype_out);
5873
5874 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5875 int i;
5876 tree def0;
5877 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5878 {
5879 gimple *new_stmt;
5880 tree mask = NULL_TREE;
5881 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5882 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5883
5884 /* Handle MINUS by adding the negative. */
5885 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5886 {
5887 tree negated = make_ssa_name (vectype_out);
5888 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5889 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5890 def0 = negated;
5891 }
5892
5893 if (mask)
5894 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5895 vector_identity);
5896
5897 /* On the first iteration the input is simply the scalar phi
5898 result, and for subsequent iterations it is the output of
5899 the preceding operation. */
5900 if (reduc_fn != IFN_LAST)
5901 {
5902 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
5903 /* For chained SLP reductions the output of the previous reduction
5904 operation serves as the input of the next. For the final statement
5905 the output cannot be a temporary - we reuse the original
5906 scalar destination of the last statement. */
5907 if (i != vec_num - 1)
5908 {
5909 gimple_set_lhs (new_stmt, scalar_dest_var);
5910 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5911 gimple_set_lhs (new_stmt, reduc_var);
5912 }
5913 }
5914 else
5915 {
5916 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5917 reduc_var, def0);
5918 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5919 /* Remove the statement, so that we can use the same code paths
5920 as for statements that we've just created. */
5921 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5922 gsi_remove (&tmp_gsi, true);
5923 }
5924
5925 if (i == vec_num - 1)
5926 {
5927 gimple_set_lhs (new_stmt, scalar_dest);
5928 new_stmt_info = vect_finish_replace_stmt (scalar_dest_def_info,
5929 new_stmt);
5930 }
5931 else
5932 new_stmt_info = vect_finish_stmt_generation (scalar_dest_def_info,
5933 new_stmt, gsi);
5934
5935 if (slp_node)
5936 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
5937 }
5938
5939 if (!slp_node)
5940 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
5941
5942 return true;
5943 }
5944
5945 /* Function is_nonwrapping_integer_induction.
5946
5947 Check if STMT_VINFO (which is part of loop LOOP) both increments and
5948 does not cause overflow. */
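/* For example (sketch): with BASE 10, STEP 3 and at most 20 iterations
   the induction reaches at most 10 + 3 * 20 = 70, which needs far fewer
   bits than a 32-bit LHS type provides, so the check below succeeds. */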
5949
5950 static bool
5951 is_nonwrapping_integer_induction (stmt_vec_info stmt_vinfo, struct loop *loop)
5952 {
5953 gphi *phi = as_a <gphi *> (stmt_vinfo->stmt);
5954 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5955 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5956 tree lhs_type = TREE_TYPE (gimple_phi_result (phi));
5957 widest_int ni, max_loop_value, lhs_max;
5958 wi::overflow_type overflow = wi::OVF_NONE;
5959
5960 /* Make sure the loop is integer based. */
5961 if (TREE_CODE (base) != INTEGER_CST
5962 || TREE_CODE (step) != INTEGER_CST)
5963 return false;
5964
5965 /* Check that the max size of the loop will not wrap. */
5966
5967 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5968 return true;
5969
5970 if (! max_stmt_executions (loop, &ni))
5971 return false;
5972
5973 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5974 &overflow);
5975 if (overflow)
5976 return false;
5977
5978 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5979 TYPE_SIGN (lhs_type), &overflow);
5980 if (overflow)
5981 return false;
5982
5983 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5984 <= TYPE_PRECISION (lhs_type));
5985 }
5986
5987 /* Check if masking can be supported by inserting a conditional expression.
5988 CODE is the code for the operation. COND_FN is the conditional internal
5989 function, if it exists. VECTYPE_IN is the type of the vector input. */
5990 static bool
5991 use_mask_by_cond_expr_p (enum tree_code code, internal_fn cond_fn,
5992 tree vectype_in)
5993 {
5994 if (cond_fn != IFN_LAST
5995 && direct_internal_fn_supported_p (cond_fn, vectype_in,
5996 OPTIMIZE_FOR_SPEED))
5997 return false;
5998
5999 switch (code)
6000 {
6001 case DOT_PROD_EXPR:
6002 case SAD_EXPR:
6003 return true;
6004
6005 default:
6006 return false;
6007 }
6008 }
6009
6010 /* Insert a conditional expression to enable masked vectorization. CODE is the
6011 code for the operation. VOP is the array of operands. MASK is the loop
6012 mask. GSI is a statement iterator used to place the new conditional
6013 expression. */
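/* For example (sketch), for DOT_PROD_EXPR the inactive lanes of operand 1
   are replaced with zero so they add nothing to the accumulator:

     masked_op1 = mask ? vop[1] : {0, ...};
     acc = DOT_PROD_EXPR <vop[0], masked_op1, acc>;

   while for SAD_EXPR the inactive lanes are instead replaced with the
   matching lanes of operand 0, making the absolute difference zero
   there. */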
6014 static void
6015 build_vect_cond_expr (enum tree_code code, tree vop[3], tree mask,
6016 gimple_stmt_iterator *gsi)
6017 {
6018 switch (code)
6019 {
6020 case DOT_PROD_EXPR:
6021 {
6022 tree vectype = TREE_TYPE (vop[1]);
6023 tree zero = build_zero_cst (vectype);
6024 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6025 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6026 mask, vop[1], zero);
6027 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6028 vop[1] = masked_op1;
6029 break;
6030 }
6031
6032 case SAD_EXPR:
6033 {
6034 tree vectype = TREE_TYPE (vop[1]);
6035 tree masked_op1 = make_temp_ssa_name (vectype, NULL, "masked_op1");
6036 gassign *select = gimple_build_assign (masked_op1, VEC_COND_EXPR,
6037 mask, vop[1], vop[0]);
6038 gsi_insert_before (gsi, select, GSI_SAME_STMT);
6039 vop[1] = masked_op1;
6040 break;
6041 }
6042
6043 default:
6044 gcc_unreachable ();
6045 }
6046 }
6047
6048 /* Function vectorizable_reduction.
6049
6050 Check if STMT_INFO performs a reduction operation that can be vectorized.
6051 If VEC_STMT is also passed, vectorize STMT_INFO: create a vectorized
6052 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6053 Return true if STMT_INFO is vectorizable in this way.
6054
6055 This function also handles reduction idioms (patterns) that have been
6056 recognized in advance during vect_pattern_recog. In this case, STMT_INFO
6057 may be of this form:
6058 X = pattern_expr (arg0, arg1, ..., X)
6059 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6060 sequence that had been detected and replaced by the pattern-stmt
6061 (STMT_INFO).
6062
6063 This function also handles reduction of condition expressions, for example:
6064 for (int i = 0; i < N; i++)
6065 if (a[i] < value)
6066 last = a[i];
6067 This is handled by vectorizing the loop and creating an additional vector
6068 containing the loop indexes for which "a[i] < value" was true. In the
6069 function epilogue this is reduced to a single max value and then used to
6070 index into the vector of results.
6071
6072 In some cases of reduction patterns, the type of the reduction variable X is
6073 different than the type of the other arguments of STMT_INFO.
6074 In such cases, the vectype that is used when transforming STMT_INFO into
6075 a vector stmt is different than the vectype that is used to determine the
6076 vectorization factor, because it consists of a different number of elements
6077 than the actual number of elements that are being operated upon in parallel.
6078
6079 For example, consider an accumulation of shorts into an int accumulator.
6080 On some targets it's possible to vectorize this pattern operating on 8
6081 shorts at a time (hence, the vectype for purposes of determining the
6082 vectorization factor should be V8HI); on the other hand, the vectype that
6083 is used to create the vector form is actually V4SI (the type of the result).
6084
6085 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6086 indicates what is the actual level of parallelism (V8HI in the example), so
6087 that the right vectorization factor would be derived. This vectype
6088 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6089 be used to create the vectorized stmt. The right vectype for the vectorized
6090 stmt is obtained from the type of the result X:
6091 get_vectype_for_scalar_type (TREE_TYPE (X))
6092
6093 This means that, contrary to "regular" reductions (or "regular" stmts in
6094 general), the following equation:
6095 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6096 does *NOT* necessarily hold for reduction patterns. */
6097
6098 bool
6099 vectorizable_reduction (stmt_vec_info stmt_info, gimple_stmt_iterator *gsi,
6100 stmt_vec_info *vec_stmt, slp_tree slp_node,
6101 slp_instance slp_node_instance,
6102 stmt_vector_for_cost *cost_vec)
6103 {
6104 tree vec_dest;
6105 tree scalar_dest;
6106 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6107 tree vectype_in = NULL_TREE;
6108 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6109 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6110 enum tree_code code, orig_code;
6111 internal_fn reduc_fn;
6112 machine_mode vec_mode;
6113 int op_type;
6114 optab optab;
6115 tree new_temp = NULL_TREE;
6116 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6117 stmt_vec_info cond_stmt_vinfo = NULL;
6118 enum tree_code cond_reduc_op_code = ERROR_MARK;
6119 tree scalar_type;
6120 bool is_simple_use;
6121 int i;
6122 int ncopies;
6123 int epilog_copies;
6124 stmt_vec_info prev_stmt_info, prev_phi_info;
6125 bool single_defuse_cycle = false;
6126 stmt_vec_info new_stmt_info = NULL;
6127 int j;
6128 tree ops[3];
6129 enum vect_def_type dts[3];
6130 bool nested_cycle = false, found_nested_cycle_def = false;
6131 bool double_reduc = false;
6132 basic_block def_bb;
6133 struct loop * def_stmt_loop;
6134 tree def_arg;
6135 auto_vec<tree> vec_oprnds0;
6136 auto_vec<tree> vec_oprnds1;
6137 auto_vec<tree> vec_oprnds2;
6138 auto_vec<tree> vect_defs;
6139 auto_vec<stmt_vec_info> phis;
6140 int vec_num;
6141 tree def0, tem;
6142 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6143 tree cond_reduc_val = NULL_TREE;
6144
6145 /* Make sure it was already recognized as a reduction computation. */
6146 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
6147 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
6148 return false;
6149
6150 if (nested_in_vect_loop_p (loop, stmt_info))
6151 {
6152 loop = loop->inner;
6153 nested_cycle = true;
6154 }
6155
6156 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6157 gcc_assert (slp_node
6158 && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt_info);
6159
6160 if (gphi *phi = dyn_cast <gphi *> (stmt_info->stmt))
6161 {
6162 tree phi_result = gimple_phi_result (phi);
6163 /* Analysis is fully done on the reduction stmt invocation. */
6164 if (! vec_stmt)
6165 {
6166 if (slp_node)
6167 slp_node_instance->reduc_phis = slp_node;
6168
6169 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6170 return true;
6171 }
6172
6173 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6174 /* Leave the scalar phi in place. Note that checking
6175 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6176 for reductions involving a single statement. */
6177 return true;
6178
6179 stmt_vec_info reduc_stmt_info = STMT_VINFO_REDUC_DEF (stmt_info);
6180 reduc_stmt_info = vect_stmt_to_vectorize (reduc_stmt_info);
6181
6182 if (STMT_VINFO_VEC_REDUCTION_TYPE (reduc_stmt_info)
6183 == EXTRACT_LAST_REDUCTION)
6184 /* Leave the scalar phi in place. */
6185 return true;
6186
6187 gassign *reduc_stmt = as_a <gassign *> (reduc_stmt_info->stmt);
6188 code = gimple_assign_rhs_code (reduc_stmt);
6189 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6190 {
6191 tree op = gimple_op (reduc_stmt, k);
6192 if (op == phi_result)
6193 continue;
6194 if (k == 1 && code == COND_EXPR)
6195 continue;
6196 bool is_simple_use = vect_is_simple_use (op, loop_vinfo, &dt);
6197 gcc_assert (is_simple_use);
6198 if (dt == vect_constant_def || dt == vect_external_def)
6199 continue;
6200 if (!vectype_in
6201 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6202 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6203 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6204 break;
6205 }
6206 /* For a nested cycle we might end up with an operation like
6207 phi_result * phi_result. */
6208 if (!vectype_in)
6209 vectype_in = STMT_VINFO_VECTYPE (stmt_info);
6210 gcc_assert (vectype_in);
6211
6212 if (slp_node)
6213 ncopies = 1;
6214 else
6215 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6216
6217 stmt_vec_info use_stmt_info;
6218 if (ncopies > 1
6219 && STMT_VINFO_RELEVANT (reduc_stmt_info) <= vect_used_only_live
6220 && (use_stmt_info = loop_vinfo->lookup_single_use (phi_result))
6221 && vect_stmt_to_vectorize (use_stmt_info) == reduc_stmt_info)
6222 single_defuse_cycle = true;
6223
6224 /* Create the destination vector */
6225 scalar_dest = gimple_assign_lhs (reduc_stmt);
6226 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6227
6228 if (slp_node)
6229 /* The size vect_schedule_slp_instance computes is off for us. */
6230 vec_num = vect_get_num_vectors
6231 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6232 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6233 vectype_in);
6234 else
6235 vec_num = 1;
6236
6237 /* Generate the reduction PHIs upfront. */
6238 prev_phi_info = NULL;
6239 for (j = 0; j < ncopies; j++)
6240 {
6241 if (j == 0 || !single_defuse_cycle)
6242 {
6243 for (i = 0; i < vec_num; i++)
6244 {
6245 /* Create the reduction-phi that defines the reduction
6246 operand. */
6247 gimple *new_phi = create_phi_node (vec_dest, loop->header);
6248 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6249
6250 if (slp_node)
6251 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi_info);
6252 else
6253 {
6254 if (j == 0)
6255 STMT_VINFO_VEC_STMT (stmt_info)
6256 = *vec_stmt = new_phi_info;
6257 else
6258 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi_info;
6259 prev_phi_info = new_phi_info;
6260 }
6261 }
6262 }
6263 }
6264
6265 return true;
6266 }
6267
6268 /* 1. Is vectorizable reduction? */
6269 /* Not supportable if the reduction variable is used in the loop, unless
6270 it's a reduction chain. */
6271 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6272 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6273 return false;
6274
6275 /* Reductions that are not used even in an enclosing outer-loop,
6276 are expected to be "live" (used out of the loop). */
6277 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6278 && !STMT_VINFO_LIVE_P (stmt_info))
6279 return false;
6280
6281 /* 2. Has this been recognized as a reduction pattern?
6282
6283 Check if STMT represents a pattern that has been recognized
6284 in earlier analysis stages. For stmts that represent a pattern,
6285 the STMT_VINFO_RELATED_STMT field records the last stmt in
6286 the original sequence that constitutes the pattern. */
6287
6288 stmt_vec_info orig_stmt_info = STMT_VINFO_RELATED_STMT (stmt_info);
6289 if (orig_stmt_info)
6290 {
6291 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6292 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6293 }
6294
6295 /* 3. Check the operands of the operation. The first operands are defined
6296 inside the loop body. The last operand is the reduction variable,
6297 which is defined by the loop-header-phi. */
6298
6299 gassign *stmt = as_a <gassign *> (stmt_info->stmt);
6300
6301 /* Flatten RHS. */
6302 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6303 {
6304 case GIMPLE_BINARY_RHS:
6305 code = gimple_assign_rhs_code (stmt);
6306 op_type = TREE_CODE_LENGTH (code);
6307 gcc_assert (op_type == binary_op);
6308 ops[0] = gimple_assign_rhs1 (stmt);
6309 ops[1] = gimple_assign_rhs2 (stmt);
6310 break;
6311
6312 case GIMPLE_TERNARY_RHS:
6313 code = gimple_assign_rhs_code (stmt);
6314 op_type = TREE_CODE_LENGTH (code);
6315 gcc_assert (op_type == ternary_op);
6316 ops[0] = gimple_assign_rhs1 (stmt);
6317 ops[1] = gimple_assign_rhs2 (stmt);
6318 ops[2] = gimple_assign_rhs3 (stmt);
6319 break;
6320
6321 case GIMPLE_UNARY_RHS:
6322 return false;
6323
6324 default:
6325 gcc_unreachable ();
6326 }
6327
6328 if (code == COND_EXPR && slp_node)
6329 return false;
6330
6331 scalar_dest = gimple_assign_lhs (stmt);
6332 scalar_type = TREE_TYPE (scalar_dest);
6333 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6334 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6335 return false;
6336
6337 /* Do not try to vectorize bit-precision reductions. */
6338 if (!type_has_mode_precision_p (scalar_type))
6339 return false;
6340
6341 /* All uses but the last are expected to be defined in the loop.
6342 The last use is the reduction variable. In case of a nested cycle this
6343 assumption is not true: we use reduc_index to record the index of the
6344 reduction variable. */
6345 stmt_vec_info reduc_def_info;
6346 if (orig_stmt_info)
6347 reduc_def_info = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6348 else
6349 reduc_def_info = STMT_VINFO_REDUC_DEF (stmt_info);
6350 gcc_assert (reduc_def_info);
6351 gphi *reduc_def_phi = as_a <gphi *> (reduc_def_info->stmt);
6352 tree reduc_def = PHI_RESULT (reduc_def_phi);
6353 int reduc_index = -1;
6354 for (i = 0; i < op_type; i++)
6355 {
6356 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6357 if (i == 0 && code == COND_EXPR)
6358 continue;
6359
6360 stmt_vec_info def_stmt_info;
6361 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &dts[i], &tem,
6362 &def_stmt_info);
6363 dt = dts[i];
6364 gcc_assert (is_simple_use);
6365 if (dt == vect_reduction_def
6366 && ops[i] == reduc_def)
6367 {
6368 reduc_index = i;
6369 continue;
6370 }
6371 else if (tem)
6372 {
6373 /* To properly compute ncopies we are interested in the widest
6374 input type in case we're looking at a widening accumulation. */
6375 if (!vectype_in
6376 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6377 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6378 vectype_in = tem;
6379 }
6380
6381 if (dt != vect_internal_def
6382 && dt != vect_external_def
6383 && dt != vect_constant_def
6384 && dt != vect_induction_def
6385 && !(dt == vect_nested_cycle && nested_cycle))
6386 return false;
6387
6388 if (dt == vect_nested_cycle
6389 && ops[i] == reduc_def)
6390 {
6391 found_nested_cycle_def = true;
6392 reduc_index = i;
6393 }
6394
6395 if (i == 1 && code == COND_EXPR)
6396 {
6397 /* Record how value of COND_EXPR is defined. */
6398 if (dt == vect_constant_def)
6399 {
6400 cond_reduc_dt = dt;
6401 cond_reduc_val = ops[i];
6402 }
6403 if (dt == vect_induction_def
6404 && def_stmt_info
6405 && is_nonwrapping_integer_induction (def_stmt_info, loop))
6406 {
6407 cond_reduc_dt = dt;
6408 cond_stmt_vinfo = def_stmt_info;
6409 }
6410 }
6411 }
6412
6413 if (!vectype_in)
6414 vectype_in = vectype_out;
6415
6416 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6417 directly used in the stmt. */
6418 if (reduc_index == -1)
6419 {
6420 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6421 {
6422 if (dump_enabled_p ())
6423 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6424 "in-order reduction chain without SLP.\n");
6425 return false;
6426 }
6427 }
6428
6429 if (!(reduc_index == -1
6430 || dts[reduc_index] == vect_reduction_def
6431 || dts[reduc_index] == vect_nested_cycle
6432 || ((dts[reduc_index] == vect_internal_def
6433 || dts[reduc_index] == vect_external_def
6434 || dts[reduc_index] == vect_constant_def
6435 || dts[reduc_index] == vect_induction_def)
6436 && nested_cycle && found_nested_cycle_def)))
6437 {
6438 /* For pattern recognized stmts, orig_stmt might be a reduction,
6439 but some helper statements for the pattern might not, or
6440 might be COND_EXPRs with reduction uses in the condition. */
6441 gcc_assert (orig_stmt_info);
6442 return false;
6443 }
6444
6445 /* PHIs should not participate in patterns. */
6446 gcc_assert (!STMT_VINFO_RELATED_STMT (reduc_def_info));
6447 enum vect_reduction_type v_reduc_type
6448 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6449 stmt_vec_info tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6450
6451 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6452 /* If we have a condition reduction, see if we can simplify it further. */
6453 if (v_reduc_type == COND_REDUCTION)
6454 {
6455 /* TODO: We can't yet handle reduction chains, since we need to treat
6456 each COND_EXPR in the chain specially, not just the last one.
6457 E.g. for:
6458
6459 x_1 = PHI <x_3, ...>
6460 x_2 = a_2 ? ... : x_1;
6461 x_3 = a_3 ? ... : x_2;
6462
6463 we're interested in the last element in x_3 for which a_2 || a_3
6464 is true, whereas the current reduction chain handling would
6465 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6466 as a reduction operation. */
6467 if (reduc_index == -1)
6468 {
6469 if (dump_enabled_p ())
6470 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6471 "conditional reduction chains not supported\n");
6472 return false;
6473 }
6474
6475 /* vect_is_simple_reduction ensured that operand 2 is the
6476 loop-carried operand. */
6477 gcc_assert (reduc_index == 2);
6478
6479 /* Loop peeling modifies the initial value of the reduction PHI, which
6480 makes the reduction stmt to be transformed different from the
6481 original stmt analyzed. We need to record the reduction code for a
6482 CONST_COND_REDUCTION type reduction at the analysis stage, so that
6483 it can be used directly at the transform stage. */
6484 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6485 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6486 {
6487 /* Also set the reduction type to CONST_COND_REDUCTION. */
6488 gcc_assert (cond_reduc_dt == vect_constant_def);
6489 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6490 }
6491 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6492 vectype_in, OPTIMIZE_FOR_SPEED))
6493 {
6494 if (dump_enabled_p ())
6495 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6496 "optimizing condition reduction with"
6497 " FOLD_EXTRACT_LAST.\n");
6498 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6499 }
6500 else if (cond_reduc_dt == vect_induction_def)
6501 {
6502 tree base
6503 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6504 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6505
6506 gcc_assert (TREE_CODE (base) == INTEGER_CST
6507 && TREE_CODE (step) == INTEGER_CST);
6508 cond_reduc_val = NULL_TREE;
6509 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6510 above base; punt if base is the minimum value of the type for
6511 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
6512 if (tree_int_cst_sgn (step) == -1)
6513 {
6514 cond_reduc_op_code = MIN_EXPR;
6515 if (tree_int_cst_sgn (base) == -1)
6516 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6517 else if (tree_int_cst_lt (base,
6518 TYPE_MAX_VALUE (TREE_TYPE (base))))
6519 cond_reduc_val
6520 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6521 }
6522 else
6523 {
6524 cond_reduc_op_code = MAX_EXPR;
6525 if (tree_int_cst_sgn (base) == 1)
6526 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6527 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6528 base))
6529 cond_reduc_val
6530 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6531 }
6532 if (cond_reduc_val)
6533 {
6534 if (dump_enabled_p ())
6535 dump_printf_loc (MSG_NOTE, vect_location,
6536 "condition expression based on "
6537 "integer induction.\n");
6538 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6539 = INTEGER_INDUC_COND_REDUCTION;
6540 }
6541 }
6542 else if (cond_reduc_dt == vect_constant_def)
6543 {
6544 enum vect_def_type cond_initial_dt;
6545 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6546 tree cond_initial_val
6547 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6548
6549 gcc_assert (cond_reduc_val != NULL_TREE);
6550 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6551 if (cond_initial_dt == vect_constant_def
6552 && types_compatible_p (TREE_TYPE (cond_initial_val),
6553 TREE_TYPE (cond_reduc_val)))
6554 {
6555 tree e = fold_binary (LE_EXPR, boolean_type_node,
6556 cond_initial_val, cond_reduc_val);
6557 if (e && (integer_onep (e) || integer_zerop (e)))
6558 {
6559 if (dump_enabled_p ())
6560 dump_printf_loc (MSG_NOTE, vect_location,
6561 "condition expression based on "
6562 "compile time constant.\n");
6563 /* Record reduction code at analysis stage. */
6564 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6565 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6566 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6567 = CONST_COND_REDUCTION;
6568 }
6569 }
6570 }
6571 }
6572
6573 if (orig_stmt_info)
6574 gcc_assert (tmp == orig_stmt_info
6575 || REDUC_GROUP_FIRST_ELEMENT (tmp) == orig_stmt_info);
6576 else
6577 /* We changed STMT to be the first stmt in reduction chain, hence we
6578 check that in this case the first element in the chain is STMT. */
6579 gcc_assert (tmp == stmt_info
6580 || REDUC_GROUP_FIRST_ELEMENT (tmp) == stmt_info);
6581
6582 if (STMT_VINFO_LIVE_P (reduc_def_info))
6583 return false;
6584
6585 if (slp_node)
6586 ncopies = 1;
6587 else
6588 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6589
6590 gcc_assert (ncopies >= 1);
6591
6592 vec_mode = TYPE_MODE (vectype_in);
6593 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6594
6595 if (nested_cycle)
6596 {
6597 def_bb = gimple_bb (reduc_def_phi);
6598 def_stmt_loop = def_bb->loop_father;
6599 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_phi,
6600 loop_preheader_edge (def_stmt_loop));
6601 stmt_vec_info def_arg_stmt_info = loop_vinfo->lookup_def (def_arg);
6602 if (def_arg_stmt_info
6603 && (STMT_VINFO_DEF_TYPE (def_arg_stmt_info)
6604 == vect_double_reduction_def))
6605 double_reduc = true;
6606 }
6607
6608 vect_reduction_type reduction_type
6609 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6610 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6611 && ncopies > 1)
6612 {
6613 if (dump_enabled_p ())
6614 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6615 "multiple types in double reduction or condition "
6616 "reduction.\n");
6617 return false;
6618 }
6619
6620 if (code == COND_EXPR)
6621 {
6622 /* Only call during the analysis stage, otherwise we'll lose
6623 STMT_VINFO_TYPE. */
6624 if (!vec_stmt && !vectorizable_condition (stmt_info, gsi, NULL,
6625 true, NULL, cost_vec))
6626 {
6627 if (dump_enabled_p ())
6628 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6629 "unsupported condition in reduction\n");
6630 return false;
6631 }
6632 }
6633 else if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6634 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6635 {
6636 /* Only call during the analysis stage, otherwise we'll lose
6637 STMT_VINFO_TYPE. We only support this for nested cycles
6638 without double reductions at the moment. */
6639 if (!nested_cycle
6640 || double_reduc
6641 || (!vec_stmt && !vectorizable_shift (stmt_info, gsi, NULL,
6642 NULL, cost_vec)))
6643 {
6644 if (dump_enabled_p ())
6645 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6646 "unsupported shift or rotation in reduction\n");
6647 return false;
6648 }
6649 }
6650 else
6651 {
6652 /* 4. Supportable by target? */
6653
6654 /* 4.1. check support for the operation in the loop */
6655 optab = optab_for_tree_code (code, vectype_in, optab_default);
6656 if (!optab)
6657 {
6658 if (dump_enabled_p ())
6659 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6660 "no optab.\n");
6661
6662 return false;
6663 }
6664
6665 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6666 {
6667 if (dump_enabled_p ())
6668 dump_printf (MSG_NOTE, "op not supported by target.\n");
6669
6670 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6671 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6672 return false;
6673
6674 if (dump_enabled_p ())
6675 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6676 }
6677
6678 /* Worthwhile without SIMD support? */
6679 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6680 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6681 {
6682 if (dump_enabled_p ())
6683 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6684 "not worthwhile without SIMD support.\n");
6685
6686 return false;
6687 }
6688 }
6689
6690 /* 4.2. Check support for the epilog operation.
6691
6692 If STMT represents a reduction pattern, then the type of the
6693 reduction variable may be different than the type of the rest
6694 of the arguments. For example, consider the case of accumulation
6695 of shorts into an int accumulator; The original code:
6696 S1: int_a = (int) short_a;
6697 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6698
6699 was replaced with:
6700 STMT: int_acc = widen_sum <short_a, int_acc>
6701
6702 This means that:
6703 1. The tree-code that is used to create the vector operation in the
6704 epilog code (that reduces the partial results) is not the
6705 tree-code of STMT, but is rather the tree-code of the original
6706 stmt from the pattern that STMT is replacing. I.e, in the example
6707 above we want to use 'widen_sum' in the loop, but 'plus' in the
6708 epilog.
6709 2. The type (mode) we use to check available target support
6710 for the vector operation to be created in the *epilog*, is
6711 determined by the type of the reduction variable (in the example
6712 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6713 However the type (mode) we use to check available target support
6714 for the vector operation to be created *inside the loop*, is
6715 determined by the type of the other arguments to STMT (in the
6716 example we'd check this: optab_handler (widen_sum_optab,
6717 vect_short_mode)).
6718
6719 This is contrary to "regular" reductions, in which the types of all
6720 the arguments are the same as the type of the reduction variable.
6721 For "regular" reductions we can therefore use the same vector type
6722 (and also the same tree-code) when generating the epilog code and
6723 when generating the code inside the loop. */
6724
6725 if (orig_stmt_info
6726 && (reduction_type == TREE_CODE_REDUCTION
6727 || reduction_type == FOLD_LEFT_REDUCTION))
6728 {
6729 /* This is a reduction pattern: get the vectype from the type of the
6730 reduction variable, and get the tree-code from orig_stmt. */
6731 orig_code = gimple_assign_rhs_code (orig_stmt_info->stmt);
6732 gcc_assert (vectype_out);
6733 vec_mode = TYPE_MODE (vectype_out);
6734 }
6735 else
6736 {
6737 /* Regular reduction: the same vectype and tree-code used for the
6738 vector code inside the loop can also be used for the epilog code. */
6739 orig_code = code;
6740
6741 if (code == MINUS_EXPR)
6742 orig_code = PLUS_EXPR;
6743
6744 /* For simple condition reductions, replace with the actual expression
6745 we want to base our reduction around. */
6746 if (reduction_type == CONST_COND_REDUCTION)
6747 {
6748 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6749 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6750 }
6751 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6752 orig_code = cond_reduc_op_code;
6753 }
6754
6755 reduc_fn = IFN_LAST;
6756
6757 if (reduction_type == TREE_CODE_REDUCTION
6758 || reduction_type == FOLD_LEFT_REDUCTION
6759 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6760 || reduction_type == CONST_COND_REDUCTION)
6761 {
6762 if (reduction_type == FOLD_LEFT_REDUCTION
6763 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6764 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6765 {
6766 if (reduc_fn != IFN_LAST
6767 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6768 OPTIMIZE_FOR_SPEED))
6769 {
6770 if (dump_enabled_p ())
6771 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6772 "reduc op not supported by target.\n");
6773
6774 reduc_fn = IFN_LAST;
6775 }
6776 }
6777 else
6778 {
6779 if (!nested_cycle || double_reduc)
6780 {
6781 if (dump_enabled_p ())
6782 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6783 "no reduc code for scalar code.\n");
6784
6785 return false;
6786 }
6787 }
6788 }
6789 else if (reduction_type == COND_REDUCTION)
6790 {
6791 int scalar_precision
6792 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6793 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6794 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6795 nunits_out);
6796
6797 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6798 OPTIMIZE_FOR_SPEED))
6799 reduc_fn = IFN_REDUC_MAX;
6800 }
6801
6802 if (reduction_type != EXTRACT_LAST_REDUCTION
6803 && (!nested_cycle || double_reduc)
6804 && reduc_fn == IFN_LAST
6805 && !nunits_out.is_constant ())
6806 {
6807 if (dump_enabled_p ())
6808 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6809 "missing target support for reduction on"
6810 " variable-length vectors.\n");
6811 return false;
6812 }
6813
6814 /* For SLP reductions, see if there is a neutral value we can use. */
6815 tree neutral_op = NULL_TREE;
6816 if (slp_node)
6817 neutral_op = neutral_op_for_slp_reduction
6818 (slp_node_instance->reduc_phis, code,
6819 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6820
6821 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6822 {
6823 /* We can't support in-order reductions of code such as this:
6824
6825 for (int i = 0; i < n1; ++i)
6826 for (int j = 0; j < n2; ++j)
6827 l += a[j];
6828
6829 since GCC effectively transforms the loop when vectorizing:
6830
6831 for (int i = 0; i < n1 / VF; ++i)
6832 for (int j = 0; j < n2; ++j)
6833 for (int k = 0; k < VF; ++k)
6834 l += a[j];
6835
6836 which is a reassociation of the original operation. */
6837 if (dump_enabled_p ())
6838 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6839 "in-order double reduction not supported.\n");
6840
6841 return false;
6842 }
6843
6844 if (reduction_type == FOLD_LEFT_REDUCTION
6845 && slp_node
6846 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6847 {
6848 /* We cannot use in-order reductions in this case because there is
6849 an implicit reassociation of the operations involved. */
6850 if (dump_enabled_p ())
6851 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6852 "in-order unchained SLP reductions not supported.\n");
6853 return false;
6854 }
6855
6856 /* For double reductions, and for SLP reductions with a neutral value,
6857 we construct a variable-length initial vector by loading a vector
6858 full of the neutral value and then shift-and-inserting the start
6859 values into the low-numbered elements. */
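/* For instance, with a neutral value of 0 and SLP start values a and b,
   the initial vector is built roughly as
     tmp  = { 0, 0, ..., 0 };
     tmp  = VEC_SHL_INSERT (tmp, b);   -> { b, 0, ..., 0 }
     init = VEC_SHL_INSERT (tmp, a);   -> { a, b, 0, ..., 0 }
   which is why IFN_VEC_SHL_INSERT support is required here.  */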
6860 if ((double_reduc || neutral_op)
6861 && !nunits_out.is_constant ()
6862 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6863 vectype_out, OPTIMIZE_FOR_SPEED))
6864 {
6865 if (dump_enabled_p ())
6866 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6867 "reduction on variable-length vectors requires"
6868 " target support for a vector-shift-and-insert"
6869 " operation.\n");
6870 return false;
6871 }
6872
6873 /* Check extra constraints for variable-length unchained SLP reductions. */
6874 if (STMT_SLP_TYPE (stmt_info)
6875 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info)
6876 && !nunits_out.is_constant ())
6877 {
6878 /* We checked above that we could build the initial vector when
6879 there's a neutral element value. Check here for the case in
6880 which each SLP statement has its own initial value and in which
6881 that value needs to be repeated for every instance of the
6882 statement within the initial vector. */
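/* E.g. for an unchained SLP reduction group of three statements with
   start values a, b and c the initial vector has to be
   { a, b, c, a, b, c, ... }, which for variable-length vectors is only
   possible with duplicate-and-interleave support.  */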
6883 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6884 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6885 if (!neutral_op
6886 && !can_duplicate_and_interleave_p (group_size, elt_mode))
6887 {
6888 if (dump_enabled_p ())
6889 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6890 "unsupported form of SLP reduction for"
6891 " variable-length vectors: cannot build"
6892 " initial vector.\n");
6893 return false;
6894 }
6895 /* The epilogue code relies on the number of elements being a multiple
6896 of the group size. The duplicate-and-interleave approach to setting
6897 up the initial vector does too. */
6898 if (!multiple_p (nunits_out, group_size))
6899 {
6900 if (dump_enabled_p ())
6901 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6902 "unsupported form of SLP reduction for"
6903 " variable-length vectors: the vector size"
6904 " is not a multiple of the number of results.\n");
6905 return false;
6906 }
6907 }
6908
6909 /* In case of widening multiplication by a constant, we update the type
6910 of the constant to be the type of the other operand. We check that the
6911 constant fits the type in the pattern recognition pass. */
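/* For example (illustrative only), for a dot-product pattern built from
     sum += (int) a_short[i] * 3;
   the constant 3 has int type while the other multiplication operand is a
   short, so the constant is converted to short here; that it actually fits
   in a short was checked during pattern recognition.  */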
6912 if (code == DOT_PROD_EXPR
6913 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6914 {
6915 if (TREE_CODE (ops[0]) == INTEGER_CST)
6916 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6917 else if (TREE_CODE (ops[1]) == INTEGER_CST)
6918 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6919 else
6920 {
6921 if (dump_enabled_p ())
6922 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6923 "invalid types in dot-prod\n");
6924
6925 return false;
6926 }
6927 }
6928
6929 if (reduction_type == COND_REDUCTION)
6930 {
6931 widest_int ni;
6932
6933 if (! max_loop_iterations (loop, &ni))
6934 {
6935 if (dump_enabled_p ())
6936 dump_printf_loc (MSG_NOTE, vect_location,
6937 "loop count not known, cannot create cond "
6938 "reduction.\n");
6939 return false;
6940 }
6941 /* Convert backedges to iterations. */
6942 ni += 1;
6943
6944 /* The additional index will have the same type as the condition. Check
6945 that the loop iteration count fits into this type less one (the zero
6946 slot is reserved for the case in which there are no matches). */
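/* E.g. with an 8-bit index type the check below allows at most 254
   iterations: the zero slot is taken for "no match" and the iteration
   count must still be strictly less than the type's maximum of 255.  */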
6947 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6948 if (wi::geu_p (ni, wi::to_widest (max_index)))
6949 {
6950 if (dump_enabled_p ())
6951 dump_printf_loc (MSG_NOTE, vect_location,
6952 "loop size is greater than data size.\n");
6953 return false;
6954 }
6955 }
6956
6957 /* In case the vectorization factor (VF) is bigger than the number
6958 of elements that we can fit in a vectype (nunits), we have to generate
6959 more than one vector stmt - i.e., we need to "unroll" the
6960 vector stmt by a factor VF/nunits. For more details see documentation
6961 in vectorizable_operation. */
6962
6963 /* If the reduction is used in an outer loop we need to generate
6964 VF intermediate results, like so (e.g. for ncopies=2):
6965 r0 = phi (init, r0)
6966 r1 = phi (init, r1)
6967 r0 = x0 + r0;
6968 r1 = x1 + r1;
6969 (i.e. we generate VF results in 2 registers).
6970 In this case we have a separate def-use cycle for each copy, and therefore
6971 for each copy we get the vector def for the reduction variable from the
6972 respective phi node created for this copy.
6973
6974 Otherwise (the reduction is not used in an outer loop), we can combine
6975 together intermediate results, like so (e.g. for ncopies=2):
6976 r = phi (init, r)
6977 r = x0 + r;
6978 r = x1 + r;
6979 (i.e. we generate VF/2 results in a single register).
6980 In this case for each copy we get the vector def for the reduction variable
6981 from the vectorized reduction operation generated in the previous iteration.
6982
6983 This only works when we see both the reduction PHI and its only consumer
6984 in vectorizable_reduction and there are no intermediate stmts
6985 participating. */
6986 stmt_vec_info use_stmt_info;
6987 tree reduc_phi_result = gimple_phi_result (reduc_def_phi);
6988 if (ncopies > 1
6989 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6990 && (use_stmt_info = loop_vinfo->lookup_single_use (reduc_phi_result))
6991 && vect_stmt_to_vectorize (use_stmt_info) == stmt_info)
6992 {
6993 single_defuse_cycle = true;
6994 epilog_copies = 1;
6995 }
6996 else
6997 epilog_copies = ncopies;
6998
6999 /* If the reduction stmt is one of the patterns that have an embedded
7000 lane-reducing operation, we cannot handle the case of ! single_defuse_cycle. */
7001 if ((ncopies > 1
7002 && ! single_defuse_cycle)
7003 && (code == DOT_PROD_EXPR
7004 || code == WIDEN_SUM_EXPR
7005 || code == SAD_EXPR))
7006 {
7007 if (dump_enabled_p ())
7008 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7009 "multi def-use cycle not possible for lane-reducing "
7010 "reduction operation\n");
7011 return false;
7012 }
7013
7014 if (slp_node)
7015 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7016 else
7017 vec_num = 1;
7018
7019 internal_fn cond_fn = get_conditional_internal_fn (code);
7020 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7021 bool mask_by_cond_expr = use_mask_by_cond_expr_p (code, cond_fn, vectype_in);
7022
7023 if (!vec_stmt) /* transformation not required. */
7024 {
7025 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
7026 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7027 {
7028 if (reduction_type != FOLD_LEFT_REDUCTION
7029 && !mask_by_cond_expr
7030 && (cond_fn == IFN_LAST
7031 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7032 OPTIMIZE_FOR_SPEED)))
7033 {
7034 if (dump_enabled_p ())
7035 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7036 "can't use a fully-masked loop because no"
7037 " conditional operation is available.\n");
7038 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7039 }
7040 else if (reduc_index == -1)
7041 {
7042 if (dump_enabled_p ())
7043 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7044 "can't use a fully-masked loop for chained"
7045 " reductions.\n");
7046 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7047 }
7048 else
7049 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7050 vectype_in);
7051 }
7052 if (dump_enabled_p ()
7053 && reduction_type == FOLD_LEFT_REDUCTION)
7054 dump_printf_loc (MSG_NOTE, vect_location,
7055 "using an in-order (fold-left) reduction.\n");
7056 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7057 return true;
7058 }
7059
7060 /* Transform. */
7061
7062 if (dump_enabled_p ())
7063 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7064
7065 /* FORNOW: Multiple types are not supported for COND_EXPR. */
7066 if (code == COND_EXPR)
7067 gcc_assert (ncopies == 1);
7068
7069 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7070
7071 if (reduction_type == FOLD_LEFT_REDUCTION)
7072 return vectorize_fold_left_reduction
7073 (stmt_info, gsi, vec_stmt, slp_node, reduc_def_phi, code,
7074 reduc_fn, ops, vectype_in, reduc_index, masks);
7075
7076 if (reduction_type == EXTRACT_LAST_REDUCTION)
7077 {
7078 gcc_assert (!slp_node);
7079 return vectorizable_condition (stmt_info, gsi, vec_stmt,
7080 true, NULL, NULL);
7081 }
7082
7083 /* Create the destination vector */
7084 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7085
7086 prev_stmt_info = NULL;
7087 prev_phi_info = NULL;
7088 if (!slp_node)
7089 {
7090 vec_oprnds0.create (1);
7091 vec_oprnds1.create (1);
7092 if (op_type == ternary_op)
7093 vec_oprnds2.create (1);
7094 }
7095
7096 phis.create (vec_num);
7097 vect_defs.create (vec_num);
7098 if (!slp_node)
7099 vect_defs.quick_push (NULL_TREE);
7100
7101 if (slp_node)
7102 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7103 else
7104 phis.quick_push (STMT_VINFO_VEC_STMT (reduc_def_info));
7105
7106 for (j = 0; j < ncopies; j++)
7107 {
7108 if (code == COND_EXPR)
7109 {
7110 gcc_assert (!slp_node);
7111 vectorizable_condition (stmt_info, gsi, vec_stmt,
7112 true, NULL, NULL);
7113 break;
7114 }
7115 if (code == LSHIFT_EXPR
7116 || code == RSHIFT_EXPR)
7117 {
7118 vectorizable_shift (stmt_info, gsi, vec_stmt, slp_node, NULL);
7119 break;
7120 }
7121
7122 /* Handle uses. */
7123 if (j == 0)
7124 {
7125 if (slp_node)
7126 {
7127 /* Get vec defs for all the operands except the reduction index,
7128 ensuring the ordering of the ops in the vector is kept. */
7129 auto_vec<tree, 3> slp_ops;
7130 auto_vec<vec<tree>, 3> vec_defs;
7131
7132 slp_ops.quick_push (ops[0]);
7133 slp_ops.quick_push (ops[1]);
7134 if (op_type == ternary_op)
7135 slp_ops.quick_push (ops[2]);
7136
7137 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7138
7139 vec_oprnds0.safe_splice (vec_defs[0]);
7140 vec_defs[0].release ();
7141 vec_oprnds1.safe_splice (vec_defs[1]);
7142 vec_defs[1].release ();
7143 if (op_type == ternary_op)
7144 {
7145 vec_oprnds2.safe_splice (vec_defs[2]);
7146 vec_defs[2].release ();
7147 }
7148 }
7149 else
7150 {
7151 vec_oprnds0.quick_push
7152 (vect_get_vec_def_for_operand (ops[0], stmt_info));
7153 vec_oprnds1.quick_push
7154 (vect_get_vec_def_for_operand (ops[1], stmt_info));
7155 if (op_type == ternary_op)
7156 vec_oprnds2.quick_push
7157 (vect_get_vec_def_for_operand (ops[2], stmt_info));
7158 }
7159 }
7160 else
7161 {
7162 if (!slp_node)
7163 {
7164 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7165
7166 if (single_defuse_cycle && reduc_index == 0)
7167 vec_oprnds0[0] = gimple_get_lhs (new_stmt_info->stmt);
7168 else
7169 vec_oprnds0[0]
7170 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7171 vec_oprnds0[0]);
7172 if (single_defuse_cycle && reduc_index == 1)
7173 vec_oprnds1[0] = gimple_get_lhs (new_stmt_info->stmt);
7174 else
7175 vec_oprnds1[0]
7176 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7177 vec_oprnds1[0]);
7178 if (op_type == ternary_op)
7179 {
7180 if (single_defuse_cycle && reduc_index == 2)
7181 vec_oprnds2[0] = gimple_get_lhs (new_stmt_info->stmt);
7182 else
7183 vec_oprnds2[0]
7184 = vect_get_vec_def_for_stmt_copy (loop_vinfo,
7185 vec_oprnds2[0]);
7186 }
7187 }
7188 }
7189
7190 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7191 {
7192 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7193 if (masked_loop_p && !mask_by_cond_expr)
7194 {
7195 /* Make sure that the reduction accumulator is vop[0]. */
7196 if (reduc_index == 1)
7197 {
7198 gcc_assert (commutative_tree_code (code));
7199 std::swap (vop[0], vop[1]);
7200 }
7201 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7202 vectype_in, i * ncopies + j);
7203 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7204 vop[0], vop[1],
7205 vop[0]);
7206 new_temp = make_ssa_name (vec_dest, call);
7207 gimple_call_set_lhs (call, new_temp);
7208 gimple_call_set_nothrow (call, true);
7209 new_stmt_info
7210 = vect_finish_stmt_generation (stmt_info, call, gsi);
7211 }
7212 else
7213 {
7214 if (op_type == ternary_op)
7215 vop[2] = vec_oprnds2[i];
7216
7217 if (masked_loop_p && mask_by_cond_expr)
7218 {
7219 tree mask = vect_get_loop_mask (gsi, masks,
7220 vec_num * ncopies,
7221 vectype_in, i * ncopies + j);
7222 build_vect_cond_expr (code, vop, mask, gsi);
7223 }
7224
7225 gassign *new_stmt = gimple_build_assign (vec_dest, code,
7226 vop[0], vop[1], vop[2]);
7227 new_temp = make_ssa_name (vec_dest, new_stmt);
7228 gimple_assign_set_lhs (new_stmt, new_temp);
7229 new_stmt_info
7230 = vect_finish_stmt_generation (stmt_info, new_stmt, gsi);
7231 }
7232
7233 if (slp_node)
7234 {
7235 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt_info);
7236 vect_defs.quick_push (new_temp);
7237 }
7238 else
7239 vect_defs[0] = new_temp;
7240 }
7241
7242 if (slp_node)
7243 continue;
7244
7245 if (j == 0)
7246 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt_info;
7247 else
7248 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt_info;
7249
7250 prev_stmt_info = new_stmt_info;
7251 }
7252
7253 /* Finalize the reduction-phi (set its arguments) and create the
7254 epilog reduction code. */
7255 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7256 vect_defs[0] = gimple_get_lhs ((*vec_stmt)->stmt);
7257
7258 vect_create_epilog_for_reduction (vect_defs, stmt_info, reduc_def_phi,
7259 epilog_copies, reduc_fn, phis,
7260 double_reduc, slp_node, slp_node_instance,
7261 cond_reduc_val, cond_reduc_op_code,
7262 neutral_op);
7263
7264 return true;
7265 }
7266
7267 /* Function vect_min_worthwhile_factor.
7268
7269 For a loop where we could vectorize the operation indicated by CODE,
7270 return the minimum vectorization factor that makes it worthwhile
7271 to use generic vectors. */
7272 static unsigned int
7273 vect_min_worthwhile_factor (enum tree_code code)
7274 {
7275 switch (code)
7276 {
7277 case PLUS_EXPR:
7278 case MINUS_EXPR:
7279 case NEGATE_EXPR:
7280 return 4;
7281
7282 case BIT_AND_EXPR:
7283 case BIT_IOR_EXPR:
7284 case BIT_XOR_EXPR:
7285 case BIT_NOT_EXPR:
7286 return 2;
7287
7288 default:
7289 return INT_MAX;
7290 }
7291 }
7292
7293 /* Return true if VINFO indicates we are doing loop vectorization and if
7294 it is worth decomposing CODE operations into scalar operations for
7295 that loop's vectorization factor. */
7296
7297 bool
7298 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7299 {
7300 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7301 unsigned HOST_WIDE_INT value;
7302 return (loop_vinfo
7303 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7304 && value >= vect_min_worthwhile_factor (code));
7305 }
7306
7307 /* Function vectorizable_induction
7308
7309 Check if STMT_INFO performs an induction computation that can be vectorized.
7310 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7311 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7312 Return true if STMT_INFO is vectorizable in this way. */
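/* An illustrative sketch (not from the sources): for a scalar IV with
   initial value X and step S, and a vectorization factor of 4, the
   vectorized PHI starts at { X, X+S, X+2*S, X+3*S } and is advanced by the
   invariant step vector { 4*S, 4*S, 4*S, 4*S } on every vector iteration;
   the code below implements the general form of this.  */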
7313
7314 bool
7315 vectorizable_induction (stmt_vec_info stmt_info,
7316 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7317 stmt_vec_info *vec_stmt, slp_tree slp_node,
7318 stmt_vector_for_cost *cost_vec)
7319 {
7320 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7321 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7322 unsigned ncopies;
7323 bool nested_in_vect_loop = false;
7324 struct loop *iv_loop;
7325 tree vec_def;
7326 edge pe = loop_preheader_edge (loop);
7327 basic_block new_bb;
7328 tree new_vec, vec_init, vec_step, t;
7329 tree new_name;
7330 gimple *new_stmt;
7331 gphi *induction_phi;
7332 tree induc_def, vec_dest;
7333 tree init_expr, step_expr;
7334 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7335 unsigned i;
7336 tree expr;
7337 gimple_seq stmts;
7338 imm_use_iterator imm_iter;
7339 use_operand_p use_p;
7340 gimple *exit_phi;
7341 edge latch_e;
7342 tree loop_arg;
7343 gimple_stmt_iterator si;
7344
7345 gphi *phi = dyn_cast <gphi *> (stmt_info->stmt);
7346 if (!phi)
7347 return false;
7348
7349 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7350 return false;
7351
7352 /* Make sure it was recognized as induction computation. */
7353 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7354 return false;
7355
7356 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7357 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7358
7359 if (slp_node)
7360 ncopies = 1;
7361 else
7362 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7363 gcc_assert (ncopies >= 1);
7364
7365 /* FORNOW. These restrictions should be relaxed. */
7366 if (nested_in_vect_loop_p (loop, stmt_info))
7367 {
7368 imm_use_iterator imm_iter;
7369 use_operand_p use_p;
7370 gimple *exit_phi;
7371 edge latch_e;
7372 tree loop_arg;
7373
7374 if (ncopies > 1)
7375 {
7376 if (dump_enabled_p ())
7377 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7378 "multiple types in nested loop.\n");
7379 return false;
7380 }
7381
7382 /* FORNOW: outer loop induction with SLP not supported. */
7383 if (STMT_SLP_TYPE (stmt_info))
7384 return false;
7385
7386 exit_phi = NULL;
7387 latch_e = loop_latch_edge (loop->inner);
7388 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7389 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7390 {
7391 gimple *use_stmt = USE_STMT (use_p);
7392 if (is_gimple_debug (use_stmt))
7393 continue;
7394
7395 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7396 {
7397 exit_phi = use_stmt;
7398 break;
7399 }
7400 }
7401 if (exit_phi)
7402 {
7403 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7404 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7405 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7406 {
7407 if (dump_enabled_p ())
7408 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7409 "inner-loop induction only used outside "
7410 "of the outer vectorized loop.\n");
7411 return false;
7412 }
7413 }
7414
7415 nested_in_vect_loop = true;
7416 iv_loop = loop->inner;
7417 }
7418 else
7419 iv_loop = loop;
7420 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7421
7422 if (slp_node && !nunits.is_constant ())
7423 {
7424 /* The current SLP code creates the initial value element-by-element. */
7425 if (dump_enabled_p ())
7426 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7427 "SLP induction not supported for variable-length"
7428 " vectors.\n");
7429 return false;
7430 }
7431
7432 if (!vec_stmt) /* transformation not required. */
7433 {
7434 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7435 DUMP_VECT_SCOPE ("vectorizable_induction");
7436 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7437 return true;
7438 }
7439
7440 /* Transform. */
7441
7442 /* Compute a vector variable, initialized with the first VF values of
7443 the induction variable. E.g., for an iv with IV_PHI='X' and
7444 evolution S, for a vector of 4 units, we want to compute:
7445 [X, X + S, X + 2*S, X + 3*S]. */
7446
7447 if (dump_enabled_p ())
7448 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7449
7450 latch_e = loop_latch_edge (iv_loop);
7451 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7452
7453 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7454 gcc_assert (step_expr != NULL_TREE);
7455
7456 pe = loop_preheader_edge (iv_loop);
7457 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7458 loop_preheader_edge (iv_loop));
7459
7460 stmts = NULL;
7461 if (!nested_in_vect_loop)
7462 {
7463 /* Convert the initial value to the desired type. */
7464 tree new_type = TREE_TYPE (vectype);
7465 init_expr = gimple_convert (&stmts, new_type, init_expr);
7466
7467 /* If we are using the loop mask to "peel" for alignment then we need
7468 to adjust the start value here. */
7469 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7470 if (skip_niters != NULL_TREE)
7471 {
7472 if (FLOAT_TYPE_P (vectype))
7473 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7474 skip_niters);
7475 else
7476 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7477 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7478 skip_niters, step_expr);
7479 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7480 init_expr, skip_step);
7481 }
7482 }
7483
7484 /* Convert the step to the desired type. */
7485 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7486
7487 if (stmts)
7488 {
7489 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7490 gcc_assert (!new_bb);
7491 }
7492
7493 /* Find the first insertion point in the BB. */
7494 basic_block bb = gimple_bb (phi);
7495 si = gsi_after_labels (bb);
7496
7497 /* For SLP induction we have to generate several IVs; for example,
7498 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7499 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7500 [VF*S, VF*S, VF*S, VF*S] for all. */
7501 if (slp_node)
7502 {
7503 /* Enforced above. */
7504 unsigned int const_nunits = nunits.to_constant ();
7505
7506 /* Generate [VF*S, VF*S, ... ]. */
7507 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7508 {
7509 expr = build_int_cst (integer_type_node, vf);
7510 expr = fold_convert (TREE_TYPE (step_expr), expr);
7511 }
7512 else
7513 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7514 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7515 expr, step_expr);
7516 if (! CONSTANT_CLASS_P (new_name))
7517 new_name = vect_init_vector (stmt_info, new_name,
7518 TREE_TYPE (step_expr), NULL);
7519 new_vec = build_vector_from_val (vectype, new_name);
7520 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7521
7522 /* Now generate the IVs. */
7523 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7524 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7525 unsigned elts = const_nunits * nvects;
7526 unsigned nivs = least_common_multiple (group_size,
7527 const_nunits) / const_nunits;
7528 gcc_assert (elts % group_size == 0);
7529 tree elt = init_expr;
7530 unsigned ivn;
7531 for (ivn = 0; ivn < nivs; ++ivn)
7532 {
7533 tree_vector_builder elts (vectype, const_nunits, 1);
7534 stmts = NULL;
7535 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7536 {
7537 if (ivn*const_nunits + eltn >= group_size
7538 && (ivn * const_nunits + eltn) % group_size == 0)
7539 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7540 elt, step_expr);
7541 elts.quick_push (elt);
7542 }
7543 vec_init = gimple_build_vector (&stmts, &elts);
7544 if (stmts)
7545 {
7546 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7547 gcc_assert (!new_bb);
7548 }
7549
7550 /* Create the induction-phi that defines the induction-operand. */
7551 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7552 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7553 stmt_vec_info induction_phi_info
7554 = loop_vinfo->add_stmt (induction_phi);
7555 induc_def = PHI_RESULT (induction_phi);
7556
7557 /* Create the iv update inside the loop */
7558 vec_def = make_ssa_name (vec_dest);
7559 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7560 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7561 loop_vinfo->add_stmt (new_stmt);
7562
7563 /* Set the arguments of the phi node: */
7564 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7565 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7566 UNKNOWN_LOCATION);
7567
7568 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi_info);
7569 }
7570
7571 /* Re-use IVs when we can. */
7572 if (ivn < nvects)
7573 {
7574 unsigned vfp
7575 = least_common_multiple (group_size, const_nunits) / group_size;
7576 /* Generate [VF'*S, VF'*S, ... ]. */
7577 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7578 {
7579 expr = build_int_cst (integer_type_node, vfp);
7580 expr = fold_convert (TREE_TYPE (step_expr), expr);
7581 }
7582 else
7583 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7584 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7585 expr, step_expr);
7586 if (! CONSTANT_CLASS_P (new_name))
7587 new_name = vect_init_vector (stmt_info, new_name,
7588 TREE_TYPE (step_expr), NULL);
7589 new_vec = build_vector_from_val (vectype, new_name);
7590 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7591 for (; ivn < nvects; ++ivn)
7592 {
7593 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs]->stmt;
7594 tree def;
7595 if (gimple_code (iv) == GIMPLE_PHI)
7596 def = gimple_phi_result (iv);
7597 else
7598 def = gimple_assign_lhs (iv);
7599 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7600 PLUS_EXPR,
7601 def, vec_step);
7602 if (gimple_code (iv) == GIMPLE_PHI)
7603 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7604 else
7605 {
7606 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7607 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7608 }
7609 SLP_TREE_VEC_STMTS (slp_node).quick_push
7610 (loop_vinfo->add_stmt (new_stmt));
7611 }
7612 }
7613
7614 return true;
7615 }
7616
7617 /* Create the vector that holds the initial_value of the induction. */
7618 if (nested_in_vect_loop)
7619 {
7620 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7621 been created during vectorization of previous stmts. We obtain it
7622 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7623 vec_init = vect_get_vec_def_for_operand (init_expr, stmt_info);
7624 /* If the initial value is not of proper type, convert it. */
7625 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7626 {
7627 new_stmt
7628 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7629 vect_simple_var,
7630 "vec_iv_"),
7631 VIEW_CONVERT_EXPR,
7632 build1 (VIEW_CONVERT_EXPR, vectype,
7633 vec_init));
7634 vec_init = gimple_assign_lhs (new_stmt);
7635 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7636 new_stmt);
7637 gcc_assert (!new_bb);
7638 loop_vinfo->add_stmt (new_stmt);
7639 }
7640 }
7641 else
7642 {
7643 /* iv_loop is the loop to be vectorized. Create:
7644 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7645 stmts = NULL;
7646 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7647
7648 unsigned HOST_WIDE_INT const_nunits;
7649 if (nunits.is_constant (&const_nunits))
7650 {
7651 tree_vector_builder elts (vectype, const_nunits, 1);
7652 elts.quick_push (new_name);
7653 for (i = 1; i < const_nunits; i++)
7654 {
7655 /* Create: new_name_i = new_name + step_expr */
7656 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7657 new_name, step_expr);
7658 elts.quick_push (new_name);
7659 }
7660 /* Create a vector from [new_name_0, new_name_1, ...,
7661 new_name_nunits-1] */
7662 vec_init = gimple_build_vector (&stmts, &elts);
7663 }
7664 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7665 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7666 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7667 new_name, step_expr);
7668 else
7669 {
7670 /* Build:
7671 [base, base, base, ...]
7672 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7673 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7674 gcc_assert (flag_associative_math);
7675 tree index = build_index_vector (vectype, 0, 1);
7676 tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7677 new_name);
7678 tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7679 step_expr);
7680 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7681 vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7682 vec_init, step_vec);
7683 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7684 vec_init, base_vec);
7685 }
7686
7687 if (stmts)
7688 {
7689 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7690 gcc_assert (!new_bb);
7691 }
7692 }
7693
7694
7695 /* Create the vector that holds the step of the induction. */
7696 if (nested_in_vect_loop)
7697 /* iv_loop is nested in the loop to be vectorized. Generate:
7698 vec_step = [S, S, S, S] */
7699 new_name = step_expr;
7700 else
7701 {
7702 /* iv_loop is the loop to be vectorized. Generate:
7703 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7704 gimple_seq seq = NULL;
7705 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7706 {
7707 expr = build_int_cst (integer_type_node, vf);
7708 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7709 }
7710 else
7711 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7712 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7713 expr, step_expr);
7714 if (seq)
7715 {
7716 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7717 gcc_assert (!new_bb);
7718 }
7719 }
7720
7721 t = unshare_expr (new_name);
7722 gcc_assert (CONSTANT_CLASS_P (new_name)
7723 || TREE_CODE (new_name) == SSA_NAME);
7724 new_vec = build_vector_from_val (vectype, t);
7725 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7726
7727
7728 /* Create the following def-use cycle:
7729 loop prolog:
7730 vec_init = ...
7731 vec_step = ...
7732 loop:
7733 vec_iv = PHI <vec_init, vec_loop>
7734 ...
7735 STMT
7736 ...
7737 vec_loop = vec_iv + vec_step; */
7738
7739 /* Create the induction-phi that defines the induction-operand. */
7740 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7741 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7742 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7743 induc_def = PHI_RESULT (induction_phi);
7744
7745 /* Create the iv update inside the loop */
7746 vec_def = make_ssa_name (vec_dest);
7747 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7748 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7749 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7750
7751 /* Set the arguments of the phi node: */
7752 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7753 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7754 UNKNOWN_LOCATION);
7755
7756 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi_info;
7757
7758 /* In case the vectorization factor (VF) is bigger than the number
7759 of elements that we can fit in a vectype (nunits), we have to generate
7760 more than one vector stmt - i.e., we need to "unroll" the
7761 vector stmt by a factor VF/nunits. For more details see documentation
7762 in vectorizable_operation. */
7763
7764 if (ncopies > 1)
7765 {
7766 gimple_seq seq = NULL;
7767 stmt_vec_info prev_stmt_vinfo;
7768 /* FORNOW. This restriction should be relaxed. */
7769 gcc_assert (!nested_in_vect_loop);
7770
7771 /* Create the vector that holds the step of the induction. */
7772 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7773 {
7774 expr = build_int_cst (integer_type_node, nunits);
7775 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7776 }
7777 else
7778 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7779 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7780 expr, step_expr);
7781 if (seq)
7782 {
7783 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7784 gcc_assert (!new_bb);
7785 }
7786
7787 t = unshare_expr (new_name);
7788 gcc_assert (CONSTANT_CLASS_P (new_name)
7789 || TREE_CODE (new_name) == SSA_NAME);
7790 new_vec = build_vector_from_val (vectype, t);
7791 vec_step = vect_init_vector (stmt_info, new_vec, vectype, NULL);
7792
7793 vec_def = induc_def;
7794 prev_stmt_vinfo = induction_phi_info;
7795 for (i = 1; i < ncopies; i++)
7796 {
7797 /* vec_i = vec_prev + vec_step */
7798 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7799 vec_def, vec_step);
7800 vec_def = make_ssa_name (vec_dest, new_stmt);
7801 gimple_assign_set_lhs (new_stmt, vec_def);
7802
7803 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7804 new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7805 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt_info;
7806 prev_stmt_vinfo = new_stmt_info;
7807 }
7808 }
7809
7810 if (nested_in_vect_loop)
7811 {
7812 /* Find the loop-closed exit-phi of the induction, and record
7813 the final vector of induction results: */
7814 exit_phi = NULL;
7815 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7816 {
7817 gimple *use_stmt = USE_STMT (use_p);
7818 if (is_gimple_debug (use_stmt))
7819 continue;
7820
7821 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7822 {
7823 exit_phi = use_stmt;
7824 break;
7825 }
7826 }
7827 if (exit_phi)
7828 {
7829 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7830 /* FORNOW. Currently not supporting the case that an inner-loop induction
7831 is not used in the outer-loop (i.e. only outside the outer-loop). */
7832 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7833 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7834
7835 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt_info;
7836 if (dump_enabled_p ())
7837 dump_printf_loc (MSG_NOTE, vect_location,
7838 "vector of inductions after inner-loop:%G",
7839 new_stmt);
7840 }
7841 }
7842
7843
7844 if (dump_enabled_p ())
7845 dump_printf_loc (MSG_NOTE, vect_location,
7846 "transform induction: created def-use cycle: %G%G",
7847 induction_phi, SSA_NAME_DEF_STMT (vec_def));
7848
7849 return true;
7850 }
7851
7852 /* Function vectorizable_live_operation.
7853
7854 STMT_INFO computes a value that is used outside the loop. Check if
7855 it can be supported. */
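/* A typical example (illustrative only):

     for (i = 0; i < n; i++)
       last = a[i];
     ... use of last after the loop ...

   Here LAST is live after the loop, so after vectorization its value has to
   be extracted from the final lane of the last vector copy, or via
   EXTRACT_LAST when the loop is fully masked.  */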
7856
7857 bool
7858 vectorizable_live_operation (stmt_vec_info stmt_info,
7859 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7860 slp_tree slp_node, int slp_index,
7861 stmt_vec_info *vec_stmt,
7862 stmt_vector_for_cost *)
7863 {
7864 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7865 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7866 imm_use_iterator imm_iter;
7867 tree lhs, lhs_type, bitsize, vec_bitsize;
7868 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7869 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7870 int ncopies;
7871 gimple *use_stmt;
7872 auto_vec<tree> vec_oprnds;
7873 int vec_entry = 0;
7874 poly_uint64 vec_index = 0;
7875
7876 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7877
7878 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7879 return false;
7880
7881 /* FORNOW. CHECKME. */
7882 if (nested_in_vect_loop_p (loop, stmt_info))
7883 return false;
7884
7885 /* If STMT is not relevant and it is a simple assignment and its inputs are
7886 invariant then it can remain in place, unvectorized. The original last
7887 scalar value that it computes will be used. */
7888 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7889 {
7890 gcc_assert (is_simple_and_all_uses_invariant (stmt_info, loop_vinfo));
7891 if (dump_enabled_p ())
7892 dump_printf_loc (MSG_NOTE, vect_location,
7893 "statement is simple and uses invariant. Leaving in "
7894 "place.\n");
7895 return true;
7896 }
7897
7898 if (slp_node)
7899 ncopies = 1;
7900 else
7901 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7902
7903 if (slp_node)
7904 {
7905 gcc_assert (slp_index >= 0);
7906
7907 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7908 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7909
7910 /* Get the last occurrence of the scalar index from the concatenation of
7911 all the slp vectors. Calculate which slp vector it is and the index
7912 within. */
7913 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
7914
7915 /* Calculate which vector contains the result, and which lane of
7916 that vector we need. */
7917 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7918 {
7919 if (dump_enabled_p ())
7920 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7921 "Cannot determine which vector holds the"
7922 " final result.\n");
7923 return false;
7924 }
7925 }
7926
7927 if (!vec_stmt)
7928 {
7929 /* No transformation required. */
7930 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7931 {
7932 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7933 OPTIMIZE_FOR_SPEED))
7934 {
7935 if (dump_enabled_p ())
7936 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7937 "can't use a fully-masked loop because "
7938 "the target doesn't support extract last "
7939 "reduction.\n");
7940 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7941 }
7942 else if (slp_node)
7943 {
7944 if (dump_enabled_p ())
7945 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7946 "can't use a fully-masked loop because an "
7947 "SLP statement is live after the loop.\n");
7948 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7949 }
7950 else if (ncopies > 1)
7951 {
7952 if (dump_enabled_p ())
7953 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7954 "can't use a fully-masked loop because"
7955 " ncopies is greater than 1.\n");
7956 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7957 }
7958 else
7959 {
7960 gcc_assert (ncopies == 1 && !slp_node);
7961 vect_record_loop_mask (loop_vinfo,
7962 &LOOP_VINFO_MASKS (loop_vinfo),
7963 1, vectype);
7964 }
7965 }
7966 return true;
7967 }
7968
7969 /* Use the lhs of the original scalar statement. */
7970 gimple *stmt = vect_orig_stmt (stmt_info)->stmt;
7971
7972 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7973 : gimple_get_lhs (stmt);
7974 lhs_type = TREE_TYPE (lhs);
7975
7976 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7977 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7978 : TYPE_SIZE (TREE_TYPE (vectype)));
7979 vec_bitsize = TYPE_SIZE (vectype);
7980
7981 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
7982 tree vec_lhs, bitstart;
7983 if (slp_node)
7984 {
7985 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7986
7987 /* Get the correct slp vectorized stmt. */
7988 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry]->stmt;
7989 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
7990 vec_lhs = gimple_phi_result (phi);
7991 else
7992 vec_lhs = gimple_get_lhs (vec_stmt);
7993
7994 /* Get entry to use. */
7995 bitstart = bitsize_int (vec_index);
7996 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7997 }
7998 else
7999 {
8000 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
8001 vec_lhs = vect_get_vec_def_for_operand_1 (stmt_info, dt);
8002 gcc_checking_assert (ncopies == 1
8003 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8004
8005 /* For multiple copies, get the last copy. */
8006 for (int i = 1; i < ncopies; ++i)
8007 vec_lhs = vect_get_vec_def_for_stmt_copy (loop_vinfo, vec_lhs);
8008
8009 /* Get the last lane in the vector. */
8010 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
8011 }
8012
8013 gimple_seq stmts = NULL;
8014 tree new_tree;
8015 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8016 {
8017 /* Emit:
8018
8019 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8020
8021 where VEC_LHS is the vectorized live-out result and MASK is
8022 the loop mask for the final iteration. */
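/* For instance, if the final loop mask were { 1, 1, 1, 0 }, EXTRACT_LAST
   would return element 2 of VEC_LHS, i.e. the value computed by the last
   active lane.  */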
8023 gcc_assert (ncopies == 1 && !slp_node);
8024 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8025 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8026 1, vectype, 0);
8027 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
8028 scalar_type, mask, vec_lhs);
8029
8030 /* Convert the extracted vector element to the required scalar type. */
8031 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8032 }
8033 else
8034 {
8035 tree bftype = TREE_TYPE (vectype);
8036 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8037 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8038 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
8039 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8040 &stmts, true, NULL_TREE);
8041 }
8042
8043 if (stmts)
8044 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
8045
8046 /* Replace uses of the scalar lhs with the newly computed result. If the
8047 use stmt is a single-argument PHI, just replace all uses of the PHI result.
8048 This is necessary because the lcssa PHI defining lhs may come before the newly inserted stmt. */
8049 use_operand_p use_p;
8050 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8051 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8052 && !is_gimple_debug (use_stmt))
8053 {
8054 if (gimple_code (use_stmt) == GIMPLE_PHI
8055 && gimple_phi_num_args (use_stmt) == 1)
8056 {
8057 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8058 }
8059 else
8060 {
8061 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8062 SET_USE (use_p, new_tree);
8063 }
8064 update_stmt (use_stmt);
8065 }
8066
8067 return true;
8068 }
8069
8070 /* Kill any debug uses outside LOOP of SSA names defined in STMT_INFO. */
8071
8072 static void
8073 vect_loop_kill_debug_uses (struct loop *loop, stmt_vec_info stmt_info)
8074 {
8075 ssa_op_iter op_iter;
8076 imm_use_iterator imm_iter;
8077 def_operand_p def_p;
8078 gimple *ustmt;
8079
8080 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt_info->stmt, op_iter, SSA_OP_DEF)
8081 {
8082 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8083 {
8084 basic_block bb;
8085
8086 if (!is_gimple_debug (ustmt))
8087 continue;
8088
8089 bb = gimple_bb (ustmt);
8090
8091 if (!flow_bb_inside_loop_p (loop, bb))
8092 {
8093 if (gimple_debug_bind_p (ustmt))
8094 {
8095 if (dump_enabled_p ())
8096 dump_printf_loc (MSG_NOTE, vect_location,
8097 "killing debug use\n");
8098
8099 gimple_debug_bind_reset_value (ustmt);
8100 update_stmt (ustmt);
8101 }
8102 else
8103 gcc_unreachable ();
8104 }
8105 }
8106 }
8107 }
8108
8109 /* Given a loop represented by LOOP_VINFO, return true if computation of
8110 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8111 otherwise. */
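/* For example, if the niter expression has an 8-bit unsigned type and the
   loop runs 256 times, then NITERSM1 == 255 is representable but NITERS
   wraps around to 0; that is the overflow this function detects.  */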
8112
8113 static bool
8114 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8115 {
8116 /* Constant case. */
8117 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8118 {
8119 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8120 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8121
8122 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8123 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8124 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8125 return true;
8126 }
8127
8128 widest_int max;
8129 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8130 /* Check the upper bound of loop niters. */
8131 if (get_max_loop_iterations (loop, &max))
8132 {
8133 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8134 signop sgn = TYPE_SIGN (type);
8135 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8136 if (max < type_max)
8137 return true;
8138 }
8139 return false;
8140 }
8141
8142 /* Return a mask type with half the number of elements as TYPE. */
8143
8144 tree
8145 vect_halve_mask_nunits (tree type)
8146 {
8147 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8148 return build_truth_vector_type (nunits, current_vector_size);
8149 }
8150
8151 /* Return a mask type with twice as many elements as TYPE. */
8152
8153 tree
8154 vect_double_mask_nunits (tree type)
8155 {
8156 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8157 return build_truth_vector_type (nunits, current_vector_size);
8158 }
8159
8160 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8161 contain a sequence of NVECTORS masks that each control a vector of type
8162 VECTYPE. */
8163
8164 void
8165 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8166 unsigned int nvectors, tree vectype)
8167 {
8168 gcc_assert (nvectors != 0);
8169 if (masks->length () < nvectors)
8170 masks->safe_grow_cleared (nvectors);
8171 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8172 /* The number of scalars per iteration and the number of vectors are
8173 both compile-time constants. */
8174 unsigned int nscalars_per_iter
8175 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8176 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8177 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8178 {
8179 rgm->max_nscalars_per_iter = nscalars_per_iter;
8180 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8181 }
8182 }
8183
8184 /* Given a complete set of masks MASKS, extract mask number INDEX
8185 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8186 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8187
8188 See the comment above vec_loop_masks for more details about the mask
8189 arrangement. */
8190
8191 tree
8192 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8193 unsigned int nvectors, tree vectype, unsigned int index)
8194 {
8195 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8196 tree mask_type = rgm->mask_type;
8197
8198 /* Populate the rgroup's mask array, if this is the first time we've
8199 used it. */
8200 if (rgm->masks.is_empty ())
8201 {
8202 rgm->masks.safe_grow_cleared (nvectors);
8203 for (unsigned int i = 0; i < nvectors; ++i)
8204 {
8205 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8206 /* Provide a dummy definition until the real one is available. */
8207 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8208 rgm->masks[i] = mask;
8209 }
8210 }
8211
8212 tree mask = rgm->masks[index];
8213 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8214 TYPE_VECTOR_SUBPARTS (vectype)))
8215 {
8216 /* A loop mask for data type X can be reused for data type Y
8217 if X has N times more elements than Y and if Y's elements
8218 are N times bigger than X's. In this case each sequence
8219 of N elements in the loop mask will be all-zero or all-one.
8220 We can then view-convert the mask so that each sequence of
8221 N elements is replaced by a single element. */
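/* For example, a mask computed for 16 byte elements can be reused for a
   vector of 8 halfword elements (N == 2): each pair of byte-mask elements
   is known to be all-zero or all-one, and the VIEW_CONVERT below turns it
   into the 8-element mask that is actually needed.  */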
8222 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8223 TYPE_VECTOR_SUBPARTS (vectype)));
8224 gimple_seq seq = NULL;
8225 mask_type = build_same_sized_truth_vector_type (vectype);
8226 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8227 if (seq)
8228 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8229 }
8230 return mask;
8231 }
8232
8233 /* Scale the profiling counters of LOOP, which is vectorized by factor VF,
8234 according to the estimated iteration count of the vectorized loop. */
8235
8236 static void
8237 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8238 {
8239 edge preheader = loop_preheader_edge (loop);
8240 /* Reduce loop iterations by the vectorization factor. */
8241 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8242 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8243
8244 if (freq_h.nonzero_p ())
8245 {
8246 profile_probability p;
8247
8248 /* Avoid dropping loop body profile counter to 0 because of zero count
8249 in loop's preheader. */
8250 if (!(freq_e == profile_count::zero ()))
8251 freq_e = freq_e.force_nonzero ();
8252 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8253 scale_loop_frequencies (loop, p);
8254 }
8255
8256 edge exit_e = single_exit (loop);
8257 exit_e->probability = profile_probability::always ()
8258 .apply_scale (1, new_est_niter + 1);
8259
8260 edge exit_l = single_pred_edge (loop->latch);
8261 profile_probability prob = exit_l->probability;
8262 exit_l->probability = exit_e->probability.invert ();
8263 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8264 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8265 }
8266
8267 /* Vectorize STMT_INFO if relevant, inserting any new instructions before GSI.
8268 When vectorizing STMT_INFO as a store, set *SEEN_STORE to its
8269 stmt_vec_info. */
8270
8271 static void
8272 vect_transform_loop_stmt (loop_vec_info loop_vinfo, stmt_vec_info stmt_info,
8273 gimple_stmt_iterator *gsi, stmt_vec_info *seen_store)
8274 {
8275 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8276 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8277
8278 if (dump_enabled_p ())
8279 dump_printf_loc (MSG_NOTE, vect_location,
8280 "------>vectorizing statement: %G", stmt_info->stmt);
8281
8282 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8283 vect_loop_kill_debug_uses (loop, stmt_info);
8284
8285 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8286 && !STMT_VINFO_LIVE_P (stmt_info))
8287 return;
8288
8289 if (STMT_VINFO_VECTYPE (stmt_info))
8290 {
8291 poly_uint64 nunits
8292 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8293 if (!STMT_SLP_TYPE (stmt_info)
8294 && maybe_ne (nunits, vf)
8295 && dump_enabled_p ())
8296 /* For SLP, VF is set according to the unrolling factor, not
8297 the vector size, so this diagnostic is not valid for SLP. */
8298 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8299 }
8300
8301 /* Pure SLP statements have already been vectorized. We still need
8302 to apply loop vectorization to hybrid SLP statements. */
8303 if (PURE_SLP_STMT (stmt_info))
8304 return;
8305
8306 if (dump_enabled_p ())
8307 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8308
8309 if (vect_transform_stmt (stmt_info, gsi, NULL, NULL))
8310 *seen_store = stmt_info;
8311 }
8312
8313 /* Function vect_transform_loop.
8314
8315 The analysis phase has determined that the loop is vectorizable.
8316 Vectorize the loop: create vectorized stmts to replace the scalar
8317 stmts in the loop, and update the loop exit condition.
8318 Returns scalar epilogue loop if any. */
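/* A caller typically does something like (a sketch, bookkeeping omitted):

     struct loop *epilogue = vect_transform_loop (loop_vinfo);
     if (epilogue)
       ...arrange for the epilogue loop to be considered for
          vectorization with a smaller vector size...  */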
8319
8320 struct loop *
8321 vect_transform_loop (loop_vec_info loop_vinfo)
8322 {
8323 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8324 struct loop *epilogue = NULL;
8325 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8326 int nbbs = loop->num_nodes;
8327 int i;
8328 tree niters_vector = NULL_TREE;
8329 tree step_vector = NULL_TREE;
8330 tree niters_vector_mult_vf = NULL_TREE;
8331 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8332 unsigned int lowest_vf = constant_lower_bound (vf);
8333 gimple *stmt;
8334 bool check_profitability = false;
8335 unsigned int th;
8336
8337 DUMP_VECT_SCOPE ("vec_transform_loop");
8338
8339 loop_vinfo->shared->check_datarefs ();
8340
8341 /* Use the more conservative vectorization threshold.  If the number
8342 of iterations is constant, assume the cost check has been performed
8343 by our caller.  If the threshold makes all loops profitable that
8344 run at least the (estimated) vectorization factor number of times,
8345 checking is pointless, too.  */
8346 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8347 if (th >= vect_vf_for_cost (loop_vinfo)
8348 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8349 {
8350 if (dump_enabled_p ())
8351 dump_printf_loc (MSG_NOTE, vect_location,
8352 "Profitability threshold is %d loop iterations.\n",
8353 th);
8354 check_profitability = true;
8355 }
8356
8357 /* Make sure there exists a single-predecessor exit bb. Do this before
8358 versioning. */
8359 edge e = single_exit (loop);
8360 if (! single_pred_p (e->dest))
8361 {
8362 split_loop_exit_edge (e, true);
8363 if (dump_enabled_p ())
8364 dump_printf (MSG_NOTE, "split exit edge\n");
8365 }
8366
8367 /* Version the loop first, if required, so the profitability check
8368 comes first. */
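  /* For instance (thresholds invented for illustration): with a
     cost-model threshold TH of 12 and a versioning threshold of 16, the
     two runtime checks are folded into a single niters >= 16 guard on
     the versioned loop, and no separate profitability check needs to be
     emitted afterwards.  */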
8369
8370 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8371 {
8372 poly_uint64 versioning_threshold
8373 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8374 if (check_profitability
8375 && ordered_p (poly_uint64 (th), versioning_threshold))
8376 {
8377 versioning_threshold = ordered_max (poly_uint64 (th),
8378 versioning_threshold);
8379 check_profitability = false;
8380 }
8381 struct loop *sloop
8382 = vect_loop_versioning (loop_vinfo, th, check_profitability,
8383 versioning_threshold);
8384 sloop->force_vectorize = false;
8385 check_profitability = false;
8386 }
8387
8388 /* Make sure there exists a single-predecessor exit bb also on the
8389 scalar loop copy.  Do this after versioning but before peeling
8390 so the CFG structure is fine for both the scalar and the
8391 if-converted loop, letting slpeel_duplicate_current_defs_from_edges
8392 see matched loop-closed PHI nodes on the exit.  */
8393 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8394 {
8395 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8396 if (! single_pred_p (e->dest))
8397 {
8398 split_loop_exit_edge (e, true);
8399 if (dump_enabled_p ())
8400 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8401 }
8402 }
8403
8404 tree niters = vect_build_loop_niters (loop_vinfo);
8405 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8406 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8407 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8408 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8409 &step_vector, &niters_vector_mult_vf, th,
8410 check_profitability, niters_no_overflow);
8411
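  /* When vect_do_peeling has not already provided NITERS_VECTOR, compute
     it here: if both the iteration count and the vectorization factor
     are compile-time constants it is a plain constant, e.g.
     (illustrative) 100 scalar iterations with a VF of 4 give a
     NITERS_VECTOR of 25 and a STEP_VECTOR of 1; otherwise a runtime
     expression is generated.  */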
8412 if (niters_vector == NULL_TREE)
8413 {
8414 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8415 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8416 && known_eq (lowest_vf, vf))
8417 {
8418 niters_vector
8419 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8420 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8421 step_vector = build_one_cst (TREE_TYPE (niters));
8422 }
8423 else
8424 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8425 &step_vector, niters_no_overflow);
8426 }
8427
8428 /* 1) Make sure the loop header has exactly two entries
8429 2) Make sure we have a preheader basic block. */
8430
8431 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8432
8433 split_edge (loop_preheader_edge (loop));
8434
8435 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8436 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8437 /* This will deal with any possible peeling. */
8438 vect_prepare_for_masked_peels (loop_vinfo);
8439
8440 /* Schedule the SLP instances first, then handle loop vectorization
8441 below. */
8442 if (!loop_vinfo->slp_instances.is_empty ())
8443 {
8444 DUMP_VECT_SCOPE ("scheduling SLP instances");
8445 vect_schedule_slp (loop_vinfo);
8446 }
8447
8448 /* FORNOW: the vectorizer supports only loops whose body consists
8449 of one basic block (header + empty latch).  When the vectorizer
8450 supports more involved loop forms, the order in which the BBs are
8451 traversed will need to be reconsidered.  */
8452
8453 for (i = 0; i < nbbs; i++)
8454 {
8455 basic_block bb = bbs[i];
8456 stmt_vec_info stmt_info;
8457
8458 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8459 gsi_next (&si))
8460 {
8461 gphi *phi = si.phi ();
8462 if (dump_enabled_p ())
8463 dump_printf_loc (MSG_NOTE, vect_location,
8464 "------>vectorizing phi: %G", phi);
8465 stmt_info = loop_vinfo->lookup_stmt (phi);
8466 if (!stmt_info)
8467 continue;
8468
8469 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8470 vect_loop_kill_debug_uses (loop, stmt_info);
8471
8472 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8473 && !STMT_VINFO_LIVE_P (stmt_info))
8474 continue;
8475
8476 if (STMT_VINFO_VECTYPE (stmt_info)
8477 && (maybe_ne
8478 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8479 && dump_enabled_p ())
8480 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8481
8482 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8483 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8484 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8485 && ! PURE_SLP_STMT (stmt_info))
8486 {
8487 if (dump_enabled_p ())
8488 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8489 vect_transform_stmt (stmt_info, NULL, NULL, NULL);
8490 }
8491 }
8492
8493 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8494 !gsi_end_p (si);)
8495 {
8496 stmt = gsi_stmt (si);
8497 /* During vectorization remove existing clobber stmts. */
8498 if (gimple_clobber_p (stmt))
8499 {
8500 unlink_stmt_vdef (stmt);
8501 gsi_remove (&si, true);
8502 release_defs (stmt);
8503 }
8504 else
8505 {
8506 stmt_info = loop_vinfo->lookup_stmt (stmt);
8507
8508 /* vector stmts created in the outer-loop during vectorization of
8509 stmts in an inner-loop may not have a stmt_info, and do not
8510 need to be vectorized. */
8511 stmt_vec_info seen_store = NULL;
8512 if (stmt_info)
8513 {
8514 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8515 {
8516 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8517 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8518 !gsi_end_p (subsi); gsi_next (&subsi))
8519 {
8520 stmt_vec_info pat_stmt_info
8521 = loop_vinfo->lookup_stmt (gsi_stmt (subsi));
8522 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info,
8523 &si, &seen_store);
8524 }
8525 stmt_vec_info pat_stmt_info
8526 = STMT_VINFO_RELATED_STMT (stmt_info);
8527 vect_transform_loop_stmt (loop_vinfo, pat_stmt_info, &si,
8528 &seen_store);
8529 }
8530 vect_transform_loop_stmt (loop_vinfo, stmt_info, &si,
8531 &seen_store);
8532 }
8533 gsi_next (&si);
8534 if (seen_store)
8535 {
8536 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8537 /* Interleaving.  The vectorization of the
8538 interleaving chain has been completed;
8539 free all the stores in the chain. */
8540 vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8541 else
8542 /* Free the attached stmt_vec_info and remove the stmt. */
8543 loop_vinfo->remove_stmt (stmt_info);
8544 }
8545 }
8546 }
8547
8548 /* Stub out scalar statements that must not survive vectorization.
8549 Doing this here helps with grouped statements, or statements that
8550 are involved in patterns. */
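      /* A scalar IFN_MASK_LOAD call has no scalar expansion, so any such
         call left behind here (e.g. one that was only consumed by a
         pattern statement) is replaced by a zero assignment, which is
         harmless and can be cleaned up by later passes.  */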
8551 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8552 !gsi_end_p (gsi); gsi_next (&gsi))
8553 {
8554 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8555 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8556 {
8557 tree lhs = gimple_get_lhs (call);
8558 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8559 {
8560 tree zero = build_zero_cst (TREE_TYPE (lhs));
8561 gimple *new_stmt = gimple_build_assign (lhs, zero);
8562 gsi_replace (&gsi, new_stmt, true);
8563 }
8564 }
8565 }
8566 } /* BBs in loop */
8567
8568 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8569 a zero NITERS becomes a nonzero NITERS_VECTOR.  */
8570 if (integer_onep (step_vector))
8571 niters_no_overflow = true;
8572 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8573 niters_vector_mult_vf, !niters_no_overflow);
8574
8575 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8576 scale_profile_for_vect_loop (loop, assumed_vf);
8577
8578 /* True if the final iteration might not handle a full vector's
8579 worth of scalar iterations. */
8580 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8581 /* The minimum number of iterations performed by the epilogue. This
8582 is 1 when peeling for gaps because we always need a final scalar
8583 iteration. */
8584 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8585 /* +1 to convert latch counts to loop iteration counts,
8586 -min_epilogue_iters to remove iterations that cannot be performed
8587 by the vector code. */
8588 int bias_for_lowest = 1 - min_epilogue_iters;
8589 int bias_for_assumed = bias_for_lowest;
8590 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8591 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8592 {
8593 /* When the amount of peeling is known at compile time, the first
8594 iteration will have exactly alignment_npeels active elements.
8595 In the worst case it will have at least one. */
8596 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8597 bias_for_lowest += lowest_vf - min_first_active;
8598 bias_for_assumed += assumed_vf - min_first_active;
8599 }
8600 /* In these calculations the "- 1" converts loop iteration counts
8601 back to latch counts. */
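  /* A worked example (values invented for illustration): with no peeling
     for gaps and no full masking, MIN_EPILOGUE_ITERS is 0 and
     BIAS_FOR_LOWEST is 1; a scalar latch count of 15 (16 iterations)
     with LOWEST_VF == 4 then becomes (15 + 1) / 4 - 1 = 3, i.e. at most
     4 iterations of the vector loop.  */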
8602 if (loop->any_upper_bound)
8603 loop->nb_iterations_upper_bound
8604 = (final_iter_may_be_partial
8605 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8606 lowest_vf) - 1
8607 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8608 lowest_vf) - 1);
8609 if (loop->any_likely_upper_bound)
8610 loop->nb_iterations_likely_upper_bound
8611 = (final_iter_may_be_partial
8612 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8613 + bias_for_lowest, lowest_vf) - 1
8614 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8615 + bias_for_lowest, lowest_vf) - 1);
8616 if (loop->any_estimate)
8617 loop->nb_iterations_estimate
8618 = (final_iter_may_be_partial
8619 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8620 assumed_vf) - 1
8621 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8622 assumed_vf) - 1);
8623
8624 if (dump_enabled_p ())
8625 {
8626 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8627 {
8628 dump_printf_loc (MSG_NOTE, vect_location,
8629 "LOOP VECTORIZED\n");
8630 if (loop->inner)
8631 dump_printf_loc (MSG_NOTE, vect_location,
8632 "OUTER LOOP VECTORIZED\n");
8633 dump_printf (MSG_NOTE, "\n");
8634 }
8635 else
8636 {
8637 dump_printf_loc (MSG_NOTE, vect_location,
8638 "LOOP EPILOGUE VECTORIZED (VS=");
8639 dump_dec (MSG_NOTE, current_vector_size);
8640 dump_printf (MSG_NOTE, ")\n");
8641 }
8642 }
8643
8644 /* Loops vectorized with a variable factor won't benefit from
8645 unrolling/peeling. */
8646 if (!vf.is_constant ())
8647 {
8648 loop->unroll = 1;
8649 if (dump_enabled_p ())
8650 dump_printf_loc (MSG_NOTE, vect_location, "Disabling unrolling due to"
8651 " variable-length vectorization factor\n");
8652 }
8653 /* Free SLP instances here because otherwise stmt reference counting
8654 won't work. */
8655 slp_instance instance;
8656 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8657 vect_free_slp_instance (instance, true);
8658 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8659 /* Clear the safelen field since its value is invalid after vectorization:
8660 the vectorized loop can have loop-carried dependencies.  */
8661 loop->safelen = 0;
8662
8663 /* Don't vectorize epilogue for epilogue. */
8664 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8665 epilogue = NULL;
8666
8667 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8668 epilogue = NULL;
8669
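  /* Decide whether the epilogue is worth vectorizing with a smaller
     vector size.  For example (sizes invented for illustration): if the
     main loop used 32-byte vectors with LOWEST_VF == 8 and 5 scalar
     iterations remain, the 32-byte size is rejected (5 < 8) but a
     16-byte size is accepted (5 >= 4); if no supported size would do,
     EPILOGUE is set to NULL below.  */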
8670 if (epilogue)
8671 {
8672 auto_vector_sizes vector_sizes;
8673 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
8674 unsigned int next_size = 0;
8675
8676 /* Note LOOP_VINFO_NITERS_KNOWN_P and LOOP_VINFO_INT_NITERS work
8677 on niters already adjusted for the iterations of the prologue.  */
8678 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8679 && known_eq (vf, lowest_vf))
8680 {
8681 unsigned HOST_WIDE_INT eiters
8682 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8683 - LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo));
8684 eiters
8685 = eiters % lowest_vf + LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo);
8686 epilogue->nb_iterations_upper_bound = eiters - 1;
8687 epilogue->any_upper_bound = true;
8688
8689 unsigned int ratio;
8690 while (next_size < vector_sizes.length ()
8691 && !(constant_multiple_p (current_vector_size,
8692 vector_sizes[next_size], &ratio)
8693 && eiters >= lowest_vf / ratio))
8694 next_size += 1;
8695 }
8696 else
8697 while (next_size < vector_sizes.length ()
8698 && maybe_lt (current_vector_size, vector_sizes[next_size]))
8699 next_size += 1;
8700
8701 if (next_size == vector_sizes.length ())
8702 epilogue = NULL;
8703 }
8704
8705 if (epilogue)
8706 {
8707 epilogue->force_vectorize = loop->force_vectorize;
8708 epilogue->safelen = loop->safelen;
8709 epilogue->dont_vectorize = false;
8710
8711 /* We may need to if-convert epilogue to vectorize it. */
8712 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8713 tree_if_conversion (epilogue);
8714 }
8715
8716 return epilogue;
8717 }
8718
8719 /* The code below performs a simple optimization: revert if-conversion
8720 for masked stores, i.e. if the mask of a store is zero, do not perform
8721 the store, and, if possible, skip the producers of the stored values too.
8722 For example,
8723 for (i=0; i<n; i++)
8724 if (c[i])
8725 {
8726 p1[i] += 1;
8727 p2[i] = p3[i] +2;
8728 }
8729 this transformation will produce the following semi-hammock:
8730
8731 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
8732 {
8733 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8734 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8735 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8736 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8737 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8738 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8739 }
8740 */
8741
8742 void
8743 optimize_mask_stores (struct loop *loop)
8744 {
8745 basic_block *bbs = get_loop_body (loop);
8746 unsigned nbbs = loop->num_nodes;
8747 unsigned i;
8748 basic_block bb;
8749 struct loop *bb_loop;
8750 gimple_stmt_iterator gsi;
8751 gimple *stmt;
8752 auto_vec<gimple *> worklist;
8753 auto_purge_vect_location sentinel;
8754
8755 vect_location = find_loop_location (loop);
8756 /* Pick up all masked stores in the loop, if any.  */
8757 for (i = 0; i < nbbs; i++)
8758 {
8759 bb = bbs[i];
8760 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8761 gsi_next (&gsi))
8762 {
8763 stmt = gsi_stmt (gsi);
8764 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8765 worklist.safe_push (stmt);
8766 }
8767 }
8768
8769 free (bbs);
8770 if (worklist.is_empty ())
8771 return;
8772
8773 /* Loop has masked stores. */
8774 while (!worklist.is_empty ())
8775 {
8776 gimple *last, *last_store;
8777 edge e, efalse;
8778 tree mask;
8779 basic_block store_bb, join_bb;
8780 gimple_stmt_iterator gsi_to;
8781 tree vdef, new_vdef;
8782 gphi *phi;
8783 tree vectype;
8784 tree zero;
8785
8786 last = worklist.pop ();
8787 mask = gimple_call_arg (last, 2);
8788 bb = gimple_bb (last);
8789 /* Create then_bb and the if-then structure in the CFG; then_bb belongs
8790 to the same loop as if_bb.  That loop can differ from LOOP when a
8791 two-level loop nest is vectorized and the mask_store belongs to the
8792 inner one. */
8793 e = split_block (bb, last);
8794 bb_loop = bb->loop_father;
8795 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8796 join_bb = e->dest;
8797 store_bb = create_empty_bb (bb);
8798 add_bb_to_loop (store_bb, bb_loop);
8799 e->flags = EDGE_TRUE_VALUE;
8800 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8801 /* Make the edge into STORE_BB statically unlikely.  */
8802 efalse->probability = profile_probability::unlikely ();
8803 store_bb->count = efalse->count ();
8804 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8805 if (dom_info_available_p (CDI_DOMINATORS))
8806 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8807 if (dump_enabled_p ())
8808 dump_printf_loc (MSG_NOTE, vect_location,
8809 "Create new block %d to sink mask stores.",
8810 store_bb->index);
8811 /* Create vector comparison with boolean result. */
8812 vectype = TREE_TYPE (mask);
8813 zero = build_zero_cst (vectype);
8814 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8815 gsi = gsi_last_bb (bb);
8816 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
8817 /* Create new PHI node for vdef of the last masked store:
8818 .MEM_2 = VDEF <.MEM_1>
8819 will be converted to
8820 .MEM.3 = VDEF <.MEM_1>
8821 and new PHI node will be created in join bb
8822 .MEM_2 = PHI <.MEM_1, .MEM_3>
8823 */
8824 vdef = gimple_vdef (last);
8825 new_vdef = make_ssa_name (gimple_vop (cfun), last);
8826 gimple_set_vdef (last, new_vdef);
8827 phi = create_phi_node (vdef, join_bb);
8828 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
8829
8830 /* Put all masked stores with the same mask to STORE_BB if possible. */
8831 while (true)
8832 {
8833 gimple_stmt_iterator gsi_from;
8834 gimple *stmt1 = NULL;
8835
8836 /* Move masked store to STORE_BB. */
8837 last_store = last;
8838 gsi = gsi_for_stmt (last);
8839 gsi_from = gsi;
8840 /* Shift GSI to the previous stmt for further traversal. */
8841 gsi_prev (&gsi);
8842 gsi_to = gsi_start_bb (store_bb);
8843 gsi_move_before (&gsi_from, &gsi_to);
8844 /* Set GSI_TO to the start of the now non-empty block. */
8845 gsi_to = gsi_start_bb (store_bb);
8846 if (dump_enabled_p ())
8847 dump_printf_loc (MSG_NOTE, vect_location,
8848 "Move stmt to created bb\n%G", last);
8849 /* Move all stored value producers if possible. */
8850 while (!gsi_end_p (gsi))
8851 {
8852 tree lhs;
8853 imm_use_iterator imm_iter;
8854 use_operand_p use_p;
8855 bool res;
8856
8857 /* Skip debug statements. */
8858 if (is_gimple_debug (gsi_stmt (gsi)))
8859 {
8860 gsi_prev (&gsi);
8861 continue;
8862 }
8863 stmt1 = gsi_stmt (gsi);
8864 /* Do not consider statements writing to memory or having
8865 a volatile operand. */
8866 if (gimple_vdef (stmt1)
8867 || gimple_has_volatile_ops (stmt1))
8868 break;
8869 gsi_from = gsi;
8870 gsi_prev (&gsi);
8871 lhs = gimple_get_lhs (stmt1);
8872 if (!lhs)
8873 break;
8874
8875 /* LHS of vectorized stmt must be SSA_NAME. */
8876 if (TREE_CODE (lhs) != SSA_NAME)
8877 break;
8878
8879 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8880 {
8881 /* Remove dead scalar statement. */
8882 if (has_zero_uses (lhs))
8883 {
8884 gsi_remove (&gsi_from, true);
8885 continue;
8886 }
8887 }
8888
8889 /* Check that LHS does not have uses outside of STORE_BB. */
8890 res = true;
8891 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
8892 {
8893 gimple *use_stmt;
8894 use_stmt = USE_STMT (use_p);
8895 if (is_gimple_debug (use_stmt))
8896 continue;
8897 if (gimple_bb (use_stmt) != store_bb)
8898 {
8899 res = false;
8900 break;
8901 }
8902 }
8903 if (!res)
8904 break;
8905
8906 if (gimple_vuse (stmt1)
8907 && gimple_vuse (stmt1) != gimple_vuse (last_store))
8908 break;
8909
8910 /* Can move STMT1 to STORE_BB. */
8911 if (dump_enabled_p ())
8912 dump_printf_loc (MSG_NOTE, vect_location,
8913 "Move stmt to created bb\n%G", stmt1);
8914 gsi_move_before (&gsi_from, &gsi_to);
8915 /* Shift GSI_TO for further insertion. */
8916 gsi_prev (&gsi_to);
8917 }
8918 /* Put other masked stores with the same mask to STORE_BB. */
8919 if (worklist.is_empty ()
8920 || gimple_call_arg (worklist.last (), 2) != mask
8921 || worklist.last () != stmt1)
8922 break;
8923 last = worklist.pop ();
8924 }
8925 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
8926 }
8927 }