gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
5
6 This file is part of GCC.
7
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
12
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
21
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
57
58 /* Loop Vectorization Pass.
59
60 This pass tries to vectorize loops.
61
62 For example, the vectorizer transforms the following simple loop:
63
64 short a[N]; short b[N]; short c[N]; int i;
65
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
68 }
69
70 as if it had been manually vectorized by rewriting the source code into:
71
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
76
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
82 }
83
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 have successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
95
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
101
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
106
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
117
118 For example, say stmt S1 was vectorized into stmt VS1:
119
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
123
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
128
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
133
134 Operands that are not SSA_NAMEs are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
136
137 Target modeling:
138 =================
139 Currently the only target-specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that can support different vector sizes will, for now, need
142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
144
145 Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
151
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
154 */
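/* Illustrative sketch (not part of the sources): the target-support check
   described above boils down to an optab query of roughly this shape,
   here hard-coding the V8HImode addition from the example (the function
   name is hypothetical):

     static bool
     example_target_supports_v8hi_add (void)
     {
       return optab_handler (add_optab, V8HImode) != CODE_FOR_nothing;
     }

   CODE_FOR_nothing means the target has no instruction for the operation
   in that mode, so the stmt cannot be vectorized.  In the real pass the
   checks live in the vectorizable_* routines and use the vector type
   chosen for each statement rather than a hard-coded mode.  */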
155
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
157
158 /* Subroutine of vect_determine_vf_for_stmt that handles only one
159 statement. VECTYPE_MAYBE_SET_P is true if STMT_VINFO_VECTYPE
160 may already be set for general statements (not just data refs). */
161
162 static bool
163 vect_determine_vf_for_stmt_1 (stmt_vec_info stmt_info,
164 bool vectype_maybe_set_p,
165 poly_uint64 *vf,
166 vec<stmt_vec_info > *mask_producers)
167 {
168 gimple *stmt = stmt_info->stmt;
169
170 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
171 && !STMT_VINFO_LIVE_P (stmt_info))
172 || gimple_clobber_p (stmt))
173 {
174 if (dump_enabled_p ())
175 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
176 return true;
177 }
178
179 tree stmt_vectype, nunits_vectype;
180 if (!vect_get_vector_types_for_stmt (stmt_info, &stmt_vectype,
181 &nunits_vectype))
182 return false;
183
184 if (stmt_vectype)
185 {
186 if (STMT_VINFO_VECTYPE (stmt_info))
187 /* The only case in which a vectype has already been set is for stmts
188 that contain a data ref, or for "pattern-stmts" (stmts generated
189 by the vectorizer to represent/replace a certain idiom). */
190 gcc_assert ((STMT_VINFO_DATA_REF (stmt_info)
191 || vectype_maybe_set_p)
192 && STMT_VINFO_VECTYPE (stmt_info) == stmt_vectype);
193 else if (stmt_vectype == boolean_type_node)
194 mask_producers->safe_push (stmt_info);
195 else
196 STMT_VINFO_VECTYPE (stmt_info) = stmt_vectype;
197 }
198
199 if (nunits_vectype)
200 vect_update_max_nunits (vf, nunits_vectype);
201
202 return true;
203 }
204
205 /* Subroutine of vect_determine_vectorization_factor. Set the vector
206 types of STMT_INFO and all attached pattern statements and update
207 the vectorization factor VF accordingly. If some of the statements
208 produce a mask result whose vector type can only be calculated later,
209 add them to MASK_PRODUCERS. Return true on success or false if
210 something prevented vectorization. */
211
212 static bool
213 vect_determine_vf_for_stmt (stmt_vec_info stmt_info, poly_uint64 *vf,
214 vec<stmt_vec_info > *mask_producers)
215 {
216 vec_info *vinfo = stmt_info->vinfo;
217 if (dump_enabled_p ())
218 {
219 dump_printf_loc (MSG_NOTE, vect_location, "==> examining statement: ");
220 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
221 }
222 if (!vect_determine_vf_for_stmt_1 (stmt_info, false, vf, mask_producers))
223 return false;
224
225 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
226 && STMT_VINFO_RELATED_STMT (stmt_info))
227 {
228 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
229 stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
230
231 /* If a pattern statement has def stmts, analyze them too. */
232 for (gimple_stmt_iterator si = gsi_start (pattern_def_seq);
233 !gsi_end_p (si); gsi_next (&si))
234 {
235 stmt_vec_info def_stmt_info = vinfo->lookup_stmt (gsi_stmt (si));
236 if (dump_enabled_p ())
237 {
238 dump_printf_loc (MSG_NOTE, vect_location,
239 "==> examining pattern def stmt: ");
240 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
241 def_stmt_info->stmt, 0);
242 }
243 if (!vect_determine_vf_for_stmt_1 (def_stmt_info, true,
244 vf, mask_producers))
245 return false;
246 }
247
248 if (dump_enabled_p ())
249 {
250 dump_printf_loc (MSG_NOTE, vect_location,
251 "==> examining pattern statement: ");
252 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt_info->stmt, 0);
253 }
254 if (!vect_determine_vf_for_stmt_1 (stmt_info, true, vf, mask_producers))
255 return false;
256 }
257
258 return true;
259 }
260
261 /* Function vect_determine_vectorization_factor
262
263 Determine the vectorization factor (VF). VF is the number of data elements
264 that are operated upon in parallel in a single iteration of the vectorized
265 loop. For example, when vectorizing a loop that operates on 4-byte elements,
266 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
267 elements can fit in a single vector register.
268
269 We currently support vectorization of loops in which all types operated upon
270 are of the same size. Therefore this function currently sets VF according to
271 the size of the types operated upon, and fails if there are multiple sizes
272 in the loop.
273
274 VF is also the factor by which the loop iterations are strip-mined, e.g.:
275 original loop:
276 for (i=0; i<N; i++){
277 a[i] = b[i] + c[i];
278 }
279
280 vectorized loop:
281 for (i=0; i<N; i+=VF){
282 a[i:VF] = b[i:VF] + c[i:VF];
283 }
284 */
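/* Illustrative sketch (using the same a[i:VF] pseudo-notation as above,
   not actual output of the pass): with VF == 4 the strip-mined loop
   needs a scalar epilogue when N is not a multiple of VF:

     int i = 0;
     for (; i + 4 <= N; i += 4)
       a[i:4] = b[i:4] + c[i:4];     one vector iteration == 4 scalar ones
     for (; i < N; i++)
       a[i] = b[i] + c[i];           scalar epilogue

   Whether an epilogue, full masking or versioning is generated is decided
   later; this only illustrates what the factor VF means.  */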
285
286 static bool
287 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
288 {
289 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
290 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
291 unsigned nbbs = loop->num_nodes;
292 poly_uint64 vectorization_factor = 1;
293 tree scalar_type = NULL_TREE;
294 gphi *phi;
295 tree vectype;
296 stmt_vec_info stmt_info;
297 unsigned i;
298 auto_vec<stmt_vec_info> mask_producers;
299
300 DUMP_VECT_SCOPE ("vect_determine_vectorization_factor");
301
302 for (i = 0; i < nbbs; i++)
303 {
304 basic_block bb = bbs[i];
305
306 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
307 gsi_next (&si))
308 {
309 phi = si.phi ();
310 stmt_info = loop_vinfo->lookup_stmt (phi);
311 if (dump_enabled_p ())
312 {
313 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
314 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
315 }
316
317 gcc_assert (stmt_info);
318
319 if (STMT_VINFO_RELEVANT_P (stmt_info)
320 || STMT_VINFO_LIVE_P (stmt_info))
321 {
322 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
323 scalar_type = TREE_TYPE (PHI_RESULT (phi));
324
325 if (dump_enabled_p ())
326 {
327 dump_printf_loc (MSG_NOTE, vect_location,
328 "get vectype for scalar type: ");
329 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
330 dump_printf (MSG_NOTE, "\n");
331 }
332
333 vectype = get_vectype_for_scalar_type (scalar_type);
334 if (!vectype)
335 {
336 if (dump_enabled_p ())
337 {
338 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
339 "not vectorized: unsupported "
340 "data-type ");
341 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
342 scalar_type);
343 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
344 }
345 return false;
346 }
347 STMT_VINFO_VECTYPE (stmt_info) = vectype;
348
349 if (dump_enabled_p ())
350 {
351 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
352 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
353 dump_printf (MSG_NOTE, "\n");
354 }
355
356 if (dump_enabled_p ())
357 {
358 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
359 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
360 dump_printf (MSG_NOTE, "\n");
361 }
362
363 vect_update_max_nunits (&vectorization_factor, vectype);
364 }
365 }
366
367 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
368 gsi_next (&si))
369 {
370 stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
371 if (!vect_determine_vf_for_stmt (stmt_info, &vectorization_factor,
372 &mask_producers))
373 return false;
374 }
375 }
376
377 /* TODO: Analyze cost. Decide if worthwhile to vectorize. */
378 if (dump_enabled_p ())
379 {
380 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
381 dump_dec (MSG_NOTE, vectorization_factor);
382 dump_printf (MSG_NOTE, "\n");
383 }
384
385 if (known_le (vectorization_factor, 1U))
386 {
387 if (dump_enabled_p ())
388 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
389 "not vectorized: unsupported data-type\n");
390 return false;
391 }
392 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
393
394 for (i = 0; i < mask_producers.length (); i++)
395 {
396 stmt_info = mask_producers[i];
397 tree mask_type = vect_get_mask_type_for_stmt (stmt_info);
398 if (!mask_type)
399 return false;
400 STMT_VINFO_VECTYPE (stmt_info) = mask_type;
401 }
402
403 return true;
404 }
405
406
407 /* Function vect_is_simple_iv_evolution.
408
409 FORNOW: A simple evolution of an induction variable in the loop is
410 considered a polynomial evolution. */
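/* Informal examples (not from the sources): in

     for (i = 0; i < n; i++)     i has evolution {0, +, 1}_1
       p = p + 4;                p has evolution {p_0, +, 4}_1

   both evolutions are "simple": the step returned by
   evolution_part_in_loop_num is not itself a chrec.  For j += i in the
   same loop, j's evolution is {j_0, +, {0, +, 1}_1}_1; its step is a
   chrec, so tree_is_chrec (evolution_part) holds and the function
   returns false.  */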
411
412 static bool
413 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
414 tree * step)
415 {
416 tree init_expr;
417 tree step_expr;
418 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
419 basic_block bb;
420
421 /* When there is no evolution in this loop, the evolution function
422 is not "simple". */
423 if (evolution_part == NULL_TREE)
424 return false;
425
426 /* When the evolution is a polynomial of degree >= 2
427 the evolution function is not "simple". */
428 if (tree_is_chrec (evolution_part))
429 return false;
430
431 step_expr = evolution_part;
432 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
433
434 if (dump_enabled_p ())
435 {
436 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
437 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
438 dump_printf (MSG_NOTE, ", init: ");
439 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
440 dump_printf (MSG_NOTE, "\n");
441 }
442
443 *init = init_expr;
444 *step = step_expr;
445
446 if (TREE_CODE (step_expr) != INTEGER_CST
447 && (TREE_CODE (step_expr) != SSA_NAME
448 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
449 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
450 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
451 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
452 || !flag_associative_math)))
453 && (TREE_CODE (step_expr) != REAL_CST
454 || !flag_associative_math))
455 {
456 if (dump_enabled_p ())
457 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
458 "step unknown.\n");
459 return false;
460 }
461
462 return true;
463 }
464
465 /* Function vect_analyze_scalar_cycles_1.
466
467 Examine the cross iteration def-use cycles of scalar variables
468 in LOOP. LOOP_VINFO represents the loop that is now being
469 considered for vectorization (can be LOOP, or an outer-loop
470 enclosing LOOP). */
471
472 static void
473 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
474 {
475 basic_block bb = loop->header;
476 tree init, step;
477 auto_vec<gimple *, 64> worklist;
478 gphi_iterator gsi;
479 bool double_reduc;
480
481 DUMP_VECT_SCOPE ("vect_analyze_scalar_cycles");
482
483 /* First - identify all inductions. Reduction detection assumes that all the
484 inductions have been identified; therefore, this order must not be
485 changed. */
486 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
487 {
488 gphi *phi = gsi.phi ();
489 tree access_fn = NULL;
490 tree def = PHI_RESULT (phi);
491 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (phi);
492
493 if (dump_enabled_p ())
494 {
495 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
496 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
497 }
498
499 /* Skip virtual phis. The data dependences that are associated with
500 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
501 if (virtual_operand_p (def))
502 continue;
503
504 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
505
506 /* Analyze the evolution function. */
507 access_fn = analyze_scalar_evolution (loop, def);
508 if (access_fn)
509 {
510 STRIP_NOPS (access_fn);
511 if (dump_enabled_p ())
512 {
513 dump_printf_loc (MSG_NOTE, vect_location,
514 "Access function of PHI: ");
515 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
516 dump_printf (MSG_NOTE, "\n");
517 }
518 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
519 = initial_condition_in_loop_num (access_fn, loop->num);
520 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
521 = evolution_part_in_loop_num (access_fn, loop->num);
522 }
523
524 if (!access_fn
525 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
526 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
527 && TREE_CODE (step) != INTEGER_CST))
528 {
529 worklist.safe_push (phi);
530 continue;
531 }
532
533 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
534 != NULL_TREE);
535 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
536
537 if (dump_enabled_p ())
538 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
539 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
540 }
541
542
543 /* Second - identify all reductions and nested cycles. */
544 while (worklist.length () > 0)
545 {
546 gimple *phi = worklist.pop ();
547 tree def = PHI_RESULT (phi);
548 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
549 gimple *reduc_stmt;
550
551 if (dump_enabled_p ())
552 {
553 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
554 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
555 }
556
557 gcc_assert (!virtual_operand_p (def)
558 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
559
560 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
561 &double_reduc, false);
562 if (reduc_stmt)
563 {
564 if (double_reduc)
565 {
566 if (dump_enabled_p ())
567 dump_printf_loc (MSG_NOTE, vect_location,
568 "Detected double reduction.\n");
569
570 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
571 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
572 vect_double_reduction_def;
573 }
574 else
575 {
576 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
577 {
578 if (dump_enabled_p ())
579 dump_printf_loc (MSG_NOTE, vect_location,
580 "Detected vectorizable nested cycle.\n");
581
582 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
583 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
584 vect_nested_cycle;
585 }
586 else
587 {
588 if (dump_enabled_p ())
589 dump_printf_loc (MSG_NOTE, vect_location,
590 "Detected reduction.\n");
591
592 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
593 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
594 vect_reduction_def;
595 /* Store the reduction cycles for possible vectorization in
596 loop-aware SLP if it was not detected as reduction
597 chain. */
598 if (! REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
599 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
600 }
601 }
602 }
603 else
604 if (dump_enabled_p ())
605 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
606 "Unknown def-use cycle pattern.\n");
607 }
608 }
609
610
611 /* Function vect_analyze_scalar_cycles.
612
613 Examine the cross iteration def-use cycles of scalar variables, by
614 analyzing the loop-header PHIs of scalar variables. Classify each
615 cycle as one of the following: invariant, induction, reduction, unknown.
616 We do that for the loop represented by LOOP_VINFO, and also for its
617 inner-loop, if it exists.
618 Examples for scalar cycles:
619
620 Example1: reduction:
621
622 loop1:
623 for (i=0; i<N; i++)
624 sum += a[i];
625
626 Example2: induction:
627
628 loop2:
629 for (i=0; i<N; i++)
630 a[i] = i; */
631
632 static void
633 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
634 {
635 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
636
637 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
638
639 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
640 Reductions in such an inner-loop therefore have different properties than
641 the reductions in the nest that gets vectorized:
642 1. When vectorized, they are executed in the same order as in the original
643 scalar loop, so we can't change the order of computation when
644 vectorizing them.
645 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
646 current checks are too strict. */
647
648 if (loop->inner)
649 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
650 }
651
652 /* Transfer group and reduction information from STMT to its pattern stmt. */
653
654 static void
655 vect_fixup_reduc_chain (gimple *stmt)
656 {
657 gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
658 gimple *stmtp;
659 gcc_assert (!REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
660 && REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
661 REDUC_GROUP_SIZE (vinfo_for_stmt (firstp))
662 = REDUC_GROUP_SIZE (vinfo_for_stmt (stmt));
663 do
664 {
665 stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
666 REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
667 stmt = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
668 if (stmt)
669 REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
670 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
671 }
672 while (stmt);
673 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
674 }
675
676 /* Fixup scalar cycles that now have their stmts detected as patterns. */
677
678 static void
679 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
680 {
681 gimple *first;
682 unsigned i;
683
684 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
685 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
686 {
687 gimple *next = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
688 while (next)
689 {
690 if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
691 break;
692 next = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
693 }
694 /* If not all stmts in the chain are patterns, try to handle
695 the chain without patterns. */
696 if (! next)
697 {
698 vect_fixup_reduc_chain (first);
699 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
700 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
701 }
702 }
703 }
704
705 /* Function vect_get_loop_niters.
706
707 Determine how many iterations the loop executes and place it
708 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
709 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
710 niter information holds in ASSUMPTIONS.
711
712 Return the loop exit condition. */
713
714
715 static gcond *
716 vect_get_loop_niters (struct loop *loop, tree *assumptions,
717 tree *number_of_iterations, tree *number_of_iterationsm1)
718 {
719 edge exit = single_exit (loop);
720 struct tree_niter_desc niter_desc;
721 tree niter_assumptions, niter, may_be_zero;
722 gcond *cond = get_loop_exit_condition (loop);
723
724 *assumptions = boolean_true_node;
725 *number_of_iterationsm1 = chrec_dont_know;
726 *number_of_iterations = chrec_dont_know;
727 DUMP_VECT_SCOPE ("get_loop_niters");
728
729 if (!exit)
730 return cond;
731
732 niter = chrec_dont_know;
733 may_be_zero = NULL_TREE;
734 niter_assumptions = boolean_true_node;
735 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
736 || chrec_contains_undetermined (niter_desc.niter))
737 return cond;
738
739 niter_assumptions = niter_desc.assumptions;
740 may_be_zero = niter_desc.may_be_zero;
741 niter = niter_desc.niter;
742
743 if (may_be_zero && integer_zerop (may_be_zero))
744 may_be_zero = NULL_TREE;
745
746 if (may_be_zero)
747 {
748 if (COMPARISON_CLASS_P (may_be_zero))
749 {
750 /* Try to combine may_be_zero with assumptions; this can simplify
751 the computation of the niter expression. */
752 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
753 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
754 niter_assumptions,
755 fold_build1 (TRUTH_NOT_EXPR,
756 boolean_type_node,
757 may_be_zero));
758 else
759 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
760 build_int_cst (TREE_TYPE (niter), 0),
761 rewrite_to_non_trapping_overflow (niter));
762
763 may_be_zero = NULL_TREE;
764 }
765 else if (integer_nonzerop (may_be_zero))
766 {
767 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
768 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
769 return cond;
770 }
771 else
772 return cond;
773 }
774
775 *assumptions = niter_assumptions;
776 *number_of_iterationsm1 = niter;
777
778 /* We want the number of loop header executions which is the number
779 of latch executions plus one.
780 ??? For UINT_MAX latch executions this number overflows to zero
781 for loops like do { n++; } while (n != 0); */
782 if (niter && !chrec_contains_undetermined (niter))
783 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
784 build_int_cst (TREE_TYPE (niter), 1));
785 *number_of_iterations = niter;
786
787 return cond;
788 }
789
790 /* Function bb_in_loop_p
791
792 Used as predicate for dfs order traversal of the loop bbs. */
793
794 static bool
795 bb_in_loop_p (const_basic_block bb, const void *data)
796 {
797 const struct loop *const loop = (const struct loop *)data;
798 if (flow_bb_inside_loop_p (loop, bb))
799 return true;
800 return false;
801 }
802
803
804 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
805 stmt_vec_info structs for all the stmts in LOOP_IN. */
806
807 _loop_vec_info::_loop_vec_info (struct loop *loop_in, vec_info_shared *shared)
808 : vec_info (vec_info::loop, init_cost (loop_in), shared),
809 loop (loop_in),
810 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
811 num_itersm1 (NULL_TREE),
812 num_iters (NULL_TREE),
813 num_iters_unchanged (NULL_TREE),
814 num_iters_assumptions (NULL_TREE),
815 th (0),
816 versioning_threshold (0),
817 vectorization_factor (0),
818 max_vectorization_factor (0),
819 mask_skip_niters (NULL_TREE),
820 mask_compare_type (NULL_TREE),
821 unaligned_dr (NULL),
822 peeling_for_alignment (0),
823 ptr_mask (0),
824 ivexpr_map (NULL),
825 slp_unrolling_factor (1),
826 single_scalar_iteration_cost (0),
827 vectorizable (false),
828 can_fully_mask_p (true),
829 fully_masked_p (false),
830 peeling_for_gaps (false),
831 peeling_for_niter (false),
832 operands_swapped (false),
833 no_data_dependencies (false),
834 has_mask_store (false),
835 scalar_loop (NULL),
836 orig_loop_info (NULL)
837 {
838 /* Create/Update stmt_info for all stmts in the loop. */
839 basic_block *body = get_loop_body (loop);
840 for (unsigned int i = 0; i < loop->num_nodes; i++)
841 {
842 basic_block bb = body[i];
843 gimple_stmt_iterator si;
844
845 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
846 {
847 gimple *phi = gsi_stmt (si);
848 gimple_set_uid (phi, 0);
849 add_stmt (phi);
850 }
851
852 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
853 {
854 gimple *stmt = gsi_stmt (si);
855 gimple_set_uid (stmt, 0);
856 add_stmt (stmt);
857 }
858 }
859 free (body);
860
861 /* CHECKME: We want to visit all BBs before their successors (except for
862 latch blocks, for which this assertion wouldn't hold). In the simple
863 case of the loop forms we allow, a dfs order of the BBs would be the same
864 as reversed postorder traversal, so we are safe. */
865
866 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
867 bbs, loop->num_nodes, loop);
868 gcc_assert (nbbs == loop->num_nodes);
869 }
870
871 /* Free all levels of MASKS. */
872
873 void
874 release_vec_loop_masks (vec_loop_masks *masks)
875 {
876 rgroup_masks *rgm;
877 unsigned int i;
878 FOR_EACH_VEC_ELT (*masks, i, rgm)
879 rgm->masks.release ();
880 masks->release ();
881 }
882
883 /* Free all memory used by the _loop_vec_info, as well as all the
884 stmt_vec_info structs of all the stmts in the loop. */
885
886 _loop_vec_info::~_loop_vec_info ()
887 {
888 int nbbs;
889 gimple_stmt_iterator si;
890 int j;
891
892 /* ??? We're releasing loop_vinfos en bloc. */
893 set_stmt_vec_info_vec (&stmt_vec_infos);
894 nbbs = loop->num_nodes;
895 for (j = 0; j < nbbs; j++)
896 {
897 basic_block bb = bbs[j];
898 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
899 free_stmt_vec_info (gsi_stmt (si));
900
901 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
902 {
903 gimple *stmt = gsi_stmt (si);
904
905 /* We may have broken canonical form by moving a constant
906 into RHS1 of a commutative op. Fix such occurrences. */
907 if (operands_swapped && is_gimple_assign (stmt))
908 {
909 enum tree_code code = gimple_assign_rhs_code (stmt);
910
911 if ((code == PLUS_EXPR
912 || code == POINTER_PLUS_EXPR
913 || code == MULT_EXPR)
914 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
915 swap_ssa_operands (stmt,
916 gimple_assign_rhs1_ptr (stmt),
917 gimple_assign_rhs2_ptr (stmt));
918 else if (code == COND_EXPR
919 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
920 {
921 tree cond_expr = gimple_assign_rhs1 (stmt);
922 enum tree_code cond_code = TREE_CODE (cond_expr);
923
924 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
925 {
926 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
927 0));
928 cond_code = invert_tree_comparison (cond_code,
929 honor_nans);
930 if (cond_code != ERROR_MARK)
931 {
932 TREE_SET_CODE (cond_expr, cond_code);
933 swap_ssa_operands (stmt,
934 gimple_assign_rhs2_ptr (stmt),
935 gimple_assign_rhs3_ptr (stmt));
936 }
937 }
938 }
939 }
940
941 /* Free stmt_vec_info. */
942 free_stmt_vec_info (stmt);
943 gsi_next (&si);
944 }
945 }
946
947 free (bbs);
948
949 release_vec_loop_masks (&masks);
950 delete ivexpr_map;
951
952 loop->aux = NULL;
953 }
954
955 /* Return an invariant or register for EXPR and emit necessary
956 computations in the LOOP_VINFO loop preheader. */
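/* Example (informal, SSA names hypothetical): if two data references both
   need the loop-invariant expression n_5 * 4 as a step, the first call
   gimplifies it once on the preheader edge, e.g.

     _23 = n_5 * 4;      inserted on loop_preheader_edge

   and caches _23 in ivexpr_map, so a later call for the same expression
   simply returns _23 instead of emitting the multiplication again.  */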
957
958 tree
959 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
960 {
961 if (is_gimple_reg (expr)
962 || is_gimple_min_invariant (expr))
963 return expr;
964
965 if (! loop_vinfo->ivexpr_map)
966 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
967 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
968 if (! cached)
969 {
970 gimple_seq stmts = NULL;
971 cached = force_gimple_operand (unshare_expr (expr),
972 &stmts, true, NULL_TREE);
973 if (stmts)
974 {
975 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
976 gsi_insert_seq_on_edge_immediate (e, stmts);
977 }
978 }
979 return cached;
980 }
981
982 /* Return true if we can use CMP_TYPE as the comparison type to produce
983 all masks required to mask LOOP_VINFO. */
984
985 static bool
986 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
987 {
988 rgroup_masks *rgm;
989 unsigned int i;
990 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
991 if (rgm->mask_type != NULL_TREE
992 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
993 cmp_type, rgm->mask_type,
994 OPTIMIZE_FOR_SPEED))
995 return false;
996 return true;
997 }
998
999 /* Calculate the maximum number of scalars per iteration for every
1000 rgroup in LOOP_VINFO. */
1001
1002 static unsigned int
1003 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1004 {
1005 unsigned int res = 1;
1006 unsigned int i;
1007 rgroup_masks *rgm;
1008 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1009 res = MAX (res, rgm->max_nscalars_per_iter);
1010 return res;
1011 }
1012
1013 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1014 whether we can actually generate the masks required. Return true if so,
1015 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
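/* Illustrative sketch (informal, not the gimple the pass emits): a fully
   masked copy loop over N elements with 16-lane vectors behaves roughly
   like

     for (i = 0; i < N; i += 16)
       {
         mask = WHILE_ULT (i, N);          lane j active iff i + j < N
         vec = MASK_LOAD (&src[i], mask);
         MASK_STORE (&dst[i], mask, vec);
       }

   so the final, partial iteration needs no scalar epilogue.  This function
   only has to pick a comparison type that is wide enough for the i < N
   comparisons and for which the target supports IFN_WHILE_ULT.  */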
1016
1017 static bool
1018 vect_verify_full_masking (loop_vec_info loop_vinfo)
1019 {
1020 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1021 unsigned int min_ni_width;
1022
1023 /* Use a normal loop if there are no statements that need masking.
1024 This only happens in rare degenerate cases: it means that the loop
1025 has no loads, no stores, and no live-out values. */
1026 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1027 return false;
1028
1029 /* Get the maximum number of iterations that is representable
1030 in the counter type. */
1031 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1032 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1033
1034 /* Get a more refined estimate for the number of iterations. */
1035 widest_int max_back_edges;
1036 if (max_loop_iterations (loop, &max_back_edges))
1037 max_ni = wi::smin (max_ni, max_back_edges + 1);
1038
1039 /* Account for rgroup masks, in which each bit is replicated N times. */
1040 max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1041
1042 /* Work out how many bits we need to represent the limit. */
1043 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1044
1045 /* Find a scalar mode for which WHILE_ULT is supported. */
1046 opt_scalar_int_mode cmp_mode_iter;
1047 tree cmp_type = NULL_TREE;
1048 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1049 {
1050 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1051 if (cmp_bits >= min_ni_width
1052 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1053 {
1054 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1055 if (this_type
1056 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1057 {
1058 /* Although we could stop as soon as we find a valid mode,
1059 it's often better to continue until we hit Pmode, since the
1060 operands to the WHILE are more likely to be reusable in
1061 address calculations. */
1062 cmp_type = this_type;
1063 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1064 break;
1065 }
1066 }
1067 }
1068
1069 if (!cmp_type)
1070 return false;
1071
1072 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1073 return true;
1074 }
1075
1076 /* Calculate the cost of one scalar iteration of the loop. */
1077 static void
1078 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1079 {
1080 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1081 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1082 int nbbs = loop->num_nodes, factor;
1083 int innerloop_iters, i;
1084
1085 /* Gather costs for statements in the scalar loop. */
1086
1087 /* FORNOW. */
1088 innerloop_iters = 1;
1089 if (loop->inner)
1090 innerloop_iters = 50; /* FIXME */
1091
1092 for (i = 0; i < nbbs; i++)
1093 {
1094 gimple_stmt_iterator si;
1095 basic_block bb = bbs[i];
1096
1097 if (bb->loop_father == loop->inner)
1098 factor = innerloop_iters;
1099 else
1100 factor = 1;
1101
1102 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1103 {
1104 gimple *stmt = gsi_stmt (si);
1105 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
1106
1107 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1108 continue;
1109
1110 /* Skip stmts that are not vectorized inside the loop. */
1111 if (stmt_info
1112 && !STMT_VINFO_RELEVANT_P (stmt_info)
1113 && (!STMT_VINFO_LIVE_P (stmt_info)
1114 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1115 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1116 continue;
1117
1118 vect_cost_for_stmt kind;
1119 if (STMT_VINFO_DATA_REF (stmt_info))
1120 {
1121 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1122 kind = scalar_load;
1123 else
1124 kind = scalar_store;
1125 }
1126 else
1127 kind = scalar_stmt;
1128
1129 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1130 factor, kind, stmt_info, 0, vect_prologue);
1131 }
1132 }
1133
1134 /* Now accumulate cost. */
1135 void *target_cost_data = init_cost (loop);
1136 stmt_info_for_cost *si;
1137 int j;
1138 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1139 j, si)
1140 {
1141 struct _stmt_vec_info *stmt_info
1142 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1143 (void) add_stmt_cost (target_cost_data, si->count,
1144 si->kind, stmt_info, si->misalign,
1145 vect_body);
1146 }
1147 unsigned dummy, body_cost = 0;
1148 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1149 destroy_cost_data (target_cost_data);
1150 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1151 }
1152
1153
1154 /* Function vect_analyze_loop_form_1.
1155
1156 Verify that certain CFG restrictions hold, including:
1157 - the loop has a pre-header
1158 - the loop has a single entry and exit
1159 - the loop exit condition is simple enough
1160 - the number of iterations can be analyzed, i.e., a countable loop. The
1161 niter could be analyzed under some assumptions. */
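/* Informal examples of the restrictions above (not exhaustive):

     for (i = 0; i < n; i++)        accepted: single exit, empty latch,
       a[i] = b[i] + c[i];          countable number of iterations.

     for (i = 0; i < n; i++)
       {
         if (a[i] == key)           rejected: the early exit means the loop
           break;                   has two exits and control flow in its
         b[i] = 0;                  body (more than two basic blocks).
       }

   A loop whose iteration count is only known under a side condition can
   still be accepted; the condition is returned in ASSUMPTIONS and the
   loop is later versioned on it.  */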
1162
1163 bool
1164 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1165 tree *assumptions, tree *number_of_iterationsm1,
1166 tree *number_of_iterations, gcond **inner_loop_cond)
1167 {
1168 DUMP_VECT_SCOPE ("vect_analyze_loop_form");
1169
1170 /* Different restrictions apply when we are considering an inner-most loop,
1171 vs. an outer (nested) loop.
1172 (FORNOW. May want to relax some of these restrictions in the future). */
1173
1174 if (!loop->inner)
1175 {
1176 /* Inner-most loop. We currently require that the number of BBs is
1177 exactly 2 (the header and latch). Vectorizable inner-most loops
1178 look like this:
1179
1180 (pre-header)
1181 |
1182 header <--------+
1183 | | |
1184 | +--> latch --+
1185 |
1186 (exit-bb) */
1187
1188 if (loop->num_nodes != 2)
1189 {
1190 if (dump_enabled_p ())
1191 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1192 "not vectorized: control flow in loop.\n");
1193 return false;
1194 }
1195
1196 if (empty_block_p (loop->header))
1197 {
1198 if (dump_enabled_p ())
1199 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1200 "not vectorized: empty loop.\n");
1201 return false;
1202 }
1203 }
1204 else
1205 {
1206 struct loop *innerloop = loop->inner;
1207 edge entryedge;
1208
1209 /* Nested loop. We currently require that the loop is doubly-nested,
1210 contains a single inner loop, and the number of BBs is exactly 5.
1211 Vectorizable outer-loops look like this:
1212
1213 (pre-header)
1214 |
1215 header <---+
1216 | |
1217 inner-loop |
1218 | |
1219 tail ------+
1220 |
1221 (exit-bb)
1222
1223 The inner-loop has the properties expected of inner-most loops
1224 as described above. */
1225
1226 if ((loop->inner)->inner || (loop->inner)->next)
1227 {
1228 if (dump_enabled_p ())
1229 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1230 "not vectorized: multiple nested loops.\n");
1231 return false;
1232 }
1233
1234 if (loop->num_nodes != 5)
1235 {
1236 if (dump_enabled_p ())
1237 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1238 "not vectorized: control flow in loop.\n");
1239 return false;
1240 }
1241
1242 entryedge = loop_preheader_edge (innerloop);
1243 if (entryedge->src != loop->header
1244 || !single_exit (innerloop)
1245 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1246 {
1247 if (dump_enabled_p ())
1248 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1249 "not vectorized: unsupported outerloop form.\n");
1250 return false;
1251 }
1252
1253 /* Analyze the inner-loop. */
1254 tree inner_niterm1, inner_niter, inner_assumptions;
1255 if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1256 &inner_assumptions, &inner_niterm1,
1257 &inner_niter, NULL)
1258 /* Don't support analyzing niter under assumptions for inner
1259 loop. */
1260 || !integer_onep (inner_assumptions))
1261 {
1262 if (dump_enabled_p ())
1263 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1264 "not vectorized: Bad inner loop.\n");
1265 return false;
1266 }
1267
1268 if (!expr_invariant_in_loop_p (loop, inner_niter))
1269 {
1270 if (dump_enabled_p ())
1271 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1272 "not vectorized: inner-loop count not"
1273 " invariant.\n");
1274 return false;
1275 }
1276
1277 if (dump_enabled_p ())
1278 dump_printf_loc (MSG_NOTE, vect_location,
1279 "Considering outer-loop vectorization.\n");
1280 }
1281
1282 if (!single_exit (loop)
1283 || EDGE_COUNT (loop->header->preds) != 2)
1284 {
1285 if (dump_enabled_p ())
1286 {
1287 if (!single_exit (loop))
1288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1289 "not vectorized: multiple exits.\n");
1290 else if (EDGE_COUNT (loop->header->preds) != 2)
1291 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1292 "not vectorized: too many incoming edges.\n");
1293 }
1294 return false;
1295 }
1296
1297 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1298 that the loop is represented as a do-while (with a proper if-guard
1299 before the loop if needed), where the loop header contains all the
1300 executable statements, and the latch is empty. */
1301 if (!empty_block_p (loop->latch)
1302 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1303 {
1304 if (dump_enabled_p ())
1305 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1306 "not vectorized: latch block not empty.\n");
1307 return false;
1308 }
1309
1310 /* Make sure the exit is not abnormal. */
1311 edge e = single_exit (loop);
1312 if (e->flags & EDGE_ABNORMAL)
1313 {
1314 if (dump_enabled_p ())
1315 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1316 "not vectorized: abnormal loop exit edge.\n");
1317 return false;
1318 }
1319
1320 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1321 number_of_iterationsm1);
1322 if (!*loop_cond)
1323 {
1324 if (dump_enabled_p ())
1325 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1326 "not vectorized: complicated exit condition.\n");
1327 return false;
1328 }
1329
1330 if (integer_zerop (*assumptions)
1331 || !*number_of_iterations
1332 || chrec_contains_undetermined (*number_of_iterations))
1333 {
1334 if (dump_enabled_p ())
1335 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1336 "not vectorized: number of iterations cannot be "
1337 "computed.\n");
1338 return false;
1339 }
1340
1341 if (integer_zerop (*number_of_iterations))
1342 {
1343 if (dump_enabled_p ())
1344 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1345 "not vectorized: number of iterations = 0.\n");
1346 return false;
1347 }
1348
1349 return true;
1350 }
1351
1352 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1353
1354 loop_vec_info
1355 vect_analyze_loop_form (struct loop *loop, vec_info_shared *shared)
1356 {
1357 tree assumptions, number_of_iterations, number_of_iterationsm1;
1358 gcond *loop_cond, *inner_loop_cond = NULL;
1359
1360 if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1361 &assumptions, &number_of_iterationsm1,
1362 &number_of_iterations, &inner_loop_cond))
1363 return NULL;
1364
1365 loop_vec_info loop_vinfo = new _loop_vec_info (loop, shared);
1366 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1367 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1368 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1369 if (!integer_onep (assumptions))
1370 {
1371 /* We consider vectorizing this loop by versioning it under
1372 some assumptions. In order to do this, we need to clear
1373 existing information computed by scev and niter analyzer. */
1374 scev_reset_htab ();
1375 free_numbers_of_iterations_estimates (loop);
1376 /* Also set a flag for this loop so that the following scev and niter
1377 analyses are done under the assumptions. */
1378 loop_constraint_set (loop, LOOP_C_FINITE);
1379 /* Also record the assumptions for versioning. */
1380 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1381 }
1382
1383 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1384 {
1385 if (dump_enabled_p ())
1386 {
1387 dump_printf_loc (MSG_NOTE, vect_location,
1388 "Symbolic number of iterations is ");
1389 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1390 dump_printf (MSG_NOTE, "\n");
1391 }
1392 }
1393
1394 stmt_vec_info loop_cond_info = loop_vinfo->lookup_stmt (loop_cond);
1395 STMT_VINFO_TYPE (loop_cond_info) = loop_exit_ctrl_vec_info_type;
1396 if (inner_loop_cond)
1397 {
1398 stmt_vec_info inner_loop_cond_info
1399 = loop_vinfo->lookup_stmt (inner_loop_cond);
1400 STMT_VINFO_TYPE (inner_loop_cond_info) = loop_exit_ctrl_vec_info_type;
1401 }
1402
1403 gcc_assert (!loop->aux);
1404 loop->aux = loop_vinfo;
1405 return loop_vinfo;
1406 }
1407
1408
1409
1410 /* Scan the loop stmts and, depending on whether there are any non-SLP
1411 statements, update the vectorization factor. */
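/* Example (informal): if the loop-based analysis chose a vectorization
   factor of 4 but the SLP instances require an unrolling factor of 8,
   the common multiple 8 is used.  If every stmt in the loop is covered
   by SLP, the SLP unrolling factor is taken directly.  */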
1412
1413 static void
1414 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1415 {
1416 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1417 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1418 int nbbs = loop->num_nodes;
1419 poly_uint64 vectorization_factor;
1420 int i;
1421
1422 DUMP_VECT_SCOPE ("vect_update_vf_for_slp");
1423
1424 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1425 gcc_assert (known_ne (vectorization_factor, 0U));
1426
1427 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1428 the vectorization factor of the loop is the unrolling factor required by
1429 the SLP instances. If that unrolling factor is 1, we say that we
1430 perform pure SLP on the loop - cross-iteration parallelism is not
1431 exploited. */
1432 bool only_slp_in_loop = true;
1433 for (i = 0; i < nbbs; i++)
1434 {
1435 basic_block bb = bbs[i];
1436 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1437 gsi_next (&si))
1438 {
1439 gimple *stmt = gsi_stmt (si);
1440 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
1441 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1442 && STMT_VINFO_RELATED_STMT (stmt_info))
1443 {
1444 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1445 stmt_info = vinfo_for_stmt (stmt);
1446 }
1447 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1448 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1449 && !PURE_SLP_STMT (stmt_info))
1450 /* STMT needs both SLP and loop-based vectorization. */
1451 only_slp_in_loop = false;
1452 }
1453 }
1454
1455 if (only_slp_in_loop)
1456 {
1457 dump_printf_loc (MSG_NOTE, vect_location,
1458 "Loop contains only SLP stmts\n");
1459 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1460 }
1461 else
1462 {
1463 dump_printf_loc (MSG_NOTE, vect_location,
1464 "Loop contains SLP and non-SLP stmts\n");
1465 /* Both the vectorization factor and unroll factor have the form
1466 current_vector_size * X for some rational X, so they must have
1467 a common multiple. */
1468 vectorization_factor
1469 = force_common_multiple (vectorization_factor,
1470 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1471 }
1472
1473 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1474 if (dump_enabled_p ())
1475 {
1476 dump_printf_loc (MSG_NOTE, vect_location,
1477 "Updating vectorization factor to ");
1478 dump_dec (MSG_NOTE, vectorization_factor);
1479 dump_printf (MSG_NOTE, ".\n");
1480 }
1481 }
1482
1483 /* Return true if STMT_INFO describes a double reduction phi and if
1484 the other phi in the reduction is also relevant for vectorization.
1485 This rejects cases such as:
1486
1487 outer1:
1488 x_1 = PHI <x_3(outer2), ...>;
1489 ...
1490
1491 inner:
1492 x_2 = ...;
1493 ...
1494
1495 outer2:
1496 x_3 = PHI <x_2(inner)>;
1497
1498 if nothing in x_2 or elsewhere makes x_1 relevant. */
1499
1500 static bool
1501 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1502 {
1503 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1504 return false;
1505
1506 gimple *other_phi = STMT_VINFO_REDUC_DEF (stmt_info);
1507 return STMT_VINFO_RELEVANT_P (vinfo_for_stmt (other_phi));
1508 }
1509
1510 /* Function vect_analyze_loop_operations.
1511
1512 Scan the loop stmts and make sure they are all vectorizable. */
1513
1514 static bool
1515 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1516 {
1517 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1518 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1519 int nbbs = loop->num_nodes;
1520 int i;
1521 stmt_vec_info stmt_info;
1522 bool need_to_vectorize = false;
1523 bool ok;
1524
1525 DUMP_VECT_SCOPE ("vect_analyze_loop_operations");
1526
1527 stmt_vector_for_cost cost_vec;
1528 cost_vec.create (2);
1529
1530 for (i = 0; i < nbbs; i++)
1531 {
1532 basic_block bb = bbs[i];
1533
1534 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1535 gsi_next (&si))
1536 {
1537 gphi *phi = si.phi ();
1538 ok = true;
1539
1540 stmt_info = loop_vinfo->lookup_stmt (phi);
1541 if (dump_enabled_p ())
1542 {
1543 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1544 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1545 }
1546 if (virtual_operand_p (gimple_phi_result (phi)))
1547 continue;
1548
1549 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1550 (i.e., a phi in the tail of the outer-loop). */
1551 if (! is_loop_header_bb_p (bb))
1552 {
1553 /* FORNOW: we currently don't support the case that these phis
1554 are not used in the outer loop (unless it is a double reduction,
1555 i.e., this phi is vect_reduction_def), because this case
1556 would require us to actually do something here. */
1557 if (STMT_VINFO_LIVE_P (stmt_info)
1558 && !vect_active_double_reduction_p (stmt_info))
1559 {
1560 if (dump_enabled_p ())
1561 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1562 "Unsupported loop-closed phi in "
1563 "outer-loop.\n");
1564 return false;
1565 }
1566
1567 /* If PHI is used in the outer loop, we check that its operand
1568 is defined in the inner loop. */
1569 if (STMT_VINFO_RELEVANT_P (stmt_info))
1570 {
1571 tree phi_op;
1572 gimple *op_def_stmt;
1573
1574 if (gimple_phi_num_args (phi) != 1)
1575 return false;
1576
1577 phi_op = PHI_ARG_DEF (phi, 0);
1578 if (TREE_CODE (phi_op) != SSA_NAME)
1579 return false;
1580
1581 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1582 if (gimple_nop_p (op_def_stmt)
1583 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1584 || !vinfo_for_stmt (op_def_stmt))
1585 return false;
1586
1587 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1588 != vect_used_in_outer
1589 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1590 != vect_used_in_outer_by_reduction)
1591 return false;
1592 }
1593
1594 continue;
1595 }
1596
1597 gcc_assert (stmt_info);
1598
1599 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1600 || STMT_VINFO_LIVE_P (stmt_info))
1601 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1602 {
1603 /* A scalar-dependence cycle that we don't support. */
1604 if (dump_enabled_p ())
1605 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1606 "not vectorized: scalar dependence cycle.\n");
1607 return false;
1608 }
1609
1610 if (STMT_VINFO_RELEVANT_P (stmt_info))
1611 {
1612 need_to_vectorize = true;
1613 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1614 && ! PURE_SLP_STMT (stmt_info))
1615 ok = vectorizable_induction (phi, NULL, NULL, NULL, &cost_vec);
1616 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1617 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1618 && ! PURE_SLP_STMT (stmt_info))
1619 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL,
1620 &cost_vec);
1621 }
1622
1623 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1624 if (ok
1625 && STMT_VINFO_LIVE_P (stmt_info)
1626 && !PURE_SLP_STMT (stmt_info))
1627 ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL,
1628 &cost_vec);
1629
1630 if (!ok)
1631 {
1632 if (dump_enabled_p ())
1633 {
1634 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1635 "not vectorized: relevant phi not "
1636 "supported: ");
1637 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1638 }
1639 return false;
1640 }
1641 }
1642
1643 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1644 gsi_next (&si))
1645 {
1646 gimple *stmt = gsi_stmt (si);
1647 if (!gimple_clobber_p (stmt)
1648 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL,
1649 &cost_vec))
1650 return false;
1651 }
1652 } /* bbs */
1653
1654 add_stmt_costs (loop_vinfo->target_cost_data, &cost_vec);
1655 cost_vec.release ();
1656
1657 /* All operations in the loop are either irrelevant (they deal with loop
1658 control, or are dead), or are only used outside the loop and can be moved
1659 out of it (e.g. invariants, inductions). The loop can be
1660 optimized away by scalar optimizations. We're better off not
1661 touching this loop. */
1662 if (!need_to_vectorize)
1663 {
1664 if (dump_enabled_p ())
1665 dump_printf_loc (MSG_NOTE, vect_location,
1666 "All the computation can be taken out of the loop.\n");
1667 if (dump_enabled_p ())
1668 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1669 "not vectorized: redundant loop. no profit to "
1670 "vectorize.\n");
1671 return false;
1672 }
1673
1674 return true;
1675 }
1676
1677 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1678 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1679 definitely no, or -1 if it's worth retrying. */
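/* Worked example (all numbers hypothetical): with an assumed VF of 4,
   --param min-vect-loop-bound=2 and min_profitable_iters == 5 from the
   cost model, the threshold is

     th = MAX (2 * 4, 5) = 8

   so a loop with a known trip count below 8 is rejected as unprofitable,
   while a loop with an unknown trip count is instead checked against the
   estimated iteration count further down.  */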
1680
1681 static int
1682 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1683 {
1684 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1685 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1686
1687 /* Only fully-masked loops can have iteration counts less than the
1688 vectorization factor. */
1689 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1690 {
1691 HOST_WIDE_INT max_niter;
1692
1693 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1694 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1695 else
1696 max_niter = max_stmt_executions_int (loop);
1697
1698 if (max_niter != -1
1699 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1700 {
1701 if (dump_enabled_p ())
1702 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1703 "not vectorized: iteration count smaller than "
1704 "vectorization factor.\n");
1705 return 0;
1706 }
1707 }
1708
1709 int min_profitable_iters, min_profitable_estimate;
1710 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1711 &min_profitable_estimate);
1712
1713 if (min_profitable_iters < 0)
1714 {
1715 if (dump_enabled_p ())
1716 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1717 "not vectorized: vectorization not profitable.\n");
1718 if (dump_enabled_p ())
1719 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1720 "not vectorized: vector version will never be "
1721 "profitable.\n");
1722 return -1;
1723 }
1724
1725 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1726 * assumed_vf);
1727
1728 /* Use the cost model only if it is more conservative than the user-specified
1729 threshold. */
1730 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1731 min_profitable_iters);
1732
1733 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1734
1735 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1736 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1737 {
1738 if (dump_enabled_p ())
1739 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1740 "not vectorized: vectorization not profitable.\n");
1741 if (dump_enabled_p ())
1742 dump_printf_loc (MSG_NOTE, vect_location,
1743 "not vectorized: iteration count smaller than user "
1744 "specified loop bound parameter or minimum profitable "
1745 "iterations (whichever is more conservative).\n");
1746 return 0;
1747 }
1748
1749 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
1750 if (estimated_niter == -1)
1751 estimated_niter = likely_max_stmt_executions_int (loop);
1752 if (estimated_niter != -1
1753 && ((unsigned HOST_WIDE_INT) estimated_niter
1754 < MAX (th, (unsigned) min_profitable_estimate)))
1755 {
1756 if (dump_enabled_p ())
1757 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1758 "not vectorized: estimated iteration count too "
1759 "small.\n");
1760 if (dump_enabled_p ())
1761 dump_printf_loc (MSG_NOTE, vect_location,
1762 "not vectorized: estimated iteration count smaller "
1763 "than specified loop bound parameter or minimum "
1764 "profitable iterations (whichever is more "
1765 "conservative).\n");
1766 return -1;
1767 }
1768
1769 return 1;
1770 }
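/* Editorial aside: a minimal standalone sketch, not the GCC API, of how the
   threshold computed above combines the user-tunable minimum loop bound with
   the cost model's break-even point.  All names below are hypothetical.  */

static inline unsigned int
example_cost_threshold (unsigned int min_vect_loop_bound_param,
                        unsigned int assumed_vf,
                        int min_profitable_iters)
{
  /* Scale the PARAM_MIN_VECT_LOOP_BOUND value by the vectorization factor,
     then keep whichever bound is more conservative (larger).  */
  unsigned int min_scalar_loop_bound = min_vect_loop_bound_param * assumed_vf;
  unsigned int th = min_scalar_loop_bound;
  if (min_profitable_iters > 0
      && (unsigned int) min_profitable_iters > th)
    th = (unsigned int) min_profitable_iters;
  return th;
}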
1771
1772 static bool
1773 vect_get_datarefs_in_loop (loop_p loop, basic_block *bbs,
1774 vec<data_reference_p> *datarefs,
1775 unsigned int *n_stmts)
1776 {
1777 *n_stmts = 0;
1778 for (unsigned i = 0; i < loop->num_nodes; i++)
1779 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1780 !gsi_end_p (gsi); gsi_next (&gsi))
1781 {
1782 gimple *stmt = gsi_stmt (gsi);
1783 if (is_gimple_debug (stmt))
1784 continue;
1785 ++(*n_stmts);
1786 if (!vect_find_stmt_data_reference (loop, stmt, datarefs))
1787 {
1788 if (is_gimple_call (stmt) && loop->safelen)
1789 {
1790 tree fndecl = gimple_call_fndecl (stmt), op;
1791 if (fndecl != NULL_TREE)
1792 {
1793 cgraph_node *node = cgraph_node::get (fndecl);
1794 if (node != NULL && node->simd_clones != NULL)
1795 {
1796 unsigned int j, n = gimple_call_num_args (stmt);
1797 for (j = 0; j < n; j++)
1798 {
1799 op = gimple_call_arg (stmt, j);
1800 if (DECL_P (op)
1801 || (REFERENCE_CLASS_P (op)
1802 && get_base_address (op)))
1803 break;
1804 }
1805 op = gimple_call_lhs (stmt);
1806 /* Ignore #pragma omp declare simd functions
1807 if they don't have data references in the
1808 call stmt itself. */
1809 if (j == n
1810 && !(op
1811 && (DECL_P (op)
1812 || (REFERENCE_CLASS_P (op)
1813 && get_base_address (op)))))
1814 continue;
1815 }
1816 }
1817 }
1818 return false;
1819 }
1820 /* If dependence analysis will give up due to the limit on the
1821 number of datarefs, stop here and fail fatally. */
1822 if (datarefs->length ()
1823 > (unsigned)PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
1824 return false;
1825 }
1826 return true;
1827 }
1828
1829 /* Function vect_analyze_loop_2.
1830
1831 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1832 for it. The different analyses will record information in the
1833 loop_vec_info struct. */
1834 static bool
1835 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal, unsigned *n_stmts)
1836 {
1837 bool ok;
1838 int res;
1839 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1840 poly_uint64 min_vf = 2;
1841
1842 /* The first group of checks is independent of the vector size. */
1843 fatal = true;
1844
1845 /* Find all data references in the loop (which correspond to vdefs/vuses)
1846 and analyze their evolution in the loop. */
1847
1848 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1849
1850 /* Gather the data references and count stmts in the loop. */
1851 if (!LOOP_VINFO_DATAREFS (loop_vinfo).exists ())
1852 {
1853 if (!vect_get_datarefs_in_loop (loop, LOOP_VINFO_BBS (loop_vinfo),
1854 &LOOP_VINFO_DATAREFS (loop_vinfo),
1855 n_stmts))
1856 {
1857 if (dump_enabled_p ())
1858 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1859 "not vectorized: loop contains function "
1860 "calls or data references that cannot "
1861 "be analyzed\n");
1862 return false;
1863 }
1864 loop_vinfo->shared->save_datarefs ();
1865 }
1866 else
1867 loop_vinfo->shared->check_datarefs ();
1868
1869 /* Analyze the data references and also adjust the minimal
1870 vectorization factor according to the loads and stores. */
1871
1872 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1873 if (!ok)
1874 {
1875 if (dump_enabled_p ())
1876 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1877 "bad data references.\n");
1878 return false;
1879 }
1880
1881 /* Classify all cross-iteration scalar data-flow cycles.
1882 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1883 vect_analyze_scalar_cycles (loop_vinfo);
1884
1885 vect_pattern_recog (loop_vinfo);
1886
1887 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1888
1889 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1890 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1891
1892 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1893 if (!ok)
1894 {
1895 if (dump_enabled_p ())
1896 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1897 "bad data access.\n");
1898 return false;
1899 }
1900
1901 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1902
1903 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1904 if (!ok)
1905 {
1906 if (dump_enabled_p ())
1907 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1908 "unexpected pattern.\n");
1909 return false;
1910 }
1911
1912 /* The rest of the analysis below depends on the vector size in some way, so a failure from here on is not fatal. */
1913 fatal = false;
1914
1915 /* Analyze data dependences between the data-refs in the loop
1916 and adjust the maximum vectorization factor according to
1917 the dependences.
1918 FORNOW: fail at the first data dependence that we encounter. */
1919
1920 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1921 if (!ok
1922 || (max_vf != MAX_VECTORIZATION_FACTOR
1923 && maybe_lt (max_vf, min_vf)))
1924 {
1925 if (dump_enabled_p ())
1926 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1927 "bad data dependence.\n");
1928 return false;
1929 }
1930 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1931
1932 ok = vect_determine_vectorization_factor (loop_vinfo);
1933 if (!ok)
1934 {
1935 if (dump_enabled_p ())
1936 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1937 "can't determine vectorization factor.\n");
1938 return false;
1939 }
1940 if (max_vf != MAX_VECTORIZATION_FACTOR
1941 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1942 {
1943 if (dump_enabled_p ())
1944 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1945 "bad data dependence.\n");
1946 return false;
1947 }
1948
1949 /* Compute the scalar iteration cost. */
1950 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1951
1952 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1953 unsigned th;
1954
1955 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1956 ok = vect_analyze_slp (loop_vinfo, *n_stmts);
1957 if (!ok)
1958 return false;
1959
1960 /* If there are any SLP instances mark them as pure_slp. */
1961 bool slp = vect_make_slp_decision (loop_vinfo);
1962 if (slp)
1963 {
1964 /* Find stmts that need to be both vectorized and SLPed. */
1965 vect_detect_hybrid_slp (loop_vinfo);
1966
1967 /* Update the vectorization factor based on the SLP decision. */
1968 vect_update_vf_for_slp (loop_vinfo);
1969 }
1970
1971 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
1972
1973 /* We don't expect to have to roll back to anything other than an empty
1974 set of rgroups. */
1975 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
1976
1977 /* This is the point where we can re-start analysis with SLP forced off. */
1978 start_over:
1979
1980 /* Now the vectorization factor is final. */
1981 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1982 gcc_assert (known_ne (vectorization_factor, 0U));
1983
1984 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1985 {
1986 dump_printf_loc (MSG_NOTE, vect_location,
1987 "vectorization_factor = ");
1988 dump_dec (MSG_NOTE, vectorization_factor);
1989 dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
1990 LOOP_VINFO_INT_NITERS (loop_vinfo));
1991 }
1992
1993 HOST_WIDE_INT max_niter
1994 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1995
1996 /* Analyze the alignment of the data-refs in the loop.
1997 Fail if a data reference is found that cannot be vectorized. */
1998
1999 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2000 if (!ok)
2001 {
2002 if (dump_enabled_p ())
2003 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2004 "bad data alignment.\n");
2005 return false;
2006 }
2007
2008 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2009 It is important to call pruning after vect_analyze_data_ref_accesses,
2010 since we use grouping information gathered by interleaving analysis. */
2011 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2012 if (!ok)
2013 return false;
2014
2015 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2016 vectorization. */
2017 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2018 {
2019 /* This pass will decide on using loop versioning and/or loop peeling in
2020 order to enhance the alignment of data references in the loop. */
2021 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2022 if (!ok)
2023 {
2024 if (dump_enabled_p ())
2025 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2026 "bad data alignment.\n");
2027 return false;
2028 }
2029 }
2030
2031 if (slp)
2032 {
2033 /* Analyze operations in the SLP instances. Note this may
2034 remove unsupported SLP instances which makes the above
2035 SLP kind detection invalid. */
2036 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2037 vect_slp_analyze_operations (loop_vinfo);
2038 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2039 goto again;
2040 }
2041
2042 /* Scan all the remaining operations in the loop that are not subject
2043 to SLP and make sure they are vectorizable. */
2044 ok = vect_analyze_loop_operations (loop_vinfo);
2045 if (!ok)
2046 {
2047 if (dump_enabled_p ())
2048 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2049 "bad operation or unsupported loop bound.\n");
2050 return false;
2051 }
2052
2053 /* Decide whether to use a fully-masked loop for this vectorization
2054 factor. */
2055 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2056 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2057 && vect_verify_full_masking (loop_vinfo));
2058 if (dump_enabled_p ())
2059 {
2060 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2061 dump_printf_loc (MSG_NOTE, vect_location,
2062 "using a fully-masked loop.\n");
2063 else
2064 dump_printf_loc (MSG_NOTE, vect_location,
2065 "not using a fully-masked loop.\n");
2066 }
2067
2068 /* If epilog loop is required because of data accesses with gaps,
2069 one additional iteration needs to be peeled. Check if there are
2070 enough iterations for vectorization. */
2071 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2072 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2073 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2074 {
2075 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2076 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2077
2078 if (known_lt (wi::to_widest (scalar_niters), vf))
2079 {
2080 if (dump_enabled_p ())
2081 dump_printf_loc (MSG_NOTE, vect_location,
2082 "loop has no enough iterations to support"
2083 " peeling for gaps.\n");
2084 return false;
2085 }
2086 }
2087
2088 /* Check that the cost of the loop makes vectorizing worthwhile. */
2089 res = vect_analyze_loop_costing (loop_vinfo);
2090 if (res < 0)
2091 goto again;
2092 if (!res)
2093 {
2094 if (dump_enabled_p ())
2095 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2096 "Loop costings not worthwhile.\n");
2097 return false;
2098 }
2099
2100 /* Decide whether we need to create an epilogue loop to handle
2101 remaining scalar iterations. */
2102 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2103
2104 unsigned HOST_WIDE_INT const_vf;
2105 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2106 /* The main loop handles all iterations. */
2107 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2108 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2109 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2110 {
2111 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo)
2112 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo),
2113 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2114 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2115 }
2116 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2117 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2118 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2119 < (unsigned) exact_log2 (const_vf))
2120 /* In case of versioning, check if the maximum number of
2121 iterations is greater than th. If they are identical,
2122 the epilogue is unnecessary. */
2123 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2124 || ((unsigned HOST_WIDE_INT) max_niter
2125 > (th / const_vf) * const_vf))))
2126 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2127
2128 /* If an epilogue loop is required make sure we can create one. */
2129 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2130 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2131 {
2132 if (dump_enabled_p ())
2133 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2134 if (!vect_can_advance_ivs_p (loop_vinfo)
2135 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2136 single_exit (LOOP_VINFO_LOOP
2137 (loop_vinfo))))
2138 {
2139 if (dump_enabled_p ())
2140 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2141 "not vectorized: can't create required "
2142 "epilog loop\n");
2143 goto again;
2144 }
2145 }
2146
2147 /* During peeling, we need to check whether the number of loop iterations is
2148 enough for both the peeled prolog loop and the vector loop. This check
2149 can be merged with the threshold check of loop versioning, so
2150 increase the threshold for this case if necessary. */
2151 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2152 {
2153 poly_uint64 niters_th = 0;
2154
2155 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2156 {
2157 /* Niters for peeled prolog loop. */
2158 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2159 {
2160 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2161 tree vectype
2162 = STMT_VINFO_VECTYPE (vinfo_for_stmt (vect_dr_stmt (dr)));
2163 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2164 }
2165 else
2166 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2167 }
2168
2169 /* Niters for at least one iteration of vectorized loop. */
2170 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2171 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2172 /* One additional iteration because of peeling for gap. */
2173 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2174 niters_th += 1;
2175 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2176 }
2177
2178 gcc_assert (known_eq (vectorization_factor,
2179 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2180
2181 /* Ok to vectorize! */
2182 return true;
2183
2184 again:
2185 /* Try again with SLP forced off, but if we didn't do any SLP there is
2186 no point in re-trying. */
2187 if (!slp)
2188 return false;
2189
2190 /* If there are reduction chains re-trying will fail anyway. */
2191 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2192 return false;
2193
2194 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2195 via interleaving or lane instructions. */
2196 slp_instance instance;
2197 slp_tree node;
2198 unsigned i, j;
2199 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2200 {
2201 stmt_vec_info vinfo;
2202 vinfo = vinfo_for_stmt
2203 (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2204 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2205 continue;
2206 vinfo = vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (vinfo));
2207 unsigned int size = DR_GROUP_SIZE (vinfo);
2208 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2209 if (! vect_store_lanes_supported (vectype, size, false)
2210 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2211 && ! vect_grouped_store_supported (vectype, size))
2212 return false;
2213 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2214 {
2215 vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2216 vinfo = vinfo_for_stmt (DR_GROUP_FIRST_ELEMENT (vinfo));
2217 bool single_element_p = !DR_GROUP_NEXT_ELEMENT (vinfo);
2218 size = DR_GROUP_SIZE (vinfo);
2219 vectype = STMT_VINFO_VECTYPE (vinfo);
2220 if (! vect_load_lanes_supported (vectype, size, false)
2221 && ! vect_grouped_load_supported (vectype, single_element_p,
2222 size))
2223 return false;
2224 }
2225 }
2226
2227 if (dump_enabled_p ())
2228 dump_printf_loc (MSG_NOTE, vect_location,
2229 "re-trying with SLP disabled\n");
2230
2231 /* Roll back state appropriately. No SLP this time. */
2232 slp = false;
2233 /* Restore the vectorization factor to what it was without SLP. */
2234 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2235 /* Free the SLP instances. */
2236 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2237 vect_free_slp_instance (instance, false);
2238 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2239 /* Reset SLP type to loop_vect on all stmts. */
2240 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2241 {
2242 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2243 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2244 !gsi_end_p (si); gsi_next (&si))
2245 {
2246 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2247 STMT_SLP_TYPE (stmt_info) = loop_vect;
2248 }
2249 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2250 !gsi_end_p (si); gsi_next (&si))
2251 {
2252 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (gsi_stmt (si));
2253 STMT_SLP_TYPE (stmt_info) = loop_vect;
2254 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2255 {
2256 gimple *pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
2257 stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2258 STMT_SLP_TYPE (stmt_info) = loop_vect;
2259 for (gimple_stmt_iterator pi = gsi_start (pattern_def_seq);
2260 !gsi_end_p (pi); gsi_next (&pi))
2261 STMT_SLP_TYPE (loop_vinfo->lookup_stmt (gsi_stmt (pi)))
2262 = loop_vect;
2263 }
2264 }
2265 }
2266 /* Free optimized alias test DDRS. */
2267 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2268 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2269 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2270 /* Reset target cost data. */
2271 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2272 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2273 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2274 /* Reset accumulated rgroup information. */
2275 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2276 /* Reset assorted flags. */
2277 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2278 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2279 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2280 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2281 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2282
2283 goto start_over;
2284 }
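/* Editorial note on the retry structure above: a failure while FATAL is still
   true stops analysis for every candidate vector size; a failure after FATAL
   has been cleared only makes vect_analyze_loop retry with the next vector
   size; and the "again:" label rolls all SLP-related state back and jumps to
   "start_over:" to repeat the size-dependent analysis with SLP forced off.  */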
2285
2286 /* Function vect_analyze_loop.
2287
2288 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2289 for it. The different analyses will record information in the
2290 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL, the epilogue must
2291 be vectorized. */
2292 loop_vec_info
2293 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo,
2294 vec_info_shared *shared)
2295 {
2296 loop_vec_info loop_vinfo;
2297 auto_vector_sizes vector_sizes;
2298
2299 /* Autodetect first vector size we try. */
2300 current_vector_size = 0;
2301 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2302 unsigned int next_size = 0;
2303
2304 DUMP_VECT_SCOPE ("analyze_loop_nest");
2305
2306 if (loop_outer (loop)
2307 && loop_vec_info_for_loop (loop_outer (loop))
2308 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2309 {
2310 if (dump_enabled_p ())
2311 dump_printf_loc (MSG_NOTE, vect_location,
2312 "outer-loop already vectorized.\n");
2313 return NULL;
2314 }
2315
2316 if (!find_loop_nest (loop, &shared->loop_nest))
2317 {
2318 if (dump_enabled_p ())
2319 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2320 "not vectorized: loop nest containing two "
2321 "or more consecutive inner loops cannot be "
2322 "vectorized\n");
2323 return NULL;
2324 }
2325
2326 unsigned n_stmts = 0;
2327 poly_uint64 autodetected_vector_size = 0;
2328 while (1)
2329 {
2330 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2331 loop_vinfo = vect_analyze_loop_form (loop, shared);
2332 if (!loop_vinfo)
2333 {
2334 if (dump_enabled_p ())
2335 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2336 "bad loop form.\n");
2337 return NULL;
2338 }
2339
2340 bool fatal = false;
2341
2342 if (orig_loop_vinfo)
2343 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2344
2345 if (vect_analyze_loop_2 (loop_vinfo, fatal, &n_stmts))
2346 {
2347 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2348
2349 return loop_vinfo;
2350 }
2351
2352 delete loop_vinfo;
2353
2354 if (next_size == 0)
2355 autodetected_vector_size = current_vector_size;
2356
2357 if (next_size < vector_sizes.length ()
2358 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2359 next_size += 1;
2360
2361 if (fatal
2362 || next_size == vector_sizes.length ()
2363 || known_eq (current_vector_size, 0U))
2364 return NULL;
2365
2366 /* Try the next biggest vector size. */
2367 current_vector_size = vector_sizes[next_size++];
2368 if (dump_enabled_p ())
2369 {
2370 dump_printf_loc (MSG_NOTE, vect_location,
2371 "***** Re-trying analysis with "
2372 "vector size ");
2373 dump_dec (MSG_NOTE, current_vector_size);
2374 dump_printf (MSG_NOTE, "\n");
2375 }
2376 }
2377 }
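/* Illustrative standalone sketch (editorial, with hypothetical names rather
   than the GCC API) of the shape of the retry loop above: candidate vector
   sizes are tried in order until analysis succeeds or fails fatally.  */

static int
example_pick_vector_size (const unsigned int *candidate_sizes,
                          unsigned int n_candidates,
                          int (*analyze) (unsigned int size, int *fatal))
{
  for (unsigned int i = 0; i < n_candidates; i++)
    {
      int fatal = 0;
      if (analyze (candidate_sizes[i], &fatal))
        return (int) candidate_sizes[i];  /* Success: vectorize with this size.  */
      if (fatal)
        break;                            /* No point in trying other sizes.  */
    }
  return -1;                              /* Loop is not vectorizable.  */
}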
2378
2379 /* Return true if there is an in-order reduction function for CODE, storing
2380 it in *REDUC_FN if so. */
2381
2382 static bool
2383 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2384 {
2385 switch (code)
2386 {
2387 case PLUS_EXPR:
2388 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2389 return true;
2390
2391 default:
2392 return false;
2393 }
2394 }
2395
2396 /* Function reduction_fn_for_scalar_code
2397
2398 Input:
2399 CODE - tree_code of the reduction operation.
2400
2401 Output:
2402 REDUC_FN - the corresponding internal function to be used to reduce the
2403 vector of partial results into a single scalar result, or IFN_LAST
2404 if the operation is a supported reduction operation, but does not have
2405 such an internal function.
2406
2407 Return FALSE if CODE currently cannot be vectorized as a reduction. */
2408
2409 static bool
2410 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2411 {
2412 switch (code)
2413 {
2414 case MAX_EXPR:
2415 *reduc_fn = IFN_REDUC_MAX;
2416 return true;
2417
2418 case MIN_EXPR:
2419 *reduc_fn = IFN_REDUC_MIN;
2420 return true;
2421
2422 case PLUS_EXPR:
2423 *reduc_fn = IFN_REDUC_PLUS;
2424 return true;
2425
2426 case BIT_AND_EXPR:
2427 *reduc_fn = IFN_REDUC_AND;
2428 return true;
2429
2430 case BIT_IOR_EXPR:
2431 *reduc_fn = IFN_REDUC_IOR;
2432 return true;
2433
2434 case BIT_XOR_EXPR:
2435 *reduc_fn = IFN_REDUC_XOR;
2436 return true;
2437
2438 case MULT_EXPR:
2439 case MINUS_EXPR:
2440 *reduc_fn = IFN_LAST;
2441 return true;
2442
2443 default:
2444 return false;
2445 }
2446 }
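/* Editorial aside: a standalone sketch, not GCC code, of what the IFN_REDUC_*
   functions above conceptually compute: all lanes of a vector of partial
   results folded into one scalar.  Shown for a plus-reduction, with a plain
   array standing in for the vector lanes.  */

static int
example_reduc_plus (const int *lanes, unsigned int nlanes)
{
  int result = 0;
  for (unsigned int i = 0; i < nlanes; i++)
    result += lanes[i];
  return result;
}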
2447
2448 /* If there is a neutral value X such that SLP reduction NODE would not
2449 be affected by the introduction of additional X elements, return that X,
2450 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2451 is true if the SLP statements perform a single reduction, false if each
2452 statement performs an independent reduction. */
2453
2454 static tree
2455 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2456 bool reduc_chain)
2457 {
2458 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2459 gimple *stmt = stmts[0];
2460 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2461 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2462 tree scalar_type = TREE_TYPE (vector_type);
2463 struct loop *loop = gimple_bb (stmt)->loop_father;
2464 gcc_assert (loop);
2465
2466 switch (code)
2467 {
2468 case WIDEN_SUM_EXPR:
2469 case DOT_PROD_EXPR:
2470 case SAD_EXPR:
2471 case PLUS_EXPR:
2472 case MINUS_EXPR:
2473 case BIT_IOR_EXPR:
2474 case BIT_XOR_EXPR:
2475 return build_zero_cst (scalar_type);
2476
2477 case MULT_EXPR:
2478 return build_one_cst (scalar_type);
2479
2480 case BIT_AND_EXPR:
2481 return build_all_ones_cst (scalar_type);
2482
2483 case MAX_EXPR:
2484 case MIN_EXPR:
2485 /* For MIN/MAX the initial values are neutral. A reduction chain
2486 has only a single initial value, so that value is neutral for
2487 all statements. */
2488 if (reduc_chain)
2489 return PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop));
2490 return NULL_TREE;
2491
2492 default:
2493 return NULL_TREE;
2494 }
2495 }
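/* Editorial aside: a standalone sketch, not GCC code, illustrating why the
   values returned above are "neutral": padding extra lanes with them leaves
   the reduction result unchanged (0 for PLUS, 1 for MULT, all-ones for
   BIT_AND, and so on).  PADDED_N is assumed to be at least N.  */

static int
example_plus_with_neutral_padding (const int *vals, unsigned int n,
                                   unsigned int padded_n)
{
  int result = 0;
  for (unsigned int i = 0; i < padded_n; i++)
    result += (i < n ? vals[i] : 0);  /* Extra lanes hold the neutral 0.  */
  return result;                      /* Equal to the sum of VALS alone.  */
}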
2496
2497 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2498 STMT is printed with a message MSG. */
2499
2500 static void
2501 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2502 {
2503 dump_printf_loc (msg_type, vect_location, "%s", msg);
2504 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2505 }
2506
2507 /* DEF_STMT occurs in a loop that contains a potential reduction operation.
2508 Return true if the results of DEF_STMT are something that can be
2509 accumulated by such a reduction. */
2510
2511 static bool
2512 vect_valid_reduction_input_p (gimple *def_stmt)
2513 {
2514 stmt_vec_info def_stmt_info = vinfo_for_stmt (def_stmt);
2515 return (is_gimple_assign (def_stmt)
2516 || is_gimple_call (def_stmt)
2517 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_induction_def
2518 || (gimple_code (def_stmt) == GIMPLE_PHI
2519 && STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_internal_def
2520 && !is_loop_header_bb_p (gimple_bb (def_stmt))));
2521 }
2522
2523 /* Detect SLP reduction of the form:
2524
2525 #a1 = phi <a5, a0>
2526 a2 = operation (a1)
2527 a3 = operation (a2)
2528 a4 = operation (a3)
2529 a5 = operation (a4)
2530
2531 #a = phi <a5>
2532
2533 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2534 FIRST_STMT is the first reduction stmt in the chain
2535 (a2 = operation (a1)).
2536
2537 Return TRUE if a reduction chain was detected. */
2538
2539 static bool
2540 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2541 gimple *first_stmt)
2542 {
2543 struct loop *loop = (gimple_bb (phi))->loop_father;
2544 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2545 enum tree_code code;
2546 gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2547 stmt_vec_info use_stmt_info, current_stmt_info;
2548 tree lhs;
2549 imm_use_iterator imm_iter;
2550 use_operand_p use_p;
2551 int nloop_uses, size = 0, n_out_of_loop_uses;
2552 bool found = false;
2553
2554 if (loop != vect_loop)
2555 return false;
2556
2557 lhs = PHI_RESULT (phi);
2558 code = gimple_assign_rhs_code (first_stmt);
2559 while (1)
2560 {
2561 nloop_uses = 0;
2562 n_out_of_loop_uses = 0;
2563 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2564 {
2565 gimple *use_stmt = USE_STMT (use_p);
2566 if (is_gimple_debug (use_stmt))
2567 continue;
2568
2569 /* Check if we got back to the reduction phi. */
2570 if (use_stmt == phi)
2571 {
2572 loop_use_stmt = use_stmt;
2573 found = true;
2574 break;
2575 }
2576
2577 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2578 {
2579 loop_use_stmt = use_stmt;
2580 nloop_uses++;
2581 }
2582 else
2583 n_out_of_loop_uses++;
2584
2585 /* There can be either a single use in the loop or two uses in
2586 phi nodes. */
2587 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2588 return false;
2589 }
2590
2591 if (found)
2592 break;
2593
2594 /* We reached a statement with no loop uses. */
2595 if (nloop_uses == 0)
2596 return false;
2597
2598 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2599 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2600 return false;
2601
2602 if (!is_gimple_assign (loop_use_stmt)
2603 || code != gimple_assign_rhs_code (loop_use_stmt)
2604 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2605 return false;
2606
2607 /* Insert USE_STMT into reduction chain. */
2608 use_stmt_info = loop_info->lookup_stmt (loop_use_stmt);
2609 if (current_stmt)
2610 {
2611 current_stmt_info = vinfo_for_stmt (current_stmt);
2612 REDUC_GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2613 REDUC_GROUP_FIRST_ELEMENT (use_stmt_info)
2614 = REDUC_GROUP_FIRST_ELEMENT (current_stmt_info);
2615 }
2616 else
2617 REDUC_GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2618
2619 lhs = gimple_assign_lhs (loop_use_stmt);
2620 current_stmt = loop_use_stmt;
2621 size++;
2622 }
2623
2624 if (!found || loop_use_stmt != phi || size < 2)
2625 return false;
2626
2627 /* Swap the operands, if needed, to make the reduction operand be the second
2628 operand. */
2629 lhs = PHI_RESULT (phi);
2630 next_stmt = REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2631 while (next_stmt)
2632 {
2633 if (gimple_assign_rhs2 (next_stmt) == lhs)
2634 {
2635 tree op = gimple_assign_rhs1 (next_stmt);
2636 gimple *def_stmt = NULL;
2637
2638 if (TREE_CODE (op) == SSA_NAME)
2639 def_stmt = SSA_NAME_DEF_STMT (op);
2640
2641 /* Check that the other def is either defined in the loop
2642 ("vect_internal_def"), or it's an induction (defined by a
2643 loop-header phi-node). */
2644 if (def_stmt
2645 && gimple_bb (def_stmt)
2646 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2647 && vect_valid_reduction_input_p (def_stmt))
2648 {
2649 lhs = gimple_assign_lhs (next_stmt);
2650 next_stmt = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2651 continue;
2652 }
2653
2654 return false;
2655 }
2656 else
2657 {
2658 tree op = gimple_assign_rhs2 (next_stmt);
2659 gimple *def_stmt = NULL;
2660
2661 if (TREE_CODE (op) == SSA_NAME)
2662 def_stmt = SSA_NAME_DEF_STMT (op);
2663
2664 /* Check that the other def is either defined in the loop
2665 ("vect_internal_def"), or it's an induction (defined by a
2666 loop-header phi-node). */
2667 if (def_stmt
2668 && gimple_bb (def_stmt)
2669 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2670 && vect_valid_reduction_input_p (def_stmt))
2671 {
2672 if (dump_enabled_p ())
2673 {
2674 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2675 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2676 }
2677
2678 swap_ssa_operands (next_stmt,
2679 gimple_assign_rhs1_ptr (next_stmt),
2680 gimple_assign_rhs2_ptr (next_stmt));
2681 update_stmt (next_stmt);
2682
2683 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2684 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2685 }
2686 else
2687 return false;
2688 }
2689
2690 lhs = gimple_assign_lhs (next_stmt);
2691 next_stmt = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2692 }
2693
2694 /* Save the chain for further analysis in SLP detection. */
2695 first = REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2696 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2697 REDUC_GROUP_SIZE (vinfo_for_stmt (first)) = size;
2698
2699 return true;
2700 }
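/* Editorial aside: an illustrative source-level loop, not GCC code, whose
   body matches the reduction-chain pattern handled above: the accumulator is
   updated several times per iteration by the same operation, giving the chain
   a2 = operation (a1), a3 = operation (a2), and so on.  N is assumed to be a
   multiple of 4 here.  */

static int
example_reduction_chain (const int *a, int n)
{
  int sum = 0;
  for (int i = 0; i < n; i += 4)
    {
      sum = sum + a[i];      /* a2 = a1 + a[i]      */
      sum = sum + a[i + 1];  /* a3 = a2 + a[i + 1]  */
      sum = sum + a[i + 2];  /* a4 = a3 + a[i + 2]  */
      sum = sum + a[i + 3];  /* a5 = a4 + a[i + 3]  */
    }
  return sum;
}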
2701
2702 /* Return true if we need an in-order reduction for operation CODE
2703 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2704 overflow must wrap. */
2705
2706 static bool
2707 needs_fold_left_reduction_p (tree type, tree_code code,
2708 bool need_wrapping_integral_overflow)
2709 {
2710 /* CHECKME: check for !flag_finite_math_only too? */
2711 if (SCALAR_FLOAT_TYPE_P (type))
2712 switch (code)
2713 {
2714 case MIN_EXPR:
2715 case MAX_EXPR:
2716 return false;
2717
2718 default:
2719 return !flag_associative_math;
2720 }
2721
2722 if (INTEGRAL_TYPE_P (type))
2723 {
2724 if (!operation_no_trapping_overflow (type, code))
2725 return true;
2726 if (need_wrapping_integral_overflow
2727 && !TYPE_OVERFLOW_WRAPS (type)
2728 && operation_can_overflow (code))
2729 return true;
2730 return false;
2731 }
2732
2733 if (SAT_FIXED_POINT_TYPE_P (type))
2734 return true;
2735
2736 return false;
2737 }
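/* Editorial aside: a standalone sketch, not GCC code, of why floating-point
   reductions normally need the in-order (fold-left) strategy: summing the
   same values in a different association can round differently, which is
   exactly what a reassociating vectorized reduction would do.  */

static double
example_sum_in_order (const double *x, int n)
{
  double s = 0.0;
  for (int i = 0; i < n; i++)
    s += x[i];  /* Strictly left to right, like IFN_FOLD_LEFT_PLUS.  */
  return s;
}

/* For x = { 1.0, 1e16, -1e16 } this returns 0.0 (the 1.0 is absorbed when
   added to 1e16), whereas summing the same values in the order
   { 1e16, -1e16, 1.0 } yields 1.0; hence reassociation is only allowed
   under -fassociative-math.  */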
2738
2739 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2740 reduction operation CODE has a handled computation expression. */
2741
2742 bool
2743 check_reduction_path (dump_user_location_t loc, loop_p loop, gphi *phi,
2744 tree loop_arg, enum tree_code code)
2745 {
2746 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2747 auto_bitmap visited;
2748 tree lookfor = PHI_RESULT (phi);
2749 ssa_op_iter curri;
2750 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2751 while (USE_FROM_PTR (curr) != loop_arg)
2752 curr = op_iter_next_use (&curri);
2753 curri.i = curri.numops;
2754 do
2755 {
2756 path.safe_push (std::make_pair (curri, curr));
2757 tree use = USE_FROM_PTR (curr);
2758 if (use == lookfor)
2759 break;
2760 gimple *def = SSA_NAME_DEF_STMT (use);
2761 if (gimple_nop_p (def)
2762 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2763 {
2764 pop:
2765 do
2766 {
2767 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2768 curri = x.first;
2769 curr = x.second;
2770 do
2771 curr = op_iter_next_use (&curri);
2772 /* Skip already visited or non-SSA operands (from iterating
2773 over PHI args). */
2774 while (curr != NULL_USE_OPERAND_P
2775 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2776 || ! bitmap_set_bit (visited,
2777 SSA_NAME_VERSION
2778 (USE_FROM_PTR (curr)))));
2779 }
2780 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2781 if (curr == NULL_USE_OPERAND_P)
2782 break;
2783 }
2784 else
2785 {
2786 if (gimple_code (def) == GIMPLE_PHI)
2787 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2788 else
2789 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2790 while (curr != NULL_USE_OPERAND_P
2791 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2792 || ! bitmap_set_bit (visited,
2793 SSA_NAME_VERSION
2794 (USE_FROM_PTR (curr)))))
2795 curr = op_iter_next_use (&curri);
2796 if (curr == NULL_USE_OPERAND_P)
2797 goto pop;
2798 }
2799 }
2800 while (1);
2801 if (dump_file && (dump_flags & TDF_DETAILS))
2802 {
2803 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2804 unsigned i;
2805 std::pair<ssa_op_iter, use_operand_p> *x;
2806 FOR_EACH_VEC_ELT (path, i, x)
2807 {
2808 dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
2809 dump_printf (MSG_NOTE, " ");
2810 }
2811 dump_printf (MSG_NOTE, "\n");
2812 }
2813
2814 /* Check whether the reduction path detected is valid. */
2815 bool fail = path.length () == 0;
2816 bool neg = false;
2817 for (unsigned i = 1; i < path.length (); ++i)
2818 {
2819 gimple *use_stmt = USE_STMT (path[i].second);
2820 tree op = USE_FROM_PTR (path[i].second);
2821 if (! has_single_use (op)
2822 || ! is_gimple_assign (use_stmt))
2823 {
2824 fail = true;
2825 break;
2826 }
2827 if (gimple_assign_rhs_code (use_stmt) != code)
2828 {
2829 if (code == PLUS_EXPR
2830 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2831 {
2832 /* Track whether we negate the reduction value each iteration. */
2833 if (gimple_assign_rhs2 (use_stmt) == op)
2834 neg = ! neg;
2835 }
2836 else
2837 {
2838 fail = true;
2839 break;
2840 }
2841 }
2842 }
2843 return ! fail && ! neg;
2844 }
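/* Editorial aside: an illustrative source-level loop, not GCC code, whose
   reduction path is accepted above even though it mixes PLUS_EXPR and
   MINUS_EXPR: the accumulator always stays on the left of the minus, so the
   NEG flag is never set.  A statement like "res = b[i] - res" negates the
   accumulator each iteration and would be rejected.  */

static int
example_mixed_plus_minus (const int *a, const int *b, int n)
{
  int res = 0;
  for (int i = 0; i < n; i++)
    {
      res = res + a[i];
      res = res - b[i];
    }
  return res;
}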
2845
2846
2847 /* Function vect_is_simple_reduction
2848
2849 (1) Detect a cross-iteration def-use cycle that represents a simple
2850 reduction computation. We look for the following pattern:
2851
2852 loop_header:
2853 a1 = phi < a0, a2 >
2854 a3 = ...
2855 a2 = operation (a3, a1)
2856
2857 or
2858
2859 a3 = ...
2860 loop_header:
2861 a1 = phi < a0, a2 >
2862 a2 = operation (a3, a1)
2863
2864 such that:
2865 1. operation is commutative and associative and it is safe to
2866 change the order of the computation
2867 2. no uses for a2 in the loop (a2 is used out of the loop)
2868 3. no uses of a1 in the loop besides the reduction operation
2869 4. no uses of a1 outside the loop.
2870
2871 Conditions 1,4 are tested here.
2872 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2873
2874 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2875 nested cycles.
2876
2877 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2878 reductions:
2879
2880 a1 = phi < a0, a2 >
2881 inner loop (def of a3)
2882 a2 = phi < a3 >
2883
2884 (4) Detect condition expressions, i.e.:
2885 for (int i = 0; i < N; i++)
2886 if (a[i] < val)
2887 ret_val = a[i];
2888
2889 */
2890
2891 static gimple *
2892 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2893 bool *double_reduc,
2894 bool need_wrapping_integral_overflow,
2895 enum vect_reduction_type *v_reduc_type)
2896 {
2897 struct loop *loop = (gimple_bb (phi))->loop_father;
2898 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2899 gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
2900 enum tree_code orig_code, code;
2901 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2902 tree type;
2903 int nloop_uses;
2904 tree name;
2905 imm_use_iterator imm_iter;
2906 use_operand_p use_p;
2907 bool phi_def;
2908
2909 *double_reduc = false;
2910 *v_reduc_type = TREE_CODE_REDUCTION;
2911
2912 tree phi_name = PHI_RESULT (phi);
2913 /* ??? If there are no uses of the PHI result the inner loop reduction
2914 won't be detected as possibly double-reduction by vectorizable_reduction
2915 because that tries to walk the PHI arg from the preheader edge which
2916 can be constant. See PR60382. */
2917 if (has_zero_uses (phi_name))
2918 return NULL;
2919 nloop_uses = 0;
2920 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2921 {
2922 gimple *use_stmt = USE_STMT (use_p);
2923 if (is_gimple_debug (use_stmt))
2924 continue;
2925
2926 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2927 {
2928 if (dump_enabled_p ())
2929 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2930 "intermediate value used outside loop.\n");
2931
2932 return NULL;
2933 }
2934
2935 nloop_uses++;
2936 if (nloop_uses > 1)
2937 {
2938 if (dump_enabled_p ())
2939 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2940 "reduction value used in loop.\n");
2941 return NULL;
2942 }
2943
2944 phi_use_stmt = use_stmt;
2945 }
2946
2947 edge latch_e = loop_latch_edge (loop);
2948 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2949 if (TREE_CODE (loop_arg) != SSA_NAME)
2950 {
2951 if (dump_enabled_p ())
2952 {
2953 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2954 "reduction: not ssa_name: ");
2955 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2956 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2957 }
2958 return NULL;
2959 }
2960
2961 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2962 if (is_gimple_assign (def_stmt))
2963 {
2964 name = gimple_assign_lhs (def_stmt);
2965 phi_def = false;
2966 }
2967 else if (gimple_code (def_stmt) == GIMPLE_PHI)
2968 {
2969 name = PHI_RESULT (def_stmt);
2970 phi_def = true;
2971 }
2972 else
2973 {
2974 if (dump_enabled_p ())
2975 {
2976 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2977 "reduction: unhandled reduction operation: ");
2978 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
2979 }
2980 return NULL;
2981 }
2982
2983 if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
2984 return NULL;
2985
2986 nloop_uses = 0;
2987 auto_vec<gphi *, 3> lcphis;
2988 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2989 {
2990 gimple *use_stmt = USE_STMT (use_p);
2991 if (is_gimple_debug (use_stmt))
2992 continue;
2993 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2994 nloop_uses++;
2995 else
2996 /* We can have more than one loop-closed PHI. */
2997 lcphis.safe_push (as_a <gphi *> (use_stmt));
2998 if (nloop_uses > 1)
2999 {
3000 if (dump_enabled_p ())
3001 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3002 "reduction used in loop.\n");
3003 return NULL;
3004 }
3005 }
3006
3007 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3008 defined in the inner loop. */
3009 if (phi_def)
3010 {
3011 op1 = PHI_ARG_DEF (def_stmt, 0);
3012
3013 if (gimple_phi_num_args (def_stmt) != 1
3014 || TREE_CODE (op1) != SSA_NAME)
3015 {
3016 if (dump_enabled_p ())
3017 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3018 "unsupported phi node definition.\n");
3019
3020 return NULL;
3021 }
3022
3023 def1 = SSA_NAME_DEF_STMT (op1);
3024 if (gimple_bb (def1)
3025 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3026 && loop->inner
3027 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3028 && is_gimple_assign (def1)
3029 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3030 {
3031 if (dump_enabled_p ())
3032 report_vect_op (MSG_NOTE, def_stmt,
3033 "detected double reduction: ");
3034
3035 *double_reduc = true;
3036 return def_stmt;
3037 }
3038
3039 return NULL;
3040 }
3041
3042 /* If we are vectorizing an inner reduction, we execute it in the
3043 original order only when we are not dealing with a
3044 double reduction. */
3045 bool check_reduction = true;
3046 if (flow_loop_nested_p (vect_loop, loop))
3047 {
3048 gphi *lcphi;
3049 unsigned i;
3050 check_reduction = false;
3051 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3052 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3053 {
3054 gimple *use_stmt = USE_STMT (use_p);
3055 if (is_gimple_debug (use_stmt))
3056 continue;
3057 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3058 check_reduction = true;
3059 }
3060 }
3061
3062 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
3063 code = orig_code = gimple_assign_rhs_code (def_stmt);
3064
3065 /* We can handle "res -= x[i]", which is non-associative, by
3066 simply rewriting it into "res += -x[i]". Avoid changing the
3067 gimple instruction for the first simple tests and only do this
3068 if we're allowed to change the code at all. */
3069 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3070 code = PLUS_EXPR;
3071
3072 if (code == COND_EXPR)
3073 {
3074 if (! nested_in_vect_loop)
3075 *v_reduc_type = COND_REDUCTION;
3076
3077 op3 = gimple_assign_rhs1 (def_stmt);
3078 if (COMPARISON_CLASS_P (op3))
3079 {
3080 op4 = TREE_OPERAND (op3, 1);
3081 op3 = TREE_OPERAND (op3, 0);
3082 }
3083 if (op3 == phi_name || op4 == phi_name)
3084 {
3085 if (dump_enabled_p ())
3086 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3087 "reduction: condition depends on previous"
3088 " iteration: ");
3089 return NULL;
3090 }
3091
3092 op1 = gimple_assign_rhs2 (def_stmt);
3093 op2 = gimple_assign_rhs3 (def_stmt);
3094 }
3095 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3096 {
3097 if (dump_enabled_p ())
3098 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3099 "reduction: not commutative/associative: ");
3100 return NULL;
3101 }
3102 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3103 {
3104 op1 = gimple_assign_rhs1 (def_stmt);
3105 op2 = gimple_assign_rhs2 (def_stmt);
3106 }
3107 else
3108 {
3109 if (dump_enabled_p ())
3110 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3111 "reduction: not handled operation: ");
3112 return NULL;
3113 }
3114
3115 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3116 {
3117 if (dump_enabled_p ())
3118 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3119 "reduction: both uses not ssa_names: ");
3120
3121 return NULL;
3122 }
3123
3124 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3125 if ((TREE_CODE (op1) == SSA_NAME
3126 && !types_compatible_p (type, TREE_TYPE (op1)))
3127 || (TREE_CODE (op2) == SSA_NAME
3128 && !types_compatible_p (type, TREE_TYPE (op2)))
3129 || (op3 && TREE_CODE (op3) == SSA_NAME
3130 && !types_compatible_p (type, TREE_TYPE (op3)))
3131 || (op4 && TREE_CODE (op4) == SSA_NAME
3132 && !types_compatible_p (type, TREE_TYPE (op4))))
3133 {
3134 if (dump_enabled_p ())
3135 {
3136 dump_printf_loc (MSG_NOTE, vect_location,
3137 "reduction: multiple types: operation type: ");
3138 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3139 dump_printf (MSG_NOTE, ", operands types: ");
3140 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3141 TREE_TYPE (op1));
3142 dump_printf (MSG_NOTE, ",");
3143 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3144 TREE_TYPE (op2));
3145 if (op3)
3146 {
3147 dump_printf (MSG_NOTE, ",");
3148 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3149 TREE_TYPE (op3));
3150 }
3151
3152 if (op4)
3153 {
3154 dump_printf (MSG_NOTE, ",");
3155 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3156 TREE_TYPE (op4));
3157 }
3158 dump_printf (MSG_NOTE, "\n");
3159 }
3160
3161 return NULL;
3162 }
3163
3164 /* Check whether it's ok to change the order of the computation.
3165 Generally, when vectorizing a reduction we change the order of the
3166 computation. This may change the behavior of the program in some
3167 cases, so we need to check that this is ok. One exception is when
3168 vectorizing an outer-loop: the inner-loop is executed sequentially,
3169 and therefore vectorizing reductions in the inner-loop during
3170 outer-loop vectorization is safe. */
3171 if (check_reduction
3172 && *v_reduc_type == TREE_CODE_REDUCTION
3173 && needs_fold_left_reduction_p (type, code,
3174 need_wrapping_integral_overflow))
3175 *v_reduc_type = FOLD_LEFT_REDUCTION;
3176
3177 /* Reduction is safe. We're dealing with one of the following:
3178 1) integer arithmetic and no trapv
3179 2) floating point arithmetic, and special flags permit this optimization
3180 3) nested cycle (i.e., outer loop vectorization). */
3181 if (TREE_CODE (op1) == SSA_NAME)
3182 def1 = SSA_NAME_DEF_STMT (op1);
3183
3184 if (TREE_CODE (op2) == SSA_NAME)
3185 def2 = SSA_NAME_DEF_STMT (op2);
3186
3187 if (code != COND_EXPR
3188 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3189 {
3190 if (dump_enabled_p ())
3191 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3192 return NULL;
3193 }
3194
3195 /* Check that one def is the reduction def, defined by PHI,
3196 the other def is either defined in the loop ("vect_internal_def"),
3197 or it's an induction (defined by a loop-header phi-node). */
3198
3199 if (def2 && def2 == phi
3200 && (code == COND_EXPR
3201 || !def1 || gimple_nop_p (def1)
3202 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3203 || vect_valid_reduction_input_p (def1)))
3204 {
3205 if (dump_enabled_p ())
3206 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3207 return def_stmt;
3208 }
3209
3210 if (def1 && def1 == phi
3211 && (code == COND_EXPR
3212 || !def2 || gimple_nop_p (def2)
3213 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3214 || vect_valid_reduction_input_p (def2)))
3215 {
3216 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3217 {
3218 /* Check if we can swap operands (just for simplicity - so that
3219 the rest of the code can assume that the reduction variable
3220 is always the last (second) argument). */
3221 if (code == COND_EXPR)
3222 {
3223 /* Swap cond_expr by inverting the condition. */
3224 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3225 enum tree_code invert_code = ERROR_MARK;
3226 enum tree_code cond_code = TREE_CODE (cond_expr);
3227
3228 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3229 {
3230 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3231 invert_code = invert_tree_comparison (cond_code, honor_nans);
3232 }
3233 if (invert_code != ERROR_MARK)
3234 {
3235 TREE_SET_CODE (cond_expr, invert_code);
3236 swap_ssa_operands (def_stmt,
3237 gimple_assign_rhs2_ptr (def_stmt),
3238 gimple_assign_rhs3_ptr (def_stmt));
3239 }
3240 else
3241 {
3242 if (dump_enabled_p ())
3243 report_vect_op (MSG_NOTE, def_stmt,
3244 "detected reduction: cannot swap operands "
3245 "for cond_expr");
3246 return NULL;
3247 }
3248 }
3249 else
3250 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3251 gimple_assign_rhs2_ptr (def_stmt));
3252
3253 if (dump_enabled_p ())
3254 report_vect_op (MSG_NOTE, def_stmt,
3255 "detected reduction: need to swap operands: ");
3256
3257 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3258 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3259 }
3260 else
3261 {
3262 if (dump_enabled_p ())
3263 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3264 }
3265
3266 return def_stmt;
3267 }
3268
3269 /* Try to find SLP reduction chain. */
3270 if (! nested_in_vect_loop
3271 && code != COND_EXPR
3272 && orig_code != MINUS_EXPR
3273 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3274 {
3275 if (dump_enabled_p ())
3276 report_vect_op (MSG_NOTE, def_stmt,
3277 "reduction: detected reduction chain: ");
3278
3279 return def_stmt;
3280 }
3281
3282 /* Dissolve a group possibly left half-built by vect_is_slp_reduction. */
3283 gimple *first = REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3284 while (first)
3285 {
3286 gimple *next = REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3287 REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3288 REDUC_GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3289 first = next;
3290 }
3291
3292 /* Look for the expression computing loop_arg from loop PHI result. */
3293 if (check_reduction_path (vect_location, loop, as_a <gphi *> (phi), loop_arg,
3294 code))
3295 return def_stmt;
3296
3297 if (dump_enabled_p ())
3298 {
3299 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3300 "reduction: unknown pattern: ");
3301 }
3302
3303 return NULL;
3304 }
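/* Editorial aside: two illustrative source-level loops, not GCC code, showing
   the most common shapes recognized above: a plain TREE_CODE_REDUCTION and a
   COND_REDUCTION of the kind sketched in pattern (4) of the comment.  */

static int
example_plain_reduction (const int *a, int n)
{
  int sum = 0;
  for (int i = 0; i < n; i++)
    sum += a[i];                /* a2 = a1 + a[i]  */
  return sum;
}

static int
example_cond_reduction (const int *a, int n, int val)
{
  int ret_val = 0;
  for (int i = 0; i < n; i++)
    if (a[i] < val)
      ret_val = a[i];           /* a2 = a[i] < val ? a[i] : a1  */
  return ret_val;
}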
3305
3306 /* Wrapper around vect_is_simple_reduction, which will modify code
3307 in-place if it enables detection of more reductions. The arguments
3308 are the same as for vect_is_simple_reduction. */
3309
3310 gimple *
3311 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3312 bool *double_reduc,
3313 bool need_wrapping_integral_overflow)
3314 {
3315 enum vect_reduction_type v_reduc_type;
3316 gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3317 need_wrapping_integral_overflow,
3318 &v_reduc_type);
3319 if (def)
3320 {
3321 stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3322 STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3323 STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3324 reduc_def_info = vinfo_for_stmt (def);
3325 STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3326 STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3327 }
3328 return def;
3329 }
3330
3331 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3332 int
3333 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3334 int *peel_iters_epilogue,
3335 stmt_vector_for_cost *scalar_cost_vec,
3336 stmt_vector_for_cost *prologue_cost_vec,
3337 stmt_vector_for_cost *epilogue_cost_vec)
3338 {
3339 int retval = 0;
3340 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3341
3342 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3343 {
3344 *peel_iters_epilogue = assumed_vf / 2;
3345 if (dump_enabled_p ())
3346 dump_printf_loc (MSG_NOTE, vect_location,
3347 "cost model: epilogue peel iters set to vf/2 "
3348 "because loop iterations are unknown .\n");
3349
3350 /* If peeled iterations are known but the number of scalar loop
3351 iterations is unknown, count a taken branch per peeled loop. */
3352 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3353 NULL, 0, vect_prologue);
3354 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3355 NULL, 0, vect_epilogue);
3356 }
3357 else
3358 {
3359 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3360 peel_iters_prologue = niters < peel_iters_prologue ?
3361 niters : peel_iters_prologue;
3362 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3363 /* If we need to peel for gaps but the computed epilogue peel count is
3364 zero, we have to peel VF iterations. */
3365 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3366 *peel_iters_epilogue = assumed_vf;
3367 }
3368
3369 stmt_info_for_cost *si;
3370 int j;
3371 if (peel_iters_prologue)
3372 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3373 {
3374 stmt_vec_info stmt_info
3375 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3376 retval += record_stmt_cost (prologue_cost_vec,
3377 si->count * peel_iters_prologue,
3378 si->kind, stmt_info, si->misalign,
3379 vect_prologue);
3380 }
3381 if (*peel_iters_epilogue)
3382 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3383 {
3384 stmt_vec_info stmt_info
3385 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3386 retval += record_stmt_cost (epilogue_cost_vec,
3387 si->count * *peel_iters_epilogue,
3388 si->kind, stmt_info, si->misalign,
3389 vect_epilogue);
3390 }
3391
3392 return retval;
3393 }
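/* Editorial aside: a minimal standalone sketch, with hypothetical names rather
   than the GCC API, of how the epilogue peel count above is derived when the
   scalar iteration count is known: whatever remains after the prologue peel
   and the full vector iterations must run in the epilogue.  The extra
   adjustment for peeling for gaps is left out.  */

static int
example_epilogue_peel_iters (int niters, int peel_iters_prologue, int vf)
{
  if (peel_iters_prologue > niters)
    peel_iters_prologue = niters;
  return (niters - peel_iters_prologue) % vf;
}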
3394
3395 /* Function vect_estimate_min_profitable_iters
3396
3397 Return the number of iterations required for the vector version of the
3398 loop to be profitable relative to the cost of the scalar version of the
3399 loop.
3400
3401 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3402 on the number of iterations for vectorization. A value of -1 means loop
3403 vectorization is not profitable. This returned value may be used for a
3404 dynamic profitability check.
3405
3406 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3407 for a static check against the estimated number of iterations. */
3408
3409 static void
3410 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3411 int *ret_min_profitable_niters,
3412 int *ret_min_profitable_estimate)
3413 {
3414 int min_profitable_iters;
3415 int min_profitable_estimate;
3416 int peel_iters_prologue;
3417 int peel_iters_epilogue;
3418 unsigned vec_inside_cost = 0;
3419 int vec_outside_cost = 0;
3420 unsigned vec_prologue_cost = 0;
3421 unsigned vec_epilogue_cost = 0;
3422 int scalar_single_iter_cost = 0;
3423 int scalar_outside_cost = 0;
3424 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3425 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3426 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3427
3428 /* Cost model disabled. */
3429 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3430 {
3431 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3432 *ret_min_profitable_niters = 0;
3433 *ret_min_profitable_estimate = 0;
3434 return;
3435 }
3436
3437 /* Requires loop versioning tests to handle misalignment. */
3438 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3439 {
3440 /* FIXME: Make cost depend on complexity of individual check. */
3441 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3442 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3443 vect_prologue);
3444 dump_printf (MSG_NOTE,
3445 "cost model: Adding cost of checks for loop "
3446 "versioning to treat misalignment.\n");
3447 }
3448
3449 /* Requires loop versioning with alias checks. */
3450 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3451 {
3452 /* FIXME: Make cost depend on complexity of individual check. */
3453 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3454 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3455 vect_prologue);
3456 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3457 if (len)
3458 /* Count LEN - 1 ANDs and LEN comparisons. */
3459 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3460 NULL, 0, vect_prologue);
3461 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3462 if (len)
3463 {
3464 /* Count LEN - 1 ANDs and LEN comparisons. */
3465 unsigned int nstmts = len * 2 - 1;
3466 /* +1 for each bias that needs adding. */
3467 for (unsigned int i = 0; i < len; ++i)
3468 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3469 nstmts += 1;
3470 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3471 NULL, 0, vect_prologue);
3472 }
3473 dump_printf (MSG_NOTE,
3474 "cost model: Adding cost of checks for loop "
3475 "versioning aliasing.\n");
3476 }
3477
3478 /* Requires loop versioning with niter checks. */
3479 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3480 {
3481 /* FIXME: Make cost depend on complexity of individual check. */
3482 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3483 vect_prologue);
3484 dump_printf (MSG_NOTE,
3485 "cost model: Adding cost of checks for loop "
3486 "versioning niters.\n");
3487 }
3488
3489 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3490 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3491 vect_prologue);
3492
3493 /* Count statements in scalar loop. Using this as scalar cost for a single
3494 iteration for now.
3495
3496 TODO: Add outer loop support.
3497
3498 TODO: Consider assigning different costs to different scalar
3499 statements. */
3500
3501 scalar_single_iter_cost
3502 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3503
3504 /* Add additional cost for the peeled instructions in prologue and epilogue
3505 loop. (For fully-masked loops there will be no peeling.)
3506
3507 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3508 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3509
3510 TODO: Build an expression that represents peel_iters for prologue and
3511 epilogue to be used in a run-time test. */
3512
3513 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3514 {
3515 peel_iters_prologue = 0;
3516 peel_iters_epilogue = 0;
3517
3518 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3519 {
3520 /* We need to peel exactly one iteration. */
3521 peel_iters_epilogue += 1;
3522 stmt_info_for_cost *si;
3523 int j;
3524 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3525 j, si)
3526 {
3527 struct _stmt_vec_info *stmt_info
3528 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3529 (void) add_stmt_cost (target_cost_data, si->count,
3530 si->kind, stmt_info, si->misalign,
3531 vect_epilogue);
3532 }
3533 }
3534 }
3535 else if (npeel < 0)
3536 {
3537 peel_iters_prologue = assumed_vf / 2;
3538 dump_printf (MSG_NOTE, "cost model: "
3539 "prologue peel iters set to vf/2.\n");
3540
3541 /* If peeling for alignment is unknown, the loop bound of the main loop
3542 becomes unknown. */
3543 peel_iters_epilogue = assumed_vf / 2;
3544 dump_printf (MSG_NOTE, "cost model: "
3545 "epilogue peel iters set to vf/2 because "
3546 "peeling for alignment is unknown.\n");
3547
3548 /* If peeled iterations are unknown, count a taken branch and a not taken
3549 branch per peeled loop. Even if scalar loop iterations are known,
3550 vector iterations are not known since peeled prologue iterations are
3551 not known. Hence guards remain the same. */
3552 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3553 NULL, 0, vect_prologue);
3554 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3555 NULL, 0, vect_prologue);
3556 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3557 NULL, 0, vect_epilogue);
3558 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3559 NULL, 0, vect_epilogue);
3560 stmt_info_for_cost *si;
3561 int j;
3562 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3563 {
3564 struct _stmt_vec_info *stmt_info
3565 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3566 (void) add_stmt_cost (target_cost_data,
3567 si->count * peel_iters_prologue,
3568 si->kind, stmt_info, si->misalign,
3569 vect_prologue);
3570 (void) add_stmt_cost (target_cost_data,
3571 si->count * peel_iters_epilogue,
3572 si->kind, stmt_info, si->misalign,
3573 vect_epilogue);
3574 }
3575 }
3576 else
3577 {
3578 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3579 stmt_info_for_cost *si;
3580 int j;
3581 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3582
3583 prologue_cost_vec.create (2);
3584 epilogue_cost_vec.create (2);
3585 peel_iters_prologue = npeel;
3586
3587 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3588 &peel_iters_epilogue,
3589 &LOOP_VINFO_SCALAR_ITERATION_COST
3590 (loop_vinfo),
3591 &prologue_cost_vec,
3592 &epilogue_cost_vec);
3593
3594 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3595 {
3596 struct _stmt_vec_info *stmt_info
3597 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3598 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3599 si->misalign, vect_prologue);
3600 }
3601
3602 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3603 {
3604 struct _stmt_vec_info *stmt_info
3605 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3606 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3607 si->misalign, vect_epilogue);
3608 }
3609
3610 prologue_cost_vec.release ();
3611 epilogue_cost_vec.release ();
3612 }
3613
3614 /* FORNOW: The scalar outside cost is incremented in one of the
3615 following ways:
3616
3617 1. The vectorizer checks for alignment and aliasing and generates
3618 a condition that allows dynamic vectorization. A cost model
3619 check is ANDED with the versioning condition. Hence scalar code
3620 path now has the added cost of the versioning check.
3621
3622 if (cost > th & versioning_check)
3623 jmp to vector code
3624
3625 Hence run-time scalar is incremented by not-taken branch cost.
3626
3627 2. The vectorizer then checks if a prologue is required. If the
3628 cost model check was not done before during versioning, it has to
3629 be done before the prologue check.
3630
3631 if (cost <= th)
3632 prologue = scalar_iters
3633 if (prologue == 0)
3634 jmp to vector code
3635 else
3636 execute prologue
3637 if (prologue == num_iters)
3638 go to exit
3639
3640 Hence the run-time scalar cost is incremented by a taken branch,
3641 plus a not-taken branch, plus a taken branch cost.
3642
3643 3. The vectorizer then checks if an epilogue is required. If the
3644 cost model check was not done before during prologue check, it
3645 has to be done with the epilogue check.
3646
3647 if (prologue == 0)
3648 jmp to vector code
3649 else
3650 execute prologue
3651 if (prologue == num_iters)
3652 go to exit
3653 vector code:
3654 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3655 jmp to epilogue
3656
3657 Hence the run-time scalar cost should be incremented by 2 taken
3658 branches.
3659
3660 TODO: The back end may reorder the BBs differently and reverse
3661 conditions/branch directions. Change the estimates below to
3662 something more reasonable. */
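 /* Summarising the three cases above with purely illustrative branch costs
    of 3 for a taken and 1 for a not-taken branch: versioning adds 1 to the
    scalar path (case 1), a prologue-time check adds 2*3 + 1 == 7 (case 2)
    and an epilogue-time check adds 2*3 == 6 (case 3), which is what the
    code below accumulates into SCALAR_OUTSIDE_COST via vect_get_stmt_cost.  */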
3663
3664 /* If the number of iterations is known and we do not do versioning, we can
3665 decide whether to vectorize at compile time. Hence the scalar version
3666 does not carry cost model guard costs. */
3667 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3668 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3669 {
3670 /* Cost model check occurs at versioning. */
3671 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3672 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3673 else
3674 {
3675 /* Cost model check occurs at prologue generation. */
3676 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3677 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3678 + vect_get_stmt_cost (cond_branch_not_taken);
3679 /* Cost model check occurs at epilogue generation. */
3680 else
3681 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3682 }
3683 }
3684
3685 /* Complete the target-specific cost calculations. */
3686 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3687 &vec_inside_cost, &vec_epilogue_cost);
3688
3689 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3690
3691 if (dump_enabled_p ())
3692 {
3693 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3694 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3695 vec_inside_cost);
3696 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3697 vec_prologue_cost);
3698 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3699 vec_epilogue_cost);
3700 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3701 scalar_single_iter_cost);
3702 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3703 scalar_outside_cost);
3704 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3705 vec_outside_cost);
3706 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3707 peel_iters_prologue);
3708 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3709 peel_iters_epilogue);
3710 }
3711
3712 /* Calculate number of iterations required to make the vector version
3713 profitable, relative to the loop bodies only. The following condition
3714 must hold true:
3715 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3716 where
3717 SIC = scalar iteration cost, VIC = vector iteration cost,
3718 VOC = vector outside cost, VF = vectorization factor,
3719 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3720 SOC = scalar outside cost for run time cost model check. */
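 /* Worked example with illustrative numbers (not from any real target):
    SIC = 3, VIC = 7, VF = 4, VOC = 21, SOC = 4 and no peeling.  The code
    below computes ((21 - 4) * 4) / (3 * 4 - 7) = 68 / 5 = 13, then sees
    that at 13 iterations the scalar cost is still not above the vector
    cost (3*4*13 = 156 <= 7*13 + (21 - 4)*4 = 159) and bumps the result
    to 14.  */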
3721
3722 if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
3723 {
3724 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3725 * assumed_vf
3726 - vec_inside_cost * peel_iters_prologue
3727 - vec_inside_cost * peel_iters_epilogue);
3728 if (min_profitable_iters <= 0)
3729 min_profitable_iters = 0;
3730 else
3731 {
3732 min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
3733 - vec_inside_cost);
3734
3735 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3736 <= (((int) vec_inside_cost * min_profitable_iters)
3737 + (((int) vec_outside_cost - scalar_outside_cost)
3738 * assumed_vf)))
3739 min_profitable_iters++;
3740 }
3741 }
3742 /* vector version will never be profitable. */
3743 else
3744 {
3745 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3746 warning_at (vect_location.get_location_t (), OPT_Wopenmp_simd,
3747 "vectorization did not happen for a simd loop");
3748
3749 if (dump_enabled_p ())
3750 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3751 "cost model: the vector iteration cost = %d "
3752 "divided by the scalar iteration cost = %d "
3753 "is greater or equal to the vectorization factor = %d"
3754 ".\n",
3755 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3756 *ret_min_profitable_niters = -1;
3757 *ret_min_profitable_estimate = -1;
3758 return;
3759 }
3760
3761 dump_printf (MSG_NOTE,
3762 " Calculated minimum iters for profitability: %d\n",
3763 min_profitable_iters);
3764
3765 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
3766 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
3767 /* We want the vectorized loop to execute at least once. */
3768 min_profitable_iters = assumed_vf + peel_iters_prologue;
3769
3770 if (dump_enabled_p ())
3771 dump_printf_loc (MSG_NOTE, vect_location,
3772 " Runtime profitability threshold = %d\n",
3773 min_profitable_iters);
3774
3775 *ret_min_profitable_niters = min_profitable_iters;
3776
3777 /* Calculate number of iterations required to make the vector version
3778 profitable, relative to the loop bodies only.
3779
3780 The non-vectorized variant costs SIC * niters and must win over the vector
3781 variant for the expected loop trip count. The following condition must hold true:
3782 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
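 /* Continuing the illustrative example above (SIC = 3, VIC = 7, VF = 4,
    VOC = 21, SOC = 4, no peeling): the estimate below is
    ((21 + 4) * 4) / (3 * 4 - 7) = 100 / 5 = 20, which the MAX below keeps
    since it already exceeds the runtime threshold of 14.  SOC is added
    rather than subtracted here because the comparison is against plain
    scalar code that carries no run-time guards.  */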
3783
3784 if (vec_outside_cost <= 0)
3785 min_profitable_estimate = 0;
3786 else
3787 {
3788 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3789 * assumed_vf
3790 - vec_inside_cost * peel_iters_prologue
3791 - vec_inside_cost * peel_iters_epilogue)
3792 / ((scalar_single_iter_cost * assumed_vf)
3793 - vec_inside_cost);
3794 }
3795 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3796 if (dump_enabled_p ())
3797 dump_printf_loc (MSG_NOTE, vect_location,
3798 " Static estimate profitability threshold = %d\n",
3799 min_profitable_estimate);
3800
3801 *ret_min_profitable_estimate = min_profitable_estimate;
3802 }
3803
3804 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3805 vector elements (not bits) for a vector with NELT elements. */
3806 static void
3807 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3808 vec_perm_builder *sel)
3809 {
3810 /* The encoding is a single stepped pattern. Any wrap-around is handled
3811 by vec_perm_indices. */
3812 sel->new_vector (nelt, 1, 3);
3813 for (unsigned int i = 0; i < 3; i++)
3814 sel->quick_push (i + offset);
3815 }
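 /* For example (values chosen purely for illustration), OFFSET == 2 and
    NELT == 8 encode the series {2, 3, 4}, which vec_perm_indices extends to
    the full mask {2, 3, 4, 5, 6, 7, 8, 9}; indices 8 and 9 select from the
    second permute input, which for a shift is typically a vector of zeros.  */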
3816
3817 /* Checks whether the target supports whole-vector shifts for vectors of mode
3818 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3819 it supports vec_perm_const with masks for all necessary shift amounts. */
3820 static bool
3821 have_whole_vector_shift (machine_mode mode)
3822 {
3823 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3824 return true;
3825
3826 /* Variable-length vectors should be handled via the optab. */
3827 unsigned int nelt;
3828 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
3829 return false;
3830
3831 vec_perm_builder sel;
3832 vec_perm_indices indices;
3833 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
3834 {
3835 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3836 indices.new_vector (sel, 2, nelt);
3837 if (!can_vec_perm_const_p (mode, indices, false))
3838 return false;
3839 }
3840 return true;
3841 }
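 /* For example, for an 8-element vector mode the loop above checks element
    shifts by 4, 2 and 1 -- exactly the shift amounts a log2-style reduction
    epilogue needs.  */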
3842
3843 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3844 functions. Design better to avoid maintenance issues. */
3845
3846 /* Function vect_model_reduction_cost.
3847
3848 Models cost for a reduction operation, including the vector ops
3849 generated within the strip-mine loop, the initial definition before
3850 the loop, and the epilogue code that must be generated. */
3851
3852 static void
3853 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3854 int ncopies, stmt_vector_for_cost *cost_vec)
3855 {
3856 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
3857 enum tree_code code;
3858 optab optab;
3859 tree vectype;
3860 gimple *orig_stmt;
3861 machine_mode mode;
3862 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3863 struct loop *loop = NULL;
3864
3865 if (loop_vinfo)
3866 loop = LOOP_VINFO_LOOP (loop_vinfo);
3867
3868 /* Condition reductions generate two reductions in the loop. */
3869 vect_reduction_type reduction_type
3870 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
3871 if (reduction_type == COND_REDUCTION)
3872 ncopies *= 2;
3873
3874 vectype = STMT_VINFO_VECTYPE (stmt_info);
3875 mode = TYPE_MODE (vectype);
3876 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3877
3878 if (!orig_stmt)
3879 orig_stmt = STMT_VINFO_STMT (stmt_info);
3880
3881 code = gimple_assign_rhs_code (orig_stmt);
3882
3883 if (reduction_type == EXTRACT_LAST_REDUCTION
3884 || reduction_type == FOLD_LEFT_REDUCTION)
3885 {
3886 /* No extra instructions needed in the prologue. */
3887 prologue_cost = 0;
3888
3889 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
3890 /* Count one reduction-like operation per vector. */
3891 inside_cost = record_stmt_cost (cost_vec, ncopies, vec_to_scalar,
3892 stmt_info, 0, vect_body);
3893 else
3894 {
3895 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
3896 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
3897 inside_cost = record_stmt_cost (cost_vec, nelements,
3898 vec_to_scalar, stmt_info, 0,
3899 vect_body);
3900 inside_cost += record_stmt_cost (cost_vec, nelements,
3901 scalar_stmt, stmt_info, 0,
3902 vect_body);
3903 }
3904 }
3905 else
3906 {
3907 /* Add in cost for initial definition.
3908 For cond reduction we have four vectors: initial index, step,
3909 initial result of the data reduction, initial value of the index
3910 reduction. */
3911 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
3912 prologue_cost += record_stmt_cost (cost_vec, prologue_stmts,
3913 scalar_to_vec, stmt_info, 0,
3914 vect_prologue);
3915
3916 /* Cost of reduction op inside loop. */
3917 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
3918 stmt_info, 0, vect_body);
3919 }
3920
3921 /* Determine cost of epilogue code.
3922
3923 We have a reduction operator that will reduce the vector in one statement.
3924 Also requires scalar extract. */
3925
3926 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3927 {
3928 if (reduc_fn != IFN_LAST)
3929 {
3930 if (reduction_type == COND_REDUCTION)
3931 {
3932 /* An EQ stmt and a COND_EXPR stmt. */
3933 epilogue_cost += record_stmt_cost (cost_vec, 2,
3934 vector_stmt, stmt_info, 0,
3935 vect_epilogue);
3936 /* Reduction of the max index and a reduction of the found
3937 values. */
3938 epilogue_cost += record_stmt_cost (cost_vec, 2,
3939 vec_to_scalar, stmt_info, 0,
3940 vect_epilogue);
3941 /* A broadcast of the max value. */
3942 epilogue_cost += record_stmt_cost (cost_vec, 1,
3943 scalar_to_vec, stmt_info, 0,
3944 vect_epilogue);
3945 }
3946 else
3947 {
3948 epilogue_cost += record_stmt_cost (cost_vec, 1, vector_stmt,
3949 stmt_info, 0, vect_epilogue);
3950 epilogue_cost += record_stmt_cost (cost_vec, 1,
3951 vec_to_scalar, stmt_info, 0,
3952 vect_epilogue);
3953 }
3954 }
3955 else if (reduction_type == COND_REDUCTION)
3956 {
3957 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3958 /* Extraction of scalar elements. */
3959 epilogue_cost += record_stmt_cost (cost_vec,
3960 2 * estimated_nunits,
3961 vec_to_scalar, stmt_info, 0,
3962 vect_epilogue);
3963 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
3964 epilogue_cost += record_stmt_cost (cost_vec,
3965 2 * estimated_nunits - 3,
3966 scalar_stmt, stmt_info, 0,
3967 vect_epilogue);
3968 }
3969 else if (reduction_type == EXTRACT_LAST_REDUCTION
3970 || reduction_type == FOLD_LEFT_REDUCTION)
3971 /* No extra instructions needed in the epilogue. */
3972 ;
3973 else
3974 {
3975 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3976 tree bitsize =
3977 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3978 int element_bitsize = tree_to_uhwi (bitsize);
3979 int nelements = vec_size_in_bits / element_bitsize;
3980
3981 if (code == COND_EXPR)
3982 code = MAX_EXPR;
3983
3984 optab = optab_for_tree_code (code, vectype, optab_default);
3985
3986 /* We have a whole vector shift available. */
3987 if (optab != unknown_optab
3988 && VECTOR_MODE_P (mode)
3989 && optab_handler (optab, mode) != CODE_FOR_nothing
3990 && have_whole_vector_shift (mode))
3991 {
3992 /* Final reduction via vector shifts and the reduction operator.
3993 Also requires scalar extract. */
3994 epilogue_cost += record_stmt_cost (cost_vec,
3995 exact_log2 (nelements) * 2,
3996 vector_stmt, stmt_info, 0,
3997 vect_epilogue);
3998 epilogue_cost += record_stmt_cost (cost_vec, 1,
3999 vec_to_scalar, stmt_info, 0,
4000 vect_epilogue);
4001 }
4002 else
4003 /* Use extracts and reduction op for final reduction. For N
4004 elements, we have N extracts and N-1 reduction ops. */
4005 epilogue_cost += record_stmt_cost (cost_vec,
4006 nelements + nelements - 1,
4007 vector_stmt, stmt_info, 0,
4008 vect_epilogue);
4009 }
4010 }
4011
4012 if (dump_enabled_p ())
4013 dump_printf (MSG_NOTE,
4014 "vect_model_reduction_cost: inside_cost = %d, "
4015 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4016 prologue_cost, epilogue_cost);
4017 }
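 /* One illustrative case (not exhaustive): a plain sum reduction with a
    direct internal reduction function and NCOPIES == 1 is costed above as
    one scalar_to_vec in the prologue, one vector_stmt in the body and one
    vector_stmt plus one vec_to_scalar in the epilogue.  */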
4018
4019
4020 /* Function vect_model_induction_cost.
4021
4022 Models cost for induction operations. */
4023
4024 static void
4025 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies,
4026 stmt_vector_for_cost *cost_vec)
4027 {
4028 unsigned inside_cost, prologue_cost;
4029
4030 if (PURE_SLP_STMT (stmt_info))
4031 return;
4032
4033 /* loop cost for vec_loop. */
4034 inside_cost = record_stmt_cost (cost_vec, ncopies, vector_stmt,
4035 stmt_info, 0, vect_body);
4036
4037 /* prologue cost for vec_init and vec_step. */
4038 prologue_cost = record_stmt_cost (cost_vec, 2, scalar_to_vec,
4039 stmt_info, 0, vect_prologue);
4040
4041 if (dump_enabled_p ())
4042 dump_printf_loc (MSG_NOTE, vect_location,
4043 "vect_model_induction_cost: inside_cost = %d, "
4044 "prologue_cost = %d .\n", inside_cost, prologue_cost);
4045 }
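 /* For instance, with NCOPIES == 2 this records two vector_stmt costs for
    the induction updates in the loop body plus two scalar_to_vec costs in
    the prologue for building the initial vector and the step vector.  */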
4046
4047
4048
4049 /* Function get_initial_def_for_reduction
4050
4051 Input:
4052 STMT - a stmt that performs a reduction operation in the loop.
4053 INIT_VAL - the initial value of the reduction variable
4054
4055 Output:
4056 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4057 of the reduction (used for adjusting the epilog - see below).
4058 Return a vector variable, initialized according to the operation that STMT
4059 performs. This vector will be used as the initial value of the
4060 vector of partial results.
4061
4062 Option1 (adjust in epilog): Initialize the vector as follows:
4063 add/bit or/xor: [0,0,...,0,0]
4064 mult/bit and: [1,1,...,1,1]
4065 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4066 and when necessary (e.g. add/mult case) let the caller know
4067 that it needs to adjust the result by init_val.
4068
4069 Option2: Initialize the vector as follows:
4070 add/bit or/xor: [init_val,0,0,...,0]
4071 mult/bit and: [init_val,1,1,...,1]
4072 min/max/cond_expr: [init_val,init_val,...,init_val]
4073 and no adjustments are needed.
4074
4075 For example, for the following code:
4076
4077 s = init_val;
4078 for (i=0;i<n;i++)
4079 s = s + a[i];
4080
4081 STMT is 's = s + a[i]', and the reduction variable is 's'.
4082 For a vector of 4 units, we want to return either [0,0,0,init_val],
4083 or [0,0,0,0] and let the caller know that it needs to adjust
4084 the result at the end by 'init_val'.
4085
4086 FORNOW, we use Option1 (the 'adjust in epilog' scheme) when ADJUSTMENT_DEF
4087 is not NULL, because its initialization vector is simpler (the same element
4088 in all entries), and Option2 otherwise.
4089
4090 A cost model should help decide between these two schemes. */
4091
4092 tree
4093 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4094 tree *adjustment_def)
4095 {
4096 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4097 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4098 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4099 tree scalar_type = TREE_TYPE (init_val);
4100 tree vectype = get_vectype_for_scalar_type (scalar_type);
4101 enum tree_code code = gimple_assign_rhs_code (stmt);
4102 tree def_for_init;
4103 tree init_def;
4104 REAL_VALUE_TYPE real_init_val = dconst0;
4105 int int_init_val = 0;
4106 gimple_seq stmts = NULL;
4107
4108 gcc_assert (vectype);
4109
4110 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4111 || SCALAR_FLOAT_TYPE_P (scalar_type));
4112
4113 gcc_assert (nested_in_vect_loop_p (loop, stmt)
4114 || loop == (gimple_bb (stmt))->loop_father);
4115
4116 vect_reduction_type reduction_type
4117 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4118
4119 switch (code)
4120 {
4121 case WIDEN_SUM_EXPR:
4122 case DOT_PROD_EXPR:
4123 case SAD_EXPR:
4124 case PLUS_EXPR:
4125 case MINUS_EXPR:
4126 case BIT_IOR_EXPR:
4127 case BIT_XOR_EXPR:
4128 case MULT_EXPR:
4129 case BIT_AND_EXPR:
4130 {
4131 /* ADJUSTMENT_DEF is NULL when called from
4132 vect_create_epilog_for_reduction to vectorize double reduction. */
4133 if (adjustment_def)
4134 *adjustment_def = init_val;
4135
4136 if (code == MULT_EXPR)
4137 {
4138 real_init_val = dconst1;
4139 int_init_val = 1;
4140 }
4141
4142 if (code == BIT_AND_EXPR)
4143 int_init_val = -1;
4144
4145 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4146 def_for_init = build_real (scalar_type, real_init_val);
4147 else
4148 def_for_init = build_int_cst (scalar_type, int_init_val);
4149
4150 if (adjustment_def)
4151 /* Option1: the first element is '0' or '1' as well. */
4152 init_def = gimple_build_vector_from_val (&stmts, vectype,
4153 def_for_init);
4154 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4155 {
4156 /* Option2 (variable length): the first element is INIT_VAL. */
4157 init_def = gimple_build_vector_from_val (&stmts, vectype,
4158 def_for_init);
4159 init_def = gimple_build (&stmts, CFN_VEC_SHL_INSERT,
4160 vectype, init_def, init_val);
4161 }
4162 else
4163 {
4164 /* Option2: the first element is INIT_VAL. */
4165 tree_vector_builder elts (vectype, 1, 2);
4166 elts.quick_push (init_val);
4167 elts.quick_push (def_for_init);
4168 init_def = gimple_build_vector (&stmts, &elts);
4169 }
4170 }
4171 break;
4172
4173 case MIN_EXPR:
4174 case MAX_EXPR:
4175 case COND_EXPR:
4176 {
4177 if (adjustment_def)
4178 {
4179 *adjustment_def = NULL_TREE;
4180 if (reduction_type != COND_REDUCTION
4181 && reduction_type != EXTRACT_LAST_REDUCTION)
4182 {
4183 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4184 break;
4185 }
4186 }
4187 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4188 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4189 }
4190 break;
4191
4192 default:
4193 gcc_unreachable ();
4194 }
4195
4196 if (stmts)
4197 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4198 return init_def;
4199 }
4200
4201 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4202 NUMBER_OF_VECTORS is the number of vector defs to create.
4203 If NEUTRAL_OP is nonnull, introducing extra elements of that
4204 value will not change the result. */
4205
4206 static void
4207 get_initial_defs_for_reduction (slp_tree slp_node,
4208 vec<tree> *vec_oprnds,
4209 unsigned int number_of_vectors,
4210 bool reduc_chain, tree neutral_op)
4211 {
4212 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4213 gimple *stmt = stmts[0];
4214 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4215 unsigned HOST_WIDE_INT nunits;
4216 unsigned j, number_of_places_left_in_vector;
4217 tree vector_type;
4218 tree vop;
4219 int group_size = stmts.length ();
4220 unsigned int vec_num, i;
4221 unsigned number_of_copies = 1;
4222 vec<tree> voprnds;
4223 voprnds.create (number_of_vectors);
4224 struct loop *loop;
4225 auto_vec<tree, 16> permute_results;
4226
4227 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4228
4229 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4230
4231 loop = (gimple_bb (stmt))->loop_father;
4232 gcc_assert (loop);
4233 edge pe = loop_preheader_edge (loop);
4234
4235 gcc_assert (!reduc_chain || neutral_op);
4236
4237 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4238 created vectors. It is greater than 1 if unrolling is performed.
4239
4240 For example, we have two scalar operands, s1 and s2 (e.g., group of
4241 strided accesses of size two), while NUNITS is four (i.e., four scalars
4242 of this type can be packed in a vector). The output vector will contain
4243 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4244 will be 2).
4245
4246 If REDUC_GROUP_SIZE > NUNITS, the scalars will be split into several
4247 vectors containing the operands.
4248
4249 For example, NUNITS is four as before, and the group size is 8
4250 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4251 {s5, s6, s7, s8}. */
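 /* For illustration (a worked example, not an exhaustive description): for
    a two-statement SLP reduction (a, b) summed with neutral value 0,
    NUNITS == 4 and a single vector to create, NUMBER_OF_COPIES is 2 and
    the loop below builds {a0, b0, 0, 0}: only the last copy takes the real
    initial values from the preheader PHI arguments, all earlier copies use
    NEUTRAL_OP.  */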
4252
4253 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4254 nunits = group_size;
4255
4256 number_of_copies = nunits * number_of_vectors / group_size;
4257
4258 number_of_places_left_in_vector = nunits;
4259 bool constant_p = true;
4260 tree_vector_builder elts (vector_type, nunits, 1);
4261 elts.quick_grow (nunits);
4262 for (j = 0; j < number_of_copies; j++)
4263 {
4264 for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4265 {
4266 tree op;
4267 /* Get the def before the loop. In reduction chain we have only
4268 one initial value. */
4269 if ((j != (number_of_copies - 1)
4270 || (reduc_chain && i != 0))
4271 && neutral_op)
4272 op = neutral_op;
4273 else
4274 op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4275
4276 /* Create 'vect_ = {op0,op1,...,opn}'. */
4277 number_of_places_left_in_vector--;
4278 elts[number_of_places_left_in_vector] = op;
4279 if (!CONSTANT_CLASS_P (op))
4280 constant_p = false;
4281
4282 if (number_of_places_left_in_vector == 0)
4283 {
4284 gimple_seq ctor_seq = NULL;
4285 tree init;
4286 if (constant_p && !neutral_op
4287 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4288 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4289 /* Build the vector directly from ELTS. */
4290 init = gimple_build_vector (&ctor_seq, &elts);
4291 else if (neutral_op)
4292 {
4293 /* Build a vector of the neutral value and shift the
4294 other elements into place. */
4295 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4296 neutral_op);
4297 int k = nunits;
4298 while (k > 0 && elts[k - 1] == neutral_op)
4299 k -= 1;
4300 while (k > 0)
4301 {
4302 k -= 1;
4303 init = gimple_build (&ctor_seq, CFN_VEC_SHL_INSERT,
4304 vector_type, init, elts[k]);
4305 }
4306 }
4307 else
4308 {
4309 /* First time round, duplicate ELTS to fill the
4310 required number of vectors, then cherry pick the
4311 appropriate result for each iteration. */
4312 if (vec_oprnds->is_empty ())
4313 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4314 number_of_vectors,
4315 permute_results);
4316 init = permute_results[number_of_vectors - j - 1];
4317 }
4318 if (ctor_seq != NULL)
4319 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4320 voprnds.quick_push (init);
4321
4322 number_of_places_left_in_vector = nunits;
4323 elts.new_vector (vector_type, nunits, 1);
4324 elts.quick_grow (nunits);
4325 constant_p = true;
4326 }
4327 }
4328 }
4329
4330 /* Since the vectors are created in the reverse order, we should invert
4331 them. */
4332 vec_num = voprnds.length ();
4333 for (j = vec_num; j != 0; j--)
4334 {
4335 vop = voprnds[j - 1];
4336 vec_oprnds->quick_push (vop);
4337 }
4338
4339 voprnds.release ();
4340
4341 /* In case that VF is greater than the unrolling factor needed for the SLP
4342 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4343 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4344 to replicate the vectors. */
4345 tree neutral_vec = NULL;
4346 while (number_of_vectors > vec_oprnds->length ())
4347 {
4348 if (neutral_op)
4349 {
4350 if (!neutral_vec)
4351 {
4352 gimple_seq ctor_seq = NULL;
4353 neutral_vec = gimple_build_vector_from_val
4354 (&ctor_seq, vector_type, neutral_op);
4355 if (ctor_seq != NULL)
4356 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4357 }
4358 vec_oprnds->quick_push (neutral_vec);
4359 }
4360 else
4361 {
4362 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4363 vec_oprnds->quick_push (vop);
4364 }
4365 }
4366 }
4367
4368
4369 /* Function vect_create_epilog_for_reduction
4370
4371 Create code at the loop-epilog to finalize the result of a reduction
4372 computation.
4373
4374 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4375 reduction statements.
4376 STMT is the scalar reduction stmt that is being vectorized.
4377 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4378 number of elements that we can fit in a vectype (nunits). In this case
4379 we have to generate more than one vector stmt - i.e - we need to "unroll"
4380 the vector stmt by a factor VF/nunits. For more details see documentation
4381 in vectorizable_operation.
4382 REDUC_FN is the internal function for the epilog reduction.
4383 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4384 computation.
4385 REDUC_INDEX is the index of the operand in the right hand side of the
4386 statement that is defined by REDUCTION_PHI.
4387 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4388 SLP_NODE is an SLP node containing a group of reduction statements. The
4389 first one in this group is STMT.
4390 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4391 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4392 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4393 any value of the IV in the loop.
4394 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4395 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4396 null if this is not an SLP reduction
4397
4398 This function:
4399 1. Creates the reduction def-use cycles: sets the arguments for
4400 REDUCTION_PHIS:
4401 The loop-entry argument is the vectorized initial-value of the reduction.
4402 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4403 sums.
4404 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4405 by calling the function specified by REDUC_FN if available, or by
4406 other means (whole-vector shifts or a scalar loop).
4407 The function also creates a new phi node at the loop exit to preserve
4408 loop-closed form, as illustrated below.
4409
4410 The flow at the entry to this function:
4411
4412 loop:
4413 vec_def = phi <null, null> # REDUCTION_PHI
4414 VECT_DEF = vector_stmt # vectorized form of STMT
4415 s_loop = scalar_stmt # (scalar) STMT
4416 loop_exit:
4417 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4418 use <s_out0>
4419 use <s_out0>
4420
4421 The above is transformed by this function into:
4422
4423 loop:
4424 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4425 VECT_DEF = vector_stmt # vectorized form of STMT
4426 s_loop = scalar_stmt # (scalar) STMT
4427 loop_exit:
4428 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4429 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4430 v_out2 = reduce <v_out1>
4431 s_out3 = extract_field <v_out2, 0>
4432 s_out4 = adjust_result <s_out3>
4433 use <s_out4>
4434 use <s_out4>
4435 */
4436
4437 static void
4438 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4439 gimple *reduc_def_stmt,
4440 int ncopies, internal_fn reduc_fn,
4441 vec<gimple *> reduction_phis,
4442 bool double_reduc,
4443 slp_tree slp_node,
4444 slp_instance slp_node_instance,
4445 tree induc_val, enum tree_code induc_code,
4446 tree neutral_op)
4447 {
4448 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4449 stmt_vec_info prev_phi_info;
4450 tree vectype;
4451 machine_mode mode;
4452 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4453 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4454 basic_block exit_bb;
4455 tree scalar_dest;
4456 tree scalar_type;
4457 gimple *new_phi = NULL, *phi;
4458 gimple_stmt_iterator exit_gsi;
4459 tree vec_dest;
4460 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4461 gimple *epilog_stmt = NULL;
4462 enum tree_code code = gimple_assign_rhs_code (stmt);
4463 gimple *exit_phi;
4464 tree bitsize;
4465 tree adjustment_def = NULL;
4466 tree vec_initial_def = NULL;
4467 tree expr, def, initial_def = NULL;
4468 tree orig_name, scalar_result;
4469 imm_use_iterator imm_iter, phi_imm_iter;
4470 use_operand_p use_p, phi_use_p;
4471 gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4472 bool nested_in_vect_loop = false;
4473 auto_vec<gimple *> new_phis;
4474 auto_vec<gimple *> inner_phis;
4475 enum vect_def_type dt = vect_unknown_def_type;
4476 int j, i;
4477 auto_vec<tree> scalar_results;
4478 unsigned int group_size = 1, k, ratio;
4479 auto_vec<tree> vec_initial_defs;
4480 auto_vec<gimple *> phis;
4481 bool slp_reduc = false;
4482 bool direct_slp_reduc;
4483 tree new_phi_result;
4484 gimple *inner_phi = NULL;
4485 tree induction_index = NULL_TREE;
4486
4487 if (slp_node)
4488 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4489
4490 if (nested_in_vect_loop_p (loop, stmt))
4491 {
4492 outer_loop = loop;
4493 loop = loop->inner;
4494 nested_in_vect_loop = true;
4495 gcc_assert (!slp_node);
4496 }
4497
4498 vectype = STMT_VINFO_VECTYPE (stmt_info);
4499 gcc_assert (vectype);
4500 mode = TYPE_MODE (vectype);
4501
4502 /* 1. Create the reduction def-use cycle:
4503 Set the arguments of REDUCTION_PHIS, i.e., transform
4504
4505 loop:
4506 vec_def = phi <null, null> # REDUCTION_PHI
4507 VECT_DEF = vector_stmt # vectorized form of STMT
4508 ...
4509
4510 into:
4511
4512 loop:
4513 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4514 VECT_DEF = vector_stmt # vectorized form of STMT
4515 ...
4516
4517 (in case of SLP, do it for all the phis). */
4518
4519 /* Get the loop-entry arguments. */
4520 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4521 if (slp_node)
4522 {
4523 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4524 vec_initial_defs.reserve (vec_num);
4525 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4526 &vec_initial_defs, vec_num,
4527 REDUC_GROUP_FIRST_ELEMENT (stmt_info),
4528 neutral_op);
4529 }
4530 else
4531 {
4532 /* Get at the scalar def before the loop, that defines the initial value
4533 of the reduction variable. */
4534 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4535 loop_preheader_edge (loop));
4536 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4537 and we can't use zero for induc_val, use initial_def. Similarly
4538 for REDUC_MIN and initial_def larger than the base. */
4539 if (TREE_CODE (initial_def) == INTEGER_CST
4540 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4541 == INTEGER_INDUC_COND_REDUCTION)
4542 && !integer_zerop (induc_val)
4543 && ((induc_code == MAX_EXPR
4544 && tree_int_cst_lt (initial_def, induc_val))
4545 || (induc_code == MIN_EXPR
4546 && tree_int_cst_lt (induc_val, initial_def))))
4547 induc_val = initial_def;
4548
4549 if (double_reduc)
4550 /* In case of double reduction we only create a vector variable
4551 to be put in the reduction phi node. The actual statement
4552 creation is done later in this function. */
4553 vec_initial_def = vect_create_destination_var (initial_def, vectype);
4554 else if (nested_in_vect_loop)
4555 {
4556 /* Do not use an adjustment def as that case is not supported
4557 correctly if ncopies is not one. */
4558 vect_is_simple_use (initial_def, loop_vinfo, &initial_def_dt);
4559 vec_initial_def = vect_get_vec_def_for_operand (initial_def, stmt);
4560 }
4561 else
4562 vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4563 &adjustment_def);
4564 vec_initial_defs.create (1);
4565 vec_initial_defs.quick_push (vec_initial_def);
4566 }
4567
4568 /* Set phi nodes arguments. */
4569 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4570 {
4571 tree vec_init_def = vec_initial_defs[i];
4572 tree def = vect_defs[i];
4573 for (j = 0; j < ncopies; j++)
4574 {
4575 if (j != 0)
4576 {
4577 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4578 if (nested_in_vect_loop)
4579 vec_init_def
4580 = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4581 vec_init_def);
4582 }
4583
4584 /* Set the loop-entry arg of the reduction-phi. */
4585
4586 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4587 == INTEGER_INDUC_COND_REDUCTION)
4588 {
4589 /* Initialise the reduction phi to zero. This prevents non-zero
4590 initial values from interfering with the reduction op. */
4591 gcc_assert (ncopies == 1);
4592 gcc_assert (i == 0);
4593
4594 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4595 tree induc_val_vec
4596 = build_vector_from_val (vec_init_def_type, induc_val);
4597
4598 add_phi_arg (as_a <gphi *> (phi), induc_val_vec,
4599 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4600 }
4601 else
4602 add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4603 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4604
4605 /* Set the loop-latch arg for the reduction-phi. */
4606 if (j > 0)
4607 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4608
4609 add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4610 UNKNOWN_LOCATION);
4611
4612 if (dump_enabled_p ())
4613 {
4614 dump_printf_loc (MSG_NOTE, vect_location,
4615 "transform reduction: created def-use cycle: ");
4616 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4617 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4618 }
4619 }
4620 }
4621
4622 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4623 which is updated with the current index of the loop for every match of
4624 the original loop's cond_expr (VEC_STMT). This results in a vector
4625 containing the last time the condition passed for that vector lane.
4626 The first match will be a 1 to allow 0 to be used for non-matching
4627 indexes. If there are no matches at all then the vector will be all
4628 zeroes. */
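 /* As an illustration with assumed values: for a 4-lane vector running for
    two vector iterations, the index IV holds {1,2,3,4} and then {5,6,7,8}.
    A lane whose last match was in the first iteration keeps its value from
    {1,2,3,4}, a lane that matched in the second iteration holds a value
    from {5,6,7,8}, and a lane that never matched stays 0, e.g. {1,0,7,8}.  */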
4629 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4630 {
4631 tree indx_before_incr, indx_after_incr;
4632 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4633
4634 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4635 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4636
4637 int scalar_precision
4638 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4639 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4640 tree cr_index_vector_type = build_vector_type
4641 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4642
4643 /* First we create a simple vector induction variable which starts
4644 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4645 vector size (STEP). */
4646
4647 /* Create a {1,2,3,...} vector. */
4648 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4649
4650 /* Create a vector of the step value. */
4651 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4652 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4653
4654 /* Create an induction variable. */
4655 gimple_stmt_iterator incr_gsi;
4656 bool insert_after;
4657 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4658 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4659 insert_after, &indx_before_incr, &indx_after_incr);
4660
4661 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4662 filled with zeros (VEC_ZERO). */
4663
4664 /* Create a vector of 0s. */
4665 tree zero = build_zero_cst (cr_index_scalar_type);
4666 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4667
4668 /* Create a vector phi node. */
4669 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4670 new_phi = create_phi_node (new_phi_tree, loop->header);
4671 loop_vinfo->add_stmt (new_phi);
4672 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4673 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4674
4675 /* Now take the condition from the loop's original cond_expr
4676 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4677 every match uses values from the induction variable
4678 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4679 (NEW_PHI_TREE).
4680 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4681 the new cond_expr (INDEX_COND_EXPR). */
4682
4683 /* Duplicate the condition from vec_stmt. */
4684 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4685
4686 /* Create a conditional, where the condition is taken from vec_stmt
4687 (CCOMPARE), the 'then' value is the induction index (INDEX_BEFORE_INCR)
4688 and the 'else' value is the phi (NEW_PHI_TREE). */
4689 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4690 ccompare, indx_before_incr,
4691 new_phi_tree);
4692 induction_index = make_ssa_name (cr_index_vector_type);
4693 gimple *index_condition = gimple_build_assign (induction_index,
4694 index_cond_expr);
4695 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4696 stmt_vec_info index_vec_info = loop_vinfo->add_stmt (index_condition);
4697 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4698
4699 /* Update the phi with the vec cond. */
4700 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4701 loop_latch_edge (loop), UNKNOWN_LOCATION);
4702 }
4703
4704 /* 2. Create epilog code.
4705 The reduction epilog code operates across the elements of the vector
4706 of partial results computed by the vectorized loop.
4707 The reduction epilog code consists of:
4708
4709 step 1: compute the scalar result in a vector (v_out2)
4710 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4711 step 3: adjust the scalar result (s_out3) if needed.
4712
4713 Step 1 can be accomplished using one the following three schemes:
4714 (scheme 1) using reduc_fn, if available.
4715 (scheme 2) using whole-vector shifts, if available.
4716 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4717 combined.
4718
4719 The overall epilog code looks like this:
4720
4721 s_out0 = phi <s_loop> # original EXIT_PHI
4722 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4723 v_out2 = reduce <v_out1> # step 1
4724 s_out3 = extract_field <v_out2, 0> # step 2
4725 s_out4 = adjust_result <s_out3> # step 3
4726
4727 (step 3 is optional, and steps 1 and 2 may be combined).
4728 Lastly, the uses of s_out0 are replaced by s_out4. */
4729
4730
4731 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4732 v_out1 = phi <VECT_DEF>
4733 Store them in NEW_PHIS. */
4734
4735 exit_bb = single_exit (loop)->dest;
4736 prev_phi_info = NULL;
4737 new_phis.create (vect_defs.length ());
4738 FOR_EACH_VEC_ELT (vect_defs, i, def)
4739 {
4740 for (j = 0; j < ncopies; j++)
4741 {
4742 tree new_def = copy_ssa_name (def);
4743 phi = create_phi_node (new_def, exit_bb);
4744 stmt_vec_info phi_info = loop_vinfo->add_stmt (phi);
4745 if (j == 0)
4746 new_phis.quick_push (phi);
4747 else
4748 {
4749 def = vect_get_vec_def_for_stmt_copy (dt, def);
4750 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4751 }
4752
4753 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4754 prev_phi_info = phi_info;
4755 }
4756 }
4757
4758 /* The epilogue is created for the outer-loop, i.e., for the loop being
4759 vectorized. Create exit phis for the outer loop. */
4760 if (double_reduc)
4761 {
4762 loop = outer_loop;
4763 exit_bb = single_exit (loop)->dest;
4764 inner_phis.create (vect_defs.length ());
4765 FOR_EACH_VEC_ELT (new_phis, i, phi)
4766 {
4767 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4768 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4769 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4770 PHI_RESULT (phi));
4771 prev_phi_info = loop_vinfo->add_stmt (outer_phi);
4772 inner_phis.quick_push (phi);
4773 new_phis[i] = outer_phi;
4774 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4775 {
4776 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4777 new_result = copy_ssa_name (PHI_RESULT (phi));
4778 outer_phi = create_phi_node (new_result, exit_bb);
4779 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4780 PHI_RESULT (phi));
4781 stmt_vec_info outer_phi_info = loop_vinfo->add_stmt (outer_phi);
4782 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4783 prev_phi_info = outer_phi_info;
4784 }
4785 }
4786 }
4787
4788 exit_gsi = gsi_after_labels (exit_bb);
4789
4790 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4791 (i.e. when reduc_fn is not available) and in the final adjustment
4792 code (if needed). Also get the original scalar reduction variable as
4793 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4794 represents a reduction pattern), the tree-code and scalar-def are
4795 taken from the original stmt that the pattern-stmt (STMT) replaces.
4796 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4797 are taken from STMT. */
4798
4799 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4800 if (!orig_stmt)
4801 {
4802 /* Regular reduction */
4803 orig_stmt = stmt;
4804 }
4805 else
4806 {
4807 /* Reduction pattern */
4808 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4809 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4810 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4811 }
4812
4813 code = gimple_assign_rhs_code (orig_stmt);
4814 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4815 partial results are added and not subtracted. */
4816 if (code == MINUS_EXPR)
4817 code = PLUS_EXPR;
4818
4819 scalar_dest = gimple_assign_lhs (orig_stmt);
4820 scalar_type = TREE_TYPE (scalar_dest);
4821 scalar_results.create (group_size);
4822 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4823 bitsize = TYPE_SIZE (scalar_type);
4824
4825 /* In case this is a reduction in an inner-loop while vectorizing an outer
4826 loop - we don't need to extract a single scalar result at the end of the
4827 inner-loop (unless it is double reduction, i.e., the use of reduction is
4828 outside the outer-loop). The final vector of partial results will be used
4829 in the vectorized outer-loop, or reduced to a scalar result at the end of
4830 the outer-loop. */
4831 if (nested_in_vect_loop && !double_reduc)
4832 goto vect_finalize_reduction;
4833
4834 /* SLP reduction without reduction chain, e.g.,
4835 # a1 = phi <a2, a0>
4836 # b1 = phi <b2, b0>
4837 a2 = operation (a1)
4838 b2 = operation (b1) */
4839 slp_reduc = (slp_node && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4840
4841 /* True if we should implement SLP_REDUC using native reduction operations
4842 instead of scalar operations. */
4843 direct_slp_reduc = (reduc_fn != IFN_LAST
4844 && slp_reduc
4845 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
4846
4847 /* In case of reduction chain, e.g.,
4848 # a1 = phi <a3, a0>
4849 a2 = operation (a1)
4850 a3 = operation (a2),
4851
4852 we may end up with more than one vector result. Here we reduce them to
4853 one vector. */
4854 if (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) || direct_slp_reduc)
4855 {
4856 tree first_vect = PHI_RESULT (new_phis[0]);
4857 gassign *new_vec_stmt = NULL;
4858 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4859 for (k = 1; k < new_phis.length (); k++)
4860 {
4861 gimple *next_phi = new_phis[k];
4862 tree second_vect = PHI_RESULT (next_phi);
4863 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4864 new_vec_stmt = gimple_build_assign (tem, code,
4865 first_vect, second_vect);
4866 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4867 first_vect = tem;
4868 }
4869
4870 new_phi_result = first_vect;
4871 if (new_vec_stmt)
4872 {
4873 new_phis.truncate (0);
4874 new_phis.safe_push (new_vec_stmt);
4875 }
4876 }
4877 /* Likewise if we couldn't use a single defuse cycle. */
4878 else if (ncopies > 1)
4879 {
4880 gcc_assert (new_phis.length () == 1);
4881 tree first_vect = PHI_RESULT (new_phis[0]);
4882 gassign *new_vec_stmt = NULL;
4883 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4884 gimple *next_phi = new_phis[0];
4885 for (int k = 1; k < ncopies; ++k)
4886 {
4887 next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
4888 tree second_vect = PHI_RESULT (next_phi);
4889 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4890 new_vec_stmt = gimple_build_assign (tem, code,
4891 first_vect, second_vect);
4892 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4893 first_vect = tem;
4894 }
4895 new_phi_result = first_vect;
4896 new_phis.truncate (0);
4897 new_phis.safe_push (new_vec_stmt);
4898 }
4899 else
4900 new_phi_result = PHI_RESULT (new_phis[0]);
4901
4902 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4903 && reduc_fn != IFN_LAST)
4904 {
4905 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4906 various data values where the condition matched and another vector
4907 (INDUCTION_INDEX) containing all the indexes of those matches. We
4908 need to extract the last matching index (which will be the index with
4909 highest value) and use this to index into the data vector.
4910 For the case where there were no matches, the data vector will contain
4911 all default values and the index vector will be all zeros. */
4912
4913 /* Get various versions of the type of the vector of indexes. */
4914 tree index_vec_type = TREE_TYPE (induction_index);
4915 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4916 tree index_scalar_type = TREE_TYPE (index_vec_type);
4917 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4918 (index_vec_type);
4919
4920 /* Get an unsigned integer version of the type of the data vector. */
4921 int scalar_precision
4922 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4923 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4924 tree vectype_unsigned = build_vector_type
4925 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4926
4927 /* First we need to create a vector (ZERO_VEC) of zeros and another
4928 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4929 can create using a MAX reduction and then expanding.
4930 In the case where the loop never made any matches, the max index will
4931 be zero. */
4932
4933 /* Vector of {0, 0, 0,...}. */
4934 tree zero_vec = make_ssa_name (vectype);
4935 tree zero_vec_rhs = build_zero_cst (vectype);
4936 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4937 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4938
4939 /* Find maximum value from the vector of found indexes. */
4940 tree max_index = make_ssa_name (index_scalar_type);
4941 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4942 1, induction_index);
4943 gimple_call_set_lhs (max_index_stmt, max_index);
4944 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4945
4946 /* Vector of {max_index, max_index, max_index,...}. */
4947 tree max_index_vec = make_ssa_name (index_vec_type);
4948 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4949 max_index);
4950 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4951 max_index_vec_rhs);
4952 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4953
4954 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4955 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4956 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4957 otherwise. Only one value should match, resulting in a vector
4958 (VEC_COND) with one data value and the rest zeros.
4959 In the case where the loop never made any matches, every index will
4960 match, resulting in a vector with all data values (which will all be
4961 the default value). */
4962
4963 /* Compare the max index vector to the vector of found indexes to find
4964 the position of the max value. */
4965 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4966 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4967 induction_index,
4968 max_index_vec);
4969 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4970
4971 /* Use the compare to choose either values from the data vector or
4972 zero. */
4973 tree vec_cond = make_ssa_name (vectype);
4974 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4975 vec_compare, new_phi_result,
4976 zero_vec);
4977 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4978
4979 /* Finally we need to extract the data value from the vector (VEC_COND)
4980 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
4981 reduction, but because this doesn't exist, we can use a MAX reduction
4982 instead. The data value might be signed or a float so we need to cast
4983 it first.
4984 In the case where the loop never made any matches, the data values are
4985 all identical, and so will reduce down correctly. */
4986
4987 /* Make the matched data values unsigned. */
4988 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4989 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4990 vec_cond);
4991 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4992 VIEW_CONVERT_EXPR,
4993 vec_cond_cast_rhs);
4994 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4995
4996 /* Reduce down to a scalar value. */
4997 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4998 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4999 1, vec_cond_cast);
5000 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5001 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5002
5003 /* Convert the reduced value back to the result type and set as the
5004 result. */
5005 gimple_seq stmts = NULL;
5006 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5007 data_reduc);
5008 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5009 scalar_results.safe_push (new_temp);
5010 }
5011 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5012 && reduc_fn == IFN_LAST)
5013 {
5014 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5015 idx = 0;
5016 idx_val = induction_index[0];
5017 val = data_reduc[0];
5018 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5019 if (induction_index[i] > idx_val)
5020 val = data_reduc[i], idx_val = induction_index[i];
5021 return val; */
5022
5023 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5024 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5025 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5026 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5027 /* Enforced by vectorizable_reduction, which ensures we have target
5028 support before allowing a conditional reduction on variable-length
5029 vectors. */
5030 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5031 tree idx_val = NULL_TREE, val = NULL_TREE;
5032 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5033 {
5034 tree old_idx_val = idx_val;
5035 tree old_val = val;
5036 idx_val = make_ssa_name (idx_eltype);
5037 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5038 build3 (BIT_FIELD_REF, idx_eltype,
5039 induction_index,
5040 bitsize_int (el_size),
5041 bitsize_int (off)));
5042 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5043 val = make_ssa_name (data_eltype);
5044 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5045 build3 (BIT_FIELD_REF,
5046 data_eltype,
5047 new_phi_result,
5048 bitsize_int (el_size),
5049 bitsize_int (off)));
5050 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5051 if (off != 0)
5052 {
5053 tree new_idx_val = idx_val;
5054 tree new_val = val;
5055 if (off != v_size - el_size)
5056 {
5057 new_idx_val = make_ssa_name (idx_eltype);
5058 epilog_stmt = gimple_build_assign (new_idx_val,
5059 MAX_EXPR, idx_val,
5060 old_idx_val);
5061 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5062 }
5063 new_val = make_ssa_name (data_eltype);
5064 epilog_stmt = gimple_build_assign (new_val,
5065 COND_EXPR,
5066 build2 (GT_EXPR,
5067 boolean_type_node,
5068 idx_val,
5069 old_idx_val),
5070 val, old_val);
5071 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5072 idx_val = new_idx_val;
5073 val = new_val;
5074 }
5075 }
5076 /* Convert the reduced value back to the result type and set as the
5077 result. */
5078 gimple_seq stmts = NULL;
5079 val = gimple_convert (&stmts, scalar_type, val);
5080 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5081 scalar_results.safe_push (val);
5082 }
5083
5084 /* 2.3 Create the reduction code, using one of the three schemes described
5085 above. In SLP we simply need to extract all the elements from the
5086 vector (without reducing them), so we use scalar shifts. */
5087 else if (reduc_fn != IFN_LAST && !slp_reduc)
5088 {
5089 tree tmp;
5090 tree vec_elem_type;
5091
5092 /* Case 1: Create:
5093 v_out2 = reduc_expr <v_out1> */
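/* For example, for a plus reduction on a target with a direct
   reduction instruction this is roughly:
     v_out2 = .REDUC_PLUS (v_out1);
   followed by a conversion to the scalar type if needed. */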
5094
5095 if (dump_enabled_p ())
5096 dump_printf_loc (MSG_NOTE, vect_location,
5097 "Reduce using direct vector reduction.\n");
5098
5099 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5100 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5101 {
5102 tree tmp_dest
5103 = vect_create_destination_var (scalar_dest, vec_elem_type);
5104 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5105 new_phi_result);
5106 gimple_set_lhs (epilog_stmt, tmp_dest);
5107 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5108 gimple_set_lhs (epilog_stmt, new_temp);
5109 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5110
5111 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5112 new_temp);
5113 }
5114 else
5115 {
5116 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5117 new_phi_result);
5118 gimple_set_lhs (epilog_stmt, new_scalar_dest);
5119 }
5120
5121 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5122 gimple_set_lhs (epilog_stmt, new_temp);
5123 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5124
5125 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5126 == INTEGER_INDUC_COND_REDUCTION)
5127 && !operand_equal_p (initial_def, induc_val, 0))
5128 {
5129 /* Earlier we set the initial value to be a vector of induc_val
5130 values. Check the result and if it is induc_val then replace
5131 it with the original initial value, unless induc_val is
5132 the same as initial_def already. */
5133 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5134 induc_val);
5135
5136 tmp = make_ssa_name (new_scalar_dest);
5137 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5138 initial_def, new_temp);
5139 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5140 new_temp = tmp;
5141 }
5142
5143 scalar_results.safe_push (new_temp);
5144 }
5145 else if (direct_slp_reduc)
5146 {
5147 /* Here we create one vector for each of the REDUC_GROUP_SIZE results,
5148 with the elements for other SLP statements replaced with the
5149 neutral value. We can then do a normal reduction on each vector. */
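/* Rough sketch with placeholder values: for REDUC_GROUP_SIZE == 2 and a
   single vector { a0, b0, a1, b1 }, the loop below forms
     { a0, neutral, a1, neutral } and { neutral, b0, neutral, b1 }
   and reduces each of them with a full-vector reduction, using the
   index and mask vectors built below to do the selection. */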
5150
5151 /* Enforced by vectorizable_reduction. */
5152 gcc_assert (new_phis.length () == 1);
5153 gcc_assert (pow2p_hwi (group_size));
5154
5155 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5156 vec<gimple *> orig_phis = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5157 gimple_seq seq = NULL;
5158
5159 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5160 and the same element size as VECTYPE. */
5161 tree index = build_index_vector (vectype, 0, 1);
5162 tree index_type = TREE_TYPE (index);
5163 tree index_elt_type = TREE_TYPE (index_type);
5164 tree mask_type = build_same_sized_truth_vector_type (index_type);
5165
5166 /* Create a vector that, for each element, identifies which of
5167 the REDUC_GROUP_SIZE results should use it. */
5168 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5169 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5170 build_vector_from_val (index_type, index_mask));
5171
5172 /* Get a neutral vector value. This is simply a splat of the neutral
5173 scalar value if we have one, otherwise the initial scalar value
5174 is itself a neutral value. */
5175 tree vector_identity = NULL_TREE;
5176 if (neutral_op)
5177 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5178 neutral_op);
5179 for (unsigned int i = 0; i < group_size; ++i)
5180 {
5181 /* If there's no universal neutral value, we can use the
5182 initial scalar value from the original PHI. This is used
5183 for MIN and MAX reduction, for example. */
5184 if (!neutral_op)
5185 {
5186 tree scalar_value
5187 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i],
5188 loop_preheader_edge (loop));
5189 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5190 scalar_value);
5191 }
5192
5193 /* Calculate the equivalent of:
5194
5195 sel[j] = (index[j] == i);
5196
5197 which selects the elements of NEW_PHI_RESULT that should
5198 be included in the result. */
5199 tree compare_val = build_int_cst (index_elt_type, i);
5200 compare_val = build_vector_from_val (index_type, compare_val);
5201 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5202 index, compare_val);
5203
5204 /* Calculate the equivalent of:
5205
5206 vec = sel ? new_phi_result : vector_identity;
5207
5208 VEC is now suitable for a full vector reduction. */
5209 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5210 sel, new_phi_result, vector_identity);
5211
5212 /* Do the reduction and convert it to the appropriate type. */
5213 tree scalar = gimple_build (&seq, as_combined_fn (reduc_fn),
5214 TREE_TYPE (vectype), vec);
5215 scalar = gimple_convert (&seq, scalar_type, scalar);
5216 scalar_results.safe_push (scalar);
5217 }
5218 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5219 }
5220 else
5221 {
5222 bool reduce_with_shift;
5223 tree vec_temp;
5224
5225 /* COND reductions all do the final reduction with MAX_EXPR
5226 or MIN_EXPR. */
5227 if (code == COND_EXPR)
5228 {
5229 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5230 == INTEGER_INDUC_COND_REDUCTION)
5231 code = induc_code;
5232 else
5233 code = MAX_EXPR;
5234 }
5235
5236 /* See if the target wants to do the final (shift) reduction
5237 in a vector mode of smaller size and first reduce upper/lower
5238 halves against each other. */
5239 enum machine_mode mode1 = mode;
5240 tree vectype1 = vectype;
5241 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5242 unsigned sz1 = sz;
5243 if (!slp_reduc
5244 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5245 sz1 = GET_MODE_SIZE (mode1).to_constant ();
5246
5247 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5248 reduce_with_shift = have_whole_vector_shift (mode1);
5249 if (!VECTOR_MODE_P (mode1))
5250 reduce_with_shift = false;
5251 else
5252 {
5253 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5254 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5255 reduce_with_shift = false;
5256 }
5257
5258 /* First reduce the vector to the vector size we should do the shift
5259 reduction on, by combining upper and lower halves. */
5260 new_temp = new_phi_result;
5261 while (sz > sz1)
5262 {
5263 gcc_assert (!slp_reduc);
5264 sz /= 2;
5265 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5266
5267 /* The target has to make sure we support lowpart/highpart
5268 extraction, either via direct vector extract or through
5269 punning to an integer mode. */
5270 tree dst1, dst2;
5271 if (convert_optab_handler (vec_extract_optab,
5272 TYPE_MODE (TREE_TYPE (new_temp)),
5273 TYPE_MODE (vectype1))
5274 != CODE_FOR_nothing)
5275 {
5276 /* Extract sub-vectors directly once vec_extract becomes
5277 a conversion optab. */
5278 dst1 = make_ssa_name (vectype1);
5279 epilog_stmt
5280 = gimple_build_assign (dst1, BIT_FIELD_REF,
5281 build3 (BIT_FIELD_REF, vectype1,
5282 new_temp, TYPE_SIZE (vectype1),
5283 bitsize_int (0)));
5284 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5285 dst2 = make_ssa_name (vectype1);
5286 epilog_stmt
5287 = gimple_build_assign (dst2, BIT_FIELD_REF,
5288 build3 (BIT_FIELD_REF, vectype1,
5289 new_temp, TYPE_SIZE (vectype1),
5290 bitsize_int (sz * BITS_PER_UNIT)));
5291 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5292 }
5293 else
5294 {
5295 /* Extract via punning to appropriately sized integer mode
5296 vector. */
5297 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5298 1);
5299 tree etype = build_vector_type (eltype, 2);
5300 gcc_assert (convert_optab_handler (vec_extract_optab,
5301 TYPE_MODE (etype),
5302 TYPE_MODE (eltype))
5303 != CODE_FOR_nothing);
5304 tree tem = make_ssa_name (etype);
5305 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5306 build1 (VIEW_CONVERT_EXPR,
5307 etype, new_temp));
5308 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5309 new_temp = tem;
5310 tem = make_ssa_name (eltype);
5311 epilog_stmt
5312 = gimple_build_assign (tem, BIT_FIELD_REF,
5313 build3 (BIT_FIELD_REF, eltype,
5314 new_temp, TYPE_SIZE (eltype),
5315 bitsize_int (0)));
5316 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5317 dst1 = make_ssa_name (vectype1);
5318 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5319 build1 (VIEW_CONVERT_EXPR,
5320 vectype1, tem));
5321 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5322 tem = make_ssa_name (eltype);
5323 epilog_stmt
5324 = gimple_build_assign (tem, BIT_FIELD_REF,
5325 build3 (BIT_FIELD_REF, eltype,
5326 new_temp, TYPE_SIZE (eltype),
5327 bitsize_int (sz * BITS_PER_UNIT)));
5328 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5329 dst2 = make_ssa_name (vectype1);
5330 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5331 build1 (VIEW_CONVERT_EXPR,
5332 vectype1, tem));
5333 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5334 }
5335
5336 new_temp = make_ssa_name (vectype1);
5337 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5338 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5339 }
5340
5341 if (reduce_with_shift && !slp_reduc)
5342 {
5343 int element_bitsize = tree_to_uhwi (bitsize);
5344 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5345 for variable-length vectors and also requires direct target support
5346 for loop reductions. */
5347 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5348 int nelements = vec_size_in_bits / element_bitsize;
5349 vec_perm_builder sel;
5350 vec_perm_indices indices;
5351
5352 int elt_offset;
5353
5354 tree zero_vec = build_zero_cst (vectype1);
5355 /* Case 2: Create:
5356 for (offset = nelements/2; offset >= 1; offset/=2)
5357 {
5358 Create: va' = vec_shift <va, offset>
5359 Create: va = vop <va, va'>
5360 } */
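/* Rough illustration for a plus reduction of v = { a, b, c, d }
   (placeholder values):
     offset 2: v' = { c, d, 0, 0 },   v = v + v' = { a+c, b+d, c, d }
     offset 1: v' = { b+d, c, d, 0 }, v = v + v' = { a+b+c+d, ... }
   so that element 0 of the result holds the full sum. */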
5361
5362 tree rhs;
5363
5364 if (dump_enabled_p ())
5365 dump_printf_loc (MSG_NOTE, vect_location,
5366 "Reduce using vector shifts\n");
5367
5368 mode1 = TYPE_MODE (vectype1);
5369 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5370 for (elt_offset = nelements / 2;
5371 elt_offset >= 1;
5372 elt_offset /= 2)
5373 {
5374 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5375 indices.new_vector (sel, 2, nelements);
5376 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5377 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5378 new_temp, zero_vec, mask);
5379 new_name = make_ssa_name (vec_dest, epilog_stmt);
5380 gimple_assign_set_lhs (epilog_stmt, new_name);
5381 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5382
5383 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5384 new_temp);
5385 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5386 gimple_assign_set_lhs (epilog_stmt, new_temp);
5387 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5388 }
5389
5390 /* 2.4 Extract the final scalar result. Create:
5391 s_out3 = extract_field <v_out2, bitpos> */
5392
5393 if (dump_enabled_p ())
5394 dump_printf_loc (MSG_NOTE, vect_location,
5395 "extract scalar result\n");
5396
5397 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5398 bitsize, bitsize_zero_node);
5399 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5400 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5401 gimple_assign_set_lhs (epilog_stmt, new_temp);
5402 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5403 scalar_results.safe_push (new_temp);
5404 }
5405 else
5406 {
5407 /* Case 3: Create:
5408 s = extract_field <v_out2, 0>
5409 for (offset = element_size;
5410 offset < vector_size;
5411 offset += element_size;)
5412 {
5413 Create: s' = extract_field <v_out2, offset>
5414 Create: s = op <s, s'> // For non SLP cases
5415 } */
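/* Rough illustration for a plus reduction of v_out2 = { a, b, c, d }
   (placeholder values):
     s = v_out2[0];
     s = s + v_out2[1];
     s = s + v_out2[2];
     s = s + v_out2[3];
   where each element is extracted with a BIT_FIELD_REF; for SLP only
   the extractions are emitted and the s' values are collected. */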
5416
5417 if (dump_enabled_p ())
5418 dump_printf_loc (MSG_NOTE, vect_location,
5419 "Reduce using scalar code.\n");
5420
5421 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5422 int element_bitsize = tree_to_uhwi (bitsize);
5423 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5424 {
5425 int bit_offset;
5426 if (gimple_code (new_phi) == GIMPLE_PHI)
5427 vec_temp = PHI_RESULT (new_phi);
5428 else
5429 vec_temp = gimple_assign_lhs (new_phi);
5430 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5431 bitsize_zero_node);
5432 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5433 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5434 gimple_assign_set_lhs (epilog_stmt, new_temp);
5435 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5436
5437 /* In SLP we don't need to apply the reduction operation, so we
5438 just collect the s' values in SCALAR_RESULTS. */
5439 if (slp_reduc)
5440 scalar_results.safe_push (new_temp);
5441
5442 for (bit_offset = element_bitsize;
5443 bit_offset < vec_size_in_bits;
5444 bit_offset += element_bitsize)
5445 {
5446 tree bitpos = bitsize_int (bit_offset);
5447 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5448 bitsize, bitpos);
5449
5450 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5451 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5452 gimple_assign_set_lhs (epilog_stmt, new_name);
5453 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5454
5455 if (slp_reduc)
5456 {
5457 /* In SLP we don't need to apply the reduction operation, so
5458 we just collect the s' values in SCALAR_RESULTS. */
5459 new_temp = new_name;
5460 scalar_results.safe_push (new_name);
5461 }
5462 else
5463 {
5464 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5465 new_name, new_temp);
5466 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5467 gimple_assign_set_lhs (epilog_stmt, new_temp);
5468 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5469 }
5470 }
5471 }
5472
5473 /* The only case in which we need to reduce scalar results in SLP is
5474 unrolling. If the size of SCALAR_RESULTS is greater than
5475 REDUC_GROUP_SIZE, we reduce them by combining elements modulo
5476 REDUC_GROUP_SIZE. */
5477 if (slp_reduc)
5478 {
5479 tree res, first_res, new_res;
5480 gimple *new_stmt;
5481
5482 /* Reduce multiple scalar results in case of SLP unrolling. */
5483 for (j = group_size; scalar_results.iterate (j, &res);
5484 j++)
5485 {
5486 first_res = scalar_results[j % group_size];
5487 new_stmt = gimple_build_assign (new_scalar_dest, code,
5488 first_res, res);
5489 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5490 gimple_assign_set_lhs (new_stmt, new_res);
5491 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5492 scalar_results[j % group_size] = new_res;
5493 }
5494 }
5495 else
5496 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5497 scalar_results.safe_push (new_temp);
5498 }
5499
5500 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5501 == INTEGER_INDUC_COND_REDUCTION)
5502 && !operand_equal_p (initial_def, induc_val, 0))
5503 {
5504 /* Earlier we set the initial value to be a vector of induc_val
5505 values. Check the result and if it is induc_val then replace
5506 it with the original initial value, unless induc_val is
5507 the same as initial_def already. */
5508 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5509 induc_val);
5510
5511 tree tmp = make_ssa_name (new_scalar_dest);
5512 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5513 initial_def, new_temp);
5514 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5515 scalar_results[0] = tmp;
5516 }
5517 }
5518
5519 vect_finalize_reduction:
5520
5521 if (double_reduc)
5522 loop = loop->inner;
5523
5524 /* 2.5 Adjust the final result by the initial value of the reduction
5525 variable. (When such adjustment is not needed, then
5526 'adjustment_def' is zero). For example, if code is PLUS we create:
5527 new_temp = loop_exit_def + adjustment_def */
5528
5529 if (adjustment_def)
5530 {
5531 gcc_assert (!slp_reduc);
5532 if (nested_in_vect_loop)
5533 {
5534 new_phi = new_phis[0];
5535 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5536 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5537 new_dest = vect_create_destination_var (scalar_dest, vectype);
5538 }
5539 else
5540 {
5541 new_temp = scalar_results[0];
5542 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5543 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5544 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5545 }
5546
5547 epilog_stmt = gimple_build_assign (new_dest, expr);
5548 new_temp = make_ssa_name (new_dest, epilog_stmt);
5549 gimple_assign_set_lhs (epilog_stmt, new_temp);
5550 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5551 if (nested_in_vect_loop)
5552 {
5553 stmt_vec_info epilog_stmt_info = loop_vinfo->add_stmt (epilog_stmt);
5554 STMT_VINFO_RELATED_STMT (epilog_stmt_info)
5555 = STMT_VINFO_RELATED_STMT (loop_vinfo->lookup_stmt (new_phi));
5556
5557 if (!double_reduc)
5558 scalar_results.quick_push (new_temp);
5559 else
5560 scalar_results[0] = new_temp;
5561 }
5562 else
5563 scalar_results[0] = new_temp;
5564
5565 new_phis[0] = epilog_stmt;
5566 }
5567
5568 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5569 phis with new adjusted scalar results, i.e., replace use <s_out0>
5570 with use <s_out4>.
5571
5572 Transform:
5573 loop_exit:
5574 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5575 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5576 v_out2 = reduce <v_out1>
5577 s_out3 = extract_field <v_out2, 0>
5578 s_out4 = adjust_result <s_out3>
5579 use <s_out0>
5580 use <s_out0>
5581
5582 into:
5583
5584 loop_exit:
5585 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5586 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5587 v_out2 = reduce <v_out1>
5588 s_out3 = extract_field <v_out2, 0>
5589 s_out4 = adjust_result <s_out3>
5590 use <s_out4>
5591 use <s_out4> */
5592
5593
5594 /* In an SLP reduction chain we reduce the vector results into one
5595 vector if necessary; hence we set REDUC_GROUP_SIZE to 1 here. SCALAR_DEST
5596 is the LHS of the last stmt in the reduction chain, since we are looking
5597 for the loop exit phi node. */
5598 if (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5599 {
5600 gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5601 /* Handle reduction patterns. */
5602 if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5603 dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5604
5605 scalar_dest = gimple_assign_lhs (dest_stmt);
5606 group_size = 1;
5607 }
5608
5609 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5610 case that REDUC_GROUP_SIZE is greater than vectorization factor).
5611 Therefore, we need to match SCALAR_RESULTS with corresponding statements.
5612 The first (REDUC_GROUP_SIZE / number of new vector stmts) scalar results
5613 correspond to the first vector stmt, etc.
5614 (RATIO is equal to (REDUC_GROUP_SIZE / number of new vector stmts)). */
5615 if (group_size > new_phis.length ())
5616 {
5617 ratio = group_size / new_phis.length ();
5618 gcc_assert (!(group_size % new_phis.length ()));
5619 }
5620 else
5621 ratio = 1;
5622
5623 for (k = 0; k < group_size; k++)
5624 {
5625 if (k % ratio == 0)
5626 {
5627 epilog_stmt = new_phis[k / ratio];
5628 reduction_phi = reduction_phis[k / ratio];
5629 if (double_reduc)
5630 inner_phi = inner_phis[k / ratio];
5631 }
5632
5633 if (slp_reduc)
5634 {
5635 gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5636
5637 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5638 /* SLP statements can't participate in patterns. */
5639 gcc_assert (!orig_stmt);
5640 scalar_dest = gimple_assign_lhs (current_stmt);
5641 }
5642
5643 phis.create (3);
5644 /* Find the loop-closed-use at the loop exit of the original scalar
5645 result. (The reduction result is expected to have two immediate uses -
5646 one at the latch block, and one at the loop exit). */
5647 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5648 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5649 && !is_gimple_debug (USE_STMT (use_p)))
5650 phis.safe_push (USE_STMT (use_p));
5651
5652 /* While we expect to have found an exit_phi because of loop-closed-ssa
5653 form we can end up without one if the scalar cycle is dead. */
5654
5655 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5656 {
5657 if (outer_loop)
5658 {
5659 stmt_vec_info exit_phi_vinfo
5660 = loop_vinfo->lookup_stmt (exit_phi);
5661 gphi *vect_phi;
5662
5663 /* FORNOW. Currently not supporting the case in which an inner-loop
5664 reduction is not used in the outer-loop (but only outside the
5665 outer-loop), unless it is a double reduction. */
5666 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5667 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5668 || double_reduc);
5669
5670 if (double_reduc)
5671 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5672 else
5673 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5674 if (!double_reduc
5675 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5676 != vect_double_reduction_def)
5677 continue;
5678
5679 /* Handle double reduction:
5680
5681 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5682 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5683 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5684 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5685
5686 At that point the regular reduction (stmt2 and stmt3) is
5687 already vectorized, as well as the exit phi node, stmt4.
5688 Here we vectorize the phi node of double reduction, stmt1, and
5689 update all relevant statements. */
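/* A typical source form of such a double reduction is roughly:

     for (i = 0; i < n; i++)
       for (j = 0; j < m; j++)
         s += a[i][j];

   where s corresponds to stmt1 in the outer loop and to the
   stmt2/stmt3 reduction in the inner loop. */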
5690
5691 /* Go through all the uses of s2 to find double reduction phi
5692 node, i.e., stmt1 above. */
5693 orig_name = PHI_RESULT (exit_phi);
5694 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5695 {
5696 stmt_vec_info use_stmt_vinfo;
5697 tree vect_phi_init, preheader_arg, vect_phi_res;
5698 basic_block bb = gimple_bb (use_stmt);
5699 gimple *use;
5700
5701 /* Check that USE_STMT is really double reduction phi
5702 node. */
5703 if (gimple_code (use_stmt) != GIMPLE_PHI
5704 || gimple_phi_num_args (use_stmt) != 2
5705 || bb->loop_father != outer_loop)
5706 continue;
5707 use_stmt_vinfo = loop_vinfo->lookup_stmt (use_stmt);
5708 if (!use_stmt_vinfo
5709 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5710 != vect_double_reduction_def)
5711 continue;
5712
5713 /* Create vector phi node for double reduction:
5714 vs1 = phi <vs0, vs2>
5715 vs1 was created previously in this function by a call to
5716 vect_get_vec_def_for_operand and is stored in
5717 vec_initial_def;
5718 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5719 vs0 is created here. */
5720
5721 /* Create vector phi node. */
5722 vect_phi = create_phi_node (vec_initial_def, bb);
5723 loop_vec_info_for_loop (outer_loop)->add_stmt (vect_phi);
5724
5725 /* Create vs0 - initial def of the double reduction phi. */
5726 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5727 loop_preheader_edge (outer_loop));
5728 vect_phi_init = get_initial_def_for_reduction
5729 (stmt, preheader_arg, NULL);
5730
5731 /* Update phi node arguments with vs0 and vs2. */
5732 add_phi_arg (vect_phi, vect_phi_init,
5733 loop_preheader_edge (outer_loop),
5734 UNKNOWN_LOCATION);
5735 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5736 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5737 if (dump_enabled_p ())
5738 {
5739 dump_printf_loc (MSG_NOTE, vect_location,
5740 "created double reduction phi node: ");
5741 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5742 }
5743
5744 vect_phi_res = PHI_RESULT (vect_phi);
5745
5746 /* Replace the use, i.e., set the correct vs1 in the regular
5747 reduction phi node. FORNOW, NCOPIES is always 1, so the
5748 loop is redundant. */
5749 use = reduction_phi;
5750 for (j = 0; j < ncopies; j++)
5751 {
5752 edge pr_edge = loop_preheader_edge (loop);
5753 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5754 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5755 }
5756 }
5757 }
5758 }
5759
5760 phis.release ();
5761 if (nested_in_vect_loop)
5762 {
5763 if (double_reduc)
5764 loop = outer_loop;
5765 else
5766 continue;
5767 }
5768
5769 phis.create (3);
5770 /* Find the loop-closed-use at the loop exit of the original scalar
5771 result. (The reduction result is expected to have two immediate uses,
5772 one at the latch block, and one at the loop exit). For double
5773 reductions we are looking for exit phis of the outer loop. */
5774 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5775 {
5776 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5777 {
5778 if (!is_gimple_debug (USE_STMT (use_p)))
5779 phis.safe_push (USE_STMT (use_p));
5780 }
5781 else
5782 {
5783 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5784 {
5785 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5786
5787 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5788 {
5789 if (!flow_bb_inside_loop_p (loop,
5790 gimple_bb (USE_STMT (phi_use_p)))
5791 && !is_gimple_debug (USE_STMT (phi_use_p)))
5792 phis.safe_push (USE_STMT (phi_use_p));
5793 }
5794 }
5795 }
5796 }
5797
5798 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5799 {
5800 /* Replace the uses: */
5801 orig_name = PHI_RESULT (exit_phi);
5802 scalar_result = scalar_results[k];
5803 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5804 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5805 SET_USE (use_p, scalar_result);
5806 }
5807
5808 phis.release ();
5809 }
5810 }
5811
5812 /* Return a vector of type VECTYPE that is equal to the vector select
5813 operation "MASK ? VEC : IDENTITY". Insert the select statements
5814 before GSI. */
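/* For example, with MASK = { -1, 0, -1, 0 }, VEC = { a, b, c, d } and
   IDENTITY = { 0, 0, 0, 0 } (placeholder values), the returned SSA name
   holds { a, 0, c, 0 }. */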
5815
5816 static tree
5817 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
5818 tree vec, tree identity)
5819 {
5820 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
5821 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
5822 mask, vec, identity);
5823 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5824 return cond;
5825 }
5826
5827 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
5828 order, starting with LHS. Insert the extraction statements before GSI and
5829 associate the new scalar SSA names with variable SCALAR_DEST.
5830 Return the SSA name for the result. */
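/* For example, with CODE == PLUS_EXPR and a 4-element VECTOR_RHS v this
   produces roughly ((((LHS + v[0]) + v[1]) + v[2]) + v[3]), keeping the
   strict left-to-right association. */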
5831
5832 static tree
5833 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
5834 tree_code code, tree lhs, tree vector_rhs)
5835 {
5836 tree vectype = TREE_TYPE (vector_rhs);
5837 tree scalar_type = TREE_TYPE (vectype);
5838 tree bitsize = TYPE_SIZE (scalar_type);
5839 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5840 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
5841
5842 for (unsigned HOST_WIDE_INT bit_offset = 0;
5843 bit_offset < vec_size_in_bits;
5844 bit_offset += element_bitsize)
5845 {
5846 tree bitpos = bitsize_int (bit_offset);
5847 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
5848 bitsize, bitpos);
5849
5850 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
5851 rhs = make_ssa_name (scalar_dest, stmt);
5852 gimple_assign_set_lhs (stmt, rhs);
5853 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5854
5855 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
5856 tree new_name = make_ssa_name (scalar_dest, stmt);
5857 gimple_assign_set_lhs (stmt, new_name);
5858 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
5859 lhs = new_name;
5860 }
5861 return lhs;
5862 }
5863
5864 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT is the
5865 statement that sets the live-out value. REDUC_DEF_STMT is the phi
5866 statement. CODE is the operation performed by STMT and OPS are
5867 its scalar operands. REDUC_INDEX is the index of the operand in
5868 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
5869 implements in-order reduction, or IFN_LAST if we should open-code it.
5870 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
5871 that should be used to control the operation in a fully-masked loop. */
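/* The point of a fold-left reduction is that the scalar association is
   preserved, i.e. for a float sum the result is computed roughly as
     res = (((init + a[0]) + a[1]) + a[2]) + ...
   rather than by pairwise combination. In a fully-masked loop the
   inactive elements are first replaced with a zero identity vector via
   merge_with_identity so that they do not change the running result. */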
5872
5873 static bool
5874 vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5875 gimple **vec_stmt, slp_tree slp_node,
5876 gimple *reduc_def_stmt,
5877 tree_code code, internal_fn reduc_fn,
5878 tree ops[3], tree vectype_in,
5879 int reduc_index, vec_loop_masks *masks)
5880 {
5881 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5882 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5883 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5884 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5885 gimple *new_stmt = NULL;
5886
5887 int ncopies;
5888 if (slp_node)
5889 ncopies = 1;
5890 else
5891 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5892
5893 gcc_assert (!nested_in_vect_loop_p (loop, stmt));
5894 gcc_assert (ncopies == 1);
5895 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
5896 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
5897 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5898 == FOLD_LEFT_REDUCTION);
5899
5900 if (slp_node)
5901 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
5902 TYPE_VECTOR_SUBPARTS (vectype_in)));
5903
5904 tree op0 = ops[1 - reduc_index];
5905
5906 int group_size = 1;
5907 gimple *scalar_dest_def;
5908 auto_vec<tree> vec_oprnds0;
5909 if (slp_node)
5910 {
5911 vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL, slp_node);
5912 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
5913 scalar_dest_def = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5914 }
5915 else
5916 {
5917 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt);
5918 vec_oprnds0.create (1);
5919 vec_oprnds0.quick_push (loop_vec_def0);
5920 scalar_dest_def = stmt;
5921 }
5922
5923 tree scalar_dest = gimple_assign_lhs (scalar_dest_def);
5924 tree scalar_type = TREE_TYPE (scalar_dest);
5925 tree reduc_var = gimple_phi_result (reduc_def_stmt);
5926
5927 int vec_num = vec_oprnds0.length ();
5928 gcc_assert (vec_num == 1 || slp_node);
5929 tree vec_elem_type = TREE_TYPE (vectype_out);
5930 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
5931
5932 tree vector_identity = NULL_TREE;
5933 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5934 vector_identity = build_zero_cst (vectype_out);
5935
5936 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
5937 int i;
5938 tree def0;
5939 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5940 {
5941 tree mask = NULL_TREE;
5942 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
5943 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
5944
5945 /* Handle MINUS by adding the negative. */
5946 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
5947 {
5948 tree negated = make_ssa_name (vectype_out);
5949 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
5950 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
5951 def0 = negated;
5952 }
5953
5954 if (mask)
5955 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
5956 vector_identity);
5957
5958 /* On the first iteration the input is simply the scalar phi
5959 result, and for subsequent iterations it is the output of
5960 the preceding operation. */
5961 if (reduc_fn != IFN_LAST)
5962 {
5963 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
5964 /* For chained SLP reductions the output of the previous reduction
5965 operation serves as the input of the next. For the final statement
5966 the output cannot be a temporary - we reuse the original
5967 scalar destination of the last statement. */
5968 if (i != vec_num - 1)
5969 {
5970 gimple_set_lhs (new_stmt, scalar_dest_var);
5971 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
5972 gimple_set_lhs (new_stmt, reduc_var);
5973 }
5974 }
5975 else
5976 {
5977 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
5978 reduc_var, def0);
5979 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
5980 /* Remove the statement, so that we can use the same code paths
5981 as for statements that we've just created. */
5982 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
5983 gsi_remove (&tmp_gsi, false);
5984 }
5985
5986 if (i == vec_num - 1)
5987 {
5988 gimple_set_lhs (new_stmt, scalar_dest);
5989 vect_finish_replace_stmt (scalar_dest_def, new_stmt);
5990 }
5991 else
5992 vect_finish_stmt_generation (scalar_dest_def, new_stmt, gsi);
5993
5994 if (slp_node)
5995 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5996 }
5997
5998 if (!slp_node)
5999 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6000
6001 return true;
6002 }
6003
6004 /* Function is_nonwrapping_integer_induction.
6005
6006 Check if STMT (which is part of loop LOOP) is an induction that
6007 both increments and does not cause overflow. */
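/* For example, with base 0, step 4 and at most 1000 iterations the
   largest value reached is 4000; the induction is accepted only if that
   value fits in the precision of the phi result type, or if overflow is
   undefined for that type anyway. */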
6008
6009 static bool
6010 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
6011 {
6012 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
6013 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6014 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6015 tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
6016 widest_int ni, max_loop_value, lhs_max;
6017 wi::overflow_type overflow = wi::OVF_NONE;
6018
6019 /* Make sure the loop is integer based. */
6020 if (TREE_CODE (base) != INTEGER_CST
6021 || TREE_CODE (step) != INTEGER_CST)
6022 return false;
6023
6024 /* Check that the max size of the loop will not wrap. */
6025
6026 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6027 return true;
6028
6029 if (! max_stmt_executions (loop, &ni))
6030 return false;
6031
6032 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6033 &overflow);
6034 if (overflow)
6035 return false;
6036
6037 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6038 TYPE_SIGN (lhs_type), &overflow);
6039 if (overflow)
6040 return false;
6041
6042 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6043 <= TYPE_PRECISION (lhs_type));
6044 }
6045
6046 /* Function vectorizable_reduction.
6047
6048 Check if STMT performs a reduction operation that can be vectorized.
6049 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6050 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6051 Return FALSE if not a vectorizable STMT, TRUE otherwise.
6052
6053 This function also handles reduction idioms (patterns) that have been
6054 recognized in advance during vect_pattern_recog. In this case, STMT may be
6055 of this form:
6056 X = pattern_expr (arg0, arg1, ..., X)
6057 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6058 sequence that had been detected and replaced by the pattern-stmt (STMT).
6059
6060 This function also handles reduction of condition expressions, for example:
6061 for (int i = 0; i < N; i++)
6062 if (a[i] < value)
6063 last = a[i];
6064 This is handled by vectorising the loop and creating an additional vector
6065 containing the loop indexes for which "a[i] < value" was true. In the
6066 function epilogue this is reduced to a single max value and then used to
6067 index into the vector of results.
6068
6069 In some cases of reduction patterns, the type of the reduction variable X is
6070 different than the type of the other arguments of STMT.
6071 In such cases, the vectype that is used when transforming STMT into a vector
6072 stmt is different than the vectype that is used to determine the
6073 vectorization factor, because it consists of a different number of elements
6074 than the actual number of elements that are being operated upon in parallel.
6075
6076 For example, consider an accumulation of shorts into an int accumulator.
6077 On some targets it's possible to vectorize this pattern operating on 8
6078 shorts at a time (hence, the vectype for purposes of determining the
6079 vectorization factor should be V8HI); on the other hand, the vectype that
6080 is used to create the vector form is actually V4SI (the type of the result).
6081
6082 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6083 indicates what is the actual level of parallelism (V8HI in the example), so
6084 that the right vectorization factor would be derived. This vectype
6085 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6086 be used to create the vectorized stmt. The right vectype for the vectorized
6087 stmt is obtained from the type of the result X:
6088 get_vectype_for_scalar_type (TREE_TYPE (X))
6089
6090 This means that, contrary to "regular" reductions (or "regular" stmts in
6091 general), the following equation:
6092 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6093 does *NOT* necessarily hold for reduction patterns. */
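/* For instance, in the short-into-int accumulation above the loop body
   might (on a suitable target) use something like
     vect_acc_1 = WIDEN_SUM_EXPR <vect_short_a, vect_acc_0>;
   (illustrative SSA names) with a V8HI input and a V4SI accumulator,
   while the epilog reduces the V4SI accumulator with an ordinary plus. */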
6094
6095 bool
6096 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6097 gimple **vec_stmt, slp_tree slp_node,
6098 slp_instance slp_node_instance,
6099 stmt_vector_for_cost *cost_vec)
6100 {
6101 tree vec_dest;
6102 tree scalar_dest;
6103 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6104 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6105 tree vectype_in = NULL_TREE;
6106 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6107 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6108 enum tree_code code, orig_code;
6109 internal_fn reduc_fn;
6110 machine_mode vec_mode;
6111 int op_type;
6112 optab optab;
6113 tree new_temp = NULL_TREE;
6114 gimple *def_stmt;
6115 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6116 gimple *cond_reduc_def_stmt = NULL;
6117 enum tree_code cond_reduc_op_code = ERROR_MARK;
6118 tree scalar_type;
6119 bool is_simple_use;
6120 gimple *orig_stmt;
6121 stmt_vec_info orig_stmt_info = NULL;
6122 int i;
6123 int ncopies;
6124 int epilog_copies;
6125 stmt_vec_info prev_stmt_info, prev_phi_info;
6126 bool single_defuse_cycle = false;
6127 gimple *new_stmt = NULL;
6128 int j;
6129 tree ops[3];
6130 enum vect_def_type dts[3];
6131 bool nested_cycle = false, found_nested_cycle_def = false;
6132 bool double_reduc = false;
6133 basic_block def_bb;
6134 struct loop * def_stmt_loop, *outer_loop = NULL;
6135 tree def_arg;
6136 gimple *def_arg_stmt;
6137 auto_vec<tree> vec_oprnds0;
6138 auto_vec<tree> vec_oprnds1;
6139 auto_vec<tree> vec_oprnds2;
6140 auto_vec<tree> vect_defs;
6141 auto_vec<gimple *> phis;
6142 int vec_num;
6143 tree def0, tem;
6144 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6145 tree cond_reduc_val = NULL_TREE;
6146
6147 /* Make sure it was already recognized as a reduction computation. */
6148 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
6149 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
6150 return false;
6151
6152 if (nested_in_vect_loop_p (loop, stmt))
6153 {
6154 outer_loop = loop;
6155 loop = loop->inner;
6156 nested_cycle = true;
6157 }
6158
6159 if (REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6160 gcc_assert (slp_node && REDUC_GROUP_FIRST_ELEMENT (stmt_info) == stmt);
6161
6162 if (gimple_code (stmt) == GIMPLE_PHI)
6163 {
6164 /* Analysis is fully done on the reduction stmt invocation. */
6165 if (! vec_stmt)
6166 {
6167 if (slp_node)
6168 slp_node_instance->reduc_phis = slp_node;
6169
6170 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6171 return true;
6172 }
6173
6174 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6175 /* Leave the scalar phi in place. Note that checking
6176 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6177 for reductions involving a single statement. */
6178 return true;
6179
6180 gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6181 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
6182 reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
6183
6184 if (STMT_VINFO_VEC_REDUCTION_TYPE (vinfo_for_stmt (reduc_stmt))
6185 == EXTRACT_LAST_REDUCTION)
6186 /* Leave the scalar phi in place. */
6187 return true;
6188
6189 gcc_assert (is_gimple_assign (reduc_stmt));
6190 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6191 {
6192 tree op = gimple_op (reduc_stmt, k);
6193 if (op == gimple_phi_result (stmt))
6194 continue;
6195 if (k == 1
6196 && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6197 continue;
6198 if (!vectype_in
6199 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6200 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6201 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6202 break;
6203 }
6204 gcc_assert (vectype_in);
6205
6206 if (slp_node)
6207 ncopies = 1;
6208 else
6209 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6210
6211 use_operand_p use_p;
6212 gimple *use_stmt;
6213 if (ncopies > 1
6214 && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
6215 <= vect_used_only_live)
6216 && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
6217 && (use_stmt == reduc_stmt
6218 || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
6219 == reduc_stmt)))
6220 single_defuse_cycle = true;
6221
6222 /* Create the destination vector */
6223 scalar_dest = gimple_assign_lhs (reduc_stmt);
6224 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6225
6226 if (slp_node)
6227 /* The size vect_schedule_slp_instance computes is off for us. */
6228 vec_num = vect_get_num_vectors
6229 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6230 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6231 vectype_in);
6232 else
6233 vec_num = 1;
6234
6235 /* Generate the reduction PHIs upfront. */
6236 prev_phi_info = NULL;
6237 for (j = 0; j < ncopies; j++)
6238 {
6239 if (j == 0 || !single_defuse_cycle)
6240 {
6241 for (i = 0; i < vec_num; i++)
6242 {
6243 /* Create the reduction-phi that defines the reduction
6244 operand. */
6245 gimple *new_phi = create_phi_node (vec_dest, loop->header);
6246 stmt_vec_info new_phi_info = loop_vinfo->add_stmt (new_phi);
6247
6248 if (slp_node)
6249 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
6250 else
6251 {
6252 if (j == 0)
6253 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
6254 else
6255 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
6256 prev_phi_info = new_phi_info;
6257 }
6258 }
6259 }
6260 }
6261
6262 return true;
6263 }
6264
6265 /* 1. Is vectorizable reduction? */
6266 /* Not supportable if the reduction variable is used in the loop, unless
6267 it's a reduction chain. */
6268 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6269 && !REDUC_GROUP_FIRST_ELEMENT (stmt_info))
6270 return false;
6271
6272 /* Reductions that are not used even in an enclosing outer-loop
6273 are expected to be "live" (used out of the loop). */
6274 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6275 && !STMT_VINFO_LIVE_P (stmt_info))
6276 return false;
6277
6278 /* 2. Has this been recognized as a reduction pattern?
6279
6280 Check if STMT represents a pattern that has been recognized
6281 in earlier analysis stages. For stmts that represent a pattern,
6282 the STMT_VINFO_RELATED_STMT field records the last stmt in
6283 the original sequence that constitutes the pattern. */
6284
6285 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
6286 if (orig_stmt)
6287 {
6288 orig_stmt_info = vinfo_for_stmt (orig_stmt);
6289 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6290 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6291 }
6292
6293 /* 3. Check the operands of the operation. The first operands are defined
6294 inside the loop body. The last operand is the reduction variable,
6295 which is defined by the loop-header-phi. */
6296
6297 gcc_assert (is_gimple_assign (stmt));
6298
6299 /* Flatten RHS. */
6300 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6301 {
6302 case GIMPLE_BINARY_RHS:
6303 code = gimple_assign_rhs_code (stmt);
6304 op_type = TREE_CODE_LENGTH (code);
6305 gcc_assert (op_type == binary_op);
6306 ops[0] = gimple_assign_rhs1 (stmt);
6307 ops[1] = gimple_assign_rhs2 (stmt);
6308 break;
6309
6310 case GIMPLE_TERNARY_RHS:
6311 code = gimple_assign_rhs_code (stmt);
6312 op_type = TREE_CODE_LENGTH (code);
6313 gcc_assert (op_type == ternary_op);
6314 ops[0] = gimple_assign_rhs1 (stmt);
6315 ops[1] = gimple_assign_rhs2 (stmt);
6316 ops[2] = gimple_assign_rhs3 (stmt);
6317 break;
6318
6319 case GIMPLE_UNARY_RHS:
6320 return false;
6321
6322 default:
6323 gcc_unreachable ();
6324 }
6325
6326 if (code == COND_EXPR && slp_node)
6327 return false;
6328
6329 scalar_dest = gimple_assign_lhs (stmt);
6330 scalar_type = TREE_TYPE (scalar_dest);
6331 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6332 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6333 return false;
6334
6335 /* Do not try to vectorize bit-precision reductions. */
6336 if (!type_has_mode_precision_p (scalar_type))
6337 return false;
6338
6339 /* All uses but the last are expected to be defined in the loop.
6340 The last use is the reduction variable. In case of nested cycle this
6341 assumption is not true: we use reduc_index to record the index of the
6342 reduction variable. */
6343 gimple *reduc_def_stmt = NULL;
6344 int reduc_index = -1;
6345 for (i = 0; i < op_type; i++)
6346 {
6347 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6348 if (i == 0 && code == COND_EXPR)
6349 continue;
6350
6351 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
6352 &dts[i], &tem, &def_stmt);
6353 dt = dts[i];
6354 gcc_assert (is_simple_use);
6355 if (dt == vect_reduction_def)
6356 {
6357 reduc_def_stmt = def_stmt;
6358 reduc_index = i;
6359 continue;
6360 }
6361 else if (tem)
6362 {
6363 /* To properly compute ncopies we are interested in the widest
6364 input type in case we're looking at a widening accumulation. */
6365 if (!vectype_in
6366 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6367 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6368 vectype_in = tem;
6369 }
6370
6371 if (dt != vect_internal_def
6372 && dt != vect_external_def
6373 && dt != vect_constant_def
6374 && dt != vect_induction_def
6375 && !(dt == vect_nested_cycle && nested_cycle))
6376 return false;
6377
6378 if (dt == vect_nested_cycle)
6379 {
6380 found_nested_cycle_def = true;
6381 reduc_def_stmt = def_stmt;
6382 reduc_index = i;
6383 }
6384
6385 if (i == 1 && code == COND_EXPR)
6386 {
6387 /* Record how value of COND_EXPR is defined. */
6388 if (dt == vect_constant_def)
6389 {
6390 cond_reduc_dt = dt;
6391 cond_reduc_val = ops[i];
6392 }
6393 if (dt == vect_induction_def
6394 && def_stmt != NULL
6395 && is_nonwrapping_integer_induction (def_stmt, loop))
6396 {
6397 cond_reduc_dt = dt;
6398 cond_reduc_def_stmt = def_stmt;
6399 }
6400 }
6401 }
6402
6403 if (!vectype_in)
6404 vectype_in = vectype_out;
6405
6406 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6407 directly used in stmt. */
6408 if (reduc_index == -1)
6409 {
6410 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6411 {
6412 if (dump_enabled_p ())
6413 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6414 "in-order reduction chain without SLP.\n");
6415 return false;
6416 }
6417
6418 if (orig_stmt)
6419 reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6420 else
6421 reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6422 }
6423
6424 if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
6425 return false;
6426
6427 if (!(reduc_index == -1
6428 || dts[reduc_index] == vect_reduction_def
6429 || dts[reduc_index] == vect_nested_cycle
6430 || ((dts[reduc_index] == vect_internal_def
6431 || dts[reduc_index] == vect_external_def
6432 || dts[reduc_index] == vect_constant_def
6433 || dts[reduc_index] == vect_induction_def)
6434 && nested_cycle && found_nested_cycle_def)))
6435 {
6436 /* For pattern recognized stmts, orig_stmt might be a reduction,
6437 but some helper statements for the pattern might not, or
6438 might be COND_EXPRs with reduction uses in the condition. */
6439 gcc_assert (orig_stmt);
6440 return false;
6441 }
6442
6443 stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
6444 /* PHIs should not participate in patterns. */
6445 gcc_assert (!STMT_VINFO_RELATED_STMT (reduc_def_info));
6446 enum vect_reduction_type v_reduc_type
6447 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6448 gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6449
6450 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6451 /* If we have a condition reduction, see if we can simplify it further. */
6452 if (v_reduc_type == COND_REDUCTION)
6453 {
6454 /* TODO: We can't yet handle reduction chains, since we need to treat
6455 each COND_EXPR in the chain specially, not just the last one.
6456 E.g. for:
6457
6458 x_1 = PHI <x_3, ...>
6459 x_2 = a_2 ? ... : x_1;
6460 x_3 = a_3 ? ... : x_2;
6461
6462 we're interested in the last element in x_3 for which a_2 || a_3
6463 is true, whereas the current reduction chain handling would
6464 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6465 as a reduction operation. */
6466 if (reduc_index == -1)
6467 {
6468 if (dump_enabled_p ())
6469 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6470 "conditional reduction chains not supported\n");
6471 return false;
6472 }
6473
6474 /* vect_is_simple_reduction ensured that operand 2 is the
6475 loop-carried operand. */
6476 gcc_assert (reduc_index == 2);
6477
6478 /* Loop peeling modifies the initial value of the reduction PHI, which
6479 makes the reduction stmt that is transformed differ from the
6480 original stmt that was analyzed. We need to record the reduction code
6481 for a CONST_COND_REDUCTION type reduction at analysis stage so that
6482 it can be used directly at transform stage. */
6483 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6484 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6485 {
6486 /* Also set the reduction type to CONST_COND_REDUCTION. */
6487 gcc_assert (cond_reduc_dt == vect_constant_def);
6488 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6489 }
6490 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6491 vectype_in, OPTIMIZE_FOR_SPEED))
6492 {
6493 if (dump_enabled_p ())
6494 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6495 "optimizing condition reduction with"
6496 " FOLD_EXTRACT_LAST.\n");
6497 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6498 }
6499 else if (cond_reduc_dt == vect_induction_def)
6500 {
6501 stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt);
6502 tree base
6503 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6504 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6505
6506 gcc_assert (TREE_CODE (base) == INTEGER_CST
6507 && TREE_CODE (step) == INTEGER_CST);
6508 cond_reduc_val = NULL_TREE;
6509 /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6510 above base; punt if base is the minimum value of the type for
6511 MAX_EXPR or maximum value of the type for MIN_EXPR for now. */
6512 if (tree_int_cst_sgn (step) == -1)
6513 {
6514 cond_reduc_op_code = MIN_EXPR;
6515 if (tree_int_cst_sgn (base) == -1)
6516 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6517 else if (tree_int_cst_lt (base,
6518 TYPE_MAX_VALUE (TREE_TYPE (base))))
6519 cond_reduc_val
6520 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6521 }
6522 else
6523 {
6524 cond_reduc_op_code = MAX_EXPR;
6525 if (tree_int_cst_sgn (base) == 1)
6526 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6527 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6528 base))
6529 cond_reduc_val
6530 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6531 }
6532 if (cond_reduc_val)
6533 {
6534 if (dump_enabled_p ())
6535 dump_printf_loc (MSG_NOTE, vect_location,
6536 "condition expression based on "
6537 "integer induction.\n");
6538 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6539 = INTEGER_INDUC_COND_REDUCTION;
6540 }
6541 }
6542 else if (cond_reduc_dt == vect_constant_def)
6543 {
6544 enum vect_def_type cond_initial_dt;
6545 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6546 tree cond_initial_val
6547 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6548
6549 gcc_assert (cond_reduc_val != NULL_TREE);
6550 vect_is_simple_use (cond_initial_val, loop_vinfo, &cond_initial_dt);
6551 if (cond_initial_dt == vect_constant_def
6552 && types_compatible_p (TREE_TYPE (cond_initial_val),
6553 TREE_TYPE (cond_reduc_val)))
6554 {
6555 tree e = fold_binary (LE_EXPR, boolean_type_node,
6556 cond_initial_val, cond_reduc_val);
6557 if (e && (integer_onep (e) || integer_zerop (e)))
6558 {
6559 if (dump_enabled_p ())
6560 dump_printf_loc (MSG_NOTE, vect_location,
6561 "condition expression based on "
6562 "compile time constant.\n");
6563 /* Record reduction code at analysis stage. */
6564 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6565 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6566 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6567 = CONST_COND_REDUCTION;
6568 }
6569 }
6570 }
6571 }
6572
6573 if (orig_stmt)
6574 gcc_assert (tmp == orig_stmt
6575 || (REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp))
6576 == orig_stmt));
6577 else
6578 /* We changed STMT to be the first stmt in reduction chain, hence we
6579 check that in this case the first element in the chain is STMT. */
6580 gcc_assert (stmt == tmp
6581 || REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
6582
6583 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
6584 return false;
6585
6586 if (slp_node)
6587 ncopies = 1;
6588 else
6589 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6590
6591 gcc_assert (ncopies >= 1);
6592
6593 vec_mode = TYPE_MODE (vectype_in);
6594 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6595
6596 if (code == COND_EXPR)
6597 {
6598 /* Only call during the analysis stage, otherwise we'll lose
6599 STMT_VINFO_TYPE. */
6600 if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6601 ops[reduc_index], 0, NULL,
6602 cost_vec))
6603 {
6604 if (dump_enabled_p ())
6605 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6606 "unsupported condition in reduction\n");
6607 return false;
6608 }
6609 }
6610 else
6611 {
6612 /* 4. Supportable by target? */
6613
6614 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6615 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6616 {
6617 /* Shifts and rotates are only supported by vectorizable_shift,
6618 not vectorizable_reduction. */
6619 if (dump_enabled_p ())
6620 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6621 "unsupported shift or rotation.\n");
6622 return false;
6623 }
6624
6625 /* 4.1. check support for the operation in the loop */
6626 optab = optab_for_tree_code (code, vectype_in, optab_default);
6627 if (!optab)
6628 {
6629 if (dump_enabled_p ())
6630 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6631 "no optab.\n");
6632
6633 return false;
6634 }
6635
6636 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6637 {
6638 if (dump_enabled_p ())
6639 dump_printf (MSG_NOTE, "op not supported by target.\n");
6640
6641 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6642 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6643 return false;
6644
6645 if (dump_enabled_p ())
6646 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6647 }
6648
6649 /* Worthwhile without SIMD support? */
6650 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6651 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6652 {
6653 if (dump_enabled_p ())
6654 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6655 "not worthwhile without SIMD support.\n");
6656
6657 return false;
6658 }
6659 }
6660
6661 /* 4.2. Check support for the epilog operation.
6662
6663 If STMT represents a reduction pattern, then the type of the
6664 reduction variable may be different than the type of the rest
6665 of the arguments. For example, consider the case of accumulation
6666 of shorts into an int accumulator; The original code:
6667 S1: int_a = (int) short_a;
6668 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6669
6670 was replaced with:
6671 STMT: int_acc = widen_sum <short_a, int_acc>
6672
6673 This means that:
6674 1. The tree-code that is used to create the vector operation in the
6675 epilog code (that reduces the partial results) is not the
6676 tree-code of STMT, but is rather the tree-code of the original
6677 stmt from the pattern that STMT is replacing. I.e, in the example
6678 above we want to use 'widen_sum' in the loop, but 'plus' in the
6679 epilog.
6680 2. The type (mode) we use to check available target support
6681 for the vector operation to be created in the *epilog*, is
6682 determined by the type of the reduction variable (in the example
6683 above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6684 However the type (mode) we use to check available target support
6685 for the vector operation to be created *inside the loop*, is
6686 determined by the type of the other arguments to STMT (in the
6687 example we'd check this: optab_handler (widen_sum_optab,
6688 vect_short_mode)).
6689
6690 This is contrary to "regular" reductions, in which the types of all
6691 the arguments are the same as the type of the reduction variable.
6692 For "regular" reductions we can therefore use the same vector type
6693 (and also the same tree-code) when generating the epilog code and
6694 when generating the code inside the loop. */
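 /* Source-level illustration only (not part of the original comment): the
    widen_sum pattern described above typically comes from code such as

	short in[N];
	int sum = 0;
	for (int i = 0; i < N; i++)
	  sum += in[i];

    where the loop body can use WIDEN_SUM on the short operands while the
    epilog that combines the partial results uses a plain PLUS on vectors
    of the int accumulator type (vectype_out).  */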
6695
6696 vect_reduction_type reduction_type
6697 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6698 if (orig_stmt
6699 && (reduction_type == TREE_CODE_REDUCTION
6700 || reduction_type == FOLD_LEFT_REDUCTION))
6701 {
6702 /* This is a reduction pattern: get the vectype from the type of the
6703 reduction variable, and get the tree-code from orig_stmt. */
6704 orig_code = gimple_assign_rhs_code (orig_stmt);
6705 gcc_assert (vectype_out);
6706 vec_mode = TYPE_MODE (vectype_out);
6707 }
6708 else
6709 {
6710 /* Regular reduction: the same vectype and tree-code as used for the
6711 vector code inside the loop can be used for the epilog code. */
6712 orig_code = code;
6713
6714 if (code == MINUS_EXPR)
6715 orig_code = PLUS_EXPR;
6716
6717 /* For simple condition reductions, replace with the actual expression
6718 we want to base our reduction around. */
6719 if (reduction_type == CONST_COND_REDUCTION)
6720 {
6721 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6722 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6723 }
6724 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6725 orig_code = cond_reduc_op_code;
6726 }
6727
6728 if (nested_cycle)
6729 {
6730 def_bb = gimple_bb (reduc_def_stmt);
6731 def_stmt_loop = def_bb->loop_father;
6732 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
6733 loop_preheader_edge (def_stmt_loop));
6734 if (TREE_CODE (def_arg) == SSA_NAME
6735 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
6736 && gimple_code (def_arg_stmt) == GIMPLE_PHI
6737 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
6738 && vinfo_for_stmt (def_arg_stmt)
6739 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
6740 == vect_double_reduction_def)
6741 double_reduc = true;
6742 }
6743
6744 reduc_fn = IFN_LAST;
6745
6746 if (reduction_type == TREE_CODE_REDUCTION
6747 || reduction_type == FOLD_LEFT_REDUCTION
6748 || reduction_type == INTEGER_INDUC_COND_REDUCTION
6749 || reduction_type == CONST_COND_REDUCTION)
6750 {
6751 if (reduction_type == FOLD_LEFT_REDUCTION
6752 ? fold_left_reduction_fn (orig_code, &reduc_fn)
6753 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6754 {
6755 if (reduc_fn != IFN_LAST
6756 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6757 OPTIMIZE_FOR_SPEED))
6758 {
6759 if (dump_enabled_p ())
6760 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6761 "reduc op not supported by target.\n");
6762
6763 reduc_fn = IFN_LAST;
6764 }
6765 }
6766 else
6767 {
6768 if (!nested_cycle || double_reduc)
6769 {
6770 if (dump_enabled_p ())
6771 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6772 "no reduc code for scalar code.\n");
6773
6774 return false;
6775 }
6776 }
6777 }
6778 else if (reduction_type == COND_REDUCTION)
6779 {
6780 int scalar_precision
6781 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6782 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6783 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6784 nunits_out);
6785
6786 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6787 OPTIMIZE_FOR_SPEED))
6788 reduc_fn = IFN_REDUC_MAX;
6789 }
6790
6791 if (reduction_type != EXTRACT_LAST_REDUCTION
6792 && reduc_fn == IFN_LAST
6793 && !nunits_out.is_constant ())
6794 {
6795 if (dump_enabled_p ())
6796 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6797 "missing target support for reduction on"
6798 " variable-length vectors.\n");
6799 return false;
6800 }
6801
6802 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
6803 && ncopies > 1)
6804 {
6805 if (dump_enabled_p ())
6806 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6807 "multiple types in double reduction or condition "
6808 "reduction.\n");
6809 return false;
6810 }
6811
6812 /* For SLP reductions, see if there is a neutral value we can use. */
6813 tree neutral_op = NULL_TREE;
6814 if (slp_node)
6815 neutral_op = neutral_op_for_slp_reduction
6816 (slp_node_instance->reduc_phis, code,
6817 REDUC_GROUP_FIRST_ELEMENT (stmt_info) != NULL);
6818
6819 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
6820 {
6821 /* We can't support in-order reductions of code such as this:
6822
6823 for (int i = 0; i < n1; ++i)
6824 for (int j = 0; j < n2; ++j)
6825 l += a[j];
6826
6827 since GCC effectively transforms the loop when vectorizing:
6828
6829 for (int i = 0; i < n1 / VF; ++i)
6830 for (int j = 0; j < n2; ++j)
6831 for (int k = 0; k < VF; ++k)
6832 l += a[j];
6833
6834 which is a reassociation of the original operation. */
6835 if (dump_enabled_p ())
6836 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6837 "in-order double reduction not supported.\n");
6838
6839 return false;
6840 }
6841
6842 if (reduction_type == FOLD_LEFT_REDUCTION
6843 && slp_node
6844 && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
6845 {
6846 /* We cannot use in-order reductions in this case because there is
6847 an implicit reassociation of the operations involved. */
6848 if (dump_enabled_p ())
6849 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6850 "in-order unchained SLP reductions not supported.\n");
6851 return false;
6852 }
6853
6854 /* For double reductions, and for SLP reductions with a neutral value,
6855 we construct a variable-length initial vector by loading a vector
6856 full of the neutral value and then shift-and-inserting the start
6857 values into the low-numbered elements. */
6858 if ((double_reduc || neutral_op)
6859 && !nunits_out.is_constant ()
6860 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
6861 vectype_out, OPTIMIZE_FOR_SPEED))
6862 {
6863 if (dump_enabled_p ())
6864 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6865 "reduction on variable-length vectors requires"
6866 " target support for a vector-shift-and-insert"
6867 " operation.\n");
6868 return false;
6869 }
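 /* Illustrative sketch only (the exact lane order is an assumption): for a
    variable-length PLUS reduction with start value S and neutral value 0,
    the initial vector would be built roughly as

	tmp  = { 0, 0, ..., 0 }		(splat of the neutral value)
	init = VEC_SHL_INSERT (tmp, S)	(giving { S, 0, ..., 0 })

    which is why the check just above requires target support for
    IFN_VEC_SHL_INSERT.  */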
6870
6871 /* Check extra constraints for variable-length unchained SLP reductions. */
6872 if (STMT_SLP_TYPE (stmt_info)
6873 && !REDUC_GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
6874 && !nunits_out.is_constant ())
6875 {
6876 /* We checked above that we could build the initial vector when
6877 there's a neutral element value. Check here for the case in
6878 which each SLP statement has its own initial value and in which
6879 that value needs to be repeated for every instance of the
6880 statement within the initial vector. */
6881 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6882 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
6883 if (!neutral_op
6884 && !can_duplicate_and_interleave_p (group_size, elt_mode))
6885 {
6886 if (dump_enabled_p ())
6887 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6888 "unsupported form of SLP reduction for"
6889 " variable-length vectors: cannot build"
6890 " initial vector.\n");
6891 return false;
6892 }
6893 /* The epilogue code relies on the number of elements being a multiple
6894 of the group size. The duplicate-and-interleave approach to setting
6895 up the initial vector does too. */
6896 if (!multiple_p (nunits_out, group_size))
6897 {
6898 if (dump_enabled_p ())
6899 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6900 "unsupported form of SLP reduction for"
6901 " variable-length vectors: the vector size"
6902 " is not a multiple of the number of results.\n");
6903 return false;
6904 }
6905 }
6906
6907 /* In case of widening multiplication by a constant, we update the type
6908 of the constant to be the type of the other operand. We check that the
6909 constant fits the type in the pattern recognition pass. */
6910 if (code == DOT_PROD_EXPR
6911 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6912 {
6913 if (TREE_CODE (ops[0]) == INTEGER_CST)
6914 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6915 else if (TREE_CODE (ops[1]) == INTEGER_CST)
6916 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6917 else
6918 {
6919 if (dump_enabled_p ())
6920 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6921 "invalid types in dot-prod\n");
6922
6923 return false;
6924 }
6925 }
6926
6927 if (reduction_type == COND_REDUCTION)
6928 {
6929 widest_int ni;
6930
6931 if (! max_loop_iterations (loop, &ni))
6932 {
6933 if (dump_enabled_p ())
6934 dump_printf_loc (MSG_NOTE, vect_location,
6935 "loop count not known, cannot create cond "
6936 "reduction.\n");
6937 return false;
6938 }
6939 /* Convert backedges to iterations. */
6940 ni += 1;
6941
6942 /* The additional index will be the same type as the condition. Check
6943 that the loop iteration count fits into this type less one (the
6944 zero slot is reserved for iterations with no matches). */
6945 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6946 if (wi::geu_p (ni, wi::to_widest (max_index)))
6947 {
6948 if (dump_enabled_p ())
6949 dump_printf_loc (MSG_NOTE, vect_location,
6950 "loop size is greater than data size.\n");
6951 return false;
6952 }
6953 }
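 /* Illustrative numbers only: with an 8-bit scalar type the induced index
    is an unsigned 8-bit value whose maximum is 255.  Because index zero is
    reserved for the "no match" case, the check above requires the
    iteration count NI to be at most 254.  */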
6954
6955 /* In case the vectorization factor (VF) is bigger than the number
6956 of elements that we can fit in a vectype (nunits), we have to generate
6957 more than one vector stmt - i.e - we need to "unroll" the
6958 vector stmt by a factor VF/nunits. For more details see documentation
6959 in vectorizable_operation. */
6960
6961 /* If the reduction is used in an outer loop we need to generate
6962 VF intermediate results, like so (e.g. for ncopies=2):
6963 r0 = phi (init, r0)
6964 r1 = phi (init, r1)
6965 r0 = x0 + r0;
6966 r1 = x1 + r1;
6967 (i.e. we generate VF results in 2 registers).
6968 In this case we have a separate def-use cycle for each copy, and therefore
6969 for each copy we get the vector def for the reduction variable from the
6970 respective phi node created for this copy.
6971
6972 Otherwise (the reduction is unused in the loop nest), we can combine
6973 together intermediate results, like so (e.g. for ncopies=2):
6974 r = phi (init, r)
6975 r = x0 + r;
6976 r = x1 + r;
6977 (i.e. we generate VF/2 results in a single register).
6978 In this case for each copy we get the vector def for the reduction variable
6979 from the vectorized reduction operation generated in the previous iteration.
6980
6981 This only works when we see both the reduction PHI and its only consumer
6982 in vectorizable_reduction and there are no intermediate stmts
6983 participating. */
6984 use_operand_p use_p;
6985 gimple *use_stmt;
6986 if (ncopies > 1
6987 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6988 && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
6989 && (use_stmt == stmt
6990 || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
6991 {
6992 single_defuse_cycle = true;
6993 epilog_copies = 1;
6994 }
6995 else
6996 epilog_copies = ncopies;
6997
6998 /* If the reduction stmt is one of the patterns that have lane
6999 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
7000 if ((ncopies > 1
7001 && ! single_defuse_cycle)
7002 && (code == DOT_PROD_EXPR
7003 || code == WIDEN_SUM_EXPR
7004 || code == SAD_EXPR))
7005 {
7006 if (dump_enabled_p ())
7007 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7008 "multi def-use cycle not possible for lane-reducing "
7009 "reduction operation\n");
7010 return false;
7011 }
7012
7013 if (slp_node)
7014 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7015 else
7016 vec_num = 1;
7017
7018 internal_fn cond_fn = get_conditional_internal_fn (code);
7019 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7020
7021 if (!vec_stmt) /* transformation not required. */
7022 {
7023 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies, cost_vec);
7024 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7025 {
7026 if (reduction_type != FOLD_LEFT_REDUCTION
7027 && (cond_fn == IFN_LAST
7028 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7029 OPTIMIZE_FOR_SPEED)))
7030 {
7031 if (dump_enabled_p ())
7032 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7033 "can't use a fully-masked loop because no"
7034 " conditional operation is available.\n");
7035 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7036 }
7037 else if (reduc_index == -1)
7038 {
7039 if (dump_enabled_p ())
7040 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7041 "can't use a fully-masked loop for chained"
7042 " reductions.\n");
7043 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7044 }
7045 else
7046 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7047 vectype_in);
7048 }
7049 if (dump_enabled_p ()
7050 && reduction_type == FOLD_LEFT_REDUCTION)
7051 dump_printf_loc (MSG_NOTE, vect_location,
7052 "using an in-order (fold-left) reduction.\n");
7053 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7054 return true;
7055 }
7056
7057 /* Transform. */
7058
7059 if (dump_enabled_p ())
7060 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7061
7062 /* FORNOW: Multiple types are not supported for condition. */
7063 if (code == COND_EXPR)
7064 gcc_assert (ncopies == 1);
7065
7066 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7067
7068 if (reduction_type == FOLD_LEFT_REDUCTION)
7069 return vectorize_fold_left_reduction
7070 (stmt, gsi, vec_stmt, slp_node, reduc_def_stmt, code,
7071 reduc_fn, ops, vectype_in, reduc_index, masks);
7072
7073 if (reduction_type == EXTRACT_LAST_REDUCTION)
7074 {
7075 gcc_assert (!slp_node);
7076 return vectorizable_condition (stmt, gsi, vec_stmt,
7077 NULL, reduc_index, NULL, NULL);
7078 }
7079
7080 /* Create the destination vector */
7081 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7082
7083 prev_stmt_info = NULL;
7084 prev_phi_info = NULL;
7085 if (!slp_node)
7086 {
7087 vec_oprnds0.create (1);
7088 vec_oprnds1.create (1);
7089 if (op_type == ternary_op)
7090 vec_oprnds2.create (1);
7091 }
7092
7093 phis.create (vec_num);
7094 vect_defs.create (vec_num);
7095 if (!slp_node)
7096 vect_defs.quick_push (NULL_TREE);
7097
7098 if (slp_node)
7099 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7100 else
7101 phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
7102
7103 for (j = 0; j < ncopies; j++)
7104 {
7105 if (code == COND_EXPR)
7106 {
7107 gcc_assert (!slp_node);
7108 vectorizable_condition (stmt, gsi, vec_stmt,
7109 PHI_RESULT (phis[0]),
7110 reduc_index, NULL, NULL);
7111 /* Multiple types are not supported for condition. */
7112 break;
7113 }
7114
7115 /* Handle uses. */
7116 if (j == 0)
7117 {
7118 if (slp_node)
7119 {
7120 /* Get vec defs for all the operands except the reduction index,
7121 ensuring the ordering of the ops in the vector is kept. */
7122 auto_vec<tree, 3> slp_ops;
7123 auto_vec<vec<tree>, 3> vec_defs;
7124
7125 slp_ops.quick_push (ops[0]);
7126 slp_ops.quick_push (ops[1]);
7127 if (op_type == ternary_op)
7128 slp_ops.quick_push (ops[2]);
7129
7130 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7131
7132 vec_oprnds0.safe_splice (vec_defs[0]);
7133 vec_defs[0].release ();
7134 vec_oprnds1.safe_splice (vec_defs[1]);
7135 vec_defs[1].release ();
7136 if (op_type == ternary_op)
7137 {
7138 vec_oprnds2.safe_splice (vec_defs[2]);
7139 vec_defs[2].release ();
7140 }
7141 }
7142 else
7143 {
7144 vec_oprnds0.quick_push
7145 (vect_get_vec_def_for_operand (ops[0], stmt));
7146 vec_oprnds1.quick_push
7147 (vect_get_vec_def_for_operand (ops[1], stmt));
7148 if (op_type == ternary_op)
7149 vec_oprnds2.quick_push
7150 (vect_get_vec_def_for_operand (ops[2], stmt));
7151 }
7152 }
7153 else
7154 {
7155 if (!slp_node)
7156 {
7157 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7158
7159 if (single_defuse_cycle && reduc_index == 0)
7160 vec_oprnds0[0] = gimple_get_lhs (new_stmt);
7161 else
7162 vec_oprnds0[0]
7163 = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
7164 if (single_defuse_cycle && reduc_index == 1)
7165 vec_oprnds1[0] = gimple_get_lhs (new_stmt);
7166 else
7167 vec_oprnds1[0]
7168 = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
7169 if (op_type == ternary_op)
7170 {
7171 if (single_defuse_cycle && reduc_index == 2)
7172 vec_oprnds2[0] = gimple_get_lhs (new_stmt);
7173 else
7174 vec_oprnds2[0]
7175 = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
7176 }
7177 }
7178 }
7179
7180 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7181 {
7182 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7183 if (masked_loop_p)
7184 {
7185 /* Make sure that the reduction accumulator is vop[0]. */
7186 if (reduc_index == 1)
7187 {
7188 gcc_assert (commutative_tree_code (code));
7189 std::swap (vop[0], vop[1]);
7190 }
7191 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7192 vectype_in, i * ncopies + j);
7193 gcall *call = gimple_build_call_internal (cond_fn, 4, mask,
7194 vop[0], vop[1],
7195 vop[0]);
7196 new_temp = make_ssa_name (vec_dest, call);
7197 gimple_call_set_lhs (call, new_temp);
7198 gimple_call_set_nothrow (call, true);
7199 new_stmt = call;
7200 }
7201 else
7202 {
7203 if (op_type == ternary_op)
7204 vop[2] = vec_oprnds2[i];
7205
7206 new_stmt = gimple_build_assign (vec_dest, code,
7207 vop[0], vop[1], vop[2]);
7208 new_temp = make_ssa_name (vec_dest, new_stmt);
7209 gimple_assign_set_lhs (new_stmt, new_temp);
7210 }
7211 vect_finish_stmt_generation (stmt, new_stmt, gsi);
7212
7213 if (slp_node)
7214 {
7215 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7216 vect_defs.quick_push (new_temp);
7217 }
7218 else
7219 vect_defs[0] = new_temp;
7220 }
7221
7222 if (slp_node)
7223 continue;
7224
7225 if (j == 0)
7226 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
7227 else
7228 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
7229
7230 prev_stmt_info = vinfo_for_stmt (new_stmt);
7231 }
7232
7233 /* Finalize the reduction-phi (set its arguments) and create the
7234 epilog reduction code. */
7235 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7236 vect_defs[0] = gimple_get_lhs (*vec_stmt);
7237
7238 vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
7239 epilog_copies, reduc_fn, phis,
7240 double_reduc, slp_node, slp_node_instance,
7241 cond_reduc_val, cond_reduc_op_code,
7242 neutral_op);
7243
7244 return true;
7245 }
7246
7247 /* Function vect_min_worthwhile_factor.
7248
7249 For a loop where we could vectorize the operation indicated by CODE,
7250 return the minimum vectorization factor that makes it worthwhile
7251 to use generic vectors. */
7252 static unsigned int
7253 vect_min_worthwhile_factor (enum tree_code code)
7254 {
7255 switch (code)
7256 {
7257 case PLUS_EXPR:
7258 case MINUS_EXPR:
7259 case NEGATE_EXPR:
7260 return 4;
7261
7262 case BIT_AND_EXPR:
7263 case BIT_IOR_EXPR:
7264 case BIT_XOR_EXPR:
7265 case BIT_NOT_EXPR:
7266 return 2;
7267
7268 default:
7269 return INT_MAX;
7270 }
7271 }
7272
7273 /* Return true if VINFO indicates we are doing loop vectorization and if
7274 it is worth decomposing CODE operations into scalar operations for
7275 that loop's vectorization factor. */
7276
7277 bool
7278 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7279 {
7280 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7281 unsigned HOST_WIDE_INT value;
7282 return (loop_vinfo
7283 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7284 && value >= vect_min_worthwhile_factor (code));
7285 }
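/* Illustrative reading of the two functions above: with a constant
   vectorization factor of 4, PLUS_EXPR (minimum worthwhile factor 4) is
   considered worthwhile without SIMD support, whereas with a factor of 2
   only the bitwise codes (minimum worthwhile factor 2) qualify.  */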
7286
7287 /* Function vectorizable_induction
7288
7289 Check if PHI performs an induction computation that can be vectorized.
7290 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7291 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7292 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
7293
7294 bool
7295 vectorizable_induction (gimple *phi,
7296 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7297 gimple **vec_stmt, slp_tree slp_node,
7298 stmt_vector_for_cost *cost_vec)
7299 {
7300 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
7301 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7302 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7303 unsigned ncopies;
7304 bool nested_in_vect_loop = false;
7305 struct loop *iv_loop;
7306 tree vec_def;
7307 edge pe = loop_preheader_edge (loop);
7308 basic_block new_bb;
7309 tree new_vec, vec_init, vec_step, t;
7310 tree new_name;
7311 gimple *new_stmt;
7312 gphi *induction_phi;
7313 tree induc_def, vec_dest;
7314 tree init_expr, step_expr;
7315 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7316 unsigned i;
7317 tree expr;
7318 gimple_seq stmts;
7319 imm_use_iterator imm_iter;
7320 use_operand_p use_p;
7321 gimple *exit_phi;
7322 edge latch_e;
7323 tree loop_arg;
7324 gimple_stmt_iterator si;
7325 basic_block bb = gimple_bb (phi);
7326
7327 if (gimple_code (phi) != GIMPLE_PHI)
7328 return false;
7329
7330 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7331 return false;
7332
7333 /* Make sure it was recognized as induction computation. */
7334 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7335 return false;
7336
7337 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7338 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7339
7340 if (slp_node)
7341 ncopies = 1;
7342 else
7343 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7344 gcc_assert (ncopies >= 1);
7345
7346 /* FORNOW. These restrictions should be relaxed. */
7347 if (nested_in_vect_loop_p (loop, phi))
7348 {
7349 imm_use_iterator imm_iter;
7350 use_operand_p use_p;
7351 gimple *exit_phi;
7352 edge latch_e;
7353 tree loop_arg;
7354
7355 if (ncopies > 1)
7356 {
7357 if (dump_enabled_p ())
7358 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7359 "multiple types in nested loop.\n");
7360 return false;
7361 }
7362
7363 /* FORNOW: outer loop induction with SLP not supported. */
7364 if (STMT_SLP_TYPE (stmt_info))
7365 return false;
7366
7367 exit_phi = NULL;
7368 latch_e = loop_latch_edge (loop->inner);
7369 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7370 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7371 {
7372 gimple *use_stmt = USE_STMT (use_p);
7373 if (is_gimple_debug (use_stmt))
7374 continue;
7375
7376 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7377 {
7378 exit_phi = use_stmt;
7379 break;
7380 }
7381 }
7382 if (exit_phi)
7383 {
7384 stmt_vec_info exit_phi_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7385 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7386 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7387 {
7388 if (dump_enabled_p ())
7389 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7390 "inner-loop induction only used outside "
7391 "of the outer vectorized loop.\n");
7392 return false;
7393 }
7394 }
7395
7396 nested_in_vect_loop = true;
7397 iv_loop = loop->inner;
7398 }
7399 else
7400 iv_loop = loop;
7401 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7402
7403 if (slp_node && !nunits.is_constant ())
7404 {
7405 /* The current SLP code creates the initial value element-by-element. */
7406 if (dump_enabled_p ())
7407 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7408 "SLP induction not supported for variable-length"
7409 " vectors.\n");
7410 return false;
7411 }
7412
7413 if (!vec_stmt) /* transformation not required. */
7414 {
7415 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7416 DUMP_VECT_SCOPE ("vectorizable_induction");
7417 vect_model_induction_cost (stmt_info, ncopies, cost_vec);
7418 return true;
7419 }
7420
7421 /* Transform. */
7422
7423 /* Compute a vector variable, initialized with the first VF values of
7424 the induction variable. E.g., for an iv with IV_PHI='X' and
7425 evolution S, for a vector of 4 units, we want to compute:
7426 [X, X + S, X + 2*S, X + 3*S]. */
7427
7428 if (dump_enabled_p ())
7429 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7430
7431 latch_e = loop_latch_edge (iv_loop);
7432 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7433
7434 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7435 gcc_assert (step_expr != NULL_TREE);
7436
7437 pe = loop_preheader_edge (iv_loop);
7438 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7439 loop_preheader_edge (iv_loop));
7440
7441 stmts = NULL;
7442 if (!nested_in_vect_loop)
7443 {
7444 /* Convert the initial value to the desired type. */
7445 tree new_type = TREE_TYPE (vectype);
7446 init_expr = gimple_convert (&stmts, new_type, init_expr);
7447
7448 /* If we are using the loop mask to "peel" for alignment then we need
7449 to adjust the start value here. */
7450 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7451 if (skip_niters != NULL_TREE)
7452 {
7453 if (FLOAT_TYPE_P (vectype))
7454 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7455 skip_niters);
7456 else
7457 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7458 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7459 skip_niters, step_expr);
7460 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7461 init_expr, skip_step);
7462 }
7463 }
7464
7465 /* Convert the step to the desired type. */
7466 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7467
7468 if (stmts)
7469 {
7470 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7471 gcc_assert (!new_bb);
7472 }
7473
7474 /* Find the first insertion point in the BB. */
7475 si = gsi_after_labels (bb);
7476
7477 /* For SLP induction we have to generate several IVs as for example
7478 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7479 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7480 [VF*S, VF*S, VF*S, VF*S] for all. */
7481 if (slp_node)
7482 {
7483 /* Enforced above. */
7484 unsigned int const_nunits = nunits.to_constant ();
7485
7486 /* Generate [VF*S, VF*S, ... ]. */
7487 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7488 {
7489 expr = build_int_cst (integer_type_node, vf);
7490 expr = fold_convert (TREE_TYPE (step_expr), expr);
7491 }
7492 else
7493 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7494 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7495 expr, step_expr);
7496 if (! CONSTANT_CLASS_P (new_name))
7497 new_name = vect_init_vector (phi, new_name,
7498 TREE_TYPE (step_expr), NULL);
7499 new_vec = build_vector_from_val (vectype, new_name);
7500 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7501
7502 /* Now generate the IVs. */
7503 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7504 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7505 unsigned elts = const_nunits * nvects;
7506 unsigned nivs = least_common_multiple (group_size,
7507 const_nunits) / const_nunits;
7508 gcc_assert (elts % group_size == 0);
7509 tree elt = init_expr;
7510 unsigned ivn;
7511 for (ivn = 0; ivn < nivs; ++ivn)
7512 {
7513 tree_vector_builder elts (vectype, const_nunits, 1);
7514 stmts = NULL;
7515 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7516 {
7517 if (ivn*const_nunits + eltn >= group_size
7518 && (ivn * const_nunits + eltn) % group_size == 0)
7519 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7520 elt, step_expr);
7521 elts.quick_push (elt);
7522 }
7523 vec_init = gimple_build_vector (&stmts, &elts);
7524 if (stmts)
7525 {
7526 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7527 gcc_assert (!new_bb);
7528 }
7529
7530 /* Create the induction-phi that defines the induction-operand. */
7531 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7532 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7533 loop_vinfo->add_stmt (induction_phi);
7534 induc_def = PHI_RESULT (induction_phi);
7535
7536 /* Create the iv update inside the loop */
7537 vec_def = make_ssa_name (vec_dest);
7538 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7539 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7540 loop_vinfo->add_stmt (new_stmt);
7541
7542 /* Set the arguments of the phi node: */
7543 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7544 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7545 UNKNOWN_LOCATION);
7546
7547 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
7548 }
7549
7550 /* Re-use IVs when we can. */
7551 if (ivn < nvects)
7552 {
7553 unsigned vfp
7554 = least_common_multiple (group_size, const_nunits) / group_size;
7555 /* Generate [VF'*S, VF'*S, ... ]. */
7556 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7557 {
7558 expr = build_int_cst (integer_type_node, vfp);
7559 expr = fold_convert (TREE_TYPE (step_expr), expr);
7560 }
7561 else
7562 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7563 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7564 expr, step_expr);
7565 if (! CONSTANT_CLASS_P (new_name))
7566 new_name = vect_init_vector (phi, new_name,
7567 TREE_TYPE (step_expr), NULL);
7568 new_vec = build_vector_from_val (vectype, new_name);
7569 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7570 for (; ivn < nvects; ++ivn)
7571 {
7572 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
7573 tree def;
7574 if (gimple_code (iv) == GIMPLE_PHI)
7575 def = gimple_phi_result (iv);
7576 else
7577 def = gimple_assign_lhs (iv);
7578 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7579 PLUS_EXPR,
7580 def, vec_step);
7581 if (gimple_code (iv) == GIMPLE_PHI)
7582 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7583 else
7584 {
7585 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7586 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7587 }
7588 loop_vinfo->add_stmt (new_stmt);
7589 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7590 }
7591 }
7592
7593 return true;
7594 }
7595
7596 /* Create the vector that holds the initial_value of the induction. */
7597 if (nested_in_vect_loop)
7598 {
7599 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7600 been created during vectorization of previous stmts. We obtain it
7601 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7602 vec_init = vect_get_vec_def_for_operand (init_expr, phi);
7603 /* If the initial value is not of proper type, convert it. */
7604 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7605 {
7606 new_stmt
7607 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7608 vect_simple_var,
7609 "vec_iv_"),
7610 VIEW_CONVERT_EXPR,
7611 build1 (VIEW_CONVERT_EXPR, vectype,
7612 vec_init));
7613 vec_init = gimple_assign_lhs (new_stmt);
7614 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7615 new_stmt);
7616 gcc_assert (!new_bb);
7617 loop_vinfo->add_stmt (new_stmt);
7618 }
7619 }
7620 else
7621 {
7622 /* iv_loop is the loop to be vectorized. Create:
7623 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7624 stmts = NULL;
7625 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7626
7627 unsigned HOST_WIDE_INT const_nunits;
7628 if (nunits.is_constant (&const_nunits))
7629 {
7630 tree_vector_builder elts (vectype, const_nunits, 1);
7631 elts.quick_push (new_name);
7632 for (i = 1; i < const_nunits; i++)
7633 {
7634 /* Create: new_name_i = new_name + step_expr */
7635 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7636 new_name, step_expr);
7637 elts.quick_push (new_name);
7638 }
7639 /* Create a vector from [new_name_0, new_name_1, ...,
7640 new_name_nunits-1] */
7641 vec_init = gimple_build_vector (&stmts, &elts);
7642 }
7643 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7644 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7645 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7646 new_name, step_expr);
7647 else
7648 {
7649 /* Build:
7650 [base, base, base, ...]
7651 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7652 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7653 gcc_assert (flag_associative_math);
7654 tree index = build_index_vector (vectype, 0, 1);
7655 tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7656 new_name);
7657 tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7658 step_expr);
7659 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7660 vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7661 vec_init, step_vec);
7662 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7663 vec_init, base_vec);
7664 }
7665
7666 if (stmts)
7667 {
7668 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7669 gcc_assert (!new_bb);
7670 }
7671 }
7672
7673
7674 /* Create the vector that holds the step of the induction. */
7675 if (nested_in_vect_loop)
7676 /* iv_loop is nested in the loop to be vectorized. Generate:
7677 vec_step = [S, S, S, S] */
7678 new_name = step_expr;
7679 else
7680 {
7681 /* iv_loop is the loop to be vectorized. Generate:
7682 vec_step = [VF*S, VF*S, VF*S, VF*S] */
7683 gimple_seq seq = NULL;
7684 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7685 {
7686 expr = build_int_cst (integer_type_node, vf);
7687 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7688 }
7689 else
7690 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7691 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7692 expr, step_expr);
7693 if (seq)
7694 {
7695 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7696 gcc_assert (!new_bb);
7697 }
7698 }
7699
7700 t = unshare_expr (new_name);
7701 gcc_assert (CONSTANT_CLASS_P (new_name)
7702 || TREE_CODE (new_name) == SSA_NAME);
7703 new_vec = build_vector_from_val (vectype, t);
7704 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7705
7706
7707 /* Create the following def-use cycle:
7708 loop prolog:
7709 vec_init = ...
7710 vec_step = ...
7711 loop:
7712 vec_iv = PHI <vec_init, vec_loop>
7713 ...
7714 STMT
7715 ...
7716 vec_loop = vec_iv + vec_step; */
7717
7718 /* Create the induction-phi that defines the induction-operand. */
7719 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7720 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7721 stmt_vec_info induction_phi_info = loop_vinfo->add_stmt (induction_phi);
7722 induc_def = PHI_RESULT (induction_phi);
7723
7724 /* Create the iv update inside the loop */
7725 vec_def = make_ssa_name (vec_dest);
7726 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7727 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7728 stmt_vec_info new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7729
7730 /* Set the arguments of the phi node: */
7731 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7732 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7733 UNKNOWN_LOCATION);
7734
7735 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
7736
7737 /* In case the vectorization factor (VF) is bigger than the number
7738 of elements that we can fit in a vectype (nunits), we have to generate
7739 more than one vector stmt - i.e - we need to "unroll" the
7740 vector stmt by a factor VF/nunits. For more details see documentation
7741 in vectorizable_operation. */
7742
7743 if (ncopies > 1)
7744 {
7745 gimple_seq seq = NULL;
7746 stmt_vec_info prev_stmt_vinfo;
7747 /* FORNOW. This restriction should be relaxed. */
7748 gcc_assert (!nested_in_vect_loop);
7749
7750 /* Create the vector that holds the step of the induction. */
7751 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7752 {
7753 expr = build_int_cst (integer_type_node, nunits);
7754 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7755 }
7756 else
7757 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7758 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7759 expr, step_expr);
7760 if (seq)
7761 {
7762 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7763 gcc_assert (!new_bb);
7764 }
7765
7766 t = unshare_expr (new_name);
7767 gcc_assert (CONSTANT_CLASS_P (new_name)
7768 || TREE_CODE (new_name) == SSA_NAME);
7769 new_vec = build_vector_from_val (vectype, t);
7770 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7771
7772 vec_def = induc_def;
7773 prev_stmt_vinfo = induction_phi_info;
7774 for (i = 1; i < ncopies; i++)
7775 {
7776 /* vec_i = vec_prev + vec_step */
7777 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7778 vec_def, vec_step);
7779 vec_def = make_ssa_name (vec_dest, new_stmt);
7780 gimple_assign_set_lhs (new_stmt, vec_def);
7781
7782 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7783 new_stmt_info = loop_vinfo->add_stmt (new_stmt);
7784 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
7785 prev_stmt_vinfo = new_stmt_info;
7786 }
7787 }
7788
7789 if (nested_in_vect_loop)
7790 {
7791 /* Find the loop-closed exit-phi of the induction, and record
7792 the final vector of induction results: */
7793 exit_phi = NULL;
7794 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7795 {
7796 gimple *use_stmt = USE_STMT (use_p);
7797 if (is_gimple_debug (use_stmt))
7798 continue;
7799
7800 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7801 {
7802 exit_phi = use_stmt;
7803 break;
7804 }
7805 }
7806 if (exit_phi)
7807 {
7808 stmt_vec_info stmt_vinfo = loop_vinfo->lookup_stmt (exit_phi);
7809 /* FORNOW. Currently not supporting the case that an inner-loop induction
7810 is not used in the outer-loop (i.e. only outside the outer-loop). */
7811 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7812 && !STMT_VINFO_LIVE_P (stmt_vinfo));
7813
7814 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
7815 if (dump_enabled_p ())
7816 {
7817 dump_printf_loc (MSG_NOTE, vect_location,
7818 "vector of inductions after inner-loop:");
7819 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
7820 }
7821 }
7822 }
7823
7824
7825 if (dump_enabled_p ())
7826 {
7827 dump_printf_loc (MSG_NOTE, vect_location,
7828 "transform induction: created def-use cycle: ");
7829 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
7830 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7831 SSA_NAME_DEF_STMT (vec_def), 0);
7832 }
7833
7834 return true;
7835 }
7836
7837 /* Function vectorizable_live_operation.
7838
7839 STMT computes a value that is used outside the loop. Check if
7840 it can be supported. */
7841
7842 bool
7843 vectorizable_live_operation (gimple *stmt,
7844 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7845 slp_tree slp_node, int slp_index,
7846 gimple **vec_stmt,
7847 stmt_vector_for_cost *)
7848 {
7849 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
7850 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7851 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7852 imm_use_iterator imm_iter;
7853 tree lhs, lhs_type, bitsize, vec_bitsize;
7854 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7855 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7856 int ncopies;
7857 gimple *use_stmt;
7858 auto_vec<tree> vec_oprnds;
7859 int vec_entry = 0;
7860 poly_uint64 vec_index = 0;
7861
7862 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7863
7864 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7865 return false;
7866
7867 /* FORNOW. CHECKME. */
7868 if (nested_in_vect_loop_p (loop, stmt))
7869 return false;
7870
7871 /* If STMT is not relevant and it is a simple assignment and its inputs are
7872 invariant then it can remain in place, unvectorized. The original last
7873 scalar value that it computes will be used. */
7874 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7875 {
7876 gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
7877 if (dump_enabled_p ())
7878 dump_printf_loc (MSG_NOTE, vect_location,
7879 "statement is simple and uses invariant. Leaving in "
7880 "place.\n");
7881 return true;
7882 }
7883
7884 if (slp_node)
7885 ncopies = 1;
7886 else
7887 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7888
7889 if (slp_node)
7890 {
7891 gcc_assert (slp_index >= 0);
7892
7893 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7894 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7895
7896 /* Get the last occurrence of the scalar index from the concatenation of
7897 all the slp vectors. Calculate which slp vector it is and the index
7898 within. */
7899 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
7900
7901 /* Calculate which vector contains the result, and which lane of
7902 that vector we need. */
7903 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
7904 {
7905 if (dump_enabled_p ())
7906 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7907 "Cannot determine which vector holds the"
7908 " final result.\n");
7909 return false;
7910 }
7911 }
7912
7913 if (!vec_stmt)
7914 {
7915 /* No transformation required. */
7916 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7917 {
7918 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
7919 OPTIMIZE_FOR_SPEED))
7920 {
7921 if (dump_enabled_p ())
7922 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7923 "can't use a fully-masked loop because "
7924 "the target doesn't support extract last "
7925 "reduction.\n");
7926 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7927 }
7928 else if (slp_node)
7929 {
7930 if (dump_enabled_p ())
7931 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7932 "can't use a fully-masked loop because an "
7933 "SLP statement is live after the loop.\n");
7934 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7935 }
7936 else if (ncopies > 1)
7937 {
7938 if (dump_enabled_p ())
7939 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7940 "can't use a fully-masked loop because"
7941 " ncopies is greater than 1.\n");
7942 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7943 }
7944 else
7945 {
7946 gcc_assert (ncopies == 1 && !slp_node);
7947 vect_record_loop_mask (loop_vinfo,
7948 &LOOP_VINFO_MASKS (loop_vinfo),
7949 1, vectype);
7950 }
7951 }
7952 return true;
7953 }
7954
7955 /* If stmt has a related stmt, then use that for getting the lhs. */
7956 if (is_pattern_stmt_p (stmt_info))
7957 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
7958
7959 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7960 : gimple_get_lhs (stmt);
7961 lhs_type = TREE_TYPE (lhs);
7962
7963 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7964 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7965 : TYPE_SIZE (TREE_TYPE (vectype)));
7966 vec_bitsize = TYPE_SIZE (vectype);
7967
7968 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
7969 tree vec_lhs, bitstart;
7970 if (slp_node)
7971 {
7972 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7973
7974 /* Get the correct slp vectorized stmt. */
7975 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
7976 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
7977 vec_lhs = gimple_phi_result (phi);
7978 else
7979 vec_lhs = gimple_get_lhs (vec_stmt);
7980
7981 /* Get entry to use. */
7982 bitstart = bitsize_int (vec_index);
7983 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7984 }
7985 else
7986 {
7987 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7988 vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
7989 gcc_checking_assert (ncopies == 1
7990 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
7991
7992 /* For multiple copies, get the last copy. */
7993 for (int i = 1; i < ncopies; ++i)
7994 vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
7995 vec_lhs);
7996
7997 /* Get the last lane in the vector. */
7998 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7999 }
8000
8001 gimple_seq stmts = NULL;
8002 tree new_tree;
8003 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8004 {
8005 /* Emit:
8006
8007 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8008
8009 where VEC_LHS is the vectorized live-out result and MASK is
8010 the loop mask for the final iteration. */
8011 gcc_assert (ncopies == 1 && !slp_node);
8012 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8013 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8014 1, vectype, 0);
8015 tree scalar_res = gimple_build (&stmts, CFN_EXTRACT_LAST,
8016 scalar_type, mask, vec_lhs);
8017
8018 /* Convert the extracted vector element to the required scalar type. */
8019 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8020 }
8021 else
8022 {
8023 tree bftype = TREE_TYPE (vectype);
8024 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8025 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8026 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
8027 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8028 &stmts, true, NULL_TREE);
8029 }
8030
8031 if (stmts)
8032 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
8033
8034 /* Replace the use of lhs with the newly computed result.  If the use
8035 stmt is a single-arg PHI, just replace all uses of the PHI result; this
8036 is necessary because the lcssa PHI defining lhs may appear before the newly inserted stmt. */
8037 use_operand_p use_p;
8038 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8039 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8040 && !is_gimple_debug (use_stmt))
8041 {
8042 if (gimple_code (use_stmt) == GIMPLE_PHI
8043 && gimple_phi_num_args (use_stmt) == 1)
8044 {
8045 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8046 }
8047 else
8048 {
8049 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8050 SET_USE (use_p, new_tree);
8051 }
8052 update_stmt (use_stmt);
8053 }
8054
8055 return true;
8056 }
8057
8058 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
8059
8060 static void
8061 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
8062 {
8063 ssa_op_iter op_iter;
8064 imm_use_iterator imm_iter;
8065 def_operand_p def_p;
8066 gimple *ustmt;
8067
8068 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
8069 {
8070 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8071 {
8072 basic_block bb;
8073
8074 if (!is_gimple_debug (ustmt))
8075 continue;
8076
8077 bb = gimple_bb (ustmt);
8078
8079 if (!flow_bb_inside_loop_p (loop, bb))
8080 {
8081 if (gimple_debug_bind_p (ustmt))
8082 {
8083 if (dump_enabled_p ())
8084 dump_printf_loc (MSG_NOTE, vect_location,
8085 "killing debug use\n");
8086
8087 gimple_debug_bind_reset_value (ustmt);
8088 update_stmt (ustmt);
8089 }
8090 else
8091 gcc_unreachable ();
8092 }
8093 }
8094 }
8095 }
8096
8097 /* Given loop represented by LOOP_VINFO, return true if computation of
8098 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8099 otherwise. */
8100
8101 static bool
8102 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8103 {
8104 /* Constant case. */
8105 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8106 {
8107 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8108 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8109
8110 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8111 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8112 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8113 return true;
8114 }
8115
8116 widest_int max;
8117 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8118 /* Check the upper bound of loop niters. */
8119 if (get_max_loop_iterations (loop, &max))
8120 {
8121 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8122 signop sgn = TYPE_SIGN (type);
8123 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8124 if (max < type_max)
8125 return true;
8126 }
8127 return false;
8128 }
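/* For instance: if NITERS has a 32-bit unsigned type and the loop latch may
   execute up to 2^32 - 1 times, then NITERSM1 + 1 can wrap around to zero,
   so the function above conservatively returns false.  */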
8129
8130 /* Return a mask type with half the number of elements as TYPE. */
8131
8132 tree
8133 vect_halve_mask_nunits (tree type)
8134 {
8135 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8136 return build_truth_vector_type (nunits, current_vector_size);
8137 }
8138
8139 /* Return a mask type with twice as many elements as TYPE. */
8140
8141 tree
8142 vect_double_mask_nunits (tree type)
8143 {
8144 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8145 return build_truth_vector_type (nunits, current_vector_size);
8146 }
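/* Example use of the two helpers above (illustrative element counts): for a
   mask type with 16 elements, vect_halve_mask_nunits returns a type with 8
   elements and vect_double_mask_nunits one with 32 elements, in both cases
   keeping the overall vector size (current_vector_size) unchanged.  */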
8147
8148 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8149 contain a sequence of NVECTORS masks that each control a vector of type
8150 VECTYPE. */
8151
8152 void
8153 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8154 unsigned int nvectors, tree vectype)
8155 {
8156 gcc_assert (nvectors != 0);
8157 if (masks->length () < nvectors)
8158 masks->safe_grow_cleared (nvectors);
8159 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8160 /* The number of scalars per iteration and the number of vectors are
8161 both compile-time constants. */
8162 unsigned int nscalars_per_iter
8163 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8164 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8165 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8166 {
8167 rgm->max_nscalars_per_iter = nscalars_per_iter;
8168 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8169 }
8170 }
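/* Usage sketch with illustrative numbers: recording NVECTORS = 2 masks for
   a vector type with 8 elements in a loop whose vectorization factor is 8
   gives nscalars_per_iter = 2 * 8 / 8 = 2, which may raise the rgroup's
   max_nscalars_per_iter and update its mask type to match VECTYPE.  */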
8171
8172 /* Given a complete set of masks MASKS, extract mask number INDEX
8173 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8174 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8175
8176 See the comment above vec_loop_masks for more details about the mask
8177 arrangement. */
8178
8179 tree
8180 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8181 unsigned int nvectors, tree vectype, unsigned int index)
8182 {
8183 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8184 tree mask_type = rgm->mask_type;
8185
8186 /* Populate the rgroup's mask array, if this is the first time we've
8187 used it. */
8188 if (rgm->masks.is_empty ())
8189 {
8190 rgm->masks.safe_grow_cleared (nvectors);
8191 for (unsigned int i = 0; i < nvectors; ++i)
8192 {
8193 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8194 /* Provide a dummy definition until the real one is available. */
8195 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8196 rgm->masks[i] = mask;
8197 }
8198 }
8199
8200 tree mask = rgm->masks[index];
8201 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8202 TYPE_VECTOR_SUBPARTS (vectype)))
8203 {
8204 /* A loop mask for data type X can be reused for data type Y
8205 if X has N times more elements than Y and if Y's elements
8206 are N times bigger than X's. In this case each sequence
8207 of N elements in the loop mask will be all-zero or all-one.
8208 We can then view-convert the mask so that each sequence of
8209 N elements is replaced by a single element. */
8210 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8211 TYPE_VECTOR_SUBPARTS (vectype)));
8212 gimple_seq seq = NULL;
8213 mask_type = build_same_sized_truth_vector_type (vectype);
8214 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8215 if (seq)
8216 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8217 }
8218 return mask;
8219 }
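/* Illustrative instance of the reuse rule described above: a mask computed
   for eight 16-bit data elements can be reused for four 32-bit data
   elements (N = 2).  Each adjacent pair of elements in the 8-element mask
   is all-zero or all-one, so the VIEW_CONVERT_EXPR collapses every pair
   into a single mask element that controls one 32-bit data element.  */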
8220
8221 /* Scale profiling counters by estimation for LOOP which is vectorized
8222 by factor VF. */
8223
8224 static void
8225 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8226 {
8227 edge preheader = loop_preheader_edge (loop);
8228 /* Reduce loop iterations by the vectorization factor. */
8229 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8230 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8231
8232 if (freq_h.nonzero_p ())
8233 {
8234 profile_probability p;
8235
8236 /* Avoid dropping loop body profile counter to 0 because of zero count
8237 in loop's preheader. */
8238 if (!(freq_e == profile_count::zero ()))
8239 freq_e = freq_e.force_nonzero ();
8240 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8241 scale_loop_frequencies (loop, p);
8242 }
8243
8244 edge exit_e = single_exit (loop);
8245 exit_e->probability = profile_probability::always ()
8246 .apply_scale (1, new_est_niter + 1);
8247
8248 edge exit_l = single_pred_edge (loop->latch);
8249 profile_probability prob = exit_l->probability;
8250 exit_l->probability = exit_e->probability.invert ();
8251 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8252 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8253 }
8254
8255 /* Vectorize STMT if relevant, inserting any new instructions before GSI.
8256 When vectorizing STMT as a store, set *SEEN_STORE to its stmt_vec_info.
8257 *SLP_SCHEDULED is a running record of whether we have called
8258 vect_schedule_slp. */
8259
8260 static void
8261 vect_transform_loop_stmt (loop_vec_info loop_vinfo, gimple *stmt,
8262 gimple_stmt_iterator *gsi,
8263 stmt_vec_info *seen_store, bool *slp_scheduled)
8264 {
8265 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8266 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8267 stmt_vec_info stmt_info = loop_vinfo->lookup_stmt (stmt);
8268 if (!stmt_info)
8269 return;
8270
8271 if (dump_enabled_p ())
8272 {
8273 dump_printf_loc (MSG_NOTE, vect_location,
8274 "------>vectorizing statement: ");
8275 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
8276 }
8277
8278 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8279 vect_loop_kill_debug_uses (loop, stmt);
8280
8281 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8282 && !STMT_VINFO_LIVE_P (stmt_info))
8283 return;
8284
8285 if (STMT_VINFO_VECTYPE (stmt_info))
8286 {
8287 poly_uint64 nunits
8288 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8289 if (!STMT_SLP_TYPE (stmt_info)
8290 && maybe_ne (nunits, vf)
8291 && dump_enabled_p ())
8292 /* For SLP, VF is set according to the unrolling factor and not
8293 to the vector size, hence this message is not valid for SLP. */
8294 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8295 }
8296
8297 /* SLP. Schedule all the SLP instances when the first SLP stmt is
8298 reached. */
8299 if (slp_vect_type slptype = STMT_SLP_TYPE (stmt_info))
8300 {
8301
8302 if (!*slp_scheduled)
8303 {
8304 *slp_scheduled = true;
8305
8306 DUMP_VECT_SCOPE ("scheduling SLP instances");
8307
8308 vect_schedule_slp (loop_vinfo);
8309 }
8310
8311 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
8312 if (slptype == pure_slp)
8313 return;
8314 }
8315
8316 if (dump_enabled_p ())
8317 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8318
8319 bool grouped_store = false;
8320 if (vect_transform_stmt (stmt, gsi, &grouped_store, NULL, NULL))
8321 *seen_store = stmt_info;
8322 }
8323
8324 /* Function vect_transform_loop.
8325
8326 The analysis phase has determined that the loop is vectorizable.
8327 Vectorize the loop - create vectorized stmts to replace the scalar
8328 stmts in the loop, and update the loop exit condition.
8329 Returns scalar epilogue loop if any. */
8330
8331 struct loop *
8332 vect_transform_loop (loop_vec_info loop_vinfo)
8333 {
8334 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8335 struct loop *epilogue = NULL;
8336 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8337 int nbbs = loop->num_nodes;
8338 int i;
8339 tree niters_vector = NULL_TREE;
8340 tree step_vector = NULL_TREE;
8341 tree niters_vector_mult_vf = NULL_TREE;
8342 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8343 unsigned int lowest_vf = constant_lower_bound (vf);
8344 bool slp_scheduled = false;
8345 gimple *stmt;
8346 bool check_profitability = false;
8347 unsigned int th;
8348
8349 DUMP_VECT_SCOPE ("vec_transform_loop");
8350
8351 loop_vinfo->shared->check_datarefs ();
8352
8353 /* Use the more conservative vectorization threshold. If the number
8354 of iterations is constant assume the cost check has been performed
8355 by our caller. If the threshold makes all loops profitable that
8356 run at least the (estimated) vectorization factor number of times
8357 checking is pointless, too. */
8358 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8359 if (th >= vect_vf_for_cost (loop_vinfo)
8360 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8361 {
8362 if (dump_enabled_p ())
8363 dump_printf_loc (MSG_NOTE, vect_location,
8364 "Profitability threshold is %d loop iterations.\n",
8365 th);
8366 check_profitability = true;
8367 }
8368
8369 /* Make sure there exists a single-predecessor exit bb. Do this before
8370 versioning. */
8371 edge e = single_exit (loop);
8372 if (! single_pred_p (e->dest))
8373 {
8374 split_loop_exit_edge (e);
8375 if (dump_enabled_p ())
8376 dump_printf (MSG_NOTE, "split exit edge\n");
8377 }
8378
8379 /* Version the loop first, if required, so the profitability check
8380 comes first. */
8381
8382 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8383 {
8384 poly_uint64 versioning_threshold
8385 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8386 if (check_profitability
8387 && ordered_p (poly_uint64 (th), versioning_threshold))
8388 {
8389 versioning_threshold = ordered_max (poly_uint64 (th),
8390 versioning_threshold);
8391 check_profitability = false;
8392 }
8393 vect_loop_versioning (loop_vinfo, th, check_profitability,
8394 versioning_threshold);
8395 check_profitability = false;
8396 }
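/* Illustrative sketch: if the loop is versioned (say, for aliasing), the
   runtime profitability check can be folded into the version check.
   With TH == 12 and a versioning threshold of 8 the guard becomes,
   conceptually,

       if (niters >= 12 && !alias)
	 ... vectorized loop ...
       else
	 ... scalar loop ...

   which is why check_profitability is cleared afterwards.  */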
8397
8398 /* Make sure there exists a single-predecessor exit bb also on the
8399 scalar loop copy.  Do this after versioning but before peeling,
8400 so the CFG structure is fine for both the scalar and the if-converted
8401 loop and slpeel_duplicate_current_defs_from_edges sees matched
8402 loop-closed PHI nodes on the exit.  */
8403 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8404 {
8405 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8406 if (! single_pred_p (e->dest))
8407 {
8408 split_loop_exit_edge (e);
8409 if (dump_enabled_p ())
8410 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8411 }
8412 }
8413
8414 tree niters = vect_build_loop_niters (loop_vinfo);
8415 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8416 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8417 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8418 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8419 &step_vector, &niters_vector_mult_vf, th,
8420 check_profitability, niters_no_overflow);
8421
8422 if (niters_vector == NULL_TREE)
8423 {
8424 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8425 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8426 && known_eq (lowest_vf, vf))
8427 {
8428 niters_vector
8429 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8430 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8431 step_vector = build_one_cst (TREE_TYPE (niters));
8432 }
8433 else
8434 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8435 &step_vector, niters_no_overflow);
8436 }
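/* Worked example (illustrative): with a constant NITERS of 100, VF == 8
   and no loop masking, the code above sets NITERS_VECTOR to 100 / 8 == 12
   and STEP_VECTOR to 1, so the vector loop executes 12 iterations and the
   remaining 4 scalar iterations are left to the epilogue created by
   vect_do_peeling.  */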
8437
8438 /* 1) Make sure the loop header has exactly two entries;
8439 2) Make sure we have a preheader basic block.  */
8440
8441 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8442
8443 split_edge (loop_preheader_edge (loop));
8444
8445 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8446 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8447 /* This will deal with any possible peeling. */
8448 vect_prepare_for_masked_peels (loop_vinfo);
8449
8450 /* FORNOW: the vectorizer supports only loops whose body consists
8451 of one basic block (header + empty latch).  When the vectorizer
8452 supports more involved loop forms, the order in which the BBs are
8453 traversed needs to be reconsidered.  */
8454
8455 for (i = 0; i < nbbs; i++)
8456 {
8457 basic_block bb = bbs[i];
8458 stmt_vec_info stmt_info;
8459
8460 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8461 gsi_next (&si))
8462 {
8463 gphi *phi = si.phi ();
8464 if (dump_enabled_p ())
8465 {
8466 dump_printf_loc (MSG_NOTE, vect_location,
8467 "------>vectorizing phi: ");
8468 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
8469 }
8470 stmt_info = loop_vinfo->lookup_stmt (phi);
8471 if (!stmt_info)
8472 continue;
8473
8474 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8475 vect_loop_kill_debug_uses (loop, phi);
8476
8477 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8478 && !STMT_VINFO_LIVE_P (stmt_info))
8479 continue;
8480
8481 if (STMT_VINFO_VECTYPE (stmt_info)
8482 && (maybe_ne
8483 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8484 && dump_enabled_p ())
8485 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8486
8487 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8488 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8489 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8490 && ! PURE_SLP_STMT (stmt_info))
8491 {
8492 if (dump_enabled_p ())
8493 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8494 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
8495 }
8496 }
8497
8498 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8499 !gsi_end_p (si);)
8500 {
8501 stmt = gsi_stmt (si);
8502 /* During vectorization remove existing clobber stmts. */
8503 if (gimple_clobber_p (stmt))
8504 {
8505 unlink_stmt_vdef (stmt);
8506 gsi_remove (&si, true);
8507 release_defs (stmt);
8508 }
8509 else
8510 {
8511 stmt_info = loop_vinfo->lookup_stmt (stmt);
8512
8513 /* vector stmts created in the outer-loop during vectorization of
8514 stmts in an inner-loop may not have a stmt_info, and do not
8515 need to be vectorized. */
8516 stmt_vec_info seen_store = NULL;
8517 if (stmt_info)
8518 {
8519 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
8520 {
8521 gimple *def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8522 for (gimple_stmt_iterator subsi = gsi_start (def_seq);
8523 !gsi_end_p (subsi); gsi_next (&subsi))
8524 vect_transform_loop_stmt (loop_vinfo,
8525 gsi_stmt (subsi), &si,
8526 &seen_store,
8527 &slp_scheduled);
8528 gimple *pat_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
8529 vect_transform_loop_stmt (loop_vinfo, pat_stmt, &si,
8530 &seen_store, &slp_scheduled);
8531 }
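/* Illustrative pattern case: for a widening computation such as

       s += (int) a_short * (int) b_short;

   the pattern recognizer may have attached a pattern statement (e.g. a
   widening-multiply-style operation) plus a pattern definition sequence
   to the original statement; the calls above vectorize that sequence and
   the pattern statement, while the original scalar statement is typically
   no longer marked relevant on its own.  */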
8532 vect_transform_loop_stmt (loop_vinfo, stmt, &si,
8533 &seen_store, &slp_scheduled);
8534 }
8535 if (seen_store)
8536 {
8537 if (STMT_VINFO_GROUPED_ACCESS (seen_store))
8538 {
8539 /* Interleaving.  The vectorization of the
8540 interleaving chain has been completed -
8541 free all the stores in the chain.  */
8542 gsi_next (&si);
8543 vect_remove_stores (DR_GROUP_FIRST_ELEMENT (seen_store));
8544 }
8545 else
8546 {
8547 /* Free the attached stmt_vec_info and remove the
8548 stmt. */
8549 free_stmt_vec_info (stmt);
8550 unlink_stmt_vdef (stmt);
8551 gsi_remove (&si, true);
8552 release_defs (stmt);
8553 }
8554 }
8555 else
8556 gsi_next (&si);
8557 }
8558 }
8559
8560 /* Stub out scalar statements that must not survive vectorization.
8561 Doing this here helps with grouped statements, or statements that
8562 are involved in patterns. */
8563 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8564 !gsi_end_p (gsi); gsi_next (&gsi))
8565 {
8566 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8567 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8568 {
8569 tree lhs = gimple_get_lhs (call);
8570 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8571 {
8572 tree zero = build_zero_cst (TREE_TYPE (lhs));
8573 gimple *new_stmt = gimple_build_assign (lhs, zero);
8574 gsi_replace (&gsi, new_stmt, true);
8575 }
8576 }
8577 }
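/* Illustrative example: a scalar internal call left over from
   if-conversion, e.g.

       _1 = .MASK_LOAD (&a[i], 0B, _mask);

   has no scalar expansion, so if its LHS is still scalar here the call is
   replaced by "_1 = 0;" above; its value is not expected to be used once
   the real loads are done by the vector statements.  */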
8578 } /* BBs in loop */
8579
8580 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8581 a zero NITERS becomes a nonzero NITERS_VECTOR.  */
8582 if (integer_onep (step_vector))
8583 niters_no_overflow = true;
8584 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8585 niters_vector_mult_vf, !niters_no_overflow);
8586
8587 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8588 scale_profile_for_vect_loop (loop, assumed_vf);
8589
8590 /* True if the final iteration might not handle a full vector's
8591 worth of scalar iterations. */
8592 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8593 /* The minimum number of iterations performed by the epilogue. This
8594 is 1 when peeling for gaps because we always need a final scalar
8595 iteration. */
8596 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8597 /* +1 to convert latch counts to loop iteration counts,
8598 -min_epilogue_iters to remove iterations that cannot be performed
8599 by the vector code. */
8600 int bias_for_lowest = 1 - min_epilogue_iters;
8601 int bias_for_assumed = bias_for_lowest;
8602 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8603 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8604 {
8605 /* When the amount of peeling is known at compile time, the first
8606 iteration will have exactly alignment_npeels active elements.
8607 In the worst case it will have at least one. */
8608 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8609 bias_for_lowest += lowest_vf - min_first_active;
8610 bias_for_assumed += assumed_vf - min_first_active;
8611 }
8612 /* In these calculations the "- 1" converts loop iteration counts
8613 back to latch counts. */
8614 if (loop->any_upper_bound)
8615 loop->nb_iterations_upper_bound
8616 = (final_iter_may_be_partial
8617 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8618 lowest_vf) - 1
8619 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8620 lowest_vf) - 1);
8621 if (loop->any_likely_upper_bound)
8622 loop->nb_iterations_likely_upper_bound
8623 = (final_iter_may_be_partial
8624 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8625 + bias_for_lowest, lowest_vf) - 1
8626 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8627 + bias_for_lowest, lowest_vf) - 1);
8628 if (loop->any_estimate)
8629 loop->nb_iterations_estimate
8630 = (final_iter_may_be_partial
8631 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8632 assumed_vf) - 1
8633 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8634 assumed_vf) - 1);
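/* Worked example (illustrative): with a latch upper bound of 99 (100
   iterations), VF == 8, no peeling for gaps and no loop masking,
   bias_for_lowest is 1 and the new bound becomes
   floor ((99 + 1) / 8) - 1 == 11, i.e. 12 vector iterations, matching
   the 12 full vectors of 8 scalar iterations each.  */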
8635
8636 if (dump_enabled_p ())
8637 {
8638 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8639 {
8640 dump_printf_loc (MSG_NOTE, vect_location,
8641 "LOOP VECTORIZED\n");
8642 if (loop->inner)
8643 dump_printf_loc (MSG_NOTE, vect_location,
8644 "OUTER LOOP VECTORIZED\n");
8645 dump_printf (MSG_NOTE, "\n");
8646 }
8647 else
8648 {
8649 dump_printf_loc (MSG_NOTE, vect_location,
8650 "LOOP EPILOGUE VECTORIZED (VS=");
8651 dump_dec (MSG_NOTE, current_vector_size);
8652 dump_printf (MSG_NOTE, ")\n");
8653 }
8654 }
8655
8656 /* Free SLP instances here because otherwise stmt reference counting
8657 won't work. */
8658 slp_instance instance;
8659 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
8660 vect_free_slp_instance (instance, true);
8661 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
8662 /* Clear the safelen field since its value is invalid after vectorization:
8663 the vectorized loop can now have loop-carried dependencies.  */
8664 loop->safelen = 0;
8665
8666 /* Don't vectorize the epilogue of an epilogue loop.  */
8667 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8668 epilogue = NULL;
8669
8670 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
8671 epilogue = NULL;
8672
8673 if (epilogue)
8674 {
8675 auto_vector_sizes vector_sizes;
8676 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
8677 unsigned int next_size = 0;
8678
8679 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8680 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
8681 && known_eq (vf, lowest_vf))
8682 {
8683 unsigned int eiters
8684 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
8685 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
8686 eiters = eiters % lowest_vf;
8687 epilogue->nb_iterations_upper_bound = eiters - 1;
8688
8689 unsigned int ratio;
8690 while (next_size < vector_sizes.length ()
8691 && !(constant_multiple_p (current_vector_size,
8692 vector_sizes[next_size], &ratio)
8693 && eiters >= lowest_vf / ratio))
8694 next_size += 1;
8695 }
8696 else
8697 while (next_size < vector_sizes.length ()
8698 && maybe_lt (current_vector_size, vector_sizes[next_size]))
8699 next_size += 1;
8700
8701 if (next_size == vector_sizes.length ())
8702 epilogue = NULL;
8703 }
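/* Illustrative example (assuming an x86-like target offering 64-, 32- and
   16-byte vectors): with NITERS == 100, no alignment peeling and VF == 16
   at the current 64-byte vector size, eiters is 100 % 16 == 4; 64-byte
   (ratio 1, needs >= 16) and 32-byte (ratio 2, needs >= 8) vectors are
   skipped, and the epilogue is kept for a retry at the 16-byte size
   (ratio 4, needs >= 4).  */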
8704
8705 if (epilogue)
8706 {
8707 epilogue->force_vectorize = loop->force_vectorize;
8708 epilogue->safelen = loop->safelen;
8709 epilogue->dont_vectorize = false;
8710
8711 /* We may need to if-convert the epilogue to vectorize it.  */
8712 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8713 tree_if_conversion (epilogue);
8714 }
8715
8716 return epilogue;
8717 }
8718
8719 /* The code below performs a simple optimization - it reverts
8720 if-conversion for masked stores: if the mask of a store is zero, the store
8721 is skipped and, where possible, so are the producers of its stored value.
8722 For example,
8723 for (i=0; i<n; i++)
8724 if (c[i])
8725 {
8726 p1[i] += 1;
8727 p2[i] = p3[i] +2;
8728 }
8729 this transformation will produce the following semi-hammock:
8730
8731 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
8732 {
8733 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
8734 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
8735 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
8736 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
8737 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
8738 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
8739 }
8740 */
8741
8742 void
8743 optimize_mask_stores (struct loop *loop)
8744 {
8745 basic_block *bbs = get_loop_body (loop);
8746 unsigned nbbs = loop->num_nodes;
8747 unsigned i;
8748 basic_block bb;
8749 struct loop *bb_loop;
8750 gimple_stmt_iterator gsi;
8751 gimple *stmt;
8752 auto_vec<gimple *> worklist;
8753
8754 vect_location = find_loop_location (loop);
8755 /* Pick up all masked stores in the loop, if any.  */
8756 for (i = 0; i < nbbs; i++)
8757 {
8758 bb = bbs[i];
8759 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
8760 gsi_next (&gsi))
8761 {
8762 stmt = gsi_stmt (gsi);
8763 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
8764 worklist.safe_push (stmt);
8765 }
8766 }
8767
8768 free (bbs);
8769 if (worklist.is_empty ())
8770 return;
8771
8772 /* Loop has masked stores. */
8773 while (!worklist.is_empty ())
8774 {
8775 gimple *last, *last_store;
8776 edge e, efalse;
8777 tree mask;
8778 basic_block store_bb, join_bb;
8779 gimple_stmt_iterator gsi_to;
8780 tree vdef, new_vdef;
8781 gphi *phi;
8782 tree vectype;
8783 tree zero;
8784
8785 last = worklist.pop ();
8786 mask = gimple_call_arg (last, 2);
8787 bb = gimple_bb (last);
8788 /* Create then_bb and an if-then structure in the CFG; then_bb belongs
8789 to the same loop as if_bb.  It could be different from LOOP when a
8790 two-level loop nest is vectorized and the mask_store belongs to the
8791 inner one.  */
8792 e = split_block (bb, last);
8793 bb_loop = bb->loop_father;
8794 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
8795 join_bb = e->dest;
8796 store_bb = create_empty_bb (bb);
8797 add_bb_to_loop (store_bb, bb_loop);
8798 e->flags = EDGE_TRUE_VALUE;
8799 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
8800 /* Put STORE_BB into the likely part.  */
8801 efalse->probability = profile_probability::unlikely ();
8802 store_bb->count = efalse->count ();
8803 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
8804 if (dom_info_available_p (CDI_DOMINATORS))
8805 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
8806 if (dump_enabled_p ())
8807 dump_printf_loc (MSG_NOTE, vect_location,
8808 "Create new block %d to sink mask stores.",
8809 store_bb->index);
8810 /* Create vector comparison with boolean result. */
8811 vectype = TREE_TYPE (mask);
8812 zero = build_zero_cst (vectype);
8813 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
8814 gsi = gsi_last_bb (bb);
8815 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
8816 /* Create new PHI node for vdef of the last masked store:
8817 .MEM_2 = VDEF <.MEM_1>
8818 will be converted to
8819 .MEM.3 = VDEF <.MEM_1>
8820 and new PHI node will be created in join bb
8821 .MEM_2 = PHI <.MEM_1, .MEM_3>
8822 */
8823 vdef = gimple_vdef (last);
8824 new_vdef = make_ssa_name (gimple_vop (cfun), last);
8825 gimple_set_vdef (last, new_vdef);
8826 phi = create_phi_node (vdef, join_bb);
8827 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
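/* The resulting CFG, sketched for illustration:

       bb:        ... if (mask == { 0, ... }) goto join_bb; else goto store_bb;
       store_bb:  masked stores (and sinkable producers) moved here
		  .MEM_3 = VDEF <...>
       join_bb:   .MEM_2 = PHI <.MEM_1 (bb), .MEM_3 (store_bb)>

   so the stores are skipped entirely whenever the mask is all-zero.  */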
8828
8829 /* Put all masked stores with the same mask to STORE_BB if possible. */
8830 while (true)
8831 {
8832 gimple_stmt_iterator gsi_from;
8833 gimple *stmt1 = NULL;
8834
8835 /* Move masked store to STORE_BB. */
8836 last_store = last;
8837 gsi = gsi_for_stmt (last);
8838 gsi_from = gsi;
8839 /* Shift GSI to the previous stmt for further traversal. */
8840 gsi_prev (&gsi);
8841 gsi_to = gsi_start_bb (store_bb);
8842 gsi_move_before (&gsi_from, &gsi_to);
8843 /* Set GSI_TO to the start of the now non-empty block.  */
8844 gsi_to = gsi_start_bb (store_bb);
8845 if (dump_enabled_p ())
8846 {
8847 dump_printf_loc (MSG_NOTE, vect_location,
8848 "Move stmt to created bb\n");
8849 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
8850 }
8851 /* Move all stored value producers if possible. */
8852 while (!gsi_end_p (gsi))
8853 {
8854 tree lhs;
8855 imm_use_iterator imm_iter;
8856 use_operand_p use_p;
8857 bool res;
8858
8859 /* Skip debug statements. */
8860 if (is_gimple_debug (gsi_stmt (gsi)))
8861 {
8862 gsi_prev (&gsi);
8863 continue;
8864 }
8865 stmt1 = gsi_stmt (gsi);
8866 /* Do not consider statements writing to memory or having
8867 a volatile operand.  */
8868 if (gimple_vdef (stmt1)
8869 || gimple_has_volatile_ops (stmt1))
8870 break;
8871 gsi_from = gsi;
8872 gsi_prev (&gsi);
8873 lhs = gimple_get_lhs (stmt1);
8874 if (!lhs)
8875 break;
8876
8877 /* LHS of vectorized stmt must be SSA_NAME. */
8878 if (TREE_CODE (lhs) != SSA_NAME)
8879 break;
8880
8881 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8882 {
8883 /* Remove dead scalar statement. */
8884 if (has_zero_uses (lhs))
8885 {
8886 gsi_remove (&gsi_from, true);
8887 continue;
8888 }
8889 }
8890
8891 /* Check that LHS does not have uses outside of STORE_BB. */
8892 res = true;
8893 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
8894 {
8895 gimple *use_stmt;
8896 use_stmt = USE_STMT (use_p);
8897 if (is_gimple_debug (use_stmt))
8898 continue;
8899 if (gimple_bb (use_stmt) != store_bb)
8900 {
8901 res = false;
8902 break;
8903 }
8904 }
8905 if (!res)
8906 break;
8907
8908 if (gimple_vuse (stmt1)
8909 && gimple_vuse (stmt1) != gimple_vuse (last_store))
8910 break;
8911
8912 /* Can move STMT1 to STORE_BB. */
8913 if (dump_enabled_p ())
8914 {
8915 dump_printf_loc (MSG_NOTE, vect_location,
8916 "Move stmt to created bb\n");
8917 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
8918 }
8919 gsi_move_before (&gsi_from, &gsi_to);
8920 /* Shift GSI_TO for further insertion. */
8921 gsi_prev (&gsi_to);
8922 }
8923 /* Put other masked stores with the same mask to STORE_BB. */
8924 if (worklist.is_empty ()
8925 || gimple_call_arg (worklist.last (), 2) != mask
8926 || worklist.last () != stmt1)
8927 break;
8928 last = worklist.pop ();
8929 }
8930 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
8931 }
8932 }